/mordor/string.cpp
C++ | 854 lines | 754 code | 88 blank | 12 comment | 161 complexity | 709add1edbe3dac740180b2e2b727250 MD5 | raw file
1// Copyright (c) 2009 - Mozy, Inc. 2 3#include <algorithm> 4 5#include <string.h> 6 7#include <openssl/md5.h> 8#include <openssl/sha.h> 9 10#ifdef HAVE_CONFIG_H 11#include "autoconfig.h" 12#ifdef HAVE_ICONV 13#include <iconv.h> 14#endif 15#endif 16 17#include "mordor/string.h" 18#include "mordor/util.h" 19 20#include "assert.h" 21#include "exception.h" 22 23#ifdef MSVC 24#pragma comment(lib, "libeay32") 25#endif 26 27namespace Mordor { 28 29std::string 30base64decode(const std::string &src) 31{ 32 std::string result; 33 result.resize(src.size() * 3 / 4); 34 char *writeBuf = &result[0]; 35 36 const char* ptr = src.c_str(); 37 const char* end = ptr + src.size(); 38 39 while(ptr < end) { 40 int i = 0; 41 int padding = 0; 42 int packed = 0; 43 for(; i < 4 && ptr < end; ++i, ++ptr) { 44 if(*ptr == '=') { 45 ++padding; 46 packed <<= 6; 47 continue; 48 } 49 50 // padding with "=" only 51 if (padding > 0) 52 return ""; 53 54 int val = 0; 55 if(*ptr >= 'A' && *ptr <= 'Z') 56 val = *ptr - 'A'; 57 else if(*ptr >= 'a' && *ptr <= 'z') 58 val = *ptr - 'a' + 26; 59 else if(*ptr >= '0' && *ptr <= '9') 60 val = *ptr - '0' + 52; 61 else if(*ptr == '+') 62 val = 62; 63 else if(*ptr == '/') 64 val = 63; 65 else 66 return ""; // invalid character 67 68 packed = (packed << 6) | val; 69 } 70 if (i != 4) 71 return ""; 72 if (padding > 0 && ptr != end) 73 return ""; 74 if (padding > 2) 75 return ""; 76 77 *writeBuf++ = (char)((packed >> 16) & 0xff); 78 if(padding != 2) 79 *writeBuf++ = (char)((packed >> 8) & 0xff); 80 if(padding == 0) 81 *writeBuf++ = (char)(packed & 0xff); 82 } 83 84 result.resize(writeBuf - result.c_str()); 85 return result; 86} 87 88std::string 89base64encode(const std::string& data) 90{ 91 return base64encode(data.c_str(), data.size()); 92} 93 94std::string 95base64encode(const void* data, size_t len) 96{ 97 const char* base64 = 98 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 99 100 std::string ret; 101 ret.reserve(len * 4 / 3 + 2); 102 103 const unsigned char* ptr = (const unsigned char*)data; 104 const unsigned char* end = ptr + len; 105 106 while(ptr < end) { 107 unsigned int packed = 0; 108 int i = 0; 109 int padding = 0; 110 for(; i < 3 && ptr < end; ++i, ++ptr) 111 packed = (packed << 8) | *ptr; 112 if(i == 2) 113 padding = 1; 114 else if (i == 1) 115 padding = 2; 116 for(; i < 3; ++i) 117 packed <<= 8; 118 119 ret.append(1, base64[packed >> 18]); 120 ret.append(1, base64[(packed >> 12) & 0x3f]); 121 if(padding != 2) 122 ret.append(1, base64[(packed >> 6) & 0x3f]); 123 if(padding == 0) 124 ret.append(1, base64[packed & 0x3f]); 125 ret.append(padding, '='); 126 } 127 128 return ret; 129} 130 131std::string 132md5(const std::string &data) 133{ 134 return hexstringFromData(md5sum(data).c_str(), MD5_DIGEST_LENGTH); 135} 136 137std::string 138sha1(const std::string &data) 139{ 140 return hexstringFromData(sha1sum(data).c_str(), SHA_DIGEST_LENGTH); 141} 142 143std::string 144md5sum(const void *data, size_t len) 145{ 146 MD5_CTX ctx; 147 MD5_Init(&ctx); 148 MD5_Update(&ctx, data, len); 149 std::string result; 150 result.resize(MD5_DIGEST_LENGTH); 151 MD5_Final((unsigned char*)&result[0], &ctx); 152 return result; 153} 154 155std::string 156md5sum(const std::string &data) 157{ 158 return md5sum(data.c_str(), data.size()); 159} 160 161std::string 162sha0sum(const void *data, size_t len) 163{ 164 SHA_CTX ctx; 165 SHA_Init(&ctx); 166 SHA_Update(&ctx, data, len); 167 std::string result; 168 result.resize(SHA_DIGEST_LENGTH); 169 SHA_Final((unsigned char*)&result[0], &ctx); 170 return result; 171} 172 173std::string 174sha0sum(const std::string & data) 175{ 176 return sha0sum(data.c_str(), data.length()); 177} 178 179std::string 180sha1sum(const void *data, size_t len) 181{ 182 SHA_CTX ctx; 183 SHA1_Init(&ctx); 184 SHA1_Update(&ctx, data, len); 185 std::string result; 186 result.resize(SHA_DIGEST_LENGTH); 187 SHA1_Final((unsigned char*)&result[0], &ctx); 188 return result; 189} 190 191std::string 192sha1sum(const std::string &data) 193{ 194 return sha1sum(data.c_str(), data.size()); 195} 196 197struct xorStruct 198{ 199 xorStruct(char value) : m_value(value) {} 200 char m_value; 201 char operator()(char in) const { return in ^ m_value; } 202}; 203 204template <class CTX, 205 int (*Init)(CTX *), 206 int (*Update)(CTX *, const void *, size_t), 207 int (*Final)(unsigned char *, CTX *), 208 unsigned int B, unsigned int L> 209std::string 210hmac(const std::string &text, const std::string &key) 211{ 212 std::string keyLocal = key; 213 CTX ctx; 214 if (keyLocal.size() > B) { 215 Init(&ctx); 216 Update(&ctx, keyLocal.c_str(), keyLocal.size()); 217 keyLocal.resize(L); 218 Final((unsigned char *)&keyLocal[0], &ctx); 219 } 220 keyLocal.append(B - keyLocal.size(), '\0'); 221 std::string ipad = keyLocal, opad = keyLocal; 222 std::transform(ipad.begin(), ipad.end(), ipad.begin(), xorStruct(0x36)); 223 std::transform(opad.begin(), opad.end(), opad.begin(), xorStruct(0x5c)); 224 Init(&ctx); 225 Update(&ctx, ipad.c_str(), B); 226 Update(&ctx, text.c_str(), text.size()); 227 std::string result; 228 result.resize(L); 229 Final((unsigned char *)&result[0], &ctx); 230 Init(&ctx); 231 Update(&ctx, opad.c_str(), B); 232 Update(&ctx, result.c_str(), L); 233 Final((unsigned char *)&result[0], &ctx); 234 return result; 235} 236 237std::string 238hmacMd5(const std::string &text, const std::string &key) 239{ 240 return hmac<MD5_CTX, 241 &MD5_Init, 242 &MD5_Update, 243 &MD5_Final, 244 MD5_CBLOCK, MD5_DIGEST_LENGTH> 245 (text, key); 246} 247 248std::string 249hmacSha1(const std::string &text, const std::string &key) 250{ 251 return hmac<SHA_CTX, 252 &SHA1_Init, 253 &SHA1_Update, 254 &SHA1_Final, 255 SHA_CBLOCK, SHA_DIGEST_LENGTH> 256 (text, key); 257} 258 259std::string 260hmacSha256(const std::string &text, const std::string &key) 261{ 262 return hmac<SHA256_CTX, 263 &SHA256_Init, 264 &SHA256_Update, 265 &SHA256_Final, 266 SHA256_CBLOCK, SHA256_DIGEST_LENGTH> 267 (text, key); 268} 269 270void 271hexstringFromData(const void *data, size_t len, char *output) 272{ 273 const unsigned char *buf = (const unsigned char *)data; 274 size_t i, j; 275 for (i = j = 0; i < len; ++i) { 276 char c; 277 c = (buf[i] >> 4) & 0xf; 278 c = (c > 9) ? c + 'a' - 10 : c + '0'; 279 output[j++] = c; 280 c = (buf[i] & 0xf); 281 c = (c > 9) ? c + 'a' - 10 : c + '0'; 282 output[j++] = c; 283 } 284} 285 286std::string 287hexstringFromData(const void *data, size_t len) 288{ 289 if (len == 0) 290 return std::string(); 291 std::string result; 292 result.resize(len * 2); 293 hexstringFromData(data, len, &result[0]); 294 return result; 295} 296 297std::string 298hexstringFromData(const std::string &data) 299{ 300 return hexstringFromData(data.c_str(), data.size()); 301} 302 303void 304dataFromHexstring(const char *hexstring, size_t length, void *output) 305{ 306 unsigned char *buf = (unsigned char *)output; 307 unsigned char byte; 308 if (length % 2 != 0) 309 MORDOR_THROW_EXCEPTION(std::invalid_argument("length")); 310 for (size_t i = 0; i < length; ++i) { 311 switch (hexstring[i]) { 312 case 'a': 313 case 'b': 314 case 'c': 315 case 'd': 316 case 'e': 317 case 'f': 318 byte = (hexstring[i] - 'a' + 10) << 4; 319 break; 320 case 'A': 321 case 'B': 322 case 'C': 323 case 'D': 324 case 'E': 325 case 'F': 326 byte = (hexstring[i] - 'A' + 10) << 4; 327 break; 328 case '0': 329 case '1': 330 case '2': 331 case '3': 332 case '4': 333 case '5': 334 case '6': 335 case '7': 336 case '8': 337 case '9': 338 byte = (hexstring[i] - '0') << 4; 339 break; 340 default: 341 MORDOR_THROW_EXCEPTION(std::invalid_argument("hexstring")); 342 } 343 ++i; 344 switch (hexstring[i]) { 345 case 'a': 346 case 'b': 347 case 'c': 348 case 'd': 349 case 'e': 350 case 'f': 351 byte |= hexstring[i] - 'a' + 10; 352 break; 353 case 'A': 354 case 'B': 355 case 'C': 356 case 'D': 357 case 'E': 358 case 'F': 359 byte |= hexstring[i] - 'A' + 10; 360 break; 361 case '0': 362 case '1': 363 case '2': 364 case '3': 365 case '4': 366 case '5': 367 case '6': 368 case '7': 369 case '8': 370 case '9': 371 byte |= hexstring[i] - '0'; 372 break; 373 default: 374 MORDOR_THROW_EXCEPTION(std::invalid_argument("hexstring")); 375 } 376 *buf++ = byte; 377 } 378} 379 380std::string 381dataFromHexstring(const char *hexstring, size_t length) 382{ 383 if (length % 2 != 0) 384 MORDOR_THROW_EXCEPTION(std::invalid_argument("length")); 385 if (length == 0) 386 return std::string(); 387 std::string result; 388 result.resize(length / 2); 389 dataFromHexstring(hexstring, length, &result[0]); 390 return result; 391} 392 393std::string 394dataFromHexstring(const std::string &hexstring) 395{ 396 return dataFromHexstring(hexstring.c_str(), hexstring.size()); 397} 398 399void 400replace(std::string &str, char find, char replaceWith) 401{ 402 size_t index = str.find(find); 403 while (index != std::string::npos) { 404 str[index] = replaceWith; 405 index = str.find(find, index + 1); 406 } 407} 408 409void 410replace(std::string &str, char find, const std::string &replaceWith) 411{ 412 size_t index = str.find(find); 413 while (index != std::string::npos) { 414 str = str.substr(0, index) + replaceWith + str.substr(index + 1); 415 index = str.find(find, index + replaceWith.size()); 416 } 417} 418 419void 420replace(std::string &str, const std::string &find, const std::string &replaceWith) 421{ 422 size_t index = str.find(find); 423 while (index != std::string::npos) { 424 str = str.substr(0, index) + replaceWith + str.substr(index + find.size()); 425 index = str.find(find, index + replaceWith.size()); 426 } 427} 428 429std::vector<std::string> 430split(const std::string &str, char delim, size_t max) 431{ 432 MORDOR_ASSERT(max > 1); 433 std::vector<std::string> result; 434 if (str.empty()) 435 return result; 436 437 size_t last = 0; 438 size_t pos = str.find(delim); 439 while (pos != std::string::npos) { 440 result.push_back(str.substr(last, pos - last)); 441 last = pos + 1; 442 if (--max == 1) 443 break; 444 pos = str.find(delim, last); 445 } 446 result.push_back(str.substr(last)); 447 return result; 448} 449 450std::vector<std::string> 451split(const std::string &str, const char *delims, size_t max) 452{ 453 MORDOR_ASSERT(max > 1); 454 std::vector<std::string> result; 455 if (str.empty()) 456 return result; 457 458 size_t last = 0; 459 size_t pos = str.find_first_of(delims); 460 while (pos != std::string::npos) { 461 result.push_back(str.substr(last, pos - last)); 462 last = pos + 1; 463 if (--max == 1) 464 break; 465 pos = str.find_first_of(delims, last); 466 } 467 result.push_back(str.substr(last)); 468 return result; 469} 470 471static bool endsWith(const std::string &string, const std::string &suffix) 472{ 473 return string.size() >= suffix.size() && 474 strnicmp(string.c_str() + string.size() - suffix.size(), 475 suffix.c_str(), suffix.size()) == 0; 476} 477 478namespace { 479struct Suffix 480{ 481 std::string suffix; 482 unsigned long long multiplier; 483}; 484} 485 486unsigned long long stringToMicroseconds(const std::string &string) 487{ 488 static const Suffix suffixes[] = { 489 { "microseconds", 1ull }, 490 { "us", 1ull }, 491 { "milliseconds", 1000ull }, 492 { "ms", 1000ull }, 493 { "seconds", 1000000ull }, 494 { "minutes", 60 * 1000000ull }, 495 { "m", 60 * 1000000ull }, 496 { "hours", 60 * 60 * 1000000ull }, 497 { "h", 60 * 60 * 1000000ull }, 498 { "days", 24 * 60 * 60 * 1000000ull }, 499 { "d", 24 * 60 * 60 * 1000000ull }, 500 // s needs to go at the bottom since we're just suffix matching, and it 501 // would give a false positive for "minutes", etc. 502 { "s", 1000000ull } 503 }; 504 505 std::string copy(string); 506 unsigned long long multiplier = 1ull; 507 508 // Strip leading whitespace 509 while (copy.size() > 1 && copy[0] == ' ') 510 copy = copy.substr(1); 511 // Strip trailing whitespace 512 while (copy.size() > 1 && copy[copy.size() -1] == ' ') 513 copy.resize(copy.size() - 1); 514 515 for (size_t i = 0; i < sizeof(suffixes)/sizeof(suffixes[0]); ++i) { 516 if (endsWith(copy, suffixes[i].suffix)) { 517 multiplier = suffixes[i].multiplier; 518 copy.resize(copy.size() - suffixes[i].suffix.size()); 519 break; 520 } 521 } 522 523 // Strip whitespace between the number and the units 524 while (copy.size() > 1 && copy[copy.size() -1] == ' ') 525 copy.resize(copy.size() - 1); 526 527 // If there's a decimal point, use floating point arithmetic 528 if (copy.find('.') != std::string::npos) 529 return (unsigned long long)(multiplier * 530 boost::lexical_cast<double>(copy)); 531 else 532 return multiplier * boost::lexical_cast<unsigned long long>(copy); 533} 534 535#ifdef WINDOWS 536static DWORD g_wcFlags = WC_ERR_INVALID_CHARS; 537static DWORD g_mbFlags = MB_ERR_INVALID_CHARS; 538 539std::string 540toUtf8(const utf16char *str, size_t len) 541{ 542 if (len == (size_t)~0) 543 len = wcslen(str); 544 MORDOR_ASSERT(len < 0x80000000u); 545 std::string result; 546 if (len == 0) 547 return result; 548 int ret = WideCharToMultiByte(CP_UTF8, g_wcFlags, str, (int)len, NULL, 0, NULL, NULL); 549 MORDOR_ASSERT(ret >= 0); 550 if (ret == 0) { 551 if (lastError() == ERROR_INVALID_FLAGS) { 552 g_wcFlags = 0; 553 ret = WideCharToMultiByte(CP_UTF8, g_wcFlags, str, (int)len, NULL, 0, NULL, NULL); 554 MORDOR_ASSERT(ret >= 0); 555 } 556 if (ret == 0) 557 MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("WideCharToMultiByte"); 558 } 559 result.resize(ret); 560 ret = WideCharToMultiByte(CP_UTF8, g_wcFlags, str, (int)len, &result[0], ret, NULL, NULL); 561 MORDOR_ASSERT(ret >= 0); 562 if (ret == 0) 563 MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("WideCharToMultiByte"); 564 MORDOR_ASSERT(ret == result.size()); 565 566 return result; 567} 568 569std::string 570toUtf8(const std::wstring &str) 571{ 572 MORDOR_ASSERT(str.size() < 0x80000000u); 573 return toUtf8(str.c_str(), str.size()); 574} 575 576utf16string 577toUtf16(const char *str, size_t len) 578{ 579 if (len == (size_t)~0) 580 len = strlen(str); 581 MORDOR_ASSERT(len < 0x80000000u); 582 utf16string result; 583 if (len == 0) 584 return result; 585 int ret = MultiByteToWideChar(CP_UTF8, g_mbFlags, str, (int)len, NULL, 0); 586 MORDOR_ASSERT(ret >= 0); 587 if (ret == 0) { 588 if (lastError() == ERROR_INVALID_FLAGS) { 589 g_mbFlags = 0; 590 ret = MultiByteToWideChar(CP_UTF8, g_mbFlags, str, (int)len, NULL, 0); 591 MORDOR_ASSERT(ret >= 0); 592 } 593 if (ret == 0) 594 MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("MultiByteToWideChar"); 595 } 596 result.resize(ret); 597 ret = MultiByteToWideChar(CP_UTF8, g_mbFlags, str, (int)len, &result[0], ret); 598 if (ret == 0) 599 MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("MultiByteToWideChar"); 600 MORDOR_ASSERT(ret == result.size()); 601 602 return result; 603} 604 605utf16string 606toUtf16(const std::string &str) 607{ 608 MORDOR_ASSERT(str.size() < 0x80000000u); 609 return toUtf16(str.c_str(), str.size()); 610} 611#elif defined (OSX) 612 613std::string 614toUtf8(CFStringRef string) 615{ 616 const char *bytes = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); 617 if (bytes) 618 return bytes; 619 std::string result; 620 CFIndex length = CFStringGetLength(string); 621 // Include extra byte for null termination 622 length = CFStringGetMaximumSizeForEncoding(length, kCFStringEncodingUTF8) + 1; 623 result.resize(length); 624 if (!CFStringGetCString(string, &result[0], length, kCFStringEncodingUTF8)) { 625 MORDOR_NOTREACHED(); 626 } 627 result.resize(strlen(result.c_str())); 628 return result; 629} 630 631utf16string 632toUtf16(const char * str, size_t length) 633{ 634 utf16string result; 635 if (length == 0u) 636 return result; 637 ScopedCFRef<CFStringRef> cfUtf8Str = CFStringCreateWithBytesNoCopy(NULL, 638 (const UInt8 *)str, (CFIndex)length, kCFStringEncodingUTF8, false, 639 kCFAllocatorNull); 640 if (!cfUtf8Str) 641 MORDOR_THROW_EXCEPTION(InvalidUnicodeException()); 642#if MORDOR_BYTE_ORDER == MORDOR_LITTLE_ENDIAN 643 ScopedCFRef<CFDataRef> cfUtf16Data = CFStringCreateExternalRepresentation( 644 NULL, cfUtf8Str, kCFStringEncodingUTF16LE, 0); 645#elif MORDOR_BYTE_ORDER == MORDOR_BIG_ENDIAN 646 ScopedCFRef<CFDataRef> cfUtf16Data = CFStringCreateExternalRepresentation( 647 NULL, cfUtf8Str, kCFStringEncodingUTF16BE, 0); 648#endif 649 MORDOR_ASSERT(cfUtf16Data); 650 MORDOR_ASSERT(CFDataGetLength(cfUtf16Data) % sizeof(utf16char) == 0); 651 result.resize(CFDataGetLength(cfUtf16Data) / sizeof(utf16char)); 652 CFDataGetBytes(cfUtf16Data, CFRangeMake(0,CFDataGetLength(cfUtf16Data)), 653 (UInt8 *)&result[0]); 654 return result; 655} 656 657utf16string 658toUtf16(const std::string &str) 659{ 660 return toUtf16(str.c_str(), str.size()); 661} 662 663#elif defined(HAVE_ICONV) 664 665namespace { 666 667class Iconv { 668 iconv_t m_iconv; 669public: 670 Iconv(const char* from, const char* to) 671 : m_iconv(iconv_open(to, from)) 672 { 673 MORDOR_ASSERT(m_iconv != (iconv_t)-1); 674 } 675 ~Iconv() { 676 iconv_close(m_iconv); 677 } 678 size_t operator()(char** inbuf, size_t* inlen, char** outbuf, size_t* outlen) { 679 return iconv(m_iconv, inbuf, inlen, outbuf, outlen); 680 } 681}; 682} 683 684utf16string 685toUtf16(const char *str, size_t len) 686{ 687 utf16string result; 688 if (len == 0u) 689 return result; 690 result.resize(len); // way enough (paired surrogate also) 691 size_t out_left = len * sizeof(utf16string::value_type); 692 char *out_buf = (char *)&result[0]; 693 Iconv conv("UTF-8", "UTF-16LE"); 694 size_t n = conv((char **)&str, &len, &out_buf, &out_left); 695 if (n == (size_t)-1) { 696 MORDOR_ASSERT(errno != E2BIG); 697 MORDOR_THROW_EXCEPTION(InvalidUnicodeException()); 698 } 699 MORDOR_ASSERT(out_left % sizeof(utf16string::value_type) == 0); 700 result.resize(result.size() - out_left/sizeof(utf16string::value_type)); 701 return result; 702} 703 704utf16string 705toUtf16(const std::string &str) 706{ 707 return toUtf16(str.data(), str.size()); 708} 709 710#endif 711 712std::string 713toUtf8(utf16char character) 714{ 715 return toUtf8((utf32char)character); 716} 717 718std::string 719toUtf8(utf32char character) 720{ 721 MORDOR_ASSERT(character <= 0x10ffff); 722 std::string result; 723 if (character <= 0x7f) { 724 result.append(1, (char)character); 725 } else if (character <= 0x7ff) { 726 result.resize(2); 727 result[0] = 0xc0 | ((character >> 6) & 0x1f); 728 result[1] = 0x80 | (character & 0x3f); 729 } else if (character <= 0xffff) { 730 result.resize(3); 731 result[0] = 0xe0 | ((character >> 12) & 0xf); 732 result[1] = 0x80 | ((character >> 6) & 0x3f); 733 result[2] = 0x80 | (character & 0x3f); 734 } else { 735 result.resize(4); 736 result[0] = 0xf0 | ((character >> 18) & 0x7); 737 result[1] = 0x80 | ((character >> 12) & 0x3f); 738 result[2] = 0x80 | ((character >> 6) & 0x3f); 739 result[3] = 0x80 | (character & 0x3f); 740 } 741 return result; 742} 743 744utf32char 745toUtf32(utf16char highSurrogate, utf16char lowSurrogate) 746{ 747 MORDOR_ASSERT(isHighSurrogate(highSurrogate)); 748 MORDOR_ASSERT(isLowSurrogate(lowSurrogate)); 749 return ((((utf32char)highSurrogate - 0xd800) << 10) | ((utf32char)lowSurrogate - 0xdc00)) + 0x10000; 750} 751 752std::string 753toUtf8(utf16char highSurrogate, utf16char lowSurrogate) 754{ 755 return toUtf8(toUtf32(highSurrogate, lowSurrogate)); 756} 757 758bool isHighSurrogate(utf16char character) 759{ 760 return character >= 0xd800 && character <= 0xdbff; 761} 762 763bool isLowSurrogate(utf16char character) 764{ 765 return character >= 0xdc00 && character <= 0xdfff; 766} 767 768// following content is nearly copied from glib completely. 769// get more info, please refer to https://git.gnome.org/browse/glib/tree/glib/gutf8.c, 770// as well as http://en.wikipedia.org/wiki/UTF-8 771typedef unsigned char guchar; 772 773bool 774validateUtf8(const std::string &str) 775{ 776 unsigned int val = 0; 777 unsigned int min = 0; 778 const char *begin = str.data(); 779 const size_t len = str.size(); 780 781#define CONTINUATION_CHAR \ 782 do { \ 783 if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \ 784 return (false); \ 785 val <<= 6; \ 786 val |= (*(guchar *)p) & 0x3f; \ 787 } while(0) 788 789#define CONTINUATION_CHARS(Count) \ 790 for(int i = 0; i < Count; i++) {\ 791 pos++; \ 792 if (pos >= len) \ 793 return false; \ 794 p++; \ 795 CONTINUATION_CHAR; \ 796 } 797 798 size_t pos = 0; 799 for (const char *p = begin; pos < len; pos++, p = begin + pos) { 800 if (*(guchar *)p < 128) 801 /* done */; 802 else { 803 if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ { 804 if ((*(guchar *)p & 0x1e) == 0) 805 return false; 806 pos++; 807 if (pos >= len) 808 return false; 809 p++; 810 if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ 811 return false; 812 } else { 813 if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ { 814 min = (1 << 11); 815 val = *(guchar *)p & 0x0f; 816 CONTINUATION_CHARS(2); 817 } else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ { 818 min = (1 << 16); 819 val = *(guchar *)p & 0x07; 820 CONTINUATION_CHARS(3); 821 } else if ((*(guchar *)p & 0xfc) == 0xf8) /* 111110xx */ { 822 min = (1 << 21); 823 val = *(guchar *)p & 0x03; 824 CONTINUATION_CHARS(4); 825 } else if ((*(guchar *)p & 0xfe) == 0xfc) /* 1111110x */ { 826 min = (1 << 26); 827 val = *(guchar *)p & 0x01; 828 CONTINUATION_CHARS(5); 829 } else 830 return false; 831 832 if (val < min) 833 return false; 834 } 835 } 836 } 837 return true; 838} 839 840bool 841caseinsensitiveless::operator ()(const std::string &lhs, const std::string &rhs) const 842{ 843 return stricmp(lhs.c_str(), rhs.c_str()) < 0; 844} 845 846std::ostream &operator <<(std::ostream &os, const charslice &slice) 847{ 848 for (size_t i = 0; i < slice.m_len; ++i) { 849 os.put(slice.m_slice[i]); 850 } 851 return os; 852} 853 854}