PageRenderTime 48ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/mordor/string.cpp

http://github.com/mozy/mordor
C++ | 854 lines | 754 code | 88 blank | 12 comment | 161 complexity | 709add1edbe3dac740180b2e2b727250 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. // Copyright (c) 2009 - Mozy, Inc.
  2. #include <algorithm>
  3. #include <string.h>
  4. #include <openssl/md5.h>
  5. #include <openssl/sha.h>
  6. #ifdef HAVE_CONFIG_H
  7. #include "autoconfig.h"
  8. #ifdef HAVE_ICONV
  9. #include <iconv.h>
  10. #endif
  11. #endif
  12. #include "mordor/string.h"
  13. #include "mordor/util.h"
  14. #include "assert.h"
  15. #include "exception.h"
  16. #ifdef MSVC
  17. #pragma comment(lib, "libeay32")
  18. #endif
  19. namespace Mordor {
  // Decodes base64 text (standard alphabet, '=' padding).
  //
  // The input must consist of complete 4-character groups; '=' may only
  // appear at the very end of the input and at most twice.  Any violation
  // (bad character, short group, misplaced padding) yields an empty string.
  std::string
  base64decode(const std::string &src)
  {
      std::string decoded;
      // Upper bound on the output size; trimmed to the real size at the end.
      decoded.resize(src.size() * 3 / 4);
      char *out = &decoded[0];

      const char *cursor = src.c_str();
      const char * const stop = cursor + src.size();

      while (cursor < stop) {
          int consumed = 0;   // characters taken for this group
          int pads = 0;       // '=' characters seen in this group
          int bits = 0;       // up to 24 accumulated bits

          while (consumed < 4 && cursor < stop) {
              const char ch = *cursor;
              ++cursor;
              ++consumed;

              if (ch == '=') {
                  ++pads;
                  bits <<= 6;
                  continue;
              }
              // Once padding has started, only more '=' may follow.
              if (pads > 0)
                  return "";

              int sextet;
              if (ch >= 'A' && ch <= 'Z')
                  sextet = ch - 'A';
              else if (ch >= 'a' && ch <= 'z')
                  sextet = ch - 'a' + 26;
              else if (ch >= '0' && ch <= '9')
                  sextet = ch - '0' + 52;
              else if (ch == '+')
                  sextet = 62;
              else if (ch == '/')
                  sextet = 63;
              else
                  return ""; // invalid character
              bits = (bits << 6) | sextet;
          }

          // Reject short groups, padding before the end of input, and
          // groups with more than two '='.
          if (consumed != 4 || (pads > 0 && cursor != stop) || pads > 2)
              return "";

          *out++ = (char)((bits >> 16) & 0xff);
          if (pads < 2)
              *out++ = (char)((bits >> 8) & 0xff);
          if (pads == 0)
              *out++ = (char)(bits & 0xff);
      }

      decoded.resize(out - decoded.c_str());
      return decoded;
  }
  71. std::string
  72. base64encode(const std::string& data)
  73. {
  74. return base64encode(data.c_str(), data.size());
  75. }
  // Base64-encodes len bytes starting at data (standard alphabet, '='
  // padding so that the output length is always a multiple of four).
  //
  // data: pointer to the raw bytes; only dereferenced when len > 0.
  // len:  number of bytes to encode.
  // Returns the encoded text; an empty string for len == 0.
  std::string
  base64encode(const void* data, size_t len)
  {
      static const char alphabet[] =
          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

      std::string ret;
      // Every started group of 3 input bytes produces exactly 4 output
      // characters, so reserve the exact final size up front.
      ret.reserve((len / 3 + (len % 3 != 0 ? 1 : 0)) * 4);

      const unsigned char *ptr = static_cast<const unsigned char *>(data);
      const unsigned char * const end = ptr + len;

      while (ptr < end) {
          unsigned int packed = 0;
          int taken = 0;
          for (; taken < 3 && ptr < end; ++taken, ++ptr)
              packed = (packed << 8) | *ptr;

          // A short final group is left-aligned within the 24 bits; each
          // missing input byte becomes one '=' in the output.
          const int padding = 3 - taken;
          packed <<= 8 * padding;

          ret += alphabet[packed >> 18];
          ret += alphabet[(packed >> 12) & 0x3f];
          if (padding < 2)
              ret += alphabet[(packed >> 6) & 0x3f];
          if (padding == 0)
              ret += alphabet[packed & 0x3f];
          ret.append(padding, '=');
      }
      return ret;
  }
  107. std::string
  108. md5(const std::string &data)
  109. {
  110. return hexstringFromData(md5sum(data).c_str(), MD5_DIGEST_LENGTH);
  111. }
  112. std::string
  113. sha1(const std::string &data)
  114. {
  115. return hexstringFromData(sha1sum(data).c_str(), SHA_DIGEST_LENGTH);
  116. }
  117. std::string
  118. md5sum(const void *data, size_t len)
  119. {
  120. MD5_CTX ctx;
  121. MD5_Init(&ctx);
  122. MD5_Update(&ctx, data, len);
  123. std::string result;
  124. result.resize(MD5_DIGEST_LENGTH);
  125. MD5_Final((unsigned char*)&result[0], &ctx);
  126. return result;
  127. }
  128. std::string
  129. md5sum(const std::string &data)
  130. {
  131. return md5sum(data.c_str(), data.size());
  132. }
  133. std::string
  134. sha0sum(const void *data, size_t len)
  135. {
  136. SHA_CTX ctx;
  137. SHA_Init(&ctx);
  138. SHA_Update(&ctx, data, len);
  139. std::string result;
  140. result.resize(SHA_DIGEST_LENGTH);
  141. SHA_Final((unsigned char*)&result[0], &ctx);
  142. return result;
  143. }
  144. std::string
  145. sha0sum(const std::string & data)
  146. {
  147. return sha0sum(data.c_str(), data.length());
  148. }
  149. std::string
  150. sha1sum(const void *data, size_t len)
  151. {
  152. SHA_CTX ctx;
  153. SHA1_Init(&ctx);
  154. SHA1_Update(&ctx, data, len);
  155. std::string result;
  156. result.resize(SHA_DIGEST_LENGTH);
  157. SHA1_Final((unsigned char*)&result[0], &ctx);
  158. return result;
  159. }
  160. std::string
  161. sha1sum(const std::string &data)
  162. {
  163. return sha1sum(data.c_str(), data.size());
  164. }
  // Unary functor that XORs each character it is given with a fixed mask.
  // Used to derive the HMAC inner/outer pads from the key block.
  struct xorStruct
  {
      char m_value;   // the mask applied by operator()

      xorStruct(char value)
          : m_value(value)
      {}

      char operator()(char in) const
      {
          return static_cast<char>(in ^ m_value);
      }
  };
  171. template <class CTX,
  172. int (*Init)(CTX *),
  173. int (*Update)(CTX *, const void *, size_t),
  174. int (*Final)(unsigned char *, CTX *),
  175. unsigned int B, unsigned int L>
  176. std::string
  177. hmac(const std::string &text, const std::string &key)
  178. {
  179. std::string keyLocal = key;
  180. CTX ctx;
  181. if (keyLocal.size() > B) {
  182. Init(&ctx);
  183. Update(&ctx, keyLocal.c_str(), keyLocal.size());
  184. keyLocal.resize(L);
  185. Final((unsigned char *)&keyLocal[0], &ctx);
  186. }
  187. keyLocal.append(B - keyLocal.size(), '\0');
  188. std::string ipad = keyLocal, opad = keyLocal;
  189. std::transform(ipad.begin(), ipad.end(), ipad.begin(), xorStruct(0x36));
  190. std::transform(opad.begin(), opad.end(), opad.begin(), xorStruct(0x5c));
  191. Init(&ctx);
  192. Update(&ctx, ipad.c_str(), B);
  193. Update(&ctx, text.c_str(), text.size());
  194. std::string result;
  195. result.resize(L);
  196. Final((unsigned char *)&result[0], &ctx);
  197. Init(&ctx);
  198. Update(&ctx, opad.c_str(), B);
  199. Update(&ctx, result.c_str(), L);
  200. Final((unsigned char *)&result[0], &ctx);
  201. return result;
  202. }
  203. std::string
  204. hmacMd5(const std::string &text, const std::string &key)
  205. {
  206. return hmac<MD5_CTX,
  207. &MD5_Init,
  208. &MD5_Update,
  209. &MD5_Final,
  210. MD5_CBLOCK, MD5_DIGEST_LENGTH>
  211. (text, key);
  212. }
  213. std::string
  214. hmacSha1(const std::string &text, const std::string &key)
  215. {
  216. return hmac<SHA_CTX,
  217. &SHA1_Init,
  218. &SHA1_Update,
  219. &SHA1_Final,
  220. SHA_CBLOCK, SHA_DIGEST_LENGTH>
  221. (text, key);
  222. }
  223. std::string
  224. hmacSha256(const std::string &text, const std::string &key)
  225. {
  226. return hmac<SHA256_CTX,
  227. &SHA256_Init,
  228. &SHA256_Update,
  229. &SHA256_Final,
  230. SHA256_CBLOCK, SHA256_DIGEST_LENGTH>
  231. (text, key);
  232. }
  // Writes the lowercase hexadecimal form of len bytes at data into output.
  //
  // output must have room for 2 * len characters; no terminating NUL is
  // written.
  void
  hexstringFromData(const void *data, size_t len, char *output)
  {
      static const char digits[] = "0123456789abcdef";

      const unsigned char *in = static_cast<const unsigned char *>(data);
      const unsigned char * const stop = in + len;
      for (; in != stop; ++in) {
          *output++ = digits[*in >> 4];    // high nibble first
          *output++ = digits[*in & 0xf];
      }
  }
  248. std::string
  249. hexstringFromData(const void *data, size_t len)
  250. {
  251. if (len == 0)
  252. return std::string();
  253. std::string result;
  254. result.resize(len * 2);
  255. hexstringFromData(data, len, &result[0]);
  256. return result;
  257. }
  258. std::string
  259. hexstringFromData(const std::string &data)
  260. {
  261. return hexstringFromData(data.c_str(), data.size());
  262. }
  263. void
  264. dataFromHexstring(const char *hexstring, size_t length, void *output)
  265. {
  266. unsigned char *buf = (unsigned char *)output;
  267. unsigned char byte;
  268. if (length % 2 != 0)
  269. MORDOR_THROW_EXCEPTION(std::invalid_argument("length"));
  270. for (size_t i = 0; i < length; ++i) {
  271. switch (hexstring[i]) {
  272. case 'a':
  273. case 'b':
  274. case 'c':
  275. case 'd':
  276. case 'e':
  277. case 'f':
  278. byte = (hexstring[i] - 'a' + 10) << 4;
  279. break;
  280. case 'A':
  281. case 'B':
  282. case 'C':
  283. case 'D':
  284. case 'E':
  285. case 'F':
  286. byte = (hexstring[i] - 'A' + 10) << 4;
  287. break;
  288. case '0':
  289. case '1':
  290. case '2':
  291. case '3':
  292. case '4':
  293. case '5':
  294. case '6':
  295. case '7':
  296. case '8':
  297. case '9':
  298. byte = (hexstring[i] - '0') << 4;
  299. break;
  300. default:
  301. MORDOR_THROW_EXCEPTION(std::invalid_argument("hexstring"));
  302. }
  303. ++i;
  304. switch (hexstring[i]) {
  305. case 'a':
  306. case 'b':
  307. case 'c':
  308. case 'd':
  309. case 'e':
  310. case 'f':
  311. byte |= hexstring[i] - 'a' + 10;
  312. break;
  313. case 'A':
  314. case 'B':
  315. case 'C':
  316. case 'D':
  317. case 'E':
  318. case 'F':
  319. byte |= hexstring[i] - 'A' + 10;
  320. break;
  321. case '0':
  322. case '1':
  323. case '2':
  324. case '3':
  325. case '4':
  326. case '5':
  327. case '6':
  328. case '7':
  329. case '8':
  330. case '9':
  331. byte |= hexstring[i] - '0';
  332. break;
  333. default:
  334. MORDOR_THROW_EXCEPTION(std::invalid_argument("hexstring"));
  335. }
  336. *buf++ = byte;
  337. }
  338. }
  339. std::string
  340. dataFromHexstring(const char *hexstring, size_t length)
  341. {
  342. if (length % 2 != 0)
  343. MORDOR_THROW_EXCEPTION(std::invalid_argument("length"));
  344. if (length == 0)
  345. return std::string();
  346. std::string result;
  347. result.resize(length / 2);
  348. dataFromHexstring(hexstring, length, &result[0]);
  349. return result;
  350. }
  351. std::string
  352. dataFromHexstring(const std::string &hexstring)
  353. {
  354. return dataFromHexstring(hexstring.c_str(), hexstring.size());
  355. }
  // Replaces, in place, every occurrence of the character find in str with
  // the character replaceWith.
  void
  replace(std::string &str, char find, char replaceWith)
  {
      for (std::string::iterator it = str.begin(); it != str.end(); ++it) {
          if (*it == find)
              *it = replaceWith;
      }
  }
  // Replaces, in place, every occurrence of the character find in str with
  // the string replaceWith.  Inserted text is never rescanned, so the
  // replacement may itself contain find.
  //
  // The result is assembled in a single pass into a separate buffer rather
  // than rebuilding the whole string once per match, so the cost is linear
  // in the size of the output.  Because the output is built separately,
  // replaceWith may safely alias str (the original contents are used).
  void
  replace(std::string &str, char find, const std::string &replaceWith)
  {
      size_t index = str.find(find);
      if (index == std::string::npos)
          return;     // nothing to do; leave str untouched

      std::string rebuilt;
      rebuilt.reserve(str.size());

      size_t last = 0;    // start of the not-yet-copied tail of str
      do {
          rebuilt.append(str, last, index - last);
          rebuilt.append(replaceWith);
          last = index + 1;
          index = str.find(find, last);
      } while (index != std::string::npos);
      rebuilt.append(str, last, std::string::npos);

      str.swap(rebuilt);
  }
  // Replaces, in place, every non-overlapping occurrence of the substring
  // find in str with replaceWith, scanning left to right.  Inserted text is
  // never rescanned, so replaceWith may itself contain find.
  //
  // An empty find matches nothing and leaves str unchanged (searching for
  // the empty string would otherwise match at every position and never
  // terminate).
  //
  // The result is assembled in a single pass into a separate buffer, so the
  // cost is linear in the size of the output and the arguments may safely
  // alias str.
  void
  replace(std::string &str, const std::string &find, const std::string &replaceWith)
  {
      if (find.empty())
          return;

      size_t index = str.find(find);
      if (index == std::string::npos)
          return;     // nothing to do; leave str untouched

      std::string rebuilt;
      rebuilt.reserve(str.size());

      size_t last = 0;    // start of the not-yet-copied tail of str
      do {
          rebuilt.append(str, last, index - last);
          rebuilt.append(replaceWith);
          last = index + find.size();
          index = str.find(find, last);
      } while (index != std::string::npos);
      rebuilt.append(str, last, std::string::npos);

      str.swap(rebuilt);
  }
  383. std::vector<std::string>
  384. split(const std::string &str, char delim, size_t max)
  385. {
  386. MORDOR_ASSERT(max > 1);
  387. std::vector<std::string> result;
  388. if (str.empty())
  389. return result;
  390. size_t last = 0;
  391. size_t pos = str.find(delim);
  392. while (pos != std::string::npos) {
  393. result.push_back(str.substr(last, pos - last));
  394. last = pos + 1;
  395. if (--max == 1)
  396. break;
  397. pos = str.find(delim, last);
  398. }
  399. result.push_back(str.substr(last));
  400. return result;
  401. }
  402. std::vector<std::string>
  403. split(const std::string &str, const char *delims, size_t max)
  404. {
  405. MORDOR_ASSERT(max > 1);
  406. std::vector<std::string> result;
  407. if (str.empty())
  408. return result;
  409. size_t last = 0;
  410. size_t pos = str.find_first_of(delims);
  411. while (pos != std::string::npos) {
  412. result.push_back(str.substr(last, pos - last));
  413. last = pos + 1;
  414. if (--max == 1)
  415. break;
  416. pos = str.find_first_of(delims, last);
  417. }
  418. result.push_back(str.substr(last));
  419. return result;
  420. }
  421. static bool endsWith(const std::string &string, const std::string &suffix)
  422. {
  423. return string.size() >= suffix.size() &&
  424. strnicmp(string.c_str() + string.size() - suffix.size(),
  425. suffix.c_str(), suffix.size()) == 0;
  426. }
  427. namespace {
  428. struct Suffix
  429. {
  430. std::string suffix;
  431. unsigned long long multiplier;
  432. };
  433. }
  434. unsigned long long stringToMicroseconds(const std::string &string)
  435. {
  436. static const Suffix suffixes[] = {
  437. { "microseconds", 1ull },
  438. { "us", 1ull },
  439. { "milliseconds", 1000ull },
  440. { "ms", 1000ull },
  441. { "seconds", 1000000ull },
  442. { "minutes", 60 * 1000000ull },
  443. { "m", 60 * 1000000ull },
  444. { "hours", 60 * 60 * 1000000ull },
  445. { "h", 60 * 60 * 1000000ull },
  446. { "days", 24 * 60 * 60 * 1000000ull },
  447. { "d", 24 * 60 * 60 * 1000000ull },
  448. // s needs to go at the bottom since we're just suffix matching, and it
  449. // would give a false positive for "minutes", etc.
  450. { "s", 1000000ull }
  451. };
  452. std::string copy(string);
  453. unsigned long long multiplier = 1ull;
  454. // Strip leading whitespace
  455. while (copy.size() > 1 && copy[0] == ' ')
  456. copy = copy.substr(1);
  457. // Strip trailing whitespace
  458. while (copy.size() > 1 && copy[copy.size() -1] == ' ')
  459. copy.resize(copy.size() - 1);
  460. for (size_t i = 0; i < sizeof(suffixes)/sizeof(suffixes[0]); ++i) {
  461. if (endsWith(copy, suffixes[i].suffix)) {
  462. multiplier = suffixes[i].multiplier;
  463. copy.resize(copy.size() - suffixes[i].suffix.size());
  464. break;
  465. }
  466. }
  467. // Strip whitespace between the number and the units
  468. while (copy.size() > 1 && copy[copy.size() -1] == ' ')
  469. copy.resize(copy.size() - 1);
  470. // If there's a decimal point, use floating point arithmetic
  471. if (copy.find('.') != std::string::npos)
  472. return (unsigned long long)(multiplier *
  473. boost::lexical_cast<double>(copy));
  474. else
  475. return multiplier * boost::lexical_cast<unsigned long long>(copy);
  476. }
  477. #ifdef WINDOWS
  478. static DWORD g_wcFlags = WC_ERR_INVALID_CHARS;
  479. static DWORD g_mbFlags = MB_ERR_INVALID_CHARS;
  480. std::string
  481. toUtf8(const utf16char *str, size_t len)
  482. {
  483. if (len == (size_t)~0)
  484. len = wcslen(str);
  485. MORDOR_ASSERT(len < 0x80000000u);
  486. std::string result;
  487. if (len == 0)
  488. return result;
  489. int ret = WideCharToMultiByte(CP_UTF8, g_wcFlags, str, (int)len, NULL, 0, NULL, NULL);
  490. MORDOR_ASSERT(ret >= 0);
  491. if (ret == 0) {
  492. if (lastError() == ERROR_INVALID_FLAGS) {
  493. g_wcFlags = 0;
  494. ret = WideCharToMultiByte(CP_UTF8, g_wcFlags, str, (int)len, NULL, 0, NULL, NULL);
  495. MORDOR_ASSERT(ret >= 0);
  496. }
  497. if (ret == 0)
  498. MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("WideCharToMultiByte");
  499. }
  500. result.resize(ret);
  501. ret = WideCharToMultiByte(CP_UTF8, g_wcFlags, str, (int)len, &result[0], ret, NULL, NULL);
  502. MORDOR_ASSERT(ret >= 0);
  503. if (ret == 0)
  504. MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("WideCharToMultiByte");
  505. MORDOR_ASSERT(ret == result.size());
  506. return result;
  507. }
  508. std::string
  509. toUtf8(const std::wstring &str)
  510. {
  511. MORDOR_ASSERT(str.size() < 0x80000000u);
  512. return toUtf8(str.c_str(), str.size());
  513. }
  514. utf16string
  515. toUtf16(const char *str, size_t len)
  516. {
  517. if (len == (size_t)~0)
  518. len = strlen(str);
  519. MORDOR_ASSERT(len < 0x80000000u);
  520. utf16string result;
  521. if (len == 0)
  522. return result;
  523. int ret = MultiByteToWideChar(CP_UTF8, g_mbFlags, str, (int)len, NULL, 0);
  524. MORDOR_ASSERT(ret >= 0);
  525. if (ret == 0) {
  526. if (lastError() == ERROR_INVALID_FLAGS) {
  527. g_mbFlags = 0;
  528. ret = MultiByteToWideChar(CP_UTF8, g_mbFlags, str, (int)len, NULL, 0);
  529. MORDOR_ASSERT(ret >= 0);
  530. }
  531. if (ret == 0)
  532. MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("MultiByteToWideChar");
  533. }
  534. result.resize(ret);
  535. ret = MultiByteToWideChar(CP_UTF8, g_mbFlags, str, (int)len, &result[0], ret);
  536. if (ret == 0)
  537. MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("MultiByteToWideChar");
  538. MORDOR_ASSERT(ret == result.size());
  539. return result;
  540. }
  541. utf16string
  542. toUtf16(const std::string &str)
  543. {
  544. MORDOR_ASSERT(str.size() < 0x80000000u);
  545. return toUtf16(str.c_str(), str.size());
  546. }
  547. #elif defined (OSX)
  548. std::string
  549. toUtf8(CFStringRef string)
  550. {
  551. const char *bytes = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
  552. if (bytes)
  553. return bytes;
  554. std::string result;
  555. CFIndex length = CFStringGetLength(string);
  556. // Include extra byte for null termination
  557. length = CFStringGetMaximumSizeForEncoding(length, kCFStringEncodingUTF8) + 1;
  558. result.resize(length);
  559. if (!CFStringGetCString(string, &result[0], length, kCFStringEncodingUTF8)) {
  560. MORDOR_NOTREACHED();
  561. }
  562. result.resize(strlen(result.c_str()));
  563. return result;
  564. }
  565. utf16string
  566. toUtf16(const char * str, size_t length)
  567. {
  568. utf16string result;
  569. if (length == 0u)
  570. return result;
  571. ScopedCFRef<CFStringRef> cfUtf8Str = CFStringCreateWithBytesNoCopy(NULL,
  572. (const UInt8 *)str, (CFIndex)length, kCFStringEncodingUTF8, false,
  573. kCFAllocatorNull);
  574. if (!cfUtf8Str)
  575. MORDOR_THROW_EXCEPTION(InvalidUnicodeException());
  576. #if MORDOR_BYTE_ORDER == MORDOR_LITTLE_ENDIAN
  577. ScopedCFRef<CFDataRef> cfUtf16Data = CFStringCreateExternalRepresentation(
  578. NULL, cfUtf8Str, kCFStringEncodingUTF16LE, 0);
  579. #elif MORDOR_BYTE_ORDER == MORDOR_BIG_ENDIAN
  580. ScopedCFRef<CFDataRef> cfUtf16Data = CFStringCreateExternalRepresentation(
  581. NULL, cfUtf8Str, kCFStringEncodingUTF16BE, 0);
  582. #endif
  583. MORDOR_ASSERT(cfUtf16Data);
  584. MORDOR_ASSERT(CFDataGetLength(cfUtf16Data) % sizeof(utf16char) == 0);
  585. result.resize(CFDataGetLength(cfUtf16Data) / sizeof(utf16char));
  586. CFDataGetBytes(cfUtf16Data, CFRangeMake(0,CFDataGetLength(cfUtf16Data)),
  587. (UInt8 *)&result[0]);
  588. return result;
  589. }
  590. utf16string
  591. toUtf16(const std::string &str)
  592. {
  593. return toUtf16(str.c_str(), str.size());
  594. }
  595. #elif defined(HAVE_ICONV)
  596. namespace {
  597. class Iconv {
  598. iconv_t m_iconv;
  599. public:
  600. Iconv(const char* from, const char* to)
  601. : m_iconv(iconv_open(to, from))
  602. {
  603. MORDOR_ASSERT(m_iconv != (iconv_t)-1);
  604. }
  605. ~Iconv() {
  606. iconv_close(m_iconv);
  607. }
  608. size_t operator()(char** inbuf, size_t* inlen, char** outbuf, size_t* outlen) {
  609. return iconv(m_iconv, inbuf, inlen, outbuf, outlen);
  610. }
  611. };
  612. }
  613. utf16string
  614. toUtf16(const char *str, size_t len)
  615. {
  616. utf16string result;
  617. if (len == 0u)
  618. return result;
  619. result.resize(len); // way enough (paired surrogate also)
  620. size_t out_left = len * sizeof(utf16string::value_type);
  621. char *out_buf = (char *)&result[0];
  622. Iconv conv("UTF-8", "UTF-16LE");
  623. size_t n = conv((char **)&str, &len, &out_buf, &out_left);
  624. if (n == (size_t)-1) {
  625. MORDOR_ASSERT(errno != E2BIG);
  626. MORDOR_THROW_EXCEPTION(InvalidUnicodeException());
  627. }
  628. MORDOR_ASSERT(out_left % sizeof(utf16string::value_type) == 0);
  629. result.resize(result.size() - out_left/sizeof(utf16string::value_type));
  630. return result;
  631. }
  632. utf16string
  633. toUtf16(const std::string &str)
  634. {
  635. return toUtf16(str.data(), str.size());
  636. }
  637. #endif
  638. std::string
  639. toUtf8(utf16char character)
  640. {
  641. return toUtf8((utf32char)character);
  642. }
  643. std::string
  644. toUtf8(utf32char character)
  645. {
  646. MORDOR_ASSERT(character <= 0x10ffff);
  647. std::string result;
  648. if (character <= 0x7f) {
  649. result.append(1, (char)character);
  650. } else if (character <= 0x7ff) {
  651. result.resize(2);
  652. result[0] = 0xc0 | ((character >> 6) & 0x1f);
  653. result[1] = 0x80 | (character & 0x3f);
  654. } else if (character <= 0xffff) {
  655. result.resize(3);
  656. result[0] = 0xe0 | ((character >> 12) & 0xf);
  657. result[1] = 0x80 | ((character >> 6) & 0x3f);
  658. result[2] = 0x80 | (character & 0x3f);
  659. } else {
  660. result.resize(4);
  661. result[0] = 0xf0 | ((character >> 18) & 0x7);
  662. result[1] = 0x80 | ((character >> 12) & 0x3f);
  663. result[2] = 0x80 | ((character >> 6) & 0x3f);
  664. result[3] = 0x80 | (character & 0x3f);
  665. }
  666. return result;
  667. }
  668. utf32char
  669. toUtf32(utf16char highSurrogate, utf16char lowSurrogate)
  670. {
  671. MORDOR_ASSERT(isHighSurrogate(highSurrogate));
  672. MORDOR_ASSERT(isLowSurrogate(lowSurrogate));
  673. return ((((utf32char)highSurrogate - 0xd800) << 10) | ((utf32char)lowSurrogate - 0xdc00)) + 0x10000;
  674. }
  675. std::string
  676. toUtf8(utf16char highSurrogate, utf16char lowSurrogate)
  677. {
  678. return toUtf8(toUtf32(highSurrogate, lowSurrogate));
  679. }
  680. bool isHighSurrogate(utf16char character)
  681. {
  682. return character >= 0xd800 && character <= 0xdbff;
  683. }
  684. bool isLowSurrogate(utf16char character)
  685. {
  686. return character >= 0xdc00 && character <= 0xdfff;
  687. }
  // The validation below is adapted from glib's gutf8.c
  // (https://git.gnome.org/browse/glib/tree/glib/gutf8.c); see also
  // http://en.wikipedia.org/wiki/UTF-8 for the byte layouts.
  typedef unsigned char guchar;

  // Checks that str is structurally well-formed UTF-8.
  //
  // Accepted: 1-byte sequences (including NUL) and multi-byte sequences of
  // 2 to 6 bytes whose continuation bytes all have the form 10xxxxxx and
  // which are not overlong (i.e. could not have been encoded in fewer
  // bytes).  Truncated sequences, stray continuation bytes and the lead
  // bytes 0xfe / 0xff are rejected.  No further range checks are made on
  // the decoded value.
  bool
  validateUtf8(const std::string &str)
  {
      const guchar *bytes = reinterpret_cast<const guchar *>(str.data());
      const size_t len = str.size();

      size_t pos = 0;
      while (pos < len) {
          const guchar lead = bytes[pos];

          // 0xxxxxxx: plain single byte.
          if (lead < 128) {
              ++pos;
              continue;
          }

          // 110xxxxx: two-byte sequence.
          if ((lead & 0xe0) == 0xc0) {
              if ((lead & 0x1e) == 0)     // overlong encoding of < 0x80
                  return false;
              if (++pos >= len)
                  return false;
              if ((bytes[pos] & 0xc0) != 0x80) /* 10xxxxxx */
                  return false;
              ++pos;
              continue;
          }

          // Longer sequences: work out how many continuation bytes follow
          // and the smallest value that legitimately needs this length.
          unsigned int val;
          unsigned int min;
          int trailing;
          if ((lead & 0xf0) == 0xe0) /* 1110xxxx */ {
              min = (1 << 11);
              val = lead & 0x0f;
              trailing = 2;
          } else if ((lead & 0xf8) == 0xf0) /* 11110xxx */ {
              min = (1 << 16);
              val = lead & 0x07;
              trailing = 3;
          } else if ((lead & 0xfc) == 0xf8) /* 111110xx */ {
              min = (1 << 21);
              val = lead & 0x03;
              trailing = 4;
          } else if ((lead & 0xfe) == 0xfc) /* 1111110x */ {
              min = (1 << 26);
              val = lead & 0x01;
              trailing = 5;
          } else {
              return false;
          }

          for (int i = 0; i < trailing; i++) {
              if (++pos >= len)
                  return false;           // sequence cut short
              const guchar cont = bytes[pos];
              if ((cont & 0xc0) != 0x80) /* 10xxxxxx */
                  return false;
              val = (val << 6) | (cont & 0x3f);
          }

          if (val < min)                  // overlong encoding
              return false;
          ++pos;
      }
      return true;
  }
  754. bool
  755. caseinsensitiveless::operator ()(const std::string &lhs, const std::string &rhs) const
  756. {
  757. return stricmp(lhs.c_str(), rhs.c_str()) < 0;
  758. }
  759. std::ostream &operator <<(std::ostream &os, const charslice &slice)
  760. {
  761. for (size_t i = 0; i < slice.m_len; ++i) {
  762. os.put(slice.m_slice[i]);
  763. }
  764. return os;
  765. }
  766. }