PageRenderTime 97ms CodeModel.GetById 20ms RepoModel.GetById 6ms app.codeStats 0ms

/runtime/internal/util/utf.d

https://bitbucket.org/prokhin_alexey/ldc2/
D | 851 lines | 660 code | 118 blank | 73 comment | 127 complexity | a5f0c6d28ffc2c8a1adc4d15bd029142 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0
  1. // utf.d
  2. /*
  3. * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
  4. * Written by Walter Bright
  5. *
  6. * This software is provided 'as-is', without any express or implied
  7. * warranty. In no event will the authors be held liable for any damages
  8. * arising from the use of this software.
  9. *
  10. * Permission is granted to anyone to use this software for any purpose,
  11. * including commercial applications, and to alter it and redistribute it
  12. * freely, subject to the following restrictions:
  13. *
  14. * o The origin of this software must not be misrepresented; you must not
  15. * claim that you wrote the original software. If you use this software
  16. * in a product, an acknowledgment in the product documentation would be
  17. * appreciated but is not required.
  18. * o Altered source versions must be plainly marked as such, and must not
  19. * be misrepresented as being the original software.
  20. * o This notice may not be removed or altered from any source
  21. * distribution.
  22. */
  23. // Description of UTF-8 at:
  24. // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
  25. // http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
  26. module util.utf;
  27. extern (C) void onUnicodeError( char[] msg, size_t idx );
  /**
   * Tells whether c may be used as a Unicode code point: it must not exceed
   * 0x10FFFF and must not fall in the surrogate range 0xD800 .. 0xDFFF.
   *
   * Note: FFFE and FFFF are specifically permitted by the Unicode standard
   * for application internal use, but are not allowed for interchange, so
   * they are accepted here (thanks to Arcane Jill).
   */
  bool isValidDchar(dchar c)
  {
      // Everything below the surrogate block is fine.
      if (c < 0xD800)
          return true;

      // Otherwise it has to sit above the surrogates and inside the
      // 21-bit Unicode range.
      return c > 0xDFFF && c <= 0x10FFFF;
  }
  unittest
  {
      debug(utf) printf("utf.isValidDchar.unittest\n");

      // An ordinary ASCII letter is accepted ...
      dchar accepted = cast(dchar)'a';
      assert(isValidDchar(accepted) == true);

      // ... while a value beyond 0x10FFFF is refused.
      dchar refused = cast(dchar)0x1FFFFF;
      assert(isValidDchar(refused) == false);
  }
  /* Length of a UTF-8 sequence, indexed by the value of its leading byte.
   * An entry of 0xFF marks a byte that cannot start a sequence; 0xFF is
   * used rather than 0 so that loops advancing by the stride cannot hang.
   */
  ubyte[256] UTF8stride =
  [
      // 0x00 .. 0x7F: single-byte (ASCII) sequences
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x00
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x10
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x20
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x30
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x40
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x50
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x60
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x70

      // 0x80 .. 0xBF: continuation bytes, never a valid start
      0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0x80
      0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0x90
      0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0xA0
      0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0xB0

      // 0xC0 .. 0xDF: two-byte leads
      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    // 0xC0
      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    // 0xD0

      // 0xE0 .. 0xEF: three-byte leads
      3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,    // 0xE0

      // 0xF0 .. 0xF7: four-byte leads; 0xF8 .. 0xFB: five; 0xFC, 0xFD: six;
      // 0xFE, 0xFF: illegal
      4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
  ];
  /**
   * Looks up the UTF8stride entry for the code unit s[i].
   *
   * Returns: the sequence length recorded for that leading byte, or 0xFF
   *          when the table marks the byte as an illegal start.
   */
  uint stride(char[] s, size_t i)
  {
      char lead = s[i];
      return UTF8stride[lead];
  }
  /**
   * Number of UTF-16 code units occupied by the character starting at s[i]:
   * 2 when s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
   */
  uint stride(wchar[] s, size_t i)
  {
      uint unit = s[i];

      if (unit >= 0xD800 && unit <= 0xDBFF)
          return 2;
      return 1;
  }
  /**
   * UTF-32 overload: every character occupies exactly one code unit, so the
   * result is always 1. Neither s nor i is inspected.
   */
  uint stride(dchar[] s, size_t i)
  {
      const uint unitsPerChar = 1;
      return unitsPerChar;
  }
  /*******************************************
   * Given an index i into an array of char's that is assumed to sit at the
   * start of a UTF character, count the UCS characters that precede it.
   *
   * onUnicodeError is called with the offending offset when an illegal
   * leading byte is met, or when stepping by whole sequences overshoots i
   * (meaning i was not on a character boundary).
   */
  size_t toUCSindex(char[] s, size_t i)
  {
      size_t count = 0;
      size_t pos = 0;
      bool illegalLead = false;

      while (pos < i)
      {
          size_t step = UTF8stride[s[pos]];
          if (step == 0xFF)
          {
              illegalLead = true;
              break;
          }
          ++count;
          pos += step;
      }

      if (illegalLead || pos > i)
          onUnicodeError("invalid UTF-8 sequence", pos);

      return count;
  }
  /**
   * UTF-16 overload: counts the characters that precede code-unit index i,
   * treating a high surrogate (0xD800 .. 0xDBFF) as the start of a two-unit
   * character.
   *
   * onUnicodeError is called when stepping over a surrogate pair overshoots
   * i, i.e. i pointed into the middle of a pair.
   */
  size_t toUCSindex(wchar[] s, size_t i)
  {
      size_t count = 0;
      size_t pos = 0;

      while (pos < i)
      {
          uint unit = s[pos];
          bool highSurrogate = unit >= 0xD800 && unit <= 0xDBFF;

          pos += highSurrogate ? 2 : 1;
          ++count;
      }

      if (pos > i)
          onUnicodeError("invalid UTF-16 sequence", pos);

      return count;
  }
  /**
   * UTF-32 overload: code-unit index and character index coincide, so i is
   * returned unchanged. The array itself is not inspected.
   */
  size_t toUCSindex(dchar[] s, size_t i)
  {
      size_t characterIndex = i;
      return characterIndex;
  }
  /******************************************
   * Given a UCS (character) index n into a UTF-8 array, return the
   * corresponding code-unit index.
   *
   * Walks n characters from the start, advancing by the UTF8stride entry of
   * each leading byte; onUnicodeError is called when a byte is marked as an
   * illegal start.
   */
  size_t toUTFindex(char[] s, size_t n)
  {
      size_t offset = 0;

      for (size_t remaining = n; remaining != 0; --remaining)
      {
          uint step = UTF8stride[s[offset]];

          if (step == 0xFF)
              onUnicodeError("invalid UTF-8 sequence", offset);
          offset += step;
      }
      return offset;
  }
  /**
   * UTF-16 overload: converts character index n into a code-unit index by
   * walking n characters from the start, stepping two units over each high
   * surrogate (0xD800 .. 0xDBFF) and one unit otherwise.
   */
  size_t toUTFindex(wchar[] s, size_t n)
  {
      size_t offset = 0;

      for (size_t remaining = n; remaining != 0; --remaining)
      {
          wchar unit = s[offset];

          if (unit >= 0xD800 && unit <= 0xDBFF)
              offset += 2;
          else
              offset += 1;
      }
      return offset;
  }
  /**
   * UTF-32 overload: character index and code-unit index coincide, so n is
   * returned unchanged. The array itself is not inspected.
   */
  size_t toUTFindex(dchar[] s, size_t n)
  {
      size_t unitIndex = n;
      return unitIndex;
  }
  /* =================== Decode ======================= */

  /**
   * Decodes the UTF-8 character that starts at s[idx].
   *
   * On success idx is advanced past the sequence and the code point is
   * returned. On any malformed input (bad lead byte, 5- or 6-byte form,
   * truncated sequence, overlong form, bad continuation byte, or a result
   * rejected by isValidDchar) onUnicodeError is called with the starting
   * offset and idx is left untouched; the value returned after that call is
   * only a placeholder.
   *
   * Encodings accepted (the 5- and 6-byte forms are refused):
   *      0xxxxxxx
   *      110xxxxx 10xxxxxx
   *      1110xxxx 10xxxxxx 10xxxxxx
   *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   */
  dchar decode(char[] s, inout size_t idx)
  in
  {
      assert(idx >= 0 && idx < s.length);
  }
  out (result)
  {
      assert(isValidDchar(result));
  }
  body
  {
      size_t total = s.length;
      size_t start = idx;
      dchar V;
      uint n;             // number of bytes in the sequence
      uint k;             // index of the continuation byte being read
      char second;
      char lead = s[start];

      if ((lead & 0x80) == 0)
      {
          // Plain ASCII: one byte, value is the byte itself.
          V = cast(dchar) lead;
          idx = start + 1;
          return V;
      }

      // Classify the lead byte by its run of leading one bits. A bare
      // continuation byte (10xxxxxx) and the 5/6-byte leads (11111xxx)
      // match none of the patterns and are rejected.
      if ((lead & 0xE0) == 0xC0)
          n = 2;
      else if ((lead & 0xF0) == 0xE0)
          n = 3;
      else if ((lead & 0xF8) == 0xF0)
          n = 4;
      else
          goto Lerr;

      // Pick off the (7 - n) payload bits of the lead byte.
      V = cast(dchar)(lead & ((1 << (7 - n)) - 1));

      if (start + (n - 1) >= total)
          goto Lerr;                  // sequence runs off the end of s

      /* Overlong forms, which are illegal:
       *      1100000x (10xxxxxx)
       *      11100000 100xxxxx (10xxxxxx)
       *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
       * The overlong 5- and 6-byte forms need no test here because their
       * lead bytes were already refused above.
       */
      second = s[start + 1];
      if ((lead & 0xFE) == 0xC0 ||
          (lead == 0xE0 && (second & 0xE0) == 0x80) ||
          (lead == 0xF0 && (second & 0xF0) == 0x80))
          goto Lerr;

      // Fold in the continuation bytes, each of which must be 10xxxxxx.
      k = 1;
      while (k != n)
      {
          char tail = s[start + k];

          if ((tail & 0xC0) != 0x80)
              goto Lerr;
          V = (V << 6) | (tail & 0x3F);
          k++;
      }

      if (!isValidDchar(V))
          goto Lerr;

      idx = start + n;
      return V;

    Lerr:
      onUnicodeError("invalid UTF-8 sequence", start);
      return V;   // dummy return
  }
  unittest
  {   size_t i;
      dchar c;

      debug(utf) printf("utf.decode.unittest\n");

      // ASCII: one byte per character, index advances by one each time.
      static char[] s1 = "abcd";
      i = 0;
      c = decode(s1, i);
      assert(c == cast(dchar)'a');
      assert(i == 1);
      c = decode(s1, i);
      assert(c == cast(dchar)'b');
      assert(i == 2);

      // Two-byte sequence.
      static char[] s2 = "\xC2\xA9";
      i = 0;
      c = decode(s2, i);
      assert(c == cast(dchar)'\u00A9');
      assert(i == 2);

      // Three-byte sequence.
      static char[] s3 = "\xE2\x89\xA0";
      i = 0;
      c = decode(s3, i);
      assert(c == cast(dchar)'\u2260');
      assert(i == 3);

      // Malformed inputs: every one of these must be rejected.
      static char[][] s4 =
      [   "\xE2\x89",                 // too short
          "\xC0\x8A",                 // overlong 2-byte form
          "\xE0\x80\x8A",             // overlong 3-byte form
          "\xF0\x80\x80\x8A",         // overlong 4-byte form
          "\xF8\x80\x80\x80\x8A",     // 5-byte form
          "\xFC\x80\x80\x80\x80\x8A", // 6-byte form
      ];

      for (int j = 0; j < s4.length; j++)
      {
          // Record the rejection in a flag and check it outside the try.
          // An assert placed inside the try would itself be caught by
          // catch (Object), so a decode() that wrongly succeeded would go
          // unnoticed.
          bool rejected = false;

          try
          {
              i = 0;
              c = decode(s4[j], i);
          }
          catch (Object o)
          {
              rejected = true;
          }
          assert(rejected);
          assert(i == 0);     // index is not advanced on failure
      }
  }
  /********************************************************/

  /**
   * Decodes the UTF-16 character that starts at s[idx].
   *
   * On success idx is advanced by one unit, or by two for a surrogate pair,
   * and the code point is returned. onUnicodeError is called with a
   * descriptive message and the starting offset, leaving idx untouched, for:
   * a high surrogate at the very end of s, a high surrogate not followed by
   * a low surrogate, a lone low surrogate, and the values 0xFFFE / 0xFFFF.
   * The value returned after that call is only a placeholder.
   */
  dchar decode(wchar[] s, inout size_t idx)
  in
  {
      assert(idx >= 0 && idx < s.length);
  }
  out (result)
  {
      assert(isValidDchar(result));
  }
  body
  {
      char[] msg;             // stays empty unless an error is detected
      size_t i = idx;
      uint u = s[i];

      if (u >= 0xD800 && u <= 0xDBFF)
      {
          // High surrogate: a low surrogate must follow.
          if (i + 1 == s.length)
          {
              msg = "surrogate UTF-16 high value past end of string";
          }
          else
          {
              uint low = s[i + 1];

              if (low < 0xDC00 || low > 0xDFFF)
              {
                  msg = "surrogate UTF-16 low value out of range";
              }
              else
              {
                  // Combine the pair; subtracting 0xD7C0 both strips the
                  // surrogate tag and adds the 0x10000 plane offset.
                  u = ((u - 0xD7C0) << 10) + (low - 0xDC00);
                  i += 2;
              }
          }
      }
      else if (u >= 0xDC00 && u <= 0xDFFF)
      {
          msg = "unpaired surrogate UTF-16 value";
      }
      else if (u == 0xFFFE || u == 0xFFFF)
      {
          msg = "illegal UTF-16 value";
      }
      else
      {
          // ASCII and every other single-unit value.
          i++;
      }

      if (msg.length != 0)
      {
          onUnicodeError(msg, i);
          return cast(dchar)u;    // dummy return
      }

      idx = i;
      return cast(dchar)u;
  }
  /********************************************************/

  /**
   * Reads the UTF-32 character at s[idx].
   *
   * When isValidDchar accepts the value, idx is advanced by one and the
   * value is returned. Otherwise onUnicodeError is called with the offset,
   * idx is left untouched, and the value returned is only a placeholder.
   */
  dchar decode(dchar[] s, inout size_t idx)
  in
  {
      assert(idx >= 0 && idx < s.length);
  }
  body
  {
      size_t at = idx;
      dchar c = s[at];

      if (isValidDchar(c))
      {
          idx = at + 1;
          return c;
      }

      onUnicodeError("invalid UTF-32 value", at);
      return c;   // dummy return
  }
  /* =================== Encode ======================= */

  /**
   * Appends the UTF-8 encoding of c (one to four bytes) to s.
   *
   * The in-contract requires isValidDchar(c).
   */
  void encode(inout char[] s, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      char[4] bytes;
      uint count;

      if (c <= 0x7F)
      {
          // 0xxxxxxx
          bytes[0] = cast(char) c;
          count = 1;
      }
      else if (c <= 0x7FF)
      {
          // 110xxxxx 10xxxxxx
          bytes[0] = cast(char)(0xC0 | (c >> 6));
          bytes[1] = cast(char)(0x80 | (c & 0x3F));
          count = 2;
      }
      else if (c <= 0xFFFF)
      {
          // 1110xxxx 10xxxxxx 10xxxxxx
          bytes[0] = cast(char)(0xE0 | (c >> 12));
          bytes[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          bytes[2] = cast(char)(0x80 | (c & 0x3F));
          count = 3;
      }
      else if (c <= 0x10FFFF)
      {
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          bytes[0] = cast(char)(0xF0 | (c >> 18));
          bytes[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
          bytes[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          bytes[3] = cast(char)(0x80 | (c & 0x3F));
          count = 4;
      }
      else
      {
          assert(0);
      }

      // Work on a local slice and store it back once the append is done.
      char[] r = s;
      r ~= bytes[0 .. count];
      s = r;
  }
  unittest
  {
      debug(utf) printf("utf.encode.unittest\n");

      char[] s = "abcd";

      // One-byte (ASCII) character.
      encode(s, cast(dchar)'a');
      assert(s == "abcda");
      assert(s.length == 5);

      // Two-byte character.
      encode(s, cast(dchar)'\u00A9');
      assert(s == "abcda\xC2\xA9");
      //assert(s == "abcda\u00A9"); // BUG: fix compiler
      assert(s.length == 7);

      // Three-byte character.
      encode(s, cast(dchar)'\u2260');
      assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
      assert(s.length == 10);
  }
  /********************************************************/

  /**
   * Appends the UTF-16 encoding of c to s: a single unit for values up to
   * 0xFFFF, otherwise a high/low surrogate pair.
   *
   * The in-contract requires isValidDchar(c).
   */
  void encode(inout wchar[] s, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      wchar[] r = s;

      if (c > 0xFFFF)
      {
          // Split the 20-bit offset above 0x10000 into two 10-bit halves.
          uint offset = c - 0x10000;
          wchar[2] pair;

          pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
          pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
          r ~= pair;
      }
      else
      {
          r ~= cast(wchar) c;
      }
      s = r;
  }
  /**
   * UTF-32 overload: appends c to s as a single element.
   *
   * The in-contract requires isValidDchar(c).
   */
  void encode(inout dchar[] s, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      dchar[] r = s;
      r ~= c;
      s = r;
  }
  /* =================== Validation ======================= */

  /**
   * Checks a UTF-8 array by decoding it from start to end; the return
   * values are discarded. Malformed input is reported by decode().
   */
  void validate(char[] s)
  {
      size_t pos = 0;

      // decode() advances pos past each character it accepts.
      while (pos < s.length)
          decode(s, pos);
  }
  /**
   * Checks a UTF-16 array by decoding it from start to end; the return
   * values are discarded. Malformed input is reported by decode().
   */
  void validate(wchar[] s)
  {
      size_t pos = 0;

      // decode() advances pos past each character it accepts.
      while (pos < s.length)
          decode(s, pos);
  }
  /**
   * Checks a UTF-32 array by decoding it from start to end; the return
   * values are discarded. Invalid values are reported by decode().
   */
  void validate(dchar[] s)
  {
      size_t pos = 0;

      // decode() advances pos past each character it accepts.
      while (pos < s.length)
          decode(s, pos);
  }
  /* =================== Conversion to UTF8 ======================= */

  /**
   * Writes the UTF-8 encoding of c into buf and returns the slice of buf
   * that was filled (one to four bytes).
   *
   * The in-contract requires isValidDchar(c).
   */
  char[] toUTF8(char[4] buf, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      size_t used;

      if (c <= 0x7F)
      {
          buf[0] = cast(char) c;
          used = 1;
      }
      else if (c <= 0x7FF)
      {
          buf[0] = cast(char)(0xC0 | (c >> 6));
          buf[1] = cast(char)(0x80 | (c & 0x3F));
          used = 2;
      }
      else if (c <= 0xFFFF)
      {
          buf[0] = cast(char)(0xE0 | (c >> 12));
          buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          buf[2] = cast(char)(0x80 | (c & 0x3F));
          used = 3;
      }
      else if (c <= 0x10FFFF)
      {
          buf[0] = cast(char)(0xF0 | (c >> 18));
          buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
          buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          buf[3] = cast(char)(0x80 | (c & 0x3F));
          used = 4;
      }
      else
      {
          assert(0);      // beyond the Unicode range
      }

      return buf[0 .. used];
  }
  /**
   * Identity conversion: the input is already UTF-8 and is returned as is
   * (same slice, no copy). The in-contract runs validate() on it.
   */
  char[] toUTF8(char[] s)
  in
  {
      validate(s);
  }
  body
  {
      char[] same = s;
      return same;
  }
  /**
   * Converts a UTF-16 array to a newly built UTF-8 array.
   *
   * The leading run of ASCII units is copied directly into a result sized
   * like the input. At the first non-ASCII unit the result is cut back to
   * what has been copied, and the remainder of s is iterated as dchars
   * and appended through encode().
   */
  char[] toUTF8(wchar[] s)
  {
      char[] r;
      size_t i;
      size_t slen = s.length;

      r.length = slen;

      for (i = 0; i < slen; i++)
      {   wchar c = s[i];

          if (c <= 0x7F)
              r[i] = cast(char)c;         // fast path for ascii
          else
          {
              r.length = i;
              // The loop variable is named d, not c: reusing c would
              // shadow the wchar declared above. This also matches the
              // dchar[] overload.
              foreach (dchar d; s[i .. slen])
              {
                  encode(r, d);
              }
              break;
          }
      }
      return r;
  }
  /**
   * Converts a UTF-32 array to a newly built UTF-8 array.
   *
   * The leading run of ASCII values is copied directly into a result sized
   * like the input; from the first non-ASCII value onwards the result is
   * cut back to what has been copied and the rest is appended through
   * encode().
   */
  char[] toUTF8(dchar[] s)
  {
      size_t total = s.length;
      char[] r;
      size_t k = 0;

      r.length = total;

      // Fast path: copy the ASCII prefix one-for-one.
      while (k < total && s[k] <= 0x7F)
      {
          r[k] = cast(char) s[k];
          k++;
      }

      // Slow path: anything left contains at least one multi-byte character.
      if (k < total)
      {
          r.length = k;
          foreach (dchar d; s[k .. total])
          {
              encode(r, d);
          }
      }
      return r;
  }
  /* =================== Conversion to UTF16 ======================= */

  /**
   * Writes the UTF-16 encoding of c into buf and returns the slice of buf
   * that was filled: one unit for values up to 0xFFFF, otherwise a
   * high/low surrogate pair.
   *
   * The in-contract requires isValidDchar(c).
   */
  wchar[] toUTF16(wchar[2] buf, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      if (c > 0xFFFF)
      {
          // Split the 20-bit offset above 0x10000 into two 10-bit halves.
          uint offset = c - 0x10000;

          buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
          buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
          return buf[0 .. 2];
      }

      buf[0] = cast(wchar) c;
      return buf[0 .. 1];
  }
  /**
   * Converts a UTF-8 array to a newly built UTF-16 array.
   *
   * ASCII bytes are appended directly; anything else goes through decode()
   * and encode().
   */
  wchar[] toUTF16(char[] s)
  {
      size_t total = s.length;
      wchar[] r;
      size_t pos = 0;

      // Grow to the input size, then reset to empty before appending.
      r.length = total;
      r.length = 0;

      while (pos < total)
      {
          char unit = s[pos];

          if (unit & 0x80)
          {
              // Multi-byte sequence: decode() advances pos itself.
              dchar c = decode(s, pos);
              encode(r, c);
          }
          else
          {
              pos++;
              r ~= cast(wchar) unit;
          }
      }
      return r;
  }
  /**
   * Converts a UTF-8 array to a zero-terminated UTF-16 buffer and returns a
   * pointer to its first unit.
   *
   * ASCII bytes are appended directly; anything else goes through decode()
   * and encode(). A terminating zero unit is appended last.
   */
  wchar* toUTF16z(char[] s)
  {
      size_t total = s.length;
      wchar[] r;
      size_t pos = 0;

      // Grow to input size plus terminator, then reset to empty.
      r.length = total + 1;
      r.length = 0;

      while (pos < total)
      {
          char unit = s[pos];

          if (unit & 0x80)
          {
              // Multi-byte sequence: decode() advances pos itself.
              dchar c = decode(s, pos);
              encode(r, c);
          }
          else
          {
              pos++;
              r ~= cast(wchar) unit;
          }
      }

      r ~= "\000";
      return r.ptr;
  }
  /**
   * Identity conversion: the input is already UTF-16 and is returned as is
   * (same slice, no copy). The in-contract runs validate() on it.
   */
  wchar[] toUTF16(wchar[] s)
  in
  {
      validate(s);
  }
  body
  {
      wchar[] same = s;
      return same;
  }
  /**
   * Converts a UTF-32 array to a newly built UTF-16 array by appending each
   * element through encode().
   */
  wchar[] toUTF16(dchar[] s)
  {
      wchar[] r;

      // Grow to the input size, then reset to empty before appending.
      r.length = s.length;
      r.length = 0;

      foreach (dchar d; s)
      {
          encode(r, d);
      }
      return r;
  }
  /* =================== Conversion to UTF32 ======================= */

  /**
   * Converts a UTF-8 array to UTF-32.
   *
   * The result buffer is allocated with the input's length (a character
   * never needs fewer code units in UTF-8 than in UTF-32) and the filled
   * prefix is returned.
   */
  dchar[] toUTF32(char[] s)
  {
      size_t total = s.length;
      dchar[] r;
      size_t filled = 0;
      size_t pos = 0;

      r.length = total;           // r[] will never be longer than s[]

      while (pos < total)
      {
          dchar c;

          if (s[pos] < 0x80)
          {
              c = s[pos];         // ascii, no need for decode
              pos++;
          }
          else
          {
              c = decode(s, pos); // advances pos itself
          }
          r[filled++] = c;
      }
      return r[0 .. filled];
  }
  /**
   * Converts a UTF-16 array to UTF-32.
   *
   * The result buffer is allocated with the input's length (a character
   * never needs fewer code units in UTF-16 than in UTF-32) and the filled
   * prefix is returned.
   */
  dchar[] toUTF32(wchar[] s)
  {
      size_t total = s.length;
      dchar[] r;
      size_t filled = 0;
      size_t pos = 0;

      r.length = total;           // r[] will never be longer than s[]

      while (pos < total)
      {
          dchar c;

          if (s[pos] < 0x80)
          {
              c = s[pos];         // ascii, no need for decode
              pos++;
          }
          else
          {
              c = decode(s, pos); // advances pos itself
          }
          r[filled++] = c;
      }
      return r[0 .. filled];
  }
  /**
   * Identity conversion: the input is already UTF-32 and is returned as is
   * (same slice, no copy). The in-contract runs validate() on it.
   */
  dchar[] toUTF32(dchar[] s)
  in
  {
      validate(s);
  }
  body
  {
      dchar[] same = s;
      return same;
  }
  /* ================================ tests ================================== */

  unittest
  {
      debug(utf) printf("utf.toUTF.unittest\n");

      char[] c;
      wchar[] w;
      dchar[] d;

      // ---- ASCII only: every encoding uses one unit per character ----
      {
          c = "hello";
          w = toUTF16(c);
          assert(w == "hello");
          d = toUTF32(c);
          assert(d == "hello");

          c = toUTF8(w);
          assert(c == "hello");
          d = toUTF32(w);
          assert(d == "hello");

          c = toUTF8(d);
          assert(c == "hello");
          w = toUTF16(d);
          assert(w == "hello");
      }

      // ---- a BMP character: three bytes in UTF-8, one unit in UTF-16 ----
      {
          c = "hel\u1234o";
          w = toUTF16(c);
          assert(w == "hel\u1234o");
          d = toUTF32(c);
          assert(d == "hel\u1234o");

          c = toUTF8(w);
          assert(c == "hel\u1234o");
          d = toUTF32(w);
          assert(d == "hel\u1234o");

          c = toUTF8(d);
          assert(c == "hel\u1234o");
          w = toUTF16(d);
          assert(w == "hel\u1234o");
      }

      // ---- a supplementary character: four bytes / a surrogate pair ----
      {
          c = "he\U0010AAAAllo";
          w = toUTF16(c);
          // To inspect the units when this fails:
          //foreach (wchar c; w) printf("c = x%x\n", c);
          //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c);
          assert(w == "he\U0010AAAAllo");
          d = toUTF32(c);
          assert(d == "he\U0010AAAAllo");

          c = toUTF8(w);
          assert(c == "he\U0010AAAAllo");
          d = toUTF32(w);
          assert(d == "he\U0010AAAAllo");

          c = toUTF8(d);
          assert(c == "he\U0010AAAAllo");
          w = toUTF16(d);
          assert(w == "he\U0010AAAAllo");
      }
  }