/src/rt/util/utf.d

http://github.com/AlexeyProkhin/druntime · D · 902 lines · 656 code · 112 blank · 134 comment · 128 complexity · 6a25b0feb3e2e01f96afe3eae6ec22a5 MD5 · raw file

  1. /********************************************
  2. * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
  3. *
  4. * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
  5. * wchar type.
  6. * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
  7. * the D utf.dchar type.
  8. *
  9. * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
  10. *
  11. * See_Also:
  12. * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
  13. * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
  14. * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
  15. * Macros:
  16. * WIKI = Phobos/StdUtf
  17. *
  18. * Copyright: Copyright Digital Mars 2003 - 2009.
  19. * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
  20. * Authors: Walter Bright, Sean Kelly
  21. */
  22. /* Copyright Digital Mars 2003 - 2009.
  23. * Distributed under the Boost Software License, Version 1.0.
  24. * (See accompanying file LICENSE or copy at
  25. * http://www.boost.org/LICENSE_1_0.txt)
  26. */
  27. module rt.util.utf;
  28. extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ );
  /**
   * Tests whether c is a valid UTF-32 character.
   *
   * A value is accepted when it is no greater than 0x10FFFF and does not lie
   * in the range 0xD800 .. 0xDFFF. \uFFFE and \uFFFF are accepted: the
   * Unicode standard permits them for application-internal use even though
   * they are not allowed for interchange.
   *
   * Params:
   *  c = the value to test
   *
   * Returns: true if c is valid, false if not.
   */
  bool isValidDchar(dchar c)
  {
      // Anything past the last code point is invalid.
      if (c > 0x10FFFF)
          return false;

      // Inside the code space only the 0xD800 .. 0xDFFF block is rejected;
      // 0xFFFE and 0xFFFF are deliberately let through (see above).
      return !(c >= 0xD800 && c <= 0xDFFF);
  }
  unittest
  {
      debug(utf) printf("utf.isValidDchar.unittest\n");

      // An ordinary ASCII letter is accepted ...
      immutable dchar letter = cast(dchar) 'a';
      assert(isValidDchar(letter));

      // ... while a value above 0x10FFFF is rejected.
      immutable dchar tooLarge = cast(dchar) 0x1FFFFF;
      assert(!isValidDchar(tooLarge));
  }
  // Lookup table indexed by a UTF-8 code unit. Each entry is the sequence
  // length implied by that code unit when it is the first byte, or 0xFF when
  // the code unit cannot start a sequence.
  static immutable UTF8stride =
  [
      cast(ubyte)
      // 0x00 .. 0x7F: single-byte sequences
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      // 0x80 .. 0xBF: not the start of a sequence
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
      // 0xC0 .. 0xDF: two-byte sequences
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      // 0xE0 .. 0xEF: three-byte sequences
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      // 0xF0 .. 0xF7: four, 0xF8 .. 0xFB: five, 0xFC .. 0xFD: six,
      // 0xFE .. 0xFF: not the start of a sequence
      4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0xFF, 0xFF,
  ];
  /**
   * Returns the length of the UTF-8 sequence starting at index i in string s.
   *
   * The length is looked up from s[i] alone; the bytes that follow are not
   * inspected.
   *
   * Params:
   *  s = UTF-8 encoded text
   *  i = index of the code unit to examine
   *
   * Returns:
   *  The number of bytes in the UTF-8 sequence, or 0xFF meaning s[i] is not
   *  the start of a UTF-8 sequence.
   */
  uint stride(in char[] s, size_t i)
  {
      immutable lead = s[i];
      return UTF8stride[lead];
  }
  /**
   * Returns the length of the UTF-16 sequence starting at index i in
   * string s.
   *
   * Params:
   *  s = UTF-16 encoded text
   *  i = index of the code unit to examine
   *
   * Returns:
   *  2 if s[i] is in the range 0xD800 .. 0xDBFF (the first half of a
   *  surrogate pair), otherwise 1.
   */
  uint stride(in wchar[] s, size_t i)
  {
      immutable uint unit = s[i];

      if (unit >= 0xD800 && unit <= 0xDBFF)
          return 2;
      return 1;
  }
  /**
   * Returns the length of the UTF-32 sequence starting at index i in
   * string s.
   *
   * Params:
   *  s = UTF-32 encoded text (not read)
   *  i = index of the code unit (not read)
   *
   * Returns: The return value will always be 1.
   */
  uint stride(in dchar[] s, size_t i)
  {
      // Every UTF-32 character occupies exactly one element.
      enum uint unitsPerCharacter = 1;
      return unitsPerCharacter;
  }
  /**
   * Given an index i into an array of characters s[], and assuming that
   * index i is at the start of a UTF character, determines the number of
   * UCS characters up to that index i.
   *
   * Params:
   *  s = UTF-8 encoded text
   *  i = code unit index that is expected to fall on a sequence boundary
   *
   * Returns: The number of UTF-8 sequences found in s[0 .. i].
   *
   * Errors:
   *  onUnicodeError is called with "invalid UTF-8 sequence" when a byte that
   *  cannot start a sequence is met at a sequence boundary, or when stepping
   *  sequence by sequence overshoots i (i is not on a boundary).
   */
  size_t toUCSindex(in char[] s, size_t i)
  {
      size_t n;
      size_t j;

      for (j = 0; j < i; )
      {
          immutable uint len = stride(s, j);

          // stride() yields 0xFF for a byte that cannot start a sequence.
          // Adding that to j would skip 255 bytes, so report the error at
          // the offending byte instead.
          if (len == 0xFF)
              onUnicodeError("invalid UTF-8 sequence", j);

          j += len;
          n++;
      }
      if (j > i)
      {
          onUnicodeError("invalid UTF-8 sequence", j);
      }
      return n;
  }
  /**
   * Given an index i into an array of UTF-16 code units s[], and assuming
   * that index i is at the start of a UTF character, determines the number
   * of UCS characters up to that index i.
   *
   * Params:
   *  s = UTF-16 encoded text
   *  i = code unit index that is expected to fall on a sequence boundary
   *
   * Returns: The number of UTF-16 sequences found in s[0 .. i].
   *
   * Errors:
   *  onUnicodeError is called with "invalid UTF-16 sequence" when stepping
   *  sequence by sequence overshoots i.
   */
  size_t toUCSindex(in wchar[] s, size_t i)
  {
      size_t count = 0;
      size_t pos = 0;

      while (pos < i)
      {
          pos += stride(s, pos);
          ++count;
      }

      // Landing past i means i pointed into the middle of a sequence.
      if (pos > i)
          onUnicodeError("invalid UTF-16 sequence", pos);

      return count;
  }
  /**
   * Given an index i into an array of UTF-32 characters s[], determines the
   * number of UCS characters up to that index i.
   *
   * Params:
   *  s = UTF-32 encoded text (not read)
   *  i = index into s
   *
   * Returns: i, unchanged.
   */
  size_t toUCSindex(in dchar[] s, size_t i)
  {
      // One element per character, so the index is already the count.
      immutable size_t characters = i;
      return characters;
  }
  /**
   * Given a UCS index n into an array of characters s[], returns the UTF
   * index.
   *
   * Params:
   *  s = UTF-8 encoded text
   *  n = number of characters to step over from the start of s
   *
   * Returns: The code unit index reached after stepping over n sequences.
   *
   * Errors:
   *  onUnicodeError is called with "invalid UTF-8 sequence" when the byte at
   *  a sequence boundary cannot start a sequence.
   */
  size_t toUTFindex(in char[] s, size_t n)
  {
      size_t pos = 0;

      for (; n != 0; --n)
      {
          immutable uint step = UTF8stride[s[pos]];

          // 0xFF is the table's marker for "not a lead byte".
          if (step == 0xFF)
              onUnicodeError("invalid UTF-8 sequence", pos);
          pos += step;
      }
      return pos;
  }
  /**
   * Given a UCS index n into an array of UTF-16 code units s[], returns the
   * UTF index.
   *
   * No validation is performed: a code unit in 0xD800 .. 0xDBFF is stepped
   * over together with the unit that follows it, whatever that unit is.
   *
   * Params:
   *  s = UTF-16 encoded text
   *  n = number of characters to step over from the start of s
   *
   * Returns: The code unit index reached after stepping over n sequences.
   */
  size_t toUTFindex(in wchar[] s, size_t n)
  {
      size_t pos = 0;

      for (size_t done = 0; done < n; ++done)
      {
          immutable wchar unit = s[pos];
          pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
      }
      return pos;
  }
  /**
   * Given a UCS index n into an array of UTF-32 characters s[], returns the
   * UTF index.
   *
   * Params:
   *  s = UTF-32 encoded text (not read)
   *  n = character index
   *
   * Returns: n, unchanged.
   */
  size_t toUTFindex(in dchar[] s, size_t n)
  {
      // One element per character, so both indices coincide.
      immutable size_t units = n;
      return units;
  }
  173. /* =================== Decode ======================= */
  /**
   * Decodes and returns the character starting at s[idx]. idx is advanced
   * past the decoded character.
   *
   * If the sequence is not well formed, onUnicodeError is called with
   * "invalid UTF-8 sequence" and idx is left unchanged (per the original
   * documentation a UtfException is thrown in that case).
   *
   * Rejected input: a continuation byte in lead position, 5 and 6 byte
   * forms, sequences running past the end of s, overlong forms, trailing
   * bytes that are not 10xxxxxx, and results failing isValidDchar.
   *
   * Params:
   *  s   = UTF-8 encoded text
   *  idx = index of the first byte of the sequence; updated on success
   *
   * Returns: The decoded character.
   */
  dchar decode(in char[] s, ref size_t idx)
  in
  {
      assert(idx >= 0 && idx < s.length);
  }
  out (result)
  {
      assert(isValidDchar(result));
  }
  body
  {
      immutable size_t total = s.length;
      dchar code;
      size_t pos = idx;
      immutable char lead = s[pos];

      if ((lead & 0x80) == 0)
      {
          // 0xxxxxxx: the byte is the character.
          code = cast(dchar) lead;
          ++pos;
      }
      else
      {
          uint count;
          char second;

          /* Sequence layouts; only the first four are accepted:
           *  0xxxxxxx
           *  110xxxxx 10xxxxxx
           *  1110xxxx 10xxxxxx 10xxxxxx
           *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
           *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
           *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
           */

          // Count the leading 1 bits of the first byte; that count is the
          // sequence length.
          count = 1;
          while (true)
          {
              if (count > 4)
                  goto Lerr;          // 5 and 6 byte forms are refused
              if (((lead << count) & 0x80) == 0)
                  break;
              ++count;
          }
          if (count == 1)
              goto Lerr;              // 10xxxxxx cannot start a sequence

          // The low (7 - count) bits of the first byte start the value.
          code = cast(dchar)(lead & ((1 << (7 - count)) - 1));

          if (pos + (count - 1) >= total)
              goto Lerr;              // sequence runs off the end of s

          /* Overlong forms, all illegal:
           *  1100000x (10xxxxxx)
           *  11100000 100xxxxx (10xxxxxx)
           *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
           *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
           *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
           */
          second = s[pos + 1];
          if ((lead & 0xFE) == 0xC0 ||
              (lead == 0xE0 && (second & 0xE0) == 0x80) ||
              (lead == 0xF0 && (second & 0xF0) == 0x80) ||
              (lead == 0xF8 && (second & 0xF8) == 0x80) ||
              (lead == 0xFC && (second & 0xFC) == 0x80))
              goto Lerr;

          // Fold in six payload bits from each trailing byte.
          for (uint k = 1; k != count; k++)
          {
              immutable char trail = s[pos + k];
              if ((trail & 0xC0) != 0x80)
                  goto Lerr;          // trailing bytes must be 10xxxxxx
              code = (code << 6) | (trail & 0x3F);
          }

          if (!isValidDchar(code))
              goto Lerr;
          pos += count;
      }

      idx = pos;
      return code;

    Lerr:
      // pos still equals the caller's idx on every error path.
      onUnicodeError("invalid UTF-8 sequence", pos);
      return code; // dummy return
  }
  // Exercises decode() for UTF-8: well-formed 1, 2 and 3 byte sequences, then
  // a set of malformed sequences that must all be rejected.
  unittest
  {
      size_t i;
      dchar c;

      debug(utf) printf("utf.decode.unittest\n");

      static s1 = "abcd"c;
      i = 0;
      c = decode(s1, i);
      assert(c == cast(dchar)'a');
      assert(i == 1);
      c = decode(s1, i);
      assert(c == cast(dchar)'b');
      assert(i == 2);

      static s2 = "\xC2\xA9"c;
      i = 0;
      c = decode(s2, i);
      assert(c == cast(dchar)'\u00A9');
      assert(i == 2);

      static s3 = "\xE2\x89\xA0"c;
      i = 0;
      c = decode(s3, i);
      assert(c == cast(dchar)'\u2260');
      assert(i == 3);

      static s4 =
      [   "\xE2\x89"c[],          // too short
          "\xC0\x8A",
          "\xE0\x80\x8A",
          "\xF0\x80\x80\x8A",
          "\xF8\x80\x80\x80\x8A",
          "\xFC\x80\x80\x80\x80\x8A",
      ];

      for (int j = 0; j < s4.length; j++)
      {
          // The verdict is recorded in a flag and asserted outside the try
          // block: an assert placed inside it would itself be caught by the
          // catch (Throwable) handler and could never fail the test.
          bool threw = false;

          i = 0;
          try
          {
              c = decode(s4[j], i);
          }
          catch (Throwable o)
          {
              threw = true;
          }
          assert(threw);
          // decode() must leave the index untouched when it fails.
          assert(i == 0);
      }
  }
  /**
   * Decodes and returns the character starting at s[idx]. idx is advanced
   * past the decoded character (by one or two code units).
   *
   * If the input is not well formed, onUnicodeError is called with a message
   * describing the problem and idx is left unchanged. Rejected input: a
   * 0xD800 .. 0xDBFF unit at the end of s or not followed by a
   * 0xDC00 .. 0xDFFF unit, a lone 0xDC00 .. 0xDFFF unit, and the values
   * 0xFFFE and 0xFFFF.
   *
   * Params:
   *  s   = UTF-16 encoded text
   *  idx = index of the first code unit of the sequence; updated on success
   *
   * Returns: The decoded character.
   */
  dchar decode(in wchar[] s, ref size_t idx)
  in
  {
      assert(idx >= 0 && idx < s.length);
  }
  out (result)
  {
      assert(isValidDchar(result));
  }
  body
  {
      size_t pos = idx;
      uint unit = s[pos];
      string problem;

      if (unit < 0x80)
      {
          // ASCII: nothing to check.
          ++pos;
      }
      else if (unit >= 0xD800 && unit <= 0xDBFF)
      {
          // First half of a surrogate pair: a second half must follow.
          if (pos + 1 == s.length)
          {
              problem = "surrogate UTF-16 high value past end of string";
          }
          else
          {
              immutable uint low = s[pos + 1];

              if (low < 0xDC00 || low > 0xDFFF)
              {
                  problem = "surrogate UTF-16 low value out of range";
              }
              else
              {
                  // Combine both halves; subtracting 0xD7C0 (0xD800 - 0x40)
                  // also adds the 0x10000 offset once shifted left by 10.
                  unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
                  pos += 2;
              }
          }
      }
      else if (unit >= 0xDC00 && unit <= 0xDFFF)
      {
          problem = "unpaired surrogate UTF-16 value";
      }
      else if (unit == 0xFFFE || unit == 0xFFFF)
      {
          problem = "illegal UTF-16 value";
      }
      else
      {
          ++pos;
      }

      if (problem.length != 0)
      {
          // On every error path pos and unit still hold their initial values.
          onUnicodeError(problem, pos);
          return cast(dchar) unit; // dummy return
      }

      idx = pos;
      return cast(dchar) unit;
  }
  /**
   * Returns the character at s[idx] and advances idx by one.
   *
   * If the value fails isValidDchar, onUnicodeError is called with
   * "invalid UTF-32 value" and idx is left unchanged.
   *
   * Params:
   *  s   = UTF-32 encoded text
   *  idx = index of the character; updated on success
   *
   * Returns: The character at s[idx].
   */
  dchar decode(in dchar[] s, ref size_t idx)
  in
  {
      assert(idx >= 0 && idx < s.length);
  }
  body
  {
      immutable size_t pos = idx;
      immutable dchar value = s[pos];

      if (isValidDchar(value))
      {
          idx = pos + 1;
          return value;
      }

      onUnicodeError("invalid UTF-32 value", pos);
      return value; // dummy return
  }
  372. /* =================== Encode ======================= */
  /**
   * Encodes character c as UTF-8 and appends it to array s[].
   *
   * Params:
   *  s = array to append to; extended by 1 to 4 code units
   *  c = character to encode; must satisfy isValidDchar
   */
  void encode(ref char[] s, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      // ASCII is appended as a single code unit.
      if (c <= 0x7F)
      {
          s ~= cast(char) c;
          return;
      }

      char[4] units;
      size_t used;

      if (c <= 0x7FF)
      {
          // 110xxxxx 10xxxxxx
          units[0] = cast(char)(0xC0 | (c >> 6));
          units[1] = cast(char)(0x80 | (c & 0x3F));
          used = 2;
      }
      else if (c <= 0xFFFF)
      {
          // 1110xxxx 10xxxxxx 10xxxxxx
          units[0] = cast(char)(0xE0 | (c >> 12));
          units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          units[2] = cast(char)(0x80 | (c & 0x3F));
          used = 3;
      }
      else if (c <= 0x10FFFF)
      {
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          units[0] = cast(char)(0xF0 | (c >> 18));
          units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
          units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          units[3] = cast(char)(0x80 | (c & 0x3F));
          used = 4;
      }
      else
      {
          assert(0);
      }

      s ~= units[0 .. used];
  }
  // Appends a 1, 2 and 3 byte character to a buffer and checks the bytes.
  unittest
  {
      debug(utf) printf("utf.encode.unittest\n");

      char[] text = "abcd".dup;

      // One-byte character.
      encode(text, cast(dchar)'a');
      assert(text.length == 5);
      assert(text == "abcda");

      // Two-byte character.
      encode(text, cast(dchar)'\u00A9');
      assert(text.length == 7);
      assert(text == "abcda\xC2\xA9");
      //assert(text == "abcda\u00A9");   // BUG: fix compiler

      // Three-byte character.
      encode(text, cast(dchar)'\u2260');
      assert(text.length == 10);
      assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
  }
  /**
   * Encodes character c as UTF-16 and appends it to array s[].
   *
   * Params:
   *  s = array to append to; extended by one code unit, or by a surrogate
   *      pair when c is above 0xFFFF
   *  c = character to encode; must satisfy isValidDchar
   */
  void encode(ref wchar[] s, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      if (c <= 0xFFFF)
      {
          s ~= cast(wchar) c;
          return;
      }

      // Split the 20-bit offset above 0x10000 into two 10-bit halves.
      immutable uint offset = c - 0x10000;
      wchar[2] pair;

      pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
      pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
      s ~= pair[];
  }
  /**
   * Appends character c to the UTF-32 array s[].
   *
   * Params:
   *  s = array to append to; extended by one element
   *  c = character to append; must satisfy isValidDchar
   */
  void encode(ref dchar[] s, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      // UTF-32 stores the character as is.
      immutable dchar unit = c;
      s ~= unit;
  }
  /**
  Returns the code length of $(D c) in the encoding using $(D C) as a
  code point. The length is counted in elements of type $(D C), not in
  bytes.

  Params:
   C = code unit type; its size (1, 2 or 4 bytes) selects UTF-8, UTF-16 or
       UTF-32
   c = the character to measure

  Returns: 1 to 4 for a 1-byte $(D C), 1 or 2 for a 2-byte $(D C), and 1 for
  a 4-byte $(D C). For a 1-byte $(D C) a value above 0x10FFFF trips
  $(D assert(false)).
  */
  ubyte codeLength(C)(dchar c)
  {
      static if (C.sizeof == 1)
      {
          if (c <= 0x7F)
              return 1;
          if (c <= 0x7FF)
              return 2;
          if (c <= 0xFFFF)
              return 3;
          if (c <= 0x10FFFF)
              return 4;
          assert(false);
      }
      else static if (C.sizeof == 2)
      {
          if (c <= 0xFFFF)
              return 1;
          return 2;
      }
      else
      {
          static assert(C.sizeof == 4);
          return 1;
      }
  }
  493. /* =================== Validation ======================= */
  /**
  Checks to see if string is well formed or not. $(D S) can be an array
  of $(D char), $(D wchar), or $(D dchar). Use to check all untrusted input
  for correctness.

  The whole of $(D s) is run through $(D decode), one character after the
  other, so any error is reported by $(D decode) (per the original
  documentation a $(D UtfException) is thrown).

  Params:
   s = the string to check
  */
  void validate(S)(in S s)
  {
      immutable total = s.length;
      size_t pos = 0;

      // decode() advances pos itself.
      while (pos < total)
          decode(s, pos);
  }
  507. /* =================== Conversion to UTF8 ======================= */
  /**
   * Encodes character c as UTF-8 into the caller-supplied buffer buf.
   *
   * Params:
   *  buf = receives the encoded bytes
   *  c   = character to encode; must satisfy isValidDchar
   *
   * Returns: The slice of buf, 1 to 4 elements long, holding the encoding.
   */
  char[] toUTF8(out char[4] buf, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      size_t used;

      if (c <= 0x7F)
      {
          // 0xxxxxxx
          buf[0] = cast(char) c;
          used = 1;
      }
      else if (c <= 0x7FF)
      {
          // 110xxxxx 10xxxxxx
          buf[0] = cast(char)(0xC0 | (c >> 6));
          buf[1] = cast(char)(0x80 | (c & 0x3F));
          used = 2;
      }
      else if (c <= 0xFFFF)
      {
          // 1110xxxx 10xxxxxx 10xxxxxx
          buf[0] = cast(char)(0xE0 | (c >> 12));
          buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          buf[2] = cast(char)(0x80 | (c & 0x3F));
          used = 3;
      }
      else if (c <= 0x10FFFF)
      {
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          buf[0] = cast(char)(0xF0 | (c >> 18));
          buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
          buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
          buf[3] = cast(char)(0x80 | (c & 0x3F));
          used = 4;
      }
      else
      {
          assert(0);
      }

      return buf[0 .. used];
  }
  /**
   * Encodes string s into UTF-8 and returns the encoded string.
   *
   * s is already UTF-8, so it is returned as is; the in contract runs
   * validate(s) on it.
   *
   * Params:
   *  s = UTF-8 encoded text
   *
   * Returns: s itself.
   */
  string toUTF8(string s)
  in
  {
      validate(s);
  }
  body
  {
      string unchanged = s;
      return unchanged;
  }
  /**
   * Encodes the UTF-16 string s into UTF-8 and returns the encoded string.
   *
   * Params:
   *  s = UTF-16 encoded text
   *
   * Returns: A newly built UTF-8 string.
   */
  string toUTF8(in wchar[] s)
  {
      immutable size_t total = s.length;
      char[] result;
      size_t pos = 0;

      result.length = total;

      // Fast path: a leading run of ASCII is copied unit for unit.
      while (pos < total && s[pos] <= 0x7F)
      {
          result[pos] = cast(char) s[pos];
          ++pos;
      }

      // From the first non-ASCII unit on, re-encode character by character.
      if (pos < total)
      {
          result.length = pos;
          foreach (dchar d; s[pos .. total])
          {
              encode(result, d);
          }
      }
      return cast(string) result;
  }
  /**
   * Encodes the UTF-32 string s into UTF-8 and returns the encoded string.
   *
   * Params:
   *  s = UTF-32 encoded text
   *
   * Returns: A newly built UTF-8 string.
   */
  string toUTF8(in dchar[] s)
  {
      immutable size_t total = s.length;
      char[] result;
      size_t pos = 0;

      result.length = total;

      // Fast path: a leading run of ASCII is copied element for element.
      while (pos < total && s[pos] <= 0x7F)
      {
          result[pos] = cast(char) s[pos];
          ++pos;
      }

      // From the first non-ASCII character on, encode one by one.
      if (pos < total)
      {
          result.length = pos;
          foreach (dchar d; s[pos .. total])
          {
              encode(result, d);
          }
      }
      return cast(string) result;
  }
  601. /* =================== Conversion to UTF16 ======================= */
  /**
   * Encodes character c as UTF-16 into the caller-supplied buffer buf.
   *
   * buf is taken by reference: a static array passed by value would be a
   * copy local to this function, and the returned slice would then refer to
   * storage that no longer exists once the function returns.
   *
   * Params:
   *  buf = receives the encoded code units; the result slices this buffer,
   *        so it must outlive the returned slice
   *  c   = character to encode; must satisfy isValidDchar
   *
   * Returns: The slice of buf, 1 or 2 elements long, holding the encoding.
   */
  wchar[] toUTF16(ref wchar[2] buf, dchar c)
  in
  {
      assert(isValidDchar(c));
  }
  body
  {
      if (c <= 0xFFFF)
      {
          buf[0] = cast(wchar) c;
          return buf[0 .. 1];
      }
      else
      {
          // Surrogate pair: upper and lower 10 bits of (c - 0x10000).
          buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
          buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
          return buf[0 .. 2];
      }
  }
  /**
   * Encodes string s into UTF-16 and returns the encoded string.
   * toUTF16z() is suitable for calling the 'W' functions in the Win32 API
   * that take an LPWSTR or LPCWSTR argument.
   *
   * Params:
   *  s = UTF-8 encoded text
   *
   * Returns: A newly built UTF-16 string.
   */
  wstring toUTF16(in char[] s)
  {
      immutable size_t total = s.length;
      wchar[] result;

      // Grow to the source length, then reset to empty before appending
      // (presumably meant to reserve capacity).
      result.length = total;
      result.length = 0;

      size_t pos = 0;
      while (pos < total)
      {
          immutable char unit = s[pos];

          if (unit <= 0x7F)
          {
              // ASCII maps to a single UTF-16 unit.
              ++pos;
              result ~= cast(wchar) unit;
          }
          else
          {
              // decode() advances pos past the whole sequence.
              immutable dchar c = decode(s, pos);
              encode(result, c);
          }
      }
      return cast(wstring) result;
  }
  // Pointer to constant UTF-16 code units; the return type of toUTF16z().
  alias const(wchar)* wptr;

  /**
   * Encodes string s into UTF-16, appends a terminating zero code unit and
   * returns a pointer to the first code unit of the result.
   *
   * Params:
   *  s = UTF-8 encoded text
   *
   * Returns: Pointer to a newly built, zero-terminated UTF-16 string.
   */
  wptr toUTF16z(in char[] s)
  {
      immutable size_t total = s.length;
      wchar[] result;

      // Grow to the source length plus the terminator, then reset to empty
      // before appending (presumably meant to reserve capacity).
      result.length = total + 1;
      result.length = 0;

      size_t pos = 0;
      while (pos < total)
      {
          immutable char unit = s[pos];

          if (unit <= 0x7F)
          {
              // ASCII maps to a single UTF-16 unit.
              ++pos;
              result ~= cast(wchar) unit;
          }
          else
          {
              // decode() advances pos past the whole sequence.
              immutable dchar c = decode(s, pos);
              encode(result, c);
          }
      }

      result ~= '\000';
      return result.ptr;
  }
  /**
   * Encodes string s into UTF-16 and returns the encoded string.
   *
   * s is already UTF-16, so it is returned as is; the in contract runs
   * validate(s) on it.
   *
   * Params:
   *  s = UTF-16 encoded text
   *
   * Returns: s itself.
   */
  wstring toUTF16(wstring s)
  in
  {
      validate(s);
  }
  body
  {
      wstring unchanged = s;
      return unchanged;
  }
  /**
   * Encodes the UTF-32 string s into UTF-16 and returns the encoded string.
   *
   * Params:
   *  s = UTF-32 encoded text
   *
   * Returns: A newly built UTF-16 string.
   */
  wstring toUTF16(in dchar[] s)
  {
      wchar[] result;

      // Grow to the source length, then reset to empty before appending
      // (presumably meant to reserve capacity).
      result.length = s.length;
      result.length = 0;

      foreach (dchar c; s)
      {
          encode(result, c);
      }
      return cast(wstring) result;
  }
  696. /* =================== Conversion to UTF32 ======================= */
  /**
   * Encodes string s into UTF-32 and returns the encoded string.
   *
   * Params:
   *  s = UTF-8 encoded text
   *
   * Returns: A newly built UTF-32 string.
   */
  dstring toUTF32(in char[] s)
  {
      immutable size_t total = s.length;
      dchar[] result;
      size_t written = 0;
      size_t pos = 0;

      // The result will never be longer than s, so one allocation suffices.
      result.length = total;

      while (pos < total)
      {
          dchar c = s[pos];

          if (c < 0x80)
              ++pos;                  // ASCII, no need for decode
          else
              c = decode(s, pos);     // advances pos itself

          result[written] = c;
          ++written;
      }
      return cast(dstring) result[0 .. written];
  }
  /**
   * Encodes the UTF-16 string s into UTF-32 and returns the encoded string.
   *
   * Params:
   *  s = UTF-16 encoded text
   *
   * Returns: A newly built UTF-32 string.
   */
  dstring toUTF32(in wchar[] s)
  {
      immutable size_t total = s.length;
      dchar[] result;
      size_t written = 0;
      size_t pos = 0;

      // The result will never be longer than s, so one allocation suffices.
      result.length = total;

      while (pos < total)
      {
          dchar c = s[pos];

          if (c < 0x80)
              ++pos;                  // ASCII, no need for decode
          else
              c = decode(s, pos);     // advances pos itself

          result[written] = c;
          ++written;
      }
      return cast(dstring) result[0 .. written];
  }
  /**
   * Encodes string s into UTF-32 and returns the encoded string.
   *
   * s is already UTF-32, so it is returned as is; the in contract runs
   * validate(s) on it.
   *
   * Params:
   *  s = UTF-32 encoded text
   *
   * Returns: s itself.
   */
  dstring toUTF32(dstring s)
  in
  {
      validate(s);
  }
  body
  {
      dstring unchanged = s;
      return unchanged;
  }
  745. /* ================================ tests ================================== */
  // Round-trips three sample texts (ASCII only, one BMP character, one
  // character above 0xFFFF) through every toUTF8/toUTF16/toUTF32 conversion.
  unittest
  {
      debug(utf) printf("utf.toUTF.unittest\n");

      // Converts between all three encodings and compares each result with
      // the same text written as a literal of the target type.
      static void roundTrip(string text8, wstring text16, dstring text32)
      {
          auto c = text8;

          auto w = toUTF16(c);
          assert(w == text16);
          auto d = toUTF32(c);
          assert(d == text32);

          c = toUTF8(w);
          assert(c == text8);
          d = toUTF32(w);
          assert(d == text32);

          c = toUTF8(d);
          assert(c == text8);
          w = toUTF16(d);
          assert(w == text16);
      }

      roundTrip("hello", "hello", "hello");
      roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");
      roundTrip("he\U000BAAAAllo", "he\U000BAAAAllo", "he\U000BAAAAllo");
  }