// /src/rt/util/utf.d
// D | 902 lines | 656 code | 112 blank | 134 comment | 128 complexity | 6a25b0feb3e2e01f96afe3eae6ec22a5 MD5 | raw file
/********************************************
 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
 *
 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
 * wchar type.
 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
 * the D utf.dchar type.
 *
 * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
 *
 * See_Also:
 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
 *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
 *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 * Macros:
 *      WIKI = Phobos/StdUtf
 *
 * Copyright: Copyright Digital Mars 2003 - 2009.
 * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
 * Authors:   Walter Bright, Sean Kelly
 */

/*          Copyright Digital Mars 2003 - 2009.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module rt.util.utf;


// Error hook invoked by every routine in this module when it meets malformed
// input; only declared here, the definition lives elsewhere.
extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ );

/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 *
 * Returns: true if it is, false if not.
 */

bool isValidDchar(dchar c)
{
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange.
     * (thanks to Arcane Jill)
     */

    return c < 0xD800 ||
        (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
}

// Sanity checks for isValidDchar: an ASCII letter is valid, a value above
// 0x10FFFF is not.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
}



/* Lookup table indexed by the first byte of a UTF-8 sequence, giving the
 * sequence length in bytes. 0xFF marks bytes that cannot start a sequence
 * (continuation bytes 0x80 .. 0xBF, and 0xFE / 0xFF).
 */
static immutable UTF8stride =
[
    cast(ubyte)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 * Returns:
 *      The number of bytes in the UTF-8 sequence or
 *      0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    return UTF8stride[s[i]];
}

/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 */
uint stride(in wchar[] s, size_t i)
{   uint u = s[i];
    // A high surrogate (0xD800 .. 0xDBFF) starts a two-unit sequence.
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
}

/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 * Returns: The return value will always be 1.
 */
uint stride(in dchar[] s, size_t i)
{
    return 1;
}

/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 */

size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        j += stride(s, j);
        n++;
    }
    // Stepping past i means i was not on a sequence boundary.
    if (j > i)
    {
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}

/** ditto */
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        j += stride(s, j);
        n++;
    }
    // Stepping past i means i was not on a sequence boundary.
    if (j > i)
    {
        onUnicodeError("invalid UTF-16 sequence", j);
    }
    return n;
}

/** ditto */
size_t toUCSindex(in dchar[] s, size_t i)
{
    return i;
}

/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 */

size_t toUTFindex(in char[] s, size_t n)
{
    size_t i;

    while (n--)
    {
        uint j = UTF8stride[s[i]];
        if (j == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", i);
        i += j;
    }
    return i;
}

/** ditto */
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t i;

    while (n--)
    {   wchar u = s[i];

        i += 1 + (u >= 0xD800 && u <= 0xDBFF);
    }
    return i;
}

/** ditto */
size_t toUTFindex(in dchar[] s, size_t n)
{
    return n;
}

/* =================== Decode ======================= */

/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, a UtfException is
 * thrown and idx remains unchanged.
 */
dchar decode(in char[] s, ref size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        size_t len = s.length;
        dchar V;
        size_t i = idx;
        char u = s[i];

        if (u & 0x80)
        {   uint n;
            char u2;

            /* The following encodings are valid, except for the 5 and 6 byte
             * combinations:
             *  0xxxxxxx
             *  110xxxxx 10xxxxxx
             *  1110xxxx 10xxxxxx 10xxxxxx
             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             */
            // Count the leading 1 bits of the first byte; n ends up as the
            // total length of the sequence in bytes.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;          // only do the first 4 of 6 encodings
                if (((u << n) & 0x80) == 0)
                {
                    if (n == 1)
                        goto Lerr;      // 10xxxxxx cannot start a sequence
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            if (i + (n - 1) >= len)
                goto Lerr;                      // off end of string

            /* The following combinations are overlong, and illegal:
             *  1100000x (10xxxxxx)
             *  11100000 100xxxxx (10xxxxxx)
             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
             *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
             *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
             */
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
                (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
                (u == 0xFC && (u2 & 0xFC) == 0x80))
                goto Lerr;                      // overlong combination

            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;                  // trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            if (!isValidDchar(V))
                goto Lerr;
            i += n;
        }
        else
        {
            V = cast(dchar) u;
            i++;
        }

        idx = i;
        return V;

      Lerr:
        onUnicodeError("invalid UTF-8 sequence", i);
        return V; // dummy return
    }

// Decodes 1-, 2- and 3-byte sequences, then checks that truncated and
// overlong sequences are rejected.
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (int j = 0; j < s4.length; j++)
    {
        try
        {
            i = 0;
            c = decode(s4[j], i);
            assert(0);
        }
        catch (Throwable o)
        {
            i = 23;
        }
        assert(i == 23);
    }
}

/** ditto */

dchar decode(in wchar[] s, ref size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        string msg;
        dchar V;
        size_t i = idx;
        uint u = s[i];

        if (u & ~0x7F)
        {   if (u >= 0xD800 && u <= 0xDBFF)
            {   uint u2;

                if (i + 1 == s.length)
                {   msg = "surrogate UTF-16 high value past end of string";
                    goto Lerr;
                }
                u2 = s[i + 1];
                if (u2 < 0xDC00 || u2 > 0xDFFF)
                {   msg = "surrogate UTF-16 low value out of range";
                    goto Lerr;
                }
                // Combine the pair; 0xD7C0 == 0xD800 - (0x10000 >> 10).
                u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
                i += 2;
            }
            else if (u >= 0xDC00 && u <= 0xDFFF)
            {   msg = "unpaired surrogate UTF-16 value";
                goto Lerr;
            }
            else if (u == 0xFFFE || u == 0xFFFF)
            {   msg = "illegal UTF-16 value";
                goto Lerr;
            }
            else
                i++;
        }
        else
        {
            i++;
        }

        idx = i;
        return cast(dchar)u;

      Lerr:
        onUnicodeError(msg, i);
        return cast(dchar)u; // dummy return
    }

/** ditto */

dchar decode(in dchar[] s, ref size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    body
    {
        size_t i = idx;
        dchar c = s[i];

        if (!isValidDchar(c))
            goto Lerr;
        idx = i + 1;
        return c;

      Lerr:
        onUnicodeError("invalid UTF-32 value", i);
        return c; // dummy return
    }


/* =================== Encode ======================= */

/*******************************
 * Encodes character c and appends it to array s[].
 */
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        char[] r = s;

        if (c <= 0x7F)
        {
            r ~= cast(char) c;
        }
        else
        {
            char[4] buf;
            uint L;

            if (c <= 0x7FF)
            {
                buf[0] = cast(char)(0xC0 | (c >> 6));
                buf[1] = cast(char)(0x80 | (c & 0x3F));
                L = 2;
            }
            else if (c <= 0xFFFF)
            {
                buf[0] = cast(char)(0xE0 | (c >> 12));
                buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                buf[2] = cast(char)(0x80 | (c & 0x3F));
                L = 3;
            }
            else if (c <= 0x10FFFF)
            {
                buf[0] = cast(char)(0xF0 | (c >> 18));
                buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
                buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                buf[3] = cast(char)(0x80 | (c & 0x3F));
                L = 4;
            }
            else
            {
                assert(0);
            }
            r ~= buf[0 .. L];
        }
        s = r;
    }

// Appends 1-, 2- and 3-byte characters and checks the resulting bytes.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");       // BUG: fix compiler

    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}

/** ditto */

void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        wchar[] r = s;

        if (c <= 0xFFFF)
        {
            r ~= cast(wchar) c;
        }
        else
        {
            wchar[2] buf;

            // Split into a high / low surrogate pair.
            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            r ~= buf;
        }
        s = r;
    }

/** ditto */
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        s ~= c;
    }

/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code point. The code is returned in character count, not in bytes.
 */

ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        return
            c <= 0x7F ? 1
            : c <= 0x7FF ? 2
            : c <= 0xFFFF ? 3
            : c <= 0x10FFFF ? 4
            : (assert(false), 6);
    }
    else static if (C.sizeof == 2)
    {
        return c <= 0xFFFF ? 1 : 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}

/* =================== Validation ======================= */

/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
 if it is not. Use to check all untrusted input for correctness.
 */
void validate(S)(in S s)
{
    auto len = s.length;
    for (size_t i = 0; i < len; )
    {
        // decode() advances i on success and reports malformed input itself.
        decode(s, i);
    }
}

/* =================== Conversion to UTF8 ======================= */

/*******************
 * Encodes character c as UTF-8 into the caller-supplied buf.
 *
 * Returns: the slice of buf (1 to 4 bytes) holding the encoded character.
 */
char[] toUTF8(out char[4] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }
        else if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 2];
        }
        else if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 3];
        }
        else if (c <= 0x10FFFF)
        {
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 4];
        }
        assert(0);
    }

/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 */
string toUTF8(string s)
    in
    {
        validate(s);
    }
    body
    {
        return s;
    }

/** ditto */
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // First non-ASCII unit: keep the ASCII prefix and encode the
            // rest one decoded character at a time.
            r.length = i;
            // Named d (as in the dchar[] overload) so it does not shadow
            // the outer loop's c.
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}

/** ditto */
string toUTF8(in dchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   dchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // First non-ASCII character: keep the ASCII prefix and encode
            // the rest.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}

/* =================== Conversion to UTF16 ======================= */

/****************
 * Encodes character c as UTF-16 into the caller-supplied buf.
 *
 * buf is taken by reference: a fixed-size array is a value type, so a
 * by-value parameter would make the result a slice of a local copy that no
 * longer exists once the function returns.
 *
 * Returns: the slice of buf (1 or 2 code units) holding the encoded character.
 */
wchar[] toUTF16(ref wchar[2] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c <= 0xFFFF)
        {
            buf[0] = cast(wchar) c;
            return buf[0 .. 1];
        }
        else
        {
            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }
    }

/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 */
wstring toUTF16(in char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    // Size the array up front, then reset the length before appending.
    r.length = slen;
    r.length = 0;
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decode(s, i);
            encode(r, c);
        }
    }
    return cast(wstring)r;
}

alias const(wchar)* wptr;
/** ditto */
wptr toUTF16z(in char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    // One extra slot for the terminating zero.
    r.length = slen + 1;
    r.length = 0;
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decode(s, i);
            encode(r, c);
        }
    }
    r ~= '\000';
    return r.ptr;
}

/** ditto */
wstring toUTF16(wstring s)
    in
    {
        validate(s);
    }
    body
    {
        return s;
    }

/** ditto */
wstring toUTF16(in dchar[] s)
{
    wchar[] r;
    size_t slen = s.length;

    r.length = slen;
    r.length = 0;
    for (size_t i = 0; i < slen; i++)
    {
        encode(r, s[i]);
    }
    return cast(wstring)r;
}

/* =================== Conversion to UTF32 ======================= */

/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
dstring toUTF32(in char[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;            // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decode(s, i);
        else
            i++;                // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
dstring toUTF32(in wchar[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;            // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decode(s, i);
        else
            i++;                // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
dstring toUTF32(dstring s)
    in
    {
        validate(s);
    }
    body
    {
        return s;
    }

/* ================================ tests ================================== */

// Round-trips ASCII, a BMP character and a supplementary-plane character
// through every pair of encodings.
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    auto c = "hello"c[];
    auto w = toUTF16(c);
    assert(w == "hello");
    auto d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");


    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");


    c = "he\U000BAAAAllo";
    w = toUTF16(c);
    //foreach (wchar c; w) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wstring)"he\U000BAAAAllo") printf("c = x%x\n", c);
    assert(w == "he\U000BAAAAllo");
    d = toUTF32(c);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(w);
    assert(c == "he\U000BAAAAllo");
    d = toUTF32(w);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(d);
    assert(c == "he\U000BAAAAllo");
    w = toUTF16(d);
    assert(w == "he\U000BAAAAllo");
}