PageRenderTime 123ms CodeModel.GetById 19ms app.highlight 97ms RepoModel.GetById 1ms app.codeStats 0ms

/src/rt/util/utf.d

http://github.com/AlexeyProkhin/druntime
D | 902 lines | 656 code | 112 blank | 134 comment | 128 complexity | 6a25b0feb3e2e01f96afe3eae6ec22a5 MD5 | raw file
  1/********************************************
  2 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
  3 *
  4 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
  5 * wchar type.
  6 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
  7 * the D utf.dchar type.
  8 *
  9 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
 10 *
 11 * See_Also:
 12 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
 13 *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
 14 *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 15 * Macros:
 16 *      WIKI = Phobos/StdUtf
 17 *
 18 * Copyright: Copyright Digital Mars 2003 - 2009.
 19 * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
 20 * Authors:   Walter Bright, Sean Kelly
 21 */
 22
 23/*          Copyright Digital Mars 2003 - 2009.
 24 * Distributed under the Boost Software License, Version 1.0.
 25 *    (See accompanying file LICENSE or copy at
 26 *          http://www.boost.org/LICENSE_1_0.txt)
 27 */
 28module rt.util.utf;
 29
 30
 31extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ );
 32
 33/*******************************
 34 * Test if c is a valid UTF-32 character.
 35 *
 36 * \uFFFE and \uFFFF are considered valid by this function,
 37 * as they are permitted for internal use by an application,
 38 * but they are not allowed for interchange by the Unicode standard.
 39 *
 40 * Returns: true if it is, false if not.
 41 */
 42
 43bool isValidDchar(dchar c)
 44{
 45    /* Note: FFFE and FFFF are specifically permitted by the
 46     * Unicode standard for application internal use, but are not
 47     * allowed for interchange.
 48     * (thanks to Arcane Jill)
 49     */
 50
 51    return c < 0xD800 ||
 52        (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
 53}
 54
 55unittest
 56{
 57    debug(utf) printf("utf.isValidDchar.unittest\n");
 58    assert(isValidDchar(cast(dchar)'a') == true);
 59    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
 60}
 61
 62
 63
// Lookup table indexed by the value of a UTF-8 code unit. Each entry is the
// sequence length announced by that unit when it is a lead byte, or 0xFF
// when the unit cannot start a sequence.
static immutable UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: single-byte sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: not the start of a sequence
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte sequences
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte sequences
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six;
    // 0xFE and 0xFF: not the start of a sequence
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
 84
 85/**
 86 * stride() returns the length of a UTF-8 sequence starting at index i
 87 * in string s.
 88 * Returns:
 89 *      The number of bytes in the UTF-8 sequence or
 90 *      0xFF meaning s[i] is not the start of of UTF-8 sequence.
 91 */
 92uint stride(in char[] s, size_t i)
 93{
 94    return UTF8stride[s[i]];
 95}
 96
 97/**
 98 * stride() returns the length of a UTF-16 sequence starting at index i
 99 * in string s.
100 */
101uint stride(in wchar[] s, size_t i)
102{   uint u = s[i];
103    return 1 + (u >= 0xD800 && u <= 0xDBFF);
104}
105
/**
 * stride() returns the length of the UTF-32 sequence starting at index i
 * in string s. Neither s nor i is inspected.
 * Returns: The return value will always be 1.
 */
uint stride(in dchar[] s, size_t i)
{
    return 1;
}
115
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * onUnicodeError() is called when a code unit that cannot start a UTF-8
 * sequence is met on the way, or when i turns out to lie inside a sequence.
 */

size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        uint len = stride(s, j);

        // stride() answers 0xFF for a unit that cannot start a sequence.
        // That marker is not a length: adding it to j would skip 255 units
        // and could let malformed input pass unnoticed.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);
        j += len;
        n++;
    }
    if (j > i)
    {
        // i was not on a sequence boundary.
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}
138
139/** ditto */
size_t toUCSindex(in wchar[] s, size_t i)
{
    // Walk sequence by sequence from the start, counting characters until
    // the requested code-unit index is reached or overshot.
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Overshooting means i pointed into the middle of a sequence.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
156
157/** ditto */
size_t toUCSindex(in dchar[] s, size_t i)
{
    // For UTF-32 the index is returned unchanged; s is not inspected.
    return i;
}
162
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index,
 * i.e. the code-unit offset at which character number n starts.
 */

size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos = 0;

    for (; n != 0; --n)
    {
        uint len = UTF8stride[s[pos]];

        // 0xFF marks a code unit that cannot start a sequence.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += len;
    }
    return pos;
}
180
181/** ditto */
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t pos = 0;

    for (; n != 0; --n)
    {
        immutable wchar unit = s[pos];

        // A unit in 0xD800 .. 0xDBFF opens a two-unit surrogate pair.
        if (unit >= 0xD800 && unit <= 0xDBFF)
            pos += 2;
        else
            pos += 1;
    }
    return pos;
}
193
194/** ditto */
size_t toUTFindex(in dchar[] s, size_t n)
{
    // For UTF-32 the index is returned unchanged; s is not inspected.
    return n;
}
199
200/* =================== Decode ======================= */
201
/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, the error is
 * reported through onUnicodeError() and idx remains unchanged.
 *
 * Only the 1- to 4-byte encodings are accepted. Overlong forms, truncated
 * sequences, bad trailing bytes and results rejected by isValidDchar() are
 * all treated as errors.
 */
dchar decode(in char[] s, ref size_t idx)
    in
    {
        // idx is unsigned, so only the upper bound needs checking.
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        size_t len = s.length;
        dchar V;
        size_t i = idx;
        char u = s[i];

        if (u & 0x80)
        {   uint n;
            char u2;

            /* The following encodings are accepted:
             *  0xxxxxxx
             *  110xxxxx 10xxxxxx
             *  1110xxxx 10xxxxxx 10xxxxxx
             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             * Lead bytes announcing 5 or 6 bytes are rejected.
             */
            // Count the leading 1 bits of the first byte; that count is the
            // sequence length n.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;          // 5 and 6 byte forms are not decoded
                if (((u << n) & 0x80) == 0)
                {
                    if (n == 1)
                        goto Lerr;      // 10xxxxxx cannot start a sequence
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            if (i + (n - 1) >= len)
                goto Lerr;                      // off end of string

            /* The following combinations are overlong, and illegal:
             *  1100000x (10xxxxxx)
             *  11100000 100xxxxx (10xxxxxx)
             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
             * (The 5 and 6 byte overlong forms never get here because
             * n > 4 was rejected above.)
             */
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80))
                goto Lerr;                      // overlong combination

            // Fold in six payload bits from each trailing byte.
            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;                  // trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            if (!isValidDchar(V))
                goto Lerr;
            i += n;
        }
        else
        {
            // Plain ASCII: the code unit is the character.
            V = cast(dchar) u;
            i++;
        }

        idx = i;
        return V;

      Lerr:
        // i still holds the start of the offending sequence here.
        onUnicodeError("invalid UTF-8 sequence", i);
        return V; // dummy return
    }
293
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Well-formed input: ASCII, then a two-byte and a three-byte sequence.
    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed input: every entry must be rejected.
    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (size_t j = 0; j < s4.length; j++)
    {
        // The "must fail" assertion sits outside the try block. Inside it,
        // the catch of Throwable would swallow the assertion failure too,
        // and the case could never fail.
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(s4[j], i);
        }
        catch (Throwable o)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);         // idx is left unchanged on failure
    }
}
345
346/** ditto */
347
dchar decode(in wchar[] s, ref size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        string reason;
        size_t pos = idx;
        uint unit = s[pos];

        if (unit >= 0xD800 && unit <= 0xDBFF)
        {
            // High surrogate: a low surrogate must follow.
            if (pos + 1 == s.length)
            {
                reason = "surrogate UTF-16 high value past end of string";
                goto Lfail;
            }

            uint low = s[pos + 1];

            if (low < 0xDC00 || low > 0xDFFF)
            {
                reason = "surrogate UTF-16 low value out of range";
                goto Lfail;
            }

            // Combine the pair into one code point. 0xD7C0 is
            // 0xD800 - (0x10000 >> 10), so the 0x10000 offset is included.
            unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
            pos += 2;
        }
        else if (unit >= 0xDC00 && unit <= 0xDFFF)
        {
            // Low surrogate with no preceding high surrogate.
            reason = "unpaired surrogate UTF-16 value";
            goto Lfail;
        }
        else if (unit == 0xFFFE || unit == 0xFFFF)
        {
            reason = "illegal UTF-16 value";
            goto Lfail;
        }
        else
        {
            // Any other unit, ASCII included, is a complete character.
            pos++;
        }

        idx = pos;
        return cast(dchar) unit;

      Lfail:
        // pos still equals idx here, so the error names the failing unit.
        onUnicodeError(reason, pos);
        return cast(dchar) unit; // dummy return
    }
403
404/** ditto */
405
dchar decode(in dchar[] s, ref size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    body
    {
        size_t pos = idx;
        dchar c = s[pos];

        if (isValidDchar(c))
        {
            // Valid value: step past it and hand it back.
            idx = pos + 1;
            return c;
        }

        // idx is left untouched on failure.
        onUnicodeError("invalid UTF-32 value", pos);
        return c; // dummy return
    }
425
426
427/* =================== Encode ======================= */
428
/*******************************
 * Encodes character c as UTF-8 and appends the resulting code units
 * to array s[].
 */
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // ASCII is a single code unit and needs no scratch buffer.
        if (c <= 0x7F)
        {
            s ~= cast(char) c;
            return;
        }

        char[4] units;
        size_t count;

        if (c <= 0x7FF)
        {
            units[0] = cast(char)(0xC0 | (c >> 6));
            units[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        else if (c <= 0xFFFF)
        {
            units[0] = cast(char)(0xE0 | (c >> 12));
            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else if (c <= 0x10FFFF)
        {
            units[0] = cast(char)(0xF0 | (c >> 18));
            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else
        {
            assert(0);
        }

        s ~= units[0 .. count];
    }
479
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd".dup;

    // One code unit.
    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // Two code units.
    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    //assert(text == "abcda\u00A9");       // BUG: fix compiler

    // Three code units.
    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
}
498
499/** ditto */
500
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // Values up to 0xFFFF fit into a single UTF-16 code unit.
        if (c <= 0xFFFF)
        {
            s ~= cast(wchar) c;
            return;
        }

        // Everything else becomes a surrogate pair built from the 20 bits
        // that remain after subtracting 0x10000.
        immutable uint offset = c - 0x10000;
        wchar[2] pair;

        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        s ~= pair;
    }
524
525/** ditto */
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // In UTF-32 the character is its own encoding: append it as is.
        s ~= c;
    }
535
/**
Returns the number of code units of type $(D C) needed to encode $(D c).
The length is returned in code units (characters of type $(D C)),
not in bytes.
 */

ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        // UTF-8: one to four code units.
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: one code unit, or two for a surrogate pair.
        return c <= 0xFFFF ? 1 : 2;
    }
    else
    {
        // UTF-32: always one code unit.
        static assert(C.sizeof == 4);
        return 1;
    }
}
564
565/* =================== Validation ======================= */
566
/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). The string is decoded from start
 to end with decode(), which reports the first malformed sequence through
 onUnicodeError(). Use to check all untrusted input for correctness.
 */
void validate(S)(in S s)
{
    immutable end = s.length;
    size_t pos = 0;

    // decode() advances pos itself; the decoded values are discarded.
    while (pos < end)
        decode(s, pos);
}
580
581/* =================== Conversion to UTF8 ======================= */
582
// Encodes c as UTF-8 into the caller's buffer buf and returns the slice of
// buf holding the encoded code units (1 to 4 of them).
char[] toUTF8(out char[4] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        size_t used;

        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            used = 1;
        }
        else if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            used = 2;
        }
        else if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            used = 3;
        }
        else if (c <= 0x10FFFF)
        {
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            used = 4;
        }
        else
        {
            assert(0);
        }

        return buf[0 .. used];
    }
618
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * When s is already a UTF-8 string it is returned as is; its
 * well-formedness is checked in the in contract only.
 */
string toUTF8(string s)
    in { validate(s); }
    body
    {
        return s;
    }
631
632/** ditto */
string toUTF8(in wchar[] s)
{
    // Copy code units straight across for as long as the input is ASCII.
    // At the first non-ASCII unit, truncate the result to what was copied
    // and transcode the rest of the input character by character.
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            r.length = i;
            // foreach with a dchar variable decodes the UTF-16 input.
            // The variable is named d (as in the dchar[] overload) so that
            // it does not shadow the outer c.
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}
658
659/** ditto */
string toUTF8(in dchar[] s)
{
    immutable slen = s.length;
    char[] r;
    size_t i = 0;

    r.length = slen;

    // Fast path: copy the leading run of ASCII characters directly.
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        ++i;
    }

    // Slow path: drop the unused tail and encode everything that is left.
    if (i < slen)
    {
        r.length = i;
        foreach (dchar d; s[i .. slen])
            encode(r, d);
    }
    return cast(string) r;
}
685
686/* =================== Conversion to UTF16 ======================= */
687
// Encodes c as UTF-16 into the caller's buffer buf and returns the slice of
// buf holding the encoded code units (1 or 2 of them).
//
// buf is taken by reference. A static array passed by value is copied, so
// a slice of it would refer to this function's own stack frame and dangle
// as soon as the function returns.
wchar[] toUTF16(ref wchar[2] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c <= 0xFFFF)
        {
            // Fits into a single code unit.
            buf[0] = cast(wchar) c;
            return buf[0 .. 1];
        }
        else
        {
            // Split the 20 bits above 0x10000 into a surrogate pair.
            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }
    }
707
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() additionally appends a terminating zero and returns a pointer,
 * which makes it suitable for calling the 'W' functions in the Win32 API
 * that take an LPWSTR or LPCWSTR argument.
 */
wstring toUTF16(in char[] s)
{
    immutable slen = s.length;
    wchar[] r;

    r.length = slen;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        immutable char lead = s[i];

        if (lead <= 0x7F)
        {
            // ASCII maps one to one.
            r ~= cast(wchar) lead;
            ++i;
        }
        else
        {
            // decode() advances i past the whole sequence.
            encode(r, decode(s, i));
        }
    }
    return cast(wstring) r;
}
736
737alias const(wchar)* wptr;
738/** ditto */
wptr toUTF16z(in char[] s)
{
    immutable slen = s.length;
    wchar[] r;

    r.length = slen + 1;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        immutable char lead = s[i];

        if (lead <= 0x7F)
        {
            // ASCII maps one to one.
            r ~= cast(wchar) lead;
            ++i;
        }
        else
        {
            // decode() advances i past the whole sequence.
            encode(r, decode(s, i));
        }
    }

    // Terminate with a zero code unit and hand out the raw pointer.
    r ~= '\000';
    return r.ptr;
}
763
764/** ditto */
wstring toUTF16(wstring s)
    in { validate(s); }
    body
    {
        // Already UTF-16: returned as is, checked in the in contract only.
        return s;
    }
774
775/** ditto */
wstring toUTF16(in dchar[] s)
{
    wchar[] r;

    r.length = s.length;
    r.length = 0;

    // Every UTF-32 element is one character; encode them one by one.
    foreach (dchar c; s)
        encode(r, c);

    return cast(wstring) r;
}
789
790/* =================== Conversion to UTF32 ======================= */
791
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
dstring toUTF32(in char[] s)
{
    immutable slen = s.length;
    dchar[] r;
    size_t count = 0;

    r.length = slen;            // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            ++i;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // advances i past the whole sequence

        r[count] = c;
        ++count;
    }
    return cast(dstring) r[0 .. count];
}
813
814/** ditto */
dstring toUTF32(in wchar[] s)
{
    immutable slen = s.length;
    dchar[] r;
    size_t count = 0;

    r.length = slen;            // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            ++i;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // advances i past the whole sequence

        r[count] = c;
        ++count;
    }
    return cast(dstring) r[0 .. count];
}
833
834/** ditto */
dstring toUTF32(dstring s)
    in { validate(s); }
    body
    {
        // Already UTF-32: returned as is, checked in the in contract only.
        return s;
    }
844
845/* ================================ tests ================================== */
846
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Takes one sample in all three encodings and checks every conversion
    // between them, in the order 8 -> 16/32, 16 -> 8/32, 32 -> 8/16.
    static void roundTrip(string as8, wstring as16, dstring as32)
    {
        auto c = as8;
        auto w = toUTF16(c);
        assert(w == as16);
        auto d = toUTF32(c);
        assert(d == as32);

        c = toUTF8(w);
        assert(c == as8);
        d = toUTF32(w);
        assert(d == as32);

        c = toUTF8(d);
        assert(c == as8);
        w = toUTF16(d);
        assert(w == as16);
    }

    // Pure ASCII.
    roundTrip("hello", "hello", "hello");

    // A character that needs three UTF-8 code units.
    roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");

    // A character outside the BMP: four UTF-8 units, a UTF-16 surrogate pair.
    roundTrip("he\U000BAAAAllo", "he\U000BAAAAllo", "he\U000BAAAAllo");
}