PageRenderTime 207ms CodeModel.GetById 6ms app.highlight 183ms RepoModel.GetById 3ms app.codeStats 1ms

/std/encoding.d

http://github.com/jcd/phobos
D | 2998 lines | 1908 code | 306 blank | 784 comment | 236 complexity | 7d9a2a8af8eb451b31626eb19efa7784 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1// Written in the D programming language.
   2
   3/**
   4Classes and functions for handling and transcoding between various encodings.
   5
   6For cases where the _encoding is known at compile-time, functions are provided
   7for arbitrary _encoding and decoding of characters, arbitrary transcoding
   8between strings of different type, as well as validation and sanitization.
   9
  10Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
  11(also known as LATIN-1), and WINDOWS-1252.
  12
  13$(UL
  14$(LI The type $(D AsciiChar) represents an ASCII character.)
  15$(LI The type $(D AsciiString) represents an ASCII string.)
  16$(LI The type $(D Latin1Char) represents an ISO-8859-1 character.)
  17$(LI The type $(D Latin1String) represents an ISO-8859-1 string.)
  18$(LI The type $(D Windows1252Char) represents a Windows-1252 character.)
  19$(LI The type $(D Windows1252String) represents a Windows-1252 string.))
  20
  21For cases where the _encoding is not known at compile-time, but is
  22known at run-time, we provide the abstract class $(D EncodingScheme)
  23and its subclasses.  To construct a run-time encoder/decoder, one does
  24e.g.
  25
  26----------------------------------------------------
  27    auto e = EncodingScheme.create("utf-8");
  28----------------------------------------------------
  29
  30This library supplies $(D EncodingScheme) subclasses for ASCII,
  31ISO-8859-1 (also known as LATIN-1), WINDOWS-1252, UTF-8, and (on
  32little-endian architectures) UTF-16LE and UTF-32LE; or (on big-endian
  33architectures) UTF-16BE and UTF-32BE.
  34
  35This library provides a mechanism whereby other modules may add $(D
  36EncodingScheme) subclasses for any other _encoding.
  37
  38Macros:
  39    WIKI=Phobos/StdEncoding
  40
  41Copyright: Copyright Janice Caron 2008 - 2009.
  42License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
  43Authors:   Janice Caron
  44Source:    $(PHOBOSSRC std/_encoding.d)
  45*/
  46/*
  47         Copyright Janice Caron 2008 - 2009.
  48Distributed under the Boost Software License, Version 1.0.
  49   (See accompanying file LICENSE_1_0.txt or copy at
  50         http://www.boost.org/LICENSE_1_0.txt)
  51*/
  52module std.encoding;
  53
  54import std.string;
  55import std.traits;
  56import std.range;
  57
  58unittest
  59{
  60    static ubyte[][] validStrings =
  61    [
  62        // Plain ASCII
  63        cast(ubyte[])"hello",
  64
  65        // First possible sequence of a certain length
  66        [ 0x00 ],                       // U+00000000   one byte
  67        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
  68        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
  69        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   three bytes
  70
  71        // Last possible sequence of a certain length
  72        [ 0x7F ],                       // U+0000007F   one byte
  73        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
  74        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes
  75
  76        // Other boundary conditions
  77        [ 0xED, 0x9F, 0xBF ],
  78        // U+0000D7FF   Last character before surrogates
  79        [ 0xEE, 0x80, 0x80 ],
  80        // U+0000E000   First character after surrogates
  81        [ 0xEF, 0xBF, 0xBD ],
  82        // U+0000FFFD   Unicode replacement character
  83        [ 0xF4, 0x8F, 0xBF, 0xBF ],
  84        // U+0010FFFF   Very last character
  85
  86        // Non-character code points
  87        /*  NOTE: These are legal in UTF, and may be converted from
  88            one UTF to another, however they do not represent Unicode
  89            characters. These code points have been reserved by
  90            Unicode as non-character code points. They are permissible
  91            for data exchange within an application, but they are are
  92            not permitted to be used as characters. Since this module
  93            deals with UTF, and not with Unicode per se, we choose to
  94            accept them here. */
  95        [ 0xDF, 0xBE ],                 // U+0000FFFE
  96        [ 0xDF, 0xBF ],                 // U+0000FFFF
  97    ];
  98
  99    static ubyte[][] invalidStrings =
 100    [
 101        // First possible sequence of a certain length, but greater
 102        // than U+10FFFF
 103        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
 104        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes
 105
 106        // Last possible sequence of a certain length, but greater than U+10FFFF
 107        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
 108        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
 109        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes
 110
 111        // Other boundary conditions
 112        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
 113                                                    // First code
 114                                                    // point after
 115                                                    // last character
 116
 117        // Unexpected continuation bytes
 118        [ 0x80 ],
 119        [ 0xBF ],
 120        [ 0x20, 0x80, 0x20 ],
 121        [ 0x20, 0xBF, 0x20 ],
 122        [ 0x80, 0x9F, 0xA0 ],
 123
 124        // Lonely start bytes
 125        [ 0xC0 ],
 126        [ 0xCF ],
 127        [ 0x20, 0xC0, 0x20 ],
 128        [ 0x20, 0xCF, 0x20 ],
 129        [ 0xD0 ],
 130        [ 0xDF ],
 131        [ 0x20, 0xD0, 0x20 ],
 132        [ 0x20, 0xDF, 0x20 ],
 133        [ 0xE0 ],
 134        [ 0xEF ],
 135        [ 0x20, 0xE0, 0x20 ],
 136        [ 0x20, 0xEF, 0x20 ],
 137        [ 0xF0 ],
 138        [ 0xF1 ],
 139        [ 0xF2 ],
 140        [ 0xF3 ],
 141        [ 0xF4 ],
 142        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
 143        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
 144        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF
 145
 146        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
 147        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
 148        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above
 149
 150        // Impossible bytes
 151        [ 0xF8 ],
 152        [ 0xF9 ],
 153        [ 0xFA ],
 154        [ 0xFB ],
 155        [ 0xFC ],
 156        [ 0xFD ],
 157        [ 0xFE ],
 158        [ 0xFF ],
 159        [ 0x20, 0xF8, 0x20 ],
 160        [ 0x20, 0xF9, 0x20 ],
 161        [ 0x20, 0xFA, 0x20 ],
 162        [ 0x20, 0xFB, 0x20 ],
 163        [ 0x20, 0xFC, 0x20 ],
 164        [ 0x20, 0xFD, 0x20 ],
 165        [ 0x20, 0xFE, 0x20 ],
 166        [ 0x20, 0xFF, 0x20 ],
 167
 168        // Overlong sequences, all representing U+002F
 169        /*  With a safe UTF-8 decoder, all of the following five overlong
 170            representations of the ASCII character slash ("/") should be
 171            rejected like a malformed UTF-8 sequence */
 172        [ 0xC0, 0xAF ],
 173        [ 0xE0, 0x80, 0xAF ],
 174        [ 0xF0, 0x80, 0x80, 0xAF ],
 175        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
 176        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],
 177
 178        // Maximum overlong sequences
 179        /*  Below you see the highest Unicode value that is still resulting in
 180            an overlong sequence if represented with the given number of bytes.
 181            This is a boundary test for safe UTF-8 decoders. All five
 182            characters should be rejected like malformed UTF-8 sequences. */
 183        [ 0xC1, 0xBF ],                             // U+0000007F
 184        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
 185        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
 186        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
 187        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF
 188
 189        // Overlong representation of the NUL character
 190        /*  The following five sequences should also be rejected like malformed
 191            UTF-8 sequences and should not be treated like the ASCII NUL
 192            character. */
 193        [ 0xC0, 0x80 ],
 194        [ 0xE0, 0x80, 0x80 ],
 195        [ 0xF0, 0x80, 0x80, 0x80 ],
 196        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
 197        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],
 198
 199        // Illegal code positions
 200        /*  The following UTF-8 sequences should be rejected like malformed
 201            sequences, because they never represent valid ISO 10646 characters
 202            and a UTF-8 decoder that accepts them might introduce security
 203            problems comparable to overlong UTF-8 sequences. */
 204        [ 0xED, 0xA0, 0x80 ],       // U+D800
 205        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
 206        [ 0xED, 0xAE, 0x80 ],       // U+DB80
 207        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
 208        [ 0xED, 0xB0, 0x80 ],       // U+DC00
 209        [ 0xED, 0xBE, 0x80 ],       // U+DF80
 210        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
 211    ];
 212
 213    static string[] sanitizedStrings =
 214    [
 215        "\uFFFD","\uFFFD",
 216        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
 217        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
 218        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
 219        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 220        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
 221        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
 222        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
 223        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 224        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 225        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
 226    ];
 227
 228    // Make sure everything that should be valid, is
 229    foreach(a;validStrings)
 230    {
 231        string s = cast(string)a;
 232        assert(isValid(s),"Failed to validate: "~makeReadable(s));
 233    }
 234
 235    // Make sure everything that shouldn't be valid, isn't
 236    foreach(a;invalidStrings)
 237    {
 238        string s = cast(string)a;
 239        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
 240    }
 241
 242    // Make sure we can sanitize everything bad
 243    assert(invalidStrings.length == sanitizedStrings.length);
 244    for(int i=0; i<invalidStrings.length; ++i)
 245    {
 246        string s = cast(string)invalidStrings[i];
 247        string t = sanitize(s);
 248        assert(isValid(t));
 249        assert(t == sanitizedStrings[i]);
 250        ubyte[] u = cast(ubyte[])t;
 251        validStrings ~= u;
 252    }
 253
 254    // Make sure all transcodings work in both directions, using both forward
 255    // and reverse iteration
 256    foreach(a; validStrings)
 257    {
 258        string s = cast(string)a;
 259        string s2;
 260        wstring ws, ws2;
 261        dstring ds, ds2;
 262
 263        transcode(s,ws);
 264        assert(isValid(ws));
 265        transcode(ws,s2);
 266        assert(s == s2);
 267
 268        transcode(s,ds);
 269        assert(isValid(ds));
 270        transcode(ds,s2);
 271        assert(s == s2);
 272
 273        transcode(ws,s);
 274        assert(isValid(s));
 275        transcode(s,ws2);
 276        assert(ws == ws2);
 277
 278        transcode(ws,ds);
 279        assert(isValid(ds));
 280        transcode(ds,ws2);
 281        assert(ws == ws2);
 282
 283        transcode(ds,s);
 284        assert(isValid(s));
 285        transcode(s,ds2);
 286        assert(ds == ds2);
 287
 288        transcode(ds,ws);
 289        assert(isValid(ws));
 290        transcode(ws,ds2);
 291        assert(ds == ds2);
 292
 293        transcodeReverse(s,ws);
 294        assert(isValid(ws));
 295        transcodeReverse(ws,s2);
 296        assert(s == s2);
 297
 298        transcodeReverse(s,ds);
 299        assert(isValid(ds));
 300        transcodeReverse(ds,s2);
 301        assert(s == s2);
 302
 303        transcodeReverse(ws,s);
 304        assert(isValid(s));
 305        transcodeReverse(s,ws2);
 306        assert(ws == ws2);
 307
 308        transcodeReverse(ws,ds);
 309        assert(isValid(ds));
 310        transcodeReverse(ds,ws2);
 311        assert(ws == ws2);
 312
 313        transcodeReverse(ds,s);
 314        assert(isValid(s));
 315        transcodeReverse(s,ds2);
 316        assert(ds == ds2);
 317
 318        transcodeReverse(ds,ws);
 319        assert(isValid(ws));
 320        transcodeReverse(ws,ds2);
 321        assert(ds == ds2);
 322    }
 323
 324    // Make sure the non-UTF encodings work too
 325    {
 326        auto s = "\u20AC100";
 327        Windows1252String t;
 328        transcode(s,t);
 329        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
 330        string u;
 331        transcode(s,u);
 332        assert(s == u);
 333        Latin1String v;
 334        transcode(s,v);
 335        assert(cast(string)v == "?100");
 336        AsciiString w;
 337        transcode(v,w);
 338        assert(cast(string)w == "?100");
 339    }
 340
 341    // Make sure we can count properly
 342    {
 343        assert(encodedLength!(char)('A') == 1);
 344        assert(encodedLength!(char)('\u00E3') == 2);
 345        assert(encodedLength!(char)('\u2028') == 3);
 346        assert(encodedLength!(char)('\U0010FFF0') == 4);
 347        assert(encodedLength!(wchar)('A') == 1);
 348        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
 349    }
 350
 351    // Make sure we can write into mutable arrays
 352    {
 353        char[4] buffer;
 354        auto n = encode(cast(dchar)'\u00E3',buffer);
 355        assert(n == 2);
 356        assert(buffer[0] == 0xC3);
 357        assert(buffer[1] == 0xA3);
 358    }
 359}
 360
 361//=============================================================================
 362
 363/** Special value returned by $(D safeDecode) */
 364enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;
 365
 366template EncoderFunctions()
 367{
 368    // Various forms of read
 369
 370    template ReadFromString()
 371    {
 372        @property bool canRead() { return s.length != 0; }
 373        E peek() { return s[0]; }
 374        E read() { E t = s[0]; s = s[1..$]; return t; }
 375    }
 376
 377    template ReverseReadFromString()
 378    {
 379        @property bool canRead() { return s.length != 0; }
 380        E peek() { return s[$-1]; }
 381        E read() { E t = s[$-1]; s = s[0..$-1]; return t; }
 382    }
 383
 384    // Various forms of Write
 385
 386    template WriteToString()
 387    {
 388        E[] s;
 389        void write(E c) { s ~= c; }
 390    }
 391
 392    template WriteToArray()
 393    {
 394        void write(E c) { array[0] = c; array = array[1..$]; }
 395    }
 396
 397    template WriteToDelegate()
 398    {
 399        void write(E c) { dg(c); }
 400    }
 401
 402    // Functions we will export
 403
 404    template EncodeViaWrite()
 405    {
 406        mixin encodeViaWrite;
 407        void encode(dchar c) { encodeViaWrite(c); }
 408    }
 409
 410    template SkipViaRead()
 411    {
 412        mixin skipViaRead;
 413        void skip() { skipViaRead(); }
 414    }
 415
 416    template DecodeViaRead()
 417    {
 418        mixin decodeViaRead;
 419        dchar decode() { return decodeViaRead(); }
 420    }
 421
 422    template SafeDecodeViaRead()
 423    {
 424        mixin safeDecodeViaRead;
 425        dchar safeDecode() { return safeDecodeViaRead(); }
 426    }
 427
 428    template DecodeReverseViaRead()
 429    {
 430        mixin decodeReverseViaRead;
 431        dchar decodeReverse() { return decodeReverseViaRead(); }
 432    }
 433
 434    // Encoding to different destinations
 435
 436    template EncodeToString()
 437    {
 438        mixin WriteToString;
 439        mixin EncodeViaWrite;
 440    }
 441
 442    template EncodeToArray()
 443    {
 444        mixin WriteToArray;
 445        mixin EncodeViaWrite;
 446    }
 447
 448    template EncodeToDelegate()
 449    {
 450        mixin WriteToDelegate;
 451        mixin EncodeViaWrite;
 452    }
 453
 454    // Decoding functions
 455
 456    template SkipFromString()
 457    {
 458        mixin ReadFromString;
 459        mixin SkipViaRead;
 460    }
 461
 462    template DecodeFromString()
 463    {
 464        mixin ReadFromString;
 465        mixin DecodeViaRead;
 466    }
 467
 468    template SafeDecodeFromString()
 469    {
 470        mixin ReadFromString;
 471        mixin SafeDecodeViaRead;
 472    }
 473
 474    template DecodeReverseFromString()
 475    {
 476        mixin ReverseReadFromString;
 477        mixin DecodeReverseViaRead;
 478    }
 479
 480    //=========================================================================
 481
 482    // Below are the functions we will ultimately expose to the user
 483
 484    E[] encode(dchar c)
 485    {
 486        mixin EncodeToString e;
 487        e.encode(c);
 488        return e.s;
 489    }
 490
 491    void encode(dchar c, ref E[] array)
 492    {
 493        mixin EncodeToArray e;
 494        e.encode(c);
 495    }
 496
 497    void encode(dchar c, void delegate(E) dg)
 498    {
 499        mixin EncodeToDelegate e;
 500        e.encode(c);
 501    }
 502
 503    void skip(ref const(E)[] s)
 504    {
 505        mixin SkipFromString e;
 506        e.skip();
 507    }
 508
 509    dchar decode(S)(ref S s)
 510    {
 511        mixin DecodeFromString e;
 512        return e.decode();
 513    }
 514
 515    dchar safeDecode(S)(ref S s)
 516    {
 517        mixin SafeDecodeFromString e;
 518        return e.safeDecode();
 519    }
 520
 521    dchar decodeReverse(ref const(E)[] s)
 522    {
 523        mixin DecodeReverseFromString e;
 524        return e.decodeReverse();
 525    }
 526}
 527
 528//=========================================================================
 529
 530struct CodePoints(E)
 531{
 532    const(E)[] s;
 533
 534    this(const(E)[] s)
 535    in
 536    {
 537        assert(isValid(s));
 538    }
 539    body
 540    {
 541        this.s = s;
 542    }
 543
 544    int opApply(scope int delegate(ref dchar) dg)
 545    {
 546        int result = 0;
 547        while (s.length != 0)
 548        {
 549            dchar c = decode(s);
 550            result = dg(c);
 551            if (result != 0) break;
 552        }
 553        return result;
 554    }
 555
 556    int opApply(scope int delegate(ref size_t, ref dchar) dg)
 557    {
 558        size_t i = 0;
 559        int result = 0;
 560        while (s.length != 0)
 561        {
 562            size_t len = s.length;
 563            dchar c = decode(s);
 564            size_t j = i; // We don't want the delegate corrupting i
 565            result = dg(j,c);
 566            if (result != 0) break;
 567            i += len - s.length;
 568        }
 569        return result;
 570    }
 571
 572    int opApplyReverse(scope int delegate(ref dchar) dg)
 573    {
 574        int result = 0;
 575        while (s.length != 0)
 576        {
 577            dchar c = decodeReverse(s);
 578            result = dg(c);
 579            if (result != 0) break;
 580        }
 581        return result;
 582    }
 583
 584    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
 585    {
 586        int result = 0;
 587        while (s.length != 0)
 588        {
 589            dchar c = decodeReverse(s);
 590            size_t i = s.length;
 591            result = dg(i,c);
 592            if (result != 0) break;
 593        }
 594        return result;
 595    }
 596}
 597
 598struct CodeUnits(E)
 599{
 600    E[] s;
 601
 602    this(dchar d)
 603    in
 604    {
 605        assert(isValidCodePoint(d));
 606    }
 607    body
 608    {
 609        s = encode!(E)(d);
 610    }
 611
 612    int opApply(scope int delegate(ref E) dg)
 613    {
 614        int result = 0;
 615        foreach(E c;s)
 616        {
 617            result = dg(c);
 618            if (result != 0) break;
 619        }
 620        return result;
 621    }
 622
 623    int opApplyReverse(scope int delegate(ref E) dg)
 624    {
 625        int result = 0;
 626        foreach_reverse(E c;s)
 627        {
 628            result = dg(c);
 629            if (result != 0) break;
 630        }
 631        return result;
 632    }
 633}
 634
 635//=============================================================================
 636
 637template EncoderInstance(E)
 638{
 639    static assert(false,"Cannot instantiate EncoderInstance for type "
 640        ~ E.stringof);
 641}
 642
 643//=============================================================================
 644//          ASCII
 645//=============================================================================
 646
 647/** Defines various character sets. */
 648enum AsciiChar : ubyte { init }
 649/// Ditto
 650alias immutable(AsciiChar)[] AsciiString;
 651
 652template EncoderInstance(CharType : AsciiChar)
 653{
 654    alias AsciiChar E;
 655    alias AsciiString EString;
 656
 657    @property string encodingName()
 658    {
 659        return "ASCII";
 660    }
 661
 662    bool canEncode(dchar c)
 663    {
 664        return c < 0x80;
 665    }
 666
 667    bool isValidCodeUnit(AsciiChar c)
 668    {
 669        return c < 0x80;
 670    }
 671
 672    size_t encodedLength(dchar c)
 673    in
 674    {
 675        assert(canEncode(c));
 676    }
 677    body
 678    {
 679        return 1;
 680    }
 681
 682    void encodeX(Range)(dchar c, Range r)
 683    {
 684        if (!canEncode(c)) c = '?';
 685        r.write(cast(AsciiChar) c);
 686    }
 687
 688    void encodeViaWrite()(dchar c)
 689    {
 690        if (!canEncode(c)) c = '?';
 691        write(cast(AsciiChar)c);
 692    }
 693
 694    void skipViaRead()()
 695    {
 696        read();
 697    }
 698
 699    dchar decodeViaRead()()
 700    {
 701        return read();
 702    }
 703
 704    dchar safeDecodeViaRead()()
 705    {
 706        dchar c = read();
 707        return canEncode(c) ? c : INVALID_SEQUENCE;
 708    }
 709
 710    dchar decodeReverseViaRead()()
 711    {
 712        return read();
 713    }
 714
 715    @property EString replacementSequence()
 716    {
 717        return cast(EString)("?");
 718    }
 719
 720    mixin EncoderFunctions;
 721}
 722
 723//=============================================================================
 724//          ISO-8859-1
 725//=============================================================================
 726
 727/** Defines an Latin1-encoded character. */
 728enum Latin1Char : ubyte { init }
 729/**
 730Defines an Latin1-encoded string (as an array of $(D
 731immutable(Latin1Char))).
 732 */
 733alias immutable(Latin1Char)[] Latin1String; ///
 734
 735template EncoderInstance(CharType : Latin1Char)
 736{
 737    alias Latin1Char E;
 738    alias Latin1String EString;
 739
 740    @property string encodingName()
 741    {
 742        return "ISO-8859-1";
 743    }
 744
 745    bool canEncode(dchar c)
 746    {
 747        return c < 0x100;
 748    }
 749
 750    bool isValidCodeUnit(Latin1Char c)
 751    {
 752        return true;
 753    }
 754
 755    size_t encodedLength(dchar c)
 756    in
 757    {
 758        assert(canEncode(c));
 759    }
 760    body
 761    {
 762                return 1;
 763    }
 764
 765    void encodeViaWrite()(dchar c)
 766    {
 767        if (!canEncode(c)) c = '?';
 768        write(cast(Latin1Char)c);
 769    }
 770
 771    void skipViaRead()()
 772    {
 773        read();
 774    }
 775
 776    dchar decodeViaRead()()
 777    {
 778        return read();
 779    }
 780
 781    dchar safeDecodeViaRead()()
 782    {
 783        return read();
 784    }
 785
 786    dchar decodeReverseViaRead()()
 787    {
 788        return read();
 789    }
 790
 791    @property EString replacementSequence()
 792    {
 793        return cast(EString)("?");
 794    }
 795
 796    mixin EncoderFunctions;
 797}
 798
 799//=============================================================================
 800//          WINDOWS-1252
 801//=============================================================================
 802
 803/** Defines a Windows1252-encoded character. */
 804enum Windows1252Char : ubyte { init }
 805/**
 806Defines an Windows1252-encoded string (as an array of $(D
 807immutable(Windows1252Char))).
 808 */
 809alias immutable(Windows1252Char)[] Windows1252String; ///
 810
 811template EncoderInstance(CharType : Windows1252Char)
 812{
 813    alias Windows1252Char E;
 814    alias Windows1252String EString;
 815
 816    @property string encodingName()
 817    {
 818        return "windows-1252";
 819    }
 820
 821    immutable wstring charMap =
 822        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"
 823        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"
 824        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2103\u2014"
 825        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178"
 826    ;
 827
 828    bool canEncode(dchar c)
 829    {
 830        if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return true;
 831        if (c >= 0xFFFD) return false;
 832        foreach(wchar d;charMap) { if (c == d) return true; }
 833        return false;
 834    }
 835
 836    bool isValidCodeUnit(Windows1252Char c)
 837    {
 838        if (c < 0x80 || c >= 0xA0) return true;
 839        return (charMap[c-0x80] != 0xFFFD);
 840    }
 841
 842    size_t encodedLength(dchar c)
 843    in
 844    {
 845        assert(canEncode(c));
 846    }
 847    body
 848    {
 849        return 1;
 850    }
 851
 852    void encodeViaWrite()(dchar c)
 853    {
 854        if (c < 0x80 || (c >= 0xA0 && c < 0x100)) {}
 855        else if (c >= 0xFFFD) { c = '?'; }
 856        else
 857        {
 858            ptrdiff_t n = -1;
 859            foreach (i, wchar d; charMap)
 860            {
 861                if (c == d)
 862                {
 863                    n = i;
 864                    break;
 865                }
 866            }
 867            c = n == -1 ? '?' : 0x80 + cast(dchar) n;
 868        }
 869        write(cast(Windows1252Char)c);
 870    }
 871
 872    void skipViaRead()()
 873    {
 874        read();
 875    }
 876
 877    dchar decodeViaRead()()
 878    {
 879        Windows1252Char c = read();
 880        return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
 881    }
 882
 883    dchar safeDecodeViaRead()()
 884    {
 885        Windows1252Char c = read();
 886        dchar d = (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
 887        return d == 0xFFFD ? INVALID_SEQUENCE : d;
 888    }
 889
 890    dchar decodeReverseViaRead()()
 891    {
 892        Windows1252Char c = read();
 893        return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
 894    }
 895
 896    @property EString replacementSequence()
 897    {
 898        return cast(EString)("?");
 899    }
 900
 901    mixin EncoderFunctions;
 902}
 903
 904//=============================================================================
 905//          UTF-8
 906//=============================================================================
 907
 908template EncoderInstance(CharType : char)
 909{
 910    alias char E;
 911    alias immutable(char)[] EString;
 912
 913    @property string encodingName()
 914    {
 915        return "UTF-8";
 916    }
 917
 918    bool canEncode(dchar c)
 919    {
 920        return isValidCodePoint(c);
 921    }
 922
 923    bool isValidCodeUnit(char c)
 924    {
 925        return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
 926    }
 927
 928    immutable ubyte[128] tailTable =
 929    [
 930        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 931        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 932        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 933        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 934        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 935        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 936        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 937        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
 938    ];
 939
 940    private int tails(char c)
 941    in
 942    {
 943        assert(c >= 0x80);
 944    }
 945    body
 946    {
 947        return tailTable[c-0x80];
 948    }
 949
 950    size_t encodedLength(dchar c)
 951    in
 952    {
 953        assert(canEncode(c));
 954    }
 955    body
 956    {
 957        if (c < 0x80) return 1;
 958        if (c < 0x800) return 2;
 959        if (c < 0x10000) return 3;
 960        return 4;
 961    }
 962
 963    void encodeViaWrite()(dchar c)
 964    {
 965        if (c < 0x80)
 966        {
 967            write(cast(char)c);
 968        }
 969        else if (c < 0x800)
 970        {
 971            write(cast(char)((c >> 6) + 0xC0));
 972            write(cast(char)((c & 0x3F) + 0x80));
 973        }
 974        else if (c < 0x10000)
 975        {
 976            write(cast(char)((c >> 12) + 0xE0));
 977            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
 978            write(cast(char)((c & 0x3F) + 0x80));
 979        }
 980        else
 981        {
 982            write(cast(char)((c >> 18) + 0xF0));
 983            write(cast(char)(((c >> 12) & 0x3F) + 0x80));
 984            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
 985            write(cast(char)((c & 0x3F) + 0x80));
 986        }
 987    }
 988
 989    void skipViaRead()()
 990    {
 991        auto c = read();
 992        if (c < 0xC0) return;
 993        int n = tails(cast(char) c);
 994        for (size_t i=0; i<n; ++i)
 995        {
 996            read();
 997        }
 998    }
 999
1000    dchar decodeViaRead()()
1001    {
1002        dchar c = read();
1003        if (c < 0xC0) return c;
1004        int n = tails(cast(char) c);
1005        c &= (1 << (6 - n)) - 1;
1006        for (size_t i=0; i<n; ++i)
1007        {
1008            c = (c << 6) + (read() & 0x3F);
1009        }
1010        return c;
1011    }
1012
1013    dchar safeDecodeViaRead()()
1014    {
1015        dchar c = read();
1016        if (c < 0x80) return c;
1017        int n = tails(cast(char) c);
1018        if (n == 0) return INVALID_SEQUENCE;
1019
1020        if (!canRead) return INVALID_SEQUENCE;
1021        size_t d = peek();
1022        bool err =
1023        (
1024            (c < 0xC2)                              // fail overlong 2-byte sequences
1025        ||  (c > 0xF4)                              // fail overlong 4-6-byte sequences
1026        ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
1027        ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
1028        ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
1029        ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
1030        );
1031
1032        c &= (1 << (6 - n)) - 1;
1033        for (size_t i=0; i<n; ++i)
1034        {
1035            if (!canRead) return INVALID_SEQUENCE;
1036            d = peek();
1037            if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
1038            c = (c << 6) + (read() & 0x3F);
1039        }
1040
1041        return err ? INVALID_SEQUENCE : c;
1042    }
1043
1044    dchar decodeReverseViaRead()()
1045    {
1046        dchar c = read();
1047        if (c < 0x80) return c;
1048        size_t shift = 0;
1049        c &= 0x3F;
1050        for (size_t i=0; i<4; ++i)
1051        {
1052            shift += 6;
1053            auto d = read();
1054            size_t n = tails(cast(char) d);
1055            size_t mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
1056            c += ((d & mask) << shift);
1057            if (n != 0) break;
1058        }
1059        return c;
1060    }
1061
1062    @property EString replacementSequence()
1063    {
1064        return "\uFFFD";
1065    }
1066
1067    mixin EncoderFunctions;
1068}
1069
1070//=============================================================================
1071//          UTF-16
1072//=============================================================================
1073
template EncoderInstance(CharType : wchar)
{
    alias wchar E;
    alias immutable(wchar)[] EString;

    // Name reported for this encoding.
    @property string encodingName()
    {
        return "UTF-16";
    }

    // A code point can be encoded exactly when it is a valid code point.
    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    // Every 16-bit value is accepted as a code unit.
    bool isValidCodeUnit(wchar c)
    {
        return true;
    }

    // Number of code units needed for c: one below 0x10000, otherwise two.
    size_t encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x10000)
            return 1;
        return 2;
    }

    // Writes c as a single unit, or as a pair of units based at 0xD800 and
    // 0xDC00 carrying the high and low ten bits of (c - 0x10000).
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x10000)
        {
            write(cast(wchar)c);
            return;
        }

        size_t offset = c - 0x10000;
        write(cast(wchar)(0xD800 + (offset >> 10)));
        write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
    }

    // Consumes one encoded character: one unit, or two when the first
    // unit lies in 0xD800 .. 0xDFFF.
    void skipViaRead()()
    {
        wchar first = read();
        bool inSurrogateRange = first >= 0xD800 && first < 0xE000;
        if (inSurrogateRange)
        {
            read();
        }
    }

    // Decodes one code point. No validation is performed.
    dchar decodeViaRead()()
    {
        wchar first = read();
        if (!(first >= 0xD800 && first < 0xE000)) return cast(dchar)first;

        // Two-unit form: combine the ten payload bits of each unit.
        wchar second = read();
        first &= 0x3FF;
        second &= 0x3FF;
        return 0x10000 + (first << 10) + second;
    }

    // Decodes one code point from possibly malformed input, returning
    // INVALID_SEQUENCE for an unpaired or truncated pair.
    dchar safeDecodeViaRead()()
    {
        wchar first = read();
        if (first < 0xD800 || first >= 0xE000) return cast(dchar)first;

        // A unit in 0xDC00 .. 0xDFFF cannot come first, and a unit in
        // 0xD800 .. 0xDBFF must be followed by another unit.
        if (first >= 0xDC00 || !canRead) return INVALID_SEQUENCE;

        // Look before consuming, so that a bad second unit stays in the input.
        wchar second = peek();
        if (second < 0xDC00 || second >= 0xE000) return INVALID_SEQUENCE;
        second = read();

        first &= 0x3FF;
        second &= 0x3FF;
        return 0x10000 + (first << 10) + second;
    }

    // Decodes one code point while reading back to front: the first unit
    // read is the last unit of the sequence. No validation is performed.
    dchar decodeReverseViaRead()()
    {
        wchar last = read();
        if (!(last >= 0xD800 && last < 0xE000)) return cast(dchar)last;

        // The unit read second precedes `last` and supplies the high bits.
        wchar previous = read();
        last &= 0x3FF;
        previous &= 0x3FF;
        return 0x10000 + (previous << 10) + last;
    }

    // The code units substituted for a malformed sequence: U+FFFD.
    @property EString replacementSequence()
    {
        EString replacement = "\uFFFD"w;
        return replacement;
    }

    mixin EncoderFunctions;
}
1166
1167//=============================================================================
1168//          UTF-32
1169//=============================================================================
1170
template EncoderInstance(CharType : dchar)
{
    alias dchar E;
    alias immutable(dchar)[] EString;

    // Name reported for this encoding.
    @property string encodingName()
    {
        return "UTF-32";
    }

    // A code point can be encoded exactly when it is a valid code point.
    bool canEncode(dchar c)
    {
        bool encodable = isValidCodePoint(c);
        return encodable;
    }

    // A code unit is legal exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c)
    {
        bool legal = isValidCodePoint(c);
        return legal;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // The code point is written unchanged as its own code unit.
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    // Consumes one code unit.
    void skipViaRead()()
    {
        read();
    }

    // Decodes one code point: the next code unit. No validation.
    dchar decodeViaRead()()
    {
        dchar unit = cast(dchar)read();
        return unit;
    }

    // Decodes one code point, returning INVALID_SEQUENCE when the unit
    // read is not a valid code point.
    dchar safeDecodeViaRead()()
    {
        dchar unit = read();
        if (isValidCodePoint(unit))
            return unit;
        return INVALID_SEQUENCE;
    }

    // Reverse decoding reads a single unit, exactly like forward decoding.
    dchar decodeReverseViaRead()()
    {
        dchar unit = cast(dchar)read();
        return unit;
    }

    // The code units substituted for a malformed sequence: U+FFFD.
    @property EString replacementSequence()
    {
        EString replacement = "\uFFFD"d;
        return replacement;
    }

    mixin EncoderFunctions;
}
1234
1235//=============================================================================
1236// Below are forwarding functions which expose the function to the user
1237
/**
 Returns true if c is a valid code point.

 A value is valid when it is below 0x110000 and lies outside the range
 0xD800 .. 0xDFFF.

 Note that this includes the non-character code points U+FFFE and U+FFFF,
 since these are valid code points (even though they are not valid
 characters).

 Supersedes:
 This function supersedes $(D std.utf.startsValidDchar()).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool isValidCodePoint(dchar c)
{
    // Anything at or above 0x110000 is out of range.
    if (c >= 0x110000)
        return false;

    // Within range, only 0xD800 .. 0xDFFF is excluded.
    return c < 0xD800 || c >= 0xE000;
}
1257
/**
 Returns the name of an encoding.

 The encoding cannot be deduced from any argument, so the code unit type
 must be given explicitly as the template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Examples:
 -----------------------------------
 assert(encodingName!(Latin1Char) == "ISO-8859-1");
 -----------------------------------
 */
@property string encodingName(T)()
{
    // Forward to the encoder selected by the code unit type.
    alias EncoderInstance!(T) Encoder;
    return Encoder.encodingName;
}
1275
unittest
{
    // Single-byte encodings.
    assert("ASCII" == encodingName!(AsciiChar));
    assert("ISO-8859-1" == encodingName!(Latin1Char));
    assert("windows-1252" == encodingName!(Windows1252Char));

    // UTF encodings.
    assert("UTF-8" == encodingName!(char));
    assert("UTF-16" == encodingName!(wchar));
    assert("UTF-32" == encodingName!(dchar));
}
1285
/**
 Returns true iff it is possible to represent the specified code point
 in the encoding.

 The encoding cannot be deduced from the argument, so the code unit type
 must be given explicitly as the template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be tested

 Examples:
 -----------------------------------
 assert(canEncode!(Latin1Char)('A'));
 -----------------------------------
 */
bool canEncode(E)(dchar c)
{
    // Forward to the encoder selected by the code unit type.
    alias EncoderInstance!(E) Encoder;
    return Encoder.canEncode(c);
}
1304
unittest
{
    // Code points the encoding can represent.
    assert(canEncode!(Latin1Char)('\u00A0'));
    assert(canEncode!(Windows1252Char)('\u20AC'));

    // Code points the encoding cannot represent.
    assert(!canEncode!(AsciiChar)('\u00A0'));
    assert(!canEncode!(Windows1252Char)('\u20AD'));
    assert(!canEncode!(Windows1252Char)('\uFFFD'));

    // A value beyond the code point range.
    assert(!canEncode!(char)(cast(dchar)0x110000));
}
1314
/**
 Returns true if the code unit is legal. For example, the byte 0x80 would
 not be legal in ASCII, because ASCII code units must always be in the range
 0x00 to 0x7F.

 The encoding is selected by the type of the argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    // Forward to the encoder selected by the code unit type.
    alias EncoderInstance!(E) Encoder;
    return Encoder.isValidCodeUnit(c);
}
1329
unittest
{
    // Legal code units.
    assert( isValidCodeUnit(cast(Windows1252Char)0x80));
    assert( isValidCodeUnit(cast(wchar)0xD800));

    // Illegal code units.
    assert(!isValidCodeUnit(cast(AsciiChar)0xA0));
    assert(!isValidCodeUnit(cast(Windows1252Char)0x81));
    assert(!isValidCodeUnit(cast(char)0xC0));
    assert(!isValidCodeUnit(cast(char)0xFF));
    assert(!isValidCodeUnit(cast(dchar)0xD800));
}
1340
/**
 Returns true if the string is encoded correctly.

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    // The string is valid when its longest valid prefix is the whole string.
    return validLength(s) == s.length;
}
1358
unittest
{
    // A well-formed UTF-8 string containing a multi-unit character.
    string sample = "\u20AC100";
    assert(isValid(sample));
}
1363
/**
 Returns the length of the longest possible substring, starting from
 the first code unit, which is validly encoded.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    size_t validUnits = 0;
    while (s.length != 0)
    {
        // safeDecode advances s past whatever it consumed.
        size_t lengthBefore = s.length;
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            break;

        // Only units belonging to successfully decoded sequences count.
        validUnits += lengthBefore - s.length;
    }
    return validUnits;
}
1384
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original;
 otherwise it constructs a new string by replacing all illegal code unit
 sequences with the encoding's replacement character. Invalid sequences will
 be replaced with the Unicode replacement character (U+FFFD) if the
 character repertoire contains it, otherwise invalid sequences will be
 replaced with '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    // Fast path: nothing to repair, hand back the original string.
    size_t validPrefix = validLength(s);
    if (validPrefix == s.length) return s;

    alias EncoderInstance!(E) Encoder;
    auto replacement = Encoder.replacementSequence;

    // Pass 1: compute an upper bound for the output length, adding one
    // replacement sequence per invalid sequence. Overestimating is harmless
    // because the result is sliced to the written length at the end.
    size_t capacity = s.length;
    for (const(E)[] scan = s[validPrefix .. $];
         scan.length != 0;
         scan = scan[validLength(scan) .. $])
    {
        // Each iteration starts on an invalid sequence; safeDecode removes it.
        dchar bad = Encoder.safeDecode(scan);
        assert(bad == INVALID_SEQUENCE);
        capacity += replacement.length;
    }

    // Pass 2: copy the valid prefix, then alternate between a replacement
    // sequence and the next run of valid code units.
    E[] buffer = new E[capacity];
    buffer[0 .. validPrefix] = s[0 .. validPrefix];
    size_t written = validPrefix;

    const(E)[] rest = s[validPrefix .. $];
    while (rest.length != 0)
    {
        dchar bad = Encoder.safeDecode(rest);
        assert(bad == INVALID_SEQUENCE);

        size_t afterReplacement = written + replacement.length;
        buffer[written .. afterReplacement] = replacement[];
        written = afterReplacement;

        size_t run = validLength(rest);
        buffer[written .. written + run] = rest[0 .. run];
        written += run;
        rest = rest[run .. $];
    }
    return cast(immutable(E)[]) buffer[0 .. written];
}
1439
unittest
{
    // The truncated sequence F0 80 is replaced by U+FFFD (EF BF BD).
    string repaired = sanitize("hello \xF0\x80world");
    assert(repaired == "hello \xEF\xBF\xBDworld");
}
1444
/**
 Returns the length of the first encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    // Decode from a copy so that s itself is left untouched.
    const(E)[] probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
body
{
    // Skip one sequence on a copy and measure how far the copy advanced.
    const(E)[] rest = s;
    EncoderInstance!(E).skip(rest);
    return s.length - rest.length;
}
1469
unittest
{
    // The leading euro sign occupies as many units as the sign on its own.
    size_t expected = "\u20AC".length;
    assert(firstSequence("\u20AC1000") == expected);
}
1474
/**
 Returns the length of the last encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    // Decode one sequence from the end of a copy and measure the shrinkage.
    const(E)[] rest = s;
    EncoderInstance!(E).decodeReverse(rest);
    return s.length - rest.length;
}
1498
unittest
{
    // The trailing euro sign occupies as many units as the sign on its own.
    size_t expected = "\u20AC".length;
    assert(lastSequence("1000\u20AC") == expected);
}
1503
/**
 Returns the array index at which the (n+1)th code point begins.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be counted
    n = the number of code points to skip from the start of s
 */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    // Skip n sequences on a copy; the consumed length is the index.
    const(E)[] rest = s;
    for (size_t skipped = 0; skipped < n; ++skipped)
    {
        EncoderInstance!(E).skip(rest);
    }
    return s.length - rest.length;
}
1531
unittest
{
    // The euro sign takes three UTF-8 units, so code point 1 starts at 3.
    ptrdiff_t position = index("\u20AC100",1);
    assert(position == 3);
}
1536
/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.decode(), however, note that the
 function codePoints() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Validate on a copy so that s is not advanced by the check.
    auto probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
body
{
    // The encoder is selected by the element type of s.
    alias typeof(s[0]) Unit;
    return EncoderInstance!(Unit).decode(s);
}
1566
/**
 Decodes a single code point from the end of a string.

 This function removes one or more code units from the end of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    // Forward to the encoder selected by the code unit type.
    alias EncoderInstance!(E) Encoder;
    return Encoder.decodeReverse(s);
}
1591
/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 This function will accept an invalidly encoded string as input.
 If an invalid sequence is found at the start of the string, this
 function will remove it, and return the value INVALID_SEQUENCE.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // The encoder is selected by the element type of s.
    alias typeof(s[0]) Unit;
    return EncoderInstance!(Unit).safeDecode(s);
}
1616
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Forward to the encoder selected by the code unit type.
    alias EncoderInstance!(E) Encoder;
    return Encoder.encodedLength(c);
}
1640
/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns a string containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Forward to the encoder selected by the code unit type.
    alias EncoderInstance!(E) Encoder;
    return Encoder.encode(c);
}
1671
/**
 Encodes a single code point into an array.

 This function encodes a single code point into one or more code units.
 The code units are stored in a user-supplied array.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c     = the code point to be encoded
    array = the destination array

 Returns:
          the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // The encoder advances the slice it is given; the amount by which
    // the slice shrank is the number of units written.
    E[] remaining = array;
    EncoderInstance!(E).encode(c, remaining);
    size_t written = array.length - remaining.length;
    return written;
}
1709
1710// /**
1711//  * Encodes a single code point into a Buffer.
1712//  *
1713//  * This function encodes a single code point into one or more code units
1714//  * The code units are stored in a growable buffer.
1715//  *
1716//  * The input to this function MUST be a valid code point.
1717//  * This is enforced by the function's in-contract.
1718//  *
1719//  * The type of the output cannot be deduced. Therefore, it is necessary to
1720//  * explicitly specify the encoding as a template parameter.
1721//  *
1722//  * Supercedes:
1723//  * This function supercedes std.utf.encode(), however, note that the
1724//  * function codeUnits() supercedes it more conveniently.
1725//  *
1726//  * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1727//  *
1728//  * Params:
1729//  *    c = the code point to be encoded
1730//  */
1731// deprecated void encode(E)(dchar c, ref Buffer!(E) buffer)
1732// in
1733// {
1734//     assert(isValidCodePoint(c));
1735// }
1736// body
1737// {
1738//     EncoderInstance!(E).encode(c,buffer);
1739// }
1740
/**
 Encodes a single code point to an output range.

 This function encodes $(D c) into one or more code units of type $(D E)
 and writes them, in order, to $(D range).

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 Only the UTF encodings are handled here: $(D E) (ignoring qualifiers) must
 be $(D char), $(D wchar) or $(D dchar); any other type is rejected at
 compile time.

 Standards: Unicode 5.0

 Params:
    c     = the code point to be encoded
    range = the output range which receives the code units

 Returns:
          the number of code units written to the range
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
in
{
    // Same precondition as every other encode overload. Without it a
    // surrogate or a value above 0x10FFFF would be written out as
    // malformed UTF-16 / UTF-32.
    assert(isValidCodePoint(c));
}
body
{
    static if (is(Unqual!E == char))
    {
        if (c <= 0x7F)
        {
            // One unit: 0xxxxxxx
            doPut(range, cast(char) c);
            return 1;
        }
        if (c <= 0x7FF)
        {
            // Two units: 110xxxxx 10xxxxxx
            doPut(range, cast(char)(0xC0 | (c >> 6)));
            doPut(range, cast(char)(0x80 | (c & 0x3F)));
            return 2;
        }
        if (c <= 0xFFFF)
        {
            // Three units: 1110xxxx 10xxxxxx 10xxxxxx
            doPut(range, cast(char)(0xE0 | (c >> 12)));
            doPut(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            doPut(range, cast(char)(0x80 | (c & 0x3F)));
            return 3;
        }
        if (c <= 0x10FFFF)
        {
            // Four units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            doPut(range, cast(char)(0xF0 | (c >> 18)));
            doPut(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
            doPut(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            doPut(range, cast(char)(0x80 | (c & 0x3F)));
            return 4;
        }
        else
        {
            // Excluded by the in-contract; kept as a hard stop for builds
            // in which contracts are compiled out.
            assert(0);
        }
    }
    else static if (is(Unqual!E == wchar))
    {
        if (c <= 0xFFFF)
        {
            doPut(range, cast(wchar) c);
            return 1;
        }
        // Two units: the high and low ten bits of (c - 0x10000), based at
        // 0xD800 and 0xDC00 respectively.
        doPut(range, cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
        doPut(range, cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(Unqual!E == dchar))
    {
        // The code point is its own single code unit.
        doPut(range, c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}
unittest
{
    // 'T' needs exactly one code unit in each of the UTF encodings.
    Appender!(char[]) sink;
    size_t asChar = encode!(char)('T', sink);
    assert(asChar == 1);
    size_t asWchar = encode!(wchar)('T', sink);
    assert(asWchar == 1);
    size_t asDchar = encode!(dchar)('T', sink);
    assert(asDchar == 1);
}
1810
/**
 Encodes a single code point to a delegate.

 This function encodes a single code point into one or more code units.
 The code units are passed one at a time to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c  = the code point to be encoded
    dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Forward to the encoder selected by the code unit type.
    alias EncoderInstance!(E) Encoder;
    Encoder.encode(c, dg);
}
1842
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 You can foreach either with or without an index. If an index is
 specified, it will be initialized at each iteration with the offset into
 the string at which the code point begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be decoded

 Examples:
 --------------------------------------------------------
 string s = "hello world";
 foreach(c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach(c;codePoints(s)) is superior to foreach(c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
body
{
    // Wrap the string in the iterable struct.
    alias CodePoints!(E) Iterable;
    return Iterable(s);
}
1884
unittest
{
    // Iterating an ASCII string by code point reproduces the string.
    string original = "hello";
    string rebuilt;
    foreach (codePoint; codePoints(original))
    {
        rebuilt ~= cast(char)codePoint;
    }
    assert(rebuilt == original);
}
1895
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type in the template parameter.

 Supersedes:
 This function supersedes std.utf.encode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded

 Examples:
 --------------------------------------------------------
 dchar d = '\u20AC';
 foreach(c;codeUnits!(char)(d))
 {
     writefln("%X",c);
 }
 // will print
 // E2
 // 82
 // AC
 --------------------------------------------------------
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Wrap the code point in the iterable struct.
    alias CodeUnits!(E) Iterable;
    return Iterable(c);
}
1936
unittest
{
    // U+20AC is encoded in UTF-8 as the three units E2 82 AC.
    char[] units;
    foreach (unit; codeUnits!(char)(cast(dchar)'\u20AC'))
    {
        units ~= unit;
    }
    assert(units == "\xE2\x82\xAC");
}
1949
/**
 Transcodes the string $(D s) into code units of type $(D Tgt) and writes
 the result to the output range $(D range).

 The source is decoded one code point at a time according to the encoding
 of $(D Src), and each code point is then encoded as $(D Tgt). A source
 character that spans several code units is therefore written as a single
 character in the target encoding.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Params:
    s     = the source string
    range = the output range which receives the code units

 Returns:
          the number of $(D Tgt) code units written to the range
 */

size_t encode(Tgt, Src, R)(in Src[] s, R range)
in
{
    assert(isValid(s));
}
body
{
    size_t result;

    // Decode whole code points. Iterating over s directly would hand every
    // individual code unit of a multi-unit sequence to the encoder as if
    // it were a code point of its own.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        dchar c = EncoderInstance!(Src).decode(rest);
        result += encode!(Tgt)(c, range);
    }
    return result;
}
1964
1965/**
1966 Convert a string from one encoding to another. (See also to!() below).
1967
1968 The input to this function MUST be validly encoded.
1969 This is enforced by the function's in-contract.
1970
1971 Supercedes:
1972 This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
1973 std.utf.toUTF32()
1974 (but note that to!() supercedes it more conveniently).
1975
1976 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1977
1978 Params:
1979    s = the source string
1980    r = the destination string
1981
1982 Examples:
1983 --------------------------------------------------------
1984 wstring ws;
1985 transcode("hello world",ws);
1986     // transcode from UTF-8 to UTF-16
1987
1988 Latin1String ls;
1989 transcode(ws, ls);
1990     // transcode from UTF-16 to ISO-8859-1
1991  --------------------------------------------------------
1992 */
1993void transcode(Src,Dst)(immutable(Src)[] s,out immutable(Dst)[] r)
1994in
1995{
1996    assert(isValid(s));
1997}
1998body
1999{
2000    static if(is(Src==Dst))
2001    {
2002        r = s;
2003    }
2004    else static if(is(Src==AsciiChar))
2005    {
2006        transcode!(char,Dst)(cast(string)s,

Large files files are truncated, but you can click here to view the full file