/src/rt/util/utf.d
http://github.com/AlexeyProkhin/druntime · D · 902 lines · 656 code · 112 blank · 134 comment · 128 complexity · 6a25b0feb3e2e01f96afe3eae6ec22a5 MD5 · raw file
- /********************************************
- * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
- *
- * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
- * wchar type.
- * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
- * the D utf.dchar type.
- *
- * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
- *
- * See_Also:
- * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
- * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
- * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
- * Macros:
- * WIKI = Phobos/StdUtf
- *
- * Copyright: Copyright Digital Mars 2003 - 2009.
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
- * Authors: Walter Bright, Sean Kelly
- */
- /* Copyright Digital Mars 2003 - 2009.
- * Distributed under the Boost Software License, Version 1.0.
- * (See accompanying file LICENSE or copy at
- * http://www.boost.org/LICENSE_1_0.txt)
- */
- module rt.util.utf;
- extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ );
/*******************************
 * Tells whether c is a code point that may appear in a UTF-32 string.
 *
 * A value is accepted when it is no greater than \U0010FFFF and does not
 * fall inside the surrogate block \uD800 .. \uDFFF.
 *
 * \uFFFE and \uFFFF are deliberately accepted: the Unicode standard
 * permits them for application-internal use, even though they are not
 * allowed for interchange (thanks to Arcane Jill).
 *
 * Returns: true if c is valid, false if not.
 */
bool isValidDchar(dchar c)
{
    // Everything below the surrogate block is acceptable.
    if (c < 0xD800)
        return true;

    // Otherwise c must lie above the surrogates and within the last plane.
    return c > 0xDFFF && c <= 0x10FFFF;
}
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ordinary ASCII letter is a valid code point ...
    immutable dchar ascii = cast(dchar)'a';
    assert(isValidDchar(ascii));

    // ... while a value beyond U+10FFFF is not.
    immutable dchar beyondUnicode = cast(dchar)0x1FFFFF;
    assert(!isValidDchar(beyondUnicode));
}
// Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is
// the sequence length in bytes implied by that lead byte, or 0xFF when the
// byte cannot start a sequence.
static immutable UTF8stride =
[
    // 0x00 .. 0x7F: single-byte (ASCII) sequences
    cast(ubyte)
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,

    // 0x80 .. 0xBF: continuation bytes, never the start of a sequence
    0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

    // 0xC0 .. 0xDF: two-byte sequences
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,

    // 0xE0 .. 0xEF: three-byte sequences
    3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,

    // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six;
    // 0xFE and 0xFF: not a lead byte
    4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,0xFF,0xFF,
];
/**
 * stride() returns the length of the UTF-8 sequence starting at index i
 * in string s, as given by the UTF8stride table entry for s[i].
 * Returns:
 *  The number of bytes in the UTF-8 sequence, or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    // The lead byte alone selects the table entry.
    immutable size_t leadByte = s[i];
    return UTF8stride[leadByte];
}
/**
 * stride() returns the length of the UTF-16 sequence starting at index i
 * in string s.
 * Returns:
 *  2 if s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
uint stride(in wchar[] s, size_t i)
{
    immutable uint unit = s[i];
    immutable bool isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;

    return isHighSurrogate ? 2 : 1;
}
/**
 * stride() returns the length of the UTF-32 sequence starting at index i
 * in string s.
 * Returns: Always 1; neither s nor i is inspected.
 */
uint stride(in dchar[] s, size_t i)
{
    enum uint unitsPerSequence = 1;
    return unitsPerSequence;
}
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Params:
 *  s = UTF-8 text
 *  i = byte index into s, expected to fall on a sequence boundary
 *
 * Returns: the number of sequences found in s[0 .. i].
 *
 * onUnicodeError is invoked if a byte that cannot start a sequence is met
 * before i, or if walking whole sequences steps past i (i was not on a
 * sequence boundary).
 */
size_t toUCSindex(in char[] s, size_t i)
{
    size_t count;   // sequences seen so far
    size_t pos;     // byte offset of the next sequence

    while (pos < i)
    {
        immutable uint len = stride(s, pos);

        // stride() yields 0xFF for a byte that is not a valid lead byte.
        // Treating that marker as a length would silently skip up to 255
        // bytes, so report it here, just as toUTFindex does.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);

        pos += len;
        count++;
    }

    // Overshooting i means i pointed into the middle of a sequence.
    if (pos > i)
    {
        onUnicodeError("invalid UTF-8 sequence", pos);
    }
    return count;
}
/** ditto */
// UTF-16 overload: walks s from the start in stride() steps, counting one
// character per step, until offset i is reached. If the walk steps past i,
// onUnicodeError is invoked with the offset that was reached.
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
/** ditto */
// UTF-32 overload: every element is one character, so the character index
// equals the array index and s itself is never read.
size_t toUCSindex(in dchar[] s, size_t i)
{
    immutable size_t characterIndex = i;
    return characterIndex;
}
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * The result is found by stepping over n sequences from the start of s,
 * using the UTF8stride table to size each one. onUnicodeError is invoked
 * when a byte that cannot start a sequence is encountered.
 */
size_t toUTFindex(in char[] s, size_t n)
{
    size_t offset = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable uint len = UTF8stride[s[offset]];

        // 0xFF marks a byte that is not a lead byte.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", offset);

        offset += len;
    }
    return offset;
}
/** ditto */
// UTF-16 overload: steps over n characters, advancing two code units for a
// high surrogate (0xD800 .. 0xDBFF) and one code unit otherwise.
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t offset = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable wchar unit = s[offset];
        immutable bool isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;

        offset += isHighSurrogate ? 2 : 1;
    }
    return offset;
}
/** ditto */
// UTF-32 overload: one element per character, so the UTF index is simply n;
// s is never read.
size_t toUTFindex(in dchar[] s, size_t n)
{
    immutable size_t utfIndex = n;
    return utfIndex;
}
/* =================== Decode ======================= */

/***************
 * Decodes and returns the character starting at s[idx]. idx is advanced past
 * the decoded character. If the character is not well formed,
 * onUnicodeError is invoked with the starting index and idx remains
 * unchanged.
 *
 * Only sequences of one to four bytes are accepted. Overlong encodings,
 * truncated sequences, malformed trailing bytes and results rejected by
 * isValidDchar are all treated as errors.
 */
dchar decode(in char[] s, ref size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    immutable size_t total = s.length;
    immutable size_t start = idx;
    immutable char lead = s[start];
    dchar value;

    // Single-byte (ASCII) case: 0xxxxxxx
    if ((lead & 0x80) == 0)
    {
        idx = start + 1;
        return cast(dchar) lead;
    }

    {
        /* Multi-byte layouts, of which only the first three are accepted:
         *      110xxxxx 10xxxxxx
         *      1110xxxx 10xxxxxx 10xxxxxx
         *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */

        // The number of leading 1 bits in the lead byte is the sequence
        // length; anything longer than four bytes is rejected.
        uint seqLen = 1;
        while (((lead << seqLen) & 0x80) != 0)
        {
            if (++seqLen > 4)
                goto Lerr;
        }

        // 10xxxxxx is a trailing byte and cannot begin a sequence.
        if (seqLen == 1)
            goto Lerr;

        // Keep the (7 - seqLen) payload bits carried by the lead byte.
        value = cast(dchar)(lead & ((1 << (7 - seqLen)) - 1));

        // The whole sequence must fit inside s.
        if (start + (seqLen - 1) >= total)
            goto Lerr;

        /* The following combinations are overlong, and illegal:
         *      1100000x (10xxxxxx)
         *      11100000 100xxxxx (10xxxxxx)
         *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
         *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
         *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
         * The last two tests cannot fire here because lead bytes 0xF8 and
         * 0xFC were already rejected by the length limit above.
         */
        immutable char second = s[start + 1];
        immutable bool overlong =
            (lead & 0xFE) == 0xC0 ||
            (lead == 0xE0 && (second & 0xE0) == 0x80) ||
            (lead == 0xF0 && (second & 0xF0) == 0x80) ||
            (lead == 0xF8 && (second & 0xF8) == 0x80) ||
            (lead == 0xFC && (second & 0xFC) == 0x80);
        if (overlong)
            goto Lerr;

        // Fold in six payload bits from each trailing byte.
        for (uint k = 1; k != seqLen; k++)
        {
            immutable char trail = s[start + k];
            if ((trail & 0xC0) != 0x80)
                goto Lerr;                      // trailing bytes are 10xxxxxx
            value = (value << 6) | (trail & 0x3F);
        }

        if (!isValidDchar(value))
            goto Lerr;

        idx = start + seqLen;
        return value;
    }

  Lerr:
    onUnicodeError("invalid UTF-8 sequence", start);
    return value; // dummy return
}
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Plain ASCII: one byte per character.
    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two-byte sequence.
    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three-byte sequence.
    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed input: each of these must be rejected.
    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (bad; s4)
    {
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(bad, i);
        }
        catch (Throwable o)
        {
            rejected = true;
        }

        // The checks live outside the try block on purpose: an assertion
        // failure raised inside it would itself be caught by the
        // catch (Throwable) above and the test could never fail.
        assert(rejected);
        // A failed decode must leave the index untouched.
        assert(i == 0);
    }
}
/** ditto */
// UTF-16 overload. A high surrogate must be followed by a low surrogate and
// the pair is combined into one code point; a lone low surrogate and the
// values 0xFFFE / 0xFFFF are rejected. On rejection onUnicodeError is
// invoked with the starting index and idx is not modified.
dchar decode(in wchar[] s, ref size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    immutable size_t start = idx;
    uint unit = s[start];
    string problem;

    if (unit >= 0xD800 && unit <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        if (start + 1 == s.length)
        {
            problem = "surrogate UTF-16 high value past end of string";
        }
        else
        {
            immutable uint low = s[start + 1];

            if (low < 0xDC00 || low > 0xDFFF)
            {
                problem = "surrogate UTF-16 low value out of range";
            }
            else
            {
                // 0xD7C0 == 0xD800 - (0x10000 >> 10), so this subtraction
                // both removes the surrogate bias and adds the 0x10000
                // offset of the supplementary planes.
                unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
                idx = start + 2;
                return cast(dchar) unit;
            }
        }
    }
    else if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        problem = "unpaired surrogate UTF-16 value";
    }
    else if (unit == 0xFFFE || unit == 0xFFFF)
    {
        problem = "illegal UTF-16 value";
    }
    else
    {
        // Any other code unit stands for itself.
        idx = start + 1;
        return cast(dchar) unit;
    }

    onUnicodeError(problem, start);
    return cast(dchar) unit; // dummy return
}
/** ditto */
// UTF-32 overload: the element at idx is the character itself; it only has
// to pass isValidDchar. On failure onUnicodeError is invoked with the
// starting index and idx is not modified.
dchar decode(in dchar[] s, ref size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    immutable size_t start = idx;
    immutable dchar unit = s[start];

    if (isValidDchar(unit))
    {
        idx = start + 1;
        return unit;
    }

    onUnicodeError("invalid UTF-32 value", start);
    return unit; // dummy return
}
/* =================== Encode ======================= */

/*******************************
 * Encodes character c as UTF-8 and appends the resulting one to four bytes
 * to array s[].
 *
 * c must satisfy isValidDchar (checked by the in contract).
 */
void encode(ref char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[4] units;      // scratch space for the encoded bytes
    size_t used;        // how many entries of units are filled in

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        units[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        // Beyond the Unicode range; excluded by the in contract.
        assert(0);
    }

    s ~= units[0 .. used];
}
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd".dup;

    // One-byte encoding.
    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // Two-byte encoding.
    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    //assert(text == "abcda\u00A9");    // BUG: fix compiler

    // Three-byte encoding.
    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
}
/** ditto */
// UTF-16 overload: characters up to 0xFFFF are appended as a single code
// unit; anything above is split into a high/low surrogate pair.
void encode(ref wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        s ~= cast(wchar) c;
        return;
    }

    // Offset into the supplementary planes: the upper ten bits go to the
    // high surrogate, the lower ten bits to the low surrogate.
    immutable uint offset = c - 0x10000;
    wchar[2] pair;

    pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
    pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
    s ~= pair;
}
/** ditto */
// UTF-32 overload: the character is its own encoding, so it is appended
// to s unchanged.
void encode(ref dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // Grow by one element and store c in the new last slot.
    immutable size_t oldLength = s.length;

    s.length = oldLength + 1;
    s[oldLength] = c;
}
/**
Returns the number of code units needed to encode $(D c) when $(D C) is the
code unit type. The result counts code units, not bytes.

The encoding is selected by $(D C.sizeof): 1 is treated as UTF-8, 2 as
UTF-16 and 4 as UTF-32; any other size fails to compile. In the UTF-8 case
a $(D c) above 0x10FFFF trips an $(D assert(false)).
 */
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        if (c <= 0xFFFF)
            return 1;
        return 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
/* =================== Validation ======================= */

/***********************************
Checks whether a string is well formed by decoding it from start to end
with $(D decode). $(D S) can be an array of $(D char), $(D wchar), or
$(D dchar). A malformed sequence is reported by $(D decode) itself; nothing
is returned. Use to check all untrusted input for correctness.
 */
void validate(S)(in S s)
{
    immutable total = s.length;
    size_t pos = 0;

    // decode() advances pos past each character it accepts.
    while (pos < total)
        decode(s, pos);
}
/* =================== Conversion to UTF8 ======================= */

/*******************
 * Encodes character c as UTF-8 into the caller-supplied buffer buf.
 *
 * Returns: the slice of buf, one to four bytes long, holding the encoding.
 */
char[] toUTF8(out char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;    // number of bytes written to buf

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        // Beyond the Unicode range; excluded by the in contract.
        assert(0);
    }

    return buf[0 .. used];
}
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * For input that is already UTF-8 there is nothing to convert: the in
 * contract runs validate on s and the very same string is handed back.
 */
string toUTF8(string s)
in
{
    validate(s);
}
body
{
    string unchanged = s;
    return unchanged;
}
/** ditto */
// UTF-16 overload. Leading ASCII code units are copied straight across;
// from the first non-ASCII code unit onwards the remainder of s is decoded
// by the foreach and re-encoded with encode().
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    // Sized for the all-ASCII case, where input and output lengths match.
    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // Keep only what has been copied so far, then append the
            // rest. The loop variable is named d (as in the dchar[]
            // overload) so that it does not shadow the outer c.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}
/** ditto */
// UTF-32 overload. Leading ASCII characters are copied straight across;
// from the first non-ASCII character onwards the remainder of s is
// appended with encode().
string toUTF8(in dchar[] s)
{
    immutable size_t total = s.length;
    char[] result;

    // Sized for the all-ASCII case, where input and output lengths match.
    result.length = total;

    // Fast path: copy the ASCII prefix.
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char) s[pos];
        ++pos;
    }

    // Slow path: drop the unused tail and encode whatever is left.
    if (pos < total)
    {
        result.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(result, d);
        }
    }
    return cast(string) result;
}
/* =================== Conversion to UTF16 ======================= */

/*******************
 * Encodes character c as UTF-16 into the caller-supplied buffer buf.
 *
 * buf is taken by reference. A fixed-size array passed by value would be
 * copied into this function's frame, and the returned slice would then
 * refer to storage that no longer exists once the function returns.
 *
 * Returns: the slice of buf, one or two code units long, holding the
 * encoding (two code units form a surrogate pair).
 */
wchar[] toUTF16(ref wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }
    else
    {
        // Split the offset into the supplementary planes into a high and
        // a low surrogate carrying ten bits each.
        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }
}
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 */
wstring toUTF16(in char[] s)
{
    immutable size_t total = s.length;
    wchar[] result;

    // Length is set to the input size and then back to zero before
    // appending (presumably to reserve storage up front -- TODO confirm).
    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char lead = s[pos];

        if (lead <= 0x7F)
        {
            // ASCII maps to a single identical code unit.
            ++pos;
            result ~= cast(wchar) lead;
        }
        else
        {
            // decode() advances pos past the whole sequence.
            immutable dchar decoded = decode(s, pos);
            encode(result, decoded);
        }
    }
    return cast(wstring) result;
}
alias const(wchar)* wptr;

/** ditto */
// Same conversion as toUTF16(in char[]), but a terminating zero code unit
// is appended and a pointer to the first code unit is returned.
wptr toUTF16z(in char[] s)
{
    immutable size_t total = s.length;
    wchar[] result;

    // Length is set to input size + 1 (room for the terminator) and then
    // back to zero before appending (presumably to reserve storage up
    // front -- TODO confirm).
    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char lead = s[pos];

        if (lead <= 0x7F)
        {
            // ASCII maps to a single identical code unit.
            ++pos;
            result ~= cast(wchar) lead;
        }
        else
        {
            // decode() advances pos past the whole sequence.
            immutable dchar decoded = decode(s, pos);
            encode(result, decoded);
        }
    }

    result ~= '\000';
    return result.ptr;
}
/** ditto */
// Input that is already UTF-16 needs no conversion: the in contract runs
// validate on s and the very same string is handed back.
wstring toUTF16(wstring s)
in
{
    validate(s);
}
body
{
    wstring unchanged = s;
    return unchanged;
}
/** ditto */
// UTF-32 overload: every character of s is appended with encode(), which
// emits one code unit or a surrogate pair.
wstring toUTF16(in dchar[] s)
{
    wchar[] result;

    // Length is set to the input size and then back to zero before
    // appending (presumably to reserve storage up front -- TODO confirm).
    result.length = s.length;
    result.length = 0;

    foreach (dchar unit; s)
    {
        encode(result, unit);
    }
    return cast(wstring) result;
}
/* =================== Conversion to UTF32 ======================= */

/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
dstring toUTF32(in char[] s)
{
    immutable size_t total = s.length;
    dchar[] result;
    size_t produced = 0;    // characters written to result

    // The output can never have more elements than the input.
    result.length = total;

    size_t pos = 0;
    while (pos < total)
    {
        dchar current = s[pos];

        if (current < 0x80)
            ++pos;                      // ASCII, no need for decode
        else
            current = decode(s, pos);   // advances pos itself

        result[produced] = current;
        ++produced;
    }
    return cast(dstring) result[0 .. produced];
}
/** ditto */
// UTF-16 overload: ASCII code units are copied directly, everything else
// goes through decode(), which also combines surrogate pairs.
dstring toUTF32(in wchar[] s)
{
    immutable size_t total = s.length;
    dchar[] result;
    size_t produced = 0;    // characters written to result

    // The output can never have more elements than the input.
    result.length = total;

    size_t pos = 0;
    while (pos < total)
    {
        dchar current = s[pos];

        if (current < 0x80)
            ++pos;                      // ASCII, no need for decode
        else
            current = decode(s, pos);   // advances pos itself

        result[produced] = current;
        ++produced;
    }
    return cast(dstring) result[0 .. produced];
}
/** ditto */
// Input that is already UTF-32 needs no conversion: the in contract runs
// validate on s and the very same string is handed back.
dstring toUTF32(dstring s)
in
{
    validate(s);
}
body
{
    dstring unchanged = s;
    return unchanged;
}
/* ================================ tests ================================== */

unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Converts one text between all three encodings, in the order
    // 8->16, 8->32, 16->8, 16->32, 32->8, 32->16, and checks every result
    // against the expected literal.
    static void roundTrip(string utf8, wstring utf16, dstring utf32)
    {
        auto w = toUTF16(utf8);
        assert(w == utf16);
        auto d = toUTF32(utf8);
        assert(d == utf32);

        auto c = toUTF8(w);
        assert(c == utf8);
        d = toUTF32(w);
        assert(d == utf32);

        c = toUTF8(d);
        assert(c == utf8);
        w = toUTF16(d);
        assert(w == utf16);
    }

    // Pure ASCII.
    roundTrip("hello", "hello"w, "hello"d);

    // A character that needs three UTF-8 bytes but one UTF-16 code unit.
    roundTrip("hel\u1234o", "hel\u1234o"w, "hel\u1234o"d);

    // A character outside the BMP: four UTF-8 bytes, a surrogate pair.
    roundTrip("he\U000BAAAAllo", "he\U000BAAAAllo"w, "he\U000BAAAAllo"d);
}