PageRenderTime 56ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/std/encoding.d

http://github.com/jcd/phobos
D | 2998 lines | 1908 code | 306 blank | 784 comment | 236 complexity | 7d9a2a8af8eb451b31626eb19efa7784 MD5 | raw file
  1. // Written in the D programming language.
  2. /**
  3. Classes and functions for handling and transcoding between various encodings.
  4. For cases where the _encoding is known at compile-time, functions are provided
  5. for arbitrary _encoding and decoding of characters, arbitrary transcoding
  6. between strings of different type, as well as validation and sanitization.
  7. Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
  8. (also known as LATIN-1), and WINDOWS-1252.
  9. $(UL
  10. $(LI The type $(D AsciiChar) represents an ASCII character.)
  11. $(LI The type $(D AsciiString) represents an ASCII string.)
  12. $(LI The type $(D Latin1Char) represents an ISO-8859-1 character.)
  13. $(LI The type $(D Latin1String) represents an ISO-8859-1 string.)
  14. $(LI The type $(D Windows1252Char) represents a Windows-1252 character.)
  15. $(LI The type $(D Windows1252String) represents a Windows-1252 string.))
  16. For cases where the _encoding is not known at compile-time, but is
  17. known at run-time, we provide the abstract class $(D EncodingScheme)
  18. and its subclasses. To construct a run-time encoder/decoder, one does
  19. e.g.
  20. ----------------------------------------------------
  21. auto e = EncodingScheme.create("utf-8");
  22. ----------------------------------------------------
  23. This library supplies $(D EncodingScheme) subclasses for ASCII,
  24. ISO-8859-1 (also known as LATIN-1), WINDOWS-1252, UTF-8, and (on
  25. little-endian architectures) UTF-16LE and UTF-32LE; or (on big-endian
  26. architectures) UTF-16BE and UTF-32BE.
  27. This library provides a mechanism whereby other modules may add $(D
  28. EncodingScheme) subclasses for any other _encoding.
  29. Macros:
  30. WIKI=Phobos/StdEncoding
  31. Copyright: Copyright Janice Caron 2008 - 2009.
  32. License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
  33. Authors: Janice Caron
  34. Source: $(PHOBOSSRC std/_encoding.d)
  35. */
  36. /*
  37. Copyright Janice Caron 2008 - 2009.
  38. Distributed under the Boost Software License, Version 1.0.
  39. (See accompanying file LICENSE_1_0.txt or copy at
  40. http://www.boost.org/LICENSE_1_0.txt)
  41. */
  42. module std.encoding;
  43. import std.string;
  44. import std.traits;
  45. import std.range;
  // Exercises validation, sanitization, transcoding, length counting and
  // in-place encoding against a corpus of well-formed and malformed UTF-8.
  unittest
  {
      // Byte sequences that a conforming UTF-8 validator must accept.
      static ubyte[][] validStrings =
      [
          // Plain ASCII
          cast(ubyte[])"hello",

          // First possible sequence of a certain length
          [ 0x00 ],                       // U+00000000   one byte
          [ 0xC2, 0x80 ],                 // U+00000080   two bytes
          [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
          [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

          // Last possible sequence of a certain length
          [ 0x7F ],                       // U+0000007F   one byte
          [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
          [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

          // Other boundary conditions
          [ 0xED, 0x9F, 0xBF ],
                      // U+0000D7FF   Last character before surrogates
          [ 0xEE, 0x80, 0x80 ],
                      // U+0000E000   First character after surrogates
          [ 0xEF, 0xBF, 0xBD ],
                      // U+0000FFFD   Unicode replacement character
          [ 0xF4, 0x8F, 0xBF, 0xBF ],
                      // U+0010FFFF   Very last character

          // Non-character code points
          /*  NOTE: These are legal in UTF, and may be converted from
              one UTF to another, however they do not represent Unicode
              characters. These code points have been reserved by
              Unicode as non-character code points. They are permissible
              for data exchange within an application, but they are
              not permitted to be used as characters. Since this module
              deals with UTF, and not with Unicode per se, we choose to
              accept them here. */
          // The three-byte forms below are the actual encodings of these
          // code points (DF BE / DF BF would be U+07FE / U+07FF instead).
          [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
          [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
      ];

      // Byte sequences that a conforming UTF-8 validator must reject.
      static ubyte[][] invalidStrings =
      [
          // First possible sequence of a certain length, but greater
          // than U+10FFFF
          [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
          [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

          // Last possible sequence of a certain length, but greater than U+10FFFF
          [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
          [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
          [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

          // Other boundary conditions
          [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                      // First code
                                                      // point after
                                                      // last character

          // Unexpected continuation bytes
          [ 0x80 ],
          [ 0xBF ],
          [ 0x20, 0x80, 0x20 ],
          [ 0x20, 0xBF, 0x20 ],
          [ 0x80, 0x9F, 0xA0 ],

          // Lonely start bytes
          [ 0xC0 ],
          [ 0xCF ],
          [ 0x20, 0xC0, 0x20 ],
          [ 0x20, 0xCF, 0x20 ],
          [ 0xD0 ],
          [ 0xDF ],
          [ 0x20, 0xD0, 0x20 ],
          [ 0x20, 0xDF, 0x20 ],
          [ 0xE0 ],
          [ 0xEF ],
          [ 0x20, 0xE0, 0x20 ],
          [ 0x20, 0xEF, 0x20 ],
          [ 0xF0 ],
          [ 0xF1 ],
          [ 0xF2 ],
          [ 0xF3 ],
          [ 0xF4 ],
          [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
          [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
          [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

          [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
          [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
          [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

          // Impossible bytes
          [ 0xF8 ],
          [ 0xF9 ],
          [ 0xFA ],
          [ 0xFB ],
          [ 0xFC ],
          [ 0xFD ],
          [ 0xFE ],
          [ 0xFF ],
          [ 0x20, 0xF8, 0x20 ],
          [ 0x20, 0xF9, 0x20 ],
          [ 0x20, 0xFA, 0x20 ],
          [ 0x20, 0xFB, 0x20 ],
          [ 0x20, 0xFC, 0x20 ],
          [ 0x20, 0xFD, 0x20 ],
          [ 0x20, 0xFE, 0x20 ],
          [ 0x20, 0xFF, 0x20 ],

          // Overlong sequences, all representing U+002F
          /*  With a safe UTF-8 decoder, all of the following five overlong
              representations of the ASCII character slash ("/") should be
              rejected like a malformed UTF-8 sequence */
          [ 0xC0, 0xAF ],
          [ 0xE0, 0x80, 0xAF ],
          [ 0xF0, 0x80, 0x80, 0xAF ],
          [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
          [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

          // Maximum overlong sequences
          /*  Below you see the highest Unicode value that is still resulting in
              an overlong sequence if represented with the given number of bytes.
              This is a boundary test for safe UTF-8 decoders. All five
              characters should be rejected like malformed UTF-8 sequences. */
          [ 0xC1, 0xBF ],                             // U+0000007F
          [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
          [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
          [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
          [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

          // Overlong representation of the NUL character
          /*  The following five sequences should also be rejected like malformed
              UTF-8 sequences and should not be treated like the ASCII NUL
              character. */
          [ 0xC0, 0x80 ],
          [ 0xE0, 0x80, 0x80 ],
          [ 0xF0, 0x80, 0x80, 0x80 ],
          [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
          [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

          // Illegal code positions
          /*  The following UTF-8 sequences should be rejected like malformed
              sequences, because they never represent valid ISO 10646 characters
              and a UTF-8 decoder that accepts them might introduce security
              problems comparable to overlong UTF-8 sequences. */
          [ 0xED, 0xA0, 0x80 ],       // U+D800
          [ 0xED, 0xAD, 0xBF ],       // U+DB7F
          [ 0xED, 0xAE, 0x80 ],       // U+DB80
          [ 0xED, 0xAF, 0xBF ],       // U+DBFF
          [ 0xED, 0xB0, 0x80 ],       // U+DC00
          [ 0xED, 0xBE, 0x80 ],       // U+DF80
          [ 0xED, 0xBF, 0xBF ],       // U+DFFF
      ];

      // Expected result of sanitize() for each entry of invalidStrings,
      // in the same order.
      static string[] sanitizedStrings =
      [
          "\uFFFD","\uFFFD",
          "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
          " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
          "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
          " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
          "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
          "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
          " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
          " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
          "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
          "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
      ];

      // Make sure everything that should be valid, is
      foreach(a;validStrings)
      {
          string s = cast(string)a;
          assert(isValid(s),"Failed to validate: "~makeReadable(s));
      }

      // Make sure everything that shouldn't be valid, isn't
      foreach(a;invalidStrings)
      {
          string s = cast(string)a;
          assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
      }

      // Make sure we can sanitize everything bad
      assert(invalidStrings.length == sanitizedStrings.length);
      // size_t index: array lengths are unsigned, so avoid a signed counter.
      for(size_t i=0; i<invalidStrings.length; ++i)
      {
          string s = cast(string)invalidStrings[i];
          string t = sanitize(s);
          assert(isValid(t));
          assert(t == sanitizedStrings[i]);
          // Sanitized output is valid by construction, so feed it to the
          // transcoding round-trips below as well.
          ubyte[] u = cast(ubyte[])t;
          validStrings ~= u;
      }

      // Make sure all transcodings work in both directions, using both forward
      // and reverse iteration
      foreach(a; validStrings)
      {
          string s = cast(string)a;
          string s2;
          wstring ws, ws2;
          dstring ds, ds2;

          transcode(s,ws);
          assert(isValid(ws));
          transcode(ws,s2);
          assert(s == s2);

          transcode(s,ds);
          assert(isValid(ds));
          transcode(ds,s2);
          assert(s == s2);

          transcode(ws,s);
          assert(isValid(s));
          transcode(s,ws2);
          assert(ws == ws2);

          transcode(ws,ds);
          assert(isValid(ds));
          transcode(ds,ws2);
          assert(ws == ws2);

          transcode(ds,s);
          assert(isValid(s));
          transcode(s,ds2);
          assert(ds == ds2);

          transcode(ds,ws);
          assert(isValid(ws));
          transcode(ws,ds2);
          assert(ds == ds2);

          transcodeReverse(s,ws);
          assert(isValid(ws));
          transcodeReverse(ws,s2);
          assert(s == s2);

          transcodeReverse(s,ds);
          assert(isValid(ds));
          transcodeReverse(ds,s2);
          assert(s == s2);

          transcodeReverse(ws,s);
          assert(isValid(s));
          transcodeReverse(s,ws2);
          assert(ws == ws2);

          transcodeReverse(ws,ds);
          assert(isValid(ds));
          transcodeReverse(ds,ws2);
          assert(ws == ws2);

          transcodeReverse(ds,s);
          assert(isValid(s));
          transcodeReverse(s,ds2);
          assert(ds == ds2);

          transcodeReverse(ds,ws);
          assert(isValid(ws));
          transcodeReverse(ws,ds2);
          assert(ds == ds2);
      }

      // Make sure the non-UTF encodings work too
      {
          auto s = "\u20AC100";
          Windows1252String t;
          transcode(s,t);
          assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
          string u;
          transcode(s,u);
          assert(s == u);
          Latin1String v;
          transcode(s,v);
          assert(cast(string)v == "?100");
          AsciiString w;
          transcode(v,w);
          assert(cast(string)w == "?100");
      }

      // Make sure we can count properly
      {
          assert(encodedLength!(char)('A') == 1);
          assert(encodedLength!(char)('\u00E3') == 2);
          assert(encodedLength!(char)('\u2028') == 3);
          assert(encodedLength!(char)('\U0010FFF0') == 4);
          assert(encodedLength!(wchar)('A') == 1);
          assert(encodedLength!(wchar)('\U0010FFF0') == 2);
      }

      // Make sure we can write into mutable arrays
      {
          char[4] buffer;
          auto n = encode(cast(dchar)'\u00E3',buffer);
          assert(n == 2);
          assert(buffer[0] == 0xC3);
          assert(buffer[1] == 0xA3);
      }
  }
  313. //=============================================================================
  /**
  Special value returned by $(D safeDecode) in place of a decoded code point.
  Its numeric value is 0xFFFFFFFF.
  */
  enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFF_FFFF;
  /*
  Mixin template supplying the encode/skip/decode entry points of an
  EncoderInstance.

  The template that mixes this in is expected to provide the code-unit type E
  together with the primitives encodeViaWrite, skipViaRead, decodeViaRead,
  safeDecodeViaRead and decodeReverseViaRead. Those primitives reach their
  surroundings only through read, peek, canRead and write; the small templates
  below bind those names to a concrete source or sink.
  */
  template EncoderFunctions()
  {
      // ---- Sources: how code units are read --------------------------------

      // Consume code units from the front of the slice `s`.
      template ReadFromString()
      {
          @property bool canRead()
          {
              return s.length > 0;
          }

          E peek()
          {
              return s[0];
          }

          E read()
          {
              E unit = s[0];
              s = s[1 .. $];
              return unit;
          }
      }

      // Consume code units from the back of the slice `s`.
      template ReverseReadFromString()
      {
          @property bool canRead()
          {
              return s.length > 0;
          }

          E peek()
          {
              return s[$ - 1];
          }

          E read()
          {
              E unit = s[$ - 1];
              s = s[0 .. $ - 1];
              return unit;
          }
      }

      // ---- Sinks: where code units are written -----------------------------

      // Append to a freshly declared array `s`.
      template WriteToString()
      {
          E[] s;

          void write(E c)
          {
              s ~= c;
          }
      }

      // Store into the front of `array`, then advance `array` past it.
      template WriteToArray()
      {
          void write(E c)
          {
              array[0] = c;
              array = array[1 .. $];
          }
      }

      // Hand each code unit to the delegate `dg`.
      template WriteToDelegate()
      {
          void write(E c)
          {
              dg(c);
          }
      }

      // ---- Adapters giving the primitives their exported names -------------

      template EncodeViaWrite()
      {
          mixin encodeViaWrite;

          void encode(dchar c)
          {
              encodeViaWrite(c);
          }
      }

      template SkipViaRead()
      {
          mixin skipViaRead;

          void skip()
          {
              skipViaRead();
          }
      }

      template DecodeViaRead()
      {
          mixin decodeViaRead;

          dchar decode()
          {
              return decodeViaRead();
          }
      }

      template SafeDecodeViaRead()
      {
          mixin safeDecodeViaRead;

          dchar safeDecode()
          {
              return safeDecodeViaRead();
          }
      }

      template DecodeReverseViaRead()
      {
          mixin decodeReverseViaRead;

          dchar decodeReverse()
          {
              return decodeReverseViaRead();
          }
      }

      // ---- Encoder = sink + EncodeViaWrite ---------------------------------

      template EncodeToString()
      {
          mixin WriteToString;
          mixin EncodeViaWrite;
      }

      template EncodeToArray()
      {
          mixin WriteToArray;
          mixin EncodeViaWrite;
      }

      template EncodeToDelegate()
      {
          mixin WriteToDelegate;
          mixin EncodeViaWrite;
      }

      // ---- Decoder = source + the matching *ViaRead adapter ----------------

      template SkipFromString()
      {
          mixin ReadFromString;
          mixin SkipViaRead;
      }

      template DecodeFromString()
      {
          mixin ReadFromString;
          mixin DecodeViaRead;
      }

      template SafeDecodeFromString()
      {
          mixin ReadFromString;
          mixin SafeDecodeViaRead;
      }

      template DecodeReverseFromString()
      {
          mixin ReverseReadFromString;
          mixin DecodeReverseViaRead;
      }

      //=========================================================================

      // Below are the functions we will ultimately expose to the user

      // Encodes c and returns the code units as a new array.
      E[] encode(dchar c)
      {
          mixin EncodeToString sink;
          sink.encode(c);
          return sink.s;
      }

      // Encodes c into the front of array; array is advanced past what was
      // written.
      void encode(dchar c, ref E[] array)
      {
          mixin EncodeToArray sink;
          sink.encode(c);
      }

      // Encodes c, passing every resulting code unit to dg.
      void encode(dchar c, void delegate(E) dg)
      {
          mixin EncodeToDelegate sink;
          sink.encode(c);
      }

      // Drops one encoded character from the front of s.
      void skip(ref const(E)[] s)
      {
          mixin SkipFromString source;
          source.skip();
      }

      // Decodes and removes one character from the front of s.
      dchar decode(S)(ref S s)
      {
          mixin DecodeFromString source;
          return source.decode();
      }

      // Like decode, but goes through the encoding's safeDecodeViaRead.
      dchar safeDecode(S)(ref S s)
      {
          mixin SafeDecodeFromString source;
          return source.safeDecode();
      }

      // Decodes and removes one character from the back of s.
      dchar decodeReverse(ref const(E)[] s)
      {
          mixin DecodeReverseFromString source;
          return source.decodeReverse();
      }
  }
  447. //=========================================================================
  /*
  Foreach adapter that walks an encoded string one code point at a time.

  Iteration consumes the stored slice `s`: every decoded character is removed
  from it, so a CodePoints value can be iterated once. The constructor's
  in-contract requires the input to pass isValid.
  */
  struct CodePoints(E)
  {
      const(E)[] s;

      this(const(E)[] s)
      in
      {
          assert(isValid(s));
      }
      body
      {
          this.s = s;
      }

      // foreach (dchar c; ...)
      int opApply(scope int delegate(ref dchar) dg)
      {
          while (s.length > 0)
          {
              dchar point = decode(s);
              if (int rc = dg(point))
                  return rc;
          }
          return 0;
      }

      // foreach (size_t i, dchar c; ...) -- i is the code-unit offset at
      // which c started.
      int opApply(scope int delegate(ref size_t, ref dchar) dg)
      {
          size_t offset = 0;
          while (s.length > 0)
          {
              immutable size_t before = s.length;
              dchar point = decode(s);
              // The delegate receives a copy so it cannot corrupt our offset.
              size_t shown = offset;
              if (int rc = dg(shown, point))
                  return rc;
              offset += before - s.length;
          }
          return 0;
      }

      // foreach_reverse (dchar c; ...)
      int opApplyReverse(scope int delegate(ref dchar) dg)
      {
          while (s.length > 0)
          {
              dchar point = decodeReverse(s);
              if (int rc = dg(point))
                  return rc;
          }
          return 0;
      }

      // foreach_reverse (size_t i, dchar c; ...) -- i is the number of code
      // units left in front of c, i.e. the offset at which c started.
      int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
      {
          while (s.length > 0)
          {
              dchar point = decodeReverse(s);
              size_t offset = s.length;
              if (int rc = dg(offset, point))
                  return rc;
          }
          return 0;
      }
  }
  /*
  Foreach adapter over the code units that encode a single code point.

  The constructor encodes d with encode!(E) and stores the result in `s`;
  its in-contract requires isValidCodePoint(d).
  */
  struct CodeUnits(E)
  {
      E[] s;

      this(dchar d)
      in
      {
          assert(isValidCodePoint(d));
      }
      body
      {
          s = encode!(E)(d);
      }

      // foreach (E c; ...) -- the delegate is given a copy of each unit.
      int opApply(scope int delegate(ref E) dg)
      {
          auto units = s;
          for (size_t i = 0; i < units.length; ++i)
          {
              E unit = units[i];
              if (int rc = dg(unit))
                  return rc;
          }
          return 0;
      }

      // foreach_reverse (E c; ...) -- same, last unit first.
      int opApplyReverse(scope int delegate(ref E) dg)
      {
          auto units = s;
          for (size_t i = units.length; i-- > 0; )
          {
              E unit = units[i];
              if (int rc = dg(unit))
                  return rc;
          }
          return 0;
      }
  }
  543. //=============================================================================
  // Catch-all: any code-unit type without a dedicated specialization is
  // rejected at compile time, with the offending type named in the message.
  template EncoderInstance(E)
  {
      static assert(0, "Cannot instantiate EncoderInstance for type " ~ E.stringof);
  }
  549. //=============================================================================
  550. // ASCII
  551. //=============================================================================
  552. /** Defines various character sets. */
  553. enum AsciiChar : ubyte { init }
  554. /// Ditto
  555. alias immutable(AsciiChar)[] AsciiString;
  // Encoder for ASCII: one code unit per character, code points 0x00..0x7F.
  // Anything outside that range is written as '?'.
  template EncoderInstance(CharType : AsciiChar)
  {
      alias AsciiChar E;
      alias AsciiString EString;

      @property string encodingName()
      {
          return "ASCII";
      }

      // True when c fits in seven bits.
      bool canEncode(dchar c)
      {
          return c <= 0x7F;
      }

      // True when the code unit has its top bit clear.
      bool isValidCodeUnit(AsciiChar c)
      {
          return c <= 0x7F;
      }

      // Every encodable character occupies exactly one code unit.
      size_t encodedLength(dchar c)
      in
      {
          assert(canEncode(c));
      }
      body
      {
          return 1;
      }

      // Writes c (or '?' if it cannot be encoded) through r.write.
      void encodeX(Range)(dchar c, Range r)
      {
          r.write(cast(AsciiChar) (canEncode(c) ? c : '?'));
      }

      // Writes c (or '?' if it cannot be encoded) through write().
      void encodeViaWrite()(dchar c)
      {
          write(cast(AsciiChar) (canEncode(c) ? c : '?'));
      }

      // Discards one code unit.
      void skipViaRead()()
      {
          read();
      }

      // Returns the next code unit unchanged; no validation.
      dchar decodeViaRead()()
      {
          return read();
      }

      // Returns the next code unit, or INVALID_SEQUENCE if it is >= 0x80.
      dchar safeDecodeViaRead()()
      {
          dchar unit = read();
          if (!canEncode(unit))
              return INVALID_SEQUENCE;
          return unit;
      }

      // Single-unit encoding: reverse decoding is just a read.
      dchar decodeReverseViaRead()()
      {
          return read();
      }

      @property EString replacementSequence()
      {
          return cast(EString)("?");
      }

      mixin EncoderFunctions;
  }
  614. //=============================================================================
  615. // ISO-8859-1
  616. //=============================================================================
  617. /** Defines a Latin1-encoded character. */
  618. enum Latin1Char : ubyte { init }
  619. /**
  620. Defines a Latin1-encoded string (as an array of $(D
  621. immutable(Latin1Char))).
  622. */
  623. alias immutable(Latin1Char)[] Latin1String; ///
  // Encoder for ISO-8859-1: one code unit per character, code points
  // 0x00..0xFF map to the identical byte. Anything larger is written as '?'.
  template EncoderInstance(CharType : Latin1Char)
  {
      alias Latin1Char E;
      alias Latin1String EString;

      @property string encodingName()
      {
          return "ISO-8859-1";
      }

      // True when c fits in eight bits.
      bool canEncode(dchar c)
      {
          return c <= 0xFF;
      }

      // Every byte value is accepted as a code unit.
      bool isValidCodeUnit(Latin1Char c)
      {
          return true;
      }

      // Every encodable character occupies exactly one code unit.
      size_t encodedLength(dchar c)
      in
      {
          assert(canEncode(c));
      }
      body
      {
          return 1;
      }

      // Writes c (or '?' if it cannot be encoded) through write().
      void encodeViaWrite()(dchar c)
      {
          write(cast(Latin1Char) (canEncode(c) ? c : '?'));
      }

      // Discards one code unit.
      void skipViaRead()()
      {
          read();
      }

      // The code unit's value is returned as the code point.
      dchar decodeViaRead()()
      {
          return read();
      }

      // Identical to decodeViaRead: there is nothing to reject.
      dchar safeDecodeViaRead()()
      {
          return read();
      }

      // Single-unit encoding: reverse decoding is just a read.
      dchar decodeReverseViaRead()()
      {
          return read();
      }

      @property EString replacementSequence()
      {
          return cast(EString)("?");
      }

      mixin EncoderFunctions;
  }
  676. //=============================================================================
  677. // WINDOWS-1252
  678. //=============================================================================
  679. /** Defines a Windows1252-encoded character. */
  680. enum Windows1252Char : ubyte { init }
  681. /**
  682. Defines a Windows1252-encoded string (as an array of $(D
  683. immutable(Windows1252Char))).
  684. */
  685. alias immutable(Windows1252Char)[] Windows1252String; ///
  // Encoder for WINDOWS-1252: one code unit per character. Bytes 0x00..0x7F
  // and 0xA0..0xFF map to the identical code point; bytes 0x80..0x9F are
  // translated through charMap. Unencodable characters are written as '?'.
  template EncoderInstance(CharType : Windows1252Char)
  {
      alias Windows1252Char E;
      alias Windows1252String EString;

      @property string encodingName()
      {
          return "windows-1252";
      }

      // Code points for bytes 0x80..0x9F, indexed by (byte - 0x80).
      // U+FFFD marks the five byte values that Windows-1252 leaves undefined
      // (0x81, 0x8D, 0x8F, 0x90, 0x9D).
      // Byte 0x96 is U+2013 EN DASH (it previously read U+2103, a digit
      // transposition that made the en dash unencodable).
      immutable wstring charMap =
          "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"w ~   // 0x80..0x87
          "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"w ~   // 0x88..0x8F
          "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"w ~   // 0x90..0x97
          "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178"w     // 0x98..0x9F
      ;

      // True when c has a Windows-1252 byte: either in the identity ranges
      // or listed in charMap.
      bool canEncode(dchar c)
      {
          if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return true;
          // Excludes U+FFFD itself, which charMap uses as its "undefined"
          // marker and must therefore never match below.
          if (c >= 0xFFFD) return false;
          foreach(wchar d;charMap) { if (c == d) return true; }
          return false;
      }

      // True unless c is one of the undefined bytes in 0x80..0x9F.
      bool isValidCodeUnit(Windows1252Char c)
      {
          if (c < 0x80 || c >= 0xA0) return true;
          return (charMap[c-0x80] != 0xFFFD);
      }

      // Every encodable character occupies exactly one code unit.
      size_t encodedLength(dchar c)
      in
      {
          assert(canEncode(c));
      }
      body
      {
          return 1;
      }

      // Writes the byte for c through write(), or '?' if c has none.
      void encodeViaWrite()(dchar c)
      {
          if (c < 0x80 || (c >= 0xA0 && c < 0x100)) {}
          else if (c >= 0xFFFD) { c = '?'; }
          else
          {
              // Reverse lookup: the position in charMap gives the byte.
              ptrdiff_t n = -1;
              foreach (i, wchar d; charMap)
              {
                  if (c == d)
                  {
                      n = i;
                      break;
                  }
              }
              c = n == -1 ? '?' : 0x80 + cast(dchar) n;
          }
          write(cast(Windows1252Char)c);
      }

      // Discards one code unit.
      void skipViaRead()()
      {
          read();
      }

      // Decodes one byte without validation; an undefined byte yields U+FFFD.
      dchar decodeViaRead()()
      {
          Windows1252Char c = read();
          return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
      }

      // Decodes one byte; an undefined byte yields INVALID_SEQUENCE.
      dchar safeDecodeViaRead()()
      {
          Windows1252Char c = read();
          dchar d = (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
          return d == 0xFFFD ? INVALID_SEQUENCE : d;
      }

      // Single-unit encoding: same translation as decodeViaRead.
      dchar decodeReverseViaRead()()
      {
          Windows1252Char c = read();
          return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
      }

      @property EString replacementSequence()
      {
          return cast(EString)("?");
      }

      mixin EncoderFunctions;
  }
  766. //=============================================================================
  767. // UTF-8
  768. //=============================================================================
  // Encoder for UTF-8 (code unit type char, one to four units per character).
  template EncoderInstance(CharType : char)
  {
      alias char E;
      alias immutable(char)[] EString;

      @property string encodingName()
      {
          return "UTF-8";
      }

      // Any valid code point can be represented.
      bool canEncode(dchar c)
      {
          return isValidCodePoint(c);
      }

      // Rejects the bytes 0xC0, 0xC1 and 0xF5..0xFF.
      bool isValidCodeUnit(char c)
      {
          if (c < 0xC0) return true;
          return c >= 0xC2 && c < 0xF5;
      }

      // Indexed by (byte - 0x80): how many continuation bytes a lead byte
      // announces. 0x80..0xBF (continuation bytes) and 0xFF give 0.
      immutable ubyte[128] tailTable =
      [
          0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0x80..0x8F
          0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0x90..0x9F
          0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0xA0..0xAF
          0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0xB0..0xBF
          1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0xC0..0xCF
          1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0xD0..0xDF
          2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    // 0xE0..0xEF
          3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,    // 0xF0..0xFF
      ];

      // Looks c up in tailTable; c must not be an ASCII byte.
      private int tails(char c)
      in
      {
          assert(c >= 0x80);
      }
      body
      {
          return tailTable[c-0x80];
      }

      // Number of code units encodeViaWrite emits for c.
      size_t encodedLength(dchar c)
      in
      {
          assert(canEncode(c));
      }
      body
      {
          if (c >= 0x10000) return 4;
          if (c >= 0x800) return 3;
          if (c >= 0x80) return 2;
          return 1;
      }

      // Emits the 1..4 byte encoding of c through write(). c is not checked.
      void encodeViaWrite()(dchar c)
      {
          if (c < 0x80)
          {
              write(cast(char)c);
              return;
          }

          // Every multi-byte form ends with the low six payload bits.
          immutable last = cast(char)((c & 0x3F) + 0x80);

          if (c < 0x800)
          {
              write(cast(char)((c >> 6) + 0xC0));
              write(last);
              return;
          }

          if (c < 0x10000)
          {
              write(cast(char)((c >> 12) + 0xE0));
              write(cast(char)(((c >> 6) & 0x3F) + 0x80));
              write(last);
              return;
          }

          write(cast(char)((c >> 18) + 0xF0));
          write(cast(char)(((c >> 12) & 0x3F) + 0x80));
          write(cast(char)(((c >> 6) & 0x3F) + 0x80));
          write(last);
      }

      // Discards one character: the lead byte plus the continuation bytes
      // it announces. Bytes below 0xC0 are discarded on their own.
      void skipViaRead()()
      {
          auto lead = read();
          if (lead < 0xC0) return;
          for (int remaining = tails(cast(char) lead); remaining > 0; --remaining)
          {
              read();
          }
      }

      // Decodes one character without validating it.
      dchar decodeViaRead()()
      {
          dchar point = read();
          if (point < 0xC0) return point;

          int extra = tails(cast(char) point);
          // Keep only the payload bits of the lead byte.
          point &= (1 << (6 - extra)) - 1;
          for (int remaining = extra; remaining > 0; --remaining)
          {
              point = (point << 6) + (read() & 0x3F);
          }
          return point;
      }

      // Decodes one character, returning INVALID_SEQUENCE for malformed
      // input. A byte that fails the continuation-byte check is left unread.
      dchar safeDecodeViaRead()()
      {
          dchar point = read();
          if (point < 0x80) return point;

          int extra = tails(cast(char) point);
          if (extra == 0) return INVALID_SEQUENCE;
          if (!canRead) return INVALID_SEQUENCE;

          // These checks need only the lead byte and the byte after it.
          // The verdict is applied after the whole sequence is consumed.
          size_t next = peek();
          bool rejected =
          (
              (point < 0xC2)                                  // fail overlong 2-byte sequences
          ||  (point > 0xF4)                                  // fail overlong 4-6-byte sequences
          ||  (point == 0xE0 && ((next & 0xE0) == 0x80))      // fail overlong 3-byte sequences
          ||  (point == 0xED && ((next & 0xE0) == 0xA0))      // fail surrogates
          ||  (point == 0xF0 && ((next & 0xF0) == 0x80))      // fail overlong 4-byte sequences
          ||  (point == 0xF4 && ((next & 0xF0) >= 0x90))      // fail code points > 0x10FFFF
          );

          point &= (1 << (6 - extra)) - 1;
          for (int remaining = extra; remaining > 0; --remaining)
          {
              if (!canRead) return INVALID_SEQUENCE;
              next = peek();
              if ((next & 0xC0) != 0x80) return INVALID_SEQUENCE;
              point = (point << 6) + (read() & 0x3F);
          }

          if (rejected) return INVALID_SEQUENCE;
          return point;
      }

      // Decodes one character reading backwards: continuation bytes arrive
      // first (low bits first) and the lead byte ends the character.
      dchar decodeReverseViaRead()()
      {
          dchar point = read();
          if (point < 0x80) return point;

          size_t shift = 0;
          point &= 0x3F;
          for (size_t step = 0; step < 4; ++step)
          {
              shift += 6;
              auto unit = read();
              size_t extra = tails(cast(char) unit);
              // A continuation byte carries six payload bits, a lead byte
              // carries (6 - extra).
              size_t mask = extra == 0 ? 0x3F : (1 << (6 - extra)) - 1;
              point += ((unit & mask) << shift);
              if (extra != 0) break;
          }
          return point;
      }

      @property EString replacementSequence()
      {
          return "\uFFFD";
      }

      mixin EncoderFunctions;
  }
  914. //=============================================================================
  915. // UTF-16
  916. //=============================================================================
  // Encoder for UTF-16 (code unit type wchar, one or two units per character).
  template EncoderInstance(CharType : wchar)
  {
      alias wchar E;
      alias immutable(wchar)[] EString;

      @property string encodingName()
      {
          return "UTF-16";
      }

      // Any valid code point can be represented.
      bool canEncode(dchar c)
      {
          return isValidCodePoint(c);
      }

      // No single code unit is rejected on its own.
      bool isValidCodeUnit(wchar c)
      {
          return true;
      }

      // One unit below U+10000, otherwise a surrogate pair.
      size_t encodedLength(dchar c)
      in
      {
          assert(canEncode(c));
      }
      body
      {
          if (c < 0x10000) return 1;
          return 2;
      }

      // Emits c as one unit, or as a high/low surrogate pair, via write().
      void encodeViaWrite()(dchar c)
      {
          if (c < 0x10000)
          {
              write(cast(wchar)c);
              return;
          }

          size_t offset = c - 0x10000;
          write(cast(wchar)(0xD800 + (offset >> 10)));
          write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
      }

      // Discards one character: two units if the first is a surrogate.
      void skipViaRead()()
      {
          wchar first = read();
          if (first >= 0xD800 && first < 0xE000)
          {
              read();
          }
      }

      // Decodes one character without validating surrogate pairing.
      dchar decodeViaRead()()
      {
          wchar c = read();
          if (!(c >= 0xD800 && c < 0xE000)) return cast(dchar)c;

          wchar d = read();
          c &= 0x3FF;
          d &= 0x3FF;
          return 0x10000 + (c << 10) + d;
      }

      // Decodes one character; returns INVALID_SEQUENCE for a leading low
      // surrogate, or for a high surrogate not followed by a low surrogate
      // (the offending following unit is left unread).
      dchar safeDecodeViaRead()()
      {
          wchar c = read();
          if (!(c >= 0xD800 && c < 0xE000)) return cast(dchar)c;

          if (c >= 0xDC00) return INVALID_SEQUENCE;
          if (!canRead) return INVALID_SEQUENCE;

          wchar d = peek();
          if (!(d >= 0xDC00 && d < 0xE000)) return INVALID_SEQUENCE;

          d = read();
          c &= 0x3FF;
          d &= 0x3FF;
          return 0x10000 + (c << 10) + d;
      }

      // Decodes one character reading backwards: the low surrogate arrives
      // first, then the high surrogate.
      dchar decodeReverseViaRead()()
      {
          wchar c = read();
          if (!(c >= 0xD800 && c < 0xE000)) return cast(dchar)c;

          wchar d = read();
          c &= 0x3FF;
          d &= 0x3FF;
          return 0x10000 + (d << 10) + c;
      }

      @property EString replacementSequence()
      {
          return "\uFFFD"w;
      }

      mixin EncoderFunctions;
  }
  998. //=============================================================================
  999. // UTF-32
  1000. //=============================================================================
  // Encoder for UTF-32 (code unit type dchar, one unit per character).
  template EncoderInstance(CharType : dchar)
  {
      alias dchar E;
      alias immutable(dchar)[] EString;

      @property string encodingName()
      {
          return "UTF-32";
      }

      // Any valid code point can be represented.
      bool canEncode(dchar c)
      {
          return isValidCodePoint(c);
      }

      // A code unit is valid exactly when it is a valid code point.
      bool isValidCodeUnit(dchar c)
      {
          return isValidCodePoint(c);
      }

      // Every encodable character occupies exactly one code unit.
      size_t encodedLength(dchar c)
      in
      {
          assert(canEncode(c));
      }
      body
      {
          return 1;
      }

      // The code point is its own encoding.
      void encodeViaWrite()(dchar c)
      {
          write(c);
      }

      // Discards one code unit.
      void skipViaRead()()
      {
          read();
      }

      // Returns the next code unit unchanged; no validation.
      dchar decodeViaRead()()
      {
          return cast(dchar)read();
      }

      // Returns the next code unit, or INVALID_SEQUENCE if it is not a
      // valid code point.
      dchar safeDecodeViaRead()()
      {
          dchar unit = read();
          if (!isValidCodePoint(unit))
              return INVALID_SEQUENCE;
          return unit;
      }

      // Single-unit encoding: reverse decoding is just a read.
      dchar decodeReverseViaRead()()
      {
          return cast(dchar)read();
      }

      @property EString replacementSequence()
      {
          return "\uFFFD"d;
      }

      mixin EncoderFunctions;
  }
  1053. //=============================================================================
  1054. // Below are forwarding functions which expose the function to the user
  /**
   Returns true if c is a valid code point, i.e. it is below 0x110000 and is
   not in the surrogate range 0xD800 .. 0xDFFF.

   Note that this includes the non-character code points U+FFFE and U+FFFF,
   since these are valid code points (even though they are not valid
   characters).

   Supersedes:
   This function supersedes $(D std.utf.startsValidDchar()).

   Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

   Params:
      c = the code point to be tested
   */
  bool isValidCodePoint(dchar c)
  {
      // Nothing at or beyond 0x110000 is accepted.
      if (c >= 0x110000) return false;
      // Within range, only the surrogate block is excluded.
      return !(c >= 0xD800 && c < 0xE000);
  }
  1070. /**
  1071. Returns the name of an encoding.
  1072. The type of encoding cannot be deduced. Therefore, it is necessary to
  1073. explicitly specify the encoding type.
  1074. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1075. Examples:
  1076. -----------------------------------
  1077. assert(encodingName!(Latin1Char) == "ISO-8859-1");
  1078. -----------------------------------
  1079. */
  1080. @property string encodingName(T)()
  1081. {
  1082. return EncoderInstance!(T).encodingName;
  1083. }
  1084. unittest
  1085. {
  1086. assert(encodingName!(char) == "UTF-8");
  1087. assert(encodingName!(wchar) == "UTF-16");
  1088. assert(encodingName!(dchar) == "UTF-32");
  1089. assert(encodingName!(AsciiChar) == "ASCII");
  1090. assert(encodingName!(Latin1Char) == "ISO-8859-1");
  1091. assert(encodingName!(Windows1252Char) == "windows-1252");
  1092. }
  1093. /**
  1094. Returns true iff it is possible to represent the specifed codepoint
  1095. in the encoding.
  1096. The type of encoding cannot be deduced. Therefore, it is necessary to
  1097. explicitly specify the encoding type.
  1098. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1099. Examples:
  1100. -----------------------------------
  1101. assert(canEncode!(Latin1Char)('A'));
  1102. -----------------------------------
  1103. */
  1104. bool canEncode(E)(dchar c)
  1105. {
  1106. return EncoderInstance!(E).canEncode(c);
  1107. }
  1108. unittest
  1109. {
  1110. assert(!canEncode!(AsciiChar)('\u00A0'));
  1111. assert(canEncode!(Latin1Char)('\u00A0'));
  1112. assert(canEncode!(Windows1252Char)('\u20AC'));
  1113. assert(!canEncode!(Windows1252Char)('\u20AD'));
  1114. assert(!canEncode!(Windows1252Char)('\uFFFD'));
  1115. assert(!canEncode!(char)(cast(dchar)0x110000));
  1116. }
  1117. /**
  1118. Returns true if the code unit is legal. For example, the byte 0x80 would
  1119. not be legal in ASCII, because ASCII code units must always be in the range
  1120. 0x00 to 0x7F.
  1121. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1122. Params:
  1123. c = the code unit to be tested
  1124. */
  1125. bool isValidCodeUnit(E)(E c)
  1126. {
  1127. return EncoderInstance!(E).isValidCodeUnit(c);
  1128. }
  1129. unittest
  1130. {
  1131. assert(!isValidCodeUnit(cast(AsciiChar)0xA0));
  1132. assert( isValidCodeUnit(cast(Windows1252Char)0x80));
  1133. assert(!isValidCodeUnit(cast(Windows1252Char)0x81));
  1134. assert(!isValidCodeUnit(cast(char)0xC0));
  1135. assert(!isValidCodeUnit(cast(char)0xFF));
  1136. assert( isValidCodeUnit(cast(wchar)0xD800));
  1137. assert(!isValidCodeUnit(cast(dchar)0xD800));
  1138. }
  1139. /**
  1140. Returns true if the string is encoded correctly
  1141. Supercedes:
  1142. This function supercedes std.utf.validate(), however note that this
  1143. function returns a bool indicating whether the input was valid or not,
  1144. wheras the older funtion would throw an exception.
  1145. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1146. Params:
  1147. s = the string to be tested
  1148. */
  1149. bool isValid(E)(const(E)[] s)
  1150. {
  1151. return s.length == validLength(s);
  1152. }
  1153. unittest
  1154. {
  1155. assert(isValid("\u20AC100"));
  1156. }
  1157. /**
  1158. Returns the length of the longest possible substring, starting from
  1159. the first code unit, which is validly encoded.
  1160. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1161. Params:
  1162. s = the string to be tested
  1163. */
  1164. size_t validLength(E)(const(E)[] s)
  1165. {
  1166. size_t result, before = void;
  1167. while ((before = s.length) > 0)
  1168. {
  1169. if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
  1170. break;
  1171. result += before - s.length;
  1172. }
  1173. return result;
  1174. }
  1175. /**
  1176. Sanitizes a string by replacing malformed code unit sequences with valid
  1177. code unit sequences. The result is guaranteed to be valid for this encoding.
  1178. If the input string is already valid, this function returns the original,
  1179. otherwise it constructs a new string by replacing all illegal code unit
  1180. sequences with the encoding's replacement character, Invalid sequences will
  1181. be replaced with the Unicode replacement character (U+FFFD) if the
  1182. character repertoire contains it, otherwise invalid sequences will be
  1183. replaced with '?'.
  1184. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1185. Params:
  1186. s = the string to be sanitized
  1187. */
  1188. immutable(E)[] sanitize(E)(immutable(E)[] s)
  1189. {
  1190. size_t n = validLength(s);
  1191. if (n == s.length) return s;
  1192. auto repSeq = EncoderInstance!(E).replacementSequence;
  1193. // Count how long the string needs to be.
  1194. // Overestimating is not a problem
  1195. size_t len = s.length;
  1196. const(E)[] t = s[n..$];
  1197. while (t.length != 0)
  1198. {
  1199. dchar c = EncoderInstance!(E).safeDecode(t);
  1200. assert(c == INVALID_SEQUENCE);
  1201. len += repSeq.length;
  1202. t = t[validLength(t)..$];
  1203. }
  1204. // Now do the write
  1205. E[] array = new E[len];
  1206. array[0..n] = s[0..n];
  1207. size_t offset = n;
  1208. t = s[n..$];
  1209. while (t.length != 0)
  1210. {
  1211. dchar c = EncoderInstance!(E).safeDecode(t);
  1212. assert(c == INVALID_SEQUENCE);
  1213. array[offset..offset+repSeq.length] = repSeq[];
  1214. offset += repSeq.length;
  1215. n = validLength(t);
  1216. array[offset..offset+n] = t[0..n];
  1217. offset += n;
  1218. t = t[n..$];
  1219. }
  1220. return cast(immutable(E)[])array[0..offset];
  1221. }
  1222. unittest
  1223. {
  1224. assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
  1225. }
  1226. /**
  1227. Returns the length of the first encoded sequence.
  1228. The input to this function MUST be validly encoded.
  1229. This is enforced by the function's in-contract.
  1230. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1231. Params:
  1232. s = the string to be sliced
  1233. */
  1234. size_t firstSequence(E)(const(E)[] s)
  1235. in
  1236. {
  1237. assert(s.length != 0);
  1238. const(E)[] u = s;
  1239. assert(safeDecode(u) != INVALID_SEQUENCE);
  1240. }
  1241. body
  1242. {
  1243. auto before = s.length;
  1244. EncoderInstance!(E).skip(s);
  1245. return before - s.length;
  1246. }
  1247. unittest
  1248. {
  1249. assert(firstSequence("\u20AC1000") == "\u20AC".length);
  1250. }
  1251. /**
  1252. Returns the length the last encoded sequence.
  1253. The input to this function MUST be validly encoded.
  1254. This is enforced by the function's in-contract.
  1255. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1256. Params:
  1257. s = the string to be sliced
  1258. */
  1259. size_t lastSequence(E)(const(E)[] s)
  1260. in
  1261. {
  1262. assert(s.length != 0);
  1263. assert(isValid(s));
  1264. }
  1265. body
  1266. {
  1267. const(E)[] t = s;
  1268. EncoderInstance!(E).decodeReverse(s);
  1269. return t.length - s.length;
  1270. }
  1271. unittest
  1272. {
  1273. assert(lastSequence("1000\u20AC") == "\u20AC".length);
  1274. }
  1275. /**
  1276. Returns the array index at which the (n+1)th code point begins.
  1277. The input to this function MUST be validly encoded.
  1278. This is enforced by the function's in-contract.
  1279. Supercedes:
  1280. This function supercedes std.utf.toUTFindex().
  1281. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1282. Params:
  1283. s = the string to be counted
  1284. n = the current code point index
  1285. */
  1286. ptrdiff_t index(E)(const(E)[] s,int n)
  1287. in
  1288. {
  1289. assert(isValid(s));
  1290. assert(n >= 0);
  1291. }
  1292. body
  1293. {
  1294. const(E)[] t = s;
  1295. for (size_t i=0; i<n; ++i) EncoderInstance!(E).skip(s);
  1296. return t.length - s.length;
  1297. }
  1298. unittest
  1299. {
  1300. assert(index("\u20AC100",1) == 3);
  1301. }
  1302. /**
  1303. Decodes a single code point.
  1304. This function removes one or more code units from the start of a string,
  1305. and returns the decoded code point which those code units represent.
  1306. The input to this function MUST be validly encoded.
  1307. This is enforced by the function's in-contract.
  1308. Supercedes:
  1309. This function supercedes std.utf.decode(), however, note that the
  1310. function codePoints() supercedes it more conveniently.
  1311. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1312. Params:
  1313. s = the string whose first code point is to be decoded
  1314. */
  1315. dchar decode(S)(ref S s)
  1316. in
  1317. {
  1318. assert(s.length != 0);
  1319. auto u = s;
  1320. assert(safeDecode(u) != INVALID_SEQUENCE);
  1321. }
  1322. body
  1323. {
  1324. return EncoderInstance!(typeof(s[0])).decode(s);
  1325. }
  1326. /**
  1327. Decodes a single code point from the end of a string.
  1328. This function removes one or more code units from the end of a string,
  1329. and returns the decoded code point which those code units represent.
  1330. The input to this function MUST be validly encoded.
  1331. This is enforced by the function's in-contract.
  1332. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1333. Params:
  1334. s = the string whose first code point is to be decoded
  1335. */
  1336. dchar decodeReverse(E)(ref const(E)[] s)
  1337. in
  1338. {
  1339. assert(s.length != 0);
  1340. assert(isValid(s));
  1341. }
  1342. body
  1343. {
  1344. return EncoderInstance!(E).decodeReverse(s);
  1345. }
  1346. /**
  1347. Decodes a single code point. The input does not have to be valid.
  1348. This function removes one or more code units from the start of a string,
  1349. and returns the decoded code point which those code units represent.
  1350. This function will accept an invalidly encoded string as input.
  1351. If an invalid sequence is found at the start of the string, this
  1352. function will remove it, and return the value INVALID_SEQUENCE.
  1353. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1354. Params:
  1355. s = the string whose first code point is to be decoded
  1356. */
  1357. dchar safeDecode(S)(ref S s)
  1358. in
  1359. {
  1360. assert(s.length != 0);
  1361. }
  1362. body
  1363. {
  1364. return EncoderInstance!(typeof(s[0])).safeDecode(s);
  1365. }
  1366. /**
  1367. Returns the number of code units required to encode a single code point.
  1368. The input to this function MUST be a valid code point.
  1369. This is enforced by the function's in-contract.
  1370. The type of the output cannot be deduced. Therefore, it is necessary to
  1371. explicitly specify the encoding as a template parameter.
  1372. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1373. Params:
  1374. c = the code point to be encoded
  1375. */
  1376. size_t encodedLength(E)(dchar c)
  1377. in
  1378. {
  1379. assert(isValidCodePoint(c));
  1380. }
  1381. body
  1382. {
  1383. return EncoderInstance!(E).encodedLength(c);
  1384. }
  1385. /**
  1386. Encodes a single code point.
  1387. This function encodes a single code point into one or more code units.
  1388. It returns a string containing those code units.
  1389. The input to this function MUST be a valid code point.
  1390. This is enforced by the function's in-contract.
  1391. The type of the output cannot be deduced. Therefore, it is necessary to
  1392. explicitly specify the encoding as a template parameter.
  1393. Supercedes:
  1394. This function supercedes std.utf.encode(), however, note that the
  1395. function codeUnits() supercedes it more conveniently.
  1396. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1397. Params:
  1398. c = the code point to be encoded
  1399. */
  1400. E[] encode(E)(dchar c)
  1401. in
  1402. {
  1403. assert(isValidCodePoint(c));
  1404. }
  1405. body
  1406. {
  1407. return EncoderInstance!(E).encode(c);
  1408. }
  1409. /**
  1410. Encodes a single code point into an array.
  1411. This function encodes a single code point into one or more code units
  1412. The code units are stored in a user-supplied fixed-size array,
  1413. which must be passed by reference.
  1414. The input to this function MUST be a valid code point.
  1415. This is enforced by the function's in-contract.
  1416. The type of the output cannot be deduced. Therefore, it is necessary to
  1417. explicitly specify the encoding as a template parameter.
  1418. Supercedes:
  1419. This function supercedes std.utf.encode(), however, note that the
  1420. function codeUnits() supercedes it more conveniently.
  1421. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1422. Params:
  1423. c = the code point to be encoded
  1424. array = the destination array
  1425. Returns:
  1426. the number of code units written to the array
  1427. */
  1428. size_t encode(E)(dchar c, E[] array)
  1429. in
  1430. {
  1431. assert(isValidCodePoint(c));
  1432. }
  1433. body
  1434. {
  1435. E[] t = array;
  1436. EncoderInstance!(E).encode(c,t);
  1437. return array.length - t.length;
  1438. }
  1439. // /**
  1440. // * Encodes a single code point into a Buffer.
  1441. // *
  1442. // * This function encodes a single code point into one or more code units
  1443. // * The code units are stored in a growable buffer.
  1444. // *
  1445. // * The input to this function MUST be a valid code point.
  1446. // * This is enforced by the function's in-contract.
  1447. // *
  1448. // * The type of the output cannot be deduced. Therefore, it is necessary to
  1449. // * explicitly specify the encoding as a template parameter.
  1450. // *
  1451. // * Supercedes:
  1452. // * This function supercedes std.utf.encode(), however, note that the
  1453. // * function codeUnits() supercedes it more conveniently.
  1454. // *
  1455. // * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1456. // *
  1457. // * Params:
  1458. // * c = the code point to be encoded
  1459. // */
  1460. // deprecated void encode(E)(dchar c, ref Buffer!(E) buffer)
  1461. // in
  1462. // {
  1463. // assert(isValidCodePoint(c));
  1464. // }
  1465. // body
  1466. // {
  1467. // EncoderInstance!(E).encode(c,buffer);
  1468. // }
  1469. /*
  1470. Encodes $(D c) in units of type $(D E) and writes the result to the
  1471. output range $(D R). Returns the number of $(D E)s written.
  1472. */
  1473. size_t encode(E, R)(dchar c, auto ref R range)
  1474. if (isNativeOutputRange!(R, E))
  1475. {
  1476. static if (is(Unqual!E == char))
  1477. {
  1478. if (c <= 0x7F)
  1479. {
  1480. doPut(range, cast(char) c);
  1481. return 1;
  1482. }
  1483. if (c <= 0x7FF)
  1484. {
  1485. doPut(range, cast(char)(0xC0 | (c >> 6)));
  1486. doPut(range, cast(char)(0x80 | (c & 0x3F)));
  1487. return 2;
  1488. }
  1489. if (c <= 0xFFFF)
  1490. {
  1491. doPut(range, cast(char)(0xE0 | (c >> 12)));
  1492. doPut(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
  1493. doPut(range, cast(char)(0x80 | (c & 0x3F)));
  1494. return 3;
  1495. }
  1496. if (c <= 0x10FFFF)
  1497. {
  1498. doPut(range, cast(char)(0xF0 | (c >> 18)));
  1499. doPut(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
  1500. doPut(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
  1501. doPut(range, cast(char)(0x80 | (c & 0x3F)));
  1502. return 4;
  1503. }
  1504. else
  1505. {
  1506. assert(0);
  1507. }
  1508. }
  1509. else static if (is(Unqual!E == wchar))
  1510. {
  1511. if (c <= 0xFFFF)
  1512. {
  1513. range.doPut(cast(wchar) c);
  1514. return 1;
  1515. }
  1516. range.doPut(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
  1517. range.doPut(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
  1518. return 2;
  1519. }
  1520. else static if (is(Unqual!E == dchar))
  1521. {
  1522. range.doPut(c);
  1523. return 1;
  1524. }
  1525. else
  1526. {
  1527. static assert(0);
  1528. }
  1529. }
  1530. unittest
  1531. {
  1532. Appender!(char[]) r;
  1533. assert(encode!(char)('T', r) == 1);
  1534. assert(encode!(wchar)('T', r) == 1);
  1535. assert(encode!(dchar)('T', r) == 1);
  1536. }
  1537. /**
  1538. Encodes a single code point to a delegate.
  1539. This function encodes a single code point into one or more code units.
  1540. The code units are passed one at a time to the supplied delegate.
  1541. The input to this function MUST be a valid code point.
  1542. This is enforced by the function's in-contract.
  1543. The type of the output cannot be deduced. Therefore, it is necessary to
  1544. explicitly specify the encoding as a template parameter.
  1545. Supercedes:
  1546. This function supercedes std.utf.encode(), however, note that the
  1547. function codeUnits() supercedes it more conveniently.
  1548. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1549. Params:
  1550. c = the code point to be encoded
  1551. dg = the delegate to invoke for each code unit
  1552. */
  1553. void encode(E)(dchar c, void delegate(E) dg)
  1554. in
  1555. {
  1556. assert(isValidCodePoint(c));
  1557. }
  1558. body
  1559. {
  1560. EncoderInstance!(E).encode(c,dg);
  1561. }
  1562. /**
  1563. Returns a foreachable struct which can bidirectionally iterate over all
  1564. code points in a string.
  1565. The input to this function MUST be validly encoded.
  1566. This is enforced by the function's in-contract.
  1567. You can foreach either
  1568. with or without an index. If an index is specified, it will be initialized
  1569. at each iteration with the offset into the string at which the code point
  1570. begins.
  1571. Supercedes:
  1572. This function supercedes std.utf.decode().
  1573. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1574. Params:
  1575. s = the string to be decoded
  1576. Examples:
  1577. --------------------------------------------------------
  1578. string s = "hello world";
  1579. foreach(c;codePoints(s))
  1580. {
  1581. // do something with c (which will always be a dchar)
  1582. }
  1583. --------------------------------------------------------
  1584. Note that, currently, foreach(c:codePoints(s)) is superior to foreach(c;s)
  1585. in that the latter will fall over on encountering U+FFFF.
  1586. */
  1587. CodePoints!(E) codePoints(E)(immutable(E)[] s)
  1588. in
  1589. {
  1590. assert(isValid(s));
  1591. }
  1592. body
  1593. {
  1594. return CodePoints!(E)(s);
  1595. }
  1596. unittest
  1597. {
  1598. string s = "hello";
  1599. string t;
  1600. foreach(c;codePoints(s))
  1601. {
  1602. t ~= cast(char)c;
  1603. }
  1604. assert(s == t);
  1605. }
  1606. /**
  1607. Returns a foreachable struct which can bidirectionally iterate over all
  1608. code units in a code point.
  1609. The input to this function MUST be a valid code point.
  1610. This is enforced by the function's in-contract.
  1611. The type of the output cannot be deduced. Therefore, it is necessary to
  1612. explicitly specify the encoding type in the template parameter.
  1613. Supercedes:
  1614. This function supercedes std.utf.encode().
  1615. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1616. Params:
  1617. c = the code point to be encoded
  1618. Examples:
  1619. --------------------------------------------------------
  1620. dchar d = '\u20AC';
  1621. foreach(c;codeUnits!(char)(d))
  1622. {
  1623. writefln("%X",c)
  1624. }
  1625. // will print
  1626. // E2
  1627. // 82
  1628. // AC
  1629. --------------------------------------------------------
  1630. */
  1631. CodeUnits!(E) codeUnits(E)(dchar c)
  1632. in
  1633. {
  1634. assert(isValidCodePoint(c));
  1635. }
  1636. body
  1637. {
  1638. return CodeUnits!(E)(c);
  1639. }
  1640. unittest
  1641. {
  1642. char[] a;
  1643. foreach(c;codeUnits!(char)(cast(dchar)'\u20AC'))
  1644. {
  1645. a ~= c;
  1646. }
  1647. assert(a.length == 3);
  1648. assert(a[0] == 0xE2);
  1649. assert(a[1] == 0x82);
  1650. assert(a[2] == 0xAC);
  1651. }
  1652. /**
  1653. Encodes $(D c) in units of type $(D E) and writes the result to the
  1654. output range $(D R). Returns the number of $(D E)s written.
  1655. */
  1656. size_t encode(Tgt, Src, R)(in Src[] s, R range)
  1657. {
  1658. size_t result;
  1659. foreach (c; s)
  1660. {
  1661. result += encode!(Tgt)(c, range);
  1662. }
  1663. return result;
  1664. }
  1665. /**
  1666. Convert a string from one encoding to another. (See also to!() below).
  1667. The input to this function MUST be validly encoded.
  1668. This is enforced by the function's in-contract.
  1669. Supercedes:
  1670. This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
  1671. std.utf.toUTF32()
  1672. (but note that to!() supercedes it more conveniently).
  1673. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1674. Params:
  1675. s = the source string
  1676. r = the destination string
  1677. Examples:
  1678. --------------------------------------------------------
  1679. wstring ws;
  1680. transcode("hello world",ws);
  1681. // transcode from UTF-8 to UTF-16
  1682. Latin1String ls;
  1683. transcode(ws, ls);
  1684. // transcode from UTF-16 to ISO-8859-1
  1685. --------------------------------------------------------
  1686. */
  1687. void transcode(Src,Dst)(immutable(Src)[] s,out immutable(Dst)[] r)
  1688. in
  1689. {
  1690. assert(isValid(s));
  1691. }
  1692. body
  1693. {
  1694. static if(is(Src==Dst))
  1695. {
  1696. r = s;
  1697. }
  1698. else static if(is(Src==AsciiChar))
  1699. {
  1700. transcode!(char,Dst)(cast(string)s,r);
  1701. }
  1702. else
  1703. {
  1704. static if(is(Dst == wchar))
  1705. {
  1706. immutable minReservePlace = 2;
  1707. }
  1708. else static if(is(Dst == dchar))
  1709. {
  1710. immutable minReservePlace = 1;
  1711. }
  1712. else
  1713. {
  1714. immutable minReservePlace = 6;
  1715. }
  1716. Dst[] buffer = new Dst[s.length];
  1717. Dst[] tmpBuffer = buffer;
  1718. const(Src)[] t = s;
  1719. while (t.length != 0)
  1720. {
  1721. if(tmpBuffer.length < minReservePlace)
  1722. {
  1723. size_t prevLength = buffer.length;
  1724. buffer.length += t.length + minReservePlace;
  1725. tmpBuffer = buffer[prevLength - tmpBuffer.length .. $];
  1726. }
  1727. EncoderInstance!(Dst).encode(decode(t), tmpBuffer);
  1728. }
  1729. r = cast(immutable)buffer[0 .. buffer.length - tmpBuffer.length];
  1730. }
  1731. }
  1732. unittest
  1733. {
  1734. import std.typetuple;
  1735. {
  1736. import std.conv : to;
  1737. string asciiCharString = to!string(iota(0, 128, 1));
  1738. alias Types = TypeTuple!(string, Latin1String, AsciiString, Windows1252String, dstring, wstring);
  1739. foreach(S; Types)
  1740. foreach(D; Types)
  1741. {
  1742. string str;
  1743. S sStr;
  1744. D dStr;
  1745. transcode(asciiCharString, sStr);
  1746. transcode(sStr, dStr);
  1747. transcode(dStr, str);
  1748. assert(asciiCharString == str);
  1749. }
  1750. }
  1751. {
  1752. string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
  1753. alias Types = TypeTuple!(string, dstring, wstring);
  1754. foreach(S; Types)
  1755. foreach(D; Types)
  1756. {
  1757. string str;
  1758. S sStr;
  1759. D dStr;
  1760. transcode(czechChars, sStr);
  1761. transcode(sStr, dStr);
  1762. transcode(dStr, str);
  1763. assert(czechChars == str);
  1764. }
  1765. }
  1766. }
  1767. /*
  1768. Convert a string from one encoding to another. (See also transcode() above).
  1769. The input to this function MUST be validly encoded.
  1770. This is enforced by the function's in-contract.
  1771. Supercedes:
  1772. This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
  1773. std.utf.toUTF32().
  1774. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
  1775. Params:
  1776. Dst = the destination encoding type
  1777. s = the source string
  1778. Examples:
  1779. -----------------------------------------------------------------------------
  1780. auto ws = to!(wchar)("hello world"); // transcode from UTF-8 to UTF-16
  1781. auto ls = to!(Latin1Char)(ws); // transcode from UTF-16 to ISO-8859-1
  1782. -----------------------------------------------------------------------------
  1783. */
  1784. // TODO: Commented out for now - to be moved to std.conv
  1785. // Dst to(Dst,Src)(immutable(Src)[] s)
  1786. // in
  1787. // {
  1788. // assert(isValid(s));
  1789. // }
  1790. // body
  1791. // {
  1792. // Dst r;
  1793. // transcode(s,r);
  1794. // return r;
  1795. // }
  1796. //=============================================================================
  1797. /** The base class for exceptions thrown by this module */
  1798. class EncodingException : Exception { this(string msg) { super(msg); } }
  1799. class UnrecognizedEncodingException : EncodingException
  1800. {
  1801. private this(string msg) { super(msg); }
  1802. }
  1803. /** Abstract base class of all encoding schemes */
  1804. abstract class EncodingScheme
  1805. {
  1806. /**
  1807. * Registers a subclass of EncodingScheme.
  1808. *
  1809. * This function allows user-defined subclasses of EncodingScheme to
  1810. * be declared in other modules.
  1811. *
  1812. * Examples:
  1813. * ----------------------------------------------
  1814. * class Amiga1251 : EncodingScheme
  1815. * {
  1816. * shared static this()
  1817. * {
  1818. * EncodingScheme.register("path.to.Amiga1251");
  1819. * }
  1820. * }
  1821. * ----------------------------------------------
  1822. */
  1823. static void register(string className)
  1824. {
  1825. auto scheme = cast(EncodingScheme)ClassInfo.find(className).create();
  1826. if (scheme is null)
  1827. throw new EncodingException("Unable to create class "~className);
  1828. foreach(encodingName;scheme.names())
  1829. {
  1830. supported[toLower(encodingName)] = className;
  1831. }
  1832. }
  1833. /**
  1834. * Obtains a subclass of EncodingScheme which is capable of encoding
  1835. * and decoding the named encoding scheme.
  1836. *
  1837. * This function is only aware of EncodingSchemes which have been
  1838. * registered with the register() function.
  1839. *
  1840. * Examples:
  1841. * ---------------------------------------------------
  1842. * auto scheme = EncodingScheme.create("Amiga-1251");
  1843. * ---------------------------------------------------
  1844. */
  1845. static EncodingScheme create(string encodingName)
  1846. {
  1847. auto p = std.string.toLower(encodingName) in supported;
  1848. if (p is null)
  1849. throw new EncodingException("Unrecognized Encoding: "~encodingName);
  1850. string className = *p;
  1851. auto scheme = cast(EncodingScheme)ClassInfo.find(className).create();
  1852. if (scheme is null) throw new EncodingException("Unable to create class "~className);
  1853. return scheme;
  1854. }
  1855. const
  1856. {
  1857. /**
  1858. * Returns the standard name of the encoding scheme
  1859. */
  1860. abstract override string toString();
  1861. /**
  1862. * Returns an array of all known names for this encoding scheme
  1863. */
  1864. abstract string[] names();
  1865. /**
  1866. * Returns true if the character c can be represented
  1867. * in this encoding scheme.
  1868. */
  1869. abstract bool canEncode(dchar c);
  1870. /**
  1871. * Returns the number of ubytes required to encode this code point.
  1872. *
  1873. * The input to this function MUST be a valid code point.
  1874. *
  1875. * Params:
  1876. * c = the code point to be encoded
  1877. *
  1878. * Returns:
  1879. * the number of ubytes required.
  1880. */
  1881. abstract size_t encodedLength(dchar c);
  1882. /**
  1883. * Encodes a single code point into a user-supplied, fixed-size buffer.
  1884. *
  1885. * This function encodes a single code point into one or more ubytes.
  1886. * The supplied buffer must be code unit aligned.
  1887. * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
  1888. * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
  1889. *
  1890. * The input to this function MUST be a valid code point.
  1891. *
  1892. * Params:
  1893. * c = the code point to be encoded
  1894. * buffer = the destination array
  1895. *
  1896. * Returns:
  1897. * the number of ubytes written.
  1898. */
  1899. abstract size_t encode(dchar c, ubyte[] buffer);
  1900. /**
  1901. * Decodes a single code point.
  1902. *
  1903. * This function removes one or more ubytes from the start of an array,
  1904. * and returns the decoded code point which those ubytes represent.
  1905. *
  1906. * The input to this function MUST be validly encoded.
  1907. *
  1908. * Params:
  1909. * s = the array whose first code point is to be decoded
  1910. */
  1911. abstract dchar decode(ref const(ubyte)[] s);
  1912. /**
  1913. * Decodes a single code point. The input does not have to be valid.
  1914. *
  1915. * This function removes one or more ubytes from the start of an array,
  1916. * and returns the decoded code point which those ubytes represent.
  1917. *
  1918. * This function will accept an invalidly encoded array as input.
  1919. * If an invalid sequence is found at the start of the string, this
  1920. * function will remove it, and return the value INVALID_SEQUENCE.
  1921. *
  1922. * Params:
  1923. * s = the array whose first code point is to be decoded
  1924. */
  1925. abstract dchar safeDecode(ref const(ubyte)[] s);
  1926. /**
  1927. * Returns the sequence of ubytes to be used to represent
  1928. * any character which cannot be represented in the encoding scheme.
  1929. *
  1930. * Normally this will be a representation of some substitution
  1931. * character, such as U+FFFD or '?'.
  1932. */
  1933. abstract @property immutable(ubyte)[] replacementSequence();
  1934. }
  1935. /**
  1936. * Returns true if the array is encoded correctly
  1937. *
  1938. * Params:
  1939. * s = the array to be tested
  1940. */
  1941. bool isValid(const(ubyte)[] s)
  1942. {
  1943. while (s.length != 0)
  1944. {
  1945. dchar d = safeDecode(s);
  1946. if (d == INVALID_SEQUENCE)
  1947. return false;
  1948. }
  1949. return true;
  1950. }
  1951. /**
  1952. * Returns the length of the longest possible substring, starting from
  1953. * the first element, which is validly encoded.
  1954. *
  1955. * Params:
  1956. * s = the array to be tested
  1957. */
  1958. size_t validLength(const(ubyte)[] s)
  1959. {
  1960. const(ubyte)[] r = s;
  1961. const(ubyte)[] t = s;
  1962. while (s.length != 0)
  1963. {
  1964. if (safeDecode(s) == INVALID_SEQUENCE) break;
  1965. t = s;
  1966. }
  1967. return r.length - t.length;
  1968. }
  1969. /**
  1970. * Sanitizes an array by replacing malformed ubyte sequences with valid
  1971. * ubyte sequences. The result is guaranteed to be valid for this
  1972. * encoding scheme.
  1973. *
  1974. * If the input array is already valid, this function returns the
  1975. * original, otherwise it constructs a new array by replacing all illegal
  1976. * sequences with the encoding scheme's replacement sequence.
  1977. *
  1978. * Params:
  1979. * s = the string to be sanitized
  1980. */
  immutable(ubyte)[] sanitize(immutable(ubyte)[] s)
  {
      // Fast path: the whole input is already valid, hand it back untouched.
      size_t goodRun = validLength(s);
      if (goodRun == s.length)
          return s;

      auto filler = replacementSequence;

      // Pass 1: compute an upper bound for the output size.  Starting from
      // s.length over-counts (the malformed bytes themselves are dropped),
      // which is harmless because the result is trimmed before returning.
      size_t capacity = s.length;
      for (const(ubyte)[] rest = s[goodRun .. $]; rest.length != 0; )
      {
          // The decode must happen outside the assert so that it still
          // consumes the bad sequence when asserts are compiled out.
          dchar bad = safeDecode(rest);
          assert(bad == INVALID_SEQUENCE);
          capacity += filler.length;
          rest = rest[validLength(rest) .. $];
      }

      // Pass 2: copy valid runs and substitute each malformed sequence.
      ubyte[] result = new ubyte[capacity];
      result[0 .. goodRun] = s[0 .. goodRun];
      size_t written = goodRun;

      const(ubyte)[] pending = s[goodRun .. $];
      while (pending.length != 0)
      {
          // pending always starts at a malformed sequence here: skip it...
          dchar bad = safeDecode(pending);
          assert(bad == INVALID_SEQUENCE);

          // ...emit the replacement in its place...
          immutable afterFiller = written + filler.length;
          result[written .. afterFiller] = filler[];
          written = afterFiller;

          // ...then copy the valid run that follows it.
          goodRun = validLength(pending);
          result[written .. written + goodRun] = pending[0 .. goodRun];
          written += goodRun;
          pending = pending[goodRun .. $];
      }

      return cast(immutable(ubyte)[]) result[0 .. written];
  }
  2015. /**
  2016. * Returns the length of the first encoded sequence.
  2017. *
  2018. * The input to this function MUST be validly encoded.
  2019. * This is enforced by the function's in-contract.
  2020. *
  2021. * Params:
  2022. * s = the array to be sliced
  2023. */
  size_t firstSequence(const(ubyte)[] s)
  in
  {
      assert(s.length != 0);
      // Probe a copy of the slice so the check itself consumes nothing.
      const(ubyte)[] probe = s;
      assert(safeDecode(probe) != INVALID_SEQUENCE);
  }
  body
  {
      // decode advances s; the amount it shrank by is the sequence length.
      immutable lengthBefore = s.length;
      decode(s);
      return lengthBefore - s.length;
  }
  2037. /**
  2038. * Returns the total number of code points encoded in a ubyte array.
  2039. *
  2040. * The input to this function MUST be validly encoded.
  2041. * This is enforced by the function's in-contract.
  2042. *
  2043. * Params:
  2044. * s = the string to be counted
  2045. */
  size_t count(const(ubyte)[] s)
  in
  {
      assert(isValid(s));
  }
  body
  {
      // One decode call consumes exactly one code point from the front of s,
      // so the number of calls needed to empty s is the code point count.
      size_t codePoints = 0;
      for (; s.length != 0; ++codePoints)
      {
          decode(s);
      }
      return codePoints;
  }
  2061. /**
  2062. * Returns the array index at which the (n+1)th code point begins.
  2063. *
  2064. * The input to this function MUST be validly encoded.
  2065. * This is enforced by the function's in-contract.
  2066. *
  2067. * Params:
  2068. * s = the string to be counted
  2069. * n = the current code point index
  2070. */
  ptrdiff_t index(const(ubyte)[] s, size_t n)
  in
  {
      assert(isValid(s));
      // n is unsigned, so a "n >= 0" check can never fail.  The bound that
      // actually matters is that s contains at least n code points;
      // otherwise the loop below would call decode on an empty slice.
      assert(n <= count(s));
  }
  body
  {
      // Skip n code points; the number of bytes consumed while doing so is
      // the offset at which the (n+1)th code point begins.
      immutable total = s.length;
      for (size_t skipped = 0; skipped < n; ++skipped)
      {
          decode(s);
      }
      return total - s.length;
  }
  2083. __gshared string[string] supported;
  2084. }
  /**
   EncodingScheme to handle ASCII

   This scheme recognises the following names:
                   "ANSI_X3.4-1968",
                   "ANSI_X3.4-1986",
                   "ASCII",
                   "IBM367",
                   "ISO646-US",
                   "ISO_646.irv:1991",
                   "US-ASCII",
                   "cp367",
                   "csASCII",
                   "iso-ir-6",
                   "us"
   */
  class EncodingSchemeASCII : EncodingScheme
  {
      // Registers this class with EncodingScheme under its fully qualified name.
      shared static this()
      {
          EncodingScheme.register("std.encoding.EncodingSchemeASCII");
      }

      const
      {
          // Every name this scheme answers to.
          override string[] names()
          {
              return
              [
                  cast(string)
                  "ANSI_X3.4-1968",
                  "ANSI_X3.4-1986",
                  "ASCII",
                  "IBM367",
                  "ISO646-US",
                  "ISO_646.irv:1991",
                  "US-ASCII",
                  "cp367",
                  // The comma after "csASCII" is essential: without it D
                  // concatenates the adjacent literals into the single bogus
                  // name "csASCIIiso-ir-6" and both real aliases are lost.
                  "csASCII",
                  "iso-ir-6",
                  "us"
              ];
          }

          // Name reported for this scheme.
          override string toString()
          {
              return "ASCII";
          }

          // Forwards to the compile-time AsciiChar implementation.
          override bool canEncode(dchar c)
          {
              return std.encoding.canEncode!(AsciiChar)(c);
          }

          // Forwards to the compile-time AsciiChar implementation.
          override size_t encodedLength(dchar c)
          {
              return std.encoding.encodedLength!(AsciiChar)(c);
          }

          // Encodes c into buffer, viewed as AsciiChar code units, and returns
          // whatever std.encoding.encode reports for that write.
          override size_t encode(dchar c, ubyte[] buffer)
          {
              auto r = cast(AsciiChar[])buffer;
              return std.encoding.encode(c,r);
          }

          // Decodes from the front of s, then re-slices s to the part the
          // decoder left unconsumed (one AsciiChar is one byte, so the
          // remaining unit count equals the remaining byte count).
          override dchar decode(ref const(ubyte)[] s)
          {
              auto t = cast(const(AsciiChar)[]) s;
              dchar c = std.encoding.decode(t);
              s = s[$-t.length..$];
              return c;
          }

          // Same as decode, but goes through std.encoding.safeDecode.
          override dchar safeDecode(ref const(ubyte)[] s)
          {
              auto t = cast(const(AsciiChar)[]) s;
              dchar c = std.encoding.safeDecode(t);
              s = s[$-t.length..$];
              return c;
          }

          // Bytes substituted for anything this scheme cannot represent.
          override @property immutable(ubyte)[] replacementSequence()
          {
              return cast(immutable(ubyte)[])"?";
          }
      }
  }
  2163. /**
  2164. EncodingScheme to handle Latin-1
  2165. This scheme recognises the following names:
  2166. "CP819",
  2167. "IBM819",
  2168. "ISO-8859-1",
  2169. "ISO_8859-1",
  2170. "ISO_8859-1:1987",
  2171. "csISOLatin1",
  2172. "iso-ir-100",
  2173. "l1",
  2174. "latin1"
  2175. */
  class EncodingSchemeLatin1 : EncodingScheme
  {
      // Registers this class with EncodingScheme under its fully qualified name.
      shared static this()
      {
          EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
      }

      // Every name this scheme answers to.
      override string[] names() const
      {
          return
          [
              cast(string)
              "CP819",
              "IBM819",
              "ISO-8859-1",
              "ISO_8859-1",
              "ISO_8859-1:1987",
              "csISOLatin1",
              "iso-ir-100",
              "l1",
              "latin1"
          ];
      }

      // Name reported for this scheme.
      override string toString() const
      {
          return "ISO-8859-1";
      }

      // Forwards to the compile-time Latin1Char implementation.
      override bool canEncode(dchar c) const
      {
          return std.encoding.canEncode!(Latin1Char)(c);
      }

      // Forwards to the compile-time Latin1Char implementation.
      override size_t encodedLength(dchar c) const
      {
          return std.encoding.encodedLength!(Latin1Char)(c);
      }

      // Encodes c into buffer, viewed as Latin1Char code units.
      override size_t encode(dchar c, ubyte[] buffer) const
      {
          auto target = cast(Latin1Char[]) buffer;
          return std.encoding.encode(c, target);
      }

      // Decodes from the front of s and leaves s holding only the bytes the
      // decoder did not consume.
      override dchar decode(ref const(ubyte)[] s) const
      {
          auto units = cast(const(Latin1Char)[]) s;
          immutable dchar decoded = std.encoding.decode(units);
          s = s[s.length - units.length .. s.length];
          return decoded;
      }

      // Same as decode, but goes through std.encoding.safeDecode.
      override dchar safeDecode(ref const(ubyte)[] s) const
      {
          auto units = cast(const(Latin1Char)[]) s;
          immutable dchar decoded = std.encoding.safeDecode(units);
          s = s[s.length - units.length .. s.length];
          return decoded;
      }

      // Bytes substituted for anything this scheme cannot represent.
      override @property immutable(ubyte)[] replacementSequence() const
      {
          return cast(immutable(ubyte)[])"?";
      }
  }
  2237. /**
  2238. EncodingScheme to handle Windows-1252
  2239. This scheme recognises the following names:
  2240. "windows-1252"
  2241. */
  class EncodingSchemeWindows1252 : EncodingScheme
  {
      // Registers this class with EncodingScheme under its fully qualified name.
      shared static this()
      {
          EncodingScheme.register("std.encoding.EncodingSchemeWindows1252");
      }

      // Every name this scheme answers to.
      override string[] names() const
      {
          return
          [
              cast(string)
              "windows-1252"
          ];
      }

      // Name reported for this scheme.
      override string toString() const
      {
          return "windows-1252";
      }

      // Forwards to the compile-time Windows1252Char implementation.
      override bool canEncode(dchar c) const
      {
          return std.encoding.canEncode!(Windows1252Char)(c);
      }

      // Forwards to the compile-time Windows1252Char implementation.
      override size_t encodedLength(dchar c) const
      {
          return std.encoding.encodedLength!(Windows1252Char)(c);
      }

      // Encodes c into buffer, viewed as Windows1252Char code units.
      override size_t encode(dchar c, ubyte[] buffer) const
      {
          auto target = cast(Windows1252Char[]) buffer;
          return std.encoding.encode(c, target);
      }

      // Decodes from the front of s and leaves s holding only the bytes the
      // decoder did not consume.
      override dchar decode(ref const(ubyte)[] s) const
      {
          auto units = cast(const(Windows1252Char)[]) s;
          immutable dchar decoded = std.encoding.decode(units);
          s = s[s.length - units.length .. s.length];
          return decoded;
      }

      // Same as decode, but goes through std.encoding.safeDecode.
      override dchar safeDecode(ref const(ubyte)[] s) const
      {
          auto units = cast(const(Windows1252Char)[]) s;
          immutable dchar decoded = std.encoding.safeDecode(units);
          s = s[s.length - units.length .. s.length];
          return decoded;
      }

      // Bytes substituted for anything this scheme cannot represent.
      override @property immutable(ubyte)[] replacementSequence() const
      {
          return cast(immutable(ubyte)[])"?";
      }
  }
  2295. /**
  2296. EncodingScheme to handle UTF-8
  2297. This scheme recognises the following names:
  2298. "UTF-8"
  2299. */
  class EncodingSchemeUtf8 : EncodingScheme
  {
      // Registers this class with EncodingScheme under its fully qualified name.
      shared static this()
      {
          EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
      }

      // Every name this scheme answers to.
      override string[] names() const
      {
          return
          [
              cast(string)
              "UTF-8"
          ];
      }

      // Name reported for this scheme.
      override string toString() const
      {
          return "UTF-8";
      }

      // Forwards to the compile-time char (UTF-8) implementation.
      override bool canEncode(dchar c) const
      {
          return std.encoding.canEncode!(char)(c);
      }

      // Forwards to the compile-time char (UTF-8) implementation.
      override size_t encodedLength(dchar c) const
      {
          return std.encoding.encodedLength!(char)(c);
      }

      // Encodes c into buffer, viewed as char code units.
      override size_t encode(dchar c, ubyte[] buffer) const
      {
          auto target = cast(char[]) buffer;
          return std.encoding.encode(c, target);
      }

      // Decodes from the front of s and leaves s holding only the bytes the
      // decoder did not consume (char is one byte wide, so unit count and
      // byte count coincide).
      override dchar decode(ref const(ubyte)[] s) const
      {
          auto units = cast(const(char)[]) s;
          immutable dchar decoded = std.encoding.decode(units);
          s = s[s.length - units.length .. s.length];
          return decoded;
      }

      // Same as decode, but goes through std.encoding.safeDecode.
      override dchar safeDecode(ref const(ubyte)[] s) const
      {
          auto units = cast(const(char)[]) s;
          immutable dchar decoded = std.encoding.safeDecode(units);
          s = s[s.length - units.length .. s.length];
          return decoded;
      }

      // The UTF-8 bytes of U+FFFD, substituted for malformed input.
      override @property immutable(ubyte)[] replacementSequence() const
      {
          return cast(immutable(ubyte)[])"\uFFFD";
      }
  }
  2353. /**
  2354. EncodingScheme to handle UTF-16 in native byte order
  2355. This scheme recognises the following names:
  2356. "UTF-16LE" (little-endian architecture only)
  2357. "UTF-16BE" (big-endian architecture only)
  2358. */
  class EncodingSchemeUtf16Native : EncodingScheme
  {
      // Registers this class with EncodingScheme under its fully qualified name.
      shared static this()
      {
          EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
      }

      const
      {
          // The scheme's name follows the byte order of the target platform.
          version(LittleEndian) { enum string NAME = "UTF-16LE"; }
          version(BigEndian)    { enum string NAME = "UTF-16BE"; }

          // Every name this scheme answers to.
          override string[] names()
          {
              return [ NAME ];
          }

          // Name reported for this scheme.
          override string toString()
          {
              return NAME;
          }

          // Forwards to the compile-time wchar (UTF-16) implementation.
          override bool canEncode(dchar c)
          {
              return std.encoding.canEncode!(wchar)(c);
          }

          // Forwards to the compile-time wchar (UTF-16) implementation.
          override size_t encodedLength(dchar c)
          {
              return std.encoding.encodedLength!(wchar)(c);
          }

          // Encodes c into buffer, viewed as wchar code units.  The value
          // returned by std.encoding.encode is scaled by wchar.sizeof so
          // that the result is expressed in bytes.
          override size_t encode(dchar c, ubyte[] buffer)
          {
              auto r = cast(wchar[])buffer;
              return wchar.sizeof * std.encoding.encode(c,r);
          }

          // Decodes from the front of s and advances s past what was consumed.
          // s must contain a whole number of wchar code units.
          override dchar decode(ref const(ubyte)[] s)
          in
          {
              assert((s.length & 1) == 0);
          }
          body
          {
              auto t = cast(const(wchar)[]) s;
              dchar c = std.encoding.decode(t);
              // t.length counts wchar units while s is indexed in bytes, so
              // the remaining length has to be converted back to bytes.
              // Slicing with the bare unit count would keep only half of the
              // unconsumed input.
              s = s[$ - t.length * wchar.sizeof .. $];
              return c;
          }

          // Same as decode, but goes through std.encoding.safeDecode.
          override dchar safeDecode(ref const(ubyte)[] s)
          in
          {
              assert((s.length & 1) == 0);
          }
          body
          {
              auto t = cast(const(wchar)[]) s;
              dchar c = std.encoding.safeDecode(t);
              // Convert the remaining wchar count to bytes before re-slicing.
              s = s[$ - t.length * wchar.sizeof .. $];
              return c;
          }

          // The UTF-16 encoding of U+FFFD, viewed as bytes.
          override @property immutable(ubyte)[] replacementSequence()
          {
              return cast(immutable(ubyte)[])"\uFFFD"w;
          }
      }
  }
  2420. /**
  2421. EncodingScheme to handle UTF-32 in native byte order
  2422. This scheme recognises the following names:
  2423. "UTF-32LE" (little-endian architecture only)
  2424. "UTF-32BE" (big-endian architecture only)
  2425. */
  class EncodingSchemeUtf32Native : EncodingScheme
  {
      // Registers this class with EncodingScheme under its fully qualified name.
      shared static this()
      {
          EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
      }

      const
      {
          // The scheme's name follows the byte order of the target platform.
          version(LittleEndian) { enum string NAME = "UTF-32LE"; }
          version(BigEndian)    { enum string NAME = "UTF-32BE"; }

          // Every name this scheme answers to.
          override string[] names()
          {
              return [ NAME ];
          }

          // Name reported for this scheme.
          override string toString()
          {
              return NAME;
          }

          // Forwards to the compile-time dchar (UTF-32) implementation.
          override bool canEncode(dchar c)
          {
              return std.encoding.canEncode!(dchar)(c);
          }

          // Forwards to the compile-time dchar (UTF-32) implementation.
          override size_t encodedLength(dchar c)
          {
              return std.encoding.encodedLength!(dchar)(c);
          }

          // Encodes c into buffer, viewed as dchar code units.  The value
          // returned by std.encoding.encode is scaled by dchar.sizeof so
          // that the result is expressed in bytes.
          override size_t encode(dchar c, ubyte[] buffer)
          {
              auto r = cast(dchar[])buffer;
              return dchar.sizeof * std.encoding.encode(c,r);
          }

          // Decodes from the front of s and advances s past what was consumed.
          // s must contain a whole number of dchar code units.
          override dchar decode(ref const(ubyte)[] s)
          in
          {
              assert((s.length & 3) == 0);
          }
          body
          {
              auto t = cast(const(dchar)[]) s;
              dchar c = std.encoding.decode(t);
              // t.length counts dchar units while s is indexed in bytes, so
              // the remaining length has to be converted back to bytes.
              // Slicing with the bare unit count would keep only a quarter
              // of the unconsumed input.
              s = s[$ - t.length * dchar.sizeof .. $];
              return c;
          }

          // Same as decode, but goes through std.encoding.safeDecode.
          override dchar safeDecode(ref const(ubyte)[] s)
          in
          {
              assert((s.length & 3) == 0);
          }
          body
          {
              auto t = cast(const(dchar)[]) s;
              dchar c = std.encoding.safeDecode(t);
              // Convert the remaining dchar count to bytes before re-slicing.
              s = s[$ - t.length * dchar.sizeof .. $];
              return c;
          }

          // The UTF-32 encoding of U+FFFD, viewed as bytes.
          override @property immutable(ubyte)[] replacementSequence()
          {
              return cast(immutable(ubyte)[])"\uFFFD"d;
          }
      }
  }
  2487. //=============================================================================
  2488. // Helper functions
  2489. version(unittest)
  2490. {
  // Test helper: transcodes s into r by walking the code points, and the code
  // units of each code point, back to front and prepending every unit to r.
  //
  // Params:
  //   s = source string with code units of type Src
  //   r = receives the transcoded string with code units of type Dst
  void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
  {
      static if(is(Src==Dst))
      {
          // Same code unit type: nothing to transcode.  The result travels
          // through the out parameter; this function returns void, so a
          // "return s;" here would leave r empty (or fail to compile).
          r = s;
      }
      else static if(is(Src==AsciiChar))
      {
          // Handle ASCII input by reinterpreting it as a UTF-8 string.
          transcodeReverse!(char,Dst)(cast(string)s,r);
      }
      else
      {
          foreach_reverse(d;codePoints(s))
          {
              foreach_reverse(c;codeUnits!(Dst)(d))
              {
                  // Prepending while iterating backwards restores forward order.
                  r = c ~ r;
              }
          }
      }
  }
  // Test helper: renders a UTF-8 string as a double-quoted literal in which
  // every code unit outside 0x20 .. 0x7F is written as a \xHH escape.
  string makeReadable(string s)
  {
      string quoted = "\"";
      foreach (char unit; s)
      {
          immutable bool keepVerbatim = unit >= 0x20 && unit < 0x80;
          if (!keepVerbatim)
          {
              // two hex digits, high nibble first
              quoted ~= "\\x";
              quoted ~= toHexDigit(unit >> 4);
              quoted ~= toHexDigit(unit);
              continue;
          }
          quoted ~= unit;
      }
      return quoted ~ "\"";
  }
  // Test helper: renders a UTF-16 string as a double-quoted literal with a
  // trailing 'w', writing every code unit outside 0x20 .. 0x7F as \uHHHH.
  string makeReadable(wstring s)
  {
      string quoted = "\"";
      foreach (wchar unit; s)
      {
          immutable bool keepVerbatim = unit >= 0x20 && unit < 0x80;
          if (!keepVerbatim)
          {
              // four hex digits, most significant nibble first
              quoted ~= "\\u";
              quoted ~= toHexDigit(unit >> 12);
              quoted ~= toHexDigit(unit >> 8);
              quoted ~= toHexDigit(unit >> 4);
              quoted ~= toHexDigit(unit);
              continue;
          }
          quoted ~= cast(char) unit;
      }
      return quoted ~ "\"w";
  }
  // Test helper: renders a UTF-32 string as a double-quoted literal with a
  // trailing 'd'.  Code points in 0x20 .. 0x7F are copied verbatim, other
  // code points below 0x10000 become \uHHHH, and the rest become \U00HHHHHH.
  string makeReadable(dstring s)
  {
      string quoted = "\"";
      foreach (dchar point; s)
      {
          if (point >= 0x20 && point < 0x80)
          {
              quoted ~= cast(char) point;
              continue;
          }

          if (point >= 0x10000)
          {
              // eight-digit form: the two leading digits are written as a
              // literal "00", followed by digits for bits 20-23 and 16-19
              quoted ~= "\\U00";
              quoted ~= toHexDigit(point >> 20);
              quoted ~= toHexDigit(point >> 16);
          }
          else
          {
              quoted ~= "\\u";
          }

          // both escape forms end with the same four low-order digits
          quoted ~= toHexDigit(point >> 12);
          quoted ~= toHexDigit(point >> 8);
          quoted ~= toHexDigit(point >> 4);
          quoted ~= toHexDigit(point);
      }
      return quoted ~ "\"d";
  }
  // Test helper: maps the low four bits of n to an upper-case hexadecimal
  // digit; all higher bits of n are ignored.
  char toHexDigit(int n)
  {
      immutable int nibble = n & 0xF;
      return cast(char)(nibble < 10 ? '0' + nibble : 'A' + (nibble - 10));
  }
  2587. }