/core/unicode.d

http://github.com/wilkie/djehuty · D · 1081 lines · 725 code · 205 blank · 151 comment · 209 complexity · b305b6c838fc8b68a11c4b8195c90c9c MD5 · raw file

  1. /*
  2. * unicode.d
  3. *
  4. * This module implements unicode functions that were badly needed.
  5. *
  6. * Author: Dave Wilkinson
  7. *
  8. */
  9. module core.unicode;
  10. import core.definitions;
  // Surrogate-pair arithmetic: a supplementary code point is split into two
  // 10-bit halves after halfBase has been subtracted from it.
  private static const uint halfShift = 10;
  private static const uint halfBase = 0x0010000;
  private static const uint halfMask = 0x3FF;

  // Boundaries of the UTF-16 surrogate ranges.
  private const int UNI_SUR_HIGH_START = 0xD800;
  private const int UNI_SUR_HIGH_END = 0xDBFF;
  private const int UNI_SUR_LOW_START = 0xDC00;
  private const int UNI_SUR_LOW_END = 0xDFFF;

  // Substitute character and the upper limits of the encodings.
  private const dchar UNI_REPLACEMENT_CHAR = cast(dchar)0x0000FFFD;
  private const dchar UNI_MAX_BMP = cast(dchar)0x0000FFFF;
  private const dchar UNI_MAX_UTF16 = cast(dchar)0x0010FFFF;
  private const dchar UNI_MAX_UTF32 = cast(dchar)0x7FFFFFFF;
  private const dchar UNI_MAX_LEGAL_UTF32 = cast(dchar)0x0010FFFF;

  // Marker OR-ed into the first byte of a UTF-8 sequence, indexed by the
  // total number of bytes in that sequence.
  private static const ubyte[7] firstByteMark = [ 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC ];
  /*
   * Indexed by the first byte of a UTF-8 sequence; yields the number of
   * trailing bytes that are supposed to follow that byte.
   * Legal UTF-8 never has 4 or 5 trailing bytes. Those entries are kept
   * as-is for anyone who wants to perform such conversions, which earlier
   * algorithms allowed.
   */
  static const char[256] trailingBytesForUTF8 = [
      // 0x00 - 0xBF: no trailing bytes
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      // 0xC0 - 0xDF: one trailing byte
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      // 0xE0 - 0xEF: two trailing bytes
      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      // 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five
      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  ];
  /*
   * Magic values subtracted from the accumulated buffer value during UTF-8
   * decoding. The table is indexed by the number of trailing bytes of the
   * sequence, so it holds one entry per possible trailing-byte count.
   */
  static const uint[6] offsetsFromUTF8 = [
      0x00000000,
      0x00003080,
      0x000E2080,
      0x03C82080,
      0xFA082080,
      0x82082080
  ];
  /*
   * Tells whether the `length` bytes starting at `source` form one legal
   * UTF-8 sequence.
   *
   * The caller determines `length` from the first byte, e.g.
   *     length = trailingBytesForUTF8[*source] + 1;
   * and must already know that this many bytes are available.
   *
   * Any length outside 1..4 yields false: the Unicode definition of UTF-8
   * goes up to 4-byte sequences.
   */
  private bool isLegalUTF8(char* source, int length) {
      if (length < 1 || length > 4) {
          return false;
      }

      char lead = *source;

      // The third and fourth bytes (when present) must be continuation
      // bytes; they are examined from the last one backwards.
      for (int pos = length - 1; pos >= 2; pos--) {
          char trail = source[pos];
          if (trail < 0x80 || trail > 0xBF) {
              return false;
          }
      }

      if (length >= 2) {
          char second = source[1];
          if (second > 0xBF) {
              return false;
          }

          // The permitted range of the second byte is narrower for some
          // lead bytes than the generic 0x80..0xBF.
          if (lead == 0xE0) {
              if (second < 0xA0) {
                  return false;
              }
          }
          else if (lead == 0xED) {
              if (second > 0x9F) {
                  return false;
              }
          }
          else if (lead == 0xF0) {
              if (second < 0x90) {
                  return false;
              }
          }
          else if (lead == 0xF4) {
              if (second > 0x8F) {
                  return false;
              }
          }
          else if (second < 0x80) {
              return false;
          }
      }

      // 0x80..0xC1 can never start a sequence, and neither can anything
      // above 0xF4.
      if (lead >= 0x80 && lead < 0xC2) {
          return false;
      }
      if (lead > 0xF4) {
          return false;
      }

      return true;
  }
  80. // For efficiency, we have full
  81. // control of the buffer length.
  82. struct Unicode {
  83. static:
  // The input is already UTF-8; hand back a copy of it.
  string toUtf8(string src) {
      char[] copy = src.dup;
      return cast(string)copy;
  }
  /*
   * Converts a UTF-16 string to UTF-8.
   *
   * Surrogate pairs are combined into one code point before being encoded.
   * Conversion stops at the first ill-formed surrogate (a high surrogate
   * that is not followed by a low surrogate, including one at the very end
   * of the input, or a low surrogate on its own); everything converted up
   * to that point is returned.
   */
  string toUtf8(wstring src) {
      if (src.length == 0) {
          return cast(string)"";
      }

      // A single UTF-16 unit needs at most 3 bytes and a surrogate pair
      // (2 units) needs 4, so 4 bytes per unit is always enough.
      char[] container = new char[src.length*4];

      const auto byteMask = 0xBF;
      const auto byteMark = 0x80;

      wchar* source = src.ptr;
      wchar* sourceEnd = src.ptr + src.length;
      char* target = container.ptr;

      uint bytesToWrite;
      dchar ch;

      while (source < sourceEnd) {
          ch = *source++;

          if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
              // Never look past the end of the input for the second half
              // of the pair: a trailing high surrogate is unpaired.
              if (source >= sourceEnd) {
                  break;
              }

              dchar ch2 = cast(dchar)*source;
              if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                  ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase;
                  source++;
              }
              else {
                  // unpaired high surrogate: illegal
                  // TODO: do not break, just add a character and continue to produce valid string
                  break;
              }
          }
          else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
              // low surrogate without a preceding high surrogate: illegal
              // TODO: do not break, just add a character and continue to produce valid string
              break;
          }

          /* Figure out how many bytes the result will require */
          if (ch < cast(dchar)0x80) {
              bytesToWrite = 1;
          }
          else if (ch < cast(dchar)0x800) {
              bytesToWrite = 2;
          }
          else if (ch < cast(dchar)0x10000) {
              bytesToWrite = 3;
          }
          else if (ch < cast(dchar)0x110000) {
              bytesToWrite = 4;
          }
          else {
              bytesToWrite = 3;
              ch = UNI_REPLACEMENT_CHAR;
          }

          // The sequence is written back to front: continuation bytes
          // first, then the lead byte with its length marker.
          target += bytesToWrite;
          char* cursor = target;
          for (uint k = bytesToWrite; k > 1; k--) {
              *--cursor = cast(char)((ch | byteMark) & byteMask);
              ch >>= 6;
          }
          *--cursor = cast(char)(ch | firstByteMark[bytesToWrite]);
      }

      return cast(string)container[0..target - container.ptr];
  }
  /*
   * Converts a UTF-32 string to UTF-8.
   *
   * Values that are not encodable code points are replaced with
   * UNI_REPLACEMENT_CHAR: anything above UNI_MAX_LEGAL_UTF32 and anything
   * in the surrogate range. Encoding a surrogate directly would produce a
   * 3-byte sequence that isLegalUTF8 rejects, so the output could not be
   * decoded again by the routines in this module.
   */
  string toUtf8(dstring src) {
      if (src is null || src.length == 0) {
          return cast(string)"";
      }

      // No code point needs more than 4 bytes.
      char[] container = new char[src.length*4];

      const auto byteMask = 0xBF;
      const auto byteMark = 0x80;

      dchar* source = src.ptr;
      dchar* sourceEnd = src.ptr + src.length;
      char* target = container.ptr;

      uint bytesToWrite;
      dchar ch;

      while (source < sourceEnd) {
          ch = *source++;

          if (ch > UNI_MAX_LEGAL_UTF32
                  || (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
              ch = UNI_REPLACEMENT_CHAR;
          }

          /* Figure out how many bytes the result will require. */
          if (ch < cast(dchar)0x80) {
              bytesToWrite = 1;
          }
          else if (ch < cast(dchar)0x800) {
              bytesToWrite = 2;
          }
          else if (ch < cast(dchar)0x10000) {
              bytesToWrite = 3;
          }
          else {
              bytesToWrite = 4;
          }

          // The sequence is written back to front: continuation bytes
          // first, then the lead byte with its length marker.
          target += bytesToWrite;
          char* cursor = target;
          for (uint k = bytesToWrite; k > 1; k--) {
              *--cursor = cast(char)((ch | byteMark) & byteMask);
              ch >>= 6;
          }
          *--cursor = cast(char)(ch | firstByteMark[bytesToWrite]);
      }

      size_t targetLen = target - container.ptr;
      return cast(string)container[0..targetLen];
  }
  /*
   * Converts a UTF-8 string to UTF-16.
   *
   * Decoding stops at the first truncated or illegal sequence; the units
   * produced up to that point are returned.
   */
  wstring toUtf16(string src) {
      if (src.length == 0) {
          return cast(wstring)"";
      }

      wchar[] container = new wchar[src.length];

      char* source = src.ptr;
      char* sourceEnd = &src[$-1] + 1;
      wchar* target = container.ptr;

      while (source < sourceEnd) {
          ushort extraBytesToRead = trailingBytesForUTF8[*source];

          // sourceExhausted: the sequence would run past the input
          if (source + extraBytesToRead >= sourceEnd) {
              break;
          }

          // sourceIllegal
          if (!isLegalUTF8(source, extraBytesToRead+1)) {
              break;
          }

          // Accumulate the raw bytes, six bits apart, then strip the
          // marker bits in one go with the offset table.
          dchar ch = 0;
          for (uint remaining = extraBytesToRead; remaining > 0; remaining--) {
              ch += *source++;
              ch <<= 6;
          }
          ch += *source++;
          ch -= offsetsFromUTF8[extraBytesToRead];

          if (ch > UNI_MAX_UTF16) {
              // illegal
              *target++ = UNI_REPLACEMENT_CHAR;
          }
          else if (ch > UNI_MAX_BMP) {
              /* character in range 0x10000 - 0x10FFFF: emit a surrogate pair */
              ch -= halfBase;
              *target++ = cast(wchar)((ch >> halfShift) + UNI_SUR_HIGH_START);
              *target++ = cast(wchar)((ch & halfMask) + UNI_SUR_LOW_START);
          }
          else if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
              /* UTF-16 surrogate values are illegal as code points */
              *target++ = UNI_REPLACEMENT_CHAR;
          }
          else {
              *target++ = cast(wchar)ch; /* normal case */
          }
      }

      return cast(wstring)container[0..target - container.ptr];
  }
  // The input is already UTF-16; hand back a copy of it.
  wstring toUtf16(wstring src) {
      wchar[] copy = src.dup;
      return cast(wstring)copy;
  }
  /*
   * Converts a UTF-32 string to UTF-16.
   *
   * Surrogate values and values above UNI_MAX_LEGAL_UTF32 are replaced
   * with UNI_REPLACEMENT_CHAR. Code points above UNI_MAX_BMP are written
   * as a surrogate pair.
   */
  wstring toUtf16(dstring src) {
      if (src.length == 0) {
          return cast(wstring)"";
      }

      // Every code point above UNI_MAX_BMP takes two UTF-16 units, so the
      // buffer must hold up to twice as many units as there are code
      // points; sizing it at src.length would be overrun by the first
      // supplementary character.
      wchar[] container = new wchar[src.length*2];

      dchar* source = src.ptr;
      dchar* sourceEnd = src.ptr + src.length;
      wchar* target = container.ptr;

      dchar ch;

      while (source < sourceEnd) {
          ch = *source++;

          if (ch <= UNI_MAX_BMP) {
              /* Target is a character <= 0xFFFF */
              if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                  /* UTF-16 surrogate values are illegal in UTF-32 */
                  *target++ = UNI_REPLACEMENT_CHAR;
              }
              else {
                  *target++ = cast(wchar)ch; /* normal case */
              }
          }
          else if (ch > UNI_MAX_LEGAL_UTF32) {
              *target++ = UNI_REPLACEMENT_CHAR;
          }
          else {
              /* target is a character in range 0x10000 - 0x10FFFF. */
              ch -= halfBase;
              *target++ = cast(wchar)((ch >> halfShift) + UNI_SUR_HIGH_START);
              *target++ = cast(wchar)((ch & halfMask) + UNI_SUR_LOW_START);
          }
      }

      return cast(wstring)container[0..target - container.ptr];
  }
  /*
   * Converts a UTF-8 string to UTF-32.
   *
   * Decoding stops at the first truncated or illegal sequence; the code
   * points produced up to that point are returned.
   */
  dstring toUtf32(string src) {
      if (src.length == 0) {
          return cast(dstring)"";
      }

      dchar[] container = new dchar[src.length];

      char* source = src.ptr;
      char* sourceEnd = &src[$-1] + 1;
      dchar* target = container.ptr;

      while (source < sourceEnd) {
          ushort extraBytesToRead = trailingBytesForUTF8[*source];

          // sourceExhausted: the sequence would run past the input
          if (source + extraBytesToRead >= sourceEnd) {
              break;
          }

          // sourceIllegal
          if (!isLegalUTF8(source, extraBytesToRead+1)) {
              break;
          }

          // Accumulate the raw bytes, six bits apart, then strip the
          // marker bits in one go with the offset table.
          dchar ch = 0;
          for (uint remaining = extraBytesToRead; remaining > 0; remaining--) {
              ch += *source++;
              ch <<= 6;
          }
          ch += *source++;
          ch -= offsetsFromUTF8[extraBytesToRead];

          /*
           * UTF-16 surrogate values are illegal in UTF-32, and anything
           * over Plane 17 (> 0x10FFFF) is illegal.
           */
          bool illegal = ch > UNI_MAX_LEGAL_UTF32
              || (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END);

          if (illegal) {
              *target++ = UNI_REPLACEMENT_CHAR;
          }
          else {
              *target++ = ch;
          }
      }

      return cast(dstring)container[0..target - container.ptr];
  }
  /*
   * Converts a UTF-16 string to UTF-32.
   *
   * A high surrogate followed by a low surrogate becomes one code point.
   * A high surrogate followed by anything else, and a low surrogate on its
   * own, are copied through unchanged. A high surrogate at the very end of
   * the input ends the conversion without being emitted.
   */
  dstring toUtf32(wstring src) {
      if (src.length == 0) {
          return cast(dstring)"";
      }

      dchar[] container = new dchar[src.length];

      wchar* source = src.ptr;
      wchar* sourceEnd = &src[$-1] + 1;
      dchar* target = container.ptr;

      while (source < sourceEnd) {
          dchar ch = *source++;

          bool isHigh = ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END;
          if (isHigh) {
              // sourceExhausted: the second half of the pair is missing
              if (source >= sourceEnd) {
                  break;
              }

              dchar ch2 = *source;
              if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                  ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase;
                  source++;
              }
          }

          *target++ = ch;
      }

      return cast(dstring)container[0..target - container.ptr];
  }
  // The input is already UTF-32; hand back a copy of it.
  dstring toUtf32(dstring src) {
      dchar[] copy = src.dup;
      return cast(dstring)copy;
  }
  388. // character conversions
  /*
   * Returns the first code point of a UTF-8 string.
   *
   * The string is converted with toUtf32 and the first element of the
   * result is returned. When the conversion yields nothing (empty input,
   * or input that starts with a truncated or illegal sequence),
   * UNI_REPLACEMENT_CHAR is returned instead of indexing an empty array.
   */
  dchar toUtf32Char(string src) {
      dstring decoded = toUtf32(src);
      if (decoded.length == 0) {
          return UNI_REPLACEMENT_CHAR;
      }
      return decoded[0];
  }
  /*
   * Returns the first code point of a UTF-16 string.
   *
   * The string is converted with toUtf32 and the first element of the
   * result is returned. When the conversion yields nothing (empty input,
   * or input consisting only of a high surrogate), UNI_REPLACEMENT_CHAR is
   * returned instead of indexing an empty array.
   */
  dchar toUtf32Char(wstring src) {
      dstring decoded = toUtf32(src);
      if (decoded.length == 0) {
          return UNI_REPLACEMENT_CHAR;
      }
      return decoded[0];
  }
  /*
   * Returns the first code point of a UTF-32 string.
   *
   * Provided for symmetry with the other overloads. Empty input yields
   * UNI_REPLACEMENT_CHAR instead of indexing an empty array.
   */
  dchar toUtf32Char(dstring src) {
      if (src.length == 0) {
          return UNI_REPLACEMENT_CHAR;
      }
      return src[0];
  }
  // Classifies the first code point of a UTF-8 sequence.
  bool isDeadChar(char[] chr) {
      return isDeadChar(toUtf32Char(chr));
  }
  // Classifies the first code point of a UTF-16 sequence.
  bool isDeadChar(wchar[] chr) {
      return isDeadChar(toUtf32Char(chr));
  }
  // Classifies the first element of a UTF-32 sequence.
  bool isDeadChar(dchar[] chr) {
      dchar first = chr[0];
      return isDeadChar(first);
  }
  /*
   * Tells whether a code point is a "dead" character, i.e. lies in one of
   * the combining-mark blocks listed below.
   */
  bool isDeadChar(dchar chr) {
      // Combining Diacritical Marks
      if (chr >= 0x300 && chr <= 0x36F) {
          return true;
      }
      // Combining Diacritical Marks Supplement
      if (chr >= 0x1DC0 && chr <= 0x1DFF) {
          return true;
      }
      // Combining Diacritical Marks for Symbols
      if (chr >= 0x20D0 && chr <= 0x20FF) {
          return true;
      }
      // Combining Half Marks
      if (chr >= 0xFE20 && chr <= 0xFE2F) {
          return true;
      }
      return false;
  }
  422. // character conversions
  /*
   * Decodes the first character of a UTF-8 string together with the dead
   * (combining) characters that directly follow it.
   *
   * If a truncated, illegal or out-of-range sequence is met, decoding
   * stops; when nothing had been collected by then, the result is a single
   * UNI_REPLACEMENT_CHAR.
   */
  dchar[] toUtf32Chars(string src) {
      dchar[] container;

      if (src.length == 0) {
          return [];
      }

      char* source = src.ptr;
      char* sourceEnd = &src[$-1] + 1;

      bool failed = false;

      while (source < sourceEnd) {
          ushort extraBytesToRead = trailingBytesForUTF8[*source];

          // sourceExhausted or sourceIllegal
          if (source + extraBytesToRead >= sourceEnd
                  || !isLegalUTF8(source, extraBytesToRead+1)) {
              failed = true;
              break;
          }

          // Accumulate the raw bytes, six bits apart, then strip the
          // marker bits in one go with the offset table.
          dchar ch = 0;
          for (uint remaining = extraBytesToRead; remaining > 0; remaining--) {
              ch += *source++;
              ch <<= 6;
          }
          ch += *source++;
          ch -= offsetsFromUTF8[extraBytesToRead];

          /*
           * UTF-16 surrogate values are illegal in UTF-32, and anything
           * over Plane 17 (> 0x10FFFF) is illegal.
           */
          if (ch > UNI_MAX_LEGAL_UTF32
                  || (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
              failed = true;
              break;
          }

          // The first character is always taken; after that only dead
          // characters extend the result.
          if (container.length > 0 && !isDeadChar(ch)) {
              break;
          }

          container ~= ch;
      }

      if (failed && container.length == 0) {
          container ~= UNI_REPLACEMENT_CHAR;
      }

      return container;
  }
  /*
   * Decodes the first character of a UTF-16 string together with the dead
   * (combining) characters that directly follow it.
   *
   * If an unpaired high surrogate is met, decoding stops; when nothing had
   * been collected by then, the result is a single UNI_REPLACEMENT_CHAR.
   * This matches the behaviour of the UTF-8 overload.
   */
  dchar[] toUtf32Chars(wstring src) {
      dchar[] container;

      if (src.length == 0) {
          return [];
      }

      wchar* source = src.ptr;
      wchar* sourceEnd = src.ptr + src.length;

      dchar ch, ch2;

      while (source < sourceEnd) {
          ch = *source++;

          /* If we have a surrogate pair, convert to UTF32 first. */
          if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
              bool paired = false;

              /* If the 16 bits following the high surrogate are in the source buffer... */
              if (source < sourceEnd) {
                  ch2 = *source;
                  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                      ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase;
                      // Consume the low surrogate. Leaving it in place
                      // would make the next iteration read it as a
                      // separate, non-dead character and cut the result
                      // short.
                      source++;
                      paired = true;
                  }
              }

              if (!paired) {
                  // Unpaired high surrogate. Only substitute when there is
                  // no character yet: appending a replacement character
                  // after a base character would add a second non-dead
                  // character to the result.
                  if (container.length == 0) {
                      container ~= UNI_REPLACEMENT_CHAR;
                  }
                  return container;
              }
          }

          // The first character is always taken; after that only dead
          // characters extend the result.
          if (container.length > 0 && !isDeadChar(ch)) {
              break;
          }

          container ~= ch;
      }

      return container;
  }
  /*
   * Returns the first element of a UTF-32 string together with the dead
   * (combining) characters that directly follow it.
   */
  dchar[] toUtf32Chars(dstring src) {
      dchar[] container;

      if (src.length == 0) {
          return [];
      }

      // The first element is taken unconditionally.
      container ~= src[0];

      // Extend with the run of dead characters that follows it.
      size_t idx = 1;
      while (idx < src.length && isDeadChar(src[idx])) {
          container ~= src[idx];
          idx++;
      }

      return cast(dchar[])container;
  }
  /*
   * Encodes, as UTF-16, the first character of a UTF-32 string together
   * with the dead (combining) characters that directly follow it.
   *
   * If a surrogate value or a value above UNI_MAX_LEGAL_UTF32 is met,
   * encoding stops; when nothing had been collected by then, the result is
   * a single UNI_REPLACEMENT_CHAR.
   */
  wchar[] toUtf16Chars(dstring src) {
      wchar[] container;

      if (src.length == 0) {
          return cast(wchar[])container;
      }

      dchar* source = src.ptr;
      dchar* sourceEnd = src.ptr + src.length;

      dchar ch;

      while (source < sourceEnd) {
          ch = *source++;

          /* UTF-16 surrogate values are illegal in UTF-32, as is anything above 0x10FFFF */
          bool illegal = ch > UNI_MAX_LEGAL_UTF32
              || (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END);

          if (illegal) {
              if (container.length == 0) {
                  container ~= cast(wchar)UNI_REPLACEMENT_CHAR;
              }
              return cast(wchar[])container;
          }

          // The dead-character test must see the real code point. Testing
          // after halfBase has been subtracted would classify e.g. U+10300
          // as U+0300 and wrongly treat it as a combining mark.
          if (container.length > 0 && !isDeadChar(ch)) {
              break;
          }

          if (ch <= UNI_MAX_BMP) {
              container ~= cast(wchar)ch; /* normal case */
          }
          else {
              /* target is a character in range 0x10000 - 0x10FFFF. */
              ch -= halfBase;
              container ~= cast(wchar)((ch >> halfShift) + UNI_SUR_HIGH_START);
              container ~= cast(wchar)((ch & halfMask) + UNI_SUR_LOW_START);
          }
      }

      return cast(wchar[])container;
  }
  /*
   * Encodes, as UTF-8, the first character of a UTF-32 string together
   * with the dead (combining) characters that directly follow it.
   *
   * The characters are selected with toUtf32Chars and encoded with toUtf8,
   * so the encoding rules of toUtf8(dstring) apply to them.
   */
  char[] toUtf8Chars(dstring src) {
      if (src.length == 0) {
          return [];
      }

      dchar[] cluster = toUtf32Chars(src);

      // toUtf8 returns a freshly allocated buffer, so handing it out as a
      // mutable array does not alias the caller's data.
      return cast(char[])toUtf8(cast(dstring)cluster);
  }
  615. // string length stuffs
  /*
   * Counts the characters of a UTF-8 string, not counting dead (combining)
   * characters.
   *
   * Counting stops at the first truncated or illegal sequence. Decoded
   * values that are surrogates or above UNI_MAX_LEGAL_UTF32 are treated as
   * UNI_REPLACEMENT_CHAR.
   */
  uint utflen(string src) {
      if (src.length == 0) {
          return 0;
      }

      char* source = src.ptr;
      char* sourceEnd = &src[$-1] + 1;

      uint len;

      while (source < sourceEnd) {
          ushort extraBytesToRead = trailingBytesForUTF8[*source];

          // sourceExhausted: the sequence would run past the input
          if (source + extraBytesToRead >= sourceEnd) {
              break;
          }

          // sourceIllegal
          if (!isLegalUTF8(source, extraBytesToRead+1)) {
              break;
          }

          // Accumulate the raw bytes, six bits apart, then strip the
          // marker bits in one go with the offset table.
          dchar ch = 0;
          for (uint remaining = extraBytesToRead; remaining > 0; remaining--) {
              ch += *source++;
              ch <<= 6;
          }
          ch += *source++;
          ch -= offsetsFromUTF8[extraBytesToRead];

          /*
           * UTF-16 surrogate values are illegal in UTF-32, and anything
           * over Plane 17 (> 0x10FFFF) is illegal.
           */
          if (ch > UNI_MAX_LEGAL_UTF32
                  || (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
              ch = UNI_REPLACEMENT_CHAR;
          }

          // only characters that are not dead are counted
          if (!isDeadChar(ch)) {
              len++;
          }
      }

      return len;
  }
  /*
   * Counts the characters of a UTF-16 string, not counting dead
   * (combining) characters.
   *
   * A surrogate pair counts as one character. A high surrogate that is not
   * followed by a low surrogate is treated as UNI_REPLACEMENT_CHAR, and
   * the unit after it is then examined on its own. A high surrogate at the
   * very end of the input ends the count without being counted.
   */
  uint utflen(wstring src) {
      if (src.length == 0) {
          return 0;
      }

      wchar* source = src.ptr;
      wchar* sourceEnd = src.ptr + src.length;

      uint len = 0;

      dchar ch, ch2;

      while (source < sourceEnd) {
          ch = *source++;

          if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
              if (source >= sourceEnd) {
                  break;
              }

              ch2 = *source;
              if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                  ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase;
                  // Consume the low surrogate so the pair is counted once.
                  source++;
              }
              else {
                  // Invalid surrogate. `source` already points at the unit
                  // after the high surrogate and must stay there: stepping
                  // back would re-read the same high surrogate forever.
                  ch = UNI_REPLACEMENT_CHAR;
              }
          }

          // only characters that are not dead are counted
          if (!isDeadChar(ch)) {
              len++;
          }
      }

      return len;
  }
  /*
   * Counts the elements of a UTF-32 string that are not dead (combining)
   * characters.
   */
  uint utflen(dstring src) {
      if (src.length == 0) {
          return 0;
      }

      uint len;

      foreach (dchar element; src) {
          // only characters that are not dead are counted
          if (!isDeadChar(element)) {
              len++;
          }
      }

      return len;
  }
  719. // Unicode Indices
  /*
   * Returns, for every character of a UTF-8 string that is not a dead
   * (combining) character, the byte offset at which its sequence starts.
   *
   * Scanning stops at the first truncated or illegal sequence. Decoded
   * values that are surrogates or above UNI_MAX_LEGAL_UTF32 are treated as
   * UNI_REPLACEMENT_CHAR.
   */
  uint[] calcIndices(string src) {
      if (src is null || src == "") {
          return [];
      }

      uint[] ret = new uint[src.length];

      char* source = src.ptr;
      char* sourceEnd = &src[$-1] + 1;

      uint len;       // number of offsets stored so far
      uint offset;    // byte offset of the sequence being examined

      while (source < sourceEnd) {
          ushort extraBytesToRead = trailingBytesForUTF8[*source];

          // sourceExhausted: the sequence would run past the input
          if (source + extraBytesToRead >= sourceEnd) {
              break;
          }

          // sourceIllegal
          if (!isLegalUTF8(source, extraBytesToRead+1)) {
              break;
          }

          // Accumulate the raw bytes, six bits apart, then strip the
          // marker bits in one go with the offset table.
          dchar ch = 0;
          for (uint remaining = extraBytesToRead; remaining > 0; remaining--) {
              ch += *source++;
              ch <<= 6;
          }
          ch += *source++;
          ch -= offsetsFromUTF8[extraBytesToRead];

          /*
           * UTF-16 surrogate values are illegal in UTF-32, and anything
           * over Plane 17 (> 0x10FFFF) is illegal.
           */
          if (ch > UNI_MAX_LEGAL_UTF32
                  || (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
              ch = UNI_REPLACEMENT_CHAR;
          }

          // only characters that are not dead get an entry
          if (!isDeadChar(ch)) {
              ret[len] = offset;
              len++;
          }

          offset += extraBytesToRead+1;
      }

      return ret[0..len];
  }
  /*
   * Returns, for every character of a UTF-16 string that is not a dead
   * (combining) character, the unit offset at which it starts.
   *
   * A surrogate pair is one character. A high surrogate that is not
   * followed by a low surrogate is treated as UNI_REPLACEMENT_CHAR, and
   * the unit after it is then examined on its own. A high surrogate at the
   * very end of the input ends the scan.
   */
  uint[] calcIndices(wstring src) {
      if (src is null || src == "") {
          return [];
      }

      uint[] ret = new uint[src.length];

      wchar* source = src.ptr;
      wchar* sourceEnd = &src[$-1] + 1;

      uint len;    // number of offsets stored so far

      while (source < sourceEnd) {
          // Offset of the unit this character starts at.
          uint start = cast(uint)(source - src.ptr);

          dchar ch = *source++;

          if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
              if (source >= sourceEnd) {
                  break;
              }

              dchar ch2 = *source;
              if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                  ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase;
                  source++;
              }
              else {
                  // invalid surrogate; the following unit is left for the
                  // next iteration
                  ch = UNI_REPLACEMENT_CHAR;
              }
          }

          // only characters that are not dead get an entry
          if (!isDeadChar(ch)) {
              ret[len] = start;
              len++;
          }
      }

      return ret[0..len];
  }
  /*
   * Returns the index of every element of a UTF-32 string that is not a
   * dead (combining) character.
   *
   * The result holds exactly one entry per such element, like the UTF-8
   * and UTF-16 overloads.
   */
  uint[] calcIndices(dstring src) {
      if (src is null || src == "") {
          return [];
      }

      uint[] ret = new uint[src.length];

      uint len;

      for (uint i = 0; i < src.length; i++) {
          // only characters that are not dead get an entry
          if (!isDeadChar(src[i])) {
              ret[len] = i;
              len++;
          }
      }

      // Trim to the entries actually written. Returning the whole buffer
      // would append one zero index for every dead character skipped.
      return ret[0..len];
  }
  // A UTF-8 byte starts a sequence unless it carries the 10xxxxxx
  // signature of a follow-up byte.
  bool isStartChar(char chr) {
      bool isFollowUp = (chr & 0b11000000) == 0b10000000;
      return !isFollowUp;
  }
  // A UTF-16 unit starts a character unless it is a low surrogate.
  bool isStartChar(wchar chr) {
      bool isLowSurrogate = chr >= UNI_SUR_LOW_START && chr <= UNI_SUR_LOW_END;
      return !isLowSurrogate;
  }
  // Every UTF-32 element is reported as a start character.
  bool isStartChar(dchar chr) {
      return true;
  }
  /*
   * Maps a code page 866 byte to its code point. Bytes below 0x80 map to
   * themselves; the upper half is looked up in CP866_to_UTF32.
   */
  dchar fromCP866(char chr) {
      if (chr >= 0x80) {
          return CP866_to_UTF32[chr-128];
      }
      return cast(dchar)chr;
  }
  864. private:
  865. // Codepage Encodings
  // Code points for the code page 866 bytes 0x80 - 0xFF, indexed by
  // (byte - 0x80). One row per 16 bytes.
  dchar[] CP866_to_UTF32 = [
      // 0x80 - 0x8F
      0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
      0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
      // 0x90 - 0x9F
      0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
      0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
      // 0xA0 - 0xAF
      0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
      0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
      // 0xB0 - 0xBF
      0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
      0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
      // 0xC0 - 0xCF
      0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
      0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
      // 0xD0 - 0xDF
      0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
      0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
      // 0xE0 - 0xEF
      0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
      0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
      // 0xF0 - 0xFF
      0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040E, 0x045E,
      0x00B0, 0x2219, 0x00B7, 0x221A, 0x2116, 0x00A4, 0x25A0, 0x00A0,
  ];
  876. }