/thirdparty/breakpad/common/convert_UTF.c

http://github.com/tomahawk-player/tomahawk · C · 533 lines · 359 code · 36 blank · 138 comment · 122 complexity · 4d044a3fc0cf0a802ea0a9b7997bbf2c MD5 · raw file

  1. /*
  2. * Copyright 2001-2004 Unicode, Inc.
  3. *
  4. * Disclaimer
  5. *
  6. * This source code is provided as is by Unicode, Inc. No claims are
  7. * made as to fitness for any particular purpose. No warranties of any
  8. * kind are expressed or implied. The recipient agrees to determine
  9. * applicability of information provided. If this file has been
  10. * purchased on magnetic or optical media from Unicode, Inc., the
  11. * sole remedy for any claim will be exchange of defective media
  12. * within 90 days of receipt.
  13. *
  14. * Limitations on Rights to Redistribute This Code
  15. *
  16. * Unicode, Inc. hereby grants the right to freely use the information
  17. * supplied in this file in the creation of products supporting the
  18. * Unicode Standard, and to make copies of this file in any form
  19. * for internal or external distribution as long as this notice
  20. * remains attached.
  21. */
  22. /* ---------------------------------------------------------------------
  23. Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  24. Author: Mark E. Davis, 1994.
  25. Rev History: Rick McGowan, fixes & updates May 2001.
  26. Sept 2001: fixed const & error conditions per
  27. mods suggested by S. Parent & A. Lillich.
  28. June 2002: Tim Dodd added detection and handling of incomplete
  29. source sequences, enhanced error detection, added casts
  30. to eliminate compiler warnings.
  31. July 2003: slight mods to back out aggressive FFFE detection.
  32. Jan 2004: updated switches in from-UTF8 conversions.
  33. Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  34. See the header file "ConvertUTF.h" for complete documentation.
  35. ------------------------------------------------------------------------ */
  36. #include "convert_UTF.h"
  37. #ifdef CVTUTF_DEBUG
  38. #include <stdio.h>
  39. #endif
  40. static const int halfShift = 10; /* used for shifting by 10 bits */
  41. static const UTF32 halfBase = 0x0010000UL;
  42. static const UTF32 halfMask = 0x3FFUL;
  43. #define UNI_SUR_HIGH_START (UTF32)0xD800
  44. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  45. #define UNI_SUR_LOW_START (UTF32)0xDC00
  46. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  47. #define false 0
  48. #define true 1
  49. /* --------------------------------------------------------------------- */
  50. ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
  51. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  52. ConversionResult result = conversionOK;
  53. const UTF32* source = *sourceStart;
  54. UTF16* target = *targetStart;
  55. while (source < sourceEnd) {
  56. UTF32 ch;
  57. if (target >= targetEnd) {
  58. result = targetExhausted; break;
  59. }
  60. ch = *source++;
  61. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  62. /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  63. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  64. if (flags == strictConversion) {
  65. --source; /* return to the illegal value itself */
  66. result = sourceIllegal;
  67. break;
  68. } else {
  69. *target++ = UNI_REPLACEMENT_CHAR;
  70. }
  71. } else {
  72. *target++ = (UTF16)ch; /* normal case */
  73. }
  74. } else if (ch > UNI_MAX_LEGAL_UTF32) {
  75. if (flags == strictConversion) {
  76. result = sourceIllegal;
  77. } else {
  78. *target++ = UNI_REPLACEMENT_CHAR;
  79. }
  80. } else {
  81. /* target is a character in range 0xFFFF - 0x10FFFF. */
  82. if (target + 1 >= targetEnd) {
  83. --source; /* Back up source pointer! */
  84. result = targetExhausted; break;
  85. }
  86. ch -= halfBase;
  87. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  88. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  89. }
  90. }
  91. *sourceStart = source;
  92. *targetStart = target;
  93. return result;
  94. }
  95. /* --------------------------------------------------------------------- */
  96. ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
  97. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  98. ConversionResult result = conversionOK;
  99. const UTF16* source = *sourceStart;
  100. UTF32* target = *targetStart;
  101. UTF32 ch, ch2;
  102. while (source < sourceEnd) {
  103. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  104. ch = *source++;
  105. /* If we have a surrogate pair, convert to UTF32 first. */
  106. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  107. /* If the 16 bits following the high surrogate are in the source buffer... */
  108. if (source < sourceEnd) {
  109. ch2 = *source;
  110. /* If it's a low surrogate, convert to UTF32. */
  111. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  112. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  113. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  114. ++source;
  115. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  116. --source; /* return to the illegal value itself */
  117. result = sourceIllegal;
  118. break;
  119. }
  120. } else { /* We don't have the 16 bits following the high surrogate. */
  121. --source; /* return to the high surrogate */
  122. result = sourceExhausted;
  123. break;
  124. }
  125. } else if (flags == strictConversion) {
  126. /* UTF-16 surrogate values are illegal in UTF-32 */
  127. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  128. --source; /* return to the illegal value itself */
  129. result = sourceIllegal;
  130. break;
  131. }
  132. }
  133. if (target >= targetEnd) {
  134. source = oldSource; /* Back up source pointer! */
  135. result = targetExhausted; break;
  136. }
  137. *target++ = ch;
  138. }
  139. *sourceStart = source;
  140. *targetStart = target;
  141. #ifdef CVTUTF_DEBUG
  142. if (result == sourceIllegal) {
  143. fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
  144. fflush(stderr);
  145. }
  146. #endif
  147. return result;
  148. }
  149. /* --------------------------------------------------------------------- */
  /*
   * Indexed by the first byte of a UTF-8 sequence; yields the number of
   * trailing bytes that are supposed to follow that byte.
   * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries
   * are kept for anyone who wants to decode the longer forms that earlier
   * definitions of UTF-8 allowed.
   */
  static const char trailingBytesForUTF8[256] = {
      /* 0x00 - 0xBF: no trailing bytes */
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0xC0 - 0xDF: one trailing byte */
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      /* 0xE0 - 0xEF: two trailing bytes */
      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      /* 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five */
      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  };
  167. /*
  168. * Magic values subtracted from a buffer value during UTF8 conversion.
  169. * This table contains as many values as there might be trailing bytes
  170. * in a UTF-8 sequence.
  171. */
  172. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  173. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
  174. /*
  175. * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  176. * into the first byte, depending on how many bytes follow. There are
  177. * as many entries in this table as there are UTF-8 sequence types.
  178. * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
  179. * for *legal* UTF-8 will be 4 or fewer bytes total.
  180. */
  181. static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  182. /* --------------------------------------------------------------------- */
  183. /* The interface converts a whole buffer to avoid function-call overhead.
  184. * Constants have been gathered. Loops & conditionals have been removed as
  185. * much as possible for efficiency, in favor of drop-through switches.
  186. * (See "Note A" at the bottom of the file for equivalent code.)
  187. * If your compiler supports it, the "isLegalUTF8" call can be turned
  188. * into an inline function.
  189. */
  190. /* --------------------------------------------------------------------- */
  191. ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
  192. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  193. ConversionResult result = conversionOK;
  194. const UTF16* source = *sourceStart;
  195. UTF8* target = *targetStart;
  196. while (source < sourceEnd) {
  197. UTF32 ch;
  198. unsigned short bytesToWrite = 0;
  199. const UTF32 byteMask = 0xBF;
  200. const UTF32 byteMark = 0x80;
  201. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  202. ch = *source++;
  203. /* If we have a surrogate pair, convert to UTF32 first. */
  204. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  205. /* If the 16 bits following the high surrogate are in the source buffer... */
  206. if (source < sourceEnd) {
  207. UTF32 ch2 = *source;
  208. /* If it's a low surrogate, convert to UTF32. */
  209. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  210. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  211. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  212. ++source;
  213. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  214. --source; /* return to the illegal value itself */
  215. result = sourceIllegal;
  216. break;
  217. }
  218. } else { /* We don't have the 16 bits following the high surrogate. */
  219. --source; /* return to the high surrogate */
  220. result = sourceExhausted;
  221. break;
  222. }
  223. } else if (flags == strictConversion) {
  224. /* UTF-16 surrogate values are illegal in UTF-32 */
  225. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  226. --source; /* return to the illegal value itself */
  227. result = sourceIllegal;
  228. break;
  229. }
  230. }
  231. /* Figure out how many bytes the result will require */
  232. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  233. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  234. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  235. } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
  236. } else { bytesToWrite = 3;
  237. ch = UNI_REPLACEMENT_CHAR;
  238. }
  239. target += bytesToWrite;
  240. if (target > targetEnd) {
  241. source = oldSource; /* Back up source pointer! */
  242. target -= bytesToWrite; result = targetExhausted; break;
  243. }
  244. switch (bytesToWrite) { /* note: everything falls through. */
  245. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  246. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  247. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  248. case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
  249. }
  250. target += bytesToWrite;
  251. }
  252. *sourceStart = source;
  253. *targetStart = target;
  254. return result;
  255. }
  256. /* --------------------------------------------------------------------- */
  257. /*
  258. * Utility routine to tell whether a sequence of bytes is legal UTF-8.
  259. * This must be called with the length pre-determined by the first byte.
  260. * If not calling this from ConvertUTF8to*, then the length can be set by:
  261. * length = trailingBytesForUTF8[*source]+1;
  262. * and the sequence is illegal right away if there aren't that many bytes
  263. * available.
  264. * If presented with a length > 4, this returns false. The Unicode
  265. * definition of UTF-8 goes up to 4-byte sequences.
  266. */
  267. static Boolean isLegalUTF8(const UTF8 *source, int length) {
  268. UTF8 a;
  269. const UTF8 *srcptr = source+length;
  270. switch (length) {
  271. default: return false;
  272. /* Everything else falls through when "true"... */
  273. case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  274. case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  275. case 2: if ((a = (*--srcptr)) > 0xBF) return false;
  276. switch (*source) {
  277. /* no fall-through in this inner switch */
  278. case 0xE0: if (a < 0xA0) return false; break;
  279. case 0xED: if (a > 0x9F) return false; break;
  280. case 0xF0: if (a < 0x90) return false; break;
  281. case 0xF4: if (a > 0x8F) return false; break;
  282. default: if (a < 0x80) return false;
  283. }
  284. case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  285. }
  286. if (*source > 0xF4) return false;
  287. return true;
  288. }
  289. /* --------------------------------------------------------------------- */
  290. /*
  291. * Exported function to return whether a UTF-8 sequence is legal or not.
  292. * This is not used here; it's just exported.
  293. */
  294. Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
  295. int length = trailingBytesForUTF8[*source]+1;
  296. if (source+length > sourceEnd) {
  297. return false;
  298. }
  299. return isLegalUTF8(source, length);
  300. }
  301. /* --------------------------------------------------------------------- */
  302. ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
  303. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  304. ConversionResult result = conversionOK;
  305. const UTF8* source = *sourceStart;
  306. UTF16* target = *targetStart;
  307. while (source < sourceEnd) {
  308. UTF32 ch = 0;
  309. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  310. if (source + extraBytesToRead >= sourceEnd) {
  311. result = sourceExhausted; break;
  312. }
  313. /* Do this check whether lenient or strict */
  314. if (! isLegalUTF8(source, extraBytesToRead+1)) {
  315. result = sourceIllegal;
  316. break;
  317. }
  318. /*
  319. * The cases all fall through. See "Note A" below.
  320. */
  321. switch (extraBytesToRead) {
  322. case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  323. case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  324. case 3: ch += *source++; ch <<= 6;
  325. case 2: ch += *source++; ch <<= 6;
  326. case 1: ch += *source++; ch <<= 6;
  327. case 0: ch += *source++;
  328. }
  329. ch -= offsetsFromUTF8[extraBytesToRead];
  330. if (target >= targetEnd) {
  331. source -= (extraBytesToRead+1); /* Back up source pointer! */
  332. result = targetExhausted; break;
  333. }
  334. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  335. /* UTF-16 surrogate values are illegal in UTF-32 */
  336. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  337. if (flags == strictConversion) {
  338. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  339. result = sourceIllegal;
  340. break;
  341. } else {
  342. *target++ = UNI_REPLACEMENT_CHAR;
  343. }
  344. } else {
  345. *target++ = (UTF16)ch; /* normal case */
  346. }
  347. } else if (ch > UNI_MAX_UTF16) {
  348. if (flags == strictConversion) {
  349. result = sourceIllegal;
  350. source -= (extraBytesToRead+1); /* return to the start */
  351. break; /* Bail out; shouldn't continue */
  352. } else {
  353. *target++ = UNI_REPLACEMENT_CHAR;
  354. }
  355. } else {
  356. /* target is a character in range 0xFFFF - 0x10FFFF. */
  357. if (target + 1 >= targetEnd) {
  358. source -= (extraBytesToRead+1); /* Back up source pointer! */
  359. result = targetExhausted; break;
  360. }
  361. ch -= halfBase;
  362. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  363. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  364. }
  365. }
  366. *sourceStart = source;
  367. *targetStart = target;
  368. return result;
  369. }
  370. /* --------------------------------------------------------------------- */
  371. ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
  372. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  373. ConversionResult result = conversionOK;
  374. const UTF32* source = *sourceStart;
  375. UTF8* target = *targetStart;
  376. while (source < sourceEnd) {
  377. UTF32 ch;
  378. unsigned short bytesToWrite = 0;
  379. const UTF32 byteMask = 0xBF;
  380. const UTF32 byteMark = 0x80;
  381. ch = *source++;
  382. if (flags == strictConversion ) {
  383. /* UTF-16 surrogate values are illegal in UTF-32 */
  384. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  385. --source; /* return to the illegal value itself */
  386. result = sourceIllegal;
  387. break;
  388. }
  389. }
  390. /*
  391. * Figure out how many bytes the result will require. Turn any
  392. * illegally large UTF32 things (> Plane 17) into replacement chars.
  393. */
  394. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  395. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  396. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  397. } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
  398. } else { bytesToWrite = 3;
  399. ch = UNI_REPLACEMENT_CHAR;
  400. result = sourceIllegal;
  401. }
  402. target += bytesToWrite;
  403. if (target > targetEnd) {
  404. --source; /* Back up source pointer! */
  405. target -= bytesToWrite; result = targetExhausted; break;
  406. }
  407. switch (bytesToWrite) { /* note: everything falls through. */
  408. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  409. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  410. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  411. case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
  412. }
  413. target += bytesToWrite;
  414. }
  415. *sourceStart = source;
  416. *targetStart = target;
  417. return result;
  418. }
  419. /* --------------------------------------------------------------------- */
  420. ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
  421. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  422. ConversionResult result = conversionOK;
  423. const UTF8* source = *sourceStart;
  424. UTF32* target = *targetStart;
  425. while (source < sourceEnd) {
  426. UTF32 ch = 0;
  427. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  428. if (source + extraBytesToRead >= sourceEnd) {
  429. result = sourceExhausted; break;
  430. }
  431. /* Do this check whether lenient or strict */
  432. if (! isLegalUTF8(source, extraBytesToRead+1)) {
  433. result = sourceIllegal;
  434. break;
  435. }
  436. /*
  437. * The cases all fall through. See "Note A" below.
  438. */
  439. switch (extraBytesToRead) {
  440. case 5: ch += *source++; ch <<= 6;
  441. case 4: ch += *source++; ch <<= 6;
  442. case 3: ch += *source++; ch <<= 6;
  443. case 2: ch += *source++; ch <<= 6;
  444. case 1: ch += *source++; ch <<= 6;
  445. case 0: ch += *source++;
  446. }
  447. ch -= offsetsFromUTF8[extraBytesToRead];
  448. if (target >= targetEnd) {
  449. source -= (extraBytesToRead+1); /* Back up the source pointer! */
  450. result = targetExhausted; break;
  451. }
  452. if (ch <= UNI_MAX_LEGAL_UTF32) {
  453. /*
  454. * UTF-16 surrogate values are illegal in UTF-32, and anything
  455. * over Plane 17 (> 0x10FFFF) is illegal.
  456. */
  457. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  458. if (flags == strictConversion) {
  459. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  460. result = sourceIllegal;
  461. break;
  462. } else {
  463. *target++ = UNI_REPLACEMENT_CHAR;
  464. }
  465. } else {
  466. *target++ = ch;
  467. }
  468. } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
  469. result = sourceIllegal;
  470. *target++ = UNI_REPLACEMENT_CHAR;
  471. }
  472. }
  473. *sourceStart = source;
  474. *targetStart = target;
  475. return result;
  476. }
  477. /* ---------------------------------------------------------------------
  478. Note A.
  479. The fall-through switches in UTF-8 reading code save a
  480. temp variable, some decrements & conditionals. The switches
  481. are equivalent to the following loop:
  482. {
  483. int tmpBytesToRead = extraBytesToRead+1;
  484. do {
  485. ch += *source++;
  486. --tmpBytesToRead;
  487. if (tmpBytesToRead) ch <<= 6;
  488. } while (tmpBytesToRead > 0);
  489. }
  490. In UTF-8 writing code, the switches on "bytesToWrite" are
  491. similarly unrolled loops.
  492. --------------------------------------------------------------------- */