/sombok-2.2.1/lib/utf8.c

# · C · 321 lines · 255 code · 18 blank · 48 comment · 146 complexity · 1e7576dcd1853c377843a5fa7e9eedc5 MD5 · raw file

  1. /*
  2. * utf8.c - Handle UTF-8 sequence.
  3. *
  4. * Copyright (C) 2012 by Hatuka*nezumi - IKEDA Soji.
  5. *
  6. * This file is part of the Sombok Package. This program is free
  7. * software; you can redistribute it and/or modify it under the terms of
  8. * either the GNU General Public License or the Artistic License, as
  9. * specified in the README file.
  10. *
  11. */
  12. #include "sombok.h"
  13. /** @defgroup utf8 utf8
  14. * @brief Handle UTF-8 sequence.
  15. *
  16. * @note This module was introduced by release 2.1.0.
  17. *
  18. *@{*/
  19. /** Decode UTF-8 string to Unicode string
  20. *
  21. * @param[out] unistr Unicode string, must not be NULL.
  22. * @param[in] maxchars maximum number of characters to be decoded.
  23. * 0 means infinite
  24. * @param[in] utf8 source UTF-8 string
  25. * @param[in] utf8len length of string
  26. * @param[in] check 0: no check; 1: check malformed sequence; 2: check
  27. * surrogate too; 3: check codes beyond Unicode too
  28. *
  29. * @returns Unicode string.
  30. * If unistr->str was NULL or maxchars was 0 (infinite), required buffer will
  31. * be (re-)allocated.
  32. * If error occurred, NULL is returned and errno is set.
  33. *
  34. * @note unistr->str must not point to static memory.
  35. */
  36. unistr_t *sombok_decode_utf8(unistr_t *unistr, size_t maxchars,
  37. const char *utf8, size_t utf8len, int check)
  38. {
  39. size_t i, unilen;
  40. unichar_t unichar, *uni;
  41. int pass;
  42. if (unistr == NULL) {
  43. errno = EINVAL;
  44. return NULL;
  45. }
  46. uni = unistr->str;
  47. if (utf8 == NULL)
  48. utf8len = 0;
  49. for (pass = 1; pass <= 2; pass++) {
  50. for (i = 0, unilen = 0; i < utf8len; unilen++) {
  51. if (maxchars != 0 && maxchars < unilen + 1)
  52. break;
  53. if ((utf8[i] & 0x80) == 0) {
  54. if (pass == 2)
  55. uni[unilen] = utf8[i];
  56. i++;
  57. } else if (i + 1 < utf8len &&
  58. (utf8[i] & 0xE0) == 0xC0 &&
  59. (utf8[i + 1] & 0xC0) == 0x80) {
  60. if (pass == 2) {
  61. unichar = utf8[i] & 0x1F;
  62. unichar <<= 6;
  63. unichar |= utf8[i + 1] & 0x3F;
  64. uni[unilen] = unichar;
  65. }
  66. i += 2;
  67. } else if (i + 2 < utf8len &&
  68. (utf8[i] & 0xF0) == 0xE0 &&
  69. (utf8[i + 1] & 0xC0) == 0x80 &&
  70. (utf8[i + 2] & 0xC0) == 0x80) {
  71. if (SOMBOK_UTF8_CHECK_SURROGATE <= check &&
  72. (utf8[i] & 0x0F) == 0x0D && (utf8[i + 1] & 0x20) == 0x20) {
  73. errno = EPERM;
  74. return NULL;
  75. }
  76. if (pass == 2) {
  77. unichar = utf8[i] & 0x0F;
  78. unichar <<= 6;
  79. unichar |= utf8[i + 1] & 0x3F;
  80. unichar <<= 6;
  81. unichar |= utf8[i + 2] & 0x3F;
  82. uni[unilen] = unichar;
  83. }
  84. i += 3;
  85. } else if (i + 3 < utf8len &&
  86. (utf8[i] & 0xF8) == 0xF0 &&
  87. (utf8[i + 1] & 0xC0) == 0x80 &&
  88. (utf8[i + 2] & 0xC0) == 0x80 &&
  89. (utf8[i + 3] & 0xC0) == 0x80) {
  90. if (SOMBOK_UTF8_CHECK_NONUNICODE <= check &&
  91. 0x10 <
  92. (((utf8[i] & 0x07) << 2) | ((utf8[i + 1] & 0x30) >> 4))) {
  93. errno = EPERM;
  94. return NULL;
  95. }
  96. if (pass == 2) {
  97. unichar = utf8[i] & 0x07;
  98. unichar <<= 6;
  99. unichar |= utf8[i + 1] & 0x3F;
  100. unichar <<= 6;
  101. unichar |= utf8[i + 2] & 0x3F;
  102. unichar <<= 6;
  103. unichar |= utf8[i + 3] & 0x3F;
  104. uni[unilen] = unichar;
  105. }
  106. i += 4;
  107. } else if (SOMBOK_UTF8_CHECK_NONUNICODE <= check) {
  108. errno = EPERM;
  109. return NULL;
  110. } else if (i + 4 < utf8len &&
  111. (utf8[i] & 0xFC) == 0xF8 &&
  112. (utf8[i + 1] & 0xC0) == 0x80 &&
  113. (utf8[i + 2] & 0xC0) == 0x80 &&
  114. (utf8[i + 3] & 0xC0) == 0x80 &&
  115. (utf8[i + 4] & 0xC0) == 0x80) {
  116. if (pass == 2) {
  117. unichar = utf8[i] & 0x03;
  118. unichar <<= 6;
  119. unichar |= utf8[i + 1] & 0x3F;
  120. unichar <<= 6;
  121. unichar |= utf8[i + 2] & 0x3F;
  122. unichar <<= 6;
  123. unichar |= utf8[i + 3] & 0x3F;
  124. unichar <<= 6;
  125. unichar |= utf8[i + 4] & 0x3F;
  126. uni[unilen] = unichar;
  127. }
  128. i += 5;
  129. } else if (i + 5 < utf8len &&
  130. (utf8[i] & 0xFE) == 0xFC &&
  131. (utf8[i + 1] & 0xC0) == 0x80 &&
  132. (utf8[i + 2] & 0xC0) == 0x80 &&
  133. (utf8[i + 3] & 0xC0) == 0x80 &&
  134. (utf8[i + 4] & 0xC0) == 0x80 &&
  135. (utf8[i + 5] & 0xC0) == 0x80) {
  136. if (pass == 2) {
  137. unichar = utf8[i] & 0x01;
  138. unichar <<= 6;
  139. unichar |= utf8[i + 1] & 0x3F;
  140. unichar <<= 6;
  141. unichar |= utf8[i + 2] & 0x3F;
  142. unichar <<= 6;
  143. unichar |= utf8[i + 3] & 0x3F;
  144. unichar <<= 6;
  145. unichar |= utf8[i + 4] & 0x3F;
  146. unichar <<= 6;
  147. unichar |= utf8[i + 5] & 0x3F;
  148. uni[unilen] = unichar;
  149. }
  150. i += 6;
  151. } else {
  152. if (SOMBOK_UTF8_CHECK_MALFORMED <= check) {
  153. errno = EPERM;
  154. return NULL;
  155. }
  156. if (pass == 2)
  157. uni[unilen] = utf8[i];
  158. i++;
  159. }
  160. }
  161. if (pass == 1) {
  162. if (uni == NULL) {
  163. if ((uni = malloc(sizeof(unichar_t) * (unilen + 1))) == NULL)
  164. return NULL;
  165. uni[unilen] = 0;
  166. } else if (maxchars == 0) {
  167. if ((uni = realloc(uni,
  168. sizeof(unichar_t) * (unilen + 1))) == NULL)
  169. return NULL;
  170. uni[unilen] = 0;
  171. } else if (unilen < maxchars)
  172. uni[unilen] = 0;
  173. unistr->str = uni;
  174. unistr->len = unilen;
  175. }
  176. }
  177. return unistr;
  178. }
  179. /** Encode Unicode string to UTF-8 string
  180. *
  181. * @param[out] utf8 string buffer, may be NULL.
  182. * @param[out] utf8lenp pointer to length of buffer, may be NULL.
  183. * @param[in] maxbytes maximum number of bytes to be encoded. 0 means infinite
  184. * @param[in] unistr source Unicode string, must not be NULL.
  185. *
  186. * @returns string buffer.
  187. * If utf8 was NULL or maxbytes was 0 (infinite), required buffer will be
  188. * (re-)allocated.
  189. * If error occurred, NULL is returned and errno is set.
  190. *
  191. * @note utf8 must not point to static memory.
  192. */
  193. char *sombok_encode_utf8(char *utf8, size_t *utf8lenp, size_t maxbytes,
  194. unistr_t *unistr)
  195. {
  196. size_t i, utf8len, unilen;
  197. unichar_t unichar;
  198. int pass;
  199. if (unistr == NULL) {
  200. errno = EINVAL;
  201. return NULL;
  202. }
  203. if (unistr->str == NULL)
  204. unilen = 0;
  205. else
  206. unilen = unistr->len;
  207. for (pass = 1; pass <= 2; pass++) {
  208. for (i = 0, utf8len = 0; i < unilen; i++) {
  209. unichar = unistr->str[i];
  210. if (unichar == (unichar & 0x007F)) {
  211. if (maxbytes != 0 && maxbytes < utf8len + 1)
  212. break;
  213. if (pass == 2)
  214. utf8[utf8len] = (char) unichar;
  215. utf8len++;
  216. } else if (unichar == (unichar & 0x07FF)) {
  217. if (maxbytes != 0 && maxbytes < utf8len + 2)
  218. break;
  219. if (pass == 2) {
  220. utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
  221. unichar >>= 6;
  222. utf8[utf8len] = (char) (unichar & 0x1F) | 0xC0;
  223. }
  224. utf8len += 2;
  225. } else if (unichar == (unichar & 0x00FFFF)) {
  226. if (maxbytes != 0 && maxbytes < utf8len + 3)
  227. break;
  228. if (pass == 2) {
  229. utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
  230. unichar >>= 6;
  231. utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
  232. unichar >>= 6;
  233. utf8[utf8len] = (char) (unichar & 0x0F) | 0xE0;
  234. }
  235. utf8len += 3;
  236. } else if (unichar == (unichar & 0x001FFFFF)) {
  237. if (maxbytes != 0 && maxbytes < utf8len + 4)
  238. break;
  239. if (pass == 2) {
  240. utf8[utf8len + 3] = (char) (unichar & 0x3F) | 0x80;
  241. unichar >>= 6;
  242. utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
  243. unichar >>= 6;
  244. utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
  245. unichar >>= 6;
  246. utf8[utf8len] = (char) (unichar & 0x07) | 0xF0;
  247. }
  248. utf8len += 4;
  249. } else if (unichar == (unichar & 0x03FFFFFF)) {
  250. if (maxbytes != 0 && maxbytes < utf8len + 5)
  251. break;
  252. if (pass == 2) {
  253. utf8[utf8len + 4] = (char) (unichar & 0x3F) | 0x80;
  254. unichar >>= 6;
  255. utf8[utf8len + 3] = (char) (unichar & 0x3F) | 0x80;
  256. unichar >>= 6;
  257. utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
  258. unichar >>= 6;
  259. utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
  260. unichar >>= 6;
  261. utf8[utf8len] = (char) (unichar & 0x03) | 0xF8;
  262. }
  263. utf8len += 5;
  264. } else if (unichar == (unichar & 0x7FFFFFFF)) {
  265. if (maxbytes != 0 && maxbytes < utf8len + 6)
  266. break;
  267. if (pass == 2) {
  268. utf8[utf8len + 5] = (char) (unichar & 0x3F) | 0x80;
  269. unichar >>= 6;
  270. utf8[utf8len + 4] = (char) (unichar & 0x3F) | 0x80;
  271. unichar >>= 6;
  272. utf8[utf8len + 3] = (char) (unichar & 0x3F) | 0x80;
  273. unichar >>= 6;
  274. utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
  275. unichar >>= 6;
  276. utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
  277. unichar >>= 6;
  278. utf8[utf8len] = (char) (unichar & 0x01) | 0xFC;
  279. }
  280. utf8len += 6;
  281. } else {
  282. errno = EPERM;
  283. return NULL;
  284. }
  285. }
  286. if (pass == 1) {
  287. if (utf8 == NULL) {
  288. if ((utf8 = malloc(sizeof(char) * (utf8len + 1))) == NULL)
  289. return NULL;
  290. utf8[utf8len] = '\0';
  291. } else if (maxbytes == 0) {
  292. if ((utf8 = realloc(utf8,
  293. sizeof(char) * (utf8len + 1))) == NULL)
  294. return NULL;
  295. utf8[utf8len] = '\0';
  296. } else if (utf8len < maxbytes)
  297. utf8[utf8len] = '\0';
  298. if (utf8lenp != NULL)
  299. *utf8lenp = utf8len;
  300. }
  301. }
  302. return utf8;
  303. }