/xmlfy-1.5.5/xmlfy/utf_io/utf_lib.c

# · C · 834 lines · 611 code · 89 blank · 134 comment · 349 complexity · 5d7eded3c26f1f5d058b3d964dbc004a MD5 · raw file

  1. /*
  2. * BSD License for xmlfy
  3. * Copyright (c) 2008-2011, Arthur Gouros
  4. * All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions are met:
  8. *
  9. * - Redistributions of source code must retain the above copyright notice,
  10. * this list of conditions and the following disclaimer.
  11. * - Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. * - Neither the name of Arthur Gouros nor the names of its contributors
  15. * may be used to endorse or promote products derived from this software
  16. * without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  22. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28. * POSSIBILITY OF SUCH DAMAGE.
  29. */
  30. /**************************************************************************
  31. * xmlfy - Convert to XML on the fly. *
  32. * Original Author: Arthur Gouros. *
  33. * *
  34. * Please consult the documentation for further information. *
  35. **************************************************************************
  36. */
  37. #include "utf_io.h"
  /* Function prototypes, grouped by output encoding. */

  /* UTF-8 writers */
  void strcpy_utf8(int level, const unsigned char *s_src);
  void strncpy_utf8(const unsigned char *s_src, long n);
  void strcat_utf8(const unsigned char *s_src);
  void strncat_utf8(const unsigned char *s_src, long n);

  /* UTF-16 / UTF-16BE writers */
  void strcpy_utf16be(int level, const unsigned char *s_src);
  void strncpy_utf16be(const unsigned char *s_src, long n);
  void strcat_utf16be(const unsigned char *s_src);
  void strncat_utf16be(const unsigned char *s_src, long n);

  /* UTF-16LE writers */
  void strcpy_utf16le(int level, const unsigned char *s_src);
  void strncpy_utf16le(const unsigned char *s_src, long n);
  void strcat_utf16le(const unsigned char *s_src);
  void strncat_utf16le(const unsigned char *s_src, long n);

  /* UTF-32 / UTF-32BE writers */
  void strcpy_utf32be(int level, const unsigned char *s_src);
  void strncpy_utf32be(const unsigned char *s_src, long n);
  void strcat_utf32be(const unsigned char *s_src);
  void strncat_utf32be(const unsigned char *s_src, long n);

  /* UTF-32LE writers */
  void strcpy_utf32le(int level, const unsigned char *s_src);
  void strncpy_utf32le(const unsigned char *s_src, long n);
  void strcat_utf32le(const unsigned char *s_src);
  void strncat_utf32le(const unsigned char *s_src, long n);

  /* Encoding-aware comparison and classification helpers */
  int utf_xstrncmp(const unsigned char *utf_s, long utf_s_length, int utf_s_encoding, const unsigned char *ascii_s, long n);
  int utf_char_equals(long i, unsigned char byte_c);
  int utf_char_equals_2(long i, unsigned char byte_c, const unsigned char *utf_s, long utf_s_length, int utf_s_encoding);
  int utf_char_equals_level1(long i, unsigned char byte_c);
  int utf_isspace(long i);
  int utf_iscomposite(long i);
  long utf_get_first_word_length(const unsigned char *utf_s, long utf_s_length, int utf_s_encoding);
  66. /*****
  67. * UTF-8 routines
  68. *****/
  69. void strcpy_utf8(int level, const unsigned char *s_src)
  70. {
  71. /* Write s_src to start of output_record.line array with padding */
  72. long i;
  73. output_record.length = 0;
  74. /* pad */
  75. while (output_record.length < level * 2)
  76. output_record.line[output_record.length++] = ' ';
  77. /* copy */
  78. for (i = 0 ; s_src[i] != '\0' ; i++)
  79. output_record.line[output_record.length++] = s_src[i];
  80. }
  81. void strncpy_utf8(const unsigned char *s_src, long n)
  82. {
  83. /* Write at most n characters of s_src to start of output_record.line array */
  84. for (output_record.length = 0 ; output_record.length < n && s_src[output_record.length] != '\0' ; output_record.length++)
  85. output_record.line[output_record.length] = s_src[output_record.length];
  86. }
  87. void strcat_utf8(const unsigned char *s_src)
  88. {
  89. /* Concatenate s_src to output_record.line array
  90. * starting from output_record.length. */
  91. long i;
  92. for (i = 0 ; s_src[i] != '\0' ; i++)
  93. output_record.line[output_record.length++] = s_src[i];
  94. }
  95. void strncat_utf8(const unsigned char *s_src, long n)
  96. {
  97. /* Concatenate at most n characters of s_src to output_record.line array
  98. * starting from output_record.length. */
  99. long i;
  100. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  101. output_record.line[output_record.length++] = s_src[i];
  102. }
  103. /*****
  104. * UTF-16 and UTF-16BE routines
  105. *****/
  106. void strcpy_utf16be(int level, const unsigned char *s_src)
  107. {
  108. /* Write s_src to start of output_record.line array with padding
  109. * whilst converting to UTF-16BE format. */
  110. long i;
  111. output_record.length = 0;
  112. /* pad */
  113. while (output_record.length < level * 4)
  114. {
  115. output_record.line[output_record.length++] = 0;
  116. output_record.line[output_record.length++] = ' ';
  117. }
  118. /* copy */
  119. for (i = 0 ; s_src[i] != '\0' ; i++)
  120. {
  121. output_record.line[output_record.length++] = 0;
  122. output_record.line[output_record.length++] = s_src[i];
  123. }
  124. }
  125. void strncpy_utf16be(const unsigned char *s_src, long n)
  126. {
  127. /* Write at most n characters of s_src to start of output_record.line array
  128. * whilst converting to UTF-16BE format. */
  129. long i;
  130. output_record.length = 0;
  131. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  132. {
  133. output_record.line[output_record.length++] = 0;
  134. output_record.line[output_record.length++] = s_src[i];
  135. }
  136. }
  137. void strcat_utf16be(const unsigned char *s_src)
  138. {
  139. /* Concatenate s_src to output_record.line array
  140. * starting from output_record.length,
  141. * whilst converting to UTF-16BE format. */
  142. long i;
  143. /* mem checks */
  144. if (output_record.length > memseg_output_record)
  145. memadd_output_record(output_record.length);
  146. for (i = 0 ; s_src[i] != '\0' ; i++)
  147. {
  148. output_record.line[output_record.length++] = 0;
  149. output_record.line[output_record.length++] = s_src[i];
  150. }
  151. }
  152. void strncat_utf16be(const unsigned char *s_src, long n)
  153. {
  154. /* Concatenate at most n characters of s_src to output_record.line array
  155. * starting from output_record.length,
  156. * whilst converting to UTF-16BE format. */
  157. long i;
  158. /* mem checks */
  159. if (output_record.length > memseg_output_record)
  160. memadd_output_record(output_record.length);
  161. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  162. {
  163. output_record.line[output_record.length++] = 0;
  164. output_record.line[output_record.length++] = s_src[i];
  165. }
  166. }
  167. /*****
  168. * UTF-16LE routines
  169. *****/
  170. void strcpy_utf16le(int level, const unsigned char *s_src)
  171. {
  172. /* Write s_src to start of output_record.line array with padding
  173. * whilst converting to UTF-16LE format. */
  174. long i;
  175. output_record.length = 0;
  176. /* pad */
  177. while (output_record.length < level * 4)
  178. {
  179. output_record.line[output_record.length++] = ' ';
  180. output_record.line[output_record.length++] = 0;
  181. }
  182. /* copy */
  183. for (i = 0 ; s_src[i] != '\0' ; i++)
  184. {
  185. output_record.line[output_record.length++] = s_src[i];
  186. output_record.line[output_record.length++] = 0;
  187. }
  188. }
  189. void strncpy_utf16le(const unsigned char *s_src, long n)
  190. {
  191. /* Write at most n characters of s_src to start of output_record.line array
  192. * whilst converting to UTF-16LE format. */
  193. long i;
  194. output_record.length = 0;
  195. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  196. {
  197. output_record.line[output_record.length++] = s_src[i];
  198. output_record.line[output_record.length++] = 0;
  199. }
  200. }
  201. void strcat_utf16le(const unsigned char *s_src)
  202. {
  203. /* Concatenate s_src to output_record.line array
  204. * starting from output_record.length,
  205. * whilst converting to UTF-16LE format. */
  206. long i;
  207. /* mem checks */
  208. if (output_record.length > memseg_output_record)
  209. memadd_output_record(output_record.length);
  210. for (i = 0 ; s_src[i] != '\0' ; i++)
  211. {
  212. output_record.line[output_record.length++] = s_src[i];
  213. output_record.line[output_record.length++] = 0;
  214. }
  215. }
  216. void strncat_utf16le(const unsigned char *s_src, long n)
  217. {
  218. /* Concatenate at most n characters of s_src to output_record.line array
  219. * starting from output_record.length,
  220. * whilst converting to UTF-16LE format. */
  221. long i;
  222. /* mem checks */
  223. if (output_record.length > memseg_output_record)
  224. memadd_output_record(output_record.length);
  225. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  226. {
  227. output_record.line[output_record.length++] = s_src[i];
  228. output_record.line[output_record.length++] = 0;
  229. }
  230. }
  231. /*****
  232. * UTF-32 and UTF-32BE routines
  233. *****/
  234. void strcpy_utf32be(int level, const unsigned char *s_src)
  235. {
  236. /* Write s_src to start of output_record.line array with padding
  237. * whilst converting to UTF-32BE format. */
  238. long i;
  239. output_record.length = 0;
  240. /* pad */
  241. while (output_record.length < level * 8)
  242. {
  243. output_record.line[output_record.length++] = 0;
  244. output_record.line[output_record.length++] = 0;
  245. output_record.line[output_record.length++] = 0;
  246. output_record.line[output_record.length++] = ' ';
  247. }
  248. /* copy */
  249. for (i = 0 ; s_src[i] != '\0' ; i++)
  250. {
  251. output_record.line[output_record.length++] = 0;
  252. output_record.line[output_record.length++] = 0;
  253. output_record.line[output_record.length++] = 0;
  254. output_record.line[output_record.length++] = s_src[i];
  255. }
  256. }
  257. void strncpy_utf32be(const unsigned char *s_src, long n)
  258. {
  259. /* Write at most n characters of s_src to start of output_record.line array
  260. * whilst converting to UTF-32BE format. */
  261. long i;
  262. output_record.length = 0;
  263. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  264. {
  265. output_record.line[output_record.length++] = 0;
  266. output_record.line[output_record.length++] = 0;
  267. output_record.line[output_record.length++] = 0;
  268. output_record.line[output_record.length++] = s_src[i];
  269. }
  270. }
  271. void strcat_utf32be(const unsigned char *s_src)
  272. {
  273. /* Concatenate s_src to output_record.line array
  274. * starting from output_record.length,
  275. * whilst converting to UTF-32BE format. */
  276. long i;
  277. /* mem checks */
  278. if (output_record.length > memseg_output_record)
  279. memadd_output_record(output_record.length);
  280. for (i = 0 ; s_src[i] != '\0' ; i++)
  281. {
  282. output_record.line[output_record.length++] = 0;
  283. output_record.line[output_record.length++] = 0;
  284. output_record.line[output_record.length++] = 0;
  285. output_record.line[output_record.length++] = s_src[i];
  286. }
  287. }
  288. void strncat_utf32be(const unsigned char *s_src, long n)
  289. {
  290. /* Concatenate at most n characters of s_src to output_record.line array
  291. * starting from output_record.length,
  292. * whilst converting to UTF-32BE format. */
  293. long i;
  294. /* mem checks */
  295. if (output_record.length > memseg_output_record)
  296. memadd_output_record(output_record.length);
  297. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  298. {
  299. output_record.line[output_record.length++] = 0;
  300. output_record.line[output_record.length++] = 0;
  301. output_record.line[output_record.length++] = 0;
  302. output_record.line[output_record.length++] = s_src[i];
  303. }
  304. }
  305. /*****
  306. * UTF-32LE routines
  307. *****/
  308. void strcpy_utf32le(int level, const unsigned char *s_src)
  309. {
  310. /* Write s_src to start of output_record.line array with padding
  311. * whilst converting to UTF-32LE format. */
  312. long i;
  313. output_record.length = 0;
  314. /* pad */
  315. while (output_record.length < level * 8)
  316. {
  317. output_record.line[output_record.length++] = ' ';
  318. output_record.line[output_record.length++] = 0;
  319. output_record.line[output_record.length++] = 0;
  320. output_record.line[output_record.length++] = 0;
  321. }
  322. /* copy */
  323. for (i = 0 ; s_src[i] != '\0' ; i++)
  324. {
  325. output_record.line[output_record.length++] = s_src[i];
  326. output_record.line[output_record.length++] = 0;
  327. output_record.line[output_record.length++] = 0;
  328. output_record.line[output_record.length++] = 0;
  329. }
  330. }
  331. void strncpy_utf32le(const unsigned char *s_src, long n)
  332. {
  333. /* Write at most n characters of s_src to start of output_record.line array
  334. * whilst converting to UTF-32LE format. */
  335. long i;
  336. output_record.length = 0;
  337. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  338. {
  339. output_record.line[output_record.length++] = s_src[i];
  340. output_record.line[output_record.length++] = 0;
  341. output_record.line[output_record.length++] = 0;
  342. output_record.line[output_record.length++] = 0;
  343. }
  344. }
  345. void strcat_utf32le(const unsigned char *s_src)
  346. {
  347. /* Concatenate s_src to output_record.line array
  348. * starting from output_record.length,
  349. * whilst converting to UTF-32LE format. */
  350. long i;
  351. /* mem checks */
  352. if (output_record.length > memseg_output_record)
  353. memadd_output_record(output_record.length);
  354. for (i = 0 ; s_src[i] != '\0' ; i++)
  355. {
  356. output_record.line[output_record.length++] = s_src[i];
  357. output_record.line[output_record.length++] = 0;
  358. output_record.line[output_record.length++] = 0;
  359. output_record.line[output_record.length++] = 0;
  360. }
  361. }
  362. void strncat_utf32le(const unsigned char *s_src, long n)
  363. {
  364. /* Concatenate at most n characters of s_src to output_record.line array
  365. * starting from output_record.length,
  366. * whilst converting to UTF-32LE format. */
  367. long i;
  368. /* mem checks */
  369. if (output_record.length > memseg_output_record)
  370. memadd_output_record(output_record.length);
  371. for (i = 0 ; i < n && s_src[i] != '\0' ; i++)
  372. {
  373. output_record.line[output_record.length++] = s_src[i];
  374. output_record.line[output_record.length++] = 0;
  375. output_record.line[output_record.length++] = 0;
  376. output_record.line[output_record.length++] = 0;
  377. }
  378. }
  379. int utf_xstrncmp(const unsigned char *utf_s, long utf_s_length, int utf_s_encoding, const unsigned char *ascii_s, long n)
  380. {
  381. /* Check if utf_s == ascii_s for up to n ascii chars and ONLY return 0 (match) or 1 (nomatch)
  382. * whilst being encoding sensitive. */
  383. long i, j, n_match;
  384. int ret = 1;
  385. n_match = 0;
  386. i = 0;
  387. for (j = 0 ; j < n && ascii_s[j] != '\0' ; j++)
  388. {
  389. if (utf_s_encoding == UTF_8)
  390. {
  391. if (i < utf_s_length
  392. && utf_s[i] == ascii_s[j])
  393. {
  394. i++;
  395. n_match++;
  396. }
  397. else
  398. break; /* for/j */
  399. }
  400. else if (utf_s_encoding == UTF_16BE)
  401. {
  402. if (i + 1 < utf_s_length
  403. && utf_s[i] == 0
  404. && utf_s[i + 1] == ascii_s[j])
  405. {
  406. i += 2;
  407. n_match++;
  408. }
  409. else
  410. break; /* for/j */
  411. }
  412. else if (utf_s_encoding == UTF_16LE)
  413. {
  414. if (i + 1 < utf_s_length
  415. && utf_s[i] == ascii_s[j]
  416. && utf_s[i + 1] == 0)
  417. {
  418. i += 2;
  419. n_match++;
  420. }
  421. else
  422. break; /* for/j */
  423. }
  424. else if (utf_s_encoding == UTF_32BE)
  425. {
  426. if (i + 3 < utf_s_length
  427. && utf_s[i] == 0
  428. && utf_s[i + 1] == 0
  429. && utf_s[i + 2] == 0
  430. && utf_s[i + 3] == ascii_s[j])
  431. {
  432. i += 4;
  433. n_match++;
  434. }
  435. else
  436. break; /* for/j */
  437. }
  438. else if (utf_s_encoding == UTF_32LE)
  439. {
  440. if (i + 3 < utf_s_length
  441. && utf_s[i] == ascii_s[j]
  442. && utf_s[i + 1] == 0
  443. && utf_s[i + 2] == 0
  444. && utf_s[i + 3] == 0)
  445. {
  446. i += 4;
  447. n_match++;
  448. }
  449. else
  450. break; /* for/j */
  451. }
  452. } /* for/j */
  453. if (n == n_match)
  454. ret = 0;
  455. return (ret);
  456. }
  457. int utf_char_equals(long i, unsigned char byte_c)
  458. {
  459. /* check for matching char in input.record.line[i] array
  460. * whilst being encoding sensitive. */
  461. int ret = FALSE;
  462. if (input_record.encoding == UTF_8)
  463. {
  464. if (input_record.line[i] == byte_c)
  465. ret = TRUE;
  466. }
  467. else if (input_record.encoding == UTF_16BE)
  468. {
  469. if (i + 1 < input_record.length
  470. && input_record.line[i] == 0
  471. && input_record.line[i + 1] == byte_c)
  472. {
  473. ret = TRUE;
  474. }
  475. }
  476. else if (input_record.encoding == UTF_16LE)
  477. {
  478. if (i + 1 < input_record.length
  479. && input_record.line[i] == byte_c
  480. && input_record.line[i + 1] == 0)
  481. {
  482. ret = TRUE;
  483. }
  484. }
  485. else if (input_record.encoding == UTF_32BE)
  486. {
  487. if (i + 3 < input_record.length
  488. && input_record.line[i] == 0
  489. && input_record.line[i + 1] == 0
  490. && input_record.line[i + 2] == 0
  491. && input_record.line[i + 3] == byte_c)
  492. {
  493. ret = TRUE;
  494. }
  495. }
  496. else if (input_record.encoding == UTF_32LE)
  497. {
  498. if (i + 3 < input_record.length
  499. && input_record.line[i] == byte_c
  500. && input_record.line[i + 1] == 0
  501. && input_record.line[i + 2] == 0
  502. && input_record.line[i + 3] == 0)
  503. {
  504. ret = TRUE;
  505. }
  506. }
  507. return(ret);
  508. }
  509. int utf_char_equals_2(long i, unsigned char byte_c, const unsigned char *utf_s, long utf_s_length, int utf_s_encoding)
  510. {
  511. /* check for matching char in supplied array
  512. * whilst being encoding sensitive. */
  513. int ret = FALSE;
  514. if (utf_s_encoding == UTF_8)
  515. {
  516. if (utf_s[i] == byte_c)
  517. ret = TRUE;
  518. }
  519. else if (utf_s_encoding == UTF_16BE)
  520. {
  521. if (i + 1 < utf_s_length
  522. && utf_s[i] == 0
  523. && utf_s[i + 1] == byte_c)
  524. {
  525. ret = TRUE;
  526. }
  527. }
  528. else if (utf_s_encoding == UTF_16LE)
  529. {
  530. if (i + 1 < utf_s_length
  531. && utf_s[i] == byte_c
  532. && utf_s[i + 1] == 0)
  533. {
  534. ret = TRUE;
  535. }
  536. }
  537. else if (utf_s_encoding == UTF_32BE)
  538. {
  539. if (i + 3 < utf_s_length
  540. && utf_s[i] == 0
  541. && utf_s[i + 1] == 0
  542. && utf_s[i + 2] == 0
  543. && utf_s[i + 3] == byte_c)
  544. {
  545. ret = TRUE;
  546. }
  547. }
  548. else if (utf_s_encoding == UTF_32LE)
  549. {
  550. if (i + 3 < utf_s_length
  551. && utf_s[i] == byte_c
  552. && utf_s[i + 1] == 0
  553. && utf_s[i + 2] == 0
  554. && utf_s[i + 3] == 0)
  555. {
  556. ret = TRUE;
  557. }
  558. }
  559. return(ret);
  560. }
  561. int utf_char_equals_level1(long i, unsigned char byte_c)
  562. {
  563. /* check for default level 1 matching char in input.record.line[i] array.
  564. * NOTE: Unlike the utf_char_equals() functions above, level 1 record
  565. * default delimiter matching has a growing input_record.length,
  566. * so wide UTF encoding must be matched starting at the end
  567. * and working backwards */
  568. int ret = FALSE;
  569. if (input_record.encoding == UTF_8)
  570. {
  571. if (input_record.line[i] == byte_c)
  572. ret = TRUE;
  573. }
  574. else if (input_record.encoding == UTF_16BE)
  575. {
  576. if (i % 2
  577. && i > 0
  578. && input_record.line[i - 1] == 0
  579. && input_record.line[i] == byte_c)
  580. {
  581. ret = TRUE;
  582. }
  583. }
  584. else if (input_record.encoding == UTF_16LE)
  585. {
  586. if (i % 2
  587. && i > 0
  588. && input_record.line[i - 1] == byte_c
  589. && input_record.line[i] == 0)
  590. {
  591. ret = TRUE;
  592. }
  593. }
  594. else if (input_record.encoding == UTF_32BE)
  595. {
  596. if (i % 4 == 3 /* input array starts at 0 i.e. [3][2][1][0] */
  597. && i > 2
  598. && input_record.line[i - 3] == 0
  599. && input_record.line[i - 2] == 0
  600. && input_record.line[i - 1] == 0
  601. && input_record.line[i] == byte_c)
  602. {
  603. ret = TRUE;
  604. }
  605. }
  606. else if (input_record.encoding == UTF_32LE)
  607. {
  608. if (i % 4 == 3 /* input array starts at 0 i.e. [3][2][1][0] */
  609. && i > 2
  610. && input_record.line[i - 3] == byte_c
  611. && input_record.line[i - 2] == 0
  612. && input_record.line[i - 1] == 0
  613. && input_record.line[i] == 0)
  614. {
  615. ret = TRUE;
  616. }
  617. }
  618. return(ret);
  619. }
  620. int utf_isspace(long i)
  621. {
  622. /* check for whitespace
  623. * whilst being encoding sensitive. */
  624. int ret = FALSE;
  625. if (input_record.encoding == UTF_8)
  626. {
  627. if(isspace(input_record.line[i]))
  628. ret = TRUE;
  629. }
  630. else if (input_record.encoding == UTF_16BE)
  631. {
  632. if (i + 1 < input_record.length
  633. && isspace(input_record.line[i + 1])
  634. && input_record.line[i] == 0)
  635. {
  636. ret = TRUE;
  637. }
  638. }
  639. else if (input_record.encoding == UTF_16LE)
  640. {
  641. if (i + 1 < input_record.length
  642. && isspace(input_record.line[i])
  643. && input_record.line[i + 1] == 0)
  644. {
  645. ret = TRUE;
  646. }
  647. }
  648. else if (input_record.encoding == UTF_32BE)
  649. {
  650. if (i + 3 < input_record.length
  651. && isspace(input_record.line[i + 3])
  652. && input_record.line[i + 2] == 0
  653. && input_record.line[i + 1] == 0
  654. && input_record.line[i] == 0)
  655. {
  656. ret = TRUE;
  657. }
  658. }
  659. else if (input_record.encoding == UTF_32LE)
  660. {
  661. if (i + 3 < input_record.length
  662. && isspace(input_record.line[i])
  663. && input_record.line[i + 1] == 0
  664. && input_record.line[i + 2] == 0
  665. && input_record.line[i + 3] == 0)
  666. {
  667. ret = TRUE;
  668. }
  669. }
  670. return(ret);
  671. }
  672. int utf_iscomposite(long i)
  673. {
  674. /* check for composite char in input.record.line[i] array
  675. * E.g. Multi byte UTF-8 sequences.
  676. * E.g. Surrogate pairing in UTF-16. */
  677. int ret = FALSE;
  678. if (input_record.encoding == UTF_8)
  679. {
  680. if (input_record.line[i] > 0x7f && input_record.line[i] < 0xc0)
  681. ret = TRUE;
  682. }
  683. else if (input_record.encoding == UTF_16BE)
  684. {
  685. if (input_record.line[i] > 0xd7 && input_record.line[i] < 0xdc)
  686. ret = TRUE;
  687. }
  688. else if (input_record.encoding == UTF_16LE)
  689. {
  690. if (i + 1 < input_record.length
  691. && input_record.line[i + 1] > 0xd7 && input_record.line[i + 1] < 0xdc)
  692. ret = TRUE;
  693. }
  694. return(ret);
  695. }
  696. long utf_get_first_word_length(const unsigned char *utf_s, long utf_s_length, int utf_s_encoding)
  697. {
  698. /* Get the length of the first word in string utf_s */
  699. long i;
  700. /* Initialise */
  701. i = utf_s_length;
  702. if (utf_s_encoding == UTF_8)
  703. {
  704. for (i = 0 ; i < utf_s_length ; i++)
  705. if (isspace(utf_s[i]))
  706. break; /* for/i */
  707. }
  708. else if (utf_s_encoding == UTF_16BE)
  709. {
  710. for (i = 0 ; i < utf_s_length ; i += 2)
  711. if (i + 1 < utf_s_length
  712. && utf_s[i] == 0
  713. && isspace(utf_s[i + 1]))
  714. break; /* for/i */
  715. }
  716. else if (utf_s_encoding == UTF_16LE)
  717. {
  718. for (i = 0 ; i < utf_s_length ; i += 2)
  719. if (i + 1 < utf_s_length
  720. && isspace(utf_s[i])
  721. && utf_s[i + 1] == 0)
  722. break; /* for/i */
  723. }
  724. else if (utf_s_encoding == UTF_32BE)
  725. {
  726. for (i = 0 ; i < utf_s_length ; i += 4)
  727. if (i + 3 < utf_s_length
  728. && utf_s[i] == 0
  729. && utf_s[i + 1] == 0
  730. && utf_s[i + 2] == 0
  731. && isspace(utf_s[i + 3]))
  732. break; /* for/i */
  733. }
  734. else if (utf_s_encoding == UTF_32LE)
  735. {
  736. for (i = 0 ; i < utf_s_length ; i += 4)
  737. if (i + 3 < utf_s_length
  738. && isspace(utf_s[i])
  739. && utf_s[i + 1] == 0
  740. && utf_s[i + 2] == 0
  741. && utf_s[i + 3] == 0)
  742. break; /* for/i */
  743. }
  744. return(i);
  745. }