/src/linebreak/linebreak.c

http://ftk.googlecode.com/ · C · 734 lines · 482 code · 39 blank · 213 comment · 67 complexity · 2c9982e70e464692f69c1c214521dace MD5 · raw file

  1. /* vim: set tabstop=4 shiftwidth=4: */
  2. /*
  3. * Line breaking in a Unicode sequence. Designed to be used in a
  4. * generic text renderer.
  5. *
  6. * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
  7. *
  8. * This software is provided 'as-is', without any express or implied
  9. * warranty. In no event will the author be held liable for any damages
  10. * arising from the use of this software.
  11. *
  12. * Permission is granted to anyone to use this software for any purpose,
  13. * including commercial applications, and to alter it and redistribute
  14. * it freely, subject to the following restrictions:
  15. *
  16. * 1. The origin of this software must not be misrepresented; you must
  17. * not claim that you wrote the original software. If you use this
  18. * software in a product, an acknowledgement in the product
  19. * documentation would be appreciated but is not required.
  20. * 2. Altered source versions must be plainly marked as such, and must
  21. * not be misrepresented as being the original software.
  22. * 3. This notice may not be removed or altered from any source
  23. * distribution.
  24. *
  25. * The main reference is Unicode Standard Annex 14 (UAX #14):
  26. * <URL:http://www.unicode.org/reports/tr14/>
  27. *
  28. * When this library was designed, this annex was at Revision 19, for
  29. * Unicode 5.0.0:
  30. * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
  31. *
  32. * This library has been updated according to Revision 24, for
  33. * Unicode 5.2.0:
  34. * <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
  35. *
  36. * The Unicode Terms of Use are available at
  37. * <URL:http://www.unicode.org/copyright.html>
  38. */
  39. /**
  40. * @file linebreak.c
  41. *
  42. * Implementation of the line breaking algorithm as described in Unicode
  43. * Standard Annex 14.
  44. *
  45. * @version 2.0, 2010/01/03
  46. * @author Wu Yongwei
  47. */
  48. #include <assert.h>
  49. #include <stddef.h>
  50. #include <string.h>
  51. #include "linebreak.h"
  52. #include "linebreakdef.h"
/**
 * Size of the second-level index to the line breaking properties,
 * i.e. the number of entries in the lb_prop_index array.
 */
#define LINEBREAK_INDEX_SIZE 40

/**
 * Version number of the library, exported as a run-time constant.
 * Its value is taken from the compile-time macro LINEBREAK_VERSION.
 */
const int linebreak_version = LINEBREAK_VERSION;
  61. /**
  62. * Enumeration of break actions. They are used in the break action
  63. * pair table below.
  64. */
  65. enum BreakAction
  66. {
  67. DIR_BRK, /**< Direct break opportunity */
  68. IND_BRK, /**< Indirect break opportunity */
  69. CMI_BRK, /**< Indirect break opportunity for combining marks */
  70. CMP_BRK, /**< Prohibited break for combining marks */
  71. PRH_BRK /**< Prohibited break */
  72. };
  73. /**
  74. * Break action pair table. This is a direct mapping of Table 2 of
  75. * Unicode Standard Annex 14, Revision 24.
  76. */
  77. static enum BreakAction baTable[LBP_JT][LBP_JT] = {
  78. { /* OP */
  79. PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  80. PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  81. PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
  82. PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
  83. { /* CL */
  84. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
  85. PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  86. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  87. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  88. { /* CP */
  89. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
  90. PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
  91. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  92. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  93. { /* QU */
  94. PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  95. PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
  96. IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
  97. PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
  98. { /* GL */
  99. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  100. PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
  101. IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
  102. PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
  103. { /* NS */
  104. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  105. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  106. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  107. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  108. { /* EX */
  109. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  110. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  111. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  112. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  113. { /* SY */
  114. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  115. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
  116. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  117. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  118. { /* IS */
  119. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  120. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
  121. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  122. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  123. { /* PR */
  124. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  125. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
  126. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  127. PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
  128. { /* PO */
  129. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  130. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
  131. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  132. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  133. { /* NU */
  134. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  135. PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
  136. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  137. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  138. { /* AL */
  139. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  140. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
  141. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  142. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  143. { /* ID */
  144. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  145. PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  146. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  147. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  148. { /* IN */
  149. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  150. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  151. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  152. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  153. { /* HY */
  154. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
  155. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
  156. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  157. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  158. { /* BA */
  159. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
  160. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  161. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  162. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  163. { /* BB */
  164. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  165. PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
  166. IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
  167. PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
  168. { /* B2 */
  169. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  170. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  171. DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
  172. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  173. { /* ZW */
  174. DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  175. DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  176. DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
  177. DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  178. { /* CM */
  179. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  180. PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
  181. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  182. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  183. { /* WJ */
  184. IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  185. PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
  186. IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
  187. PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
  188. { /* H2 */
  189. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  190. PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  191. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  192. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
  193. { /* H3 */
  194. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  195. PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  196. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  197. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
  198. { /* JL */
  199. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  200. PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  201. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  202. PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
  203. { /* JV */
  204. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  205. PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  206. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  207. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
  208. { /* JT */
  209. DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
  210. PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  211. IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  212. PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
  213. };
/**
 * Entry of the second-level index to the line breaking properties.
 *
 * An entry pairs an upper code point bound with the position in a line
 * breaking properties array at which the search for any code point up
 * to that bound should start.
 */
struct LineBreakPropertiesIndex
{
    utf32_t end;                        /**< Last code point served by this entry */
    struct LineBreakProperties *lbp;    /**< Where in the properties array to start searching */
};
/**
 * Second-level index to the line breaking properties.
 *
 * Only the first entry is initialized statically: it serves every code
 * point (its end is 0xFFFFFFFF) and points to the beginning of
 * lb_prop_default, so that look-ups still work, by scanning from the
 * start of that array, when init_linebreak has not been called.
 * init_linebreak fills in all LINEBREAK_INDEX_SIZE entries.
 */
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
{
    { 0xFFFFFFFF, lb_prop_default }
};
  229. /**
  230. * Initializes the second-level index to the line breaking properties.
  231. * If it is not called, the performance of #get_char_lb_class_lang (and
  232. * thus the main functionality) can be pretty bad, especially for big
  233. * code points like those of Chinese.
  234. */
  235. void init_linebreak(void)
  236. {
  237. size_t i;
  238. size_t iPropDefault;
  239. size_t len;
  240. size_t step;
  241. len = 0;
  242. while (lb_prop_default[len].prop != LBP_Undefined)
  243. ++len;
  244. step = len / LINEBREAK_INDEX_SIZE;
  245. iPropDefault = 0;
  246. for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
  247. {
  248. lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
  249. iPropDefault += step;
  250. lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
  251. }
  252. lb_prop_index[--i].end = 0xFFFFFFFF;
  253. }
  254. /**
  255. * Gets the language-specific line breaking properties.
  256. *
  257. * @param lang language of the text
  258. * @return pointer to the language-specific line breaking
  259. * properties array if found; \c NULL otherwise
  260. */
  261. static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
  262. {
  263. struct LineBreakPropertiesLang *lbplIter;
  264. if (lang != NULL)
  265. {
  266. for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
  267. {
  268. if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
  269. {
  270. return lbplIter->lbp;
  271. }
  272. }
  273. }
  274. return NULL;
  275. }
  276. /**
  277. * Gets the line breaking class of a character from a line breaking
  278. * properties array.
  279. *
  280. * @param ch character to check
  281. * @param lbp pointer to the line breaking properties array
  282. * @return the line breaking class if found; \c LBP_XX otherwise
  283. */
  284. static enum LineBreakClass get_char_lb_class(
  285. utf32_t ch,
  286. struct LineBreakProperties *lbp)
  287. {
  288. while (lbp->prop != LBP_Undefined && ch >= lbp->start)
  289. {
  290. if (ch <= lbp->end)
  291. return lbp->prop;
  292. ++lbp;
  293. }
  294. return LBP_XX;
  295. }
  296. /**
  297. * Gets the line breaking class of a character from the default line
  298. * breaking properties array.
  299. *
  300. * @param ch character to check
  301. * @return the line breaking class if found; \c LBP_XX otherwise
  302. */
  303. static enum LineBreakClass get_char_lb_class_default(
  304. utf32_t ch)
  305. {
  306. size_t i = 0;
  307. while (ch > lb_prop_index[i].end)
  308. ++i;
  309. assert(i < LINEBREAK_INDEX_SIZE);
  310. return get_char_lb_class(ch, lb_prop_index[i].lbp);
  311. }
  312. /**
  313. * Gets the line breaking class of a character for a specific
  314. * language. This function will check the language-specific data first,
  315. * and then the default data if there is no language-specific property
  316. * available for the character.
  317. *
  318. * @param ch character to check
  319. * @param lbpLang pointer to the language-specific line breaking
  320. * properties array
  321. * @return the line breaking class if found; \c LBP_XX
  322. * otherwise
  323. */
  324. static enum LineBreakClass get_char_lb_class_lang(
  325. utf32_t ch,
  326. struct LineBreakProperties *lbpLang)
  327. {
  328. enum LineBreakClass lbcResult;
  329. /* Find the language-specific line breaking class for a character */
  330. if (lbpLang)
  331. {
  332. lbcResult = get_char_lb_class(ch, lbpLang);
  333. if (lbcResult != LBP_XX)
  334. return lbcResult;
  335. }
  336. /* Find the generic language-specific line breaking class, if no
  337. * language context is provided, or language-specific data are not
  338. * available for the specific character in the specified language */
  339. return get_char_lb_class_default(ch);
  340. }
  341. /**
  342. * Resolves the line breaking class for certain ambiguous or complicated
  343. * characters. They are treated in a simplistic way in this
  344. * implementation.
  345. *
  346. * @param lbc line breaking class to resolve
  347. * @param lang language of the text
  348. * @return the resolved line breaking class
  349. */
  350. static enum LineBreakClass resolve_lb_class(
  351. enum LineBreakClass lbc,
  352. const char *lang)
  353. {
  354. switch (lbc)
  355. {
  356. case LBP_AI:
  357. if (lang != NULL &&
  358. (strncmp(lang, "zh", 2) == 0 || /* Chinese */
  359. strncmp(lang, "ja", 2) == 0 || /* Japanese */
  360. strncmp(lang, "ko", 2) == 0)) /* Korean */
  361. {
  362. return LBP_ID;
  363. }
  364. /* Fall through */
  365. case LBP_SA:
  366. case LBP_SG:
  367. case LBP_XX:
  368. return LBP_AL;
  369. default:
  370. return lbc;
  371. }
  372. }
  373. /**
  374. * Gets the next Unicode character in a UTF-8 sequence. The index will
  375. * be advanced to the next complete character, unless the end of string
  376. * is reached in the middle of a UTF-8 sequence.
  377. *
  378. * @param[in] s input UTF-8 string
  379. * @param[in] len length of the string in bytes
  380. * @param[in,out] ip pointer to the index
  381. * @return the Unicode character beginning at the index; or
  382. * #EOS if end of input is encountered
  383. */
  384. utf32_t lb_get_next_char_utf8(
  385. const utf8_t *s,
  386. size_t len,
  387. size_t *ip)
  388. {
  389. utf8_t ch;
  390. utf32_t res;
  391. assert(*ip <= len);
  392. if (*ip == len)
  393. return EOS;
  394. ch = s[*ip];
  395. if (ch < 0xC2 || ch > 0xF4)
  396. { /* One-byte sequence, tail (should not occur), or invalid */
  397. *ip += 1;
  398. return ch;
  399. }
  400. else if (ch < 0xE0)
  401. { /* Two-byte sequence */
  402. if (*ip + 2 > len)
  403. return EOS;
  404. res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
  405. *ip += 2;
  406. return res;
  407. }
  408. else if (ch < 0xF0)
  409. { /* Three-byte sequence */
  410. if (*ip + 3 > len)
  411. return EOS;
  412. res = ((ch & 0x0F) << 12) +
  413. ((s[*ip + 1] & 0x3F) << 6) +
  414. ((s[*ip + 2] & 0x3F));
  415. *ip += 3;
  416. return res;
  417. }
  418. else
  419. { /* Four-byte sequence */
  420. if (*ip + 4 > len)
  421. return EOS;
  422. res = ((ch & 0x07) << 18) +
  423. ((s[*ip + 1] & 0x3F) << 12) +
  424. ((s[*ip + 2] & 0x3F) << 6) +
  425. ((s[*ip + 3] & 0x3F));
  426. *ip += 4;
  427. return res;
  428. }
  429. }
  430. /**
  431. * Gets the next Unicode character in a UTF-16 sequence. The index will
  432. * be advanced to the next complete character, unless the end of string
  433. * is reached in the middle of a UTF-16 surrogate pair.
  434. *
  435. * @param[in] s input UTF-16 string
  436. * @param[in] len length of the string in words
  437. * @param[in,out] ip pointer to the index
  438. * @return the Unicode character beginning at the index; or
  439. * #EOS if end of input is encountered
  440. */
  441. utf32_t lb_get_next_char_utf16(
  442. const utf16_t *s,
  443. size_t len,
  444. size_t *ip)
  445. {
  446. utf16_t ch;
  447. assert(*ip <= len);
  448. if (*ip == len)
  449. return EOS;
  450. ch = s[(*ip)++];
  451. if (ch < 0xD800 || ch > 0xDBFF)
  452. { /* If the character is not a high surrogate */
  453. return ch;
  454. }
  455. if (*ip == len)
  456. { /* If the input ends here (an error) */
  457. --(*ip);
  458. return EOS;
  459. }
  460. if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
  461. { /* If the next character is not the low surrogate (an error) */
  462. return ch;
  463. }
  464. /* Return the constructed character and advance the index again */
  465. return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
  466. }
  467. /**
  468. * Gets the next Unicode character in a UTF-32 sequence. The index will
  469. * be advanced to the next character.
  470. *
  471. * @param[in] s input UTF-32 string
  472. * @param[in] len length of the string in dwords
  473. * @param[in,out] ip pointer to the index
  474. * @return the Unicode character beginning at the index; or
  475. * #EOS if end of input is encountered
  476. */
  477. utf32_t lb_get_next_char_utf32(
  478. const utf32_t *s,
  479. size_t len,
  480. size_t *ip)
  481. {
  482. assert(*ip <= len);
  483. if (*ip == len)
  484. return EOS;
  485. return s[(*ip)++];
  486. }
  487. /**
  488. * Sets the line breaking information for a generic input string.
  489. *
  490. * @param[in] s input string
  491. * @param[in] len length of the input
  492. * @param[in] lang language of the input
  493. * @param[out] brks pointer to the output breaking data,
  494. * containing #LINEBREAK_MUSTBREAK,
  495. * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
  496. * or #LINEBREAK_INSIDEACHAR
  497. * @param[in] get_next_char function to get the next UTF-32 character
  498. */
  499. void set_linebreaks(
  500. const void *s,
  501. size_t len,
  502. const char *lang,
  503. char *brks,
  504. get_next_char_t get_next_char)
  505. {
  506. utf32_t ch;
  507. enum LineBreakClass lbcCur;
  508. enum LineBreakClass lbcNew;
  509. enum LineBreakClass lbcLast;
  510. struct LineBreakProperties *lbpLang;
  511. size_t posCur = 0;
  512. size_t posLast = 0;
  513. --posLast; /* To be ++'d later */
  514. ch = get_next_char(s, len, &posCur);
  515. if (ch == EOS)
  516. return;
  517. lbpLang = get_lb_prop_lang(lang);
  518. lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
  519. lbcNew = LBP_Undefined;
  520. nextline:
  521. /* Special treatment for the first character */
  522. switch (lbcCur)
  523. {
  524. case LBP_LF:
  525. case LBP_NL:
  526. lbcCur = LBP_BK;
  527. break;
  528. case LBP_SP:
  529. lbcCur = LBP_WJ;
  530. break;
  531. default:
  532. break;
  533. }
  534. /* Process a line till an explicit break or end of string */
  535. for (;;)
  536. {
  537. for (++posLast; posLast < posCur - 1; ++posLast)
  538. {
  539. brks[posLast] = LINEBREAK_INSIDEACHAR;
  540. }
  541. assert(posLast == posCur - 1);
  542. lbcLast = lbcNew;
  543. ch = get_next_char(s, len, &posCur);
  544. if (ch == EOS)
  545. break;
  546. lbcNew = get_char_lb_class_lang(ch, lbpLang);
  547. if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
  548. {
  549. brks[posLast] = LINEBREAK_MUSTBREAK;
  550. lbcCur = resolve_lb_class(lbcNew, lang);
  551. goto nextline;
  552. }
  553. switch (lbcNew)
  554. {
  555. case LBP_SP:
  556. brks[posLast] = LINEBREAK_NOBREAK;
  557. continue;
  558. case LBP_BK:
  559. case LBP_LF:
  560. case LBP_NL:
  561. brks[posLast] = LINEBREAK_NOBREAK;
  562. lbcCur = LBP_BK;
  563. continue;
  564. case LBP_CR:
  565. brks[posLast] = LINEBREAK_NOBREAK;
  566. lbcCur = LBP_CR;
  567. continue;
  568. case LBP_CB:
  569. brks[posLast] = LINEBREAK_ALLOWBREAK;
  570. lbcCur = LBP_BA;
  571. continue;
  572. default:
  573. break;
  574. }
  575. lbcNew = resolve_lb_class(lbcNew, lang);
  576. assert(lbcCur <= LBP_JT);
  577. assert(lbcNew <= LBP_JT);
  578. switch (baTable[lbcCur - 1][lbcNew - 1])
  579. {
  580. case DIR_BRK:
  581. brks[posLast] = LINEBREAK_ALLOWBREAK;
  582. break;
  583. case CMI_BRK:
  584. case IND_BRK:
  585. if (lbcLast == LBP_SP)
  586. {
  587. brks[posLast] = LINEBREAK_ALLOWBREAK;
  588. }
  589. else
  590. {
  591. brks[posLast] = LINEBREAK_NOBREAK;
  592. }
  593. break;
  594. case CMP_BRK:
  595. brks[posLast] = LINEBREAK_NOBREAK;
  596. if (lbcLast != LBP_SP)
  597. continue;
  598. break;
  599. case PRH_BRK:
  600. brks[posLast] = LINEBREAK_NOBREAK;
  601. break;
  602. }
  603. lbcCur = lbcNew;
  604. }
  605. assert(posLast == posCur - 1 && posCur <= len);
  606. /* Break after the last character */
  607. brks[posLast] = LINEBREAK_MUSTBREAK;
  608. /* When the input contains incomplete sequences */
  609. while (posCur < len)
  610. {
  611. brks[posCur++] = LINEBREAK_INSIDEACHAR;
  612. }
  613. }
  614. /**
  615. * Sets the line breaking information for a UTF-8 input string.
  616. *
  617. * @param[in] s input UTF-8 string
  618. * @param[in] len length of the input
  619. * @param[in] lang language of the input
  620. * @param[out] brks pointer to the output breaking data, containing
  621. * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
  622. * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
  623. */
  624. void set_linebreaks_utf8(
  625. const utf8_t *s,
  626. size_t len,
  627. const char *lang,
  628. char *brks)
  629. {
  630. set_linebreaks(s, len, lang, brks,
  631. (get_next_char_t)lb_get_next_char_utf8);
  632. }
  633. /**
  634. * Sets the line breaking information for a UTF-16 input string.
  635. *
  636. * @param[in] s input UTF-16 string
  637. * @param[in] len length of the input
  638. * @param[in] lang language of the input
  639. * @param[out] brks pointer to the output breaking data, containing
  640. * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
  641. * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
  642. */
  643. void set_linebreaks_utf16(
  644. const utf16_t *s,
  645. size_t len,
  646. const char *lang,
  647. char *brks)
  648. {
  649. set_linebreaks(s, len, lang, brks,
  650. (get_next_char_t)lb_get_next_char_utf16);
  651. }
  652. /**
  653. * Sets the line breaking information for a UTF-32 input string.
  654. *
  655. * @param[in] s input UTF-32 string
  656. * @param[in] len length of the input
  657. * @param[in] lang language of the input
  658. * @param[out] brks pointer to the output breaking data, containing
  659. * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
  660. * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
  661. */
  662. void set_linebreaks_utf32(
  663. const utf32_t *s,
  664. size_t len,
  665. const char *lang,
  666. char *brks)
  667. {
  668. set_linebreaks(s, len, lang, brks,
  669. (get_next_char_t)lb_get_next_char_utf32);
  670. }
  671. /**
  672. * Tells whether a line break can occur between two Unicode characters.
  673. * This is a wrapper function to expose a simple interface. Generally
  674. * speaking, it is better to use #set_linebreaks_utf32 instead, since
  675. * complicated cases involving combining marks, spaces, etc. cannot be
  676. * correctly processed.
  677. *
  678. * @param char1 the first Unicode character
  679. * @param char2 the second Unicode character
  680. * @param lang language of the input
  681. * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
  682. * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
  683. */
  684. int is_line_breakable(
  685. utf32_t char1,
  686. utf32_t char2,
  687. const char* lang)
  688. {
  689. utf32_t s[2];
  690. char brks[2];
  691. s[0] = char1;
  692. s[1] = char2;
  693. set_linebreaks_utf32(s, 2, lang, brks);
  694. return brks[0];
  695. }