PageRenderTime 50ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/sombok-2.2.1/lib/utils.c

#
C | 607 lines | 483 code | 40 blank | 84 comment | 159 complexity | 76d4ff8ce32af891dcbe153700e75529 MD5 | raw file
Possible License(s): AGPL-1.0
  1. /*
  2. * utils.c - Utility functions.
  3. *
  4. * Copyright (C) 2009-2011 by Hatuka*nezumi - IKEDA Soji.
  5. *
  6. * This file is part of the Sombok Package. This program is free
  7. * software; you can redistribute it and/or modify it under the terms of
  8. * either the GNU General Public License or the Artistic License, as
  9. * specified in the README file.
  10. *
  11. */
  12. #include "sombok_constants.h"
  13. #include "sombok.h"
  14. /** @defgroup linebreak_utils utils
  15. * @brief Callback functions used by linebreak
  16. *@{*/
  17. /** @name Preprocessing callback
  18. * gcstring_t *callback(linebreak_t *lbobj, void *data, unistr_t *str, unistr_t *text)
  19. *
  20. * Preprocessing behaviors specified by item of ``prep_func'' member of
  21. * linebreak_t. Corresponding item of ``prep_data'' member can be used to
  22. * modify behavior.
  23. * @param[in] obj linebreak object.
  24. * @param[in] data an item of prep_data corresponding to callback.
  25. * @param[in,out] substr pointer to Unicode string.
  26. * @param[in] text whole text to be broken, or NULL.
  27. * @return This callback is called twice for each substring of text:
  28. *
  29. * On the first pass, when text is not NULL, it should find the first
  30. * occurrence in substr matching its criteria, then update substr->str to
  31. * the matching position and substr->len to the length of the match.
  32. * Otherwise, it should set substr->str to NULL.
  33. * The return value will be discarded.
  34. *
  35. * On the second pass, when text is NULL, it should return new grapheme
  36. * cluster string created from substr. Return value should not share
  37. * Unicode buffer with substr (i.e. use gcstring_newcopy()).
  38. *
  39. * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
  40. */
  41. /*@{*/
  42. static
  43. int startswith(unistr_t * unistr, size_t idx, char *str, size_t len,
  44. int cs)
  45. {
  46. size_t i;
  47. unichar_t uc, c;
  48. if (unistr->str == NULL)
  49. return 0;
  50. if (unistr->len - idx < len)
  51. return 0;
  52. for (i = 0; i < len; i++) {
  53. uc = unistr->str[idx + i];
  54. c = (unichar_t) str[i];
  55. if (!cs) {
  56. if ((unichar_t) 'A' <= uc && uc <= (unichar_t) 'Z')
  57. uc += (unichar_t) ('a' - 'A');
  58. if ((unichar_t) 'A' <= c && c <= (unichar_t) 'Z')
  59. c += (unichar_t) ('a' - 'A');
  60. }
  61. if (uc != c)
  62. return 0;
  63. }
  64. return 1;
  65. }
  /*
   * Character-class helpers for the URL scanner.
   *
   * The "_is_xxx(s)" forms classify a single character value s.
   * The "is_xxx(str, i)" forms first check that index i lies inside the
   * Unicode string pointed to by str, then classify str->str[i].
   * All of them are macros and may evaluate their arguments more than once,
   * so arguments must be free of side effects.
   */

  /* character at index i exists and equals c */
  #define is(str, i, c) \
      ((i) < (str)->len && (str)->str[i] == (c))

  /* ASCII letter */
  #define _is_alpha(s) \
      (((s) >= 'a' && (s) <= 'z') || ((s) >= 'A' && (s) <= 'Z'))
  #define is_alpha(str, i) \
      ((i) < (str)->len && _is_alpha((str)->str[i]))

  /* ASCII decimal digit */
  #define _is_digit(s) \
      ((s) >= '0' && (s) <= '9')
  #define is_digit(str, i) \
      ((i) < (str)->len && _is_digit((str)->str[i]))

  /* ASCII hexadecimal digit, either case */
  #define _is_hexdig(s) \
      (_is_digit(s) || \
       ((s) >= 'a' && (s) <= 'f') || ((s) >= 'A' && (s) <= 'F'))
  #define is_hexdig(str, i) \
      ((i) < (str)->len && _is_hexdig((str)->str[i]))

  /* one of  ! $ & ' ( ) * + , ; =  */
  #define _is_sub_delim(s) \
      ((s) == '!' || (s) == '$' || (s) == '&' || (s) == '\'' || \
       (s) == '(' || (s) == ')' || (s) == '*' || (s) == '+' || \
       (s) == ',' || (s) == ';' || (s) == '=')
  #define is_sub_delim(str, i) \
      ((i) < (str)->len && _is_sub_delim((str)->str[i]))

  /* letter, digit, or one of  - . _ ~  */
  #define _is_unreserved(s) \
      (_is_alpha(s) || _is_digit(s) || \
       (s) == '-' || (s) == '.' || (s) == '_' || (s) == '~')
  #define is_unreserved(str, i) \
      ((i) < (str)->len && _is_unreserved((str)->str[i]))

  /*
   * A single character that may appear in a percent-encoded sequence:
   * '%' itself or a hex digit.  The sequence as a whole is not validated.
   */
  #define _is_pct_encoded(s) \
      ((s) == '%' || _is_hexdig(s))
  #define is_pct_encoded(str, i) \
      ((i) < (str)->len && _is_pct_encoded((str)->str[i]))

  /* unreserved, percent-encoded character, sub-delimiter, ':' or '@' */
  #define _is_pchar(s) \
      (_is_unreserved(s) || _is_pct_encoded(s) || _is_sub_delim(s) || \
       (s) == ':' || (s) == '@')
  #define is_pchar(str, i) \
      ((i) < (str)->len && _is_pchar((str)->str[i]))
  100. /** Built-in preprocessing callback
  101. *
  102. * Built-in preprocessing callback to break or not to break URLs according to
  103. * some rules by Chicago Manual of Style 15th ed.
  104. * If data is NULL, prohibit break.
  105. * Otherwise, allow break by rule above.
  106. */
  107. gcstring_t *linebreak_prep_URIBREAK(linebreak_t * lbobj, void *data,
  108. unistr_t * str, unistr_t * text)
  109. {
  110. gcstring_t *gcstr;
  111. size_t i;
  112. unichar_t *ptr;
  113. /* Pass I */
  114. if (text != NULL) {
  115. /*
  116. * Search URL in str.
  117. * Following code loosely refers RFC3986 but some practical
  118. * assumptions are put:
  119. *
  120. * o Broken pct-encoded sequences (e.g. single "%") are allowed.
  121. * o scheme names must end with alphanumeric, must be longer than
  122. * or equal to two octets, and must not contain more than one
  123. * non-alphanumeric ("+", "-" or ".").
  124. * o URLs containing neither non-empty path, query part nor fragment
  125. * (e.g. "about:") are omitted: they are treated as ordinal words.
  126. */
  127. for (ptr = NULL, i = 0; i < str->len; ptr = NULL, i++) {
  128. int has_double_slash, has_authority, has_empty_path,
  129. has_no_query, has_no_fragment;
  130. size_t alphadigit, nonalphadigit;
  131. /* skip non-alpha. */
  132. if (!is_alpha(str, i))
  133. continue;
  134. ptr = str->str + i;
  135. /* "url:" - case insensitive */
  136. if (startswith(str, i, "url:", 4, 0))
  137. i += 4;
  138. /* scheme */
  139. if (is_alpha(str, i))
  140. i++;
  141. else
  142. continue;
  143. nonalphadigit = 0;
  144. alphadigit = 1;
  145. while (1) {
  146. if (is_alpha(str, i) || is_digit(str, i))
  147. alphadigit++;
  148. else if (is(str, i, '+') || is(str, i, '-') || is(str, i, '.'))
  149. nonalphadigit++;
  150. else
  151. break;
  152. i++;
  153. }
  154. if (alphadigit < 2 || 1 < nonalphadigit ||
  155. ! (is_digit(str, i - 1) || is_alpha(str, i - 1)))
  156. continue;
  157. /* ":" */
  158. if (is(str, i, ':'))
  159. i++;
  160. else
  161. continue;
  162. /* hier-part */
  163. has_double_slash = 0;
  164. has_authority = 0;
  165. has_empty_path = 0;
  166. has_no_query = 0;
  167. has_no_fragment = 0;
  168. if (startswith(str, i, "//", 2, 0)) {
  169. /* "//" */
  170. has_double_slash = 1;
  171. i += 2;
  172. /* authority - FIXME:syntax relaxed */
  173. if (is(str, i, '[') || is(str, i, ':') || is(str, i, '@') ||
  174. is_unreserved(str, i) || is_pct_encoded(str, i) ||
  175. is_sub_delim(str, i)) {
  176. has_authority = 1;
  177. i++;
  178. while (is(str, i, '[') || is(str, i, ']') ||
  179. is(str, i, ':') || is(str, i, '@') ||
  180. is_unreserved(str, i) || is_pct_encoded(str, i) ||
  181. is_sub_delim(str, i))
  182. i++;
  183. }
  184. }
  185. /* path */
  186. if (has_double_slash) {
  187. if (has_authority)
  188. goto path_abempty;
  189. else
  190. goto path_absolute;
  191. } /* else goto path_rootless; */
  192. /* path_rootless: */
  193. if (is_pchar(str, i)) { /* FIXME:path-noscheme not concerned */
  194. i++;
  195. while (is_pchar(str, i))
  196. i++;
  197. goto path_abempty;
  198. } else {
  199. has_empty_path = 1;
  200. goto path_empty;
  201. }
  202. path_absolute:
  203. if (startswith(str, i, "//", 2, 0))
  204. continue;
  205. else if (is(str, i, '/')) {
  206. i++;
  207. if (is_pchar(str, i)) {
  208. i++;
  209. while (is_pchar(str, i))
  210. i++;
  211. }
  212. goto path_abempty;
  213. } else
  214. continue;
  215. path_abempty:
  216. if (is(str, i, '/')) {
  217. i++;
  218. while (is(str, i, '/') || is_pchar(str, i))
  219. i++;
  220. } /* else goto path_empty; */
  221. path_empty:
  222. ;
  223. /* query */
  224. if (is(str, i, '?')) {
  225. i++;
  226. while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
  227. i++;
  228. } else
  229. has_no_query = 1;
  230. /* fragment */
  231. if (is(str, i, '#')) {
  232. i++;
  233. while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
  234. i++;
  235. } else
  236. has_no_fragment = 1;
  237. if (has_empty_path && has_no_query && has_no_fragment)
  238. continue;
  239. break;
  240. }
  241. if (ptr != NULL)
  242. str->len = i - (ptr - str->str);
  243. str->str = ptr;
  244. return NULL;
  245. }
  246. /* Pass II */
  247. if ((gcstr = gcstring_newcopy(str, lbobj)) == NULL) {
  248. lbobj->errnum = errno ? errno : ENOMEM;
  249. return NULL;
  250. }
  251. /* non-break URI. */
  252. if (data == NULL) {
  253. for (i = 1; i < gcstr->gclen; i++)
  254. gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
  255. return gcstr;
  256. }
  257. /* break URI. */
  258. if (startswith((unistr_t *) gcstr, 0, "url:", 4, 0)) {
  259. gcstr->gcstr[4].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
  260. i = 5;
  261. } else
  262. i = 1;
  263. for (; i < gcstr->gclen; i++) {
  264. unichar_t u, v;
  265. u = gcstr->str[gcstr->gcstr[i - 1].idx];
  266. v = gcstr->str[gcstr->gcstr[i].idx];
  267. /*
  268. * Some rules based on CMoS 15th ed.
  269. * 17.11 1.1: [/] ÷ [^/]
  270. * 17.11 2: [-] ×
  271. * 6.17 2: [.] ×
  272. * 17.11 1.2: ÷ [-~.,_?#%]
  273. * 17.11 1.3: ÷ [=&]
  274. * 17.11 1.3: [=&] ÷
  275. * Default: ALL × ALL
  276. */
  277. if (u == '/' && v != '/')
  278. gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
  279. else if (u == '-' || u == '.')
  280. gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
  281. else if (v == '-' || v == '~' || v == '.' || v == ',' ||
  282. v == '_' || v == '?' || v == '#' || v == '%' ||
  283. u == '=' || v == '=' || u == '&' || v == '&')
  284. gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
  285. else
  286. gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
  287. }
  288. /* Won't break punctuations at end of matches. */
  289. for (i = gcstr->gclen - 1; 1 <= i; i--) {
  290. unichar_t u = gcstr->str[gcstr->gcstr[i].idx];
  291. if (gcstr->gcstr[i].flag == LINEBREAK_FLAG_ALLOW_BEFORE &&
  292. (u == '"' || u == '.' || u == ':' || u == ';' || u == ',' ||
  293. u == '>'))
  294. gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
  295. else
  296. break;
  297. }
  298. return gcstr;
  299. }
  300. /*@}*/
  301. /** @name Sizing callback
  302. * double callback(linebreak_t *obj, double len, gcstring_t *pre, gcstring_t *spc, gcstring_t *str)
  303. *
  304. * Sizing behavior specified by ``sizing_func'' member of linebreak_t.
  305. * ``sizing_data'' member can be used to modify behavior.
  306. * @param[in] obj linebreak object.
  307. * @param[in] len Number of columns of preceding grapheme cluster string.
  308. * @param[in] pre Preceding grapheme cluster string.
  309. * @param[in] spc Trailing spaces of preceding string.
  310. * @param[in] str Appended grapheme cluster string.
  311. * @return number of columns of pre+spc+str.
  312. * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
  313. */
  314. /*@{*/
  315. /** Built-in Sizing callback
  316. *
  317. * Built-in Sizing callback based on UAX #11.
  318. */
  319. double linebreak_sizing_UAX11(linebreak_t * obj, double len,
  320. gcstring_t * pre, gcstring_t * spc,
  321. gcstring_t * str)
  322. {
  323. gcstring_t *spcstr;
  324. if ((!spc || !spc->str || !spc->len) &&
  325. (!str || !str->str || !str->len))
  326. return len;
  327. if (!spc || !spc->str)
  328. spcstr = gcstring_copy(str);
  329. else if ((spcstr = gcstring_concat(spc, str)) == NULL)
  330. return -1.0;
  331. len += (double) gcstring_columns(spcstr);
  332. gcstring_destroy(spcstr);
  333. return len;
  334. }
  335. /*@}*/
  336. /** @name Formatting callback
  337. * gcstring_t *callback(linebreak_t *lbobj, linebreak_state_t state, gcstring_t *gcstr)
  338. *
  339. * Formatting behaviors specified by ``format_func'' member of linebreak_t.
  340. * ``format_data'' member can be used to modify behavior.
  341. * @param[in] obj linebreak object.
  342. * @param[in] state state.
  343. * @param[in] gcstr text fragment.
  344. * @return new text fragment or, if no modification needed, NULL.
  345. * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
  346. *
  347. * Following table describes behavior of built-in format callbacks.
  348. *
  349. * @verbatim
  350. * state| SIMPLE | NEWLINE | TRIM
  351. * -----+-----------------+-------------------+-------------------
  352. * SOT |
  353. * SOP | not modify
  354. * SOL |
  355. * LINE |
  356. * EOL | append newline | replace by newline| replace by newline
  357. * EOP | not modify | replace by newline| remove SPACEs
  358. * EOT | not modify | replace by newline| remove SPACEs
  359. * ----------------------------------------------------------------
  360. * @endverbatim
  361. */
  362. /*@{*/
  363. /** Built-in formatting callback
  364. *
  365. */
  366. gcstring_t *linebreak_format_SIMPLE(linebreak_t * lbobj,
  367. linebreak_state_t state,
  368. gcstring_t * gcstr)
  369. {
  370. gcstring_t *t, *result;
  371. unistr_t unistr;
  372. switch (state) {
  373. case LINEBREAK_STATE_EOL:
  374. if ((result = gcstring_copy(gcstr)) == NULL)
  375. return NULL;
  376. unistr.str = lbobj->newline.str;
  377. unistr.len = lbobj->newline.len;
  378. if ((t = gcstring_new(&unistr, lbobj)) == NULL)
  379. return NULL;
  380. if (gcstring_append(result, t) == NULL) {
  381. t->str = NULL;
  382. gcstring_destroy(t);
  383. return NULL;
  384. }
  385. t->str = NULL;
  386. gcstring_destroy(t);
  387. return result;
  388. default:
  389. errno = 0;
  390. return NULL;
  391. }
  392. }
  393. /** Built-in formatting callback
  394. *
  395. */
  396. gcstring_t *linebreak_format_NEWLINE(linebreak_t * lbobj,
  397. linebreak_state_t state,
  398. gcstring_t * gcstr)
  399. {
  400. gcstring_t *result;
  401. unistr_t unistr;
  402. switch (state) {
  403. case LINEBREAK_STATE_EOL:
  404. case LINEBREAK_STATE_EOP:
  405. case LINEBREAK_STATE_EOT:
  406. unistr.str = lbobj->newline.str;
  407. unistr.len = lbobj->newline.len;
  408. if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
  409. return NULL;
  410. return result;
  411. default:
  412. errno = 0;
  413. return NULL;
  414. }
  415. }
  416. /** Built-in formatting callback
  417. *
  418. */
  419. gcstring_t *linebreak_format_TRIM(linebreak_t * lbobj,
  420. linebreak_state_t state,
  421. gcstring_t * gcstr)
  422. {
  423. gcstring_t *result;
  424. unistr_t unistr = { NULL, 0 };
  425. size_t i;
  426. switch (state) {
  427. case LINEBREAK_STATE_EOL:
  428. unistr.str = lbobj->newline.str;
  429. unistr.len = lbobj->newline.len;
  430. if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
  431. return NULL;
  432. return result;
  433. case LINEBREAK_STATE_EOP:
  434. case LINEBREAK_STATE_EOT:
  435. if (gcstr->str == NULL || gcstr->len == 0) {
  436. if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
  437. return NULL;
  438. return result;
  439. }
  440. for (i = 0; i < gcstr->gclen && gcstr->gcstr[i].lbc == LB_SP; i++);
  441. if ((result = gcstring_substr(gcstr, i, gcstr->gclen)) == NULL)
  442. return NULL;
  443. return result;
  444. default:
  445. errno = 0;
  446. return NULL;
  447. }
  448. }
  449. /*@}*/
  450. /** @name Urgent breaking callbacks
  451. * gcstring_t *callback(linebreak_t *lbobj, gcstring_t *str)
  452. *
  453. * Urgent breaking behaviors specified by ``urgent_func'' member of
  454. * linebreak_t. ``urgent_data'' member can be used to modify behavior.
  455. * @param[in] obj linebreak object.
  456. * @param[in] str text to be broken.
  457. * @return new text or, if no modification needed, NULL.
  458. * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
  459. *
  460. * There are two built-in urgent breaking callbacks.
  461. */
  462. /*@{*/
  463. /** Built-in urgent brealing callback
  464. *
  465. * Abort processing. lbobj->errnum is set to LINEBREAK_ELONG.
  466. */
  467. gcstring_t *linebreak_urgent_ABORT(linebreak_t * lbobj, gcstring_t * str)
  468. {
  469. lbobj->errnum = LINEBREAK_ELONG;
  470. return NULL;
  471. }
  472. /** Built-in urgent brealing callback
  473. *
  474. * Force breaking lines.
  475. */
  476. gcstring_t *linebreak_urgent_FORCE(linebreak_t * lbobj, gcstring_t * str)
  477. {
  478. gcstring_t *result, *s, empty = { NULL, 0, NULL, 0, 0, lbobj };
  479. if (!str || !str->len)
  480. return gcstring_new(NULL, lbobj);
  481. result = gcstring_new(NULL, lbobj);
  482. s = gcstring_copy(str);
  483. while (1) {
  484. size_t i;
  485. gcstring_t *t;
  486. double cols;
  487. for (i = 0; i < s->gclen; i++) {
  488. t = gcstring_substr(s, 0, i + 1);
  489. if (lbobj->sizing_func != NULL)
  490. cols =
  491. (*(lbobj->sizing_func)) (lbobj, 0.0, &empty, &empty,
  492. t);
  493. else
  494. cols = (double) t->gclen;
  495. gcstring_destroy(t);
  496. if (lbobj->colmax < cols)
  497. break;
  498. }
  499. if (0 < i) {
  500. t = gcstring_substr(s, 0, i);
  501. if (t->gclen) {
  502. t->gcstr[0].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
  503. gcstring_append(result, t);
  504. }
  505. gcstring_destroy(t);
  506. t = gcstring_substr(s, i, s->gclen - i);
  507. gcstring_destroy(s);
  508. s = t;
  509. if (!s->gclen)
  510. break;
  511. } else {
  512. if (s->gclen) {
  513. s->gcstr[0].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
  514. gcstring_append(result, s);
  515. }
  516. break;
  517. }
  518. }
  519. gcstring_destroy(s);
  520. return result;
  521. }
  522. /*@}*/
  523. /** @name Preprocessing callbacks - obsoleted form
  524. * gcstring_t *callback(linebreak_t *lbobj, unistr_t *str)
  525. * Preprocessing behaviors specified by ``user_func'' member of linebreak_t.
  526. * ``user_data'' member can be used to modify behavior.
  527. * @param[in] obj linebreak object.
  528. * @param[in] str Unicode string (not grapheme cluster string) to be processed.
  529. * @return new grapheme cluster string. NULL means no data.
  530. * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
  531. *
  532. * Currently no built-in preprocessing callbacks are defined.
  533. * NOTE: Feature of this callback described here is planned to be changed
  534. * by next release.
  535. */