/sombok-2.2.1/lib/break.c
C | 1351 lines | 947 code | 151 blank | 253 comment | 296 complexity | 0c6ee660e53668a89a50fa0ede5bbd70 MD5 | raw file
Possible License(s): AGPL-1.0
- /*
- * break.c - an implementation of Unicode line breaking algorithm.
- *
- * Copyright (C) 2009-2012 by Hatuka*nezumi - IKEDA Soji.
- *
- * This file is part of the Sombok Package. This program is free
- * software; you can redistribute it and/or modify it under the terms of
- * either the GNU General Public License or the Artistic License, as
- * specified in the README file.
- *
- */
- #include "sombok_constants.h"
- #include "sombok.h"
- extern propval_t *linebreak_rules[];
- extern size_t linebreak_rulessiz;
- /**
- * @defgroup linebreak_break break
- * @brief Perform line breaking algorithm
- *@{*/
- static
- gcstring_t *_user(linebreak_t * lbobj, unistr_t * str)
- {
- gcstring_t *result;
- if (str == NULL)
- return NULL;
- else if (lbobj->user_func == NULL ||
- ((result = (*(lbobj->user_func)) (lbobj, str)) == NULL &&
- !lbobj->errnum)) {
- if ((result = gcstring_newcopy(str, lbobj)) == NULL)
- lbobj->errnum = errno ? errno : ENOMEM;
- }
- return result;
- }
- static
- gcstring_t *_prep_sub(linebreak_t * lbobj, unistr_t * substr,
- unistr_t * text, size_t findex)
- {
- unistr_t unistr = { NULL, 0 };
- gcstring_t *ret, *s;
- unichar_t *prev_str;
- size_t prev_len;
- gcstring_t *(*func) (linebreak_t *, void *, unistr_t *, unistr_t *);
- void *data;
- if ((func = lbobj->prep_func[findex]) == NULL) {
- if ((ret = gcstring_newcopy(substr, lbobj)) == NULL)
- lbobj->errnum = errno ? errno : ENOMEM;
- return ret;
- }
- if (lbobj->prep_data == NULL)
- data = NULL;
- else
- data = lbobj->prep_data[findex];
- if ((ret = gcstring_new(NULL, lbobj)) == NULL) {
- lbobj->errnum = errno ? errno : ENOMEM;
- return NULL;
- }
- prev_str = substr->str;
- prev_len = substr->len;
- while (1) {
- /* Pass I: search. */
- unistr.str = prev_str;
- unistr.len = prev_len;
- gcstring_destroy((*func) (lbobj, data, &unistr, text));
- /* - no match: stop searching. */
- if (unistr.str == NULL)
- break;
- /* - buffer may be modified: abort. */
- if (unistr.len < 0 ||
- unistr.str < text->str ||
- text->str + text->len < unistr.str + unistr.len) {
- gcstring_destroy(ret);
- lbobj->errnum = EINVAL;
- return NULL;
- }
- /* - out of range: stop searching. */
- if (unistr.str < substr->str ||
- substr->str + substr->len < unistr.str + unistr.len)
- break;
- /* apply next callback to unmatched part. */
- if (prev_str <= unistr.str) {
- unistr_t us;
- us.len = unistr.str - prev_str;
- us.str = prev_str;
- if ((s = _prep_sub(lbobj, &us, text, findex + 1)) == NULL) {
- gcstring_destroy(ret);
- return NULL;
- }
- if (gcstring_append(ret, s) == NULL) {
- gcstring_destroy(ret);
- gcstring_destroy(s);
- lbobj->errnum = errno ? errno : ENOMEM;
- return NULL;
- }
- gcstring_destroy(s);
- }
- /* Pass II: process matched string. */
- if ((s = (*func) (lbobj, data, &unistr, NULL)) == NULL) {
- if (lbobj->errnum != 0) {
- gcstring_destroy(ret);
- return NULL;
- }
- if ((s = gcstring_newcopy(&unistr, lbobj)) == NULL) {
- gcstring_destroy(ret);
- lbobj->errnum = errno ? errno : ENOMEM;
- return NULL;
- }
- }
- if (gcstring_append(ret, s) == NULL) {
- gcstring_destroy(ret);
- gcstring_destroy(s);
- lbobj->errnum = errno ? errno : ENOMEM;
- return NULL;
- }
- gcstring_destroy(s);
- /* skip zero length match to avoid infinite loop. */
- if (unistr.len == 0) {
- if (substr->str + substr->len <= unistr.str) {
- prev_str = unistr.str;
- prev_len = 0;
- break;
- } else {
- prev_str = unistr.str + 1;
- prev_len = substr->str + substr->len - prev_str;
- continue;
- }
- }
- prev_str = unistr.str + unistr.len;
- prev_len = substr->str + substr->len - prev_str;
- }
- /* apply next callback to the rest of string. */
- if (prev_str < substr->str + substr->len) {
- unistr.str = prev_str;
- unistr.len = prev_len;
- if ((s = _prep_sub(lbobj, &unistr, text, findex + 1)) == NULL) {
- gcstring_destroy(ret);
- return NULL;
- }
- if (gcstring_append(ret, s) == NULL) {
- gcstring_destroy(ret);
- gcstring_destroy(s);
- lbobj->errnum = errno ? errno : ENOMEM;
- return NULL;
- }
- gcstring_destroy(s);
- }
- return ret;
- }
- static
- gcstring_t *_prep(linebreak_t * lbobj, unistr_t * text)
- {
- gcstring_t *ret;
- if (lbobj->prep_func == NULL) {
- if ((ret = gcstring_newcopy(text, lbobj)) == NULL)
- lbobj->errnum = errno ? errno : ENOMEM;
- return ret;
- }
- return _prep_sub(lbobj, text, text, 0);
- }
- static
- gcstring_t *_format(linebreak_t * lbobj, linebreak_state_t action,
- gcstring_t * str)
- {
- gcstring_t *result;
- if (str == NULL)
- return NULL;
- else if (lbobj->format_func == NULL ||
- ((result =
- (*(lbobj->format_func)) (lbobj, action, str)) == NULL &&
- !lbobj->errnum)) {
- if ((result = gcstring_copy(str)) == NULL)
- lbobj->errnum = errno ? errno : ENOMEM;
- }
- return result;
- }
- static
- double _sizing(linebreak_t * lbobj, double len,
- gcstring_t * pre, gcstring_t * spc, gcstring_t * str)
- {
- double ret;
- if (lbobj->sizing_func == NULL ||
- ((ret = (*(lbobj->sizing_func)) (lbobj, len, pre, spc, str))
- < 0.0 && !lbobj->errnum)) {
- if (spc != NULL)
- len += (double) spc->gclen;
- if (str != NULL)
- len += (double) str->gclen;
- return len;
- }
- return ret;
- }
- static
- gcstring_t *_urgent_break(linebreak_t * lbobj, gcstring_t * str)
- {
- gcstring_t *result;
- if (lbobj->urgent_func == NULL ||
- ((result = (*(lbobj->urgent_func)) (lbobj, str)) == NULL &&
- !lbobj->errnum)) {
- if ((result = gcstring_copy(str)) == NULL)
- lbobj->errnum = errno ? errno : ENOMEM;
- }
- return result;
- }
- #define gcstring_DESTROY(gcstr) /* Destroy gcstr, then reset the variable to NULL.
-  * NOTE: this expands to two statements and is not wrapped in
-  * do { } while (0); when used as the sole body of an unbraced if/else,
-  * only the gcstring_destroy() call is governed by the condition and the
-  * assignment to NULL always runs. */ \
- gcstring_destroy(gcstr); gcstr = NULL;
- #define IF_NULL_THEN_ABORT(x) /* Error exit: when (x) evaluates to NULL, record
-  * an error code in lbobj->errnum (errno, or EINVAL when errno is 0) unless
-  * one is already set, destroy str, bufStr, bufSpc, results[0..reslen-1],
-  * the results array itself, s, t, beforeFrg, fmt and broken, and return
-  * NULL from the enclosing function.  All of these names must be in scope
-  * where the macro is used. */ \
- if ((x) == NULL) { \
- size_t i; /* local to this block; hides any i at the call site */ \
- if (lbobj->errnum == 0) \
- lbobj->errnum = errno? errno: EINVAL; \
- gcstring_destroy(str); \
- gcstring_destroy(bufStr); \
- gcstring_destroy(bufSpc); \
- for (i = 0; i < reslen; i++) \
- gcstring_destroy(results[i]); \
- free(results); \
- gcstring_destroy(s); \
- gcstring_destroy(t); \
- gcstring_destroy(beforeFrg); \
- gcstring_destroy(fmt); \
- gcstring_destroy(broken); \
- return NULL; \
- }
- /** @fn propval_t linebreak_lbrule(propval_t b_idx, propval_t a_idx)
- * @deprecated Use linebreak_get_lbrule().
- *
- * Get breaking rule between two classes
- *
- * From given two line breaking classes, get breaking rule determined by
- * internal data.
- * @param[in] a_idx line breaking class.
- * @param[in] b_idx line breaking class.
- * @return line breaking action: MANDATORY, DIRECT, INDIRECT or PROHIBITED.
- * If action was not determined, returns DIRECT.
- *
- * @note This method gives just approximate description of line breaking
- * behavior. Especially, it won't give meaningful value related to classes
- * AI and CJ.
- * See also linebreak_get_lbrule().
- *
- */
- static
- propval_t _lbruleinfo(propval_t b_idx, propval_t a_idx)
- {
- propval_t result = PROP_UNKNOWN;
- if (b_idx < 0 || linebreak_rulessiz <= b_idx ||
- a_idx < 0 || linebreak_rulessiz <= a_idx);
- else
- result = linebreak_rules[b_idx][a_idx];
- if (result == PROP_UNKNOWN)
- return LINEBREAK_ACTION_DIRECT;
- return result;
- }
- propval_t linebreak_lbrule(propval_t b_idx, propval_t a_idx)
- {
- /* Resolve before-side class. */
- switch (b_idx) {
- /* LB1: Resolve SA, SG, XX to AL; AI and CJ cannot be resolved. */
- case LB_SA:
- case LB_SG:
- case LB_XX:
- /* LB10: Resolve CM to AL. */
- case LB_CM:
- /* Resolve HL to AL. */
- case LB_HL:
- b_idx = LB_AL;
- break;
- }
- /* Resolve after-side class. */
- switch (a_idx) {
- /* LB1 */
- case LB_SA:
- case LB_SG:
- case LB_XX:
- a_idx = LB_AL;
- break;
- /* LB9, LB10 */
- case LB_CM:
- /* LB9: Treat X CM as if it were X, with some exceptions. */
- switch (b_idx) {
- case LB_BK:
- case LB_CR:
- case LB_LF:
- case LB_NL:
- case LB_SP:
- case LB_ZW:
- break;
- default:
- return LINEBREAK_ACTION_PROHIBITED;
- }
- /* XXX Legacy-CM rule cannot be applied. */
- /* LB10: Treat any remaining combining mark as AL. */
- a_idx = LB_AL;
- if (b_idx == LB_CM)
- b_idx = LB_AL;
- break;
- /* Resolve HL to AL. */
- case LB_HL:
- a_idx = LB_AL;
- break;
- }
- /* LB25, simplified:
- * (CL|CP|NU) × (PO|PR)
- * (PO|PR) × (OP|NU)
- * (HY|IS|NU|SY) × NU
- */
- if (((b_idx == LB_CL || b_idx == LB_CP || b_idx == LB_NU) &&
- (a_idx == LB_PO || a_idx == LB_PR)) ||
- ((b_idx == LB_PO || b_idx == LB_PR) &&
- (a_idx == LB_OP || a_idx == LB_NU)) ||
- ((b_idx == LB_HY || b_idx == LB_IS || b_idx == LB_NU ||
- b_idx == LB_SY) && a_idx == LB_NU))
- return LINEBREAK_ACTION_PROHIBITED;
- return _lbruleinfo(b_idx, a_idx);
- }
- /** @fn gcstring_t** linebreak_break_partial(linebreak_t *lbobj, unistr_t *input)
- *
- * Perform line breaking algorithm with incremental inputs.
- *
- * @param[in] lbobj linebreak object.
- * @param[in] input Unicode string; give NULL to specify end of input.
- * @return array of (partial) broken grapheme cluster strings terminated by NULL.
- * If internal error occurred, lbobj->errnum is set then NULL is returned.
- */
- static
- gcstring_t **_break_partial(linebreak_t * lbobj, unistr_t * input,
- size_t * lenp, int eot)
- {
- int state;
- gcstring_t *str = NULL, *bufStr = NULL, *bufSpc = NULL;
- double bufCols;
- size_t bBeg, bLen, bCM, bSpc, aCM, urgEnd;
- gcstring_t **results = NULL;
- size_t reslen = 0;
- gcstring_t *s = NULL, *t = NULL, *beforeFrg = NULL, *fmt = NULL,
- *broken = NULL;
- unistr_t unistr;
- size_t i;
- gcstring_t empty = { NULL, 0, NULL, 0, 0, lbobj };
- /***
- *** Unread and additional input.
- ***/
- unistr.str = lbobj->unread.str;
- unistr.len = lbobj->unread.len;
- lbobj->unread.str = NULL;
- lbobj->unread.len = 0;
- if (input != NULL && input->len != 0) {
- unichar_t *_u;
- if ((_u = realloc(unistr.str,
- sizeof(unichar_t) * (unistr.len + input->len)))
- == NULL) {
- lbobj->errnum = errno;
- free(unistr.str);
- return NULL;
- } else
- unistr.str = _u;
- memcpy(unistr.str + unistr.len, input->str,
- sizeof(unichar_t) * input->len);
- unistr.len += input->len;
- }
- /***
- *** Preprocessing.
- ***/
- /* perform user breaking */
- if (lbobj->user_func != NULL)
- str = _user(lbobj, &unistr);
- else
- str = _prep(lbobj, &unistr);
- free(unistr.str);
- if (str == NULL)
- return NULL;
- /* South East Asian complex breaking. */
- errno = 0;
- linebreak_southeastasian_flagbreak(str);
- if (errno) {
- lbobj->errnum = errno;
- gcstring_DESTROY(str);
- return NULL;
- }
- /* LB21a (as of 6.1.0): HL (HY | BA) × [^ CB] */
- if (str != NULL && str->gclen) {
- propval_t lbc;
- for (i = 0; i < str->gclen; i++) {
- /* HL */
- if ((lbc = gcstring_lbclass(str, i)) == LB_HL &&
- gcstring_lbclass_ext(str, i) == lbc)
- /* avoid non-CM grapheme extenders */
- i++;
- else
- continue;
- /* CM* */
- while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
- i++;
- if (str->gclen <= i)
- break;
- /* (HY|BA) */
- if (((lbc = gcstring_lbclass(str, i)) == LB_HY ||
- lbc == LB_BA) && gcstring_lbclass_ext(str, i) == lbc)
- /* avoid non-CM grapheme extenders */
- i++;
- else
- continue;
- /* CM* */
- while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
- i++;
- if (str->gclen <= i)
- break;
- /* [^CB] */
- switch (gcstring_lbclass(str, i)) {
- /* prohibit break by default */
- case LB_BK: /* LB6 */
- case LB_CR:
- case LB_LF:
- case LB_NL:
- case LB_SP: /* LB7 */
- case LB_ZW:
- case LB_CM: /* LB9 */
- case LB_WJ: /* LB11 */
- /* allow break by default */
- case LB_CB: /* LB20 */
- continue;
- }
- if (!str->gcstr[i].flag)
- str->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
- }
- }
- /* LB25: not break in (PR|PO)? (OP|HY)? NU (NU|SY|IS)* (CL|CP)? (PR|PO)? */
- /* FIXME:Avoid non-CM grapheme extenders */
- if (str != NULL && str->gclen) {
- size_t st, et;
- for (i = 0; i < str->gclen; i++) {
- st = et = (size_t) - 1;
- /* (PR|PO)? */
- switch (gcstring_lbclass(str, i)) {
- case LB_PR:
- case LB_PO:
- if (st == (size_t) - 1)
- st = i;
- LB25_PRPO_PREFIX:
- i++;
- /* CM* */
- while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
- i++;
- if (str->gclen <= i)
- goto LB25_BREAK;
- }
- /* (OP|HY)? */
- switch (gcstring_lbclass(str, i)) {
- case LB_OP:
- case LB_HY:
- if (st == (size_t) - 1)
- st = i;
- LB25_OPHY_PREFIX:
- i++;
- /* CM* */
- while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
- i++;
- if (str->gclen <= i) {
- if (eot)
- goto LB25_BREAK;
- else
- goto LB25_FOUND; /* save possible partial sequence. */
- }
- }
- /* NU (NU|SY|IS)* */
- switch (gcstring_lbclass(str, i)) {
- case LB_NU:
- if (st == (size_t) - 1)
- st = i;
- i++;
- /* (NU|SY|IS|CM)* */
- while (i < str->gclen)
- switch (gcstring_lbclass(str, i)) {
- case LB_NU:
- case LB_SY:
- case LB_IS:
- case LB_CM:
- i++;
- break;
- /* (CL|CP) */
- case LB_CL:
- case LB_CP:
- goto LB25_CLCP_SUFFIX;
- /* (PR|PO) */
- case LB_PR:
- case LB_PO:
- goto LB25_PRPO_SUFFIX;
- default:
- goto LB25_FOUND;
- }
- if (str->gclen <= i)
- goto LB25_FOUND;
- break;
- case LB_PR:
- case LB_PO:
- st = i;
- goto LB25_PRPO_PREFIX;
- case LB_OP:
- case LB_HY:
- st = i;
- goto LB25_OPHY_PREFIX;
- default:
- continue;
- }
- /* (CL|CP)? */
- switch (gcstring_lbclass(str, i)) {
- case LB_CL:
- case LB_CP:
- LB25_CLCP_SUFFIX:
- i++;
- /* CM* */
- while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
- i++;
- if (str->gclen <= i)
- goto LB25_FOUND;
- }
- /* (PR|PO)? */
- switch (gcstring_lbclass(str, i)) {
- case LB_PR:
- case LB_PO:
- LB25_PRPO_SUFFIX:
- et = i;
- i++;
- /* CM* */
- while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
- i++;
- if (str->gclen <= i)
- goto LB25_FOUND;
- }
- LB25_FOUND:
- for (st++; st < i; st++) {
- if (!str->gcstr[st].flag)
- str->gcstr[st].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
- }
- /* match may be overwrapped */
- if (et != (size_t) - 1) {
- i = st = et;
- et = (size_t) - 1;
- goto LB25_PRPO_PREFIX;
- }
- }
- LB25_BREAK:
- ;
- }
- /***
- *** Initialize status.
- ***/
- str->pos = 0;
- /*
- * Line buffer.
- * bufStr: Unbreakable text fragment.
- * bufSpc: Trailing spaces.
- * bufCols: Columns of bufStr: can be differ from gcstring_columns().
- * state: Start of text/paragraph status.
- * 0: Start of text not done.
- * 1: Start of text done while start of paragraph not done.
- * 2: Start of paragraph done while end of paragraph not done.
- */
- state = lbobj->state;
- unistr.str = lbobj->bufstr.str;
- unistr.len = lbobj->bufstr.len;
- lbobj->bufstr.str = NULL;
- lbobj->bufstr.len = 0;
- IF_NULL_THEN_ABORT(bufStr = gcstring_new(&unistr, lbobj));
- unistr.str = lbobj->bufspc.str;
- unistr.len = lbobj->bufspc.len;
- lbobj->bufspc.str = NULL;
- lbobj->bufspc.len = 0;
- IF_NULL_THEN_ABORT(bufSpc = gcstring_new(&unistr, lbobj));
- bufCols = lbobj->bufcols;
- /*
- * Indexes and flags
- * bBeg: Start of unbreakable text fragment.
- * bLen: Length of unbreakable text fragment.
- * bSpc: Length of trailing spaces.
- * urgEnd: End of substring broken by urgent breaking.
- *
- * ...read...| before :CM | spaces | after :CM |...unread...|
- * ^ ->bCM<- ^ ->aCM<- ^
- * |<-- bLen -->|<- bSpc ->| ^ |
- * bBeg candidate str->pos end of
- * breaking input
- * point
- * `read' positions shall never be read again.
- */
- bBeg = bLen = bCM = bSpc = aCM = urgEnd = 0;
- /* Result. */
- IF_NULL_THEN_ABORT(results = malloc(sizeof(gcstring_t **)));
- results[0] = NULL;
- while (1) {
- /***
- *** Chop off a pair of unbreakable character clusters from text.
- ***/
- int action = 0;
- propval_t lbc;
- double newcols;
- /* Go ahead reading input. */
- while (!gcstring_eos(str)) {
- lbc = gcstring_lbclass(str, str->pos);
- /**
- ** Append SP/ZW/eop to ``before'' buffer.
- **/
- switch (lbc) {
- /* - Explicit breaks and non-breaks */
- /* LB7(1): × SP+ */
- case LB_SP:
- gcstring_next(str);
- bSpc++;
- /* End of input. */
- continue; /* while (!gcstring_eos(str)) */
- /* - Mandatory breaks */
- /* LB4 - LB7: × SP* (BK | CR LF | CR | LF | NL) ! */
- case LB_BK:
- case LB_CR:
- case LB_LF:
- case LB_NL:
- gcstring_next(str);
- bSpc++;
- goto last_CHARACTER_PAIR; /* while (!gcstring_eos(str)) */
- /* - Explicit breaks and non-breaks */
- /* LB7(2): × (SP* ZW+)+ */
- case LB_ZW:
- gcstring_next(str);
- bLen += bSpc + 1;
- bCM = 0;
- bSpc = 0;
- /* End of input */
- continue; /* while (!gcstring_eos(str)) */
- }
- /**
- ** Then fill ``after'' buffer.
- **/
- gcstring_next(str);
- /* skip to end of unbreakable fragment by user/complex/urgent
- * breaking. */
- while (!gcstring_eos(str) && str->gcstr[str->pos].flag &
- LINEBREAK_FLAG_PROHIBIT_BEFORE)
- gcstring_next(str);
- /* - Combining marks */
- /* LB9: Treat X CM+ as if it were X
- * where X is anything except BK, CR, LF, NL, SP or ZW
- * (NB: Some CM characters may be single grapheme cluster
- * since they have Grapheme_Cluster_Break property Control.) */
- while (!gcstring_eos(str) &&
- gcstring_lbclass(str, str->pos) == LB_CM) {
- gcstring_next(str);
- aCM++;
- }
- /* - Start of text */
- /* LB2: sot × */
- if (0 < bLen || 0 < bSpc)
- break; /* while (!gcstring_eos(str)) */
- /* shift buffers. */
- bLen = str->pos - bBeg;
- bSpc = 0;
- bCM = aCM;
- aCM = 0;
- } /* while (!gcstring_eos(str)) */
- last_CHARACTER_PAIR:
- /***
- *** Determin line breaking action by classes of adjacent characters.
- ***/
- /* Mandatory break. */
- if (0 < bSpc &&
- (lbc = gcstring_lbclass(str, bBeg + bLen + bSpc - 1)) != LB_SP
- && (lbc != LB_CR || eot || !gcstring_eos(str))) {
- /* CR at end of input may be part of CR LF therefore not be eop. */
- action = LINEBREAK_ACTION_MANDATORY;
- /* LB11, LB12 and tailorable rules LB13 - LB31.
- * Or urgent breaking. */
- } else if (bBeg + bLen + bSpc < str->pos) {
- if (str->gcstr[bBeg + bLen + bSpc].flag &
- LINEBREAK_FLAG_ALLOW_BEFORE)
- action = LINEBREAK_ACTION_DIRECT;
- else if (str->gcstr[bBeg + bLen + bSpc].flag &
- LINEBREAK_FLAG_PROHIBIT_BEFORE)
- action = LINEBREAK_ACTION_PROHIBITED;
- else if (lbobj->options & LINEBREAK_OPTION_BREAK_INDENT &&
- bLen == 0 && 0 < bSpc)
- /* Allow break at sot or after breaking,
- * although rules don't tell it obviously. */
- action = LINEBREAK_ACTION_DIRECT;
- else {
- propval_t blbc, albc;
- size_t btail;
- if (bLen == 0)
- btail = bBeg + bSpc - 1; /* before buffer is SP only. */
- else
- btail = bBeg + bLen - bCM - 1; /* LB9 */
- blbc = gcstring_lbclass_ext(str, btail);
- switch (blbc) {
- /* (SG and XX are already resolved). */
- /* LB1: Resolve AI and CJ. */
- case LB_AI:
- blbc = (lbobj->options &
- LINEBREAK_OPTION_EASTASIAN_CONTEXT) ?
- LB_ID : LB_AL;
- break;
- case LB_CJ:
- blbc = (lbobj->options &
- LINEBREAK_OPTION_NONSTARTER_LOOSE) ?
- LB_ID : LB_NS;
- break;
- /* LB1: SA is resolved to AL. */
- case LB_SA:
- blbc = LB_AL;
- break;
- /* LB10: Treat any remaining CM+ as if it were AL. */
- case LB_CM:
- blbc = LB_AL;
- break;
- /* (As of 6.1.0): Treat HL as AL. */
- case LB_HL:
- blbc = LB_AL;
- break;
- /* Optionally, treat hangul syllable as if it were AL. */
- case LB_H2:
- case LB_H3:
- case LB_JL:
- case LB_JV:
- case LB_JT:
- if (lbobj->options & LINEBREAK_OPTION_HANGUL_AS_AL)
- blbc = LB_AL;
- break;
- }
- albc = gcstring_lbclass(str, bBeg + bLen + bSpc);
- switch (albc) {
- /* (SG and XX are already resolved). */
- /* LB1: Resolve AI and CJ. */
- case LB_AI:
- albc = (lbobj->options &
- LINEBREAK_OPTION_EASTASIAN_CONTEXT) ?
- LB_ID : LB_AL;
- break;
- case LB_CJ:
- albc = (lbobj->options &
- LINEBREAK_OPTION_NONSTARTER_LOOSE) ?
- LB_ID : LB_NS;
- break;
- /* LB1: SA is resolved to AL. */
- case LB_SA:
- albc = LB_AL;
- break;
- /* LB10: Treat any remaining CM+ as if it were AL. */
- case LB_CM:
- albc = LB_AL;
- break;
- /* (As of 6.1.0): Treat HL as AL. */
- case LB_HL:
- albc = LB_AL;
- break;
- /* Optionally, treat hangul syllable as if it were AL. */
- case LB_H2:
- case LB_H3:
- case LB_JL:
- case LB_JV:
- case LB_JT:
- if (lbobj->options & LINEBREAK_OPTION_HANGUL_AS_AL)
- albc = LB_AL;
- break;
- }
- action = _lbruleinfo(blbc, albc);
- }
- /* Check prohibited break. */
- if (action == LINEBREAK_ACTION_PROHIBITED ||
- (action == LINEBREAK_ACTION_INDIRECT && bSpc == 0)) {
- /* When conjunction is expected to exceed charmax,
- * try urgent breaking. */
- if (urgEnd < bBeg + bLen + bSpc &&
- 0 < lbobj->charmax &&
- lbobj->charmax < str->gcstr[str->pos - 1].idx +
- str->gcstr[str->pos - 1].len - str->gcstr[bBeg].idx) {
- size_t charmax, chars;
- IF_NULL_THEN_ABORT(s = gcstring_substr(str, bBeg,
- str->pos -
- bBeg));
- IF_NULL_THEN_ABORT(broken = _urgent_break(lbobj, s));
- gcstring_DESTROY(s);
- /* If any of urgently broken fragments still
- * exceed CharactersMax, force chop them. */
- charmax = lbobj->charmax;
- broken->pos = 0;
- chars = gcstring_next(broken)->len;
- while (!gcstring_eos(broken)) {
- if (broken->gcstr[broken->pos].flag &
- LINEBREAK_FLAG_ALLOW_BEFORE)
- chars = 0;
- else if (charmax <
- chars + broken->gcstr[broken->pos].len) {
- broken->gcstr[broken->pos].flag |=
- LINEBREAK_FLAG_ALLOW_BEFORE;
- chars = 0;
- } else
- chars += broken->gcstr[broken->pos].len;
- gcstring_next(broken);
- } /* while (!gcstring_eos(broken)) */
- urgEnd = broken->gclen;
- gcstring_replace(str, 0, str->pos, broken);
- gcstring_DESTROY(broken);
- str->pos = 0;
- bBeg = bLen = bCM = bSpc = aCM = 0;
- continue; /* while (1) */
- }
- /* if (urgEnd < ...) */
- /* Otherwise, fragments may be conjuncted safely. Read more. */
- bLen = str->pos - bBeg;
- bSpc = 0;
- bCM = aCM;
- aCM = 0;
- continue; /* while (1) */
- } /* if (action == ...) */
- } /* if (0 < bSpc && ...) */
- /***
- *** Check end of input.
- ***/
- if (!eot && str->gclen <= bBeg + bLen + bSpc) {
- /* Save status then output partial result. */
- lbobj->bufstr.str = bufStr->str;
- lbobj->bufstr.len = bufStr->len;
- bufStr->str = NULL;
- bufStr->len = 0;
- gcstring_DESTROY(bufStr);
- lbobj->bufspc.str = bufSpc->str;
- lbobj->bufspc.len = bufSpc->len;
- bufSpc->str = NULL;
- bufSpc->len = 0;
- gcstring_DESTROY(bufSpc);
- lbobj->bufcols = bufCols;
- s = gcstring_substr(str, bBeg, str->gclen - bBeg);
- lbobj->unread.str = s->str;
- lbobj->unread.len = s->len;
- s->str = NULL;
- s->len = 0;
- gcstring_DESTROY(s);
- lbobj->state = state;
- /* clenup. */
- gcstring_DESTROY(str);
- if (lenp != NULL)
- *lenp = reslen;
- return results;
- }
- /* After all, possible actions are MANDATORY and arbitrary. */
- /***
- *** Examine line breaking action
- ***/
- IF_NULL_THEN_ABORT(beforeFrg = gcstring_substr(str, bBeg, bLen));
- if (state == LINEBREAK_STATE_NONE) { /* sot undone. */
- /* Process start of text. */
- IF_NULL_THEN_ABORT(fmt = _format(lbobj, LINEBREAK_STATE_SOT,
- beforeFrg));
- if (gcstring_cmp(beforeFrg, fmt) != 0) {
- s = gcstring_substr(str, bBeg + bLen, bSpc);
- gcstring_append(fmt, s);
- gcstring_DESTROY(s);
- s = gcstring_substr(str, bBeg + bLen + bSpc,
- str->pos - (bBeg + bLen + bSpc));
- gcstring_append(fmt, s);
- gcstring_DESTROY(s);
- gcstring_replace(str, 0, str->pos, fmt);
- str->pos = 0;
- bBeg = bLen = bCM = bSpc = aCM = 0;
- urgEnd = 0;
- state = LINEBREAK_STATE_SOT_FORMAT;
- gcstring_DESTROY(fmt);
- gcstring_DESTROY(beforeFrg);
- continue; /* while (1) */
- }
- gcstring_DESTROY(fmt);
- state = LINEBREAK_STATE_SOL;
- } else if (state == LINEBREAK_STATE_SOT_FORMAT)
- state = LINEBREAK_STATE_SOL;
- else if (state == LINEBREAK_STATE_SOT) { /* sop undone. */
- /* Process start of paragraph. */
- IF_NULL_THEN_ABORT(fmt = _format(lbobj, LINEBREAK_STATE_SOP,
- beforeFrg));
- if (gcstring_cmp(beforeFrg, fmt) != 0) {
- s = gcstring_substr(str, bBeg + bLen, bSpc);
- gcstring_append(fmt, s);
- gcstring_DESTROY(s);
- s = gcstring_substr(str, bBeg + bLen + bSpc,
- str->pos - (bBeg + bLen + bSpc));
- gcstring_append(fmt, s);
- gcstring_DESTROY(s);
- gcstring_replace(str, 0, str->pos, fmt);
- str->pos = 0;
- bBeg = bLen = bCM = bSpc = aCM = 0;
- urgEnd = 0;
- state = LINEBREAK_STATE_SOP_FORMAT;
- gcstring_DESTROY(fmt);
- gcstring_DESTROY(beforeFrg);
- continue; /* while (1) */
- }
- gcstring_DESTROY(fmt);
- state = LINEBREAK_STATE_SOP;
- } else if (state == LINEBREAK_STATE_SOP_FORMAT)
- state = LINEBREAK_STATE_SOP;
- /***
- *** Check if arbitrary break is needed.
- ***/
- newcols = _sizing(lbobj, bufCols, bufStr, bufSpc, beforeFrg);
- if (newcols < 0.0) {
- IF_NULL_THEN_ABORT(NULL);
- }
- if (0 < lbobj->colmax && lbobj->colmax < newcols) {
- newcols = _sizing(lbobj, 0.0, &empty, &empty, beforeFrg);
- if (newcols < 0.0) {
- IF_NULL_THEN_ABORT(NULL);
- }
- /**
- ** When arbitrary break is expected to generate a line shorter
- ** than colmin or, beforeFrg will exceed colmax, try urgent
- ** breaking.
- **/
- if (urgEnd < bBeg + bLen + bSpc) {
- broken = NULL;
- if (0.0 < bufCols && bufCols < lbobj->colmin) {
- gcstring_replace(beforeFrg, 0, 0, bufSpc);
- gcstring_replace(beforeFrg, 0, 0, bufStr);
- gcstring_shrink(bufSpc, 0);
- gcstring_shrink(bufStr, 0);
- bufCols = 0.0;
- IF_NULL_THEN_ABORT(broken = _urgent_break(lbobj,
- beforeFrg));
- } else if (lbobj->colmax < newcols) {
- IF_NULL_THEN_ABORT(broken = _urgent_break(lbobj,
- beforeFrg));
- }
- if (broken != NULL) {
- s = gcstring_substr(str, bBeg + bLen, bSpc);
- gcstring_append(broken, s);
- gcstring_DESTROY(s);
- gcstring_replace(str, 0, bBeg + bLen + bSpc, broken);
- str->pos = 0;
- urgEnd = broken->gclen;
- bBeg = bLen = bCM = bSpc = aCM = 0;
- gcstring_DESTROY(broken);
- gcstring_DESTROY(beforeFrg);
- continue; /* while (1) */
- }
- }
- /**
- ** Otherwise, process arbitrary break.
- **/
- if (bufStr->len || bufSpc->len) {
- gcstring_t **r;
- IF_NULL_THEN_ABORT(r = realloc(results,
- sizeof(gcstring_t *) *
- (reslen + 2)));
- (results = r)[reslen + 1] = NULL;
- IF_NULL_THEN_ABORT(s = _format(lbobj, LINEBREAK_STATE_LINE,
- bufStr));
- IF_NULL_THEN_ABORT(t = _format(lbobj, LINEBREAK_STATE_EOL,
- bufSpc));
- IF_NULL_THEN_ABORT(results[reslen] =
- gcstring_concat(s, t));
- reslen++;
- gcstring_DESTROY(s);
- gcstring_DESTROY(t);
- IF_NULL_THEN_ABORT(fmt =
- _format(lbobj, LINEBREAK_STATE_SOL,
- beforeFrg));
- if (gcstring_cmp(beforeFrg, fmt) != 0) {
- gcstring_DESTROY(beforeFrg);
- beforeFrg = fmt;
- newcols =
- _sizing(lbobj, 0.0, &empty, &empty, beforeFrg);
- if (newcols < 0.0) {
- IF_NULL_THEN_ABORT(NULL);
- }
- } else
- gcstring_DESTROY(fmt);
- }
- gcstring_shrink(bufStr, 0);
- gcstring_append(bufStr, beforeFrg);
- gcstring_shrink(bufSpc, 0);
- s = gcstring_substr(str, bBeg + bLen, bSpc);
- gcstring_append(bufSpc, s);
- gcstring_DESTROY(s);
- bufCols = newcols;
- /***
- *** Arbitrary break is not needed.
- ***/
- } else {
- gcstring_append(bufStr, bufSpc);
- gcstring_append(bufStr, beforeFrg);
- gcstring_shrink(bufSpc, 0);
- s = gcstring_substr(str, bBeg + bLen, bSpc);
- gcstring_append(bufSpc, s);
- gcstring_DESTROY(s);
- bufCols = newcols;
- } /* if (0 < lbobj->colmax ... ) */
- gcstring_DESTROY(beforeFrg);
- /***
- *** Mandatory break or end-of-text.
- ***/
- if (eot && str->gclen <= bBeg + bLen + bSpc)
- break; /* while (1) */
- if (action == LINEBREAK_ACTION_MANDATORY) {
- /* Process mandatory break. */
- gcstring_t **r;
- IF_NULL_THEN_ABORT(r = realloc(results,
- sizeof(gcstring_t *) *
- (reslen + 2)));
- (results = r)[reslen + 1] = NULL;
- IF_NULL_THEN_ABORT(s = _format(lbobj, LINEBREAK_STATE_LINE,
- bufStr));
- IF_NULL_THEN_ABORT(t = _format(lbobj, LINEBREAK_STATE_EOP,
- bufSpc));
- IF_NULL_THEN_ABORT(results[reslen] = gcstring_concat(s, t));
- reslen++;
- gcstring_DESTROY(s);
- gcstring_DESTROY(t);
- /* eop done then sop must be carried out. */
- state = LINEBREAK_STATE_SOT;
- gcstring_shrink(bufStr, 0);
- gcstring_shrink(bufSpc, 0);
- bufCols = 0.0;
- }
- /***
- *** Shift buffers.
- ***/
- bBeg += bLen + bSpc;
- bLen = str->pos - bBeg;
- bSpc = 0;
- bCM = aCM;
- aCM = 0;
- } /* while (1) */
- /***
- *** Process end of text.
- ***/
- {
- gcstring_t **r;
- IF_NULL_THEN_ABORT(r = realloc(results,
- sizeof(gcstring_t *) * (reslen +
- 2)));
- (results = r)[reslen + 1] = NULL;
- IF_NULL_THEN_ABORT(s =
- _format(lbobj, LINEBREAK_STATE_LINE, bufStr));
- IF_NULL_THEN_ABORT(t =
- _format(lbobj, LINEBREAK_STATE_EOT, bufSpc));
- IF_NULL_THEN_ABORT(results[reslen] = gcstring_concat(s, t));
- reslen++;
- gcstring_DESTROY(s);
- gcstring_DESTROY(t);
- }
- /* clenup. */
- gcstring_DESTROY(str);
- gcstring_DESTROY(bufStr);
- gcstring_DESTROY(bufSpc);
- /* Reset status then return the rest of result. */
- linebreak_reset(lbobj);
- if (lenp != NULL)
- *lenp = reslen;
- return results;
- }
- gcstring_t **linebreak_break_partial(linebreak_t * lbobj, unistr_t * input)
- {
- return _break_partial(lbobj, input, NULL, (input == NULL));
- }
- /**
- * Perform line breaking algorithm on complete input.
- *
- * This function will consume heap size proportional to input size.
- * linebreak_break() is highly recommended.
- *
- * @param[in] lbobj linebreak object.
- * @param[in] input Unicode string.
- * @return array of broken grapheme cluster strings terminated by NULL.
- * If internal error occurred, lbobj->errnum is set then NULL is returned.
- */
- gcstring_t **linebreak_break_fast(linebreak_t * lbobj, unistr_t * input)
- {
- gcstring_t **ret;
- if (input == NULL) {
- if ((ret = malloc(sizeof(gcstring_t *))) == NULL)
- lbobj->errnum = errno ? errno : ENOMEM;
- else
- ret[0] = NULL;
- return ret;
- }
- return _break_partial(lbobj, input, NULL, 1);
- }
- #define PARTIAL_LENGTH (1000) /* number of unichar_t elements linebreak_break() hands over per non-final chunk */
- /** Perform line breaking algorithm on complete input.
- *
- * This function will consume constant size of heap.
- *
- * @param[in] lbobj linebreak object.
- * @param[in] input Unicode string.
- * @return array of broken grapheme cluster strings terminated by NULL.
- * If internal error occurred, lbobj->errnum is set then NULL is returned.
- */
- gcstring_t **linebreak_break(linebreak_t * lbobj, unistr_t * input)
- {
- unistr_t unistr = { NULL, 0 };
- gcstring_t **ret, **appe, **r;
- size_t i, j, k, retlen, appelen;
- if ((ret = malloc(sizeof(gcstring_t *))) == NULL) {
- lbobj->errnum = errno ? errno : ENOMEM;
- return NULL;
- } else
- ret[0] = NULL;
- if (input == NULL)
- return ret;
- retlen = 0;
- unistr.len = PARTIAL_LENGTH;
- for (k = 0; PARTIAL_LENGTH < input->len - k; k += PARTIAL_LENGTH) {
- unistr.str = input->str + k;
- if ((appe = _break_partial(lbobj, &unistr, &appelen, 0)) == NULL) {
- for (i = 0; i < retlen; i++)
- gcstring_destroy(ret[i]);
- free(ret);
- return NULL;
- }
- if (appelen) {
- if ((r = realloc(ret,
- sizeof(gcstring_t *) *
- (retlen + appelen + 1))) == NULL) {
- lbobj->errnum = errno ? errno : ENOMEM;
- for (i = 0; i < retlen; i++)
- gcstring_destroy(ret[i]);
- free(ret);
- for (j = 0; j < appelen; j++)
- gcstring_destroy(appe[j]);
- free(appe);
- return NULL;
- } else
- ret = r;
- memcpy(ret + retlen, appe,
- sizeof(gcstring_t *) * (appelen + 1));
- retlen += appelen;
- }
- free(appe);
- }
- unistr.len = input->len - k;
- unistr.str = input->str + k;
- if (k < input->len) {
- if ((appe = _break_partial(lbobj, &unistr, &appelen, 1)) == NULL) {
- for (i = 0; i < retlen; i++)
- gcstring_destroy(ret[i]);
- free(ret);
- return NULL;
- }
- if (appelen) {
- if ((r = realloc(ret,
- sizeof(gcstring_t *) *
- (retlen + appelen + 1))) == NULL) {
- lbobj->errnum = errno ? errno : ENOMEM;
- for (i = 0; i < retlen; i++)
- gcstring_destroy(ret[i]);
- free(ret);
- for (j = 0; j < appelen; j++)
- gcstring_destroy(appe[j]);
- free(appe);
- return NULL;
- } else
- ret = r;
- memcpy(ret + retlen, appe,
- sizeof(gcstring_t *) * (appelen + 1));
- retlen += appelen;
- }
- free(appe);
- }
- return ret;
- }
- /** Perform line breaking algorithm on UTF-8 text
- *
- * This function will consume constant size of heap.
- *
- * @param[in] lbobj linebreak object.
- * @param[in] input UTF-8 string, must not be NULL.
- * @param[in] len length of UTF-8 string.
- * @param[in] check check input. See sombok_decode_utf8().
- * @return array of broken grapheme cluster strings terminated by NULL.
- * If internal error occurred, lbobj->errnum is set then NULL is returned.
- */
- gcstring_t **linebreak_break_from_utf8(linebreak_t * lbobj,
- char *input, size_t len, int check)
- {
- unistr_t unistr = { NULL, 0 };
- gcstring_t **ret;
- if (input == NULL) {
- lbobj->errnum = EINVAL;
- return NULL;
- }
- if (sombok_decode_utf8(&unistr, 0, input, len, check) == NULL)
- return NULL;
- ret = linebreak_break(lbobj, &unistr);
- free(unistr.str);
- return ret;
- }
- void linebreak_free_result(gcstring_t ** result, int deep)
- {
- size_t i;
- if (result == NULL)
- return;
- if (deep)
- for (i = 0; result[i] != NULL; i++)
- gcstring_destroy(result[i]);
- free(result);
- }