PageRenderTime 62ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/Main/Libraries/Hunspell/hyphen/hyphen.cpp

#
C++ | 1074 lines | 854 code | 103 blank | 117 comment | 283 complexity | 328e54392e703a35d4c58e018132dee4 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception
  1. /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
  2. * licenses follows.
  3. */
  4. /* LibHnj - a library for high quality hyphenation and justification
  5. * Copyright (C) 1998 Raph Levien,
  6. * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
  7. * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
  8. * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
  9. *
  10. * This library is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU Library General Public
  12. * License as published by the Free Software Foundation; either
  13. * version 2 of the License, or (at your option) any later version.
  14. *
  15. * This library is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * Library General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Library General Public
  21. * License along with this library; if not, write to the
  22. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23. * Boston, MA 02111-1307 USA.
  24. */
  25. /*
  26. * The contents of this file are subject to the Mozilla Public License
  27. * Version 1.0 (the "MPL"); you may not use this file except in
  28. * compliance with the MPL. You may obtain a copy of the MPL at
  29. * http://www.mozilla.org/MPL/
  30. *
  31. * Software distributed under the MPL is distributed on an "AS IS" basis,
  32. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
  33. * for the specific language governing rights and limitations under the
  34. * MPL.
  35. *
  36. */
  37. #include <stdlib.h> /* for NULL, malloc */
  38. #include <stdio.h> /* for fprintf */
  39. #include <string.h> /* for strdup */
  40. #include "../NHunspellExtensions.h"
  41. #ifdef UNX
  42. #include <unistd.h> /* for exit */
  43. #endif
  44. #define noVERBOSE
  45. /* calculate hyphenmin values with long ligature length (2 or 3 characters
  46. * instead of 1 or 2) for comparison with hyphenation without ligatures */
  47. #define noLONG_LIGATURE
  48. #ifdef LONG_LIGATURE
  49. #define LIG_xx 1
  50. #define LIG_xxx 2
  51. #else
  52. #define LIG_xx 0
  53. #define LIG_xxx 1
  54. #endif
  55. #include "hnjalloc.h"
  56. #include "hyphen.h"
  /* Return a copy of the NUL-terminated string s in storage obtained from
     hnj_malloc().  The copy belongs to the caller, who releases it with
     hnj_free(). */
  static char *
  hnj_strdup (const char *s)
  {
    /* strlen() yields a size_t; keeping it in that type avoids the
       narrowing the previous int length variable introduced. */
    size_t len = strlen (s);
    char *copy = (char *) hnj_malloc (len + 1);

    /* NOTE(review): assumes hnj_malloc() does not return NULL - confirm
       against its definition. */
    memcpy (copy, s, len + 1);   /* len + 1 also copies the terminator */
    return copy;
  }
  /* Remove a cross-platform line end from the tail of s, in place.
     A trailing '\r' or '\n' is cut off; if one was cut, a '\r' directly in
     front of it is cut as well (so "\r\n" disappears completely).
     Strings that do not end in a line-end character are left unchanged,
     as are NULL and the empty string. */
  void hnj_strchomp(char * s)
  {
    size_t len;

    if (s == NULL) return;
    len = strlen(s);
    if (len == 0) return;

    /* Nothing to strip unless the last character is a line end.  The old
       code fell through here and could truncate "a\rb" to "a". */
    if (s[len - 1] != '\r' && s[len - 1] != '\n') return;
    s[len - 1] = '\0';

    if (len > 1 && s[len - 2] == '\r') s[len - 2] = '\0';
  }
  /* Minimal chained hash table used while a dictionary is being loaded:
     it maps a string to the number of the state that represents it. */

  /* Number of buckets; fixed at compile time. */
  #define HASH_SIZE 31627

  typedef struct _HashEntry HashEntry;
  typedef struct _HashTab HashTab;

  /* One key/value pair, linked into its bucket's list. */
  struct _HashEntry {
    HashEntry *next;   /* next entry in the same bucket, or NULL */
    char *key;         /* the string this entry stands for */
    int val;           /* state number stored for key */
  };

  /* The table itself: one list head per bucket. */
  struct _HashTab {
    HashEntry *entries[HASH_SIZE];
  };
  /* String hash (the ASU function, in the form adapted from Gtk+).
     Returns the unreduced hash; callers take it modulo the bucket count. */
  static unsigned int
  hnj_string_hash (const char *s)
  {
    unsigned int hash = 0;
    size_t idx;

    for (idx = 0; s[idx] != '\0'; idx++) {
      unsigned int top;

      hash = (hash << 4) + s[idx];
      /* fold the uppermost four bits back into the low part */
      top = hash & 0xf0000000;
      if (top != 0) {
        hash ^= top >> 24;
        hash ^= top;
      }
    }
    return hash;
  }
  104. static HashTab *
  105. hnj_hash_new (void)
  106. {
  107. HashTab *hashtab;
  108. int i;
  109. hashtab = (HashTab *) hnj_malloc (sizeof(HashTab));
  110. for (i = 0; i < HASH_SIZE; i++)
  111. hashtab->entries[i] = NULL;
  112. return hashtab;
  113. }
  114. static void
  115. hnj_hash_free (HashTab *hashtab)
  116. {
  117. int i;
  118. HashEntry *e, *next;
  119. for (i = 0; i < HASH_SIZE; i++)
  120. for (e = hashtab->entries[i]; e; e = next)
  121. {
  122. next = e->next;
  123. hnj_free (e->key);
  124. hnj_free (e);
  125. }
  126. hnj_free (hashtab);
  127. }
  128. /* assumes that key is not already present! */
  129. static void
  130. hnj_hash_insert (HashTab *hashtab, const char *key, int val)
  131. {
  132. int i;
  133. HashEntry *e;
  134. i = hnj_string_hash (key) % HASH_SIZE;
  135. e = (HashEntry *) hnj_malloc (sizeof(HashEntry));
  136. e->next = hashtab->entries[i];
  137. e->key = hnj_strdup (key);
  138. e->val = val;
  139. hashtab->entries[i] = e;
  140. }
  141. /* return val if found, otherwise -1 */
  142. static int
  143. hnj_hash_lookup (HashTab *hashtab, const char *key)
  144. {
  145. int i;
  146. HashEntry *e;
  147. i = hnj_string_hash (key) % HASH_SIZE;
  148. for (e = hashtab->entries[i]; e; e = e->next)
  149. if (!strcmp (key, e->key))
  150. return e->val;
  151. return -1;
  152. }
  153. /* Get the state number, allocating a new state if necessary. */
  154. static int
  155. hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
  156. {
  157. int state_num;
  158. state_num = hnj_hash_lookup (hashtab, string);
  159. if (state_num >= 0)
  160. return state_num;
  161. hnj_hash_insert (hashtab, string, dict->num_states);
  162. /* predicate is true if dict->num_states is a power of two */
  163. if (!(dict->num_states & (dict->num_states - 1)))
  164. {
  165. dict->states = (HyphenState *) hnj_realloc (dict->states,
  166. (dict->num_states << 1) *
  167. sizeof(HyphenState));
  168. }
  169. dict->states[dict->num_states].match = NULL;
  170. dict->states[dict->num_states].repl = NULL;
  171. dict->states[dict->num_states].fallback_state = -1;
  172. dict->states[dict->num_states].num_trans = 0;
  173. dict->states[dict->num_states].trans = NULL;
  174. return dict->num_states++;
  175. }
  176. /* add a transition from state1 to state2 through ch - assumes that the
  177. transition does not already exist */
  178. static void
  179. hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
  180. {
  181. int num_trans;
  182. num_trans = dict->states[state1].num_trans;
  183. if (num_trans == 0)
  184. {
  185. dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans));
  186. }
  187. else if (!(num_trans & (num_trans - 1)))
  188. {
  189. dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans,
  190. (num_trans << 1) *
  191. sizeof(HyphenTrans));
  192. }
  193. dict->states[state1].trans[num_trans].ch = ch;
  194. dict->states[state1].trans[num_trans].new_state = state2;
  195. dict->states[state1].num_trans++;
  196. }
  #ifdef VERBOSE
  /* Debug builds only: the hash table of the dictionary being loaded. */
  HashTab *global;

  /* Debug helper: find the string whose state number is state by scanning
     every bucket of the global table.  Returns NULL when there is none. */
  static char *
  get_state_str (int state)
  {
    int bucket;

    for (bucket = 0; bucket < HASH_SIZE; bucket++) {
      HashEntry *node;

      for (node = global->entries[bucket]; node != NULL; node = node->next)
        if (node->val == state)
          return node->key;
    }
    return NULL;
  }
  #endif
  211. HyphenDict *
  212. hnj_hyphen_load (const char *fn)
  213. {
  214. HyphenDict *dict[2];
  215. HashTab *hashtab;
  216. FILE *f;
  217. char buf[MAX_CHARS];
  218. char word[MAX_CHARS];
  219. char pattern[MAX_CHARS];
  220. char * repl;
  221. signed char replindex;
  222. signed char replcut;
  223. int state_num = 0, last_state;
  224. int i, j, k;
  225. char ch;
  226. int found;
  227. HashEntry *e;
  228. int nextlevel = 0;
  229. f = fopen (fn, "r");
  230. if (f == NULL)
  231. return NULL;
  232. // loading one or two dictionaries (separated by NEXTLEVEL keyword)
  233. for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
  234. hashtab = hnj_hash_new ();
  235. #ifdef VERBOSE
  236. global = hashtab;
  237. #endif
  238. hnj_hash_insert (hashtab, "", 0);
  239. dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
  240. dict[k]->num_states = 1;
  241. dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
  242. dict[k]->states[0].match = NULL;
  243. dict[k]->states[0].repl = NULL;
  244. dict[k]->states[0].fallback_state = -1;
  245. dict[k]->states[0].num_trans = 0;
  246. dict[k]->states[0].trans = NULL;
  247. dict[k]->nextlevel = NULL;
  248. dict[k]->lhmin = 0;
  249. dict[k]->rhmin = 0;
  250. dict[k]->clhmin = 0;
  251. dict[k]->crhmin = 0;
  252. /* read in character set info */
  253. if (k == 0) {
  254. for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
  255. fgets(dict[k]->cset, sizeof(dict[k]->cset),f);
  256. for (i=0;i<MAX_NAME;i++)
  257. if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
  258. dict[k]->cset[i] = 0;
  259. dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
  260. } else {
  261. strcpy(dict[k]->cset, dict[0]->cset);
  262. dict[k]->utf8 = dict[0]->utf8;
  263. }
  264. while (fgets (buf, sizeof(buf), f) != NULL)
  265. {
  266. if (buf[0] != '%')
  267. {
  268. if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
  269. nextlevel = 1;
  270. break;
  271. } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
  272. dict[k]->lhmin = atoi(buf + 13);
  273. continue;
  274. } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
  275. dict[k]->rhmin = atoi(buf + 14);
  276. continue;
  277. } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
  278. dict[k]->clhmin = atoi(buf + 21);
  279. continue;
  280. } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
  281. dict[k]->crhmin = atoi(buf + 22);
  282. continue;
  283. }
  284. j = 0;
  285. pattern[j] = '0';
  286. repl = strchr(buf, '/');
  287. replindex = 0;
  288. replcut = 0;
  289. if (repl) {
  290. char * index = strchr(repl + 1, ',');
  291. *repl = '\0';
  292. if (index) {
  293. char * index2 = strchr(index + 1, ',');
  294. *index = '\0';
  295. if (index2) {
  296. *index2 = '\0';
  297. replindex = (signed char) atoi(index + 1) - 1;
  298. replcut = (signed char) atoi(index2 + 1);
  299. }
  300. } else {
  301. hnj_strchomp(repl + 1);
  302. replindex = 0;
  303. replcut = (signed char) strlen(buf);
  304. }
  305. repl = hnj_strdup(repl + 1);
  306. }
  307. for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
  308. {
  309. if (buf[i] >= '0' && buf[i] <= '9')
  310. pattern[j] = buf[i];
  311. else
  312. {
  313. word[j] = buf[i];
  314. pattern[++j] = '0';
  315. }
  316. }
  317. word[j] = '\0';
  318. pattern[j + 1] = '\0';
  319. i = 0;
  320. if (!repl) {
  321. /* Optimize away leading zeroes */
  322. for (; pattern[i] == '0'; i++);
  323. } else {
  324. if (*word == '.') i++;
  325. /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
  326. if (dict[k]->utf8) {
  327. int pu = -1; /* unicode character position */
  328. int ps = -1; /* unicode start position (original replindex) */
  329. size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */
  330. for (; pc < (strlen(word) + 1); pc++) {
  331. /* beginning of an UTF-8 character (not '10' start bits) */
  332. if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
  333. if ((ps < 0) && (replindex == pu)) {
  334. ps = replindex;
  335. replindex = (signed char) pc;
  336. }
  337. if ((ps >= 0) && ((pu - ps) == replcut)) {
  338. replcut = (signed char) (pc - replindex);
  339. break;
  340. }
  341. }
  342. if (*word == '.') replindex--;
  343. }
  344. }
  345. #ifdef VERBOSE
  346. printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
  347. #endif
  348. found = hnj_hash_lookup (hashtab, word);
  349. state_num = hnj_get_state (dict[k], hashtab, word);
  350. dict[k]->states[state_num].match = hnj_strdup (pattern + i);
  351. dict[k]->states[state_num].repl = repl;
  352. dict[k]->states[state_num].replindex = replindex;
  353. if (!replcut) {
  354. dict[k]->states[state_num].replcut = (signed char) strlen(word);
  355. } else {
  356. dict[k]->states[state_num].replcut = replcut;
  357. }
  358. /* now, put in the prefix transitions */
  359. for (; found < 0 ;j--)
  360. {
  361. last_state = state_num;
  362. ch = word[j - 1];
  363. word[j - 1] = '\0';
  364. found = hnj_hash_lookup (hashtab, word);
  365. state_num = hnj_get_state (dict[k], hashtab, word);
  366. hnj_add_trans (dict[k], state_num, last_state, ch);
  367. }
  368. }
  369. }
  370. /* Could do unioning of matches here (instead of the preprocessor script).
  371. If we did, the pseudocode would look something like this:
  372. foreach state in the hash table
  373. foreach i = [1..length(state) - 1]
  374. state to check is substr (state, i)
  375. look it up
  376. if found, and if there is a match, union the match in.
  377. It's also possible to avoid the quadratic blowup by doing the
  378. search in order of increasing state string sizes - then you
  379. can break the loop after finding the first match.
  380. This step should be optional in any case - if there is a
  381. preprocessed rule table, it's always faster to use that.
  382. */
  383. /* put in the fallback states */
  384. for (i = 0; i < HASH_SIZE; i++)
  385. for (e = hashtab->entries[i]; e; e = e->next)
  386. {
  387. if (*(e->key)) for (j = 1; 1; j++)
  388. {
  389. state_num = hnj_hash_lookup (hashtab, e->key + j);
  390. if (state_num >= 0)
  391. break;
  392. }
  393. /* KBH: FIXME state 0 fallback_state should always be -1? */
  394. if (e->val)
  395. dict[k]->states[e->val].fallback_state = state_num;
  396. }
  397. #ifdef VERBOSE
  398. for (i = 0; i < HASH_SIZE; i++)
  399. for (e = hashtab->entries[i]; e; e = e->next)
  400. {
  401. printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
  402. dict[k]->states[e->val].fallback_state);
  403. for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
  404. printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
  405. dict[k]->states[e->val].trans[j].new_state);
  406. }
  407. #endif
  408. #ifndef VERBOSE
  409. hnj_hash_free (hashtab);
  410. #endif
  411. state_num = 0;
  412. }
  413. fclose(f);
  414. if (k == 2) dict[0]->nextlevel = dict[1];
  415. return dict[0];
  416. }
  417. void hnj_hyphen_free (HyphenDict *dict)
  418. {
  419. int state_num;
  420. HyphenState *hstate;
  421. for (state_num = 0; state_num < dict->num_states; state_num++)
  422. {
  423. hstate = &dict->states[state_num];
  424. if (hstate->match)
  425. hnj_free (hstate->match);
  426. if (hstate->repl)
  427. hnj_free (hstate->repl);
  428. if (hstate->trans)
  429. hnj_free (hstate->trans);
  430. }
  431. if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
  432. hnj_free (dict->states);
  433. hnj_free (dict);
  434. }
  435. #define MAX_WORD 256
  436. int hnj_hyphen_hyphenate (HyphenDict *dict,
  437. const char *word, int word_size,
  438. char *hyphens)
  439. {
  440. char prep_word_buf[MAX_WORD];
  441. char *prep_word;
  442. int i, j, k;
  443. int state;
  444. char ch;
  445. HyphenState *hstate;
  446. char *match;
  447. int offset;
  448. if (word_size + 3 < MAX_WORD)
  449. prep_word = prep_word_buf;
  450. else
  451. prep_word = (char *) hnj_malloc (word_size + 3);
  452. j = 0;
  453. prep_word[j++] = '.';
  454. for (i = 0; i < word_size; i++)
  455. prep_word[j++] = word[i];
  456. prep_word[j++] = '.';
  457. prep_word[j] = '\0';
  458. for (i = 0; i < word_size + 5; i++)
  459. hyphens[i] = '0';
  460. #ifdef VERBOSE
  461. printf ("prep_word = %s\n", prep_word);
  462. #endif
  463. /* now, run the finite state machine */
  464. state = 0;
  465. for (i = 0; i < j; i++)
  466. {
  467. ch = prep_word[i];
  468. for (;;)
  469. {
  470. if (state == -1) {
  471. /* return 1; */
  472. /* KBH: FIXME shouldn't this be as follows? */
  473. state = 0;
  474. goto try_next_letter;
  475. }
  476. #ifdef VERBOSE
  477. char *state_str;
  478. state_str = get_state_str (state);
  479. for (k = 0; k < i - strlen (state_str); k++)
  480. putchar (' ');
  481. printf ("%s", state_str);
  482. #endif
  483. hstate = &dict->states[state];
  484. for (k = 0; k < hstate->num_trans; k++)
  485. if (hstate->trans[k].ch == ch)
  486. {
  487. state = hstate->trans[k].new_state;
  488. goto found_state;
  489. }
  490. state = hstate->fallback_state;
  491. #ifdef VERBOSE
  492. printf (" falling back, fallback_state %d\n", state);
  493. #endif
  494. }
  495. found_state:
  496. #ifdef VERBOSE
  497. printf ("found state %d\n",state);
  498. #endif
  499. /* Additional optimization is possible here - especially,
  500. elimination of trailing zeroes from the match. Leading zeroes
  501. have already been optimized. */
  502. match = dict->states[state].match;
  503. /* replacing rules not handled by hyphen_hyphenate() */
  504. if (match && !dict->states[state].repl)
  505. {
  506. offset = i + 1 - strlen (match);
  507. #ifdef VERBOSE
  508. for (k = 0; k < offset; k++)
  509. putchar (' ');
  510. printf ("%s\n", match);
  511. #endif
  512. /* This is a linear search because I tried a binary search and
  513. found it to be just a teeny bit slower. */
  514. for (k = 0; match[k]; k++)
  515. if (hyphens[offset + k] < match[k])
  516. hyphens[offset + k] = match[k];
  517. }
  518. /* KBH: we need this to make sure we keep looking in a word */
  519. /* for patterns even if the current character is not known in state 0 */
  520. /* since patterns for hyphenation may occur anywhere in the word */
  521. try_next_letter: ;
  522. }
  523. #ifdef VERBOSE
  524. for (i = 0; i < j; i++)
  525. putchar (hyphens[i]);
  526. putchar ('\n');
  527. #endif
  528. for (i = 0; i < j - 4; i++)
  529. #if 0
  530. if (hyphens[i + 1] & 1)
  531. hyphens[i] = '-';
  532. #else
  533. hyphens[i] = hyphens[i + 1];
  534. #endif
  535. hyphens[0] = '0';
  536. for (; i < word_size; i++)
  537. hyphens[i] = '0';
  538. hyphens[word_size] = '\0';
  539. if (prep_word != prep_word_buf)
  540. hnj_free (prep_word);
  541. return 0;
  542. }
  543. /* Unicode ligature length */
  544. int hnj_ligature(unsigned char c) {
  545. switch (c) {
  546. case 0x80: /* ff */
  547. case 0x81: /* fi */
  548. case 0x82: return LIG_xx; /* fl */
  549. case 0x83: /* ffi */
  550. case 0x84: return LIG_xxx; /* ffl */
  551. case 0x85: /* long st */
  552. case 0x86: return LIG_xx; /* st */
  553. }
  554. return 0;
  555. }
  /* Character length of the first n bytes of word.
     Counting stops at byte n or at the terminator, whichever comes first.
     With utf8 set, continuation bytes (10xxxxxx) are not counted on their
     own and a ligature introduced by 0xEF 0xAC adds hnj_ligature() of its
     third byte to the count. */
  int hnj_hyphen_strnlen(const char * word, int n, int utf8)
  {
    int chars = 0;
    int at = 0;

    while (at < n && word[at] != '\0') {
      chars++;
      /* Unicode ligature support */
      if (utf8 && ((unsigned char) word[at] == 0xEF) && ((unsigned char) word[at + 1] == 0xAC)) {
        chars += hnj_ligature(word[at + 2]);
      }
      /* step over this character, including its continuation bytes */
      at++;
      if (utf8) {
        while ((word[at] & 0xc0) == 0x80)
          at++;
      }
    }
    return chars;
  }
  /*
   * Enforce the left hyphenation minimum: clear break marks that would
   * leave fewer than lhmin characters in front of the hyphen.
   *
   *   utf8       non-zero when word is UTF-8 encoded
   *   word       NUL-terminated input word
   *   word_size  not used by this function
   *   hyphens    break marks, indexed by byte position; cleared to '0'
   *   rep/pos/cut  non-standard hyphenation data; may be NULL (or point to
   *              NULL arrays), in which case only hyphens is touched
   *   lhmin      minimum number of characters before the first break
   *
   * A non-standard break is dropped (its replacement string is freed and
   * its mark cleared) only when the text before the '=' of the replacement
   * is still shorter than lhmin.  Always returns 0.
   */
  int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut, int lhmin)
  {
    /* The outer pointers were dereferenced unconditionally before; check
       them the same way hnj_hyphen_norm() does. */
    int have_nonstd = rep && pos && cut;
    int i = 1, j;

    /* Unicode ligature support */
    if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
      i += hnj_ligature(word[2]);
    }

    /* i counts characters, j walks bytes; the inner do-loop consumes the
       continuation bytes of one UTF-8 character */
    for (j = 0; i < lhmin && word[j] != '\0'; i++) {
      do {
        /* check length of the non-standard part */
        if (have_nonstd && *rep && *pos && *cut && (*rep)[j]) {
          char * rh = strchr((*rep)[j], '=');
          if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
                     hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
            free((*rep)[j]);
            (*rep)[j] = NULL;
            hyphens[j] = '0';
          }
        } else {
          hyphens[j] = '0';
        }
        j++;

        /* Unicode ligature support */
        if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
          i += hnj_ligature(word[j + 2]);
        }
      } while (utf8 && (word[j] & 0xc0) == 0x80);
    }
    return 0;
  }
  /*
   * Enforce the right hyphenation minimum: walking backwards from byte
   * word_size - 2, clear break marks until rhmin has been reached.
   *
   *   utf8       non-zero when word is UTF-8 encoded
   *   word       input word
   *   word_size  length of word in bytes
   *   hyphens    break marks, indexed by byte position; cleared to '0'
   *   rep/pos/cut  non-standard hyphenation data; may be NULL (or point to
   *              NULL arrays), in which case only hyphens is touched
   *   rhmin      minimum number of characters after the last break
   *
   * A non-standard break is dropped (its replacement string is freed and
   * its mark cleared) only when the remaining word text plus the text
   * after the '=' of the replacement is shorter than rhmin.
   * Always returns 0.
   */
  int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut, int rhmin)
  {
    /* The outer pointers were dereferenced unconditionally before; check
       them the same way hnj_hyphen_norm() does. */
    int have_nonstd = rep && pos && cut;
    int i;
    int j = word_size - 2;

    for (i = 1; i < rhmin && j > 0; j--) {
      /* check length of the non-standard part */
      if (have_nonstd && *rep && *pos && *cut && (*rep)[j]) {
        char * rh = strchr((*rep)[j], '=');
        /* NOTE(review): 100 acts as "up to the end of the word" for
           hnj_hyphen_strnlen(), which stops at the terminator - confirm
           that words longer than that cannot reach this point. */
        if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
                   hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
          free((*rep)[j]);
          (*rep)[j] = NULL;
          hyphens[j] = '0';
        }
      } else {
        hyphens[j] = '0';
      }
      /* in UTF-8 mode bytes whose two top bits are both set do not
         advance the count */
      if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
    }
    return 0;
  }
  622. // recursive function for compound level hyphenation
  623. int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
  624. char * hyphens, char *** rep, int ** pos, int ** cut,
  625. int clhmin, int crhmin, int lend, int rend)
  626. {
  627. char prep_word_buf[MAX_WORD];
  628. char *prep_word;
  629. int i, j, k;
  630. int state;
  631. char ch;
  632. HyphenState *hstate;
  633. char *match;
  634. char *repl;
  635. signed char replindex;
  636. signed char replcut;
  637. int offset;
  638. int matchlen_buf[MAX_CHARS];
  639. int matchindex_buf[MAX_CHARS];
  640. char * matchrepl_buf[MAX_CHARS];
  641. int * matchlen;
  642. int * matchindex;
  643. char ** matchrepl;
  644. int isrepl = 0;
  645. int nHyphCount;
  646. if (word_size + 3 < MAX_CHARS) {
  647. prep_word = prep_word_buf;
  648. matchlen = matchlen_buf;
  649. matchindex = matchindex_buf;
  650. matchrepl = matchrepl_buf;
  651. } else {
  652. prep_word = (char *) hnj_malloc (word_size + 3);
  653. matchlen = (int *) hnj_malloc ((word_size + 3) * sizeof(int));
  654. matchindex = (int *) hnj_malloc ((word_size + 3) * sizeof(int));
  655. matchrepl = (char **) hnj_malloc ((word_size + 3) * sizeof(char *));
  656. }
  657. j = 0;
  658. prep_word[j++] = '.';
  659. for (i = 0; i < word_size; i++)
  660. prep_word[j++] = word[i];
  661. prep_word[j++] = '.';
  662. prep_word[j] = '\0';
  663. for (i = 0; i < j; i++)
  664. hyphens[i] = '0';
  665. #ifdef VERBOSE
  666. printf ("prep_word = %s\n", prep_word);
  667. #endif
  668. /* now, run the finite state machine */
  669. state = 0;
  670. for (i = 0; i < j; i++)
  671. {
  672. ch = prep_word[i];
  673. for (;;)
  674. {
  675. if (state == -1) {
  676. /* return 1; */
  677. /* KBH: FIXME shouldn't this be as follows? */
  678. state = 0;
  679. goto try_next_letter;
  680. }
  681. #ifdef VERBOSE
  682. char *state_str;
  683. state_str = get_state_str (state);
  684. for (k = 0; k < i - strlen (state_str); k++)
  685. putchar (' ');
  686. printf ("%s", state_str);
  687. #endif
  688. hstate = &dict->states[state];
  689. for (k = 0; k < hstate->num_trans; k++)
  690. if (hstate->trans[k].ch == ch)
  691. {
  692. state = hstate->trans[k].new_state;
  693. goto found_state;
  694. }
  695. state = hstate->fallback_state;
  696. #ifdef VERBOSE
  697. printf (" falling back, fallback_state %d\n", state);
  698. #endif
  699. }
  700. found_state:
  701. #ifdef VERBOSE
  702. printf ("found state %d\n",state);
  703. #endif
  704. /* Additional optimization is possible here - especially,
  705. elimination of trailing zeroes from the match. Leading zeroes
  706. have already been optimized. */
  707. match = dict->states[state].match;
  708. repl = dict->states[state].repl;
  709. replindex = dict->states[state].replindex;
  710. replcut = dict->states[state].replcut;
  711. /* replacing rules not handled by hyphen_hyphenate() */
  712. if (match)
  713. {
  714. offset = i + 1 - strlen (match);
  715. #ifdef VERBOSE
  716. for (k = 0; k < offset; k++)
  717. putchar (' ');
  718. printf ("%s (%s)\n", match, repl);
  719. #endif
  720. if (repl) {
  721. if (!isrepl) for(; isrepl < word_size; isrepl++) {
  722. matchrepl[isrepl] = NULL;
  723. matchindex[isrepl] = -1;
  724. }
  725. matchlen[offset + replindex] = replcut;
  726. }
  727. /* This is a linear search because I tried a binary search and
  728. found it to be just a teeny bit slower. */
  729. for (k = 0; match[k]; k++) {
  730. if ((hyphens[offset + k] < match[k])) {
  731. hyphens[offset + k] = match[k];
  732. if (match[k]&1) {
  733. matchrepl[offset + k] = repl;
  734. if (repl && (k >= replindex) && (k <= replindex + replcut)) {
  735. matchindex[offset + replindex] = offset + k;
  736. }
  737. }
  738. }
  739. }
  740. }
  741. /* KBH: we need this to make sure we keep looking in a word */
  742. /* for patterns even if the current character is not known in state 0 */
  743. /* since patterns for hyphenation may occur anywhere in the word */
  744. try_next_letter: ;
  745. }
  746. #ifdef VERBOSE
  747. for (i = 0; i < j; i++)
  748. putchar (hyphens[i]);
  749. putchar ('\n');
  750. #endif
  751. for (i = 0; i < j - 3; i++)
  752. #if 0
  753. if (hyphens[i + 1] & 1)
  754. hyphens[i] = '-';
  755. #else
  756. hyphens[i] = hyphens[i + 1];
  757. #endif
  758. for (; i < word_size; i++)
  759. hyphens[i] = '0';
  760. hyphens[word_size] = '\0';
  761. /* now create a new char string showing hyphenation positions */
  762. /* count the hyphens and allocate space for the new hyphenated string */
  763. nHyphCount = 0;
  764. for (i = 0; i < word_size; i++)
  765. if (hyphens[i]&1)
  766. nHyphCount++;
  767. j = 0;
  768. for (i = 0; i < word_size; i++) {
  769. if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
  770. if (rep && pos && cut) {
  771. if (!*rep && !*pos && !*cut) {
  772. int k;
  773. *rep = (char **) malloc(sizeof(char *) * word_size);
  774. *pos = (int *) malloc(sizeof(int) * word_size);
  775. *cut = (int *) malloc(sizeof(int) * word_size);
  776. for (k = 0; k < word_size; k++) {
  777. (*rep)[k] = NULL;
  778. (*pos)[k] = 0;
  779. (*cut)[k] = 0;
  780. }
  781. }
  782. (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
  783. (*pos)[matchindex[i] - 1] = matchindex[i] - i;
  784. (*cut)[matchindex[i] - 1] = matchlen[i];
  785. }
  786. j += strlen(matchrepl[matchindex[i]]);
  787. i += matchlen[i] - 1;
  788. }
  789. }
  790. if (matchrepl != matchrepl_buf) {
  791. hnj_free (matchrepl);
  792. hnj_free (matchlen);
  793. hnj_free (matchindex);
  794. }
  795. // recursive hyphenation of the first (compound) level segments
  796. if (dict->nextlevel) {
  797. char * rep2_buf[MAX_WORD];
  798. int pos2_buf[MAX_WORD];
  799. int cut2_buf[MAX_WORD];
  800. char hyphens2_buf[MAX_WORD];
  801. char ** rep2;
  802. int * pos2;
  803. int * cut2;
  804. char * hyphens2;
  805. int begin = 0;
  806. if (word_size < MAX_CHARS) {
  807. rep2 = rep2_buf;
  808. pos2 = pos2_buf;
  809. cut2 = cut2_buf;
  810. hyphens2 = hyphens2_buf;
  811. } else {
  812. rep2 = (char **) hnj_malloc (word_size * sizeof(char *));
  813. pos2 = (int *) hnj_malloc (word_size * sizeof(int));
  814. cut2 = (int *) hnj_malloc (word_size * sizeof(int));
  815. hyphens2 = (char *) hnj_malloc (word_size);
  816. }
  817. for (i = 0; i < word_size; i++) rep2[i] = NULL;
  818. for (i = 0; i < word_size; i++) if
  819. (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
  820. if (i - begin > 1) {
  821. int hyph = 0;
  822. prep_word[i + 2] = '\0';
  823. /* non-standard hyphenation at compound boundary (Schiffahrt) */
  824. if (*rep && *pos && *cut && (*rep)[i]) {
  825. char * l = strchr((*rep)[i], '=');
  826. strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
  827. if (l) {
  828. hyph = (l - (*rep)[i]) - (*pos)[i];
  829. prep_word[2 + i + hyph] = '\0';
  830. }
  831. }
  832. hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
  833. hyphens2, &rep2, &pos2, &cut2, clhmin,
  834. crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
  835. for (j = 0; j < i - begin - 1; j++) {
  836. hyphens[begin + j] = hyphens2[j];
  837. if (rep2[j] && rep && pos && cut) {
  838. if (!*rep && !*pos && !*cut) {
  839. int k;
  840. *rep = (char **) malloc(sizeof(char *) * word_size);
  841. *pos = (int *) malloc(sizeof(int) * word_size);
  842. *cut = (int *) malloc(sizeof(int) * word_size);
  843. for (k = 0; k < word_size; k++) {
  844. (*rep)[k] = NULL;
  845. (*pos)[k] = 0;
  846. (*cut)[k] = 0;
  847. }
  848. }
  849. (*rep)[begin + j] = rep2[j];
  850. (*pos)[begin + j] = pos2[j];
  851. (*cut)[begin + j] = cut2[j];
  852. }
  853. }
  854. prep_word[i + 2] = word[i + 1];
  855. if (*rep && *pos && *cut && (*rep)[i]) {
  856. strcpy(prep_word + 1, word);
  857. }
  858. }
  859. begin = i + 1;
  860. for (j = 0; j < word_size; j++) rep2[j] = NULL;
  861. }
  862. // non-compound
  863. if (begin == 0) {
  864. hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
  865. hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
  866. if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
  867. rep, pos, cut, clhmin);
  868. if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
  869. rep, pos, cut, crhmin);
  870. }
  871. if (rep2 != rep2_buf) {
  872. free(rep2);
  873. free(cut2);
  874. free(pos2);
  875. free(hyphens2);
  876. }
  877. }
  878. if (prep_word != prep_word_buf) hnj_free (prep_word);
  879. return 0;
  880. }
  /* Number of bytes in word[first .. last) that begin a UTF-8 character,
     i.e. whose two top bits are not "10". */
  static int hnj_norm_count_starts_(const char *word, int first, int last)
  {
    int total = 0;
    int at;

    for (at = first; at < last; at++)
      if ((((unsigned char) word[at]) >> 6) != 2) total++;
    return total;
  }

  /* UTF-8 normalization of hyphen and non-standard positions.
     On entry hyphens, *pos and *cut are expressed in bytes of word; on
     return they are expressed in UTF-8 characters, with each entry moved
     from its byte index to its character index.  Entries of *rep move
     along, and the vacated byte-indexed slots are cleared.  rep, pos and
     cut (or the arrays they point to) may be NULL, in which case only
     hyphens is converted.
     Returns 0, or 1 (after printing a message to stderr) when word starts
     with a continuation byte. */
  int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut)
  {
    int byte_at;
    int char_at = -1;

    if ((((unsigned char) word[0]) >> 6) == 2) {
      fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
      return 1;
    }

    /* calculate UTF-8 character positions */
    for (byte_at = 0; byte_at < word_size; byte_at++) {
      /* beginning of an UTF-8 character (not '10' start bits) */
      if ((((unsigned char) word[byte_at]) >> 6) != 2) char_at++;
      hyphens[char_at] = hyphens[byte_at];

      if (!(rep && pos && cut && *rep && *pos && *cut))
        continue;

      {
        /* the replaced span begins span - 1 bytes before byte_at */
        int span = (*pos)[byte_at];
        int from = byte_at - span + 1;

        (*pos)[char_at] = hnj_norm_count_starts_(word, from, byte_at + 1);
        (*cut)[char_at] = hnj_norm_count_starts_(word, from,
                                                 from + (*cut)[byte_at]);
        (*rep)[char_at] = (*rep)[byte_at];

        if (char_at < byte_at) {
          (*rep)[byte_at] = NULL;
          (*pos)[byte_at] = 0;
          (*cut)[byte_at] = 0;
        }
      }
    }
    hyphens[char_at + 1] = '\0';
    return 0;
  }
  /*
   * Get the word with all possible hyphenations (output: hyphword).
   *
   *   word      input word
   *   l         length of word in bytes
   *   hyphens   break marks; an odd value at index i means a break is
   *             possible after word[i]
   *   hyphword  output buffer, supplied by the caller; receives the word
   *             with '=' at every plain break, or with the replacement
   *             string spliced in at a non-standard break
   *   rep/pos/cut  non-standard hyphenation data; may be NULL (or point to
   *             NULL arrays), in which case every break is a plain '='
   */
  void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
    char * hyphword, char *** rep, int ** pos, int ** cut)
  {
    /* The outer pointers used to be dereferenced unconditionally, which
       crashed for callers that pass NULL. */
    int have_nonstd = rep && pos && cut && *rep && *pos && *cut;
    int i, j;

    for (i = 0, j = 0; i < l; i++, j++) {
      hyphword[j] = word[i];
      if (!(hyphens[i]&1))
        continue;

      if (have_nonstd && (*rep)[i]) {
        /* overwrite the last pos - 1 characters already written plus the
           current one with the replacement, then skip the rest of the
           replaced span in the input */
        strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
        j += strlen((*rep)[i]) - (*pos)[i];
        i += (*cut)[i] - (*pos)[i];
      } else hyphword[++j] = '=';
    }
    hyphword[j] = '\0';
  }
  935. /* main api function with default hyphenmin parameters */
  936. int hnj_hyphen_hyphenate2 (HyphenDict *dict,
  937. const char *word, int word_size, char * hyphens,
  938. char *hyphword, char *** rep, int ** pos, int ** cut)
  939. {
  940. hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
  941. dict->clhmin, dict->crhmin, 1, 1);
  942. hnj_hyphen_lhmin(dict->utf8, word, word_size,
  943. hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
  944. hnj_hyphen_rhmin(dict->utf8, word, word_size,
  945. hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
  946. if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
  947. if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
  948. return 0;
  949. }
  950. /* previous main api function with hyphenmin parameters */
  951. int hnj_hyphen_hyphenate3 (HyphenDict *dict,
  952. const char *word, int word_size, char * hyphens,
  953. char *hyphword, char *** rep, int ** pos, int ** cut,
  954. int lhmin, int rhmin, int clhmin, int crhmin)
  955. {
  956. lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
  957. rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
  958. hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
  959. clhmin, crhmin, 1, 1);
  960. hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
  961. rep, pos, cut, (lhmin > 0 ? lhmin : 2));
  962. hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
  963. rep, pos, cut, (rhmin > 0 ? rhmin : 2));
  964. if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
  965. if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
  966. return 0;
  967. }