PageRenderTime 57ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/ext/hunspell/affixmgr.cxx

https://github.com/csware/test
C++ | 3976 lines | 3145 code | 418 blank | 413 comment | 1291 complexity | 1cd26bf8aa3cdc888a4490b61d8d99f2 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.0, GPL-3.0, LGPL-3.0, MPL-2.0-no-copyleft-exception

Large files are truncated, but you can click here to view the full file

  1. #include "license.hunspell"
  2. #include "license.myspell"
  3. #ifndef MOZILLA_CLIENT
  4. #include <cstdlib>
  5. #include <cstring>
  6. #include <cctype>
  7. #include <cstdio>
  8. #else
  9. #include <stdlib.h>
  10. #include <string.h>
  11. #include <stdio.h>
  12. #include <ctype.h>
  13. #endif
  14. #include "affixmgr.hxx"
  15. #include "affentry.hxx"
  16. #include "langnum.hxx"
  17. #include "csutil.hxx"
  18. #ifndef MOZILLA_CLIENT
  19. #ifndef W32
  20. using namespace std;
  21. #endif
  22. #endif
  23. AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
  24. {
  25. // register hash manager and load affix data from aff file
  26. pHMgr = ptr;
  27. trystring = NULL;
  28. encoding=NULL;
  29. utf8 = 0;
  30. complexprefixes = 0;
  31. maptable = NULL;
  32. nummap = 0;
  33. breaktable = NULL;
  34. numbreak = 0;
  35. reptable = NULL;
  36. numrep = 0;
  37. checkcpdtable = NULL;
  38. numcheckcpd = 0;
  39. defcpdtable = NULL;
  40. numdefcpd = 0;
  41. compoundflag = FLAG_NULL; // permits word in compound forms
  42. compoundbegin = FLAG_NULL; // may be first word in compound forms
  43. compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  44. compoundend = FLAG_NULL; // may be last word in compound forms
  45. compoundroot = FLAG_NULL; // compound word signing flag
  46. compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  47. compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  48. checkcompounddup = 0; // forbid double words in compounds
  49. checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  50. checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  51. checkcompoundtriple = 0; // forbid compounds with triple letters
  52. forbiddenword = FLAG_NULL; // forbidden word signing flag
  53. nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  54. lang = NULL; // language
  55. langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  56. pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes
  57. cpdwordmax = -1; // default: unlimited wordcount in compound words
  58. cpdmin = -1; // undefined
  59. cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  60. cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  61. cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  62. cpdvowels_utf16_len=0; // vowels
  63. pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  64. sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  65. cpdsyllablenum=NULL; // syllable count incrementing flag
  66. checknum=0; // checking numbers, and word with numbers
  67. wordchars=NULL; // letters + spec. word characters
  68. wordchars_utf16=NULL; // letters + spec. word characters
  69. wordchars_utf16_len=0; // letters + spec. word characters
  70. ignorechars=NULL; // letters + spec. word characters
  71. ignorechars_utf16=NULL; // letters + spec. word characters
  72. ignorechars_utf16_len=0; // letters + spec. word characters
  73. version=NULL; // affix and dictionary file version string
  74. havecontclass=0; // flags of possible continuing classes (double affix)
  75. // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  76. // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  77. lemma_present = FLAG_NULL;
  78. circumfix = FLAG_NULL;
  79. onlyincompound = FLAG_NULL;
  80. flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file
  81. maxngramsugs = -1; // undefined
  82. nosplitsugs = 0;
  83. sugswithdots = 0;
  84. keepcase = 0;
  85. checksharps = 0;
  86. derived = NULL; // XXX not threadsafe variable for experimental stemming
  87. sfx = NULL;
  88. pfx = NULL;
  89. for (int i=0; i < SETSIZE; i++) {
  90. pStart[i] = NULL;
  91. sStart[i] = NULL;
  92. pFlag[i] = NULL;
  93. sFlag[i] = NULL;
  94. }
  95. for (int j=0; j < CONTSIZE; j++) {
  96. contclasses[j] = 0;
  97. }
  98. if (parse_file(affpath)) {
  99. HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  100. wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM");
  101. }
  102. if (cpdmin == -1) cpdmin = MINCPDLEN;
  103. }
  104. AffixMgr::~AffixMgr()
  105. {
  106. // pass through linked prefix entries and clean up
  107. for (int i=0; i < SETSIZE ;i++) {
  108. pFlag[i] = NULL;
  109. PfxEntry * ptr = (PfxEntry *)pStart[i];
  110. PfxEntry * nptr = NULL;
  111. while (ptr) {
  112. nptr = ptr->getNext();
  113. delete(ptr);
  114. ptr = nptr;
  115. nptr = NULL;
  116. }
  117. }
  118. // pass through linked suffix entries and clean up
  119. for (int j=0; j < SETSIZE ; j++) {
  120. sFlag[j] = NULL;
  121. SfxEntry * ptr = (SfxEntry *)sStart[j];
  122. SfxEntry * nptr = NULL;
  123. while (ptr) {
  124. nptr = ptr->getNext();
  125. delete(ptr);
  126. ptr = nptr;
  127. nptr = NULL;
  128. }
  129. sStart[j] = NULL;
  130. }
  131. if (trystring) free(trystring);
  132. trystring=NULL;
  133. if (encoding) free(encoding);
  134. encoding=NULL;
  135. if (maptable) {
  136. for (int j=0; j < nummap; j++) {
  137. if (maptable[j].set) free(maptable[j].set);
  138. if (maptable[j].set_utf16) free(maptable[j].set_utf16);
  139. maptable[j].set = NULL;
  140. maptable[j].len = 0;
  141. }
  142. free(maptable);
  143. maptable = NULL;
  144. }
  145. nummap = 0;
  146. if (breaktable) {
  147. for (int j=0; j < numbreak; j++) {
  148. if (breaktable[j]) free(breaktable[j]);
  149. breaktable[j] = NULL;
  150. }
  151. free(breaktable);
  152. breaktable = NULL;
  153. }
  154. numbreak = 0;
  155. if (reptable) {
  156. for (int j=0; j < numrep; j++) {
  157. free(reptable[j].pattern);
  158. free(reptable[j].pattern2);
  159. reptable[j].pattern = NULL;
  160. reptable[j].pattern2 = NULL;
  161. }
  162. free(reptable);
  163. reptable = NULL;
  164. }
  165. if (defcpdtable) {
  166. for (int j=0; j < numdefcpd; j++) {
  167. free(defcpdtable[j].def);
  168. defcpdtable[j].def = NULL;
  169. }
  170. free(defcpdtable);
  171. defcpdtable = NULL;
  172. }
  173. numrep = 0;
  174. if (checkcpdtable) {
  175. for (int j=0; j < numcheckcpd; j++) {
  176. free(checkcpdtable[j].pattern);
  177. free(checkcpdtable[j].pattern2);
  178. checkcpdtable[j].pattern = NULL;
  179. checkcpdtable[j].pattern2 = NULL;
  180. }
  181. free(checkcpdtable);
  182. checkcpdtable = NULL;
  183. }
  184. numcheckcpd = 0;
  185. FREE_FLAG(compoundflag);
  186. FREE_FLAG(compoundbegin);
  187. FREE_FLAG(compoundmiddle);
  188. FREE_FLAG(compoundend);
  189. FREE_FLAG(compoundpermitflag);
  190. FREE_FLAG(compoundforbidflag);
  191. FREE_FLAG(compoundroot);
  192. FREE_FLAG(forbiddenword);
  193. FREE_FLAG(nosuggest);
  194. FREE_FLAG(pseudoroot);
  195. FREE_FLAG(lemma_present);
  196. FREE_FLAG(circumfix);
  197. FREE_FLAG(onlyincompound);
  198. cpdwordmax = 0;
  199. pHMgr = NULL;
  200. cpdmin = 0;
  201. cpdmaxsyllable = 0;
  202. if (cpdvowels) free(cpdvowels);
  203. if (cpdvowels_utf16) free(cpdvowels_utf16);
  204. if (cpdsyllablenum) free(cpdsyllablenum);
  205. free_utf_tbl();
  206. if (lang) free(lang);
  207. if (wordchars) free(wordchars);
  208. if (wordchars_utf16) free(wordchars_utf16);
  209. if (ignorechars) free(ignorechars);
  210. if (ignorechars_utf16) free(ignorechars_utf16);
  211. if (version) free(version);
  212. if (derived) free(derived);
  213. checknum=0;
  214. }
  215. // read in aff file and build up prefix and suffix entry objects
  216. int AffixMgr::parse_file(const char * affpath)
  217. {
  218. // io buffers
  219. char line[MAXLNLEN+1];
  220. // affix type
  221. char ft;
  222. // checking flag duplication
  223. char dupflags[CONTSIZE];
  224. char dupflags_ini = 1;
  225. // first line indicator for removing byte order mark
  226. int firstline = 1;
  227. // open the affix file
  228. FILE * afflst;
  229. afflst = fopen(affpath,"r");
  230. if (!afflst) {
  231. HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
  232. return 1;
  233. }
  234. // step one is to parse the affix file building up the internal
  235. // affix data structures
  236. // read in each line ignoring any that do not
  237. // start with a known line type indicator
  238. while (fgets(line,MAXLNLEN,afflst)) {
  239. mychomp(line);
  240. /* remove byte order mark */
  241. if (firstline) {
  242. firstline = 0;
  243. if (strncmp(line,"",3) == 0) {
  244. memmove(line, line+3, strlen(line+3)+1);
  245. HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
  246. }
  247. }
  248. /* parse in the try string */
  249. if (strncmp(line,"TRY",3) == 0) {
  250. if (parse_string(line, &trystring, "TRY")) {
  251. fclose(afflst);
  252. return 1;
  253. }
  254. }
  255. /* parse in the name of the character set used by the .dict and .aff */
  256. if (strncmp(line,"SET",3) == 0) {
  257. if (parse_string(line, &encoding, "SET")) {
  258. fclose(afflst);
  259. return 1;
  260. }
  261. if (strcmp(encoding, "UTF-8") == 0) {
  262. utf8 = 1;
  263. #ifndef OPENOFFICEORG
  264. #ifndef MOZILLA_CLIENT
  265. if (initialize_utf_tbl()) {
  266. fclose(afflst);
  267. return 1;
  268. }
  269. #endif
  270. #endif
  271. }
  272. }
  273. /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
  274. if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
  275. complexprefixes = 1;
  276. /* parse in the flag used by the controlled compound words */
  277. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  278. if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) {
  279. fclose(afflst);
  280. return 1;
  281. }
  282. }
  283. /* parse in the flag used by compound words */
  284. if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
  285. if (complexprefixes) {
  286. if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) {
  287. fclose(afflst);
  288. return 1;
  289. }
  290. } else {
  291. if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) {
  292. fclose(afflst);
  293. return 1;
  294. }
  295. }
  296. }
  297. /* parse in the flag used by compound words */
  298. if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
  299. if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) {
  300. fclose(afflst);
  301. return 1;
  302. }
  303. }
  304. /* parse in the flag used by compound words */
  305. if (strncmp(line,"COMPOUNDEND",11) == 0) {
  306. if (complexprefixes) {
  307. if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) {
  308. fclose(afflst);
  309. return 1;
  310. }
  311. } else {
  312. if (parse_flag(line, &compoundend, "COMPOUNDEND")) {
  313. fclose(afflst);
  314. return 1;
  315. }
  316. }
  317. }
  318. /* parse in the data used by compound_check() method */
  319. if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
  320. if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) {
  321. fclose(afflst);
  322. return 1;
  323. }
  324. }
  325. /* parse in the flag sign compounds in dictionary */
  326. if (strncmp(line,"COMPOUNDROOT",12) == 0) {
  327. if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) {
  328. fclose(afflst);
  329. return 1;
  330. }
  331. }
  332. /* parse in the flag used by compound_check() method */
  333. if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
  334. if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) {
  335. fclose(afflst);
  336. return 1;
  337. }
  338. }
  339. /* parse in the flag used by compound_check() method */
  340. if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
  341. if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) {
  342. fclose(afflst);
  343. return 1;
  344. }
  345. }
  346. if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
  347. checkcompounddup = 1;
  348. }
  349. if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
  350. checkcompoundrep = 1;
  351. }
  352. if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
  353. checkcompoundtriple = 1;
  354. }
  355. if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
  356. checkcompoundcase = 1;
  357. }
  358. if (strncmp(line,"NOSUGGEST",9) == 0) {
  359. if (parse_flag(line, &nosuggest, "NOSUGGEST")) {
  360. fclose(afflst);
  361. return 1;
  362. }
  363. }
  364. /* parse in the flag used by forbidden words */
  365. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  366. if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) {
  367. fclose(afflst);
  368. return 1;
  369. }
  370. }
  371. /* parse in the flag used by forbidden words */
  372. if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
  373. if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) {
  374. fclose(afflst);
  375. return 1;
  376. }
  377. }
  378. /* parse in the flag used by circumfixes */
  379. if (strncmp(line,"CIRCUMFIX",9) == 0) {
  380. if (parse_flag(line, &circumfix, "CIRCUMFIX")) {
  381. fclose(afflst);
  382. return 1;
  383. }
  384. }
  385. /* parse in the flag used by fogemorphemes */
  386. if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
  387. if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) {
  388. fclose(afflst);
  389. return 1;
  390. }
  391. }
  392. /* parse in the flag used by `pseudoroots' */
  393. if (strncmp(line,"PSEUDOROOT",10) == 0) {
  394. if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) {
  395. fclose(afflst);
  396. return 1;
  397. }
  398. }
  399. /* parse in the flag used by `pseudoroots' */
  400. if (strncmp(line,"NEEDAFFIX",9) == 0) {
  401. if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) {
  402. fclose(afflst);
  403. return 1;
  404. }
  405. }
  406. /* parse in the minimal length for words in compounds */
  407. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  408. if (parse_num(line, &cpdmin, "COMPOUNDMIN")) {
  409. fclose(afflst);
  410. return 1;
  411. }
  412. if (cpdmin < 1) cpdmin = 1;
  413. }
  414. /* parse in the max. words and syllables in compounds */
  415. if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
  416. if (parse_cpdsyllable(line)) {
  417. fclose(afflst);
  418. return 1;
  419. }
  420. }
  421. /* parse in the flag used by compound_check() method */
  422. if (strncmp(line,"SYLLABLENUM",11) == 0) {
  423. if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) {
  424. fclose(afflst);
  425. return 1;
  426. }
  427. }
  428. /* parse in the flag used by the controlled compound words */
  429. if (strncmp(line,"CHECKNUM",8) == 0) {
  430. checknum=1;
  431. }
  432. /* parse in the extra word characters */
  433. if (strncmp(line,"WORDCHARS",9) == 0) {
  434. if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) {
  435. fclose(afflst);
  436. return 1;
  437. }
  438. }
  439. /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
  440. if (strncmp(line,"IGNORE",6) == 0) {
  441. if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
  442. fclose(afflst);
  443. return 1;
  444. }
  445. }
  446. /* parse in the typical fault correcting table */
  447. if (strncmp(line,"REP",3) == 0) {
  448. if (parse_reptable(line, afflst)) {
  449. fclose(afflst);
  450. return 1;
  451. }
  452. }
  453. /* parse in the checkcompoundpattern table */
  454. if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
  455. if (parse_checkcpdtable(line, afflst)) {
  456. fclose(afflst);
  457. return 1;
  458. }
  459. }
  460. /* parse in the defcompound table */
  461. if (strncmp(line,"COMPOUNDRULE",12) == 0) {
  462. if (parse_defcpdtable(line, afflst)) {
  463. fclose(afflst);
  464. return 1;
  465. }
  466. }
  467. /* parse in the related character map table */
  468. if (strncmp(line,"MAP",3) == 0) {
  469. if (parse_maptable(line, afflst)) {
  470. fclose(afflst);
  471. return 1;
  472. }
  473. }
  474. /* parse in the word breakpoints table */
  475. if (strncmp(line,"BREAK",5) == 0) {
  476. if (parse_breaktable(line, afflst)) {
  477. fclose(afflst);
  478. return 1;
  479. }
  480. }
  481. /* parse in the language for language specific codes */
  482. if (strncmp(line,"LANG",4) == 0) {
  483. if (parse_string(line, &lang, "LANG")) {
  484. fclose(afflst);
  485. return 1;
  486. }
  487. langnum = get_lang_num(lang);
  488. }
  489. if (strncmp(line,"VERSION",7) == 0) {
  490. if (parse_string(line, &version, "VERSION")) {
  491. fclose(afflst);
  492. return 1;
  493. }
  494. }
  495. if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
  496. if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) {
  497. fclose(afflst);
  498. return 1;
  499. }
  500. }
  501. if (strncmp(line,"NOSPLITSUGS",11) == 0) {
  502. nosplitsugs=1;
  503. }
  504. if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
  505. sugswithdots=1;
  506. }
  507. /* parse in the flag used by forbidden words */
  508. if (strncmp(line,"KEEPCASE",8) == 0) {
  509. if (parse_flag(line, &keepcase, "KEEPCASE")) {
  510. fclose(afflst);
  511. return 1;
  512. }
  513. }
  514. if (strncmp(line,"CHECKSHARPS",11) == 0) {
  515. checksharps=1;
  516. }
  517. /* parse this affix: P - prefix, S - suffix */
  518. ft = ' ';
  519. if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
  520. if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
  521. if (ft != ' ') {
  522. if (dupflags_ini) {
  523. for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0;
  524. dupflags_ini = 0;
  525. }
  526. if (parse_affix(line, ft, afflst, dupflags)) {
  527. fclose(afflst);
  528. process_pfx_tree_to_list();
  529. process_sfx_tree_to_list();
  530. return 1;
  531. }
  532. }
  533. }
  534. fclose(afflst);
  535. // convert affix trees to sorted list
  536. process_pfx_tree_to_list();
  537. process_sfx_tree_to_list();
  538. // now we can speed up performance greatly taking advantage of the
  539. // relationship between the affixes and the idea of "subsets".
  540. // View each prefix as a potential leading subset of another and view
  541. // each suffix (reversed) as a potential trailing subset of another.
  542. // To illustrate this relationship if we know the prefix "ab" is found in the
  543. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  544. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  545. // is a subset need be examined.
  546. // The same argument goes for suffix string that are reversed.
  547. // Then to top this off why not examine the first char of the word to quickly
  548. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  549. // be leading supersets of the first character of the word (if they exist)
  550. // To take advantage of this "subset" relationship, we need to add two links
  551. // from entry. One to take next if the current prefix is found (call it nexteq)
  552. // and one to take next if the current prefix is not found (call it nextne).
  553. // Since we have built ordered lists, all that remains is to properly intialize
  554. // the nextne and nexteq pointers that relate them
  555. process_pfx_order();
  556. process_sfx_order();
  557. // expand wordchars string, based on csutil (for external tokenization)
  558. char * enc = get_encoding();
  559. csconv = get_current_cs(enc);
  560. free(enc);
  561. enc = NULL;
  562. char expw[MAXLNLEN];
  563. if (wordchars) {
  564. strcpy(expw, wordchars);
  565. free(wordchars);
  566. } else *expw = '\0';
  567. for (int i = 0; i <= 255; i++) {
  568. if ( (csconv[i].cupper != csconv[i].clower) &&
  569. (! strchr(expw, (char) i))) {
  570. *(expw + strlen(expw) + 1) = '\0';
  571. *(expw + strlen(expw)) = (char) i;
  572. }
  573. }
  574. wordchars = mystrdup(expw);
  575. // temporary BREAK definition for German dash handling (OOo issue 64400)
  576. if ((langnum == LANG_de) && (!breaktable)) {
  577. breaktable = (char **) malloc(sizeof(char *));
  578. if (!breaktable) return 1;
  579. breaktable[0] = mystrdup("-");
  580. numbreak = 1;
  581. }
  582. return 0;
  583. }
  584. // we want to be able to quickly access prefix information
  585. // both by prefix flag, and sorted by prefix string itself
  586. // so we need to set up two indexes
  587. int AffixMgr::build_pfxtree(AffEntry* pfxptr)
  588. {
  589. PfxEntry * ptr;
  590. PfxEntry * pptr;
  591. PfxEntry * ep = (PfxEntry*) pfxptr;
  592. // get the right starting points
  593. const char * key = ep->getKey();
  594. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  595. // first index by flag which must exist
  596. ptr = (PfxEntry*)pFlag[flg];
  597. ep->setFlgNxt(ptr);
  598. pFlag[flg] = (AffEntry *) ep;
  599. // handle the special case of null affix string
  600. if (strlen(key) == 0) {
  601. // always inset them at head of list at element 0
  602. ptr = (PfxEntry*)pStart[0];
  603. ep->setNext(ptr);
  604. pStart[0] = (AffEntry*)ep;
  605. return 0;
  606. }
  607. // now handle the normal case
  608. ep->setNextEQ(NULL);
  609. ep->setNextNE(NULL);
  610. unsigned char sp = *((const unsigned char *)key);
  611. ptr = (PfxEntry*)pStart[sp];
  612. // handle the first insert
  613. if (!ptr) {
  614. pStart[sp] = (AffEntry*)ep;
  615. return 0;
  616. }
  617. // otherwise use binary tree insertion so that a sorted
  618. // list can easily be generated later
  619. pptr = NULL;
  620. for (;;) {
  621. pptr = ptr;
  622. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  623. ptr = ptr->getNextEQ();
  624. if (!ptr) {
  625. pptr->setNextEQ(ep);
  626. break;
  627. }
  628. } else {
  629. ptr = ptr->getNextNE();
  630. if (!ptr) {
  631. pptr->setNextNE(ep);
  632. break;
  633. }
  634. }
  635. }
  636. return 0;
  637. }
  638. // we want to be able to quickly access suffix information
  639. // both by suffix flag, and sorted by the reverse of the
  640. // suffix string itself; so we need to set up two indexes
  641. int AffixMgr::build_sfxtree(AffEntry* sfxptr)
  642. {
  643. SfxEntry * ptr;
  644. SfxEntry * pptr;
  645. SfxEntry * ep = (SfxEntry *) sfxptr;
  646. /* get the right starting point */
  647. const char * key = ep->getKey();
  648. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  649. // first index by flag which must exist
  650. ptr = (SfxEntry*)sFlag[flg];
  651. ep->setFlgNxt(ptr);
  652. sFlag[flg] = (AffEntry *) ep;
  653. // next index by affix string
  654. // handle the special case of null affix string
  655. if (strlen(key) == 0) {
  656. // always inset them at head of list at element 0
  657. ptr = (SfxEntry*)sStart[0];
  658. ep->setNext(ptr);
  659. sStart[0] = (AffEntry*)ep;
  660. return 0;
  661. }
  662. // now handle the normal case
  663. ep->setNextEQ(NULL);
  664. ep->setNextNE(NULL);
  665. unsigned char sp = *((const unsigned char *)key);
  666. ptr = (SfxEntry*)sStart[sp];
  667. // handle the first insert
  668. if (!ptr) {
  669. sStart[sp] = (AffEntry*)ep;
  670. return 0;
  671. }
  672. // otherwise use binary tree insertion so that a sorted
  673. // list can easily be generated later
  674. pptr = NULL;
  675. for (;;) {
  676. pptr = ptr;
  677. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  678. ptr = ptr->getNextEQ();
  679. if (!ptr) {
  680. pptr->setNextEQ(ep);
  681. break;
  682. }
  683. } else {
  684. ptr = ptr->getNextNE();
  685. if (!ptr) {
  686. pptr->setNextNE(ep);
  687. break;
  688. }
  689. }
  690. }
  691. return 0;
  692. }
  693. // convert from binary tree to sorted list
  694. int AffixMgr::process_pfx_tree_to_list()
  695. {
  696. for (int i=1; i< SETSIZE; i++) {
  697. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  698. }
  699. return 0;
  700. }
  701. AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
  702. {
  703. if (ptr) {
  704. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
  705. ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
  706. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
  707. }
  708. return nptr;
  709. }
  710. // convert from binary tree to sorted list
  711. int AffixMgr:: process_sfx_tree_to_list()
  712. {
  713. for (int i=1; i< SETSIZE; i++) {
  714. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  715. }
  716. return 0;
  717. }
  718. AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
  719. {
  720. if (ptr) {
  721. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
  722. ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
  723. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
  724. }
  725. return nptr;
  726. }
  727. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  728. // using the idea of leading subsets this time
  729. int AffixMgr::process_pfx_order()
  730. {
  731. PfxEntry* ptr;
  732. // loop through each prefix list starting point
  733. for (int i=1; i < SETSIZE; i++) {
  734. ptr = (PfxEntry*)pStart[i];
  735. // look through the remainder of the list
  736. // and find next entry with affix that
  737. // the current one is not a subset of
  738. // mark that as destination for NextNE
  739. // use next in list that you are a subset
  740. // of as NextEQ
  741. for (; ptr != NULL; ptr = ptr->getNext()) {
  742. PfxEntry * nptr = ptr->getNext();
  743. for (; nptr != NULL; nptr = nptr->getNext()) {
  744. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  745. }
  746. ptr->setNextNE(nptr);
  747. ptr->setNextEQ(NULL);
  748. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  749. ptr->setNextEQ(ptr->getNext());
  750. }
  751. // now clean up by adding smart search termination strings:
  752. // if you are already a superset of the previous prefix
  753. // but not a subset of the next, search can end here
  754. // so set NextNE properly
  755. ptr = (PfxEntry *) pStart[i];
  756. for (; ptr != NULL; ptr = ptr->getNext()) {
  757. PfxEntry * nptr = ptr->getNext();
  758. PfxEntry * mptr = NULL;
  759. for (; nptr != NULL; nptr = nptr->getNext()) {
  760. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  761. mptr = nptr;
  762. }
  763. if (mptr) mptr->setNextNE(NULL);
  764. }
  765. }
  766. return 0;
  767. }
  768. // initialize the SfxEntry links NextEQ and NextNE to speed searching
  769. // using the idea of leading subsets this time
  770. int AffixMgr::process_sfx_order()
  771. {
  772. SfxEntry* ptr;
  773. // loop through each prefix list starting point
  774. for (int i=1; i < SETSIZE; i++) {
  775. ptr = (SfxEntry *) sStart[i];
  776. // look through the remainder of the list
  777. // and find next entry with affix that
  778. // the current one is not a subset of
  779. // mark that as destination for NextNE
  780. // use next in list that you are a subset
  781. // of as NextEQ
  782. for (; ptr != NULL; ptr = ptr->getNext()) {
  783. SfxEntry * nptr = ptr->getNext();
  784. for (; nptr != NULL; nptr = nptr->getNext()) {
  785. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  786. }
  787. ptr->setNextNE(nptr);
  788. ptr->setNextEQ(NULL);
  789. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  790. ptr->setNextEQ(ptr->getNext());
  791. }
  792. // now clean up by adding smart search termination strings:
  793. // if you are already a superset of the previous suffix
  794. // but not a subset of the next, search can end here
  795. // so set NextNE properly
  796. ptr = (SfxEntry *) sStart[i];
  797. for (; ptr != NULL; ptr = ptr->getNext()) {
  798. SfxEntry * nptr = ptr->getNext();
  799. SfxEntry * mptr = NULL;
  800. for (; nptr != NULL; nptr = nptr->getNext()) {
  801. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  802. mptr = nptr;
  803. }
  804. if (mptr) mptr->setNextNE(NULL);
  805. }
  806. }
  807. return 0;
  808. }
  809. // takes aff file condition string and creates the
  810. // conds array - please see the appendix at the end of the
  811. // file affentry.cxx which describes what is going on here
  812. // in much more detail
  //
  // Parameters:
  //   ptr - affix entry whose conds union and numconds field are filled in
  //   cs  - condition string; '[' ... ']' encloses a character group, '^'
  //         inside a group marks it as a complement, '.' matches anything
  //
  // Each condition position n owns bit n of the per-character bytes: a set
  // bit means "this character is accepted at position n". In UTF-8 mode only
  // bytes below 0x80 use the bit table (conds.utf8.ascii); other characters
  // are converted with u8_u16 and stored per position in conds.utf8.wchars.
  //
  // Returns 0 on success and 1 if a malloc for the wide-character set fails.
  //
  // NOTE(review): n is used as a shift count on an unsigned char and as an
  // index into the conds.utf8 per-position arrays, but nothing visible here
  // bounds it - confirm that callers limit the number of conditions.
  813. int AffixMgr::encodeit(struct affentry * ptr, char * cs)
  814. {
  815. unsigned char c;
  816. int i, j, k;
  // members of the group currently being collected (raw bytes)
  817. unsigned char mbr[MAXLNLEN];
  // UTF-8 mode: converted non-ASCII members of the current group
  818. w_char wmbr[MAXLNLEN];
  819. w_char * wpos = wmbr;
  820. // now clear the conditions array */
  821. for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0;
  822. // now parse the string to create the conds array */
  823. int nc = strlen(cs);
  824. unsigned char neg = 0; // complement indicator
  825. int grp = 0; // group indicator
  826. unsigned char n = 0; // number of conditions
  827. int ec = 0; // end condition indicator
  828. int nm = 0; // number of member in group
  829. // if no condition just return
  830. if (strcmp(cs,".")==0) {
  831. ptr->numconds = 0;
  832. return 0;
  833. }
  834. i = 0;
  // Scan the string byte by byte. c is zeroed once a byte has been consumed
  // as syntax or as a group member, so the later tests ignore it.
  835. while (i < nc) {
  836. c = *((unsigned char *)(cs + i));
  837. // start group indicator
  838. if (c == '[') {
  839. grp = 1;
  840. c = 0;
  841. }
  842. // complement flag
  // (any '^' seen while inside a group sets neg, not only the first byte)
  843. if ((grp == 1) && (c == '^')) {
  844. neg = 1;
  845. c = 0;
  846. }
  847. // end goup indicator
  848. if (c == ']') {
  849. ec = 1;
  850. c = 0;
  851. }
  852. // add character of group to list
  853. if ((grp == 1) && (c != 0)) {
  854. *(mbr + nm) = c;
  855. nm++;
  856. c = 0;
  857. }
  858. // end of condition
  // (a byte that is still non-zero here is a single-character condition)
  859. if (c != 0) {
  860. ec = 1;
  861. }
  // one complete condition has been read: record it under position n
  862. if (ec) {
  863. if (!utf8) {
  864. if (grp == 1) {
  865. if (neg == 0) {
  866. // set the proper bits in the condition array vals for those chars
  867. for (j=0;j<nm;j++) {
  868. k = (unsigned int) mbr[j];
  869. ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n);
  870. }
  871. } else {
  872. // complement so set all of them and then unset indicated ones
  873. for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);
  874. for (j=0;j<nm;j++) {
  875. k = (unsigned int) mbr[j];
  876. ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n);
  877. }
  878. }
  // reset the group state for the next condition
  879. neg = 0;
  880. grp = 0;
  881. nm = 0;
  882. } else {
  883. // not a group so just set the proper bit for this char
  884. // but first handle special case of . inside condition
  885. if (c == '.') {
  886. // wild card character so set them all
  887. for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);
  888. } else {
  889. ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n);
  890. }
  891. }
  892. n++;
  893. ec = 0;
  894. } else { // UTF-8 character set
  895. if (grp == 1) {
  896. ptr->conds.utf8.neg[n] = neg;
  897. if (neg == 0) {
  898. // set the proper bits in the condition array vals for those chars
  899. for (j=0;j<nm;j++) {
  900. k = (unsigned int) mbr[j];
  // high bit set: lead byte of a multi-byte sequence; convert it and skip
  // its continuation bytes (two extra when (k & 0xe0) == 0xe0, else one)
  901. if (k >> 7) {
  902. u8_u16(wpos, 1, (char *) mbr + j);
  903. wpos++;
  904. if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
  905. } else {
  906. ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n);
  907. }
  908. }
  909. } else { // neg == 1
  910. // complement so set all of them and then unset indicated ones
  // (only the first SETSIZE/2 entries, i.e. the ascii table, are touched)
  911. for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);
  912. for (j=0;j<nm;j++) {
  913. k = (unsigned int) mbr[j];
  914. if (k >> 7) {
  915. u8_u16(wpos, 1, (char *) mbr + j);
  916. wpos++;
  917. if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
  918. } else {
  919. ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n);
  920. }
  921. }
  922. }
  923. neg = 0;
  924. grp = 0;
  925. nm = 0;
  // store the collected wide characters of this group, sorted by flag_qsort
  // NOTE(review): when the group held no wide characters wchars[n] is not
  // written on this path, and all[n] is never written for groups - presumably
  // the entry is zero-initialised by the caller; confirm.
  926. ptr->conds.utf8.wlen[n] = wpos - wmbr;
  927. if ((wpos - wmbr) != 0) {
  928. ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr));
  929. if (!ptr->conds.utf8.wchars[n]) return 1;
  930. memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr));
  931. flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]);
  932. wpos = wmbr;
  933. }
  934. } else { // grp == 0
  935. // is UTF-8 character?
  936. if (c >> 7) {
  // single non-ASCII character: stored as a one-element wide set; i is
  // advanced past the continuation bytes of the sequence
  937. ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char));
  938. if (!ptr->conds.utf8.wchars[n]) return 1;
  939. ptr->conds.utf8.wlen[n] = 1;
  940. u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i);
  941. if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character
  942. } else {
  943. ptr->conds.utf8.wchars[n] = NULL;
  944. // not a group so just set the proper bit for this char
  945. // but first handle special case of . inside condition
  946. if (c == '.') {
  947. ptr->conds.utf8.all[n] = 1;
  948. // wild card character so set them all
  949. for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);
  950. } else {
  951. ptr->conds.utf8.all[n] = 0;
  952. ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n);
  953. }
  954. }
  955. neg = 0;
  956. }
  957. n++;
  958. ec = 0;
  959. neg = 0;
  960. }
  961. }
  962. i++;
  963. }
  // n now holds the number of conditions that were recorded
  964. ptr->numconds = n;
  965. return 0;
  966. }
  967. // return 1 if s1 is a leading subset of s2
  968. /* inline int AffixMgr::isSubset(const char * s1, const char * s2)
  969. {
  970. while ((*s1 == *s2) && *s1) {
  971. s1++;
  972. s2++;
  973. }
  974. return (*s1 == '\0');
  975. }
  976. */
  977. // return 1 if s1 is a leading subset of s2 (dots are for infixes)
  978. inline int AffixMgr::isSubset(const char * s1, const char * s2)
  979. {
  980. while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
  981. s1++;
  982. s2++;
  983. }
  984. return (*s1 == '\0');
  985. }
  986. // check word for prefixes
  987. struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
  988. const FLAG needflag)
  989. {
  990. struct hentry * rv= NULL;
  991. pfx = NULL;
  992. pfxappnd = NULL;
  993. sfxappnd = NULL;
  994. // first handle the special case of 0 length prefixes
  995. PfxEntry * pe = (PfxEntry *) pStart[0];
  996. while (pe) {
  997. if (
  998. // fogemorpheme
  999. ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
  1000. (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
  1001. // permit prefixes in compounds
  1002. ((in_compound != IN_CPD_END) || (pe->getCont() &&
  1003. (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
  1004. ) {
  1005. // check prefix
  1006. rv = pe->checkword(word, len, in_compound, needflag);
  1007. if (rv) {
  1008. pfx=(AffEntry *)pe; // BUG: pfx not stateless
  1009. return rv;
  1010. }
  1011. }
  1012. pe = pe->getNext();
  1013. }
  1014. // now handle the general case
  1015. unsigned char sp = *((const unsigned char *)word);
  1016. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1017. while (pptr) {
  1018. if (isSubset(pptr->getKey(),word)) {
  1019. if (
  1020. // fogemorpheme
  1021. ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
  1022. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
  1023. // permit prefixes in compounds
  1024. ((in_compound != IN_CPD_END) || (pptr->getCont() &&
  1025. (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
  1026. ) {
  1027. // check prefix
  1028. rv = pptr->checkword(word, len, in_compound, needflag);
  1029. if (rv) {
  1030. pfx=(AffEntry *)pptr; // BUG: pfx not stateless
  1031. return rv;
  1032. }
  1033. }
  1034. pptr = pptr->getNextEQ();
  1035. } else {
  1036. pptr = pptr->getNextNE();
  1037. }
  1038. }
  1039. return NULL;
  1040. }
  1041. // check word for prefixes
  1042. struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
  1043. char in_compound, const FLAG needflag)
  1044. {
  1045. struct hentry * rv= NULL;
  1046. pfx = NULL;
  1047. sfxappnd = NULL;
  1048. // first handle the special case of 0 length prefixes
  1049. PfxEntry * pe = (PfxEntry *) pStart[0];
  1050. while (pe) {
  1051. rv = pe->check_twosfx(word, len, in_compound, needflag);
  1052. if (rv) return rv;
  1053. pe = pe->getNext();
  1054. }
  1055. // now handle the general case
  1056. unsigned char sp = *((const unsigned char *)word);
  1057. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1058. while (pptr) {
  1059. if (isSubset(pptr->getKey(),word)) {
  1060. rv = pptr->check_twosfx(word, len, in_compound, needflag);
  1061. if (rv) {
  1062. pfx = (AffEntry *)pptr;
  1063. return rv;
  1064. }
  1065. pptr = pptr->getNextEQ();
  1066. } else {
  1067. pptr = pptr->getNextNE();
  1068. }
  1069. }
  1070. return NULL;
  1071. }
  1072. #ifdef HUNSPELL_EXPERIMENTAL
  1073. // check word for prefixes
  1074. char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
  1075. const FLAG needflag)
  1076. {
  1077. char * st;
  1078. char result[MAXLNLEN];
  1079. result[0] = '\0';
  1080. pfx = NULL;
  1081. sfxappnd = NULL;
  1082. // first handle the special case of 0 length prefixes
  1083. PfxEntry * pe = (PfxEntry *) pStart[0];
  1084. while (pe) {
  1085. st = pe->check_morph(word,len,in_compound, needflag);
  1086. if (st) {
  1087. strcat(result, st);
  1088. free(st);
  1089. }
  1090. // if (rv) return rv;
  1091. pe = pe->getNext();
  1092. }
  1093. // now handle the general case
  1094. unsigned char sp = *((const unsigned char *)word);
  1095. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1096. while (pptr) {
  1097. if (isSubset(pptr->getKey(),word)) {
  1098. st = pptr->check_morph(word,len,in_compound, needflag);
  1099. if (st) {
  1100. // fogemorpheme
  1101. if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
  1102. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
  1103. strcat(result, st);
  1104. pfx = (AffEntry *)pptr;
  1105. }
  1106. free(st);
  1107. }
  1108. pptr = pptr->getNextEQ();
  1109. } else {
  1110. pptr = pptr->getNextNE();
  1111. }
  1112. }
  1113. if (*result) return mystrdup(result);
  1114. return NULL;
  1115. }
  1116. // check word for prefixes
  1117. char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
  1118. char in_compound, const FLAG needflag)
  1119. {
  1120. char * st;
  1121. char result[MAXLNLEN];
  1122. result[0] = '\0';
  1123. pfx = NULL;
  1124. sfxappnd = NULL;
  1125. // first handle the special case of 0 length prefixes
  1126. PfxEntry * pe = (PfxEntry *) pStart[0];
  1127. while (pe) {
  1128. st = pe->check_twosfx_morph(word,len,in_compound, needflag);
  1129. if (st) {
  1130. strcat(result, st);
  1131. free(st);
  1132. }
  1133. pe = pe->getNext();
  1134. }
  1135. // now handle the general case
  1136. unsigned char sp = *((const unsigned char *)word);
  1137. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1138. while (pptr) {
  1139. if (isSubset(pptr->getKey(),word)) {
  1140. st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
  1141. if (st) {
  1142. strcat(result, st);
  1143. free(st);
  1144. pfx = (AffEntry *)pptr;
  1145. }
  1146. pptr = pptr->getNextEQ();
  1147. } else {
  1148. pptr = pptr->getNextNE();
  1149. }
  1150. }
  1151. if (*result) return mystrdup(result);
  1152. return NULL;
  1153. }
  1154. #endif // END OF HUNSPELL_EXPERIMENTAL CODE
  1155. // Is word a non compound with a REP substitution (see checkcompoundrep)?
  1156. int AffixMgr::cpdrep_check(const char * word, int wl)
  1157. {
  1158. char candidate[MAXLNLEN];
  1159. const char * r;
  1160. int lenr, lenp;
  1161. if ((wl < 2) || !numrep) return 0;
  1162. for (int i=0; i < numrep; i++ ) {
  1163. r = word;
  1164. lenr = strlen(reptable[i].pattern2);
  1165. lenp = strlen(reptable[i].pattern);
  1166. // search every occurence of the pattern in the word
  1167. while ((r=strstr(r, reptable[i].pattern)) != NULL) {
  1168. strcpy(candidate, word);
  1169. if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
  1170. strcpy(candidate+(r-word),reptable[i].pattern2);
  1171. strcpy(candidate+(r-word)+lenr, r+lenp);
  1172. if (candidate_check(candidate,strlen(candidate))) return 1;
  1173. r++; // search for the next letter
  1174. }
  1175. }
  1176. return 0;
  1177. }
  1178. // forbid compoundings when there are special patterns at word bound
  1179. int AffixMgr::cpdpat_check(const char * word, int pos)
  1180. {
  1181. int len;
  1182. for (int i = 0; i < numcheckcpd; i++) {
  1183. if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
  1184. (len = strlen(checkcpdtable[i].pattern)) && (pos > len) &&
  1185. (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1;
  1186. }
  1187. return 0;
  1188. }
  1189. // forbid compounding with neighbouring upper and lower case characters at word bounds
  1190. int AffixMgr::cpdcase_check(const char * word, int pos)
  1191. {
  1192. if (utf8) {
  1193. w_char u, w;
  1194. const char * p;
  1195. u8_u16(&u, 1, word + pos);
  1196. for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
  1197. u8_u16(&w, 1, p);
  1198. unsigned short a = (u.h << 8) + u.l;
  1199. unsigned short b = (w.h << 8) + w.l;
  1200. if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b))) return 1;
  1201. } else {
  1202. unsigned char a = *(word + pos - 1);
  1203. unsigned char b = *(word + pos);
  1204. if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  1205. }
  1206. return 0;
  1207. }
  1208. // check compound patterns
  //
  // Stores rv at index wnum of the word list *words and tests whether the
  // list words[0..wnum] matches one of the numdefcpd patterns in defcpdtable.
  // A pattern is a sequence of flags (defcpdtable[i].def, length .len); a flag
  // may be followed by the metacharacter '*' or '?'. A word matches a flag
  // when TESTAFF finds the flag in the word's astr/alen flag list.
  //
  // words - in/out pointer to the word list; when *words is NULL the caller
  //         supplied buffer def is used instead
  // wnum  - index at which rv is stored (last index examined)
  // rv    - dictionary entry of the word being added
  // def   - fallback buffer used when *words is NULL
  // all   - when zero, a match of the words seen so far is enough; when
  //         non-zero the pattern must also be consumed completely
  //
  // Returns 1 on a match (the list keeps rv at index wnum). Returns 0
  // otherwise, after clearing that slot again and resetting *words to NULL if
  // it was NULL on entry.
  1209. int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
  1210. {
  1211. signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  1212. signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  1213. int btnum[MAXWORDLEN]; // number of words matched at each metacharacter position
  1214. short bt = 0; // number of entries on the backtracking stack
  1215. int i;
  1216. int ok; // cleared when a plain (non-metacharacter) pattern element fails
  1217. int w = 0; // set when *words was NULL on entry and had to be pointed at def
  1218. if (!*words) {
  1219. w = 1;
  1220. *words = def;
  1221. }
  1222. (*words)[wnum] = rv;
  // try every pattern in turn
  1223. for (i = 0; i < numdefcpd; i++) {
  1224. signed short pp = 0; // pattern position
  1225. signed short wp = 0; // "words" position
  1226. int ok2; // cleared when a metacharacter element did not cover the remaining words
  1227. ok = 1;
  1228. ok2 = 1;
  1229. do {
  // consume pattern elements while both pattern and words remain
  1230. while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
  // element followed by '*' or '?'
  1231. if (((pp+1) < defcpdtable[i].len) &&
  1232. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
  // '?' may take the current word only, '*' may take every word up to wnum
  1233. int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
  1234. ok2 = 1;
  1235. pp+=2; // skip the flag and its metacharacter
  // remember where this metacharacter started, for backtracking
  1236. btpp[bt] = pp;
  1237. btwp[bt] = wp;
  // greedily take words carrying the flag (def[pp-2] after the skip above)
  1238. while (wp <= wend) {
  1239. if (!(*words)[wp]->alen ||
  1240. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
  1241. ok2 = 0;
  1242. break;
  1243. }
  1244. wp++;
  1245. }
  // words are left over after this element
  1246. if (wp <= wnum) ok2 = 0;
  1247. btnum[bt] = wp - btwp[bt];
  // only keep a backtracking entry when at least one word was taken
  1248. if (btnum[bt] > 0) bt++;
  1249. if (ok2) break;
  1250. } else {
  // plain element: the current word must exist and carry the flag
  1251. ok2 = 1;
  1252. if (!(*words)[wp] || !(*words)[wp]->alen ||
  1253. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
  1254. ok = 0;
  1255. break;
  1256. }
  1257. pp++;
  1258. wp++;
  // pattern exhausted while words remain: no match on this path
  1259. if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
  1260. }
  1261. }
  1262. if (ok && ok2) {
  // skip trailing elements that carry a metacharacter; if that exhausts
  // the pattern, it matches
  1263. int r = pp;
  1264. while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
  1265. ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
  1266. if (defcpdtable[i].len <= r) return 1;
  1267. }
  1268. // backtrack
  // give back one word from the most recent metacharacter and resume after
  // it; entries whose count drops below zero are popped
  1269. if (bt) do {
  1270. ok = 1;
  1271. btnum[bt - 1]--;
  1272. pp = btpp[bt - 1];
  1273. wp = btwp[bt - 1] + btnum[bt - 1];
  1274. } while ((btnum[bt - 1] < 0) && --bt);
  1275. } while (bt);
  // accept when the last path succeeded and, if `all` is set, the pattern was
  // consumed completely
  1276. if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  1277. // check zero ending
  // remaining elements that carry a metacharacter may be skipped
  1278. while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
  1279. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  1280. if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  1281. }
  // no pattern matched: undo the changes made to the word list
  1282. (*words)[wnum] = NULL;
  1283. if (w) *words = NULL;
  1284. return 0;
  1285. }
  1286. inline int AffixMgr::candidate_check(const char * word, int len)
  1287. {
  1288. struct hentry * rv=NULL;
  1289. rv = lookup(word);
  1290. if (rv) return 1;
  1291. // rv = prefix_check(word,len,1);
  1292. // if (rv) return 1;
  1293. rv = affix_check(word,len);
  1294. if (rv) return 1;
  1295. return 0;
  1296. }
  1297. // calculate number of syllable for compound-checking
  1298. short AffixMgr::get_syllable(const char * word, int wlen)
  1299. {
  1300. if (cpdmaxsyllable==0) return 0;
  1301. short num=0;
  1302. if (!utf8) {
  1303. for (int i=0; i<wlen; i++) {
  1304. if (strchr(cpdvowels, word[i])) num++;
  1305. }
  1306. } else if (cpdvowels_utf16) {
  1307. w_char w[MAXWORDUTF8LEN];
  1308. int i = u8_u16(w, MAXWORDUTF8LEN, word);
  1309. for (; i; i--) {
  1310. if (flag_bsearch((unsigned short *) cpdvowels_utf16,
  1311. ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
  1312. }
  1313. }
  1314. return num;
  1315. }
  1316. // check if compound word is correctly spelled
  1317. // hu_mov_rule = spec. Hungarian rule (XXX)
  1318. struct hentry * AffixMgr::compound_check(const char * word, int len,
  1319. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
  1320. char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0)
  1321. {
  1322. int i;
  1323. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1324. int oldcmpdstemnum = 0;
  1325. struct hentry * rv = NULL;
  1326. struct hentry * rv_first;
  1327. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1328. char st [MAXWORDUTF8LEN + 4];
  1329. char ch;
  1330. int cmin;
  1331. int cmax;
  1332. int checked_prefix;
  1333. #ifdef HUNSTEM
  1334. if (cmpdstemnum) {
  1335. if (wordnum == 0) {
  1336. *cmpdstemnum = 1;
  1337. } else {
  1338. (*cmpdstemnum)++;
  1339. }
  1340. }
  1341. #endif
  1342. if (utf8) {
  1343. for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
  1344. cmin++;
  1345. for (; (word[cmin] & 0xc0) == 0x80; cmin++);
  1346. }
  1347. for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
  1348. cmax--;
  1349. for (; (word[cmax] & 0xc0) == 0x80; cmax--);
  1350. }
  1351. } else {
  1352. cmin = cpdmin;
  1353. cmax = len - cpdmin + 1;
  1354. }
  1355. strcpy(st, word);
  1356. for (i = cmin; i < cmax; i++) {
  1357. oldnumsyllable = numsyllable;
  1358. oldwordnum = wordnum;
  1359. checked_prefix = 0;
  1360. // go to end of the UTF-8 character
  1361. if (utf8) {
  1362. for (; (st[i] & 0xc0) == 0x80; i++);
  1363. if (i >= cmax) return NULL;
  1364. }
  1365. ch = st[i];
  1366. st[i] = '\0';
  1367. sfx = NULL;
  1368. pfx = NULL;
  1369. // FIRST WORD
  1370. rv = lookup(st); // perhaps without prefix
  1371. // search homonym with compound flag
  1372. while ((rv) && !hu_mov_rule &&
  1373. ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
  1374. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1375. (compoundbegin && !wordnum &&
  1376. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1377. (compoundmiddle && wordnum && !words &&
  1378. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1379. (numdefcpd &&
  1380. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1381. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
  1382. ))) {
  1383. rv = rv->next_homonym;
  1384. }
  1385. if (!rv) {
  1386. if (compoundflag &&
  1387. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1388. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1389. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1390. ((SfxEntry*)sfx)->getCont() &&
  1391. ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1392. ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
  1393. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1394. ((SfxEntry*)sfx)->getContLen())))) {
  1395. rv = NULL;
  1396. }
  1397. }
  1398. if (rv ||
  1399. (((wordnum == 0) && compoundbegin &&
  1400. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_

Large files are truncated, but you can click here to view the full file