PageRenderTime 75ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 1ms

/Hunspell/src/hunspell/affixmgr.cxx

https://bitbucket.org/texniccenter/texniccenter
C++ | 4521 lines | 3606 code | 489 blank | 426 comment | 1549 complexity | 8e7e785e713ee59041c2cc3cb66fb85d MD5 | raw file
Possible License(s): LGPL-2.0, GPL-2.0, MPL-2.0-no-copyleft-exception, LGPL-2.1, GPL-3.0

Large files are truncated, but you can click here to view the full file

  1. #include "license.hunspell"
  2. #include "license.myspell"
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include <stdio.h>
  6. #include <ctype.h>
  7. #include <vector>
  8. #include "affixmgr.hxx"
  9. #include "affentry.hxx"
  10. #include "langnum.hxx"
  11. #include "csutil.hxx"
  12. AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
  13. {
  14. // register hash manager and load affix data from aff file
  15. pHMgr = ptr[0];
  16. alldic = ptr;
  17. maxdic = md;
  18. keystring = NULL;
  19. trystring = NULL;
  20. encoding=NULL;
  21. csconv=NULL;
  22. utf8 = 0;
  23. complexprefixes = 0;
  24. maptable = NULL;
  25. nummap = 0;
  26. breaktable = NULL;
  27. numbreak = -1;
  28. reptable = NULL;
  29. numrep = 0;
  30. iconvtable = NULL;
  31. oconvtable = NULL;
  32. checkcpdtable = NULL;
  33. // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
  34. simplifiedcpd = 0;
  35. numcheckcpd = 0;
  36. defcpdtable = NULL;
  37. numdefcpd = 0;
  38. phone = NULL;
  39. compoundflag = FLAG_NULL; // permits word in compound forms
  40. compoundbegin = FLAG_NULL; // may be first word in compound forms
  41. compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  42. compoundend = FLAG_NULL; // may be last word in compound forms
  43. compoundroot = FLAG_NULL; // compound word signing flag
  44. compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  45. compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  46. checkcompounddup = 0; // forbid double words in compounds
  47. checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  48. checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  49. checkcompoundtriple = 0; // forbid compounds with triple letters
  50. simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
  51. forbiddenword = FORBIDDENWORD; // forbidden word signing flag
  52. nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  53. nongramsuggest = FLAG_NULL;
  54. lang = NULL; // language
  55. langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  56. needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
  57. cpdwordmax = -1; // default: unlimited wordcount in compound words
  58. cpdmin = -1; // undefined
  59. cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  60. cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  61. cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  62. cpdvowels_utf16_len=0; // vowels
  63. pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  64. sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  65. cpdsyllablenum=NULL; // syllable count incrementing flag
  66. checknum=0; // checking numbers, and word with numbers
  67. wordchars=NULL; // letters + spec. word characters
  68. wordchars_utf16=NULL; // letters + spec. word characters
  69. wordchars_utf16_len=0; // letters + spec. word characters
  70. ignorechars=NULL; // letters + spec. word characters
  71. ignorechars_utf16=NULL; // letters + spec. word characters
  72. ignorechars_utf16_len=0; // letters + spec. word characters
  73. version=NULL; // affix and dictionary file version string
  74. havecontclass=0; // flags of possible continuing classes (double affix)
  75. // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  76. // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  77. lemma_present = FLAG_NULL;
  78. circumfix = FLAG_NULL;
  79. onlyincompound = FLAG_NULL;
  80. maxngramsugs = -1; // undefined
  81. maxdiff = -1; // undefined
  82. onlymaxdiff = 0;
  83. maxcpdsugs = -1; // undefined
  84. nosplitsugs = 0;
  85. sugswithdots = 0;
  86. keepcase = 0;
  87. forceucase = 0;
  88. warn = 0;
  89. forbidwarn = 0;
  90. checksharps = 0;
  91. substandard = FLAG_NULL;
  92. fullstrip = 0;
  93. sfx = NULL;
  94. pfx = NULL;
  95. for (int i=0; i < SETSIZE; i++) {
  96. pStart[i] = NULL;
  97. sStart[i] = NULL;
  98. pFlag[i] = NULL;
  99. sFlag[i] = NULL;
  100. }
  101. for (int j=0; j < CONTSIZE; j++) {
  102. contclasses[j] = 0;
  103. }
  104. if (parse_file(affpath, key)) {
  105. HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  106. }
  107. if (cpdmin == -1) cpdmin = MINCPDLEN;
  108. }
  109. AffixMgr::~AffixMgr()
  110. {
  111. // pass through linked prefix entries and clean up
  112. for (int i=0; i < SETSIZE ;i++) {
  113. pFlag[i] = NULL;
  114. PfxEntry * ptr = pStart[i];
  115. PfxEntry * nptr = NULL;
  116. while (ptr) {
  117. nptr = ptr->getNext();
  118. delete(ptr);
  119. ptr = nptr;
  120. nptr = NULL;
  121. }
  122. }
  123. // pass through linked suffix entries and clean up
  124. for (int j=0; j < SETSIZE ; j++) {
  125. sFlag[j] = NULL;
  126. SfxEntry * ptr = sStart[j];
  127. SfxEntry * nptr = NULL;
  128. while (ptr) {
  129. nptr = ptr->getNext();
  130. delete(ptr);
  131. ptr = nptr;
  132. nptr = NULL;
  133. }
  134. sStart[j] = NULL;
  135. }
  136. if (keystring) free(keystring);
  137. keystring=NULL;
  138. if (trystring) free(trystring);
  139. trystring=NULL;
  140. if (encoding) free(encoding);
  141. encoding=NULL;
  142. if (maptable) {
  143. for (int j=0; j < nummap; j++) {
  144. for (int k=0; k < maptable[j].len; k++) {
  145. if (maptable[j].set[k]) free(maptable[j].set[k]);
  146. }
  147. free(maptable[j].set);
  148. maptable[j].set = NULL;
  149. maptable[j].len = 0;
  150. }
  151. free(maptable);
  152. maptable = NULL;
  153. }
  154. nummap = 0;
  155. if (breaktable) {
  156. for (int j=0; j < numbreak; j++) {
  157. if (breaktable[j]) free(breaktable[j]);
  158. breaktable[j] = NULL;
  159. }
  160. free(breaktable);
  161. breaktable = NULL;
  162. }
  163. numbreak = 0;
  164. if (reptable) {
  165. for (int j=0; j < numrep; j++) {
  166. free(reptable[j].pattern);
  167. free(reptable[j].pattern2);
  168. }
  169. free(reptable);
  170. reptable = NULL;
  171. }
  172. if (iconvtable) delete iconvtable;
  173. if (oconvtable) delete oconvtable;
  174. if (phone && phone->rules) {
  175. for (int j=0; j < phone->num + 1; j++) {
  176. free(phone->rules[j * 2]);
  177. free(phone->rules[j * 2 + 1]);
  178. }
  179. free(phone->rules);
  180. free(phone);
  181. phone = NULL;
  182. }
  183. if (defcpdtable) {
  184. for (int j=0; j < numdefcpd; j++) {
  185. free(defcpdtable[j].def);
  186. defcpdtable[j].def = NULL;
  187. }
  188. free(defcpdtable);
  189. defcpdtable = NULL;
  190. }
  191. numrep = 0;
  192. if (checkcpdtable) {
  193. for (int j=0; j < numcheckcpd; j++) {
  194. free(checkcpdtable[j].pattern);
  195. free(checkcpdtable[j].pattern2);
  196. free(checkcpdtable[j].pattern3);
  197. checkcpdtable[j].pattern = NULL;
  198. checkcpdtable[j].pattern2 = NULL;
  199. checkcpdtable[j].pattern3 = NULL;
  200. }
  201. free(checkcpdtable);
  202. checkcpdtable = NULL;
  203. }
  204. numcheckcpd = 0;
  205. FREE_FLAG(compoundflag);
  206. FREE_FLAG(compoundbegin);
  207. FREE_FLAG(compoundmiddle);
  208. FREE_FLAG(compoundend);
  209. FREE_FLAG(compoundpermitflag);
  210. FREE_FLAG(compoundforbidflag);
  211. FREE_FLAG(compoundroot);
  212. FREE_FLAG(forbiddenword);
  213. FREE_FLAG(nosuggest);
  214. FREE_FLAG(nongramsuggest);
  215. FREE_FLAG(needaffix);
  216. FREE_FLAG(lemma_present);
  217. FREE_FLAG(circumfix);
  218. FREE_FLAG(onlyincompound);
  219. cpdwordmax = 0;
  220. pHMgr = NULL;
  221. cpdmin = 0;
  222. cpdmaxsyllable = 0;
  223. if (cpdvowels) free(cpdvowels);
  224. if (cpdvowels_utf16) free(cpdvowels_utf16);
  225. if (cpdsyllablenum) free(cpdsyllablenum);
  226. free_utf_tbl();
  227. if (lang) free(lang);
  228. if (wordchars) free(wordchars);
  229. if (wordchars_utf16) free(wordchars_utf16);
  230. if (ignorechars) free(ignorechars);
  231. if (ignorechars_utf16) free(ignorechars_utf16);
  232. if (version) free(version);
  233. checknum=0;
  234. #ifdef MOZILLA_CLIENT
  235. delete [] csconv;
  236. #endif
  237. }
  238. // read in aff file and build up prefix and suffix entry objects
  239. int AffixMgr::parse_file(const char * affpath, const char * key)
  240. {
  241. char * line; // io buffers
  242. char ft; // affix type
  243. // checking flag duplication
  244. char dupflags[CONTSIZE];
  245. char dupflags_ini = 1;
  246. // first line indicator for removing byte order mark
  247. int firstline = 1;
  248. // open the affix file
  249. FileMgr * afflst = new FileMgr(affpath, key);
  250. if (!afflst) {
  251. HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
  252. return 1;
  253. }
  254. // step one is to parse the affix file building up the internal
  255. // affix data structures
  256. // read in each line ignoring any that do not
  257. // start with a known line type indicator
  258. while ((line = afflst->getline())) {
  259. mychomp(line);
  260. /* remove byte order mark */
  261. if (firstline) {
  262. firstline = 0;
  263. // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
  264. if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
  265. memmove(line, line+3, strlen(line+3)+1);
  266. }
  267. }
  268. /* parse in the keyboard string */
  269. if (strncmp(line,"KEY",3) == 0) {
  270. if (parse_string(line, &keystring, afflst->getlinenum())) {
  271. delete afflst;
  272. return 1;
  273. }
  274. }
  275. /* parse in the try string */
  276. if (strncmp(line,"TRY",3) == 0) {
  277. if (parse_string(line, &trystring, afflst->getlinenum())) {
  278. delete afflst;
  279. return 1;
  280. }
  281. }
  282. /* parse in the name of the character set used by the .dict and .aff */
  283. if (strncmp(line,"SET",3) == 0) {
  284. if (parse_string(line, &encoding, afflst->getlinenum())) {
  285. delete afflst;
  286. return 1;
  287. }
  288. if (strcmp(encoding, "UTF-8") == 0) {
  289. utf8 = 1;
  290. #ifndef OPENOFFICEORG
  291. #ifndef MOZILLA_CLIENT
  292. if (initialize_utf_tbl()) return 1;
  293. #endif
  294. #endif
  295. }
  296. }
  297. /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
  298. if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
  299. complexprefixes = 1;
  300. /* parse in the flag used by the controlled compound words */
  301. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  302. if (parse_flag(line, &compoundflag, afflst)) {
  303. delete afflst;
  304. return 1;
  305. }
  306. }
  307. /* parse in the flag used by compound words */
  308. if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
  309. if (complexprefixes) {
  310. if (parse_flag(line, &compoundend, afflst)) {
  311. delete afflst;
  312. return 1;
  313. }
  314. } else {
  315. if (parse_flag(line, &compoundbegin, afflst)) {
  316. delete afflst;
  317. return 1;
  318. }
  319. }
  320. }
  321. /* parse in the flag used by compound words */
  322. if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
  323. if (parse_flag(line, &compoundmiddle, afflst)) {
  324. delete afflst;
  325. return 1;
  326. }
  327. }
  328. /* parse in the flag used by compound words */
  329. if (strncmp(line,"COMPOUNDEND",11) == 0) {
  330. if (complexprefixes) {
  331. if (parse_flag(line, &compoundbegin, afflst)) {
  332. delete afflst;
  333. return 1;
  334. }
  335. } else {
  336. if (parse_flag(line, &compoundend, afflst)) {
  337. delete afflst;
  338. return 1;
  339. }
  340. }
  341. }
  342. /* parse in the data used by compound_check() method */
  343. if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
  344. if (parse_num(line, &cpdwordmax, afflst)) {
  345. delete afflst;
  346. return 1;
  347. }
  348. }
  349. /* parse in the flag sign compounds in dictionary */
  350. if (strncmp(line,"COMPOUNDROOT",12) == 0) {
  351. if (parse_flag(line, &compoundroot, afflst)) {
  352. delete afflst;
  353. return 1;
  354. }
  355. }
  356. /* parse in the flag used by compound_check() method */
  357. if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
  358. if (parse_flag(line, &compoundpermitflag, afflst)) {
  359. delete afflst;
  360. return 1;
  361. }
  362. }
  363. /* parse in the flag used by compound_check() method */
  364. if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
  365. if (parse_flag(line, &compoundforbidflag, afflst)) {
  366. delete afflst;
  367. return 1;
  368. }
  369. }
  370. if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
  371. checkcompounddup = 1;
  372. }
  373. if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
  374. checkcompoundrep = 1;
  375. }
  376. if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
  377. checkcompoundtriple = 1;
  378. }
  379. if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
  380. simplifiedtriple = 1;
  381. }
  382. if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
  383. checkcompoundcase = 1;
  384. }
  385. if (strncmp(line,"NOSUGGEST",9) == 0) {
  386. if (parse_flag(line, &nosuggest, afflst)) {
  387. delete afflst;
  388. return 1;
  389. }
  390. }
  391. if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {
  392. if (parse_flag(line, &nongramsuggest, afflst)) {
  393. delete afflst;
  394. return 1;
  395. }
  396. }
  397. /* parse in the flag used by forbidden words */
  398. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  399. if (parse_flag(line, &forbiddenword, afflst)) {
  400. delete afflst;
  401. return 1;
  402. }
  403. }
  404. /* parse in the flag used by forbidden words */
  405. if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
  406. if (parse_flag(line, &lemma_present, afflst)) {
  407. delete afflst;
  408. return 1;
  409. }
  410. }
  411. /* parse in the flag used by circumfixes */
  412. if (strncmp(line,"CIRCUMFIX",9) == 0) {
  413. if (parse_flag(line, &circumfix, afflst)) {
  414. delete afflst;
  415. return 1;
  416. }
  417. }
  418. /* parse in the flag used by fogemorphemes */
  419. if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
  420. if (parse_flag(line, &onlyincompound, afflst)) {
  421. delete afflst;
  422. return 1;
  423. }
  424. }
  425. /* parse in the flag used by `needaffixs' */
  426. if (strncmp(line,"PSEUDOROOT",10) == 0) {
  427. if (parse_flag(line, &needaffix, afflst)) {
  428. delete afflst;
  429. return 1;
  430. }
  431. }
  432. /* parse in the flag used by `needaffixs' */
  433. if (strncmp(line,"NEEDAFFIX",9) == 0) {
  434. if (parse_flag(line, &needaffix, afflst)) {
  435. delete afflst;
  436. return 1;
  437. }
  438. }
  439. /* parse in the minimal length for words in compounds */
  440. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  441. if (parse_num(line, &cpdmin, afflst)) {
  442. delete afflst;
  443. return 1;
  444. }
  445. if (cpdmin < 1) cpdmin = 1;
  446. }
  447. /* parse in the max. words and syllables in compounds */
  448. if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
  449. if (parse_cpdsyllable(line, afflst)) {
  450. delete afflst;
  451. return 1;
  452. }
  453. }
  454. /* parse in the flag used by compound_check() method */
  455. if (strncmp(line,"SYLLABLENUM",11) == 0) {
  456. if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
  457. delete afflst;
  458. return 1;
  459. }
  460. }
  461. /* parse in the flag used by the controlled compound words */
  462. if (strncmp(line,"CHECKNUM",8) == 0) {
  463. checknum=1;
  464. }
  465. /* parse in the extra word characters */
  466. if (strncmp(line,"WORDCHARS",9) == 0) {
  467. if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
  468. delete afflst;
  469. return 1;
  470. }
  471. }
  472. /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
  473. if (strncmp(line,"IGNORE",6) == 0) {
  474. if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
  475. delete afflst;
  476. return 1;
  477. }
  478. }
  479. /* parse in the typical fault correcting table */
  480. if (strncmp(line,"REP",3) == 0) {
  481. if (parse_reptable(line, afflst)) {
  482. delete afflst;
  483. return 1;
  484. }
  485. }
  486. /* parse in the input conversion table */
  487. if (strncmp(line,"ICONV",5) == 0) {
  488. if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
  489. delete afflst;
  490. return 1;
  491. }
  492. }
  493. /* parse in the input conversion table */
  494. if (strncmp(line,"OCONV",5) == 0) {
  495. if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
  496. delete afflst;
  497. return 1;
  498. }
  499. }
  500. /* parse in the phonetic translation table */
  501. if (strncmp(line,"PHONE",5) == 0) {
  502. if (parse_phonetable(line, afflst)) {
  503. delete afflst;
  504. return 1;
  505. }
  506. }
  507. /* parse in the checkcompoundpattern table */
  508. if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
  509. if (parse_checkcpdtable(line, afflst)) {
  510. delete afflst;
  511. return 1;
  512. }
  513. }
  514. /* parse in the defcompound table */
  515. if (strncmp(line,"COMPOUNDRULE",12) == 0) {
  516. if (parse_defcpdtable(line, afflst)) {
  517. delete afflst;
  518. return 1;
  519. }
  520. }
  521. /* parse in the related character map table */
  522. if (strncmp(line,"MAP",3) == 0) {
  523. if (parse_maptable(line, afflst)) {
  524. delete afflst;
  525. return 1;
  526. }
  527. }
  528. /* parse in the word breakpoints table */
  529. if (strncmp(line,"BREAK",5) == 0) {
  530. if (parse_breaktable(line, afflst)) {
  531. delete afflst;
  532. return 1;
  533. }
  534. }
  535. /* parse in the language for language specific codes */
  536. if (strncmp(line,"LANG",4) == 0) {
  537. if (parse_string(line, &lang, afflst->getlinenum())) {
  538. delete afflst;
  539. return 1;
  540. }
  541. langnum = get_lang_num(lang);
  542. }
  543. if (strncmp(line,"VERSION",7) == 0) {
  544. for(line = line + 7; *line == ' ' || *line == '\t'; line++);
  545. version = mystrdup(line);
  546. }
  547. if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
  548. if (parse_num(line, &maxngramsugs, afflst)) {
  549. delete afflst;
  550. return 1;
  551. }
  552. }
  553. if (strncmp(line,"ONLYMAXDIFF", 11) == 0)
  554. onlymaxdiff = 1;
  555. if (strncmp(line,"MAXDIFF",7) == 0) {
  556. if (parse_num(line, &maxdiff, afflst)) {
  557. delete afflst;
  558. return 1;
  559. }
  560. }
  561. if (strncmp(line,"MAXCPDSUGS",10) == 0) {
  562. if (parse_num(line, &maxcpdsugs, afflst)) {
  563. delete afflst;
  564. return 1;
  565. }
  566. }
  567. if (strncmp(line,"NOSPLITSUGS",11) == 0) {
  568. nosplitsugs=1;
  569. }
  570. if (strncmp(line,"FULLSTRIP",9) == 0) {
  571. fullstrip=1;
  572. }
  573. if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
  574. sugswithdots=1;
  575. }
  576. /* parse in the flag used by forbidden words */
  577. if (strncmp(line,"KEEPCASE",8) == 0) {
  578. if (parse_flag(line, &keepcase, afflst)) {
  579. delete afflst;
  580. return 1;
  581. }
  582. }
  583. /* parse in the flag used by `forceucase' */
  584. if (strncmp(line,"FORCEUCASE",10) == 0) {
  585. if (parse_flag(line, &forceucase, afflst)) {
  586. delete afflst;
  587. return 1;
  588. }
  589. }
  590. /* parse in the flag used by `warn' */
  591. if (strncmp(line,"WARN",4) == 0) {
  592. if (parse_flag(line, &warn, afflst)) {
  593. delete afflst;
  594. return 1;
  595. }
  596. }
  597. if (strncmp(line,"FORBIDWARN",10) == 0) {
  598. forbidwarn=1;
  599. }
  600. /* parse in the flag used by the affix generator */
  601. if (strncmp(line,"SUBSTANDARD",11) == 0) {
  602. if (parse_flag(line, &substandard, afflst)) {
  603. delete afflst;
  604. return 1;
  605. }
  606. }
  607. if (strncmp(line,"CHECKSHARPS",11) == 0) {
  608. checksharps=1;
  609. }
  610. /* parse this affix: P - prefix, S - suffix */
  611. ft = ' ';
  612. if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
  613. if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
  614. if (ft != ' ') {
  615. if (dupflags_ini) {
  616. memset(dupflags, 0, sizeof(dupflags));
  617. dupflags_ini = 0;
  618. }
  619. if (parse_affix(line, ft, afflst, dupflags)) {
  620. delete afflst;
  621. process_pfx_tree_to_list();
  622. process_sfx_tree_to_list();
  623. return 1;
  624. }
  625. }
  626. }
  627. delete afflst;
  628. // convert affix trees to sorted list
  629. process_pfx_tree_to_list();
  630. process_sfx_tree_to_list();
  631. // now we can speed up performance greatly taking advantage of the
  632. // relationship between the affixes and the idea of "subsets".
  633. // View each prefix as a potential leading subset of another and view
  634. // each suffix (reversed) as a potential trailing subset of another.
  635. // To illustrate this relationship if we know the prefix "ab" is found in the
  636. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  637. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  638. // is a subset need be examined.
  639. // The same argument goes for suffix string that are reversed.
  640. // Then to top this off why not examine the first char of the word to quickly
  641. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  642. // be leading supersets of the first character of the word (if they exist)
  643. // To take advantage of this "subset" relationship, we need to add two links
  644. // from entry. One to take next if the current prefix is found (call it nexteq)
  645. // and one to take next if the current prefix is not found (call it nextne).
  646. // Since we have built ordered lists, all that remains is to properly initialize
  647. // the nextne and nexteq pointers that relate them
  648. process_pfx_order();
  649. process_sfx_order();
  650. /* get encoding for CHECKCOMPOUNDCASE */
  651. if (!utf8) {
  652. char * enc = get_encoding();
  653. csconv = get_current_cs(enc);
  654. free(enc);
  655. enc = NULL;
  656. char expw[MAXLNLEN];
  657. if (wordchars) {
  658. strcpy(expw, wordchars);
  659. free(wordchars);
  660. } else *expw = '\0';
  661. for (int i = 0; i <= 255; i++) {
  662. if ( (csconv[i].cupper != csconv[i].clower) &&
  663. (! strchr(expw, (char) i))) {
  664. *(expw + strlen(expw) + 1) = '\0';
  665. *(expw + strlen(expw)) = (char) i;
  666. }
  667. }
  668. wordchars = mystrdup(expw);
  669. }
  670. // default BREAK definition
  671. if (numbreak == -1) {
  672. breaktable = (char **) malloc(sizeof(char *) * 3);
  673. if (!breaktable) return 1;
  674. breaktable[0] = mystrdup("-");
  675. breaktable[1] = mystrdup("^-");
  676. breaktable[2] = mystrdup("-$");
  677. if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
  678. }
  679. return 0;
  680. }
  681. // we want to be able to quickly access prefix information
  682. // both by prefix flag, and sorted by prefix string itself
  683. // so we need to set up two indexes
  684. int AffixMgr::build_pfxtree(PfxEntry* pfxptr)
  685. {
  686. PfxEntry * ptr;
  687. PfxEntry * pptr;
  688. PfxEntry * ep = pfxptr;
  689. // get the right starting points
  690. const char * key = ep->getKey();
  691. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  692. // first index by flag which must exist
  693. ptr = pFlag[flg];
  694. ep->setFlgNxt(ptr);
  695. pFlag[flg] = ep;
  696. // handle the special case of null affix string
  697. if (strlen(key) == 0) {
  698. // always inset them at head of list at element 0
  699. ptr = pStart[0];
  700. ep->setNext(ptr);
  701. pStart[0] = ep;
  702. return 0;
  703. }
  704. // now handle the normal case
  705. ep->setNextEQ(NULL);
  706. ep->setNextNE(NULL);
  707. unsigned char sp = *((const unsigned char *)key);
  708. ptr = pStart[sp];
  709. // handle the first insert
  710. if (!ptr) {
  711. pStart[sp] = ep;
  712. return 0;
  713. }
  714. // otherwise use binary tree insertion so that a sorted
  715. // list can easily be generated later
  716. pptr = NULL;
  717. for (;;) {
  718. pptr = ptr;
  719. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  720. ptr = ptr->getNextEQ();
  721. if (!ptr) {
  722. pptr->setNextEQ(ep);
  723. break;
  724. }
  725. } else {
  726. ptr = ptr->getNextNE();
  727. if (!ptr) {
  728. pptr->setNextNE(ep);
  729. break;
  730. }
  731. }
  732. }
  733. return 0;
  734. }
  735. // we want to be able to quickly access suffix information
  736. // both by suffix flag, and sorted by the reverse of the
  737. // suffix string itself; so we need to set up two indexes
  738. int AffixMgr::build_sfxtree(SfxEntry* sfxptr)
  739. {
  740. SfxEntry * ptr;
  741. SfxEntry * pptr;
  742. SfxEntry * ep = sfxptr;
  743. /* get the right starting point */
  744. const char * key = ep->getKey();
  745. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  746. // first index by flag which must exist
  747. ptr = sFlag[flg];
  748. ep->setFlgNxt(ptr);
  749. sFlag[flg] = ep;
  750. // next index by affix string
  751. // handle the special case of null affix string
  752. if (strlen(key) == 0) {
  753. // always inset them at head of list at element 0
  754. ptr = sStart[0];
  755. ep->setNext(ptr);
  756. sStart[0] = ep;
  757. return 0;
  758. }
  759. // now handle the normal case
  760. ep->setNextEQ(NULL);
  761. ep->setNextNE(NULL);
  762. unsigned char sp = *((const unsigned char *)key);
  763. ptr = sStart[sp];
  764. // handle the first insert
  765. if (!ptr) {
  766. sStart[sp] = ep;
  767. return 0;
  768. }
  769. // otherwise use binary tree insertion so that a sorted
  770. // list can easily be generated later
  771. pptr = NULL;
  772. for (;;) {
  773. pptr = ptr;
  774. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  775. ptr = ptr->getNextEQ();
  776. if (!ptr) {
  777. pptr->setNextEQ(ep);
  778. break;
  779. }
  780. } else {
  781. ptr = ptr->getNextNE();
  782. if (!ptr) {
  783. pptr->setNextNE(ep);
  784. break;
  785. }
  786. }
  787. }
  788. return 0;
  789. }
  790. // convert from binary tree to sorted list
  791. int AffixMgr::process_pfx_tree_to_list()
  792. {
  793. for (int i=1; i< SETSIZE; i++) {
  794. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  795. }
  796. return 0;
  797. }
  798. PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)
  799. {
  800. if (ptr) {
  801. nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
  802. ptr->setNext(nptr);
  803. nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
  804. }
  805. return nptr;
  806. }
  807. // convert from binary tree to sorted list
  808. int AffixMgr:: process_sfx_tree_to_list()
  809. {
  810. for (int i=1; i< SETSIZE; i++) {
  811. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  812. }
  813. return 0;
  814. }
  815. SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)
  816. {
  817. if (ptr) {
  818. nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
  819. ptr->setNext(nptr);
  820. nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
  821. }
  822. return nptr;
  823. }
  824. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  825. // using the idea of leading subsets this time
  826. int AffixMgr::process_pfx_order()
  827. {
  828. PfxEntry* ptr;
  829. // loop through each prefix list starting point
  830. for (int i=1; i < SETSIZE; i++) {
  831. ptr = pStart[i];
  832. // look through the remainder of the list
  833. // and find next entry with affix that
  834. // the current one is not a subset of
  835. // mark that as destination for NextNE
  836. // use next in list that you are a subset
  837. // of as NextEQ
  838. for (; ptr != NULL; ptr = ptr->getNext()) {
  839. PfxEntry * nptr = ptr->getNext();
  840. for (; nptr != NULL; nptr = nptr->getNext()) {
  841. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  842. }
  843. ptr->setNextNE(nptr);
  844. ptr->setNextEQ(NULL);
  845. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  846. ptr->setNextEQ(ptr->getNext());
  847. }
  848. // now clean up by adding smart search termination strings:
  849. // if you are already a superset of the previous prefix
  850. // but not a subset of the next, search can end here
  851. // so set NextNE properly
  852. ptr = pStart[i];
  853. for (; ptr != NULL; ptr = ptr->getNext()) {
  854. PfxEntry * nptr = ptr->getNext();
  855. PfxEntry * mptr = NULL;
  856. for (; nptr != NULL; nptr = nptr->getNext()) {
  857. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  858. mptr = nptr;
  859. }
  860. if (mptr) mptr->setNextNE(NULL);
  861. }
  862. }
  863. return 0;
  864. }
  865. // initialize the SfxEntry links NextEQ and NextNE to speed searching
  866. // using the idea of leading subsets this time
  867. int AffixMgr::process_sfx_order()
  868. {
  869. SfxEntry* ptr;
  870. // loop through each prefix list starting point
  871. for (int i=1; i < SETSIZE; i++) {
  872. ptr = sStart[i];
  873. // look through the remainder of the list
  874. // and find next entry with affix that
  875. // the current one is not a subset of
  876. // mark that as destination for NextNE
  877. // use next in list that you are a subset
  878. // of as NextEQ
  879. for (; ptr != NULL; ptr = ptr->getNext()) {
  880. SfxEntry * nptr = ptr->getNext();
  881. for (; nptr != NULL; nptr = nptr->getNext()) {
  882. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  883. }
  884. ptr->setNextNE(nptr);
  885. ptr->setNextEQ(NULL);
  886. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  887. ptr->setNextEQ(ptr->getNext());
  888. }
  889. // now clean up by adding smart search termination strings:
  890. // if you are already a superset of the previous suffix
  891. // but not a subset of the next, search can end here
  892. // so set NextNE properly
  893. ptr = sStart[i];
  894. for (; ptr != NULL; ptr = ptr->getNext()) {
  895. SfxEntry * nptr = ptr->getNext();
  896. SfxEntry * mptr = NULL;
  897. for (; nptr != NULL; nptr = nptr->getNext()) {
  898. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  899. mptr = nptr;
  900. }
  901. if (mptr) mptr->setNextNE(NULL);
  902. }
  903. }
  904. return 0;
  905. }
  906. // add flags to the result for dictionary debugging
  907. void AffixMgr::debugflag(char * result, unsigned short flag) {
  908. char * st = encode_flag(flag);
  909. mystrcat(result, " ", MAXLNLEN);
  910. mystrcat(result, MORPH_FLAG, MAXLNLEN);
  911. if (st) {
  912. mystrcat(result, st, MAXLNLEN);
  913. free(st);
  914. }
  915. }
  916. // calculate the character length of the condition
  917. int AffixMgr::condlen(char * st)
  918. {
  919. int l = 0;
  920. bool group = false;
  921. for(; *st; st++) {
  922. if (*st == '[') {
  923. group = true;
  924. l++;
  925. } else if (*st == ']') group = false;
  926. else if (!group && (!utf8 ||
  927. (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
  928. }
  929. return l;
  930. }
  931. int AffixMgr::encodeit(affentry &entry, char * cs)
  932. {
  933. if (strcmp(cs,".") != 0) {
  934. entry.numconds = (char) condlen(cs);
  935. strncpy(entry.c.conds, cs, MAXCONDLEN);
  936. // long condition (end of conds padded by strncpy)
  937. if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
  938. entry.opts += aeLONGCOND;
  939. entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
  940. if (!entry.c.l.conds2) return 1;
  941. }
  942. } else {
  943. entry.numconds = 0;
  944. entry.c.conds[0] = '\0';
  945. }
  946. return 0;
  947. }
  948. // return 1 if s1 is a leading subset of s2 (dots are for infixes)
  949. inline int AffixMgr::isSubset(const char * s1, const char * s2)
  950. {
  951. while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
  952. s1++;
  953. s2++;
  954. }
  955. return (*s1 == '\0');
  956. }
  957. // check word for prefixes
  958. struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
  959. const FLAG needflag)
  960. {
  961. struct hentry * rv= NULL;
  962. pfx = NULL;
  963. pfxappnd = NULL;
  964. sfxappnd = NULL;
  965. // first handle the special case of 0 length prefixes
  966. PfxEntry * pe = pStart[0];
  967. while (pe) {
  968. if (
  969. // fogemorpheme
  970. ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
  971. (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
  972. // permit prefixes in compounds
  973. ((in_compound != IN_CPD_END) || (pe->getCont() &&
  974. (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
  975. ) {
  976. // check prefix
  977. rv = pe->checkword(word, len, in_compound, needflag);
  978. if (rv) {
  979. pfx=pe; // BUG: pfx not stateless
  980. return rv;
  981. }
  982. }
  983. pe = pe->getNext();
  984. }
  985. // now handle the general case
  986. unsigned char sp = *((const unsigned char *)word);
  987. PfxEntry * pptr = pStart[sp];
  988. while (pptr) {
  989. if (isSubset(pptr->getKey(),word)) {
  990. if (
  991. // fogemorpheme
  992. ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
  993. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
  994. // permit prefixes in compounds
  995. ((in_compound != IN_CPD_END) || (pptr->getCont() &&
  996. (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
  997. ) {
  998. // check prefix
  999. rv = pptr->checkword(word, len, in_compound, needflag);
  1000. if (rv) {
  1001. pfx=pptr; // BUG: pfx not stateless
  1002. return rv;
  1003. }
  1004. }
  1005. pptr = pptr->getNextEQ();
  1006. } else {
  1007. pptr = pptr->getNextNE();
  1008. }
  1009. }
  1010. return NULL;
  1011. }
  1012. // check word for prefixes
  1013. struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
  1014. char in_compound, const FLAG needflag)
  1015. {
  1016. struct hentry * rv= NULL;
  1017. pfx = NULL;
  1018. sfxappnd = NULL;
  1019. // first handle the special case of 0 length prefixes
  1020. PfxEntry * pe = pStart[0];
  1021. while (pe) {
  1022. rv = pe->check_twosfx(word, len, in_compound, needflag);
  1023. if (rv) return rv;
  1024. pe = pe->getNext();
  1025. }
  1026. // now handle the general case
  1027. unsigned char sp = *((const unsigned char *)word);
  1028. PfxEntry * pptr = pStart[sp];
  1029. while (pptr) {
  1030. if (isSubset(pptr->getKey(),word)) {
  1031. rv = pptr->check_twosfx(word, len, in_compound, needflag);
  1032. if (rv) {
  1033. pfx = pptr;
  1034. return rv;
  1035. }
  1036. pptr = pptr->getNextEQ();
  1037. } else {
  1038. pptr = pptr->getNextNE();
  1039. }
  1040. }
  1041. return NULL;
  1042. }
  1043. // check word for prefixes
  1044. char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
  1045. const FLAG needflag)
  1046. {
  1047. char * st;
  1048. char result[MAXLNLEN];
  1049. result[0] = '\0';
  1050. pfx = NULL;
  1051. sfxappnd = NULL;
  1052. // first handle the special case of 0 length prefixes
  1053. PfxEntry * pe = pStart[0];
  1054. while (pe) {
  1055. st = pe->check_morph(word,len,in_compound, needflag);
  1056. if (st) {
  1057. mystrcat(result, st, MAXLNLEN);
  1058. free(st);
  1059. }
  1060. // if (rv) return rv;
  1061. pe = pe->getNext();
  1062. }
  1063. // now handle the general case
  1064. unsigned char sp = *((const unsigned char *)word);
  1065. PfxEntry * pptr = pStart[sp];
  1066. while (pptr) {
  1067. if (isSubset(pptr->getKey(),word)) {
  1068. st = pptr->check_morph(word,len,in_compound, needflag);
  1069. if (st) {
  1070. // fogemorpheme
  1071. if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
  1072. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
  1073. mystrcat(result, st, MAXLNLEN);
  1074. pfx = pptr;
  1075. }
  1076. free(st);
  1077. }
  1078. pptr = pptr->getNextEQ();
  1079. } else {
  1080. pptr = pptr->getNextNE();
  1081. }
  1082. }
  1083. if (*result) return mystrdup(result);
  1084. return NULL;
  1085. }
  1086. // check word for prefixes
  1087. char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
  1088. char in_compound, const FLAG needflag)
  1089. {
  1090. char * st;
  1091. char result[MAXLNLEN];
  1092. result[0] = '\0';
  1093. pfx = NULL;
  1094. sfxappnd = NULL;
  1095. // first handle the special case of 0 length prefixes
  1096. PfxEntry * pe = pStart[0];
  1097. while (pe) {
  1098. st = pe->check_twosfx_morph(word,len,in_compound, needflag);
  1099. if (st) {
  1100. mystrcat(result, st, MAXLNLEN);
  1101. free(st);
  1102. }
  1103. pe = pe->getNext();
  1104. }
  1105. // now handle the general case
  1106. unsigned char sp = *((const unsigned char *)word);
  1107. PfxEntry * pptr = pStart[sp];
  1108. while (pptr) {
  1109. if (isSubset(pptr->getKey(),word)) {
  1110. st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
  1111. if (st) {
  1112. mystrcat(result, st, MAXLNLEN);
  1113. free(st);
  1114. pfx = pptr;
  1115. }
  1116. pptr = pptr->getNextEQ();
  1117. } else {
  1118. pptr = pptr->getNextNE();
  1119. }
  1120. }
  1121. if (*result) return mystrdup(result);
  1122. return NULL;
  1123. }
  1124. // Is word a non compound with a REP substitution (see checkcompoundrep)?
  1125. int AffixMgr::cpdrep_check(const char * word, int wl)
  1126. {
  1127. char candidate[MAXLNLEN];
  1128. const char * r;
  1129. int lenr, lenp;
  1130. if ((wl < 2) || !numrep) return 0;
  1131. for (int i=0; i < numrep; i++ ) {
  1132. r = word;
  1133. lenr = strlen(reptable[i].pattern2);
  1134. lenp = strlen(reptable[i].pattern);
  1135. // search every occurence of the pattern in the word
  1136. while ((r=strstr(r, reptable[i].pattern)) != NULL) {
  1137. strcpy(candidate, word);
  1138. if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
  1139. strcpy(candidate+(r-word),reptable[i].pattern2);
  1140. strcpy(candidate+(r-word)+lenr, r+lenp);
  1141. if (candidate_check(candidate,strlen(candidate))) return 1;
  1142. r++; // search for the next letter
  1143. }
  1144. }
  1145. return 0;
  1146. }
  1147. // forbid compoundings when there are special patterns at word bound
  1148. int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char affixed)
  1149. {
  1150. int len;
  1151. for (int i = 0; i < numcheckcpd; i++) {
  1152. if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
  1153. (!r1 || !checkcpdtable[i].cond ||
  1154. (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
  1155. (!r2 || !checkcpdtable[i].cond2 ||
  1156. (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
  1157. // zero length pattern => only TESTAFF
  1158. // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
  1159. (!*(checkcpdtable[i].pattern) || (
  1160. (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
  1161. (*(checkcpdtable[i].pattern)!='0' && (len = strlen(checkcpdtable[i].pattern)) &&
  1162. strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {
  1163. return 1;
  1164. }
  1165. }
  1166. return 0;
  1167. }
  1168. // forbid compounding with neighbouring upper and lower case characters at word bounds
  1169. int AffixMgr::cpdcase_check(const char * word, int pos)
  1170. {
  1171. if (utf8) {
  1172. w_char u, w;
  1173. const char * p;
  1174. u8_u16(&u, 1, word + pos);
  1175. for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
  1176. u8_u16(&w, 1, p);
  1177. unsigned short a = (u.h << 8) + u.l;
  1178. unsigned short b = (w.h << 8) + w.l;
  1179. if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
  1180. (a != '-') && (b != '-')) return 1;
  1181. } else {
  1182. unsigned char a = *(word + pos - 1);
  1183. unsigned char b = *(word + pos);
  1184. if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  1185. }
  1186. return 0;
  1187. }
  1188. // check compound patterns
  // Tests whether the word sequence (*words)[0..wnum], with rv stored at
  // index wnum, matches one of the numdefcpd rules held in defcpdtable.
  //   words - in/out: address of the word-array pointer; when *words is NULL
  //           the buffer `def` is substituted for it
  //   wnum  - index at which rv is stored in the array
  //   rv    - entry of the newest word; its flag list (astr/alen) is tested
  //   def   - fallback array used when *words is NULL
  //   all   - when 0, a rule is accepted once all words were matched even if
  //           part of the rule was not consumed; otherwise the rule has to be
  //           consumed (trailing '*' / '?' elements may match zero words)
  // Returns 1 on a match; the slot (*words)[wnum] then keeps rv and *words
  // keeps pointing at `def` if it was substituted. Returns 0 otherwise; on
  // every 0 return taken after rv was stored, the slot is cleared again and
  // a substituted *words is reset to NULL.
  1189. int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
  1190. {
  1191. signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  1192. signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  1193. int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
  1194. short bt = 0;
  1195. int i, j;
  1196. int ok;
  1197. int w = 0;
  // no array supplied: fall back to `def` and remember that (w) so the
  // substitution can be undone on failure
  1198. if (!*words) {
  1199. w = 1;
  1200. *words = def;
  1201. }
  1202. if (!*words) {
  1203. return 0;
  1204. }
  1205. (*words)[wnum] = rv;
  1206. // has the last word COMPOUNDRULE flag?
  // an entry without any flags cannot match a rule element
  1207. if (rv->alen == 0) {
  1208. (*words)[wnum] = NULL;
  1209. if (w) *words = NULL;
  1210. return 0;
  1211. }
  // quick rejection: ok becomes 1 only if some non-metacharacter element of
  // some rule is among the flags of rv
  1212. ok = 0;
  1213. for (i = 0; i < numdefcpd; i++) {
  1214. for (j = 0; j < defcpdtable[i].len; j++) {
  1215. if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
  1216. TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1;
  1217. }
  1218. }
  1219. if (ok == 0) {
  1220. (*words)[wnum] = NULL;
  1221. if (w) *words = NULL;
  1222. return 0;
  1223. }
  // try every rule in turn; pp walks the rule, wp walks the words
  1224. for (i = 0; i < numdefcpd; i++) {
  1225. signed short pp = 0; // pattern position
  1226. signed short wp = 0; // "words" position
  1227. int ok2;
  1228. ok = 1;
  1229. ok2 = 1;
  1230. do {
  1231. while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
  // element followed by a metacharacter: '?' may take at most the current
  // word (wend == wp), '*' may take words up to wnum
  1232. if (((pp+1) < defcpdtable[i].len) &&
  1233. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
  1234. int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
  1235. ok2 = 1;
  1236. pp+=2;
  // remember where to resume (after the metacharacter) and where the run of
  // words for this element started
  1237. btpp[bt] = pp;
  1238. btwp[bt] = wp;
  // consume words while they carry the flag of the element (def[pp-2])
  1239. while (wp <= wend) {
  1240. if (!(*words)[wp]->alen ||
  1241. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
  1242. ok2 = 0;
  1243. break;
  1244. }
  1245. wp++;
  1246. }
  // words are left over after this element
  1247. if (wp <= wnum) ok2 = 0;
  // keep the backtracking record only if at least one word was consumed
  1248. btnum[bt] = wp - btwp[bt];
  1249. if (btnum[bt] > 0) bt++;
  1250. if (ok2) break;
  1251. } else {
  // plain element: the current word must exist and carry its flag
  1252. ok2 = 1;
  1253. if (!(*words)[wp] || !(*words)[wp]->alen ||
  1254. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
  1255. ok = 0;
  1256. break;
  1257. }
  1258. pp++;
  1259. wp++;
  // rule exhausted although words remain
  1260. if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
  1261. }
  1262. }
  // matched so far: accept if everything left in the rule consists of
  // element + metacharacter pairs
  1263. if (ok && ok2) {
  1264. int r = pp;
  1265. while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
  1266. ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
  1267. if (defcpdtable[i].len <= r) return 1;
  1268. }
  1269. // backtrack
  // give the most recent record one word less and resume after its
  // metacharacter; records whose count drops below zero are popped
  1270. if (bt) do {
  1271. ok = 1;
  1272. btnum[bt - 1]--;
  1273. pp = btpp[bt - 1];
  1274. wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
  1275. } while ((btnum[bt - 1] < 0) && --bt);
  1276. } while (bt);
  // with all == 0 a match of all words is enough; otherwise the rule must
  // have been consumed completely
  1277. if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  1278. // check zero ending
  // skip trailing element + metacharacter pairs, then test for full consumption
  1279. while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
  1280. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  1281. if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  1282. }
  // no rule matched: undo the store of rv and the substitution of `def`
  1283. (*words)[wnum] = NULL;
  1284. if (w) *words = NULL;
  1285. return 0;
  1286. }
  1287. inline int AffixMgr::candidate_check(const char * word, int len)
  1288. {
  1289. struct hentry * rv=NULL;
  1290. rv = lookup(word);
  1291. if (rv) return 1;
  1292. // rv = prefix_check(word,len,1);
  1293. // if (rv) return 1;
  1294. rv = affix_check(word,len);
  1295. if (rv) return 1;
  1296. return 0;
  1297. }
  1298. // calculate number of syllable for compound-checking
  1299. short AffixMgr::get_syllable(const char * word, int wlen)
  1300. {
  1301. if (cpdmaxsyllable==0) return 0;
  1302. short num=0;
  1303. if (!utf8) {
  1304. for (int i=0; i<wlen; i++) {
  1305. if (strchr(cpdvowels, word[i])) num++;
  1306. }
  1307. } else if (cpdvowels_utf16) {
  1308. w_char w[MAXWORDUTF8LEN];
  1309. int i = u8_u16(w, MAXWORDUTF8LEN, word);
  1310. for (; i > 0; i--) {
  1311. if (flag_bsearch((unsigned short *) cpdvowels_utf16,
  1312. ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
  1313. }
  1314. }
  1315. return num;
  1316. }
  1317. void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
  1318. if (utf8) {
  1319. int i;
  1320. for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
  1321. for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
  1322. }
  1323. for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
  1324. for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
  1325. }
  1326. } else {
  1327. *cmin = cpdmin;
  1328. *cmax = len - cpdmin + 1;
  1329. }
  1330. }
  1331. // check if compound word is correctly spelled
  1332. // hu_mov_rule = spec. Hungarian rule (XXX)
  1333. struct hentry * AffixMgr::compound_check(const char * word, int len,
  1334. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
  1335. char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)
  1336. {
  1337. int i;
  1338. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1339. struct hentry * rv = NULL;
  1340. struct hentry * rv_first;
  1341. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1342. char st [MAXWORDUTF8LEN + 4];
  1343. char ch = '\0';
  1344. int cmin;
  1345. int cmax;
  1346. int striple = 0;
  1347. int scpd = 0;
  1348. int soldi = 0;
  1349. int oldcmin = 0;
  1350. int oldcmax = 0;
  1351. int oldlen = 0;
  1352. int checkedstriple = 0;
  1353. int onlycpdrule;
  1354. int affixed = 0;
  1355. hentry ** oldwords = words;
  1356. int checked_prefix;
  1357. setcminmax(&cmin, &cmax, word, len);
  1358. strcpy(st, word);
  1359. for (i = cmin; i < cmax; i++) {
  1360. // go to end of the UTF-8 character
  1361. if (utf8) {
  1362. for (; (st[i] & 0xc0) == 0x80; i++);
  1363. if (i >= cmax) return NULL;
  1364. }
  1365. words = oldwords;
  1366. onlycpdrule = (words) ? 1 : 0;
  1367. do { // onlycpdrule loop
  1368. oldnumsyllable = numsyllable;
  1369. oldwordnum = wordnum;
  1370. checked_prefix = 0;
  1371. do { // simplified checkcompoundpattern loop
  1372. if (scpd > 0) {
  1373. for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
  1374. strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);
  1375. if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
  1376. strcpy(st + i, checkcpdtable[scpd-1].pattern);
  1377. soldi = i;
  1378. i += strlen(checkcpdtable[scpd-1].pattern);
  1379. strcpy(st + i, checkcpdtable[scpd-1].pattern2);
  1380. strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));
  1381. oldlen = len;
  1382. len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
  1383. oldcmin = cmin;
  1384. oldcmax = cmax;
  1385. setcminmax(&cmin, &cmax, st, len);
  1386. cmax = len - cpdmin + 1;
  1387. }
  1388. ch = st[i];
  1389. st[i] = '\0';
  1390. sfx = NULL;
  1391. pfx = NULL;
  1392. // FIRST WORD
  1393. affixed = 1;
  1394. rv = lookup(st); // perhaps without prefix
  1395. // search homonym with compound flag
  1396. while ((rv) && !hu_mov_rule &&
  1397. ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1398. !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1399. (compoundbegin && !wordnum && !onlycpdrule &&
  1400. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1401. (compoundmiddle && wordnum && !words && !onlycpdrule &&
  1402. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1403. (numdefcpd && onlycpdrule &&
  1404. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1405. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) ||
  1406. (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL &&
  1407. !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)))
  1408. ) {
  1409. rv = rv->next_homonym;
  1410. }
  1411. if (rv) affixed = 0;
  1412. if (!rv) {
  1413. if (onlycpdrule) break;
  1414. if (compoundflag &&
  1415. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1416. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1417. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1418. sfx->getCont() &&
  1419. ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
  1420. sfx->getContLen())) || (compoundend &&
  1421. TESTAFF(sfx->getCont(), compoundend,
  1422. sfx->getContLen())))) {
  1423. rv = NULL;
  1424. }
  1425. }
  1426. if (rv ||
  1427. (((wordnum == 0) && compoundbegin &&
  1428. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1429. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1430. ((wordnum > 0) && compoundmiddle &&
  1431. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1432. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1433. ) checked_prefix = 1;
  1434. // else check forbiddenwords and needaffix
  1435. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1436. TESTAFF(rv->astr, needaffix, rv->alen) ||
  1437. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
  1438. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
  1439. )) {
  1440. st[i] = ch;
  1441. //continue;
  1442. break;
  1443. }
  1444. // check non_compound flag in suffix and prefix
  1445. if ((rv) && !hu_mov_rule &&
  1446. ((pfx && pfx->getCont() &&
  1447. TESTAFF(pfx->getCont(), compoundforbidflag,
  1448. pfx->getContLen())) ||

Large files are truncated, but you can click here to view the full file