PageRenderTime 139ms CodeModel.GetById 3ms RepoModel.GetById 0ms app.codeStats 0ms

/ext/hunspell/affixmgr.cxx

https://github.com/csware/test
C++ | 3976 lines | 3145 code | 418 blank | 413 comment | 1291 complexity | 1cd26bf8aa3cdc888a4490b61d8d99f2 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.0, GPL-3.0, LGPL-3.0, MPL-2.0-no-copyleft-exception
  1. #include "license.hunspell"
  2. #include "license.myspell"
  3. #ifndef MOZILLA_CLIENT
  4. #include <cstdlib>
  5. #include <cstring>
  6. #include <cctype>
  7. #include <cstdio>
  8. #else
  9. #include <stdlib.h>
  10. #include <string.h>
  11. #include <stdio.h>
  12. #include <ctype.h>
  13. #endif
  14. #include "affixmgr.hxx"
  15. #include "affentry.hxx"
  16. #include "langnum.hxx"
  17. #include "csutil.hxx"
  18. #ifndef MOZILLA_CLIENT
  19. #ifndef W32
  20. using namespace std;
  21. #endif
  22. #endif
  23. AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
  24. {
  25. // register hash manager and load affix data from aff file
  26. pHMgr = ptr;
  27. trystring = NULL;
  28. encoding=NULL;
  29. utf8 = 0;
  30. complexprefixes = 0;
  31. maptable = NULL;
  32. nummap = 0;
  33. breaktable = NULL;
  34. numbreak = 0;
  35. reptable = NULL;
  36. numrep = 0;
  37. checkcpdtable = NULL;
  38. numcheckcpd = 0;
  39. defcpdtable = NULL;
  40. numdefcpd = 0;
  41. compoundflag = FLAG_NULL; // permits word in compound forms
  42. compoundbegin = FLAG_NULL; // may be first word in compound forms
  43. compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  44. compoundend = FLAG_NULL; // may be last word in compound forms
  45. compoundroot = FLAG_NULL; // compound word signing flag
  46. compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  47. compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  48. checkcompounddup = 0; // forbid double words in compounds
  49. checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  50. checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  51. checkcompoundtriple = 0; // forbid compounds with triple letters
  52. forbiddenword = FLAG_NULL; // forbidden word signing flag
  53. nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  54. lang = NULL; // language
  55. langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  56. pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes
  57. cpdwordmax = -1; // default: unlimited wordcount in compound words
  58. cpdmin = -1; // undefined
  59. cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  60. cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  61. cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  62. cpdvowels_utf16_len=0; // vowels
  63. pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  64. sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  65. cpdsyllablenum=NULL; // syllable count incrementing flag
  66. checknum=0; // checking numbers, and word with numbers
  67. wordchars=NULL; // letters + spec. word characters
  68. wordchars_utf16=NULL; // letters + spec. word characters
  69. wordchars_utf16_len=0; // letters + spec. word characters
  70. ignorechars=NULL; // letters + spec. word characters
  71. ignorechars_utf16=NULL; // letters + spec. word characters
  72. ignorechars_utf16_len=0; // letters + spec. word characters
  73. version=NULL; // affix and dictionary file version string
  74. havecontclass=0; // flags of possible continuing classes (double affix)
  75. // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  76. // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  77. lemma_present = FLAG_NULL;
  78. circumfix = FLAG_NULL;
  79. onlyincompound = FLAG_NULL;
  80. flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file
  81. maxngramsugs = -1; // undefined
  82. nosplitsugs = 0;
  83. sugswithdots = 0;
  84. keepcase = 0;
  85. checksharps = 0;
  86. derived = NULL; // XXX not threadsafe variable for experimental stemming
  87. sfx = NULL;
  88. pfx = NULL;
  89. for (int i=0; i < SETSIZE; i++) {
  90. pStart[i] = NULL;
  91. sStart[i] = NULL;
  92. pFlag[i] = NULL;
  93. sFlag[i] = NULL;
  94. }
  95. for (int j=0; j < CONTSIZE; j++) {
  96. contclasses[j] = 0;
  97. }
  98. if (parse_file(affpath)) {
  99. HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  100. wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM");
  101. }
  102. if (cpdmin == -1) cpdmin = MINCPDLEN;
  103. }
  104. AffixMgr::~AffixMgr()
  105. {
  106. // pass through linked prefix entries and clean up
  107. for (int i=0; i < SETSIZE ;i++) {
  108. pFlag[i] = NULL;
  109. PfxEntry * ptr = (PfxEntry *)pStart[i];
  110. PfxEntry * nptr = NULL;
  111. while (ptr) {
  112. nptr = ptr->getNext();
  113. delete(ptr);
  114. ptr = nptr;
  115. nptr = NULL;
  116. }
  117. }
  118. // pass through linked suffix entries and clean up
  119. for (int j=0; j < SETSIZE ; j++) {
  120. sFlag[j] = NULL;
  121. SfxEntry * ptr = (SfxEntry *)sStart[j];
  122. SfxEntry * nptr = NULL;
  123. while (ptr) {
  124. nptr = ptr->getNext();
  125. delete(ptr);
  126. ptr = nptr;
  127. nptr = NULL;
  128. }
  129. sStart[j] = NULL;
  130. }
  131. if (trystring) free(trystring);
  132. trystring=NULL;
  133. if (encoding) free(encoding);
  134. encoding=NULL;
  135. if (maptable) {
  136. for (int j=0; j < nummap; j++) {
  137. if (maptable[j].set) free(maptable[j].set);
  138. if (maptable[j].set_utf16) free(maptable[j].set_utf16);
  139. maptable[j].set = NULL;
  140. maptable[j].len = 0;
  141. }
  142. free(maptable);
  143. maptable = NULL;
  144. }
  145. nummap = 0;
  146. if (breaktable) {
  147. for (int j=0; j < numbreak; j++) {
  148. if (breaktable[j]) free(breaktable[j]);
  149. breaktable[j] = NULL;
  150. }
  151. free(breaktable);
  152. breaktable = NULL;
  153. }
  154. numbreak = 0;
  155. if (reptable) {
  156. for (int j=0; j < numrep; j++) {
  157. free(reptable[j].pattern);
  158. free(reptable[j].pattern2);
  159. reptable[j].pattern = NULL;
  160. reptable[j].pattern2 = NULL;
  161. }
  162. free(reptable);
  163. reptable = NULL;
  164. }
  165. if (defcpdtable) {
  166. for (int j=0; j < numdefcpd; j++) {
  167. free(defcpdtable[j].def);
  168. defcpdtable[j].def = NULL;
  169. }
  170. free(defcpdtable);
  171. defcpdtable = NULL;
  172. }
  173. numrep = 0;
  174. if (checkcpdtable) {
  175. for (int j=0; j < numcheckcpd; j++) {
  176. free(checkcpdtable[j].pattern);
  177. free(checkcpdtable[j].pattern2);
  178. checkcpdtable[j].pattern = NULL;
  179. checkcpdtable[j].pattern2 = NULL;
  180. }
  181. free(checkcpdtable);
  182. checkcpdtable = NULL;
  183. }
  184. numcheckcpd = 0;
  185. FREE_FLAG(compoundflag);
  186. FREE_FLAG(compoundbegin);
  187. FREE_FLAG(compoundmiddle);
  188. FREE_FLAG(compoundend);
  189. FREE_FLAG(compoundpermitflag);
  190. FREE_FLAG(compoundforbidflag);
  191. FREE_FLAG(compoundroot);
  192. FREE_FLAG(forbiddenword);
  193. FREE_FLAG(nosuggest);
  194. FREE_FLAG(pseudoroot);
  195. FREE_FLAG(lemma_present);
  196. FREE_FLAG(circumfix);
  197. FREE_FLAG(onlyincompound);
  198. cpdwordmax = 0;
  199. pHMgr = NULL;
  200. cpdmin = 0;
  201. cpdmaxsyllable = 0;
  202. if (cpdvowels) free(cpdvowels);
  203. if (cpdvowels_utf16) free(cpdvowels_utf16);
  204. if (cpdsyllablenum) free(cpdsyllablenum);
  205. free_utf_tbl();
  206. if (lang) free(lang);
  207. if (wordchars) free(wordchars);
  208. if (wordchars_utf16) free(wordchars_utf16);
  209. if (ignorechars) free(ignorechars);
  210. if (ignorechars_utf16) free(ignorechars_utf16);
  211. if (version) free(version);
  212. if (derived) free(derived);
  213. checknum=0;
  214. }
  215. // read in aff file and build up prefix and suffix entry objects
  216. int AffixMgr::parse_file(const char * affpath)
  217. {
  218. // io buffers
  219. char line[MAXLNLEN+1];
  220. // affix type
  221. char ft;
  222. // checking flag duplication
  223. char dupflags[CONTSIZE];
  224. char dupflags_ini = 1;
  225. // first line indicator for removing byte order mark
  226. int firstline = 1;
  227. // open the affix file
  228. FILE * afflst;
  229. afflst = fopen(affpath,"r");
  230. if (!afflst) {
  231. HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
  232. return 1;
  233. }
  234. // step one is to parse the affix file building up the internal
  235. // affix data structures
  236. // read in each line ignoring any that do not
  237. // start with a known line type indicator
  238. while (fgets(line,MAXLNLEN,afflst)) {
  239. mychomp(line);
  240. /* remove byte order mark */
  241. if (firstline) {
  242. firstline = 0;
  243. if (strncmp(line,"",3) == 0) {
  244. memmove(line, line+3, strlen(line+3)+1);
  245. HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
  246. }
  247. }
  248. /* parse in the try string */
  249. if (strncmp(line,"TRY",3) == 0) {
  250. if (parse_string(line, &trystring, "TRY")) {
  251. fclose(afflst);
  252. return 1;
  253. }
  254. }
  255. /* parse in the name of the character set used by the .dict and .aff */
  256. if (strncmp(line,"SET",3) == 0) {
  257. if (parse_string(line, &encoding, "SET")) {
  258. fclose(afflst);
  259. return 1;
  260. }
  261. if (strcmp(encoding, "UTF-8") == 0) {
  262. utf8 = 1;
  263. #ifndef OPENOFFICEORG
  264. #ifndef MOZILLA_CLIENT
  265. if (initialize_utf_tbl()) {
  266. fclose(afflst);
  267. return 1;
  268. }
  269. #endif
  270. #endif
  271. }
  272. }
  273. /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
  274. if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
  275. complexprefixes = 1;
  276. /* parse in the flag used by the controlled compound words */
  277. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  278. if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) {
  279. fclose(afflst);
  280. return 1;
  281. }
  282. }
  283. /* parse in the flag used by compound words */
  284. if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
  285. if (complexprefixes) {
  286. if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) {
  287. fclose(afflst);
  288. return 1;
  289. }
  290. } else {
  291. if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) {
  292. fclose(afflst);
  293. return 1;
  294. }
  295. }
  296. }
  297. /* parse in the flag used by compound words */
  298. if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
  299. if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) {
  300. fclose(afflst);
  301. return 1;
  302. }
  303. }
  304. /* parse in the flag used by compound words */
  305. if (strncmp(line,"COMPOUNDEND",11) == 0) {
  306. if (complexprefixes) {
  307. if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) {
  308. fclose(afflst);
  309. return 1;
  310. }
  311. } else {
  312. if (parse_flag(line, &compoundend, "COMPOUNDEND")) {
  313. fclose(afflst);
  314. return 1;
  315. }
  316. }
  317. }
  318. /* parse in the data used by compound_check() method */
  319. if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
  320. if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) {
  321. fclose(afflst);
  322. return 1;
  323. }
  324. }
  325. /* parse in the flag sign compounds in dictionary */
  326. if (strncmp(line,"COMPOUNDROOT",12) == 0) {
  327. if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) {
  328. fclose(afflst);
  329. return 1;
  330. }
  331. }
  332. /* parse in the flag used by compound_check() method */
  333. if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
  334. if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) {
  335. fclose(afflst);
  336. return 1;
  337. }
  338. }
  339. /* parse in the flag used by compound_check() method */
  340. if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
  341. if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) {
  342. fclose(afflst);
  343. return 1;
  344. }
  345. }
  346. if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
  347. checkcompounddup = 1;
  348. }
  349. if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
  350. checkcompoundrep = 1;
  351. }
  352. if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
  353. checkcompoundtriple = 1;
  354. }
  355. if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
  356. checkcompoundcase = 1;
  357. }
  358. if (strncmp(line,"NOSUGGEST",9) == 0) {
  359. if (parse_flag(line, &nosuggest, "NOSUGGEST")) {
  360. fclose(afflst);
  361. return 1;
  362. }
  363. }
  364. /* parse in the flag used by forbidden words */
  365. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  366. if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) {
  367. fclose(afflst);
  368. return 1;
  369. }
  370. }
  371. /* parse in the flag used by forbidden words */
  372. if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
  373. if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) {
  374. fclose(afflst);
  375. return 1;
  376. }
  377. }
  378. /* parse in the flag used by circumfixes */
  379. if (strncmp(line,"CIRCUMFIX",9) == 0) {
  380. if (parse_flag(line, &circumfix, "CIRCUMFIX")) {
  381. fclose(afflst);
  382. return 1;
  383. }
  384. }
  385. /* parse in the flag used by fogemorphemes */
  386. if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
  387. if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) {
  388. fclose(afflst);
  389. return 1;
  390. }
  391. }
  392. /* parse in the flag used by `pseudoroots' */
  393. if (strncmp(line,"PSEUDOROOT",10) == 0) {
  394. if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) {
  395. fclose(afflst);
  396. return 1;
  397. }
  398. }
  399. /* parse in the flag used by `pseudoroots' */
  400. if (strncmp(line,"NEEDAFFIX",9) == 0) {
  401. if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) {
  402. fclose(afflst);
  403. return 1;
  404. }
  405. }
  406. /* parse in the minimal length for words in compounds */
  407. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  408. if (parse_num(line, &cpdmin, "COMPOUNDMIN")) {
  409. fclose(afflst);
  410. return 1;
  411. }
  412. if (cpdmin < 1) cpdmin = 1;
  413. }
  414. /* parse in the max. words and syllables in compounds */
  415. if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
  416. if (parse_cpdsyllable(line)) {
  417. fclose(afflst);
  418. return 1;
  419. }
  420. }
  421. /* parse in the flag used by compound_check() method */
  422. if (strncmp(line,"SYLLABLENUM",11) == 0) {
  423. if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) {
  424. fclose(afflst);
  425. return 1;
  426. }
  427. }
  428. /* parse in the flag used by the controlled compound words */
  429. if (strncmp(line,"CHECKNUM",8) == 0) {
  430. checknum=1;
  431. }
  432. /* parse in the extra word characters */
  433. if (strncmp(line,"WORDCHARS",9) == 0) {
  434. if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) {
  435. fclose(afflst);
  436. return 1;
  437. }
  438. }
  439. /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
  440. if (strncmp(line,"IGNORE",6) == 0) {
  441. if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
  442. fclose(afflst);
  443. return 1;
  444. }
  445. }
  446. /* parse in the typical fault correcting table */
  447. if (strncmp(line,"REP",3) == 0) {
  448. if (parse_reptable(line, afflst)) {
  449. fclose(afflst);
  450. return 1;
  451. }
  452. }
  453. /* parse in the checkcompoundpattern table */
  454. if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
  455. if (parse_checkcpdtable(line, afflst)) {
  456. fclose(afflst);
  457. return 1;
  458. }
  459. }
  460. /* parse in the defcompound table */
  461. if (strncmp(line,"COMPOUNDRULE",12) == 0) {
  462. if (parse_defcpdtable(line, afflst)) {
  463. fclose(afflst);
  464. return 1;
  465. }
  466. }
  467. /* parse in the related character map table */
  468. if (strncmp(line,"MAP",3) == 0) {
  469. if (parse_maptable(line, afflst)) {
  470. fclose(afflst);
  471. return 1;
  472. }
  473. }
  474. /* parse in the word breakpoints table */
  475. if (strncmp(line,"BREAK",5) == 0) {
  476. if (parse_breaktable(line, afflst)) {
  477. fclose(afflst);
  478. return 1;
  479. }
  480. }
  481. /* parse in the language for language specific codes */
  482. if (strncmp(line,"LANG",4) == 0) {
  483. if (parse_string(line, &lang, "LANG")) {
  484. fclose(afflst);
  485. return 1;
  486. }
  487. langnum = get_lang_num(lang);
  488. }
  489. if (strncmp(line,"VERSION",7) == 0) {
  490. if (parse_string(line, &version, "VERSION")) {
  491. fclose(afflst);
  492. return 1;
  493. }
  494. }
  495. if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
  496. if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) {
  497. fclose(afflst);
  498. return 1;
  499. }
  500. }
  501. if (strncmp(line,"NOSPLITSUGS",11) == 0) {
  502. nosplitsugs=1;
  503. }
  504. if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
  505. sugswithdots=1;
  506. }
  507. /* parse in the flag used by forbidden words */
  508. if (strncmp(line,"KEEPCASE",8) == 0) {
  509. if (parse_flag(line, &keepcase, "KEEPCASE")) {
  510. fclose(afflst);
  511. return 1;
  512. }
  513. }
  514. if (strncmp(line,"CHECKSHARPS",11) == 0) {
  515. checksharps=1;
  516. }
  517. /* parse this affix: P - prefix, S - suffix */
  518. ft = ' ';
  519. if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
  520. if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
  521. if (ft != ' ') {
  522. if (dupflags_ini) {
  523. for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0;
  524. dupflags_ini = 0;
  525. }
  526. if (parse_affix(line, ft, afflst, dupflags)) {
  527. fclose(afflst);
  528. process_pfx_tree_to_list();
  529. process_sfx_tree_to_list();
  530. return 1;
  531. }
  532. }
  533. }
  534. fclose(afflst);
  535. // convert affix trees to sorted list
  536. process_pfx_tree_to_list();
  537. process_sfx_tree_to_list();
  538. // now we can speed up performance greatly taking advantage of the
  539. // relationship between the affixes and the idea of "subsets".
  540. // View each prefix as a potential leading subset of another and view
  541. // each suffix (reversed) as a potential trailing subset of another.
  542. // To illustrate this relationship if we know the prefix "ab" is found in the
  543. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  544. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  545. // is a subset need be examined.
  546. // The same argument goes for suffix string that are reversed.
  547. // Then to top this off why not examine the first char of the word to quickly
  548. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  549. // be leading supersets of the first character of the word (if they exist)
  550. // To take advantage of this "subset" relationship, we need to add two links
  551. // from entry. One to take next if the current prefix is found (call it nexteq)
  552. // and one to take next if the current prefix is not found (call it nextne).
  553. // Since we have built ordered lists, all that remains is to properly intialize
  554. // the nextne and nexteq pointers that relate them
  555. process_pfx_order();
  556. process_sfx_order();
  557. // expand wordchars string, based on csutil (for external tokenization)
  558. char * enc = get_encoding();
  559. csconv = get_current_cs(enc);
  560. free(enc);
  561. enc = NULL;
  562. char expw[MAXLNLEN];
  563. if (wordchars) {
  564. strcpy(expw, wordchars);
  565. free(wordchars);
  566. } else *expw = '\0';
  567. for (int i = 0; i <= 255; i++) {
  568. if ( (csconv[i].cupper != csconv[i].clower) &&
  569. (! strchr(expw, (char) i))) {
  570. *(expw + strlen(expw) + 1) = '\0';
  571. *(expw + strlen(expw)) = (char) i;
  572. }
  573. }
  574. wordchars = mystrdup(expw);
  575. // temporary BREAK definition for German dash handling (OOo issue 64400)
  576. if ((langnum == LANG_de) && (!breaktable)) {
  577. breaktable = (char **) malloc(sizeof(char *));
  578. if (!breaktable) return 1;
  579. breaktable[0] = mystrdup("-");
  580. numbreak = 1;
  581. }
  582. return 0;
  583. }
  584. // we want to be able to quickly access prefix information
  585. // both by prefix flag, and sorted by prefix string itself
  586. // so we need to set up two indexes
  587. int AffixMgr::build_pfxtree(AffEntry* pfxptr)
  588. {
  589. PfxEntry * ptr;
  590. PfxEntry * pptr;
  591. PfxEntry * ep = (PfxEntry*) pfxptr;
  592. // get the right starting points
  593. const char * key = ep->getKey();
  594. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  595. // first index by flag which must exist
  596. ptr = (PfxEntry*)pFlag[flg];
  597. ep->setFlgNxt(ptr);
  598. pFlag[flg] = (AffEntry *) ep;
  599. // handle the special case of null affix string
  600. if (strlen(key) == 0) {
  601. // always inset them at head of list at element 0
  602. ptr = (PfxEntry*)pStart[0];
  603. ep->setNext(ptr);
  604. pStart[0] = (AffEntry*)ep;
  605. return 0;
  606. }
  607. // now handle the normal case
  608. ep->setNextEQ(NULL);
  609. ep->setNextNE(NULL);
  610. unsigned char sp = *((const unsigned char *)key);
  611. ptr = (PfxEntry*)pStart[sp];
  612. // handle the first insert
  613. if (!ptr) {
  614. pStart[sp] = (AffEntry*)ep;
  615. return 0;
  616. }
  617. // otherwise use binary tree insertion so that a sorted
  618. // list can easily be generated later
  619. pptr = NULL;
  620. for (;;) {
  621. pptr = ptr;
  622. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  623. ptr = ptr->getNextEQ();
  624. if (!ptr) {
  625. pptr->setNextEQ(ep);
  626. break;
  627. }
  628. } else {
  629. ptr = ptr->getNextNE();
  630. if (!ptr) {
  631. pptr->setNextNE(ep);
  632. break;
  633. }
  634. }
  635. }
  636. return 0;
  637. }
  638. // we want to be able to quickly access suffix information
  639. // both by suffix flag, and sorted by the reverse of the
  640. // suffix string itself; so we need to set up two indexes
  641. int AffixMgr::build_sfxtree(AffEntry* sfxptr)
  642. {
  643. SfxEntry * ptr;
  644. SfxEntry * pptr;
  645. SfxEntry * ep = (SfxEntry *) sfxptr;
  646. /* get the right starting point */
  647. const char * key = ep->getKey();
  648. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  649. // first index by flag which must exist
  650. ptr = (SfxEntry*)sFlag[flg];
  651. ep->setFlgNxt(ptr);
  652. sFlag[flg] = (AffEntry *) ep;
  653. // next index by affix string
  654. // handle the special case of null affix string
  655. if (strlen(key) == 0) {
  656. // always inset them at head of list at element 0
  657. ptr = (SfxEntry*)sStart[0];
  658. ep->setNext(ptr);
  659. sStart[0] = (AffEntry*)ep;
  660. return 0;
  661. }
  662. // now handle the normal case
  663. ep->setNextEQ(NULL);
  664. ep->setNextNE(NULL);
  665. unsigned char sp = *((const unsigned char *)key);
  666. ptr = (SfxEntry*)sStart[sp];
  667. // handle the first insert
  668. if (!ptr) {
  669. sStart[sp] = (AffEntry*)ep;
  670. return 0;
  671. }
  672. // otherwise use binary tree insertion so that a sorted
  673. // list can easily be generated later
  674. pptr = NULL;
  675. for (;;) {
  676. pptr = ptr;
  677. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  678. ptr = ptr->getNextEQ();
  679. if (!ptr) {
  680. pptr->setNextEQ(ep);
  681. break;
  682. }
  683. } else {
  684. ptr = ptr->getNextNE();
  685. if (!ptr) {
  686. pptr->setNextNE(ep);
  687. break;
  688. }
  689. }
  690. }
  691. return 0;
  692. }
  693. // convert from binary tree to sorted list
  694. int AffixMgr::process_pfx_tree_to_list()
  695. {
  696. for (int i=1; i< SETSIZE; i++) {
  697. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  698. }
  699. return 0;
  700. }
  701. AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
  702. {
  703. if (ptr) {
  704. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
  705. ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
  706. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
  707. }
  708. return nptr;
  709. }
  710. // convert from binary tree to sorted list
  711. int AffixMgr:: process_sfx_tree_to_list()
  712. {
  713. for (int i=1; i< SETSIZE; i++) {
  714. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  715. }
  716. return 0;
  717. }
  718. AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
  719. {
  720. if (ptr) {
  721. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
  722. ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
  723. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
  724. }
  725. return nptr;
  726. }
  727. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  728. // using the idea of leading subsets this time
  729. int AffixMgr::process_pfx_order()
  730. {
  731. PfxEntry* ptr;
  732. // loop through each prefix list starting point
  733. for (int i=1; i < SETSIZE; i++) {
  734. ptr = (PfxEntry*)pStart[i];
  735. // look through the remainder of the list
  736. // and find next entry with affix that
  737. // the current one is not a subset of
  738. // mark that as destination for NextNE
  739. // use next in list that you are a subset
  740. // of as NextEQ
  741. for (; ptr != NULL; ptr = ptr->getNext()) {
  742. PfxEntry * nptr = ptr->getNext();
  743. for (; nptr != NULL; nptr = nptr->getNext()) {
  744. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  745. }
  746. ptr->setNextNE(nptr);
  747. ptr->setNextEQ(NULL);
  748. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  749. ptr->setNextEQ(ptr->getNext());
  750. }
  751. // now clean up by adding smart search termination strings:
  752. // if you are already a superset of the previous prefix
  753. // but not a subset of the next, search can end here
  754. // so set NextNE properly
  755. ptr = (PfxEntry *) pStart[i];
  756. for (; ptr != NULL; ptr = ptr->getNext()) {
  757. PfxEntry * nptr = ptr->getNext();
  758. PfxEntry * mptr = NULL;
  759. for (; nptr != NULL; nptr = nptr->getNext()) {
  760. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  761. mptr = nptr;
  762. }
  763. if (mptr) mptr->setNextNE(NULL);
  764. }
  765. }
  766. return 0;
  767. }
  768. // initialize the SfxEntry links NextEQ and NextNE to speed searching
  769. // using the idea of leading subsets this time
  770. int AffixMgr::process_sfx_order()
  771. {
  772. SfxEntry* ptr;
  773. // loop through each prefix list starting point
  774. for (int i=1; i < SETSIZE; i++) {
  775. ptr = (SfxEntry *) sStart[i];
  776. // look through the remainder of the list
  777. // and find next entry with affix that
  778. // the current one is not a subset of
  779. // mark that as destination for NextNE
  780. // use next in list that you are a subset
  781. // of as NextEQ
  782. for (; ptr != NULL; ptr = ptr->getNext()) {
  783. SfxEntry * nptr = ptr->getNext();
  784. for (; nptr != NULL; nptr = nptr->getNext()) {
  785. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  786. }
  787. ptr->setNextNE(nptr);
  788. ptr->setNextEQ(NULL);
  789. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  790. ptr->setNextEQ(ptr->getNext());
  791. }
  792. // now clean up by adding smart search termination strings:
  793. // if you are already a superset of the previous suffix
  794. // but not a subset of the next, search can end here
  795. // so set NextNE properly
  796. ptr = (SfxEntry *) sStart[i];
  797. for (; ptr != NULL; ptr = ptr->getNext()) {
  798. SfxEntry * nptr = ptr->getNext();
  799. SfxEntry * mptr = NULL;
  800. for (; nptr != NULL; nptr = nptr->getNext()) {
  801. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  802. mptr = nptr;
  803. }
  804. if (mptr) mptr->setNextNE(NULL);
  805. }
  806. }
  807. return 0;
  808. }
  809. // takes aff file condition string and creates the
  810. // conds array - please see the appendix at the end of the
  811. // file affentry.cxx which describes what is going on here
  812. // in much more detail
  813. int AffixMgr::encodeit(struct affentry * ptr, char * cs)
  814. {
  815. unsigned char c;
  816. int i, j, k;
  817. unsigned char mbr[MAXLNLEN];
  818. w_char wmbr[MAXLNLEN];
  819. w_char * wpos = wmbr;
  820. // now clear the conditions array */
  821. for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0;
  822. // now parse the string to create the conds array */
  823. int nc = strlen(cs);
  824. unsigned char neg = 0; // complement indicator
  825. int grp = 0; // group indicator
  826. unsigned char n = 0; // number of conditions
  827. int ec = 0; // end condition indicator
  828. int nm = 0; // number of member in group
  829. // if no condition just return
  830. if (strcmp(cs,".")==0) {
  831. ptr->numconds = 0;
  832. return 0;
  833. }
  834. i = 0;
  835. while (i < nc) {
  836. c = *((unsigned char *)(cs + i));
  837. // start group indicator
  838. if (c == '[') {
  839. grp = 1;
  840. c = 0;
  841. }
  842. // complement flag
  843. if ((grp == 1) && (c == '^')) {
  844. neg = 1;
  845. c = 0;
  846. }
  847. // end goup indicator
  848. if (c == ']') {
  849. ec = 1;
  850. c = 0;
  851. }
  852. // add character of group to list
  853. if ((grp == 1) && (c != 0)) {
  854. *(mbr + nm) = c;
  855. nm++;
  856. c = 0;
  857. }
  858. // end of condition
  859. if (c != 0) {
  860. ec = 1;
  861. }
  862. if (ec) {
  863. if (!utf8) {
  864. if (grp == 1) {
  865. if (neg == 0) {
  866. // set the proper bits in the condition array vals for those chars
  867. for (j=0;j<nm;j++) {
  868. k = (unsigned int) mbr[j];
  869. ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n);
  870. }
  871. } else {
  872. // complement so set all of them and then unset indicated ones
  873. for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);
  874. for (j=0;j<nm;j++) {
  875. k = (unsigned int) mbr[j];
  876. ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n);
  877. }
  878. }
  879. neg = 0;
  880. grp = 0;
  881. nm = 0;
  882. } else {
  883. // not a group so just set the proper bit for this char
  884. // but first handle special case of . inside condition
  885. if (c == '.') {
  886. // wild card character so set them all
  887. for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);
  888. } else {
  889. ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n);
  890. }
  891. }
  892. n++;
  893. ec = 0;
  894. } else { // UTF-8 character set
  895. if (grp == 1) {
  896. ptr->conds.utf8.neg[n] = neg;
  897. if (neg == 0) {
  898. // set the proper bits in the condition array vals for those chars
  899. for (j=0;j<nm;j++) {
  900. k = (unsigned int) mbr[j];
  901. if (k >> 7) {
  902. u8_u16(wpos, 1, (char *) mbr + j);
  903. wpos++;
  904. if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
  905. } else {
  906. ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n);
  907. }
  908. }
  909. } else { // neg == 1
  910. // complement so set all of them and then unset indicated ones
  911. for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);
  912. for (j=0;j<nm;j++) {
  913. k = (unsigned int) mbr[j];
  914. if (k >> 7) {
  915. u8_u16(wpos, 1, (char *) mbr + j);
  916. wpos++;
  917. if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
  918. } else {
  919. ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n);
  920. }
  921. }
  922. }
  923. neg = 0;
  924. grp = 0;
  925. nm = 0;
  926. ptr->conds.utf8.wlen[n] = wpos - wmbr;
  927. if ((wpos - wmbr) != 0) {
  928. ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr));
  929. if (!ptr->conds.utf8.wchars[n]) return 1;
  930. memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr));
  931. flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]);
  932. wpos = wmbr;
  933. }
  934. } else { // grp == 0
  935. // is UTF-8 character?
  936. if (c >> 7) {
  937. ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char));
  938. if (!ptr->conds.utf8.wchars[n]) return 1;
  939. ptr->conds.utf8.wlen[n] = 1;
  940. u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i);
  941. if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character
  942. } else {
  943. ptr->conds.utf8.wchars[n] = NULL;
  944. // not a group so just set the proper bit for this char
  945. // but first handle special case of . inside condition
  946. if (c == '.') {
  947. ptr->conds.utf8.all[n] = 1;
  948. // wild card character so set them all
  949. for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);
  950. } else {
  951. ptr->conds.utf8.all[n] = 0;
  952. ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n);
  953. }
  954. }
  955. neg = 0;
  956. }
  957. n++;
  958. ec = 0;
  959. neg = 0;
  960. }
  961. }
  962. i++;
  963. }
  964. ptr->numconds = n;
  965. return 0;
  966. }
  967. // return 1 if s1 is a leading subset of s2
  968. /* inline int AffixMgr::isSubset(const char * s1, const char * s2)
  969. {
  970. while ((*s1 == *s2) && *s1) {
  971. s1++;
  972. s2++;
  973. }
  974. return (*s1 == '\0');
  975. }
  976. */
  977. // return 1 if s1 is a leading subset of s2 (dots are for infixes)
  978. inline int AffixMgr::isSubset(const char * s1, const char * s2)
  979. {
  980. while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
  981. s1++;
  982. s2++;
  983. }
  984. return (*s1 == '\0');
  985. }
  986. // check word for prefixes
  987. struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
  988. const FLAG needflag)
  989. {
  990. struct hentry * rv= NULL;
  991. pfx = NULL;
  992. pfxappnd = NULL;
  993. sfxappnd = NULL;
  994. // first handle the special case of 0 length prefixes
  995. PfxEntry * pe = (PfxEntry *) pStart[0];
  996. while (pe) {
  997. if (
  998. // fogemorpheme
  999. ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
  1000. (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
  1001. // permit prefixes in compounds
  1002. ((in_compound != IN_CPD_END) || (pe->getCont() &&
  1003. (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
  1004. ) {
  1005. // check prefix
  1006. rv = pe->checkword(word, len, in_compound, needflag);
  1007. if (rv) {
  1008. pfx=(AffEntry *)pe; // BUG: pfx not stateless
  1009. return rv;
  1010. }
  1011. }
  1012. pe = pe->getNext();
  1013. }
  1014. // now handle the general case
  1015. unsigned char sp = *((const unsigned char *)word);
  1016. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1017. while (pptr) {
  1018. if (isSubset(pptr->getKey(),word)) {
  1019. if (
  1020. // fogemorpheme
  1021. ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
  1022. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
  1023. // permit prefixes in compounds
  1024. ((in_compound != IN_CPD_END) || (pptr->getCont() &&
  1025. (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
  1026. ) {
  1027. // check prefix
  1028. rv = pptr->checkword(word, len, in_compound, needflag);
  1029. if (rv) {
  1030. pfx=(AffEntry *)pptr; // BUG: pfx not stateless
  1031. return rv;
  1032. }
  1033. }
  1034. pptr = pptr->getNextEQ();
  1035. } else {
  1036. pptr = pptr->getNextNE();
  1037. }
  1038. }
  1039. return NULL;
  1040. }
  1041. // check word for prefixes
  1042. struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
  1043. char in_compound, const FLAG needflag)
  1044. {
  1045. struct hentry * rv= NULL;
  1046. pfx = NULL;
  1047. sfxappnd = NULL;
  1048. // first handle the special case of 0 length prefixes
  1049. PfxEntry * pe = (PfxEntry *) pStart[0];
  1050. while (pe) {
  1051. rv = pe->check_twosfx(word, len, in_compound, needflag);
  1052. if (rv) return rv;
  1053. pe = pe->getNext();
  1054. }
  1055. // now handle the general case
  1056. unsigned char sp = *((const unsigned char *)word);
  1057. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1058. while (pptr) {
  1059. if (isSubset(pptr->getKey(),word)) {
  1060. rv = pptr->check_twosfx(word, len, in_compound, needflag);
  1061. if (rv) {
  1062. pfx = (AffEntry *)pptr;
  1063. return rv;
  1064. }
  1065. pptr = pptr->getNextEQ();
  1066. } else {
  1067. pptr = pptr->getNextNE();
  1068. }
  1069. }
  1070. return NULL;
  1071. }
  1072. #ifdef HUNSPELL_EXPERIMENTAL
  1073. // check word for prefixes
  1074. char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
  1075. const FLAG needflag)
  1076. {
  1077. char * st;
  1078. char result[MAXLNLEN];
  1079. result[0] = '\0';
  1080. pfx = NULL;
  1081. sfxappnd = NULL;
  1082. // first handle the special case of 0 length prefixes
  1083. PfxEntry * pe = (PfxEntry *) pStart[0];
  1084. while (pe) {
  1085. st = pe->check_morph(word,len,in_compound, needflag);
  1086. if (st) {
  1087. strcat(result, st);
  1088. free(st);
  1089. }
  1090. // if (rv) return rv;
  1091. pe = pe->getNext();
  1092. }
  1093. // now handle the general case
  1094. unsigned char sp = *((const unsigned char *)word);
  1095. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1096. while (pptr) {
  1097. if (isSubset(pptr->getKey(),word)) {
  1098. st = pptr->check_morph(word,len,in_compound, needflag);
  1099. if (st) {
  1100. // fogemorpheme
  1101. if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
  1102. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
  1103. strcat(result, st);
  1104. pfx = (AffEntry *)pptr;
  1105. }
  1106. free(st);
  1107. }
  1108. pptr = pptr->getNextEQ();
  1109. } else {
  1110. pptr = pptr->getNextNE();
  1111. }
  1112. }
  1113. if (*result) return mystrdup(result);
  1114. return NULL;
  1115. }
  1116. // check word for prefixes
  1117. char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
  1118. char in_compound, const FLAG needflag)
  1119. {
  1120. char * st;
  1121. char result[MAXLNLEN];
  1122. result[0] = '\0';
  1123. pfx = NULL;
  1124. sfxappnd = NULL;
  1125. // first handle the special case of 0 length prefixes
  1126. PfxEntry * pe = (PfxEntry *) pStart[0];
  1127. while (pe) {
  1128. st = pe->check_twosfx_morph(word,len,in_compound, needflag);
  1129. if (st) {
  1130. strcat(result, st);
  1131. free(st);
  1132. }
  1133. pe = pe->getNext();
  1134. }
  1135. // now handle the general case
  1136. unsigned char sp = *((const unsigned char *)word);
  1137. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1138. while (pptr) {
  1139. if (isSubset(pptr->getKey(),word)) {
  1140. st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
  1141. if (st) {
  1142. strcat(result, st);
  1143. free(st);
  1144. pfx = (AffEntry *)pptr;
  1145. }
  1146. pptr = pptr->getNextEQ();
  1147. } else {
  1148. pptr = pptr->getNextNE();
  1149. }
  1150. }
  1151. if (*result) return mystrdup(result);
  1152. return NULL;
  1153. }
  1154. #endif // END OF HUNSPELL_EXPERIMENTAL CODE
  1155. // Is word a non compound with a REP substitution (see checkcompoundrep)?
  1156. int AffixMgr::cpdrep_check(const char * word, int wl)
  1157. {
  1158. char candidate[MAXLNLEN];
  1159. const char * r;
  1160. int lenr, lenp;
  1161. if ((wl < 2) || !numrep) return 0;
  1162. for (int i=0; i < numrep; i++ ) {
  1163. r = word;
  1164. lenr = strlen(reptable[i].pattern2);
  1165. lenp = strlen(reptable[i].pattern);
  1166. // search every occurence of the pattern in the word
  1167. while ((r=strstr(r, reptable[i].pattern)) != NULL) {
  1168. strcpy(candidate, word);
  1169. if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
  1170. strcpy(candidate+(r-word),reptable[i].pattern2);
  1171. strcpy(candidate+(r-word)+lenr, r+lenp);
  1172. if (candidate_check(candidate,strlen(candidate))) return 1;
  1173. r++; // search for the next letter
  1174. }
  1175. }
  1176. return 0;
  1177. }
  1178. // forbid compoundings when there are special patterns at word bound
  1179. int AffixMgr::cpdpat_check(const char * word, int pos)
  1180. {
  1181. int len;
  1182. for (int i = 0; i < numcheckcpd; i++) {
  1183. if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
  1184. (len = strlen(checkcpdtable[i].pattern)) && (pos > len) &&
  1185. (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1;
  1186. }
  1187. return 0;
  1188. }
  1189. // forbid compounding with neighbouring upper and lower case characters at word bounds
  1190. int AffixMgr::cpdcase_check(const char * word, int pos)
  1191. {
  1192. if (utf8) {
  1193. w_char u, w;
  1194. const char * p;
  1195. u8_u16(&u, 1, word + pos);
  1196. for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
  1197. u8_u16(&w, 1, p);
  1198. unsigned short a = (u.h << 8) + u.l;
  1199. unsigned short b = (w.h << 8) + w.l;
  1200. if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b))) return 1;
  1201. } else {
  1202. unsigned char a = *(word + pos - 1);
  1203. unsigned char b = *(word + pos);
  1204. if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  1205. }
  1206. return 0;
  1207. }
  1208. // check compound patterns
  // Matches the word entries (*words)[0..wnum] against each pattern of defcpdtable.
  // A pattern is a sequence of flags in def[]; a flag may be followed by '*' or '?'.
  //   words - address of the entry array; if *words is NULL it is pointed at def
  //           for this call and set back to NULL when no pattern matches
  //   wnum  - index at which rv is stored before matching
  //   rv    - entry stored at (*words)[wnum]
  //   def   - array used when *words is NULL
  //   all   - if zero, a successful match is accepted even when the pattern has
  //           not been consumed completely; if non-zero the pattern must be
  //           consumed (remaining flag+'*' / flag+'?' pairs are skipped first)
  // Returns 1 on a match. Otherwise clears (*words)[wnum] and returns 0.
  1209. int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
  1210. {
  1211. signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  1212. signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  1213. int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
  1214. short bt = 0;
  1215. int i;
  1216. int ok;
  // w remembers that *words was NULL on entry and has been redirected to def
  1217. int w = 0;
  1218. if (!*words) {
  1219. w = 1;
  1220. *words = def;
  1221. }
  1222. (*words)[wnum] = rv;
  // try every pattern in turn
  1223. for (i = 0; i < numdefcpd; i++) {
  1224. signed short pp = 0; // pattern position
  1225. signed short wp = 0; // "words" position
  1226. int ok2;
  1227. ok = 1;
  1228. ok2 = 1;
  1229. do {
  1230. while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
  // flag followed by a metacharacter: '?' may consume only the word at wp,
  // '*' may consume words up to wnum
  1231. if (((pp+1) < defcpdtable[i].len) &&
  1232. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
  1233. int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
  1234. ok2 = 1;
  1235. pp+=2;
  // record where this metacharacter started so that it can be backtracked
  1236. btpp[bt] = pp;
  1237. btwp[bt] = wp;
  // NOTE(review): (*words)[wp] is dereferenced here without the NULL test used in
  // the plain-flag branch below; confirm that entries up to wend are always set
  1238. while (wp <= wend) {
  1239. if (!(*words)[wp]->alen ||
  1240. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
  1241. ok2 = 0;
  1242. break;
  1243. }
  1244. wp++;
  1245. }
  1246. if (wp <= wnum) ok2 = 0;
  // a backtracking record is kept only if at least one word was consumed
  1247. btnum[bt] = wp - btwp[bt];
  1248. if (btnum[bt] > 0) bt++;
  1249. if (ok2) break;
  1250. } else {
  // plain flag: the word at wp must exist and carry this flag
  1251. ok2 = 1;
  1252. if (!(*words)[wp] || !(*words)[wp]->alen ||
  1253. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
  1254. ok = 0;
  1255. break;
  1256. }
  1257. pp++;
  1258. wp++;
  // pattern exhausted while words remain: no match on this path
  1259. if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
  1260. }
  1261. }
  // on success, skip trailing flag+metacharacter pairs; a fully consumed pattern is a match
  1262. if (ok && ok2) {
  1263. int r = pp;
  1264. while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
  1265. ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
  1266. if (defcpdtable[i].len <= r) return 1;
  1267. }
  1268. // backtrack
  // give back one word from the most recent metacharacter; drop records whose
  // count has gone below zero
  1269. if (bt) do {
  1270. ok = 1;
  1271. btnum[bt - 1]--;
  1272. pp = btpp[bt - 1];
  1273. wp = btwp[bt - 1] + btnum[bt - 1];
  1274. } while ((btnum[bt - 1] < 0) && --bt);
  1275. } while (bt);
  1276. if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  1277. // check zero ending
  1278. while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
  1279. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  1280. if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  1281. }
  // no pattern matched: undo the changes made on entry
  1282. (*words)[wnum] = NULL;
  1283. if (w) *words = NULL;
  1284. return 0;
  1285. }
  1286. inline int AffixMgr::candidate_check(const char * word, int len)
  1287. {
  1288. struct hentry * rv=NULL;
  1289. rv = lookup(word);
  1290. if (rv) return 1;
  1291. // rv = prefix_check(word,len,1);
  1292. // if (rv) return 1;
  1293. rv = affix_check(word,len);
  1294. if (rv) return 1;
  1295. return 0;
  1296. }
  1297. // calculate number of syllable for compound-checking
  1298. short AffixMgr::get_syllable(const char * word, int wlen)
  1299. {
  1300. if (cpdmaxsyllable==0) return 0;
  1301. short num=0;
  1302. if (!utf8) {
  1303. for (int i=0; i<wlen; i++) {
  1304. if (strchr(cpdvowels, word[i])) num++;
  1305. }
  1306. } else if (cpdvowels_utf16) {
  1307. w_char w[MAXWORDUTF8LEN];
  1308. int i = u8_u16(w, MAXWORDUTF8LEN, word);
  1309. for (; i; i--) {
  1310. if (flag_bsearch((unsigned short *) cpdvowels_utf16,
  1311. ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
  1312. }
  1313. }
  1314. return num;
  1315. }
  1316. // check if compound word is correctly spelled
  1317. // hu_mov_rule = spec. Hungarian rule (XXX)
  //
  // The word is split at every position i in [cmin, cmax). The left part st[0..i)
  // is looked up directly and then via prefix_check()/suffix_check() as a first
  // member; the right part (word + i) is tried as a plain root, then as an affixed
  // word (affix_check) and finally as a compound itself (recursive call).
  // Parameters, as used below:
  //   wordnum, numsyllable  - running word / syllable counts; restored after a failed split
  //   maxwordnum            - the recursive call is made only while wordnum < maxwordnum
  //   wnum, words           - position and entry array handed to defcpd_check()
  //   hu_mov_rule           - selects IN_CPD_OTHER for the affix checks and enables the
  //                           LANG_hu alternative for the first member
  //   cmpdstemnum, cmpdstem - only read or written under HUNSTEM
  //   is_sug                - when set, entries carrying the nosuggest flag are rejected
  // Returns the entry found for the right-hand part (or the result of the recursive
  // call), or NULL when no split is accepted.
  1318. struct hentry * AffixMgr::compound_check(const char * word, int len,
  1319. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
  1320. char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0)
  1321. {
  1322. int i;
  1323. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1324. int oldcmpdstemnum = 0;
  1325. struct hentry * rv = NULL;
  1326. struct hentry * rv_first;
  1327. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1328. char st [MAXWORDUTF8LEN + 4];
  1329. char ch;
  1330. int cmin;
  1331. int cmax;
  1332. int checked_prefix;
  1333. #ifdef HUNSTEM
  1334. if (cmpdstemnum) {
  1335. if (wordnum == 0) {
  1336. *cmpdstemnum = 1;
  1337. } else {
  1338. (*cmpdstemnum)++;
  1339. }
  1340. }
  1341. #endif
  // compute the first and last split positions so that both parts keep at least
  // cpdmin characters; in UTF-8 mode characters are counted by skipping
  // continuation bytes (10xxxxxx)
  1342. if (utf8) {
  1343. for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
  1344. cmin++;
  1345. for (; (word[cmin] & 0xc0) == 0x80; cmin++);
  1346. }
  1347. for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
  1348. cmax--;
  1349. for (; (word[cmax] & 0xc0) == 0x80; cmax--);
  1350. }
  1351. } else {
  1352. cmin = cpdmin;
  1353. cmax = len - cpdmin + 1;
  1354. }
  // NOTE(review): word is copied unchecked into st (MAXWORDUTF8LEN + 4 bytes);
  // presumably callers bound the word length - confirm
  1355. strcpy(st, word);
  1356. for (i = cmin; i < cmax; i++) {
  1357. oldnumsyllable = numsyllable;
  1358. oldwordnum = wordnum;
  1359. checked_prefix = 0;
  1360. // go to end of the UTF-8 character
  1361. if (utf8) {
  1362. for (; (st[i] & 0xc0) == 0x80; i++);
  1363. if (i >= cmax) return NULL;
  1364. }
  // terminate st at the split point; ch is put back before the next split is tried
  1365. ch = st[i];
  1366. st[i] = '\0';
  1367. sfx = NULL;
  1368. pfx = NULL;
  1369. // FIRST WORD
  1370. rv = lookup(st); // perhaps without prefix
  1371. // search homonym with compound flag
  1372. while ((rv) && !hu_mov_rule &&
  1373. ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
  1374. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1375. (compoundbegin && !wordnum &&
  1376. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1377. (compoundmiddle && wordnum && !words &&
  1378. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1379. (numdefcpd &&
  1380. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1381. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
  1382. ))) {
  1383. rv = rv->next_homonym;
  1384. }
  // no usable dictionary entry: try the left part as an affixed form
  1385. if (!rv) {
  1386. if (compoundflag &&
  1387. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1388. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1389. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1390. ((SfxEntry*)sfx)->getCont() &&
  1391. ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1392. ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
  1393. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1394. ((SfxEntry*)sfx)->getContLen())))) {
  1395. rv = NULL;
  1396. }
  1397. }
  1398. if (rv ||
  1399. (((wordnum == 0) && compoundbegin &&
  1400. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1401. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1402. ((wordnum > 0) && compoundmiddle &&
  1403. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1404. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1405. ) checked_prefix = 1;
  1406. // else check forbiddenwords and pseudoroot
  1407. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1408. TESTAFF(rv->astr, pseudoroot, rv->alen) ||
  1409. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
  1410. )) {
  1411. st[i] = ch;
  1412. continue;
  1413. }
  1414. // check non_compound flag in suffix and prefix
  1415. if ((rv) && !hu_mov_rule &&
  1416. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1417. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1418. ((PfxEntry*)pfx)->getContLen())) ||
  1419. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1420. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1421. ((SfxEntry*)sfx)->getContLen())))) {
  1422. rv = NULL;
  1423. }
  1424. // check compoundend flag in suffix and prefix
  1425. if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
  1426. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1427. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
  1428. ((PfxEntry*)pfx)->getContLen())) ||
  1429. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1430. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1431. ((SfxEntry*)sfx)->getContLen())))) {
  1432. rv = NULL;
  1433. }
  1434. // check compoundmiddle flag in suffix and prefix
  1435. if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
  1436. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1437. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
  1438. ((PfxEntry*)pfx)->getContLen())) ||
  1439. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1440. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
  1441. ((SfxEntry*)sfx)->getContLen())))) {
  1442. rv = NULL;
  1443. }
  1444. // check forbiddenwords
  1445. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1446. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
  1447. return NULL;
  1448. }
  1449. // increment word number, if the second root has a compoundroot flag
  1450. if ((rv) && compoundroot &&
  1451. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1452. wordnum++;
  1453. }
  1454. // first word is acceptable in compound words?
  1455. if (((rv) &&
  1456. ( checked_prefix || (words && words[wnum]) ||
  1457. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1458. ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1459. ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
  1460. // (numdefcpd && )
  1461. // LANG_hu section: spec. Hungarian rule
  1462. || ((langnum == LANG_hu) && hu_mov_rule && (
  1463. TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
  1464. TESTAFF(rv->astr, 'G', rv->alen) ||
  1465. TESTAFF(rv->astr, 'H', rv->alen)
  1466. )
  1467. )
  1468. // END of LANG_hu section
  1469. )
  1470. && ! (( checkcompoundtriple && // test triple letters
  1471. (word[i-1]==word[i]) && (
  1472. ((i>1) && (word[i-1]==word[i-2])) ||
  1473. ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
  1474. )
  1475. ) ||
  1476. (
  1477. // test CHECKCOMPOUNDPATTERN
  1478. numcheckcpd && cpdpat_check(word, i)
  1479. ) ||
  1480. (
  1481. checkcompoundcase && cpdcase_check(word, i)
  1482. ))
  1483. )
  1484. // LANG_hu section: spec. Hungarian rule
  1485. || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
  1486. (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes
  1487. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
  1488. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
  1489. )
  1490. )
  1491. )
  1492. // END of LANG_hu section
  1493. ) {
  1494. // LANG_hu section: spec. Hungarian rule
  1495. if (langnum == LANG_hu) {
  1496. // calculate syllable number of the word
  1497. numsyllable += get_syllable(st, i);
  1498. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1499. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1500. }
  1501. // END of LANG_hu section
  1502. #ifdef HUNSTEM
  1503. if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i;
  1504. #endif
  1505. // NEXT WORD(S)
  // remember the first member so that checkcompounddup can reject a doubled word
  1506. rv_first = rv;
  1507. rv = lookup((word+i)); // perhaps without prefix
  1508. // search homonym with compound flag
  1509. while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
  1510. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1511. (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
  1512. (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
  1513. rv = rv->next_homonym;
  1514. }
  1515. if (rv && words && words[wnum + 1]) return rv;
  1516. oldnumsyllable2 = numsyllable;
  1517. oldwordnum2 = wordnum;
  1518. // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
  1519. if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
  1520. numsyllable--;
  1521. }
  1522. // END of LANG_hu section
  1523. // increment word number, if the second root has a compoundroot flag
  1524. if ((rv) && (compoundroot) &&
  1525. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1526. wordnum++;
  1527. }
  1528. // check forbiddenwords
  1529. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1530. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
  1531. // second word is acceptable, as a root?
  1532. // hungarian conventions: compounding is acceptable,
  1533. // when compound forms consist of 2 words, or if more,
  1534. // then the syllable number of root words must be 6, or lesser.
  1535. if ((rv) && (
  1536. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1537. (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
  1538. )
  1539. && (
  1540. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  1541. ((cpdmaxsyllable==0) ||
  1542. (numsyllable + get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable))
  1543. )
  1544. && (
  1545. (!checkcompounddup || (rv != rv_first))
  1546. )
  1547. )
  1548. {
  1549. // forbid compound word, if it is a non compound word with typical fault
  1550. if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
  1551. return rv;
  1552. }
  1553. numsyllable = oldnumsyllable2 ;
  1554. wordnum = oldwordnum2;
  1555. // perhaps second word has prefix or/and suffix
  1556. sfx = NULL;
  1557. sfxflag = FLAG_NULL;
  1558. rv = (compoundflag) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
  1559. if (!rv && compoundend) {
  1560. sfx = NULL;
  1561. pfx = NULL;
  1562. rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
  1563. }
  1564. if (!rv && numdefcpd && words) {
  1565. rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
  1566. if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv;
  1567. }
  1568. // check non_compound flag in suffix and prefix
  1569. if ((rv) &&
  1570. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1571. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1572. ((PfxEntry*)pfx)->getContLen())) ||
  1573. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1574. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1575. ((SfxEntry*)sfx)->getContLen())))) {
  1576. rv = NULL;
  1577. }
  1578. // check forbiddenwords
  1579. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1580. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
  1581. // pfxappnd = prefix of word+i, or NULL
  1582. // calculate syllable number of prefix.
  1583. // hungarian convention: when syllable number of prefix is more,
  1584. // than 1, the prefix+word counts as two words.
  // NOTE(review): rv may still be NULL inside this section; the `case 'I'` line
  // below dereferences rv - confirm sfxflag cannot be 'I' when affix_check failed
  1585. if (langnum == LANG_hu) {
  1586. // calculate syllable number of the word
  1587. numsyllable += get_syllable(word + i, strlen(word + i));
  1588. // - affix syllable num.
  1589. // XXX only second suffix (inflections, not derivations)
  1590. if (sfxappnd) {
  1591. char * tmp = myrevstrdup(sfxappnd);
  1592. numsyllable -= get_syllable(tmp, strlen(tmp));
  1593. free(tmp);
  1594. }
  1595. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1596. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1597. // increment syllable num, if last word has a SYLLABLENUM flag
  1598. // and the suffix is beginning `s'
  1599. if (cpdsyllablenum) {
  1600. switch (sfxflag) {
  1601. case 'c': { numsyllable+=2; break; }
  1602. case 'J': { numsyllable += 1; break; }
  1603. case 'I': { if (TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
  1604. }
  1605. }
  1606. }
  1607. // increment word number, if the second word has a compoundroot flag
  1608. if ((rv) && (compoundroot) &&
  1609. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1610. wordnum++;
  1611. }
  1612. // second word is acceptable, as a word with prefix or/and suffix?
  1613. // hungarian conventions: compounding is acceptable,
  1614. // when compound forms consist 2 word, otherwise
  1615. // the syllable number of root words is 6, or lesser.
  1616. if ((rv) &&
  1617. (
  1618. ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
  1619. ((cpdmaxsyllable == 0) ||
  1620. (numsyllable <= cpdmaxsyllable))
  1621. )
  1622. && (
  1623. (!checkcompounddup || (rv != rv_first))
  1624. )) {
  1625. // forbid compound word, if it is a non compound word with typical fault
  1626. if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
  1627. return rv;
  1628. }
  1629. numsyllable = oldnumsyllable2;
  1630. wordnum = oldwordnum2;
  1631. #ifdef HUNSTEM
  1632. if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum;
  1633. #endif
  1634. // perhaps second word is a compound word (recursive call)
  1635. if (wordnum < maxwordnum) {
  1636. rv = compound_check((word+i),strlen(word+i), wordnum+1,
  1637. numsyllable, maxwordnum, wnum + 1, words,
  1638. 0, cmpdstemnum, cmpdstem, is_sug);
  1639. } else {
  1640. rv=NULL;
  1641. }
  1642. if (rv) {
  1643. // forbid compound word, if it is a non compound word with typical fault
  1644. if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
  1645. return rv;
  1646. } else {
  1647. #ifdef HUNSTEM
  1648. if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum;
  1649. #endif
  1650. }
  1651. }
  // this split failed: restore the overwritten character and the counters
  1652. st[i] = ch;
  1653. wordnum = oldwordnum;
  1654. numsyllable = oldnumsyllable;
  1655. }
  1656. return NULL;
  1657. }
  1658. #ifdef HUNSPELL_EXPERIMENTAL
  1659. // check if compound word is correctly spelled
  1660. // hu_mov_rule = spec. Hungarian rule (XXX)
  1661. int AffixMgr::compound_check_morph(const char * word, int len,
  1662. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
  1663. char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
  1664. {
  1665. int i;
  1666. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1667. int ok = 0;
  1668. struct hentry * rv = NULL;
  1669. struct hentry * rv_first;
  1670. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1671. char st [MAXWORDUTF8LEN + 4];
  1672. char ch;
  1673. int checked_prefix;
  1674. char presult[MAXLNLEN];
  1675. int cmin;
  1676. int cmax;
  1677. if (utf8) {
  1678. for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
  1679. cmin++;
  1680. for (; (word[cmin] & 0xc0) == 0x80; cmin++);
  1681. }
  1682. for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
  1683. cmax--;
  1684. for (; (word[cmax] & 0xc0) == 0x80; cmax--);
  1685. }
  1686. } else {
  1687. cmin = cpdmin;
  1688. cmax = len - cpdmin + 1;
  1689. }
  1690. strcpy(st, word);
  1691. for (i = cmin; i < cmax; i++) {
  1692. oldnumsyllable = numsyllable;
  1693. oldwordnum = wordnum;
  1694. checked_prefix = 0;
  1695. // go to end of the UTF-8 character
  1696. if (utf8) {
  1697. for (; (st[i] & 0xc0) == 0x80; i++);
  1698. if (i >= cmax) return 0;
  1699. }
  1700. ch = st[i];
  1701. st[i] = '\0';
  1702. sfx = NULL;
  1703. // FIRST WORD
  1704. *presult = '\0';
  1705. if (partresult) strcat(presult, partresult);
  1706. rv = lookup(st); // perhaps without prefix
  1707. // search homonym with compound flag
  1708. while ((rv) && !hu_mov_rule &&
  1709. ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
  1710. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1711. (compoundbegin && !wordnum &&
  1712. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1713. (compoundmiddle && wordnum && !words &&
  1714. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1715. (numdefcpd &&
  1716. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1717. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
  1718. ))) {
  1719. rv = rv->next_homonym;
  1720. }
  1721. if (rv) {
  1722. if (rv->description) {
  1723. if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen))
  1724. strcat(presult, st);
  1725. strcat(presult, rv->description);
  1726. }
  1727. }
  1728. if (!rv) {
  1729. if (compoundflag &&
  1730. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1731. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1732. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1733. ((SfxEntry*)sfx)->getCont() &&
  1734. ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1735. ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
  1736. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1737. ((SfxEntry*)sfx)->getContLen())))) {
  1738. rv = NULL;
  1739. }
  1740. }
  1741. if (rv ||
  1742. (((wordnum == 0) && compoundbegin &&
  1743. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1744. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1745. ((wordnum > 0) && compoundmiddle &&
  1746. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1747. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1748. ) {
  1749. //char * p = prefix_check_morph(st, i, 0, compound);
  1750. char * p = NULL;
  1751. if (compoundflag) p = affix_check_morph(st, i, compoundflag);
  1752. if (!p || (*p == '\0')) {
  1753. if ((wordnum == 0) && compoundbegin) {
  1754. p = affix_check_morph(st, i, compoundbegin);
  1755. } else if ((wordnum > 0) && compoundmiddle) {
  1756. p = affix_check_morph(st, i, compoundmiddle);
  1757. }
  1758. }
  1759. if (*p != '\0') {
  1760. line_uniq(p);
  1761. if (strchr(p, '\n')) {
  1762. strcat(presult, "(");
  1763. strcat(presult, line_join(p, '|'));
  1764. strcat(presult, ")");
  1765. } else {
  1766. strcat(presult, p);
  1767. }
  1768. }
  1769. if (presult[strlen(presult) - 1] == '\n') {
  1770. presult[strlen(presult) - 1] = '\0';
  1771. }
  1772. checked_prefix = 1;
  1773. //strcat(presult, "+");
  1774. }
  1775. // else check forbiddenwords
  1776. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1777. TESTAFF(rv->astr, pseudoroot, rv->alen))) {
  1778. st[i] = ch;
  1779. continue;
  1780. }
  1781. // check non_compound flag in suffix and prefix
  1782. if ((rv) && !hu_mov_rule &&
  1783. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1784. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1785. ((PfxEntry*)pfx)->getContLen())) ||
  1786. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1787. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1788. ((SfxEntry*)sfx)->getContLen())))) {
  1789. continue;
  1790. }
  1791. // check compoundend flag in suffix and prefix
  1792. if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
  1793. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1794. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
  1795. ((PfxEntry*)pfx)->getContLen())) ||
  1796. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1797. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1798. ((SfxEntry*)sfx)->getContLen())))) {
  1799. continue;
  1800. }
  1801. // check compoundmiddle flag in suffix and prefix
  1802. if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
  1803. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1804. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
  1805. ((PfxEntry*)pfx)->getContLen())) ||
  1806. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1807. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
  1808. ((SfxEntry*)sfx)->getContLen())))) {
  1809. rv = NULL;
  1810. }
  1811. // check forbiddenwords
  1812. if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) continue;
  1813. // increment word number, if the second root has a compoundroot flag
  1814. if ((rv) && (compoundroot) &&
  1815. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1816. wordnum++;
  1817. }
  1818. // first word is acceptable in compound words?
  1819. if (((rv) &&
  1820. ( checked_prefix || (words && words[wnum]) ||
  1821. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1822. ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1823. ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
  1824. // LANG_hu section: spec. Hungarian rule
  1825. || ((langnum == LANG_hu) && // hu_mov_rule
  1826. hu_mov_rule && (
  1827. TESTAFF(rv->astr, 'F', rv->alen) ||
  1828. TESTAFF(rv->astr, 'G', rv->alen) ||
  1829. TESTAFF(rv->astr, 'H', rv->alen)
  1830. )
  1831. )
  1832. // END of LANG_hu section
  1833. )
  1834. && ! (( checkcompoundtriple && // test triple letters
  1835. (word[i-1]==word[i]) && (
  1836. ((i>1) && (word[i-1]==word[i-2])) ||
  1837. ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
  1838. )
  1839. ) ||
  1840. (
  1841. // test CHECKCOMPOUNDPATTERN
  1842. numcheckcpd && cpdpat_check(word, i)
  1843. ) ||
  1844. (
  1845. checkcompoundcase && cpdcase_check(word, i)
  1846. ))
  1847. )
  1848. // LANG_hu section: spec. Hungarian rule
  1849. || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
  1850. (sfx && ((SfxEntry*)sfx)->getCont() && (
  1851. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
  1852. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
  1853. )
  1854. )
  1855. )
  1856. // END of LANG_hu section
  1857. ) {
  1858. // LANG_hu section: spec. Hungarian rule
  1859. if (langnum == LANG_hu) {
  1860. // calculate syllable number of the word
  1861. numsyllable += get_syllable(st, i);
  1862. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1863. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1864. }
  1865. // END of LANG_hu section
  1866. // NEXT WORD(S)
  1867. rv_first = rv;
  1868. rv = lookup((word+i)); // perhaps without prefix
  1869. // search homonym with compound flag
  1870. while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
  1871. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1872. (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
  1873. (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
  1874. rv = rv->next_homonym;
  1875. }
  1876. if (rv && words && words[wnum + 1]) {
  1877. strcat(*result, presult);
  1878. if (complexprefixes && rv->description) strcat(*result, rv->description);
  1879. if (rv->description && ((!rv->astr) ||
  1880. !TESTAFF(rv->astr, lemma_present, rv->alen)))
  1881. strcat(*result, rv->word);
  1882. if (!complexprefixes && rv->description) strcat(*result, rv->description);
  1883. strcat(*result, "\n");
  1884. ok = 1;
  1885. return 0;
  1886. }
  1887. oldnumsyllable2 = numsyllable;
  1888. oldwordnum2 = wordnum;
  1889. // LANG_hu section: spec. Hungarian rule
  1890. if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
  1891. numsyllable--;
  1892. }
  1893. // END of LANG_hu section
  1894. // increment word number, if the second root has a compoundroot flag
  1895. if ((rv) && (compoundroot) &&
  1896. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1897. wordnum++;
  1898. }
  1899. // check forbiddenwords
  1900. if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) {
  1901. st[i] = ch;
  1902. continue;
  1903. }
  1904. // second word is acceptable, as a root?
  1905. // hungarian conventions: compounding is acceptable,
  1906. // when compound forms consist of 2 words, or if more,
  1907. // then the syllable number of root words must be 6, or lesser.
  1908. if ((rv) && (
  1909. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1910. (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
  1911. )
  1912. && (
  1913. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  1914. ((cpdmaxsyllable==0) ||
  1915. (numsyllable+get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable))
  1916. )
  1917. && (
  1918. (!checkcompounddup || (rv != rv_first))
  1919. )
  1920. )
  1921. {
  1922. // bad compound word
  1923. strcat(*result, presult);
  1924. if (rv->description) {
  1925. if (complexprefixes) strcat(*result, rv->description);
  1926. if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen))
  1927. strcat(*result, rv->word);
  1928. if (!complexprefixes) strcat(*result, rv->description);
  1929. }
  1930. strcat(*result, "\n");
  1931. ok = 1;
  1932. }
  1933. numsyllable = oldnumsyllable2 ;
  1934. wordnum = oldwordnum2;
  1935. // perhaps second word has prefix or/and suffix
  1936. sfx = NULL;
  1937. sfxflag = FLAG_NULL;
  1938. if (compoundflag) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
  1939. if (!rv && compoundend) {
  1940. sfx = NULL;
  1941. pfx = NULL;
  1942. rv = affix_check((word+i),strlen(word+i), compoundend);
  1943. }
  1944. if (!rv && numdefcpd && words) {
  1945. rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
  1946. if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
  1947. char * m = NULL;
  1948. if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
  1949. if ((!m || *m == '\0') && compoundend)
  1950. m = affix_check_morph((word+i),strlen(word+i), compoundend);
  1951. strcat(*result, presult);
  1952. if (m) {
  1953. line_uniq(m);
  1954. if (strchr(m, '\n')) {
  1955. strcat(*result, "(");
  1956. strcat(*result, line_join(m, '|'));
  1957. strcat(*result, ")");
  1958. } else {
  1959. strcat(*result, m);
  1960. }
  1961. free(m);
  1962. }
  1963. strcat(*result, "\n");
  1964. ok = 1;
  1965. }
  1966. }
  1967. // check non_compound flag in suffix and prefix
  1968. if ((rv) &&
  1969. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1970. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1971. ((PfxEntry*)pfx)->getContLen())) ||
  1972. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1973. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1974. ((SfxEntry*)sfx)->getContLen())))) {
  1975. rv = NULL;
  1976. }
  1977. // check forbiddenwords
  1978. if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen))
  1979. && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) {
  1980. st[i] = ch;
  1981. continue;
  1982. }
  1983. if (langnum == LANG_hu) {
  1984. // calculate syllable number of the word
  1985. numsyllable += get_syllable(word + i, strlen(word + i));
  1986. // - affix syllable num.
  1987. // XXX only second suffix (inflections, not derivations)
  1988. if (sfxappnd) {
  1989. char * tmp = myrevstrdup(sfxappnd);
  1990. numsyllable -= get_syllable(tmp, strlen(tmp));
  1991. free(tmp);
  1992. }
  1993. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1994. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1995. // increment syllable num, if last word has a SYLLABLENUM flag
  1996. // and the suffix is beginning `s'
  1997. if (cpdsyllablenum) {
  1998. switch (sfxflag) {
  1999. case 'c': { numsyllable+=2; break; }
  2000. case 'J': { numsyllable += 1; break; }
  2001. case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
  2002. }
  2003. }
  2004. }
  2005. // increment word number, if the second word has a compoundroot flag
  2006. if ((rv) && (compoundroot) &&
  2007. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  2008. wordnum++;
  2009. }
  2010. // second word is acceptable, as a word with prefix or/and suffix?
  2011. // hungarian conventions: compounding is acceptable,
  2012. // when compound forms consist 2 word, otherwise
  2013. // the syllable number of root words is 6, or lesser.
  2014. if ((rv) &&
  2015. (
  2016. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  2017. ((cpdmaxsyllable==0) ||
  2018. (numsyllable <= cpdmaxsyllable))
  2019. )
  2020. && (
  2021. (!checkcompounddup || (rv != rv_first))
  2022. )) {
  2023. char * m = NULL;
  2024. if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
  2025. if ((!m || *m == '\0') && compoundend)
  2026. m = affix_check_morph((word+i),strlen(word+i), compoundend);
  2027. strcat(*result, presult);
  2028. if (m) {
  2029. line_uniq(m);
  2030. if (strchr(m, '\n')) {
  2031. strcat(*result, "(");
  2032. strcat(*result, line_join(m, '|'));
  2033. strcat(*result, ")");
  2034. } else {
  2035. strcat(*result, m);
  2036. }
  2037. free(m);
  2038. }
  2039. strcat(*result, "\n");
  2040. ok = 1;
  2041. }
  2042. numsyllable = oldnumsyllable2;
  2043. wordnum = oldwordnum2;
  2044. // perhaps second word is a compound word (recursive call)
  2045. if ((wordnum < maxwordnum) && (ok == 0)) {
  2046. compound_check_morph((word+i),strlen(word+i), wordnum+1,
  2047. numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
  2048. } else {
  2049. rv=NULL;
  2050. }
  2051. }
  2052. st[i] = ch;
  2053. wordnum = oldwordnum;
  2054. numsyllable = oldnumsyllable;
  2055. }
  2056. return 0;
  2057. }
  2058. #endif // END OF HUNSPELL_EXPERIMENTAL CODE
  2059. // return 1 if s1 (reversed) is a leading subset of end of s2
  2060. /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
  2061. {
  2062. while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
  2063. s1++;
  2064. end_of_s2--;
  2065. len--;
  2066. }
  2067. return (*s1 == '\0');
  2068. }
  2069. */
  2070. inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
  2071. {
  2072. while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
  2073. s1++;
  2074. end_of_s2--;
  2075. len--;
  2076. }
  2077. return (*s1 == '\0');
  2078. }
  2079. // check word for suffixes
  2080. struct hentry * AffixMgr::suffix_check (const char * word, int len,
  2081. int sfxopts, AffEntry * ppfx, char ** wlst, int maxSug, int * ns,
  2082. const FLAG cclass, const FLAG needflag, char in_compound)
  2083. {
  2084. struct hentry * rv = NULL;
  2085. char result[MAXLNLEN];
  2086. PfxEntry* ep = (PfxEntry *) ppfx;
  2087. // first handle the special case of 0 length suffixes
  2088. SfxEntry * se = (SfxEntry *) sStart[0];
  2089. while (se) {
  2090. if (!cclass || se->getCont()) {
  2091. // suffixes are not allowed in beginning of compounds
  2092. if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2093. // except when signed with compoundpermitflag flag
  2094. (se->getCont() && compoundpermitflag &&
  2095. TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
  2096. // no circumfix flag in prefix and suffix
  2097. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2098. circumfix, ep->getContLen())) &&
  2099. (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
  2100. // circumfix flag in prefix AND suffix
  2101. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2102. circumfix, ep->getContLen())) &&
  2103. (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
  2104. // fogemorpheme
  2105. (in_compound ||
  2106. !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
  2107. // pseudoroot on prefix or first suffix
  2108. (cclass ||
  2109. !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) ||
  2110. (ppfx && !((ep->getCont()) &&
  2111. TESTAFF(ep->getCont(), pseudoroot,
  2112. ep->getContLen())))
  2113. )
  2114. ) {
  2115. rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
  2116. needflag, (in_compound ? 0 : onlyincompound));
  2117. if (rv) {
  2118. sfx=(AffEntry *)se; // BUG: sfx not stateless
  2119. return rv;
  2120. }
  2121. }
  2122. }
  2123. se = se->getNext();
  2124. }
  2125. // now handle the general case
  2126. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2127. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2128. while (sptr) {
  2129. if (isRevSubset(sptr->getKey(), word + len - 1, len)
  2130. ) {
  2131. // suffixes are not allowed in beginning of compounds
  2132. if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2133. // except when signed with compoundpermitflag flag
  2134. (sptr->getCont() && compoundpermitflag &&
  2135. TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
  2136. // no circumfix flag in prefix and suffix
  2137. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2138. circumfix, ep->getContLen())) &&
  2139. (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
  2140. // circumfix flag in prefix AND suffix
  2141. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2142. circumfix, ep->getContLen())) &&
  2143. (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
  2144. // fogemorpheme
  2145. (in_compound ||
  2146. !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
  2147. // pseudoroot on prefix or first suffix
  2148. (cclass ||
  2149. !(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) ||
  2150. (ppfx && !((ep->getCont()) &&
  2151. TESTAFF(ep->getCont(), pseudoroot,
  2152. ep->getContLen())))
  2153. )
  2154. ) {
  2155. rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
  2156. maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
  2157. if (rv) {
  2158. sfx=(AffEntry *)sptr; // BUG: sfx not stateless
  2159. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2160. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2161. if (cclass || sptr->getCont()) {
  2162. if (!derived) {
  2163. derived = mystrdup(word);
  2164. } else {
  2165. strcpy(result, derived); // XXX check size
  2166. strcat(result, "\n");
  2167. strcat(result, word);
  2168. free(derived);
  2169. derived = mystrdup(result);
  2170. }
  2171. }
  2172. return rv;
  2173. }
  2174. }
  2175. sptr = sptr->getNextEQ();
  2176. } else {
  2177. sptr = sptr->getNextNE();
  2178. }
  2179. }
  2180. return NULL;
  2181. }
  2182. // check word for two-level suffixes
  2183. struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
  2184. int sfxopts, AffEntry * ppfx, const FLAG needflag)
  2185. {
  2186. struct hentry * rv = NULL;
  2187. // first handle the special case of 0 length suffixes
  2188. SfxEntry * se = (SfxEntry *) sStart[0];
  2189. while (se) {
  2190. if (contclasses[se->getFlag()])
  2191. {
  2192. rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
  2193. if (rv) return rv;
  2194. }
  2195. se = se->getNext();
  2196. }
  2197. // now handle the general case
  2198. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2199. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2200. while (sptr) {
  2201. if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
  2202. if (contclasses[sptr->getFlag()])
  2203. {
  2204. rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
  2205. if (rv) {
  2206. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2207. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2208. return rv;
  2209. }
  2210. }
  2211. sptr = sptr->getNextEQ();
  2212. } else {
  2213. sptr = sptr->getNextNE();
  2214. }
  2215. }
  2216. return NULL;
  2217. }
  2218. #ifdef HUNSPELL_EXPERIMENTAL
  2219. char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
  2220. int sfxopts, AffEntry * ppfx, const FLAG needflag)
  2221. {
  2222. char result[MAXLNLEN];
  2223. char result2[MAXLNLEN];
  2224. char result3[MAXLNLEN];
  2225. char * st;
  2226. result[0] = '\0';
  2227. result2[0] = '\0';
  2228. result3[0] = '\0';
  2229. // first handle the special case of 0 length suffixes
  2230. SfxEntry * se = (SfxEntry *) sStart[0];
  2231. while (se) {
  2232. if (contclasses[se->getFlag()])
  2233. {
  2234. st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
  2235. if (st) {
  2236. if (ppfx) {
  2237. if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
  2238. }
  2239. strcat(result, st);
  2240. free(st);
  2241. if (se->getMorph()) strcat(result, se->getMorph());
  2242. strcat(result, "\n");
  2243. }
  2244. }
  2245. se = se->getNext();
  2246. }
  2247. // now handle the general case
  2248. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2249. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2250. while (sptr) {
  2251. if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
  2252. if (contclasses[sptr->getFlag()])
  2253. {
  2254. st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
  2255. if (st) {
  2256. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2257. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2258. strcpy(result2, st);
  2259. free(st);
  2260. result3[0] = '\0';
  2261. #ifdef DEBUG
  2262. unsigned short flag = sptr->getFlag();
  2263. if (flag_mode == FLAG_NUM) {
  2264. sprintf(result3, "<%d>", sptr->getKey());
  2265. } else if (flag_mode == FLAG_LONG) {
  2266. sprintf(result3, "<%c%c>", flag >> 8, (flag << 8) >>8);
  2267. } else sprintf(result3, "<%c>", flag);
  2268. strcat(result3, ":");
  2269. #endif
  2270. if (sptr->getMorph()) strcat(result3, sptr->getMorph());
  2271. strlinecat(result2, result3);
  2272. strcat(result2, "\n");
  2273. strcat(result, result2);
  2274. }
  2275. }
  2276. sptr = sptr->getNextEQ();
  2277. } else {
  2278. sptr = sptr->getNextNE();
  2279. }
  2280. }
  2281. if (result) return mystrdup(result);
  2282. return NULL;
  2283. }
  2284. char * AffixMgr::suffix_check_morph(const char * word, int len,
  2285. int sfxopts, AffEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
  2286. {
  2287. char result[MAXLNLEN];
  2288. struct hentry * rv = NULL;
  2289. result[0] = '\0';
  2290. PfxEntry* ep = (PfxEntry *) ppfx;
  2291. // first handle the special case of 0 length suffixes
  2292. SfxEntry * se = (SfxEntry *) sStart[0];
  2293. while (se) {
  2294. if (!cclass || se->getCont()) {
  2295. // suffixes are not allowed in beginning of compounds
  2296. if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2297. // except when signed with compoundpermitflag flag
  2298. (se->getCont() && compoundpermitflag &&
  2299. TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
  2300. // no circumfix flag in prefix and suffix
  2301. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2302. circumfix, ep->getContLen())) &&
  2303. (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
  2304. // circumfix flag in prefix AND suffix
  2305. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2306. circumfix, ep->getContLen())) &&
  2307. (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
  2308. // fogemorpheme
  2309. (in_compound ||
  2310. !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
  2311. // pseudoroot on prefix or first suffix
  2312. (cclass ||
  2313. !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) ||
  2314. (ppfx && !((ep->getCont()) &&
  2315. TESTAFF(ep->getCont(), pseudoroot,
  2316. ep->getContLen())))
  2317. )
  2318. ))
  2319. rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
  2320. while (rv) {
  2321. if (ppfx) {
  2322. if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
  2323. }
  2324. if (complexprefixes && rv->description) strcat(result, rv->description);
  2325. if (rv->description && ((!rv->astr) ||
  2326. !TESTAFF(rv->astr, lemma_present, rv->alen)))
  2327. strcat(result, rv->word);
  2328. if (!complexprefixes && rv->description) strcat(result, rv->description);
  2329. if (se->getMorph()) strcat(result, se->getMorph());
  2330. strcat(result, "\n");
  2331. rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
  2332. }
  2333. }
  2334. se = se->getNext();
  2335. }
  2336. // now handle the general case
  2337. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2338. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2339. while (sptr) {
  2340. if (isRevSubset(sptr->getKey(), word + len - 1, len)
  2341. ) {
  2342. // suffixes are not allowed in beginning of compounds
  2343. if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2344. // except when signed with compoundpermitflag flag
  2345. (sptr->getCont() && compoundpermitflag &&
  2346. TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
  2347. // no circumfix flag in prefix and suffix
  2348. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2349. circumfix, ep->getContLen())) &&
  2350. (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
  2351. // circumfix flag in prefix AND suffix
  2352. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2353. circumfix, ep->getContLen())) &&
  2354. (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
  2355. // fogemorpheme
  2356. (in_compound ||
  2357. !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
  2358. // pseudoroot on first suffix
  2359. (cclass || !(sptr->getCont() &&
  2360. TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())))
  2361. )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
  2362. while (rv) {
  2363. if (ppfx) {
  2364. if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
  2365. }
  2366. if (complexprefixes && rv->description) strcat(result, rv->description);
  2367. if (rv->description && ((!rv->astr) ||
  2368. !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, rv->word);
  2369. if (!complexprefixes && rv->description) strcat(result, rv->description);
  2370. #ifdef DEBUG
  2371. unsigned short flag = sptr->getFlag();
  2372. if (flag_mode == FLAG_NUM) {
  2373. sprintf(result, "<%d>", sptr->getKey());
  2374. } else if (flag_mode == FLAG_LONG) {
  2375. sprintf(result, "<%c%c>", flag >> 8, (flag << 8) >>8);
  2376. } else sprintf(result, "<%c>", flag);
  2377. strcat(result, ":");
  2378. #endif
  2379. if (sptr->getMorph()) strcat(result, sptr->getMorph());
  2380. strcat(result, "\n");
  2381. rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
  2382. }
  2383. sptr = sptr->getNextEQ();
  2384. } else {
  2385. sptr = sptr->getNextNE();
  2386. }
  2387. }
  2388. if (*result) return mystrdup(result);
  2389. return NULL;
  2390. }
  2391. #endif // END OF HUNSPELL_EXPERIMENTAL CODE
  2392. // check if word with affixes is correctly spelled
  2393. struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
  2394. {
  2395. struct hentry * rv= NULL;
  2396. if (derived) free(derived);
  2397. derived = NULL;
  2398. // check all prefixes (also crossed with suffixes if allowed)
  2399. rv = prefix_check(word, len, in_compound, needflag);
  2400. if (rv) return rv;
  2401. // if still not found check all suffixes
  2402. rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
  2403. if (havecontclass) {
  2404. sfx = NULL;
  2405. pfx = NULL;
  2406. if (rv) return rv;
  2407. // if still not found check all two-level suffixes
  2408. rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
  2409. if (rv) return rv;
  2410. // if still not found check all two-level suffixes
  2411. rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
  2412. }
  2413. return rv;
  2414. }
  2415. #ifdef HUNSPELL_EXPERIMENTAL
  2416. // check if word with affixes is correctly spelled
  2417. char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
  2418. {
  2419. char result[MAXLNLEN];
  2420. char * st = NULL;
  2421. *result = '\0';
  2422. // check all prefixes (also crossed with suffixes if allowed)
  2423. st = prefix_check_morph(word, len, in_compound);
  2424. if (st) {
  2425. strcat(result, st);
  2426. free(st);
  2427. }
  2428. // if still not found check all suffixes
  2429. st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
  2430. if (st) {
  2431. strcat(result, st);
  2432. free(st);
  2433. }
  2434. if (havecontclass) {
  2435. sfx = NULL;
  2436. pfx = NULL;
  2437. // if still not found check all two-level suffixes
  2438. st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
  2439. if (st) {
  2440. strcat(result, st);
  2441. free(st);
  2442. }
  2443. // if still not found check all two-level suffixes
  2444. st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
  2445. if (st) {
  2446. strcat(result, st);
  2447. free(st);
  2448. }
  2449. }
  2450. return mystrdup(result);
  2451. }
  2452. #endif // END OF HUNSPELL_EXPERIMENTAL CODE
  2453. int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
  2454. int wl, const unsigned short * ap, unsigned short al, char * bad, int badl)
  2455. {
  2456. int nh=0;
  2457. // first add root word to list
  2458. if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) ||
  2459. (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
  2460. wlst[nh].word = mystrdup(ts);
  2461. wlst[nh].allow = (1 == 0);
  2462. nh++;
  2463. }
  2464. // handle suffixes
  2465. for (int i = 0; i < al; i++) {
  2466. unsigned short c = (unsigned short) ap[i];
  2467. SfxEntry * sptr = (SfxEntry *)sFlag[c];
  2468. while (sptr) {
  2469. if (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
  2470. (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0)) &&
  2471. // check pseudoroot flag
  2472. !(sptr->getCont() && ((pseudoroot &&
  2473. TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) ||
  2474. (circumfix &&
  2475. TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
  2476. (onlyincompound &&
  2477. TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
  2478. ) {
  2479. char * newword = sptr->add(ts, wl);
  2480. if (newword) {
  2481. if (nh < maxn) {
  2482. wlst[nh].word = newword;
  2483. wlst[nh].allow = sptr->allowCross();
  2484. nh++;
  2485. } else {
  2486. free(newword);
  2487. }
  2488. }
  2489. }
  2490. sptr = (SfxEntry *)sptr ->getFlgNxt();
  2491. }
  2492. }
  2493. int n = nh;
  2494. // handle cross products of prefixes and suffixes
  2495. for (int j=1;j<n ;j++)
  2496. if (wlst[j].allow) {
  2497. for (int k = 0; k < al; k++) {
  2498. unsigned short c = (unsigned short) ap[k];
  2499. PfxEntry * cptr = (PfxEntry *) pFlag[c];
  2500. while (cptr) {
  2501. if (cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
  2502. (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
  2503. int l1 = strlen(wlst[j].word);
  2504. char * newword = cptr->add(wlst[j].word, l1);
  2505. if (newword) {
  2506. if (nh < maxn) {
  2507. wlst[nh].word = newword;
  2508. wlst[nh].allow = cptr->allowCross();
  2509. nh++;
  2510. } else {
  2511. free(newword);
  2512. }
  2513. }
  2514. }
  2515. cptr = (PfxEntry *)cptr ->getFlgNxt();
  2516. }
  2517. }
  2518. }
  2519. // now handle pure prefixes
  2520. for (int m = 0; m < al; m ++) {
  2521. unsigned short c = (unsigned short) ap[m];
  2522. PfxEntry * ptr = (PfxEntry *) pFlag[c];
  2523. while (ptr) {
  2524. if (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
  2525. (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0)) &&
  2526. // check pseudoroot flag
  2527. !(ptr->getCont() && ((pseudoroot &&
  2528. TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) ||
  2529. (circumfix &&
  2530. TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
  2531. (onlyincompound &&
  2532. TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
  2533. ) {
  2534. char * newword = ptr->add(ts, wl);
  2535. if (newword) {
  2536. if (nh < maxn) {
  2537. wlst[nh].word = newword;
  2538. wlst[nh].allow = ptr->allowCross();
  2539. nh++;
  2540. } else {
  2541. free(newword);
  2542. }
  2543. }
  2544. }
  2545. ptr = (PfxEntry *)ptr ->getFlgNxt();
  2546. }
  2547. }
  2548. return nh;
  2549. }
  2550. // return length of replacing table
  2551. int AffixMgr::get_numrep()
  2552. {
  2553. return numrep;
  2554. }
  2555. // return replacing table
  2556. struct replentry * AffixMgr::get_reptable()
  2557. {
  2558. if (! reptable ) return NULL;
  2559. return reptable;
  2560. }
  2561. // return length of character map table
  2562. int AffixMgr::get_nummap()
  2563. {
  2564. return nummap;
  2565. }
  2566. // return character map table
  2567. struct mapentry * AffixMgr::get_maptable()
  2568. {
  2569. if (! maptable ) return NULL;
  2570. return maptable;
  2571. }
  2572. // return length of word break table
  2573. int AffixMgr::get_numbreak()
  2574. {
  2575. return numbreak;
  2576. }
  2577. // return character map table
  2578. char ** AffixMgr::get_breaktable()
  2579. {
  2580. if (! breaktable ) return NULL;
  2581. return breaktable;
  2582. }
  2583. // return text encoding of dictionary
  2584. char * AffixMgr::get_encoding()
  2585. {
  2586. if (! encoding ) {
  2587. encoding = mystrdup("ISO8859-1");
  2588. }
  2589. return mystrdup(encoding);
  2590. }
  2591. // return text encoding of dictionary
  2592. int AffixMgr::get_langnum()
  2593. {
  2594. return langnum;
  2595. }
  2596. // return double prefix option
  2597. int AffixMgr::get_complexprefixes()
  2598. {
  2599. return complexprefixes;
  2600. }
  2601. FLAG AffixMgr::get_keepcase()
  2602. {
  2603. return keepcase;
  2604. }
  2605. int AffixMgr::get_checksharps()
  2606. {
  2607. return checksharps;
  2608. }
  2609. // return the preferred ignore string for suggestions
  2610. char * AffixMgr::get_ignore()
  2611. {
  2612. if (!ignorechars) return NULL;
  2613. return mystrdup(ignorechars);
  2614. }
  2615. // return the preferred ignore string for suggestions
  2616. unsigned short * AffixMgr::get_ignore_utf16(int * len)
  2617. {
  2618. *len = ignorechars_utf16_len;
  2619. return ignorechars_utf16;
  2620. }
  2621. // return the preferred try string for suggestions
  2622. char * AffixMgr::get_try_string()
  2623. {
  2624. if (! trystring ) return NULL;
  2625. return mystrdup(trystring);
  2626. }
  2627. // return the preferred try string for suggestions
  2628. const char * AffixMgr::get_wordchars()
  2629. {
  2630. return wordchars;
  2631. }
  2632. unsigned short * AffixMgr::get_wordchars_utf16(int * len)
  2633. {
  2634. *len = wordchars_utf16_len;
  2635. return wordchars_utf16;
  2636. }
  2637. // is there compounding?
  2638. int AffixMgr::get_compound()
  2639. {
  2640. return compoundflag || compoundbegin || numdefcpd;
  2641. }
  2642. // return the compound words control flag
  2643. FLAG AffixMgr::get_compoundflag()
  2644. {
  2645. return compoundflag;
  2646. }
  2647. // return the forbidden words control flag
  2648. FLAG AffixMgr::get_forbiddenword()
  2649. {
  2650. return forbiddenword;
  2651. }
  2652. // return the forbidden words control flag
  2653. FLAG AffixMgr::get_nosuggest()
  2654. {
  2655. return nosuggest;
  2656. }
  2657. // return the forbidden words flag modify flag
  2658. FLAG AffixMgr::get_pseudoroot()
  2659. {
  2660. return pseudoroot;
  2661. }
  2662. // return the onlyincompound flag
  2663. FLAG AffixMgr::get_onlyincompound()
  2664. {
  2665. return onlyincompound;
  2666. }
  2667. // return the compound word signal flag
  2668. FLAG AffixMgr::get_compoundroot()
  2669. {
  2670. return compoundroot;
  2671. }
  2672. // return the compound begin signal flag
  2673. FLAG AffixMgr::get_compoundbegin()
  2674. {
  2675. return compoundbegin;
  2676. }
  2677. // return the value of checknum
  2678. int AffixMgr::get_checknum()
  2679. {
  2680. return checknum;
  2681. }
  2682. // return the value of prefix
  2683. const char * AffixMgr::get_prefix()
  2684. {
  2685. if (pfx) return ((PfxEntry *)pfx)->getKey();
  2686. return NULL;
  2687. }
  2688. // return the value of suffix
  2689. const char * AffixMgr::get_suffix()
  2690. {
  2691. return sfxappnd;
  2692. }
  2693. // return the value of derived form (base word with first suffix).
  2694. const char * AffixMgr::get_derived()
  2695. {
  2696. return derived;
  2697. }
  2698. // return the value of suffix
  2699. const char * AffixMgr::get_version()
  2700. {
  2701. return version;
  2702. }
  2703. // return lemma_present flag
  2704. FLAG AffixMgr::get_lemma_present()
  2705. {
  2706. return lemma_present;
  2707. }
  2708. // utility method to look up root words in hash table
  2709. struct hentry * AffixMgr::lookup(const char * word)
  2710. {
  2711. if (! pHMgr) return NULL;
  2712. return pHMgr->lookup(word);
  2713. }
  2714. // return the value of suffix
  2715. const int AffixMgr::have_contclass()
  2716. {
  2717. return havecontclass;
  2718. }
  2719. // return utf8
  2720. int AffixMgr::get_utf8()
  2721. {
  2722. return utf8;
  2723. }
  2724. // return nosplitsugs
  2725. int AffixMgr::get_maxngramsugs(void)
  2726. {
  2727. return maxngramsugs;
  2728. }
  2729. // return nosplitsugs
  2730. int AffixMgr::get_nosplitsugs(void)
  2731. {
  2732. return nosplitsugs;
  2733. }
  2734. // return sugswithdots
  2735. int AffixMgr::get_sugswithdots(void)
  2736. {
  2737. return sugswithdots;
  2738. }
  2739. /* parse flag */
  2740. int AffixMgr::parse_flag(char * line, unsigned short * out, const char * name) {
  2741. char * s = NULL;
  2742. if (*out != FLAG_NULL) {
  2743. HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name);
  2744. return 1;
  2745. }
  2746. if (parse_string(line, &s, name)) return 1;
  2747. *out = pHMgr->decode_flag(s);
  2748. free(s);
  2749. return 0;
  2750. }
  2751. /* parse num */
  2752. int AffixMgr::parse_num(char * line, int * out, const char * name) {
  2753. char * s = NULL;
  2754. if (*out != -1) {
  2755. HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name);
  2756. return 1;
  2757. }
  2758. if (parse_string(line, &s, name)) return 1;
  2759. *out = atoi(s);
  2760. free(s);
  2761. return 0;
  2762. }
  2763. /* parse in the max syllablecount of compound words and */
  2764. int AffixMgr::parse_cpdsyllable(char * line)
  2765. {
  2766. char * tp = line;
  2767. char * piece;
  2768. int i = 0;
  2769. int np = 0;
  2770. w_char w[MAXWORDLEN];
  2771. piece = mystrsep(&tp, 0);
  2772. while (piece) {
  2773. if (*piece != '\0') {
  2774. switch(i) {
  2775. case 0: { np++; break; }
  2776. case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
  2777. case 2: {
  2778. if (!utf8) {
  2779. cpdvowels = mystrdup(piece);
  2780. } else {
  2781. int n = u8_u16(w, MAXWORDLEN, piece);
  2782. if (n > 0) {
  2783. flag_qsort((unsigned short *) w, 0, n);
  2784. cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
  2785. if (!cpdvowels_utf16) return 1;
  2786. memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
  2787. }
  2788. cpdvowels_utf16_len = n;
  2789. }
  2790. np++;
  2791. break;
  2792. }
  2793. default: break;
  2794. }
  2795. i++;
  2796. }
  2797. free(piece);
  2798. piece = mystrsep(&tp, 0);
  2799. }
  2800. if (np < 2) {
  2801. HUNSPELL_WARNING(stderr, "error: missing compoundsyllable information\n");
  2802. return 1;
  2803. }
  2804. if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
  2805. return 0;
  2806. }
  2807. /* parse in the typical fault correcting table */
  2808. int AffixMgr::parse_reptable(char * line, FILE * af)
  2809. {
  2810. if (numrep != 0) {
  2811. HUNSPELL_WARNING(stderr, "error: duplicate REP tables used\n");
  2812. return 1;
  2813. }
  2814. char * tp = line;
  2815. char * piece;
  2816. int i = 0;
  2817. int np = 0;
  2818. piece = mystrsep(&tp, 0);
  2819. while (piece) {
  2820. if (*piece != '\0') {
  2821. switch(i) {
  2822. case 0: { np++; break; }
  2823. case 1: {
  2824. numrep = atoi(piece);
  2825. if (numrep < 1) {
  2826. HUNSPELL_WARNING(stderr, "incorrect number of entries in replacement table\n");
  2827. free(piece);
  2828. return 1;
  2829. }
  2830. reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
  2831. if (!reptable) {
  2832. free(piece);
  2833. return 1;
  2834. }
  2835. np++;
  2836. break;
  2837. }
  2838. default: break;
  2839. }
  2840. i++;
  2841. }
  2842. free(piece);
  2843. piece = mystrsep(&tp, 0);
  2844. }
  2845. if (np != 2) {
  2846. HUNSPELL_WARNING(stderr, "error: missing replacement table information\n");
  2847. return 1;
  2848. }
  2849. /* now parse the numrep lines to read in the remainder of the table */
  2850. char * nl = line;
  2851. for (int j=0; j < numrep; j++) {
  2852. if (!fgets(nl,MAXLNLEN,af)) return 1;
  2853. mychomp(nl);
  2854. tp = nl;
  2855. i = 0;
  2856. reptable[j].pattern = NULL;
  2857. reptable[j].pattern2 = NULL;
  2858. piece = mystrsep(&tp, 0);
  2859. while (piece) {
  2860. if (*piece != '\0') {
  2861. switch(i) {
  2862. case 0: {
  2863. if (strncmp(piece,"REP",3) != 0) {
  2864. HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n");
  2865. free(piece);
  2866. return 1;
  2867. }
  2868. break;
  2869. }
  2870. case 1: { reptable[j].pattern = mystrrep(mystrdup(piece),"_"," "); break; }
  2871. case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
  2872. default: break;
  2873. }
  2874. i++;
  2875. }
  2876. free(piece);
  2877. piece = mystrsep(&tp, 0);
  2878. }
  2879. if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
  2880. HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n");
  2881. return 1;
  2882. }
  2883. }
  2884. return 0;
  2885. }
  2886. /* parse in the checkcompoundpattern table */
  2887. int AffixMgr::parse_checkcpdtable(char * line, FILE * af)
  2888. {
  2889. if (numcheckcpd != 0) {
  2890. HUNSPELL_WARNING(stderr, "error: duplicate compound pattern tables used\n");
  2891. return 1;
  2892. }
  2893. char * tp = line;
  2894. char * piece;
  2895. int i = 0;
  2896. int np = 0;
  2897. piece = mystrsep(&tp, 0);
  2898. while (piece) {
  2899. if (*piece != '\0') {
  2900. switch(i) {
  2901. case 0: { np++; break; }
  2902. case 1: {
  2903. numcheckcpd = atoi(piece);
  2904. if (numcheckcpd < 1) {
  2905. HUNSPELL_WARNING(stderr, "incorrect number of entries in compound pattern table\n");
  2906. free(piece);
  2907. return 1;
  2908. }
  2909. checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry));
  2910. if (!checkcpdtable) {
  2911. free(piece);
  2912. return 1;
  2913. }
  2914. np++;
  2915. break;
  2916. }
  2917. default: break;
  2918. }
  2919. i++;
  2920. }
  2921. free(piece);
  2922. piece = mystrsep(&tp, 0);
  2923. }
  2924. if (np != 2) {
  2925. HUNSPELL_WARNING(stderr, "error: missing compound pattern table information\n");
  2926. return 1;
  2927. }
  2928. /* now parse the numcheckcpd lines to read in the remainder of the table */
  2929. char * nl = line;
  2930. for (int j=0; j < numcheckcpd; j++) {
  2931. if (!fgets(nl,MAXLNLEN,af)) return 1;
  2932. mychomp(nl);
  2933. tp = nl;
  2934. i = 0;
  2935. checkcpdtable[j].pattern = NULL;
  2936. checkcpdtable[j].pattern2 = NULL;
  2937. piece = mystrsep(&tp, 0);
  2938. while (piece) {
  2939. if (*piece != '\0') {
  2940. switch(i) {
  2941. case 0: {
  2942. if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
  2943. HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n");
  2944. free(piece);
  2945. return 1;
  2946. }
  2947. break;
  2948. }
  2949. case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; }
  2950. case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; }
  2951. default: break;
  2952. }
  2953. i++;
  2954. }
  2955. free(piece);
  2956. piece = mystrsep(&tp, 0);
  2957. }
  2958. if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
  2959. HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n");
  2960. return 1;
  2961. }
  2962. }
  2963. return 0;
  2964. }
  2965. /* parse in the compound rule table */
  2966. int AffixMgr::parse_defcpdtable(char * line, FILE * af)
  2967. {
  2968. if (numdefcpd != 0) {
  2969. HUNSPELL_WARNING(stderr, "error: duplicate compound rule tables used\n");
  2970. return 1;
  2971. }
  2972. char * tp = line;
  2973. char * piece;
  2974. int i = 0;
  2975. int np = 0;
  2976. piece = mystrsep(&tp, 0);
  2977. while (piece) {
  2978. if (*piece != '\0') {
  2979. switch(i) {
  2980. case 0: { np++; break; }
  2981. case 1: {
  2982. numdefcpd = atoi(piece);
  2983. if (numdefcpd < 1) {
  2984. HUNSPELL_WARNING(stderr, "incorrect number of entries in compound rule table\n");
  2985. free(piece);
  2986. return 1;
  2987. }
  2988. defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
  2989. if (!defcpdtable) {
  2990. free(piece);
  2991. return 1;
  2992. }
  2993. np++;
  2994. break;
  2995. }
  2996. default: break;
  2997. }
  2998. i++;
  2999. }
  3000. free(piece);
  3001. piece = mystrsep(&tp, 0);
  3002. }
  3003. if (np != 2) {
  3004. HUNSPELL_WARNING(stderr, "error: missing compound rule table information\n");
  3005. return 1;
  3006. }
  3007. /* now parse the numdefcpd lines to read in the remainder of the table */
  3008. char * nl = line;
  3009. for (int j=0; j < numdefcpd; j++) {
  3010. if (!fgets(nl,MAXLNLEN,af)) return 1;
  3011. mychomp(nl);
  3012. tp = nl;
  3013. i = 0;
  3014. defcpdtable[j].def = NULL;
  3015. piece = mystrsep(&tp, 0);
  3016. while (piece) {
  3017. if (*piece != '\0') {
  3018. switch(i) {
  3019. case 0: {
  3020. if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
  3021. HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n");
  3022. free(piece);
  3023. return 1;
  3024. }
  3025. break;
  3026. }
  3027. case 1: {
  3028. defcpdtable[j].len =
  3029. pHMgr->decode_flags(&(defcpdtable[j].def), piece);
  3030. break;
  3031. }
  3032. default: break;
  3033. }
  3034. i++;
  3035. }
  3036. free(piece);
  3037. piece = mystrsep(&tp, 0);
  3038. }
  3039. if (!defcpdtable[j].len) {
  3040. HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n");
  3041. return 1;
  3042. }
  3043. }
  3044. return 0;
  3045. }
  3046. /* parse in the character map table */
  3047. int AffixMgr::parse_maptable(char * line, FILE * af)
  3048. {
  3049. if (nummap != 0) {
  3050. HUNSPELL_WARNING(stderr, "error: duplicate MAP tables used\n");
  3051. return 1;
  3052. }
  3053. char * tp = line;
  3054. char * piece;
  3055. int i = 0;
  3056. int np = 0;
  3057. piece = mystrsep(&tp, 0);
  3058. while (piece) {
  3059. if (*piece != '\0') {
  3060. switch(i) {
  3061. case 0: { np++; break; }
  3062. case 1: {
  3063. nummap = atoi(piece);
  3064. if (nummap < 1) {
  3065. HUNSPELL_WARNING(stderr, "incorrect number of entries in map table\n");
  3066. free(piece);
  3067. return 1;
  3068. }
  3069. maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
  3070. if (!maptable) {
  3071. free(piece);
  3072. return 1;
  3073. }
  3074. np++;
  3075. break;
  3076. }
  3077. default: break;
  3078. }
  3079. i++;
  3080. }
  3081. free(piece);
  3082. piece = mystrsep(&tp, 0);
  3083. }
  3084. if (np != 2) {
  3085. HUNSPELL_WARNING(stderr, "error: missing map table information\n");
  3086. return 1;
  3087. }
  3088. /* now parse the nummap lines to read in the remainder of the table */
  3089. char * nl = line;
  3090. for (int j=0; j < nummap; j++) {
  3091. if (!fgets(nl,MAXLNLEN,af)) return 1;
  3092. mychomp(nl);
  3093. tp = nl;
  3094. i = 0;
  3095. maptable[j].set = NULL;
  3096. maptable[j].len = 0;
  3097. piece = mystrsep(&tp, 0);
  3098. while (piece) {
  3099. if (*piece != '\0') {
  3100. switch(i) {
  3101. case 0: {
  3102. if (strncmp(piece,"MAP",3) != 0) {
  3103. HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
  3104. free(piece);
  3105. return 1;
  3106. }
  3107. break;
  3108. }
  3109. case 1: {
  3110. maptable[j].len = 0;
  3111. maptable[j].set = NULL;
  3112. maptable[j].set_utf16 = NULL;
  3113. if (!utf8) {
  3114. maptable[j].set = mystrdup(piece);
  3115. maptable[j].len = strlen(maptable[j].set);
  3116. } else {
  3117. w_char w[MAXWORDLEN];
  3118. int n = u8_u16(w, MAXWORDLEN, piece);
  3119. if (n > 0) {
  3120. flag_qsort((unsigned short *) w, 0, n);
  3121. maptable[j].set_utf16 = (w_char *) malloc(n * sizeof(w_char));
  3122. if (!maptable[j].set_utf16) return 1;
  3123. memcpy(maptable[j].set_utf16, w, n * sizeof(w_char));
  3124. }
  3125. maptable[j].len = n;
  3126. }
  3127. break; }
  3128. default: break;
  3129. }
  3130. i++;
  3131. }
  3132. free(piece);
  3133. piece = mystrsep(&tp, 0);
  3134. }
  3135. if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) {
  3136. HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
  3137. return 1;
  3138. }
  3139. }
  3140. return 0;
  3141. }
  3142. /* parse in the word breakpoint table */
  3143. int AffixMgr::parse_breaktable(char * line, FILE * af)
  3144. {
  3145. if (numbreak != 0) {
  3146. HUNSPELL_WARNING(stderr, "error: duplicate word breakpoint tables used\n");
  3147. return 1;
  3148. }
  3149. char * tp = line;
  3150. char * piece;
  3151. int i = 0;
  3152. int np = 0;
  3153. piece = mystrsep(&tp, 0);
  3154. while (piece) {
  3155. if (*piece != '\0') {
  3156. switch(i) {
  3157. case 0: { np++; break; }
  3158. case 1: {
  3159. numbreak = atoi(piece);
  3160. if (numbreak < 1) {
  3161. HUNSPELL_WARNING(stderr, "incorrect number of entries in BREAK table\n");
  3162. free(piece);
  3163. return 1;
  3164. }
  3165. breaktable = (char **) malloc(numbreak * sizeof(char *));
  3166. if (!breaktable) {
  3167. free(piece);
  3168. return 1;
  3169. }
  3170. np++;
  3171. break;
  3172. }
  3173. default: break;
  3174. }
  3175. i++;
  3176. }
  3177. free(piece);
  3178. piece = mystrsep(&tp, 0);
  3179. }
  3180. if (np != 2) {
  3181. HUNSPELL_WARNING(stderr, "error: missing word breakpoint table information\n");
  3182. return 1;
  3183. }
  3184. /* now parse the numbreak lines to read in the remainder of the table */
  3185. char * nl = line;
  3186. for (int j=0; j < numbreak; j++) {
  3187. if (!fgets(nl,MAXLNLEN,af)) return 1;
  3188. mychomp(nl);
  3189. tp = nl;
  3190. i = 0;
  3191. piece = mystrsep(&tp, 0);
  3192. while (piece) {
  3193. if (*piece != '\0') {
  3194. switch(i) {
  3195. case 0: {
  3196. if (strncmp(piece,"BREAK",5) != 0) {
  3197. HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n");
  3198. free(piece);
  3199. return 1;
  3200. }
  3201. break;
  3202. }
  3203. case 1: {
  3204. breaktable[j] = mystrdup(piece);
  3205. break;
  3206. }
  3207. default: break;
  3208. }
  3209. i++;
  3210. }
  3211. free(piece);
  3212. piece = mystrsep(&tp, 0);
  3213. }
  3214. if (!breaktable) {
  3215. HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n");
  3216. return 1;
  3217. }
  3218. }
  3219. return 0;
  3220. }
  3221. int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags)
  3222. {
  3223. int numents = 0; // number of affentry structures to parse
  3224. unsigned short aflag = 0; // affix char identifier
  3225. char ff=0;
  3226. struct affentry * ptr= NULL;
  3227. struct affentry * nptr= NULL;
  3228. char * tp = line;
  3229. char * nl = line;
  3230. char * piece;
  3231. int i = 0;
  3232. // checking lines with bad syntax
  3233. #ifdef DEBUG
  3234. int basefieldnum = 0;
  3235. #endif
  3236. // split affix header line into pieces
  3237. int np = 0;
  3238. piece = mystrsep(&tp, 0);
  3239. while (piece) {
  3240. if (*piece != '\0') {
  3241. switch(i) {
  3242. // piece 1 - is type of affix
  3243. case 0: { np++; break; }
  3244. // piece 2 - is affix char
  3245. case 1: {
  3246. np++;
  3247. aflag = pHMgr->decode_flag(piece);
  3248. if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
  3249. ((at == 'P') && (dupflags[aflag] & dupPFX))) {
  3250. HUNSPELL_WARNING(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl);
  3251. // return 1; XXX permissive mode for bad dictionaries
  3252. }
  3253. dupflags[aflag] += ((at == 'S') ? dupSFX : dupPFX);
  3254. break;
  3255. }
  3256. // piece 3 - is cross product indicator
  3257. case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
  3258. // piece 4 - is number of affentries
  3259. case 3: {
  3260. np++;
  3261. numents = atoi(piece);
  3262. if (numents == 0) {
  3263. char * err = pHMgr->encode_flag(aflag);
  3264. HUNSPELL_WARNING(stderr, "error: affix %s header has incorrect entry count in line %s\n",
  3265. err, nl);
  3266. free(err);
  3267. return 1;
  3268. }
  3269. ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
  3270. if (!ptr) return 1;
  3271. ptr->opts = ff;
  3272. if (utf8) ptr->opts += aeUTF8;
  3273. if (pHMgr->is_aliasf()) ptr->opts += aeALIASF;
  3274. #ifdef HUNSPELL_EXPERIMENTAL
  3275. if (pHMgr->is_aliasm()) ptr->opts += aeALIASM;
  3276. #endif
  3277. ptr->aflag = aflag;
  3278. }
  3279. default: break;
  3280. }
  3281. i++;
  3282. }
  3283. free(piece);
  3284. piece = mystrsep(&tp, 0);
  3285. }
  3286. // check to make sure we parsed enough pieces
  3287. if (np != 4) {
  3288. char * err = pHMgr->encode_flag(aflag);
  3289. HUNSPELL_WARNING(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl);
  3290. free(err);
  3291. free(ptr);
  3292. return 1;
  3293. }
  3294. // store away ptr to first affentry
  3295. nptr = ptr;
  3296. // now parse numents affentries for this affix
  3297. for (int j=0; j < numents; j++) {
  3298. if (!fgets(nl,MAXLNLEN,af)) return 1;
  3299. mychomp(nl);
  3300. tp = nl;
  3301. i = 0;
  3302. np = 0;
  3303. // split line into pieces
  3304. piece = mystrsep(&tp, 0);
  3305. while (piece) {
  3306. if (*piece != '\0') {
  3307. switch(i) {
  3308. // piece 1 - is type
  3309. case 0: {
  3310. np++;
  3311. if (nptr != ptr) nptr->opts = ptr->opts;
  3312. break;
  3313. }
  3314. // piece 2 - is affix char
  3315. case 1: {
  3316. np++;
  3317. if (pHMgr->decode_flag(piece) != aflag) {
  3318. char * err = pHMgr->encode_flag(aflag);
  3319. HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
  3320. HUNSPELL_WARNING(stderr, "error: possible incorrect count\n");
  3321. free(err);
  3322. free(piece);
  3323. return 1;
  3324. }
  3325. if (nptr != ptr) nptr->aflag = ptr->aflag;
  3326. break;
  3327. }
  3328. // piece 3 - is string to strip or 0 for null
  3329. case 2: {
  3330. np++;
  3331. if (complexprefixes) {
  3332. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3333. }
  3334. nptr->strip = mystrdup(piece);
  3335. nptr->stripl = (unsigned char) strlen(nptr->strip);
  3336. if (strcmp(nptr->strip,"0") == 0) {
  3337. free(nptr->strip);
  3338. nptr->strip=mystrdup("");
  3339. nptr->stripl = 0;
  3340. }
  3341. break;
  3342. }
  3343. // piece 4 - is affix string or 0 for null
  3344. case 3: {
  3345. char * dash;
  3346. #ifdef HUNSPELL_EXPERIMENTAL
  3347. nptr->morphcode = NULL;
  3348. #endif
  3349. nptr->contclass = NULL;
  3350. nptr->contclasslen = 0;
  3351. np++;
  3352. dash = strchr(piece, '/');
  3353. if (dash) {
  3354. *dash = '\0';
  3355. if (ignorechars) {
  3356. if (utf8) {
  3357. remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
  3358. } else {
  3359. remove_ignored_chars(piece,ignorechars);
  3360. }
  3361. }
  3362. if (complexprefixes) {
  3363. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3364. }
  3365. nptr->appnd = mystrdup(piece);
  3366. if (pHMgr->is_aliasf()) {
  3367. int index = atoi(dash + 1);
  3368. nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass));
  3369. } else {
  3370. nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1);
  3371. flag_qsort(nptr->contclass, 0, nptr->contclasslen);
  3372. }
  3373. *dash = '/';
  3374. havecontclass = 1;
  3375. for (unsigned short _i = 0; _i < nptr->contclasslen; _i++) {
  3376. contclasses[(nptr->contclass)[_i]] = 1;
  3377. }
  3378. } else {
  3379. if (ignorechars) {
  3380. if (utf8) {
  3381. remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
  3382. } else {
  3383. remove_ignored_chars(piece,ignorechars);
  3384. }
  3385. }
  3386. if (complexprefixes) {
  3387. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3388. }
  3389. nptr->appnd = mystrdup(piece);
  3390. }
  3391. nptr->appndl = (unsigned char) strlen(nptr->appnd);
  3392. if (strcmp(nptr->appnd,"0") == 0) {
  3393. free(nptr->appnd);
  3394. nptr->appnd=mystrdup("");
  3395. nptr->appndl = 0;
  3396. }
  3397. break;
  3398. }
  3399. // piece 5 - is the conditions descriptions
  3400. case 4: {
  3401. np++;
  3402. if (complexprefixes) {
  3403. int neg = 0;
  3404. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3405. // reverse condition
  3406. for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
  3407. switch(*k) {
  3408. case '[': {
  3409. if (neg) *(k+1) = '['; else *k = ']';
  3410. break;
  3411. }
  3412. case ']': {
  3413. *k = '[';
  3414. if (neg) *(k+1) = '^';
  3415. neg = 0;
  3416. break;
  3417. }
  3418. case '^': {
  3419. if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
  3420. break;
  3421. }
  3422. default: {
  3423. if (neg) *(k+1) = *k;
  3424. }
  3425. }
  3426. }
  3427. }
  3428. if (nptr->stripl && (strcmp(piece, ".") != 0) &&
  3429. redundant_condition(at, nptr->strip, nptr->stripl, piece, nl))
  3430. strcpy(piece, ".");
  3431. if (encodeit(nptr,piece)) return 1;
  3432. break;
  3433. }
  3434. #ifdef HUNSPELL_EXPERIMENTAL
  3435. case 5: {
  3436. np++;
  3437. if (pHMgr->is_aliasm()) {
  3438. int index = atoi(piece);
  3439. nptr->morphcode = pHMgr->get_aliasm(index);
  3440. } else {
  3441. if (complexprefixes) {
  3442. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3443. }
  3444. nptr->morphcode = mystrdup(piece);
  3445. }
  3446. break;
  3447. }
  3448. #endif
  3449. default: break;
  3450. }
  3451. i++;
  3452. }
  3453. free(piece);
  3454. piece = mystrsep(&tp, 0);
  3455. }
  3456. // check to make sure we parsed enough pieces
  3457. if (np < 4) {
  3458. char * err = pHMgr->encode_flag(aflag);
  3459. HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
  3460. free(err);
  3461. free(ptr);
  3462. return 1;
  3463. }
  3464. #ifdef DEBUG
  3465. #ifdef HUNSPELL_EXPERIMENTAL
  3466. // detect unnecessary fields, excepting comments
  3467. if (basefieldnum) {
  3468. int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
  3469. if (fieldnum != basefieldnum)
  3470. HUNSPELL_WARNING(stderr, "warning: bad field number:\n%s\n", nl);
  3471. } else {
  3472. basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
  3473. }
  3474. #endif
  3475. #endif
  3476. nptr++;
  3477. }
  3478. // now create SfxEntry or PfxEntry objects and use links to
  3479. // build an ordered (sorted by affix string) list
  3480. nptr = ptr;
  3481. for (int k = 0; k < numents; k++) {
  3482. if (at == 'P') {
  3483. PfxEntry * pfxptr = new PfxEntry(this,nptr);
  3484. build_pfxtree((AffEntry *)pfxptr);
  3485. } else {
  3486. SfxEntry * sfxptr = new SfxEntry(this,nptr);
  3487. build_sfxtree((AffEntry *)sfxptr);
  3488. }
  3489. nptr++;
  3490. }
  3491. free(ptr);
  3492. return 0;
  3493. }
  3494. int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * line) {
  3495. int condl = strlen(cond);
  3496. int i;
  3497. int j;
  3498. int neg;
  3499. int in;
  3500. if (ft == 'P') { // prefix
  3501. if (strncmp(strip, cond, condl) == 0) return 1;
  3502. if (utf8) {
  3503. } else {
  3504. for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
  3505. if (cond[j] != '[') {
  3506. if (cond[j] != strip[i]) {
  3507. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
  3508. }
  3509. } else {
  3510. neg = (cond[j+1] == '^') ? 1 : 0;
  3511. in = 0;
  3512. do {
  3513. j++;
  3514. if (strip[i] == cond[j]) in = 1;
  3515. } while ((j < (condl - 1)) && (cond[j] != ']'));
  3516. if (j == (condl - 1) && (cond[j] != ']')) {
  3517. HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line);
  3518. return 0;
  3519. }
  3520. if ((!neg && !in) || (neg && in)) {
  3521. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
  3522. return 0;
  3523. }
  3524. }
  3525. }
  3526. if (j >= condl) return 1;
  3527. }
  3528. } else { // suffix
  3529. if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
  3530. if (utf8) {
  3531. } else {
  3532. for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
  3533. if (cond[j] != ']') {
  3534. if (cond[j] != strip[i]) {
  3535. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
  3536. }
  3537. } else {
  3538. in = 0;
  3539. do {
  3540. j--;
  3541. if (strip[i] == cond[j]) in = 1;
  3542. } while ((j > 0) && (cond[j] != '['));
  3543. if ((j == 0) && (cond[j] != '[')) {
  3544. HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line);
  3545. return 0;
  3546. }
  3547. neg = (cond[j+1] == '^') ? 1 : 0;
  3548. if ((!neg && !in) || (neg && in)) {
  3549. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
  3550. return 0;
  3551. }
  3552. }
  3553. }
  3554. if (j < 0) return 1;
  3555. }
  3556. }
  3557. return 0;
  3558. }