PageRenderTime 68ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 1ms

/enchant-1.6.0/src/myspell/affixmgr.cxx

#
C++ | 4115 lines | 3262 code | 417 blank | 436 comment | 1325 complexity | 159738d71666c8f2bd4397deb25c4301 MD5 | raw file
Possible License(s): LGPL-2.1, MPL-2.0-no-copyleft-exception
  1. #include "license.hunspell"
  2. #include "license.myspell"
  3. #ifndef MOZILLA_CLIENT
  4. #include <cstdlib>
  5. #include <cstring>
  6. #include <cctype>
  7. #include <cstdio>
  8. #else
  9. #include <stdlib.h>
  10. #include <string.h>
  11. #include <stdio.h>
  12. #include <ctype.h>
  13. #endif
  14. #include "affixmgr.hxx"
  15. #include "affentry.hxx"
  16. #include "langnum.hxx"
  17. #include "csutil.hxx"
  18. #ifndef MOZILLA_CLIENT
  19. #ifndef WIN32
  20. using namespace std;
  21. #endif
  22. #endif
  23. AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
  24. {
  25. // register hash manager and load affix data from aff file
  26. pHMgr = ptr[0];
  27. alldic = ptr;
  28. maxdic = md;
  29. keystring = NULL;
  30. trystring = NULL;
  31. encoding=NULL;
  32. utf8 = 0;
  33. complexprefixes = 0;
  34. maptable = NULL;
  35. nummap = 0;
  36. breaktable = NULL;
  37. numbreak = 0;
  38. reptable = NULL;
  39. numrep = 0;
  40. checkcpdtable = NULL;
  41. numcheckcpd = 0;
  42. defcpdtable = NULL;
  43. numdefcpd = 0;
  44. phone = NULL;
  45. compoundflag = FLAG_NULL; // permits word in compound forms
  46. compoundbegin = FLAG_NULL; // may be first word in compound forms
  47. compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  48. compoundend = FLAG_NULL; // may be last word in compound forms
  49. compoundroot = FLAG_NULL; // compound word signing flag
  50. compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  51. compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  52. checkcompounddup = 0; // forbid double words in compounds
  53. checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  54. checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  55. checkcompoundtriple = 0; // forbid compounds with triple letters
  56. forbiddenword = FORBIDDENWORD; // forbidden word signing flag
  57. nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  58. lang = NULL; // language
  59. langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  60. needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
  61. cpdwordmax = -1; // default: unlimited wordcount in compound words
  62. cpdmin = -1; // undefined
  63. cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  64. cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  65. cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  66. cpdvowels_utf16_len=0; // vowels
  67. pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  68. sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  69. cpdsyllablenum=NULL; // syllable count incrementing flag
  70. checknum=0; // checking numbers, and word with numbers
  71. wordchars=NULL; // letters + spec. word characters
  72. wordchars_utf16=NULL; // letters + spec. word characters
  73. wordchars_utf16_len=0; // letters + spec. word characters
  74. ignorechars=NULL; // letters + spec. word characters
  75. ignorechars_utf16=NULL; // letters + spec. word characters
  76. ignorechars_utf16_len=0; // letters + spec. word characters
  77. version=NULL; // affix and dictionary file version string
  78. havecontclass=0; // flags of possible continuing classes (double affix)
  79. // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  80. // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  81. lemma_present = FLAG_NULL;
  82. circumfix = FLAG_NULL;
  83. onlyincompound = FLAG_NULL;
  84. flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file
  85. maxngramsugs = -1; // undefined
  86. nosplitsugs = 0;
  87. sugswithdots = 0;
  88. keepcase = 0;
  89. checksharps = 0;
  90. substandard = FLAG_NULL;
  91. derived = NULL; // XXX not threadsafe variable for experimental stemming
  92. sfx = NULL;
  93. pfx = NULL;
  94. for (int i=0; i < SETSIZE; i++) {
  95. pStart[i] = NULL;
  96. sStart[i] = NULL;
  97. pFlag[i] = NULL;
  98. sFlag[i] = NULL;
  99. }
  100. for (int j=0; j < CONTSIZE; j++) {
  101. contclasses[j] = 0;
  102. }
  103. if (parse_file(affpath, key)) {
  104. HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  105. }
  106. if (cpdmin == -1) cpdmin = MINCPDLEN;
  107. }
  108. AffixMgr::~AffixMgr()
  109. {
  110. // pass through linked prefix entries and clean up
  111. for (int i=0; i < SETSIZE ;i++) {
  112. pFlag[i] = NULL;
  113. PfxEntry * ptr = (PfxEntry *)pStart[i];
  114. PfxEntry * nptr = NULL;
  115. while (ptr) {
  116. nptr = ptr->getNext();
  117. delete(ptr);
  118. ptr = nptr;
  119. nptr = NULL;
  120. }
  121. }
  122. // pass through linked suffix entries and clean up
  123. for (int j=0; j < SETSIZE ; j++) {
  124. sFlag[j] = NULL;
  125. SfxEntry * ptr = (SfxEntry *)sStart[j];
  126. SfxEntry * nptr = NULL;
  127. while (ptr) {
  128. nptr = ptr->getNext();
  129. delete(ptr);
  130. ptr = nptr;
  131. nptr = NULL;
  132. }
  133. sStart[j] = NULL;
  134. }
  135. if (keystring) free(keystring);
  136. keystring=NULL;
  137. if (trystring) free(trystring);
  138. trystring=NULL;
  139. if (encoding) free(encoding);
  140. encoding=NULL;
  141. if (maptable) {
  142. for (int j=0; j < nummap; j++) {
  143. if (maptable[j].set) free(maptable[j].set);
  144. if (maptable[j].set_utf16) free(maptable[j].set_utf16);
  145. maptable[j].set = NULL;
  146. maptable[j].len = 0;
  147. }
  148. free(maptable);
  149. maptable = NULL;
  150. }
  151. nummap = 0;
  152. if (breaktable) {
  153. for (int j=0; j < numbreak; j++) {
  154. if (breaktable[j]) free(breaktable[j]);
  155. breaktable[j] = NULL;
  156. }
  157. free(breaktable);
  158. breaktable = NULL;
  159. }
  160. numbreak = 0;
  161. if (reptable) {
  162. for (int j=0; j < numrep; j++) {
  163. free(reptable[j].pattern);
  164. free(reptable[j].pattern2);
  165. }
  166. free(reptable);
  167. reptable = NULL;
  168. }
  169. if (phone && phone->rules) {
  170. for (int j=0; j < phone->num + 1; j++) {
  171. free(phone->rules[j * 2]);
  172. free(phone->rules[j * 2 + 1]);
  173. }
  174. free(phone->rules);
  175. free(phone);
  176. phone = NULL;
  177. }
  178. if (defcpdtable) {
  179. for (int j=0; j < numdefcpd; j++) {
  180. free(defcpdtable[j].def);
  181. defcpdtable[j].def = NULL;
  182. }
  183. free(defcpdtable);
  184. defcpdtable = NULL;
  185. }
  186. numrep = 0;
  187. if (checkcpdtable) {
  188. for (int j=0; j < numcheckcpd; j++) {
  189. free(checkcpdtable[j].pattern);
  190. free(checkcpdtable[j].pattern2);
  191. checkcpdtable[j].pattern = NULL;
  192. checkcpdtable[j].pattern2 = NULL;
  193. }
  194. free(checkcpdtable);
  195. checkcpdtable = NULL;
  196. }
  197. numcheckcpd = 0;
  198. FREE_FLAG(compoundflag);
  199. FREE_FLAG(compoundbegin);
  200. FREE_FLAG(compoundmiddle);
  201. FREE_FLAG(compoundend);
  202. FREE_FLAG(compoundpermitflag);
  203. FREE_FLAG(compoundforbidflag);
  204. FREE_FLAG(compoundroot);
  205. FREE_FLAG(forbiddenword);
  206. FREE_FLAG(nosuggest);
  207. FREE_FLAG(needaffix);
  208. FREE_FLAG(lemma_present);
  209. FREE_FLAG(circumfix);
  210. FREE_FLAG(onlyincompound);
  211. cpdwordmax = 0;
  212. pHMgr = NULL;
  213. cpdmin = 0;
  214. cpdmaxsyllable = 0;
  215. if (cpdvowels) free(cpdvowels);
  216. if (cpdvowels_utf16) free(cpdvowels_utf16);
  217. if (cpdsyllablenum) free(cpdsyllablenum);
  218. free_utf_tbl();
  219. if (lang) free(lang);
  220. if (wordchars) free(wordchars);
  221. if (wordchars_utf16) free(wordchars_utf16);
  222. if (ignorechars) free(ignorechars);
  223. if (ignorechars_utf16) free(ignorechars_utf16);
  224. if (version) free(version);
  225. if (derived) free(derived);
  226. checknum=0;
  227. }
  228. // read in aff file and build up prefix and suffix entry objects
  229. int AffixMgr::parse_file(const char * affpath, const char * key)
  230. {
  231. char * line; // io buffers
  232. char ft; // affix type
  233. // checking flag duplication
  234. char dupflags[CONTSIZE];
  235. char dupflags_ini = 1;
  236. // first line indicator for removing byte order mark
  237. int firstline = 1;
  238. // open the affix file
  239. FileMgr * afflst = new FileMgr(affpath, key);
  240. if (!afflst) {
  241. HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
  242. return 1;
  243. }
  244. // step one is to parse the affix file building up the internal
  245. // affix data structures
  246. // read in each line ignoring any that do not
  247. // start with a known line type indicator
  248. while ((line = afflst->getline())) {
  249. mychomp(line);
  250. /* remove byte order mark */
  251. if (firstline) {
  252. firstline = 0;
  253. if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
  254. memmove(line, line+3, strlen(line+3)+1);
  255. HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
  256. }
  257. }
  258. /* parse in the keyboard string */
  259. if (strncmp(line,"KEY",3) == 0) {
  260. if (parse_string(line, &keystring, "KEY")) {
  261. delete afflst;
  262. return 1;
  263. }
  264. }
  265. /* parse in the try string */
  266. if (strncmp(line,"TRY",3) == 0) {
  267. if (parse_string(line, &trystring, "TRY")) {
  268. delete afflst;
  269. return 1;
  270. }
  271. }
  272. /* parse in the name of the character set used by the .dict and .aff */
  273. if (strncmp(line,"SET",3) == 0) {
  274. if (parse_string(line, &encoding, "SET")) {
  275. delete afflst;
  276. return 1;
  277. }
  278. if (strcmp(encoding, "UTF-8") == 0) {
  279. utf8 = 1;
  280. #ifndef OPENOFFICEORG
  281. #ifndef MOZILLA_CLIENT
  282. if (initialize_utf_tbl()) return 1;
  283. #endif
  284. #endif
  285. }
  286. }
  287. /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
  288. if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
  289. complexprefixes = 1;
  290. /* parse in the flag used by the controlled compound words */
  291. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  292. if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) {
  293. delete afflst;
  294. return 1;
  295. }
  296. }
  297. /* parse in the flag used by compound words */
  298. if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
  299. if (complexprefixes) {
  300. if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) {
  301. delete afflst;
  302. return 1;
  303. }
  304. } else {
  305. if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) {
  306. delete afflst;
  307. return 1;
  308. }
  309. }
  310. }
  311. /* parse in the flag used by compound words */
  312. if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
  313. if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) {
  314. delete afflst;
  315. return 1;
  316. }
  317. }
  318. /* parse in the flag used by compound words */
  319. if (strncmp(line,"COMPOUNDEND",11) == 0) {
  320. if (complexprefixes) {
  321. if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) {
  322. delete afflst;
  323. return 1;
  324. }
  325. } else {
  326. if (parse_flag(line, &compoundend, "COMPOUNDEND")) {
  327. delete afflst;
  328. return 1;
  329. }
  330. }
  331. }
  332. /* parse in the data used by compound_check() method */
  333. if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
  334. if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) {
  335. delete afflst;
  336. return 1;
  337. }
  338. }
  339. /* parse in the flag sign compounds in dictionary */
  340. if (strncmp(line,"COMPOUNDROOT",12) == 0) {
  341. if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) {
  342. delete afflst;
  343. return 1;
  344. }
  345. }
  346. /* parse in the flag used by compound_check() method */
  347. if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
  348. if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) {
  349. delete afflst;
  350. return 1;
  351. }
  352. }
  353. /* parse in the flag used by compound_check() method */
  354. if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
  355. if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) {
  356. delete afflst;
  357. return 1;
  358. }
  359. }
  360. if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
  361. checkcompounddup = 1;
  362. }
  363. if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
  364. checkcompoundrep = 1;
  365. }
  366. if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
  367. checkcompoundtriple = 1;
  368. }
  369. if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
  370. checkcompoundcase = 1;
  371. }
  372. if (strncmp(line,"NOSUGGEST",9) == 0) {
  373. if (parse_flag(line, &nosuggest, "NOSUGGEST")) {
  374. delete afflst;
  375. return 1;
  376. }
  377. }
  378. /* parse in the flag used by forbidden words */
  379. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  380. if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) {
  381. delete afflst;
  382. return 1;
  383. }
  384. }
  385. /* parse in the flag used by forbidden words */
  386. if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
  387. if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) {
  388. delete afflst;
  389. return 1;
  390. }
  391. }
  392. /* parse in the flag used by circumfixes */
  393. if (strncmp(line,"CIRCUMFIX",9) == 0) {
  394. if (parse_flag(line, &circumfix, "CIRCUMFIX")) {
  395. delete afflst;
  396. return 1;
  397. }
  398. }
  399. /* parse in the flag used by fogemorphemes */
  400. if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
  401. if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) {
  402. delete afflst;
  403. return 1;
  404. }
  405. }
  406. /* parse in the flag used by `needaffixs' */
  407. if (strncmp(line,"PSEUDOROOT",10) == 0) {
  408. if (parse_flag(line, &needaffix, "PSEUDOROOT")) {
  409. delete afflst;
  410. return 1;
  411. }
  412. }
  413. /* parse in the flag used by `needaffixs' */
  414. if (strncmp(line,"NEEDAFFIX",9) == 0) {
  415. if (parse_flag(line, &needaffix, "NEEDAFFIX")) {
  416. delete afflst;
  417. return 1;
  418. }
  419. }
  420. /* parse in the minimal length for words in compounds */
  421. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  422. if (parse_num(line, &cpdmin, "COMPOUNDMIN")) {
  423. delete afflst;
  424. return 1;
  425. }
  426. if (cpdmin < 1) cpdmin = 1;
  427. }
  428. /* parse in the max. words and syllables in compounds */
  429. if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
  430. if (parse_cpdsyllable(line)) {
  431. delete afflst;
  432. return 1;
  433. }
  434. }
  435. /* parse in the flag used by compound_check() method */
  436. if (strncmp(line,"SYLLABLENUM",11) == 0) {
  437. if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) {
  438. delete afflst;
  439. return 1;
  440. }
  441. }
  442. /* parse in the flag used by the controlled compound words */
  443. if (strncmp(line,"CHECKNUM",8) == 0) {
  444. checknum=1;
  445. }
  446. /* parse in the extra word characters */
  447. if (strncmp(line,"WORDCHARS",9) == 0) {
  448. if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) {
  449. delete afflst;
  450. return 1;
  451. }
  452. }
  453. /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
  454. if (strncmp(line,"IGNORE",6) == 0) {
  455. if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
  456. delete afflst;
  457. return 1;
  458. }
  459. }
  460. /* parse in the typical fault correcting table */
  461. if (strncmp(line,"REP",3) == 0) {
  462. if (parse_reptable(line, afflst)) {
  463. delete afflst;
  464. return 1;
  465. }
  466. }
  467. /* parse in the phonetic translation table */
  468. if (strncmp(line,"PHONE",5) == 0) {
  469. if (parse_phonetable(line, afflst)) {
  470. delete afflst;
  471. return 1;
  472. }
  473. }
  474. /* parse in the checkcompoundpattern table */
  475. if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
  476. if (parse_checkcpdtable(line, afflst)) {
  477. delete afflst;
  478. return 1;
  479. }
  480. }
  481. /* parse in the defcompound table */
  482. if (strncmp(line,"COMPOUNDRULE",12) == 0) {
  483. if (parse_defcpdtable(line, afflst)) {
  484. delete afflst;
  485. return 1;
  486. }
  487. }
  488. /* parse in the related character map table */
  489. if (strncmp(line,"MAP",3) == 0) {
  490. if (parse_maptable(line, afflst)) {
  491. delete afflst;
  492. return 1;
  493. }
  494. }
  495. /* parse in the word breakpoints table */
  496. if (strncmp(line,"BREAK",5) == 0) {
  497. if (parse_breaktable(line, afflst)) {
  498. delete afflst;
  499. return 1;
  500. }
  501. }
  502. /* parse in the language for language specific codes */
  503. if (strncmp(line,"LANG",4) == 0) {
  504. if (parse_string(line, &lang, "LANG")) {
  505. delete afflst;
  506. return 1;
  507. }
  508. langnum = get_lang_num(lang);
  509. }
  510. if (strncmp(line,"VERSION",7) == 0) {
  511. if (parse_string(line, &version, "VERSION")) {
  512. delete afflst;
  513. return 1;
  514. }
  515. }
  516. if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
  517. if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) {
  518. delete afflst;
  519. return 1;
  520. }
  521. }
  522. if (strncmp(line,"NOSPLITSUGS",11) == 0) {
  523. nosplitsugs=1;
  524. }
  525. if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
  526. sugswithdots=1;
  527. }
  528. /* parse in the flag used by forbidden words */
  529. if (strncmp(line,"KEEPCASE",8) == 0) {
  530. if (parse_flag(line, &keepcase, "KEEPCASE")) {
  531. delete afflst;
  532. return 1;
  533. }
  534. }
  535. /* parse in the flag used by the affix generator */
  536. if (strncmp(line,"SUBSTANDARD",11) == 0) {
  537. if (parse_flag(line, &substandard, "SUBSTANDARD")) {
  538. delete afflst;
  539. return 1;
  540. }
  541. }
  542. if (strncmp(line,"CHECKSHARPS",11) == 0) {
  543. checksharps=1;
  544. }
  545. /* parse this affix: P - prefix, S - suffix */
  546. ft = ' ';
  547. if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
  548. if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
  549. if (ft != ' ') {
  550. if (dupflags_ini) {
  551. for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0;
  552. dupflags_ini = 0;
  553. }
  554. if (parse_affix(line, ft, afflst, dupflags)) {
  555. delete afflst;
  556. process_pfx_tree_to_list();
  557. process_sfx_tree_to_list();
  558. return 1;
  559. }
  560. }
  561. }
  562. delete afflst;
  563. // convert affix trees to sorted list
  564. process_pfx_tree_to_list();
  565. process_sfx_tree_to_list();
  566. // now we can speed up performance greatly taking advantage of the
  567. // relationship between the affixes and the idea of "subsets".
  568. // View each prefix as a potential leading subset of another and view
  569. // each suffix (reversed) as a potential trailing subset of another.
  570. // To illustrate this relationship if we know the prefix "ab" is found in the
  571. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  572. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  573. // is a subset need be examined.
  574. // The same argument goes for suffix string that are reversed.
  575. // Then to top this off why not examine the first char of the word to quickly
  576. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  577. // be leading supersets of the first character of the word (if they exist)
  578. // To take advantage of this "subset" relationship, we need to add two links
  579. // from entry. One to take next if the current prefix is found (call it nexteq)
  580. // and one to take next if the current prefix is not found (call it nextne).
  581. // Since we have built ordered lists, all that remains is to properly intialize
  582. // the nextne and nexteq pointers that relate them
  583. process_pfx_order();
  584. process_sfx_order();
  585. /* get encoding for CHECKCOMPOUNDCASE */
  586. char * enc = get_encoding();
  587. csconv = get_current_cs(enc);
  588. free(enc);
  589. enc = NULL;
  590. #ifdef WINSHELL
  591. char expw[MAXLNLEN];
  592. if (wordchars) {
  593. strcpy(expw, wordchars);
  594. free(wordchars);
  595. } else *expw = '\0';
  596. for (int i = 0; i <= 255; i++) {
  597. if ( (csconv[i].cupper != csconv[i].clower) &&
  598. (! strchr(expw, (char) i))) {
  599. *(expw + strlen(expw) + 1) = '\0';
  600. *(expw + strlen(expw)) = (char) i;
  601. }
  602. }
  603. wordchars = mystrdup(expw);
  604. #endif
  605. // temporary BREAK definition for German dash handling (OOo issue 64400)
  606. if ((langnum == LANG_de) && (!breaktable)) {
  607. breaktable = (char **) malloc(sizeof(char *));
  608. if (!breaktable) return 1;
  609. breaktable[0] = mystrdup("-");
  610. numbreak = 1;
  611. }
  612. return 0;
  613. }
  614. // we want to be able to quickly access prefix information
  615. // both by prefix flag, and sorted by prefix string itself
  616. // so we need to set up two indexes
  617. int AffixMgr::build_pfxtree(AffEntry* pfxptr)
  618. {
  619. PfxEntry * ptr;
  620. PfxEntry * pptr;
  621. PfxEntry * ep = (PfxEntry*) pfxptr;
  622. // get the right starting points
  623. const char * key = ep->getKey();
  624. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  625. // first index by flag which must exist
  626. ptr = (PfxEntry*)pFlag[flg];
  627. ep->setFlgNxt(ptr);
  628. pFlag[flg] = (AffEntry *) ep;
  629. // handle the special case of null affix string
  630. if (strlen(key) == 0) {
  631. // always inset them at head of list at element 0
  632. ptr = (PfxEntry*)pStart[0];
  633. ep->setNext(ptr);
  634. pStart[0] = (AffEntry*)ep;
  635. return 0;
  636. }
  637. // now handle the normal case
  638. ep->setNextEQ(NULL);
  639. ep->setNextNE(NULL);
  640. unsigned char sp = *((const unsigned char *)key);
  641. ptr = (PfxEntry*)pStart[sp];
  642. // handle the first insert
  643. if (!ptr) {
  644. pStart[sp] = (AffEntry*)ep;
  645. return 0;
  646. }
  647. // otherwise use binary tree insertion so that a sorted
  648. // list can easily be generated later
  649. pptr = NULL;
  650. for (;;) {
  651. pptr = ptr;
  652. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  653. ptr = ptr->getNextEQ();
  654. if (!ptr) {
  655. pptr->setNextEQ(ep);
  656. break;
  657. }
  658. } else {
  659. ptr = ptr->getNextNE();
  660. if (!ptr) {
  661. pptr->setNextNE(ep);
  662. break;
  663. }
  664. }
  665. }
  666. return 0;
  667. }
  668. // we want to be able to quickly access suffix information
  669. // both by suffix flag, and sorted by the reverse of the
  670. // suffix string itself; so we need to set up two indexes
  671. int AffixMgr::build_sfxtree(AffEntry* sfxptr)
  672. {
  673. SfxEntry * ptr;
  674. SfxEntry * pptr;
  675. SfxEntry * ep = (SfxEntry *) sfxptr;
  676. /* get the right starting point */
  677. const char * key = ep->getKey();
  678. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  679. // first index by flag which must exist
  680. ptr = (SfxEntry*)sFlag[flg];
  681. ep->setFlgNxt(ptr);
  682. sFlag[flg] = (AffEntry *) ep;
  683. // next index by affix string
  684. // handle the special case of null affix string
  685. if (strlen(key) == 0) {
  686. // always inset them at head of list at element 0
  687. ptr = (SfxEntry*)sStart[0];
  688. ep->setNext(ptr);
  689. sStart[0] = (AffEntry*)ep;
  690. return 0;
  691. }
  692. // now handle the normal case
  693. ep->setNextEQ(NULL);
  694. ep->setNextNE(NULL);
  695. unsigned char sp = *((const unsigned char *)key);
  696. ptr = (SfxEntry*)sStart[sp];
  697. // handle the first insert
  698. if (!ptr) {
  699. sStart[sp] = (AffEntry*)ep;
  700. return 0;
  701. }
  702. // otherwise use binary tree insertion so that a sorted
  703. // list can easily be generated later
  704. pptr = NULL;
  705. for (;;) {
  706. pptr = ptr;
  707. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  708. ptr = ptr->getNextEQ();
  709. if (!ptr) {
  710. pptr->setNextEQ(ep);
  711. break;
  712. }
  713. } else {
  714. ptr = ptr->getNextNE();
  715. if (!ptr) {
  716. pptr->setNextNE(ep);
  717. break;
  718. }
  719. }
  720. }
  721. return 0;
  722. }
  723. // convert from binary tree to sorted list
  724. int AffixMgr::process_pfx_tree_to_list()
  725. {
  726. for (int i=1; i< SETSIZE; i++) {
  727. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  728. }
  729. return 0;
  730. }
  731. AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
  732. {
  733. if (ptr) {
  734. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
  735. ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
  736. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
  737. }
  738. return nptr;
  739. }
  740. // convert from binary tree to sorted list
  741. int AffixMgr:: process_sfx_tree_to_list()
  742. {
  743. for (int i=1; i< SETSIZE; i++) {
  744. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  745. }
  746. return 0;
  747. }
  748. AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
  749. {
  750. if (ptr) {
  751. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
  752. ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
  753. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
  754. }
  755. return nptr;
  756. }
  757. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  758. // using the idea of leading subsets this time
  759. int AffixMgr::process_pfx_order()
  760. {
  761. PfxEntry* ptr;
  762. // loop through each prefix list starting point
  763. for (int i=1; i < SETSIZE; i++) {
  764. ptr = (PfxEntry*)pStart[i];
  765. // look through the remainder of the list
  766. // and find next entry with affix that
  767. // the current one is not a subset of
  768. // mark that as destination for NextNE
  769. // use next in list that you are a subset
  770. // of as NextEQ
  771. for (; ptr != NULL; ptr = ptr->getNext()) {
  772. PfxEntry * nptr = ptr->getNext();
  773. for (; nptr != NULL; nptr = nptr->getNext()) {
  774. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  775. }
  776. ptr->setNextNE(nptr);
  777. ptr->setNextEQ(NULL);
  778. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  779. ptr->setNextEQ(ptr->getNext());
  780. }
  781. // now clean up by adding smart search termination strings:
  782. // if you are already a superset of the previous prefix
  783. // but not a subset of the next, search can end here
  784. // so set NextNE properly
  785. ptr = (PfxEntry *) pStart[i];
  786. for (; ptr != NULL; ptr = ptr->getNext()) {
  787. PfxEntry * nptr = ptr->getNext();
  788. PfxEntry * mptr = NULL;
  789. for (; nptr != NULL; nptr = nptr->getNext()) {
  790. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  791. mptr = nptr;
  792. }
  793. if (mptr) mptr->setNextNE(NULL);
  794. }
  795. }
  796. return 0;
  797. }
  798. // initialize the SfxEntry links NextEQ and NextNE to speed searching
  799. // using the idea of leading subsets this time
  800. int AffixMgr::process_sfx_order()
  801. {
  802. SfxEntry* ptr;
  803. // loop through each prefix list starting point
  804. for (int i=1; i < SETSIZE; i++) {
  805. ptr = (SfxEntry *) sStart[i];
  806. // look through the remainder of the list
  807. // and find next entry with affix that
  808. // the current one is not a subset of
  809. // mark that as destination for NextNE
  810. // use next in list that you are a subset
  811. // of as NextEQ
  812. for (; ptr != NULL; ptr = ptr->getNext()) {
  813. SfxEntry * nptr = ptr->getNext();
  814. for (; nptr != NULL; nptr = nptr->getNext()) {
  815. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  816. }
  817. ptr->setNextNE(nptr);
  818. ptr->setNextEQ(NULL);
  819. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  820. ptr->setNextEQ(ptr->getNext());
  821. }
  822. // now clean up by adding smart search termination strings:
  823. // if you are already a superset of the previous suffix
  824. // but not a subset of the next, search can end here
  825. // so set NextNE properly
  826. ptr = (SfxEntry *) sStart[i];
  827. for (; ptr != NULL; ptr = ptr->getNext()) {
  828. SfxEntry * nptr = ptr->getNext();
  829. SfxEntry * mptr = NULL;
  830. for (; nptr != NULL; nptr = nptr->getNext()) {
  831. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  832. mptr = nptr;
  833. }
  834. if (mptr) mptr->setNextNE(NULL);
  835. }
  836. }
  837. return 0;
  838. }
  839. // add flags to the result for dictionary debugging
  840. void AffixMgr::debugflag(char * result, unsigned short flag) {
  841. char * st = encode_flag(flag);
  842. strcat(result, " ");
  843. strcat(result, MORPH_FLAG);
  844. strcat(result, st);
  845. free(st);
  846. }
  847. // calculate the character length of the condition
  848. int AffixMgr::condlen(char * st)
  849. {
  850. int l = 0;
  851. bool group = false;
  852. for(; *st; st++) {
  853. if (*st == '[') {
  854. group = true;
  855. l++;
  856. } else if (*st == ']') group = false;
  857. else if (!group && (!utf8 ||
  858. (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
  859. }
  860. return l;
  861. }
  862. int AffixMgr::encodeit(struct affentry * ptr, char * cs)
  863. {
  864. if (strcmp(cs,".") != 0) {
  865. ptr->numconds = (char) condlen(cs);
  866. strncpy(ptr->c.conds, cs, MAXCONDLEN);
  867. // long condition (end of conds padded by strncpy)
  868. if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
  869. ptr->opts += aeLONGCOND;
  870. ptr->c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
  871. }
  872. } else {
  873. ptr->numconds = 0;
  874. ptr->c.conds[0] = '\0';
  875. }
  876. return 0;
  877. }
  878. // return 1 if s1 is a leading subset of s2 (dots are for infixes)
  879. inline int AffixMgr::isSubset(const char * s1, const char * s2)
  880. {
  881. while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
  882. s1++;
  883. s2++;
  884. }
  885. return (*s1 == '\0');
  886. }
  887. // check word for prefixes
  888. struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
  889. const FLAG needflag)
  890. {
  891. struct hentry * rv= NULL;
  892. pfx = NULL;
  893. pfxappnd = NULL;
  894. sfxappnd = NULL;
  895. // first handle the special case of 0 length prefixes
  896. PfxEntry * pe = (PfxEntry *) pStart[0];
  897. while (pe) {
  898. if (
  899. // fogemorpheme
  900. ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
  901. (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
  902. // permit prefixes in compounds
  903. ((in_compound != IN_CPD_END) || (pe->getCont() &&
  904. (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
  905. ) {
  906. // check prefix
  907. rv = pe->checkword(word, len, in_compound, needflag);
  908. if (rv) {
  909. pfx=(AffEntry *)pe; // BUG: pfx not stateless
  910. return rv;
  911. }
  912. }
  913. pe = pe->getNext();
  914. }
  915. // now handle the general case
  916. unsigned char sp = *((const unsigned char *)word);
  917. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  918. while (pptr) {
  919. if (isSubset(pptr->getKey(),word)) {
  920. if (
  921. // fogemorpheme
  922. ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
  923. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
  924. // permit prefixes in compounds
  925. ((in_compound != IN_CPD_END) || (pptr->getCont() &&
  926. (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
  927. ) {
  928. // check prefix
  929. rv = pptr->checkword(word, len, in_compound, needflag);
  930. if (rv) {
  931. pfx=(AffEntry *)pptr; // BUG: pfx not stateless
  932. return rv;
  933. }
  934. }
  935. pptr = pptr->getNextEQ();
  936. } else {
  937. pptr = pptr->getNextNE();
  938. }
  939. }
  940. return NULL;
  941. }
  942. // check word for prefixes
  943. struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
  944. char in_compound, const FLAG needflag)
  945. {
  946. struct hentry * rv= NULL;
  947. pfx = NULL;
  948. sfxappnd = NULL;
  949. // first handle the special case of 0 length prefixes
  950. PfxEntry * pe = (PfxEntry *) pStart[0];
  951. while (pe) {
  952. rv = pe->check_twosfx(word, len, in_compound, needflag);
  953. if (rv) return rv;
  954. pe = pe->getNext();
  955. }
  956. // now handle the general case
  957. unsigned char sp = *((const unsigned char *)word);
  958. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  959. while (pptr) {
  960. if (isSubset(pptr->getKey(),word)) {
  961. rv = pptr->check_twosfx(word, len, in_compound, needflag);
  962. if (rv) {
  963. pfx = (AffEntry *)pptr;
  964. return rv;
  965. }
  966. pptr = pptr->getNextEQ();
  967. } else {
  968. pptr = pptr->getNextNE();
  969. }
  970. }
  971. return NULL;
  972. }
  973. // check word for prefixes
  974. char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
  975. const FLAG needflag)
  976. {
  977. char * st;
  978. char result[MAXLNLEN];
  979. result[0] = '\0';
  980. pfx = NULL;
  981. sfxappnd = NULL;
  982. // first handle the special case of 0 length prefixes
  983. PfxEntry * pe = (PfxEntry *) pStart[0];
  984. while (pe) {
  985. st = pe->check_morph(word,len,in_compound, needflag);
  986. if (st) {
  987. strcat(result, st);
  988. free(st);
  989. }
  990. // if (rv) return rv;
  991. pe = pe->getNext();
  992. }
  993. // now handle the general case
  994. unsigned char sp = *((const unsigned char *)word);
  995. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  996. while (pptr) {
  997. if (isSubset(pptr->getKey(),word)) {
  998. st = pptr->check_morph(word,len,in_compound, needflag);
  999. if (st) {
  1000. // fogemorpheme
  1001. if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
  1002. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
  1003. strcat(result, st);
  1004. pfx = (AffEntry *)pptr;
  1005. }
  1006. free(st);
  1007. }
  1008. pptr = pptr->getNextEQ();
  1009. } else {
  1010. pptr = pptr->getNextNE();
  1011. }
  1012. }
  1013. if (*result) return mystrdup(result);
  1014. return NULL;
  1015. }
  1016. // check word for prefixes
  1017. char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
  1018. char in_compound, const FLAG needflag)
  1019. {
  1020. char * st;
  1021. char result[MAXLNLEN];
  1022. result[0] = '\0';
  1023. pfx = NULL;
  1024. sfxappnd = NULL;
  1025. // first handle the special case of 0 length prefixes
  1026. PfxEntry * pe = (PfxEntry *) pStart[0];
  1027. while (pe) {
  1028. st = pe->check_twosfx_morph(word,len,in_compound, needflag);
  1029. if (st) {
  1030. strcat(result, st);
  1031. free(st);
  1032. }
  1033. pe = pe->getNext();
  1034. }
  1035. // now handle the general case
  1036. unsigned char sp = *((const unsigned char *)word);
  1037. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1038. while (pptr) {
  1039. if (isSubset(pptr->getKey(),word)) {
  1040. st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
  1041. if (st) {
  1042. strcat(result, st);
  1043. free(st);
  1044. pfx = (AffEntry *)pptr;
  1045. }
  1046. pptr = pptr->getNextEQ();
  1047. } else {
  1048. pptr = pptr->getNextNE();
  1049. }
  1050. }
  1051. if (*result) return mystrdup(result);
  1052. return NULL;
  1053. }
  1054. // Is word a non compound with a REP substitution (see checkcompoundrep)?
  1055. int AffixMgr::cpdrep_check(const char * word, int wl)
  1056. {
  1057. char candidate[MAXLNLEN];
  1058. const char * r;
  1059. int lenr, lenp;
  1060. if ((wl < 2) || !numrep) return 0;
  1061. for (int i=0; i < numrep; i++ ) {
  1062. r = word;
  1063. lenr = strlen(reptable[i].pattern2);
  1064. lenp = strlen(reptable[i].pattern);
  1065. // search every occurence of the pattern in the word
  1066. while ((r=strstr(r, reptable[i].pattern)) != NULL) {
  1067. strcpy(candidate, word);
  1068. if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
  1069. strcpy(candidate+(r-word),reptable[i].pattern2);
  1070. strcpy(candidate+(r-word)+lenr, r+lenp);
  1071. if (candidate_check(candidate,strlen(candidate))) return 1;
  1072. r++; // search for the next letter
  1073. }
  1074. }
  1075. return 0;
  1076. }
  1077. // forbid compoundings when there are special patterns at word bound
  1078. int AffixMgr::cpdpat_check(const char * word, int pos)
  1079. {
  1080. int len;
  1081. for (int i = 0; i < numcheckcpd; i++) {
  1082. if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
  1083. (len = strlen(checkcpdtable[i].pattern)) && (pos > len) &&
  1084. (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1;
  1085. }
  1086. return 0;
  1087. }
  1088. // forbid compounding with neighbouring upper and lower case characters at word bounds
  1089. int AffixMgr::cpdcase_check(const char * word, int pos)
  1090. {
  1091. if (utf8) {
  1092. w_char u, w;
  1093. const char * p;
  1094. u8_u16(&u, 1, word + pos);
  1095. for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
  1096. u8_u16(&w, 1, p);
  1097. unsigned short a = (u.h << 8) + u.l;
  1098. unsigned short b = (w.h << 8) + w.l;
  1099. if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b))) return 1;
  1100. } else {
  1101. unsigned char a = *(word + pos - 1);
  1102. unsigned char b = *(word + pos);
  1103. if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  1104. }
  1105. return 0;
  1106. }
  1107. // check compound patterns
  1108. int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
  1109. {
  1110. signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  1111. signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  1112. int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
  1113. short bt = 0;
  1114. int i;
  1115. int ok;
  1116. int w = 0;
  1117. if (!*words) {
  1118. w = 1;
  1119. *words = def;
  1120. }
  1121. (*words)[wnum] = rv;
  1122. for (i = 0; i < numdefcpd; i++) {
  1123. signed short pp = 0; // pattern position
  1124. signed short wp = 0; // "words" position
  1125. int ok2;
  1126. ok = 1;
  1127. ok2 = 1;
  1128. do {
  1129. while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
  1130. if (((pp+1) < defcpdtable[i].len) &&
  1131. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
  1132. int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
  1133. ok2 = 1;
  1134. pp+=2;
  1135. btpp[bt] = pp;
  1136. btwp[bt] = wp;
  1137. while (wp <= wend) {
  1138. if (!(*words)[wp]->alen ||
  1139. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
  1140. ok2 = 0;
  1141. break;
  1142. }
  1143. wp++;
  1144. }
  1145. if (wp <= wnum) ok2 = 0;
  1146. btnum[bt] = wp - btwp[bt];
  1147. if (btnum[bt] > 0) bt++;
  1148. if (ok2) break;
  1149. } else {
  1150. ok2 = 1;
  1151. if (!(*words)[wp] || !(*words)[wp]->alen ||
  1152. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
  1153. ok = 0;
  1154. break;
  1155. }
  1156. pp++;
  1157. wp++;
  1158. if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
  1159. }
  1160. }
  1161. if (ok && ok2) {
  1162. int r = pp;
  1163. while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
  1164. ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
  1165. if (defcpdtable[i].len <= r) return 1;
  1166. }
  1167. // backtrack
  1168. if (bt) do {
  1169. ok = 1;
  1170. btnum[bt - 1]--;
  1171. pp = btpp[bt - 1];
  1172. wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
  1173. } while ((btnum[bt - 1] < 0) && --bt);
  1174. } while (bt);
  1175. if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  1176. // check zero ending
  1177. while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
  1178. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  1179. if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  1180. }
  1181. (*words)[wnum] = NULL;
  1182. if (w) *words = NULL;
  1183. return 0;
  1184. }
  1185. inline int AffixMgr::candidate_check(const char * word, int len)
  1186. {
  1187. struct hentry * rv=NULL;
  1188. rv = lookup(word);
  1189. if (rv) return 1;
  1190. // rv = prefix_check(word,len,1);
  1191. // if (rv) return 1;
  1192. rv = affix_check(word,len);
  1193. if (rv) return 1;
  1194. return 0;
  1195. }
  1196. // calculate number of syllable for compound-checking
  1197. short AffixMgr::get_syllable(const char * word, int wlen)
  1198. {
  1199. if (cpdmaxsyllable==0) return 0;
  1200. short num=0;
  1201. if (!utf8) {
  1202. for (int i=0; i<wlen; i++) {
  1203. if (strchr(cpdvowels, word[i])) num++;
  1204. }
  1205. } else if (cpdvowels_utf16) {
  1206. w_char w[MAXWORDUTF8LEN];
  1207. int i = u8_u16(w, MAXWORDUTF8LEN, word);
  1208. for (; i > 0; i--) {
  1209. if (flag_bsearch((unsigned short *) cpdvowels_utf16,
  1210. ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
  1211. }
  1212. }
  1213. return num;
  1214. }
  1215. // check if compound word is correctly spelled
  1216. // hu_mov_rule = spec. Hungarian rule (XXX)
  1217. struct hentry * AffixMgr::compound_check(const char * word, int len,
  1218. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
  1219. char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0)
  1220. {
  1221. int i;
  1222. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1223. int oldcmpdstemnum = 0;
  1224. struct hentry * rv = NULL;
  1225. struct hentry * rv_first;
  1226. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1227. char st [MAXWORDUTF8LEN + 4];
  1228. char ch;
  1229. int cmin;
  1230. int cmax;
  1231. int checked_prefix;
  1232. #ifdef HUNSTEM
  1233. if (cmpdstemnum) {
  1234. if (wordnum == 0) {
  1235. *cmpdstemnum = 1;
  1236. } else {
  1237. (*cmpdstemnum)++;
  1238. }
  1239. }
  1240. #endif
  1241. if (utf8) {
  1242. for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
  1243. cmin++;
  1244. for (; (word[cmin] & 0xc0) == 0x80; cmin++);
  1245. }
  1246. for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
  1247. cmax--;
  1248. for (; (word[cmax] & 0xc0) == 0x80; cmax--);
  1249. }
  1250. } else {
  1251. cmin = cpdmin;
  1252. cmax = len - cpdmin + 1;
  1253. }
  1254. strcpy(st, word);
  1255. for (i = cmin; i < cmax; i++) {
  1256. oldnumsyllable = numsyllable;
  1257. oldwordnum = wordnum;
  1258. checked_prefix = 0;
  1259. // go to end of the UTF-8 character
  1260. if (utf8) {
  1261. for (; (st[i] & 0xc0) == 0x80; i++);
  1262. if (i >= cmax) return NULL;
  1263. }
  1264. ch = st[i];
  1265. st[i] = '\0';
  1266. sfx = NULL;
  1267. pfx = NULL;
  1268. // FIRST WORD
  1269. rv = lookup(st); // perhaps without prefix
  1270. // search homonym with compound flag
  1271. while ((rv) && !hu_mov_rule &&
  1272. ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1273. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1274. (compoundbegin && !wordnum &&
  1275. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1276. (compoundmiddle && wordnum && !words &&
  1277. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1278. (numdefcpd &&
  1279. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1280. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
  1281. ))) {
  1282. rv = rv->next_homonym;
  1283. }
  1284. if (!rv) {
  1285. if (compoundflag &&
  1286. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1287. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1288. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1289. ((SfxEntry*)sfx)->getCont() &&
  1290. ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1291. ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
  1292. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1293. ((SfxEntry*)sfx)->getContLen())))) {
  1294. rv = NULL;
  1295. }
  1296. }
  1297. if (rv ||
  1298. (((wordnum == 0) && compoundbegin &&
  1299. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1300. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1301. ((wordnum > 0) && compoundmiddle &&
  1302. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1303. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1304. ) checked_prefix = 1;
  1305. // else check forbiddenwords and needaffix
  1306. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1307. TESTAFF(rv->astr, needaffix, rv->alen) ||
  1308. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
  1309. )) {
  1310. st[i] = ch;
  1311. continue;
  1312. }
  1313. // check non_compound flag in suffix and prefix
  1314. if ((rv) && !hu_mov_rule &&
  1315. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1316. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1317. ((PfxEntry*)pfx)->getContLen())) ||
  1318. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1319. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1320. ((SfxEntry*)sfx)->getContLen())))) {
  1321. rv = NULL;
  1322. }
  1323. // check compoundend flag in suffix and prefix
  1324. if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
  1325. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1326. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
  1327. ((PfxEntry*)pfx)->getContLen())) ||
  1328. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1329. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1330. ((SfxEntry*)sfx)->getContLen())))) {
  1331. rv = NULL;
  1332. }
  1333. // check compoundmiddle flag in suffix and prefix
  1334. if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
  1335. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1336. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
  1337. ((PfxEntry*)pfx)->getContLen())) ||
  1338. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1339. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
  1340. ((SfxEntry*)sfx)->getContLen())))) {
  1341. rv = NULL;
  1342. }
  1343. // check forbiddenwords
  1344. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1345. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
  1346. return NULL;
  1347. }
  1348. // increment word number, if the second root has a compoundroot flag
  1349. if ((rv) && compoundroot &&
  1350. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1351. wordnum++;
  1352. }
  1353. // first word is acceptable in compound words?
  1354. if (((rv) &&
  1355. ( checked_prefix || (words && words[wnum]) ||
  1356. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1357. ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1358. ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
  1359. // (numdefcpd && )
  1360. // LANG_hu section: spec. Hungarian rule
  1361. || ((langnum == LANG_hu) && hu_mov_rule && (
  1362. TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
  1363. TESTAFF(rv->astr, 'G', rv->alen) ||
  1364. TESTAFF(rv->astr, 'H', rv->alen)
  1365. )
  1366. )
  1367. // END of LANG_hu section
  1368. )
  1369. && ! (( checkcompoundtriple && // test triple letters
  1370. (word[i-1]==word[i]) && (
  1371. ((i>1) && (word[i-1]==word[i-2])) ||
  1372. ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
  1373. )
  1374. ) ||
  1375. (
  1376. // test CHECKCOMPOUNDPATTERN
  1377. numcheckcpd && cpdpat_check(word, i)
  1378. ) ||
  1379. (
  1380. checkcompoundcase && cpdcase_check(word, i)
  1381. ))
  1382. )
  1383. // LANG_hu section: spec. Hungarian rule
  1384. || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
  1385. (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes
  1386. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
  1387. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
  1388. )
  1389. )
  1390. )
  1391. // END of LANG_hu section
  1392. ) {
  1393. // LANG_hu section: spec. Hungarian rule
  1394. if (langnum == LANG_hu) {
  1395. // calculate syllable number of the word
  1396. numsyllable += get_syllable(st, i);
  1397. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1398. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1399. }
  1400. // END of LANG_hu section
  1401. #ifdef HUNSTEM
  1402. if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i;
  1403. #endif
  1404. // NEXT WORD(S)
  1405. rv_first = rv;
  1406. rv = lookup((word+i)); // perhaps without prefix
  1407. // search homonym with compound flag
  1408. while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1409. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1410. (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
  1411. (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
  1412. rv = rv->next_homonym;
  1413. }
  1414. if (rv && words && words[wnum + 1]) return rv;
  1415. oldnumsyllable2 = numsyllable;
  1416. oldwordnum2 = wordnum;
  1417. // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
  1418. if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
  1419. numsyllable--;
  1420. }
  1421. // END of LANG_hu section
  1422. // increment word number, if the second root has a compoundroot flag
  1423. if ((rv) && (compoundroot) &&
  1424. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1425. wordnum++;
  1426. }
  1427. // check forbiddenwords
  1428. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1429. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
  1430. // second word is acceptable, as a root?
  1431. // hungarian conventions: compounding is acceptable,
  1432. // when compound forms consist of 2 words, or if more,
  1433. // then the syllable number of root words must be 6, or lesser.
  1434. if ((rv) && (
  1435. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1436. (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
  1437. )
  1438. && (
  1439. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  1440. ((cpdmaxsyllable==0) ||
  1441. (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable))
  1442. )
  1443. && (
  1444. (!checkcompounddup || (rv != rv_first))
  1445. )
  1446. )
  1447. {
  1448. // forbid compound word, if it is a non compound word with typical fault
  1449. if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
  1450. return rv;
  1451. }
  1452. numsyllable = oldnumsyllable2 ;
  1453. wordnum = oldwordnum2;
  1454. // perhaps second word has prefix or/and suffix
  1455. sfx = NULL;
  1456. sfxflag = FLAG_NULL;
  1457. rv = (compoundflag) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
  1458. if (!rv && compoundend) {
  1459. sfx = NULL;
  1460. pfx = NULL;
  1461. rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
  1462. }
  1463. if (!rv && numdefcpd && words) {
  1464. rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
  1465. if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv;
  1466. rv = NULL;
  1467. }
  1468. // check non_compound flag in suffix and prefix
  1469. if ((rv) &&
  1470. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1471. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1472. ((PfxEntry*)pfx)->getContLen())) ||
  1473. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1474. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1475. ((SfxEntry*)sfx)->getContLen())))) {
  1476. rv = NULL;
  1477. }
  1478. // check forbiddenwords
  1479. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1480. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
  1481. // pfxappnd = prefix of word+i, or NULL
  1482. // calculate syllable number of prefix.
  1483. // hungarian convention: when syllable number of prefix is more,
  1484. // than 1, the prefix+word counts as two words.
  1485. if (langnum == LANG_hu) {
  1486. // calculate syllable number of the word
  1487. numsyllable += get_syllable(word + i, strlen(word + i));
  1488. // - affix syllable num.
  1489. // XXX only second suffix (inflections, not derivations)
  1490. if (sfxappnd) {
  1491. char * tmp = myrevstrdup(sfxappnd);
  1492. numsyllable -= get_syllable(tmp, strlen(tmp));
  1493. free(tmp);
  1494. }
  1495. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1496. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1497. // increment syllable num, if last word has a SYLLABLENUM flag
  1498. // and the suffix is beginning `s'
  1499. if (cpdsyllablenum) {
  1500. switch (sfxflag) {
  1501. case 'c': { numsyllable+=2; break; }
  1502. case 'J': { numsyllable += 1; break; }
  1503. case 'I': { if (TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
  1504. }
  1505. }
  1506. }
  1507. // increment word number, if the second word has a compoundroot flag
  1508. if ((rv) && (compoundroot) &&
  1509. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1510. wordnum++;
  1511. }
  1512. // second word is acceptable, as a word with prefix or/and suffix?
  1513. // hungarian conventions: compounding is acceptable,
  1514. // when compound forms consist 2 word, otherwise
  1515. // the syllable number of root words is 6, or lesser.
  1516. if ((rv) &&
  1517. (
  1518. ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
  1519. ((cpdmaxsyllable == 0) ||
  1520. (numsyllable <= cpdmaxsyllable))
  1521. )
  1522. && (
  1523. (!checkcompounddup || (rv != rv_first))
  1524. )) {
  1525. // forbid compound word, if it is a non compound word with typical fault
  1526. if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
  1527. return rv;
  1528. }
  1529. numsyllable = oldnumsyllable2;
  1530. wordnum = oldwordnum2;
  1531. #ifdef HUNSTEM
  1532. if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum;
  1533. #endif
  1534. // perhaps second word is a compound word (recursive call)
  1535. if (wordnum < maxwordnum) {
  1536. rv = compound_check((word+i),strlen(word+i), wordnum+1,
  1537. numsyllable, maxwordnum, wnum + 1, words,
  1538. 0, cmpdstemnum, cmpdstem, is_sug);
  1539. } else {
  1540. rv=NULL;
  1541. }
  1542. if (rv) {
  1543. // forbid compound word, if it is a non compound word with typical fault
  1544. if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
  1545. return rv;
  1546. } else {
  1547. #ifdef HUNSTEM
  1548. if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum;
  1549. #endif
  1550. }
  1551. }
  1552. st[i] = ch;
  1553. wordnum = oldwordnum;
  1554. numsyllable = oldnumsyllable;
  1555. }
  1556. return NULL;
  1557. }
  1558. // check if compound word is correctly spelled
  1559. // hu_mov_rule = spec. Hungarian rule (XXX)
  1560. int AffixMgr::compound_check_morph(const char * word, int len,
  1561. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
  1562. char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
  1563. {
  1564. int i;
  1565. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1566. int ok = 0;
  1567. struct hentry * rv = NULL;
  1568. struct hentry * rv_first;
  1569. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1570. char st [MAXWORDUTF8LEN + 4];
  1571. char ch;
  1572. int checked_prefix;
  1573. char presult[MAXLNLEN];
  1574. int cmin;
  1575. int cmax;
  1576. if (utf8) {
  1577. for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
  1578. cmin++;
  1579. for (; (word[cmin] & 0xc0) == 0x80; cmin++);
  1580. }
  1581. for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
  1582. cmax--;
  1583. for (; (word[cmax] & 0xc0) == 0x80; cmax--);
  1584. }
  1585. } else {
  1586. cmin = cpdmin;
  1587. cmax = len - cpdmin + 1;
  1588. }
  1589. strcpy(st, word);
  1590. for (i = cmin; i < cmax; i++) {
  1591. oldnumsyllable = numsyllable;
  1592. oldwordnum = wordnum;
  1593. checked_prefix = 0;
  1594. // go to end of the UTF-8 character
  1595. if (utf8) {
  1596. for (; (st[i] & 0xc0) == 0x80; i++);
  1597. if (i >= cmax) return 0;
  1598. }
  1599. ch = st[i];
  1600. st[i] = '\0';
  1601. sfx = NULL;
  1602. // FIRST WORD
  1603. *presult = '\0';
  1604. if (partresult) strcat(presult, partresult);
  1605. rv = lookup(st); // perhaps without prefix
  1606. // search homonym with compound flag
  1607. while ((rv) && !hu_mov_rule &&
  1608. ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1609. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1610. (compoundbegin && !wordnum &&
  1611. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1612. (compoundmiddle && wordnum && !words &&
  1613. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1614. (numdefcpd &&
  1615. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1616. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
  1617. ))) {
  1618. rv = rv->next_homonym;
  1619. }
  1620. if (rv) {
  1621. sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
  1622. if (!HENTRY_FIND(rv, MORPH_STEM)) {
  1623. sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st);
  1624. }
  1625. // store the pointer of the hash entry
  1626. // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
  1627. if (HENTRY_DATA(rv)) {
  1628. sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA(rv));
  1629. }
  1630. }
  1631. if (!rv) {
  1632. if (compoundflag &&
  1633. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1634. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1635. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1636. ((SfxEntry*)sfx)->getCont() &&
  1637. ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1638. ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
  1639. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1640. ((SfxEntry*)sfx)->getContLen())))) {
  1641. rv = NULL;
  1642. }
  1643. }
  1644. if (rv ||
  1645. (((wordnum == 0) && compoundbegin &&
  1646. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1647. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1648. ((wordnum > 0) && compoundmiddle &&
  1649. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1650. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1651. ) {
  1652. // char * p = prefix_check_morph(st, i, 0, compound);
  1653. char * p = NULL;
  1654. if (compoundflag) p = affix_check_morph(st, i, compoundflag);
  1655. if (!p || (*p == '\0')) {
  1656. if (p) free(p);
  1657. p = NULL;
  1658. if ((wordnum == 0) && compoundbegin) {
  1659. p = affix_check_morph(st, i, compoundbegin);
  1660. } else if ((wordnum > 0) && compoundmiddle) {
  1661. p = affix_check_morph(st, i, compoundmiddle);
  1662. }
  1663. }
  1664. if (p && (*p != '\0')) {
  1665. sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD,
  1666. MORPH_PART, st, line_uniq_app(&p, MSEP_REC));
  1667. }
  1668. if (p) free(p);
  1669. checked_prefix = 1;
  1670. }
  1671. // else check forbiddenwords
  1672. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1673. TESTAFF(rv->astr, needaffix, rv->alen))) {
  1674. st[i] = ch;
  1675. continue;
  1676. }
  1677. // check non_compound flag in suffix and prefix
  1678. if ((rv) && !hu_mov_rule &&
  1679. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1680. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1681. ((PfxEntry*)pfx)->getContLen())) ||
  1682. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1683. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1684. ((SfxEntry*)sfx)->getContLen())))) {
  1685. continue;
  1686. }
  1687. // check compoundend flag in suffix and prefix
  1688. if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
  1689. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1690. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
  1691. ((PfxEntry*)pfx)->getContLen())) ||
  1692. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1693. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1694. ((SfxEntry*)sfx)->getContLen())))) {
  1695. continue;
  1696. }
  1697. // check compoundmiddle flag in suffix and prefix
  1698. if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
  1699. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1700. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
  1701. ((PfxEntry*)pfx)->getContLen())) ||
  1702. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1703. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
  1704. ((SfxEntry*)sfx)->getContLen())))) {
  1705. rv = NULL;
  1706. }
  1707. // check forbiddenwords
  1708. if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) continue;
  1709. // increment word number, if the second root has a compoundroot flag
  1710. if ((rv) && (compoundroot) &&
  1711. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1712. wordnum++;
  1713. }
  1714. // first word is acceptable in compound words?
  1715. if (((rv) &&
  1716. ( checked_prefix || (words && words[wnum]) ||
  1717. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1718. ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1719. ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
  1720. // LANG_hu section: spec. Hungarian rule
  1721. || ((langnum == LANG_hu) && // hu_mov_rule
  1722. hu_mov_rule && (
  1723. TESTAFF(rv->astr, 'F', rv->alen) ||
  1724. TESTAFF(rv->astr, 'G', rv->alen) ||
  1725. TESTAFF(rv->astr, 'H', rv->alen)
  1726. )
  1727. )
  1728. // END of LANG_hu section
  1729. )
  1730. && ! (( checkcompoundtriple && // test triple letters
  1731. (word[i-1]==word[i]) && (
  1732. ((i>1) && (word[i-1]==word[i-2])) ||
  1733. ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
  1734. )
  1735. ) ||
  1736. (
  1737. // test CHECKCOMPOUNDPATTERN
  1738. numcheckcpd && cpdpat_check(word, i)
  1739. ) ||
  1740. (
  1741. checkcompoundcase && cpdcase_check(word, i)
  1742. ))
  1743. )
  1744. // LANG_hu section: spec. Hungarian rule
  1745. || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
  1746. (sfx && ((SfxEntry*)sfx)->getCont() && (
  1747. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
  1748. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
  1749. )
  1750. )
  1751. )
  1752. // END of LANG_hu section
  1753. ) {
  1754. // LANG_hu section: spec. Hungarian rule
  1755. if (langnum == LANG_hu) {
  1756. // calculate syllable number of the word
  1757. numsyllable += get_syllable(st, i);
  1758. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1759. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1760. }
  1761. // END of LANG_hu section
  1762. // NEXT WORD(S)
  1763. rv_first = rv;
  1764. rv = lookup((word+i)); // perhaps without prefix
  1765. // search homonym with compound flag
  1766. while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1767. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1768. (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
  1769. (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
  1770. rv = rv->next_homonym;
  1771. }
  1772. if (rv && words && words[wnum + 1]) {
  1773. strcat(*result, presult);
  1774. strcat(*result, " ");
  1775. strcat(*result, MORPH_PART);
  1776. strcat(*result, word+i);
  1777. if (complexprefixes && HENTRY_DATA(rv)) strcat(*result, HENTRY_DATA(rv));
  1778. if (!HENTRY_FIND(rv, MORPH_STEM)) {
  1779. strcat(*result, " ");
  1780. strcat(*result, MORPH_STEM);
  1781. strcat(*result, HENTRY_WORD(rv));
  1782. }
  1783. // store the pointer of the hash entry
  1784. // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
  1785. if (!complexprefixes && HENTRY_DATA(rv)) {
  1786. strcat(*result, " ");
  1787. strcat(*result, HENTRY_DATA(rv));
  1788. }
  1789. strcat(*result, "\n");
  1790. ok = 1;
  1791. return 0;
  1792. }
  1793. oldnumsyllable2 = numsyllable;
  1794. oldwordnum2 = wordnum;
  1795. // LANG_hu section: spec. Hungarian rule
  1796. if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
  1797. numsyllable--;
  1798. }
  1799. // END of LANG_hu section
  1800. // increment word number, if the second root has a compoundroot flag
  1801. if ((rv) && (compoundroot) &&
  1802. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1803. wordnum++;
  1804. }
  1805. // check forbiddenwords
  1806. if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) {
  1807. st[i] = ch;
  1808. continue;
  1809. }
  1810. // second word is acceptable, as a root?
  1811. // hungarian conventions: compounding is acceptable,
  1812. // when compound forms consist of 2 words, or if more,
  1813. // then the syllable number of root words must be 6, or lesser.
  1814. if ((rv) && (
  1815. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1816. (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
  1817. )
  1818. && (
  1819. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  1820. ((cpdmaxsyllable==0) ||
  1821. (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable))
  1822. )
  1823. && (
  1824. (!checkcompounddup || (rv != rv_first))
  1825. )
  1826. )
  1827. {
  1828. // bad compound word
  1829. strcat(*result, presult);
  1830. strcat(*result, " ");
  1831. strcat(*result, MORPH_PART);
  1832. strcat(*result, word+i);
  1833. if (HENTRY_DATA(rv)) {
  1834. if (complexprefixes) strcat(*result, HENTRY_DATA(rv));
  1835. if (! HENTRY_FIND(rv, MORPH_STEM)) {
  1836. strcat(*result, " ");
  1837. strcat(*result, MORPH_STEM);
  1838. strcat(*result, HENTRY_WORD(rv));
  1839. }
  1840. // store the pointer of the hash entry
  1841. // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
  1842. if (!complexprefixes) {
  1843. strcat(*result, " ");
  1844. strcat(*result, HENTRY_DATA(rv));
  1845. }
  1846. }
  1847. strcat(*result, "\n");
  1848. ok = 1;
  1849. }
  1850. numsyllable = oldnumsyllable2 ;
  1851. wordnum = oldwordnum2;
  1852. // perhaps second word has prefix or/and suffix
  1853. sfx = NULL;
  1854. sfxflag = FLAG_NULL;
  1855. if (compoundflag) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
  1856. if (!rv && compoundend) {
  1857. sfx = NULL;
  1858. pfx = NULL;
  1859. rv = affix_check((word+i),strlen(word+i), compoundend);
  1860. }
  1861. if (!rv && numdefcpd && words) {
  1862. rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
  1863. if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
  1864. char * m = NULL;
  1865. if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
  1866. if ((!m || *m == '\0') && compoundend) {
  1867. if (m) free(m);
  1868. m = affix_check_morph((word+i),strlen(word+i), compoundend);
  1869. }
  1870. strcat(*result, presult);
  1871. if (m || (*m != '\0')) {
  1872. sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
  1873. MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
  1874. }
  1875. if (m) free(m);
  1876. strcat(*result, "\n");
  1877. ok = 1;
  1878. }
  1879. }
  1880. // check non_compound flag in suffix and prefix
  1881. if ((rv) &&
  1882. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1883. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1884. ((PfxEntry*)pfx)->getContLen())) ||
  1885. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1886. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1887. ((SfxEntry*)sfx)->getContLen())))) {
  1888. rv = NULL;
  1889. }
  1890. // check forbiddenwords
  1891. if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen))
  1892. && (! TESTAFF(rv->astr, needaffix, rv->alen))) {
  1893. st[i] = ch;
  1894. continue;
  1895. }
  1896. if (langnum == LANG_hu) {
  1897. // calculate syllable number of the word
  1898. numsyllable += get_syllable(word + i, strlen(word + i));
  1899. // - affix syllable num.
  1900. // XXX only second suffix (inflections, not derivations)
  1901. if (sfxappnd) {
  1902. char * tmp = myrevstrdup(sfxappnd);
  1903. numsyllable -= get_syllable(tmp, strlen(tmp));
  1904. free(tmp);
  1905. }
  1906. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1907. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1908. // increment syllable num, if last word has a SYLLABLENUM flag
  1909. // and the suffix is beginning `s'
  1910. if (cpdsyllablenum) {
  1911. switch (sfxflag) {
  1912. case 'c': { numsyllable+=2; break; }
  1913. case 'J': { numsyllable += 1; break; }
  1914. case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
  1915. }
  1916. }
  1917. }
  1918. // increment word number, if the second word has a compoundroot flag
  1919. if ((rv) && (compoundroot) &&
  1920. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1921. wordnum++;
  1922. }
  1923. // second word is acceptable, as a word with prefix or/and suffix?
  1924. // hungarian conventions: compounding is acceptable,
  1925. // when compound forms consist 2 word, otherwise
  1926. // the syllable number of root words is 6, or lesser.
  1927. if ((rv) &&
  1928. (
  1929. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  1930. ((cpdmaxsyllable==0) ||
  1931. (numsyllable <= cpdmaxsyllable))
  1932. )
  1933. && (
  1934. (!checkcompounddup || (rv != rv_first))
  1935. )) {
  1936. char * m = NULL;
  1937. if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
  1938. if ((!m || *m == '\0') && compoundend) {
  1939. if (m) free(m);
  1940. m = affix_check_morph((word+i),strlen(word+i), compoundend);
  1941. }
  1942. strcat(*result, presult);
  1943. if (m && (*m != '\0')) {
  1944. sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
  1945. MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
  1946. }
  1947. if (m) free(m);
  1948. sprintf(*result + strlen(*result), "%c", MSEP_REC);
  1949. ok = 1;
  1950. }
  1951. numsyllable = oldnumsyllable2;
  1952. wordnum = oldwordnum2;
  1953. // perhaps second word is a compound word (recursive call)
  1954. if ((wordnum < maxwordnum) && (ok == 0)) {
  1955. compound_check_morph((word+i),strlen(word+i), wordnum+1,
  1956. numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
  1957. } else {
  1958. rv=NULL;
  1959. }
  1960. }
  1961. st[i] = ch;
  1962. wordnum = oldwordnum;
  1963. numsyllable = oldnumsyllable;
  1964. }
  1965. return 0;
  1966. }
  1967. // return 1 if s1 (reversed) is a leading subset of end of s2
  1968. /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
  1969. {
  1970. while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
  1971. s1++;
  1972. end_of_s2--;
  1973. len--;
  1974. }
  1975. return (*s1 == '\0');
  1976. }
  1977. */
  1978. inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
  1979. {
  1980. while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
  1981. s1++;
  1982. end_of_s2--;
  1983. len--;
  1984. }
  1985. return (*s1 == '\0');
  1986. }
  1987. // check word for suffixes
  1988. struct hentry * AffixMgr::suffix_check (const char * word, int len,
  1989. int sfxopts, AffEntry * ppfx, char ** wlst, int maxSug, int * ns,
  1990. const FLAG cclass, const FLAG needflag, char in_compound)
  1991. {
  1992. struct hentry * rv = NULL;
  1993. char result[MAXLNLEN];
  1994. PfxEntry* ep = (PfxEntry *) ppfx;
  1995. // first handle the special case of 0 length suffixes
  1996. SfxEntry * se = (SfxEntry *) sStart[0];
  1997. while (se) {
  1998. if (!cclass || se->getCont()) {
  1999. // suffixes are not allowed in beginning of compounds
  2000. if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2001. // except when signed with compoundpermitflag flag
  2002. (se->getCont() && compoundpermitflag &&
  2003. TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
  2004. // no circumfix flag in prefix and suffix
  2005. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2006. circumfix, ep->getContLen())) &&
  2007. (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
  2008. // circumfix flag in prefix AND suffix
  2009. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2010. circumfix, ep->getContLen())) &&
  2011. (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
  2012. // fogemorpheme
  2013. (in_compound ||
  2014. !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
  2015. // needaffix on prefix or first suffix
  2016. (cclass ||
  2017. !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
  2018. (ppfx && !((ep->getCont()) &&
  2019. TESTAFF(ep->getCont(), needaffix,
  2020. ep->getContLen())))
  2021. )
  2022. ) {
  2023. rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
  2024. needflag, (in_compound ? 0 : onlyincompound));
  2025. if (rv) {
  2026. sfx=(AffEntry *)se; // BUG: sfx not stateless
  2027. return rv;
  2028. }
  2029. }
  2030. }
  2031. se = se->getNext();
  2032. }
  2033. // now handle the general case
  2034. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2035. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2036. while (sptr) {
  2037. if (isRevSubset(sptr->getKey(), word + len - 1, len)
  2038. ) {
  2039. // suffixes are not allowed in beginning of compounds
  2040. if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2041. // except when signed with compoundpermitflag flag
  2042. (sptr->getCont() && compoundpermitflag &&
  2043. TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
  2044. // no circumfix flag in prefix and suffix
  2045. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2046. circumfix, ep->getContLen())) &&
  2047. (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
  2048. // circumfix flag in prefix AND suffix
  2049. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2050. circumfix, ep->getContLen())) &&
  2051. (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
  2052. // fogemorpheme
  2053. (in_compound ||
  2054. !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
  2055. // needaffix on prefix or first suffix
  2056. (cclass ||
  2057. !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
  2058. (ppfx && !((ep->getCont()) &&
  2059. TESTAFF(ep->getCont(), needaffix,
  2060. ep->getContLen())))
  2061. )
  2062. ) {
  2063. rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
  2064. maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
  2065. if (rv) {
  2066. sfx=(AffEntry *)sptr; // BUG: sfx not stateless
  2067. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2068. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2069. if (cclass || sptr->getCont()) {
  2070. if (!derived) {
  2071. derived = mystrdup(word);
  2072. } else {
  2073. strcat(result, " ");
  2074. strcpy(result, MORPH_STEM);
  2075. strcpy(result, derived); // XXX check size
  2076. strcat(result, "\n");
  2077. strcat(result, " ");
  2078. strcat(result, MORPH_STEM);
  2079. strcat(result, word);
  2080. // store the pointer of the hash entry
  2081. // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
  2082. free(derived);
  2083. derived = mystrdup(result);
  2084. }
  2085. }
  2086. return rv;
  2087. }
  2088. }
  2089. sptr = sptr->getNextEQ();
  2090. } else {
  2091. sptr = sptr->getNextNE();
  2092. }
  2093. }
  2094. return NULL;
  2095. }
  2096. // check word for two-level suffixes
  2097. struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
  2098. int sfxopts, AffEntry * ppfx, const FLAG needflag)
  2099. {
  2100. struct hentry * rv = NULL;
  2101. // first handle the special case of 0 length suffixes
  2102. SfxEntry * se = (SfxEntry *) sStart[0];
  2103. while (se) {
  2104. if (contclasses[se->getFlag()])
  2105. {
  2106. rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
  2107. if (rv) return rv;
  2108. }
  2109. se = se->getNext();
  2110. }
  2111. // now handle the general case
  2112. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2113. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2114. while (sptr) {
  2115. if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
  2116. if (contclasses[sptr->getFlag()])
  2117. {
  2118. rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
  2119. if (rv) {
  2120. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2121. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2122. return rv;
  2123. }
  2124. }
  2125. sptr = sptr->getNextEQ();
  2126. } else {
  2127. sptr = sptr->getNextNE();
  2128. }
  2129. }
  2130. return NULL;
  2131. }
  2132. char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
  2133. int sfxopts, AffEntry * ppfx, const FLAG needflag)
  2134. {
  2135. char result[MAXLNLEN];
  2136. char result2[MAXLNLEN];
  2137. char result3[MAXLNLEN];
  2138. char * st;
  2139. result[0] = '\0';
  2140. result2[0] = '\0';
  2141. result3[0] = '\0';
  2142. // first handle the special case of 0 length suffixes
  2143. SfxEntry * se = (SfxEntry *) sStart[0];
  2144. while (se) {
  2145. if (contclasses[se->getFlag()])
  2146. {
  2147. st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
  2148. if (st) {
  2149. if (ppfx) {
  2150. if (((PfxEntry *) ppfx)->getMorph()) {
  2151. strcat(result, ((PfxEntry *) ppfx)->getMorph());
  2152. strcat(result, " ");
  2153. } else debugflag(result, ((PfxEntry *) ppfx)->getFlag());
  2154. }
  2155. strcat(result, st);
  2156. free(st);
  2157. if (se->getMorph()) {
  2158. strcat(result, " ");
  2159. strcat(result, se->getMorph());
  2160. } else debugflag(result, se->getFlag());
  2161. strcat(result, "\n");
  2162. }
  2163. }
  2164. se = se->getNext();
  2165. }
  2166. // now handle the general case
  2167. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2168. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2169. while (sptr) {
  2170. if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
  2171. if (contclasses[sptr->getFlag()])
  2172. {
  2173. st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
  2174. if (st) {
  2175. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2176. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2177. strcpy(result2, st);
  2178. free(st);
  2179. result3[0] = '\0';
  2180. #ifdef DEBUG
  2181. unsigned short flag = sptr->getFlag();
  2182. if (flag_mode == FLAG_NUM) {
  2183. sprintf(result3, "<%d>", sptr->getKey());
  2184. } else if (flag_mode == FLAG_LONG) {
  2185. sprintf(result3, "<%c%c>", flag >> 8, (flag << 8) >>8);
  2186. } else sprintf(result3, "<%c>", flag);
  2187. strcat(result3, ":");
  2188. #endif
  2189. if (sptr->getMorph()) {
  2190. strcat(result3, " ");
  2191. strcat(result3, sptr->getMorph());
  2192. } else debugflag(result3, sptr->getFlag());
  2193. strlinecat(result2, result3);
  2194. strcat(result2, "\n");
  2195. strcat(result, result2);
  2196. }
  2197. }
  2198. sptr = sptr->getNextEQ();
  2199. } else {
  2200. sptr = sptr->getNextNE();
  2201. }
  2202. }
  2203. if (result) return mystrdup(result);
  2204. return NULL;
  2205. }
  2206. char * AffixMgr::suffix_check_morph(const char * word, int len,
  2207. int sfxopts, AffEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
  2208. {
  2209. char result[MAXLNLEN];
  2210. struct hentry * rv = NULL;
  2211. result[0] = '\0';
  2212. PfxEntry* ep = (PfxEntry *) ppfx;
  2213. // first handle the special case of 0 length suffixes
  2214. SfxEntry * se = (SfxEntry *) sStart[0];
  2215. while (se) {
  2216. if (!cclass || se->getCont()) {
  2217. // suffixes are not allowed in beginning of compounds
  2218. if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2219. // except when signed with compoundpermitflag flag
  2220. (se->getCont() && compoundpermitflag &&
  2221. TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
  2222. // no circumfix flag in prefix and suffix
  2223. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2224. circumfix, ep->getContLen())) &&
  2225. (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
  2226. // circumfix flag in prefix AND suffix
  2227. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2228. circumfix, ep->getContLen())) &&
  2229. (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
  2230. // fogemorpheme
  2231. (in_compound ||
  2232. !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
  2233. // needaffix on prefix or first suffix
  2234. (cclass ||
  2235. !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
  2236. (ppfx && !((ep->getCont()) &&
  2237. TESTAFF(ep->getCont(), needaffix,
  2238. ep->getContLen())))
  2239. )
  2240. ))
  2241. rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
  2242. while (rv) {
  2243. if (ppfx) {
  2244. if (((PfxEntry *) ppfx)->getMorph()) {
  2245. strcat(result, ((PfxEntry *) ppfx)->getMorph());
  2246. strcat(result, " ");
  2247. } else debugflag(result, ((PfxEntry *) ppfx)->getFlag());
  2248. }
  2249. if (complexprefixes && HENTRY_DATA(rv)) strcat(result, HENTRY_DATA(rv));
  2250. if (! HENTRY_FIND(rv, MORPH_STEM)) {
  2251. strcat(result, " ");
  2252. strcat(result, MORPH_STEM);
  2253. strcat(result, HENTRY_WORD(rv));
  2254. }
  2255. // store the pointer of the hash entry
  2256. // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
  2257. if (!complexprefixes && HENTRY_DATA(rv)) {
  2258. strcat(result, " ");
  2259. strcat(result, HENTRY_DATA(rv));
  2260. }
  2261. if (se->getMorph()) {
  2262. strcat(result, " ");
  2263. strcat(result, se->getMorph());
  2264. } else debugflag(result, se->getFlag());
  2265. strcat(result, "\n");
  2266. rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
  2267. }
  2268. }
  2269. se = se->getNext();
  2270. }
  2271. // now handle the general case
  2272. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2273. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  2274. while (sptr) {
  2275. if (isRevSubset(sptr->getKey(), word + len - 1, len)
  2276. ) {
  2277. // suffixes are not allowed in beginning of compounds
  2278. if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2279. // except when signed with compoundpermitflag flag
  2280. (sptr->getCont() && compoundpermitflag &&
  2281. TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
  2282. // no circumfix flag in prefix and suffix
  2283. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2284. circumfix, ep->getContLen())) &&
  2285. (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
  2286. // circumfix flag in prefix AND suffix
  2287. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2288. circumfix, ep->getContLen())) &&
  2289. (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
  2290. // fogemorpheme
  2291. (in_compound ||
  2292. !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
  2293. // needaffix on first suffix
  2294. (cclass || !(sptr->getCont() &&
  2295. TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))
  2296. )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
  2297. while (rv) {
  2298. if (ppfx) {
  2299. if (((PfxEntry *) ppfx)->getMorph()) {
  2300. strcat(result, ((PfxEntry *) ppfx)->getMorph());
  2301. strcat(result, " ");
  2302. } else debugflag(result, ((PfxEntry *) ppfx)->getFlag());
  2303. }
  2304. if (complexprefixes && HENTRY_DATA(rv)) strcat(result, HENTRY_DATA(rv));
  2305. if (! HENTRY_FIND(rv, MORPH_STEM)) {
  2306. strcat(result, " ");
  2307. strcat(result, MORPH_STEM);
  2308. strcat(result, HENTRY_WORD(rv));
  2309. }
  2310. // store the pointer of the hash entry
  2311. // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
  2312. if (!complexprefixes && HENTRY_DATA(rv)) {
  2313. strcat(result, " ");
  2314. strcat(result, HENTRY_DATA(rv));
  2315. }
  2316. #ifdef DEBUG
  2317. unsigned short flag = sptr->getFlag();
  2318. if (flag_mode == FLAG_NUM) {
  2319. sprintf(result, "<%d>", sptr->getKey());
  2320. } else if (flag_mode == FLAG_LONG) {
  2321. sprintf(result, "<%c%c>", flag >> 8, (flag << 8) >>8);
  2322. } else sprintf(result, "<%c>", flag);
  2323. strcat(result, ":");
  2324. #endif
  2325. if (sptr->getMorph()) {
  2326. strcat(result, " ");
  2327. strcat(result, sptr->getMorph());
  2328. } else debugflag(result, sptr->getFlag());
  2329. strcat(result, "\n");
  2330. rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
  2331. }
  2332. sptr = sptr->getNextEQ();
  2333. } else {
  2334. sptr = sptr->getNextNE();
  2335. }
  2336. }
  2337. if (*result) return mystrdup(result);
  2338. return NULL;
  2339. }
  2340. // check if word with affixes is correctly spelled
  2341. struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
  2342. {
  2343. struct hentry * rv= NULL;
  2344. if (derived) free(derived);
  2345. derived = NULL;
  2346. // check all prefixes (also crossed with suffixes if allowed)
  2347. rv = prefix_check(word, len, in_compound, needflag);
  2348. if (rv) return rv;
  2349. // if still not found check all suffixes
  2350. rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
  2351. if (havecontclass) {
  2352. sfx = NULL;
  2353. pfx = NULL;
  2354. if (rv) return rv;
  2355. // if still not found check all two-level suffixes
  2356. rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
  2357. if (rv) return rv;
  2358. // if still not found check all two-level suffixes
  2359. rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
  2360. }
  2361. return rv;
  2362. }
  2363. // check if word with affixes is correctly spelled
  2364. char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
  2365. {
  2366. char result[MAXLNLEN];
  2367. char * st = NULL;
  2368. *result = '\0';
  2369. // check all prefixes (also crossed with suffixes if allowed)
  2370. st = prefix_check_morph(word, len, in_compound);
  2371. if (st) {
  2372. strcat(result, st);
  2373. free(st);
  2374. }
  2375. // if still not found check all suffixes
  2376. st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
  2377. if (st) {
  2378. strcat(result, st);
  2379. free(st);
  2380. }
  2381. if (havecontclass) {
  2382. sfx = NULL;
  2383. pfx = NULL;
  2384. // if still not found check all two-level suffixes
  2385. st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
  2386. if (st) {
  2387. strcat(result, st);
  2388. free(st);
  2389. }
  2390. // if still not found check all two-level suffixes
  2391. st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
  2392. if (st) {
  2393. strcat(result, st);
  2394. free(st);
  2395. }
  2396. }
  2397. return mystrdup(result);
  2398. }
  2399. char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap,
  2400. unsigned short al, char * morph, char * targetmorph, int level)
  2401. {
  2402. // handle suffixes
  2403. char * stemmorph;
  2404. char * stemmorphcatpos;
  2405. char mymorph[MAXLNLEN];
  2406. if (!morph && !targetmorph) return NULL;
  2407. // check substandard flag
  2408. if (TESTAFF(ap, substandard, al)) return NULL;
  2409. if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts);
  2410. // int targetcount = get_sfxcount(targetmorph);
  2411. // use input suffix fields, if exist
  2412. if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
  2413. stemmorph = mymorph;
  2414. strcpy(stemmorph, morph);
  2415. strcat(stemmorph, " ");
  2416. stemmorphcatpos = stemmorph + strlen(stemmorph);
  2417. } else {
  2418. stemmorph = morph;
  2419. stemmorphcatpos = NULL;
  2420. }
  2421. for (int i = 0; i < al; i++) {
  2422. const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
  2423. SfxEntry * sptr = (SfxEntry *)sFlag[c];
  2424. while (sptr) {
  2425. if (sptr->getFlag() == ap[i] && ((sptr->getContLen() == 0) ||
  2426. // don't generate forms with substandard affixes
  2427. !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
  2428. if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph());
  2429. else stemmorph = (char *) sptr->getMorph();
  2430. int cmp = morphcmp(stemmorph, targetmorph);
  2431. if (cmp == 0) {
  2432. char * newword = sptr->add(ts, wl);
  2433. if (newword) {
  2434. hentry * check = pHMgr->lookup(newword); // XXX extra dic
  2435. if (!check || !check->astr ||
  2436. !TESTAFF(check->astr, forbiddenword, check->alen)) {
  2437. return newword;
  2438. }
  2439. free(newword);
  2440. }
  2441. }
  2442. // recursive call for secondary suffixes
  2443. if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
  2444. // (get_sfxcount(stemmorph) < targetcount) &&
  2445. !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
  2446. char * newword = sptr->add(ts, wl);
  2447. if (newword) {
  2448. char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(),
  2449. sptr->getContLen(), stemmorph, targetmorph, 1);
  2450. if (newword2) {
  2451. free(newword);
  2452. return newword2;
  2453. }
  2454. free(newword);
  2455. newword = NULL;
  2456. }
  2457. }
  2458. }
  2459. sptr = (SfxEntry *)sptr ->getFlgNxt();
  2460. }
  2461. }
  2462. return NULL;
  2463. }
  2464. int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
  2465. int wl, const unsigned short * ap, unsigned short al, char * bad, int badl,
  2466. char * phon)
  2467. {
  2468. int nh=0;
  2469. // first add root word to list
  2470. if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
  2471. (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
  2472. wlst[nh].word = mystrdup(ts);
  2473. wlst[nh].allow = (1 == 0);
  2474. wlst[nh].orig = NULL;
  2475. nh++;
  2476. // add special phonetic version
  2477. if (phon && (nh < maxn)) {
  2478. wlst[nh].word = mystrdup(phon);
  2479. wlst[nh].allow = (1 == 0);
  2480. wlst[nh].orig = mystrdup(ts);
  2481. nh++;
  2482. }
  2483. }
  2484. // handle suffixes
  2485. for (int i = 0; i < al; i++) {
  2486. const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
  2487. SfxEntry * sptr = (SfxEntry *)sFlag[c];
  2488. while (sptr) {
  2489. if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
  2490. (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
  2491. // check needaffix flag
  2492. !(sptr->getCont() && ((needaffix &&
  2493. TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
  2494. (circumfix &&
  2495. TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
  2496. (onlyincompound &&
  2497. TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
  2498. ) {
  2499. char * newword = sptr->add(ts, wl);
  2500. if (newword) {
  2501. if (nh < maxn) {
  2502. wlst[nh].word = newword;
  2503. wlst[nh].allow = sptr->allowCross();
  2504. wlst[nh].orig = NULL;
  2505. nh++;
  2506. // add special phonetic version
  2507. if (phon && (nh < maxn)) {
  2508. char st[MAXWORDUTF8LEN];
  2509. strcpy(st, phon);
  2510. strcat(st, sptr->getKey());
  2511. reverseword(st + strlen(phon));
  2512. wlst[nh].word = mystrdup(st);
  2513. wlst[nh].allow = (1 == 0);
  2514. wlst[nh].orig = mystrdup(newword);
  2515. nh++;
  2516. }
  2517. } else {
  2518. free(newword);
  2519. }
  2520. }
  2521. }
  2522. sptr = (SfxEntry *)sptr ->getFlgNxt();
  2523. }
  2524. }
  2525. int n = nh;
  2526. // handle cross products of prefixes and suffixes
  2527. for (int j=1;j<n ;j++)
  2528. if (wlst[j].allow) {
  2529. for (int k = 0; k < al; k++) {
  2530. const unsigned char c = (unsigned char) (ap[k] & 0x00FF);
  2531. PfxEntry * cptr = (PfxEntry *) pFlag[c];
  2532. while (cptr) {
  2533. if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
  2534. (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
  2535. int l1 = strlen(wlst[j].word);
  2536. char * newword = cptr->add(wlst[j].word, l1);
  2537. if (newword) {
  2538. if (nh < maxn) {
  2539. wlst[nh].word = newword;
  2540. wlst[nh].allow = cptr->allowCross();
  2541. wlst[nh].orig = NULL;
  2542. nh++;
  2543. } else {
  2544. free(newword);
  2545. }
  2546. }
  2547. }
  2548. cptr = (PfxEntry *)cptr ->getFlgNxt();
  2549. }
  2550. }
  2551. }
  2552. // now handle pure prefixes
  2553. for (int m = 0; m < al; m ++) {
  2554. const unsigned char c = (unsigned char) (ap[m] & 0x00FF);
  2555. PfxEntry * ptr = (PfxEntry *) pFlag[c];
  2556. while (ptr) {
  2557. if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
  2558. (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
  2559. // check needaffix flag
  2560. !(ptr->getCont() && ((needaffix &&
  2561. TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
  2562. (circumfix &&
  2563. TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
  2564. (onlyincompound &&
  2565. TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
  2566. ) {
  2567. char * newword = ptr->add(ts, wl);
  2568. if (newword) {
  2569. if (nh < maxn) {
  2570. wlst[nh].word = newword;
  2571. wlst[nh].allow = ptr->allowCross();
  2572. wlst[nh].orig = NULL;
  2573. nh++;
  2574. } else {
  2575. free(newword);
  2576. }
  2577. }
  2578. }
  2579. ptr = (PfxEntry *)ptr ->getFlgNxt();
  2580. }
  2581. }
  2582. return nh;
  2583. }
  2584. // return length of replacing table
  2585. int AffixMgr::get_numrep()
  2586. {
  2587. return numrep;
  2588. }
  2589. // return replacing table
  2590. struct replentry * AffixMgr::get_reptable()
  2591. {
  2592. if (! reptable ) return NULL;
  2593. return reptable;
  2594. }
  2595. // return replacing table
  2596. struct phonetable * AffixMgr::get_phonetable()
  2597. {
  2598. if (! phone ) return NULL;
  2599. return phone;
  2600. }
  2601. // return length of character map table
  2602. int AffixMgr::get_nummap()
  2603. {
  2604. return nummap;
  2605. }
  2606. // return character map table
  2607. struct mapentry * AffixMgr::get_maptable()
  2608. {
  2609. if (! maptable ) return NULL;
  2610. return maptable;
  2611. }
  2612. // return length of word break table
  2613. int AffixMgr::get_numbreak()
  2614. {
  2615. return numbreak;
  2616. }
  2617. // return character map table
  2618. char ** AffixMgr::get_breaktable()
  2619. {
  2620. if (! breaktable ) return NULL;
  2621. return breaktable;
  2622. }
  2623. // return text encoding of dictionary
  2624. char * AffixMgr::get_encoding()
  2625. {
  2626. if (! encoding ) {
  2627. encoding = mystrdup("ISO8859-1");
  2628. }
  2629. return mystrdup(encoding);
  2630. }
  2631. // return text encoding of dictionary
  2632. int AffixMgr::get_langnum()
  2633. {
  2634. return langnum;
  2635. }
  2636. // return double prefix option
  2637. int AffixMgr::get_complexprefixes()
  2638. {
  2639. return complexprefixes;
  2640. }
  2641. FLAG AffixMgr::get_keepcase()
  2642. {
  2643. return keepcase;
  2644. }
  2645. int AffixMgr::get_checksharps()
  2646. {
  2647. return checksharps;
  2648. }
  2649. char * AffixMgr::encode_flag(unsigned short aflag)
  2650. {
  2651. return pHMgr->encode_flag(aflag);
  2652. }
  2653. // return the preferred ignore string for suggestions
  2654. char * AffixMgr::get_ignore()
  2655. {
  2656. if (!ignorechars) return NULL;
  2657. return ignorechars;
  2658. }
  2659. // return the preferred ignore string for suggestions
  2660. unsigned short * AffixMgr::get_ignore_utf16(int * len)
  2661. {
  2662. *len = ignorechars_utf16_len;
  2663. return ignorechars_utf16;
  2664. }
  2665. // return the keyboard string for suggestions
  2666. char * AffixMgr::get_key_string()
  2667. {
  2668. if (! keystring ) return NULL;
  2669. return mystrdup(keystring);
  2670. }
  2671. // return the preferred try string for suggestions
  2672. char * AffixMgr::get_try_string()
  2673. {
  2674. if (! trystring ) return NULL;
  2675. return mystrdup(trystring);
  2676. }
  2677. // return the preferred try string for suggestions
  2678. const char * AffixMgr::get_wordchars()
  2679. {
  2680. return wordchars;
  2681. }
  2682. unsigned short * AffixMgr::get_wordchars_utf16(int * len)
  2683. {
  2684. *len = wordchars_utf16_len;
  2685. return wordchars_utf16;
  2686. }
  2687. // is there compounding?
  2688. int AffixMgr::get_compound()
  2689. {
  2690. return compoundflag || compoundbegin || numdefcpd;
  2691. }
  2692. // return the compound words control flag
  2693. FLAG AffixMgr::get_compoundflag()
  2694. {
  2695. return compoundflag;
  2696. }
  2697. // return the forbidden words control flag
  2698. FLAG AffixMgr::get_forbiddenword()
  2699. {
  2700. return forbiddenword;
  2701. }
  2702. // return the forbidden words control flag
  2703. FLAG AffixMgr::get_nosuggest()
  2704. {
  2705. return nosuggest;
  2706. }
  2707. // return the forbidden words flag modify flag
  2708. FLAG AffixMgr::get_needaffix()
  2709. {
  2710. return needaffix;
  2711. }
  2712. // return the onlyincompound flag
  2713. FLAG AffixMgr::get_onlyincompound()
  2714. {
  2715. return onlyincompound;
  2716. }
  2717. // return the compound word signal flag
  2718. FLAG AffixMgr::get_compoundroot()
  2719. {
  2720. return compoundroot;
  2721. }
  2722. // return the compound begin signal flag
  2723. FLAG AffixMgr::get_compoundbegin()
  2724. {
  2725. return compoundbegin;
  2726. }
  2727. // return the value of checknum
  2728. int AffixMgr::get_checknum()
  2729. {
  2730. return checknum;
  2731. }
  2732. // return the value of prefix
  2733. const char * AffixMgr::get_prefix()
  2734. {
  2735. if (pfx) return ((PfxEntry *)pfx)->getKey();
  2736. return NULL;
  2737. }
  2738. // return the value of suffix
  2739. const char * AffixMgr::get_suffix()
  2740. {
  2741. return sfxappnd;
  2742. }
  2743. // return the value of derived form (base word with first suffix).
  2744. const char * AffixMgr::get_derived()
  2745. {
  2746. return derived;
  2747. }
  2748. // return the value of suffix
  2749. const char * AffixMgr::get_version()
  2750. {
  2751. return version;
  2752. }
  2753. // return lemma_present flag
  2754. FLAG AffixMgr::get_lemma_present()
  2755. {
  2756. return lemma_present;
  2757. }
  2758. // utility method to look up root words in hash table
  2759. struct hentry * AffixMgr::lookup(const char * word)
  2760. {
  2761. int i;
  2762. struct hentry * he = NULL;
  2763. for (i = 0; i < *maxdic && !he; i++) {
  2764. he = (alldic[i])->lookup(word);
  2765. }
  2766. return he;
  2767. }
  2768. // return the value of suffix
  2769. const int AffixMgr::have_contclass()
  2770. {
  2771. return havecontclass;
  2772. }
  2773. // return utf8
  2774. int AffixMgr::get_utf8()
  2775. {
  2776. return utf8;
  2777. }
  2778. // return nosplitsugs
  2779. int AffixMgr::get_maxngramsugs(void)
  2780. {
  2781. return maxngramsugs;
  2782. }
  2783. // return nosplitsugs
  2784. int AffixMgr::get_nosplitsugs(void)
  2785. {
  2786. return nosplitsugs;
  2787. }
  2788. // return sugswithdots
  2789. int AffixMgr::get_sugswithdots(void)
  2790. {
  2791. return sugswithdots;
  2792. }
  2793. /* parse flag */
  2794. int AffixMgr::parse_flag(char * line, unsigned short * out, const char * name) {
  2795. char * s = NULL;
  2796. if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
  2797. HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name);
  2798. return 1;
  2799. }
  2800. if (parse_string(line, &s, name)) return 1;
  2801. *out = pHMgr->decode_flag(s);
  2802. free(s);
  2803. return 0;
  2804. }
  2805. /* parse num */
  2806. int AffixMgr::parse_num(char * line, int * out, const char * name) {
  2807. char * s = NULL;
  2808. if (*out != -1) {
  2809. HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name);
  2810. return 1;
  2811. }
  2812. if (parse_string(line, &s, name)) return 1;
  2813. *out = atoi(s);
  2814. free(s);
  2815. return 0;
  2816. }
  2817. /* parse in the max syllablecount of compound words and */
  2818. int AffixMgr::parse_cpdsyllable(char * line)
  2819. {
  2820. char * tp = line;
  2821. char * piece;
  2822. int i = 0;
  2823. int np = 0;
  2824. w_char w[MAXWORDLEN];
  2825. piece = mystrsep(&tp, 0);
  2826. while (piece) {
  2827. if (*piece != '\0') {
  2828. switch(i) {
  2829. case 0: { np++; break; }
  2830. case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
  2831. case 2: {
  2832. if (!utf8) {
  2833. cpdvowels = mystrdup(piece);
  2834. } else {
  2835. int n = u8_u16(w, MAXWORDLEN, piece);
  2836. if (n > 0) {
  2837. flag_qsort((unsigned short *) w, 0, n);
  2838. cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
  2839. if (!cpdvowels_utf16) return 1;
  2840. memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
  2841. }
  2842. cpdvowels_utf16_len = n;
  2843. }
  2844. np++;
  2845. break;
  2846. }
  2847. default: break;
  2848. }
  2849. i++;
  2850. }
  2851. // free(piece);
  2852. piece = mystrsep(&tp, 0);
  2853. }
  2854. if (np < 2) {
  2855. HUNSPELL_WARNING(stderr, "error: missing compoundsyllable information\n");
  2856. return 1;
  2857. }
  2858. if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
  2859. return 0;
  2860. }
  2861. /* parse in the typical fault correcting table */
  2862. int AffixMgr::parse_reptable(char * line, FileMgr * af)
  2863. {
  2864. if (numrep != 0) {
  2865. HUNSPELL_WARNING(stderr, "error: duplicate REP tables used\n");
  2866. return 1;
  2867. }
  2868. char * tp = line;
  2869. char * piece;
  2870. int i = 0;
  2871. int np = 0;
  2872. piece = mystrsep(&tp, 0);
  2873. while (piece) {
  2874. if (*piece != '\0') {
  2875. switch(i) {
  2876. case 0: { np++; break; }
  2877. case 1: {
  2878. numrep = atoi(piece);
  2879. if (numrep < 1) {
  2880. HUNSPELL_WARNING(stderr, "incorrect number of entries in replacement table\n");
  2881. // free(piece);
  2882. return 1;
  2883. }
  2884. reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
  2885. if (!reptable) return 1;
  2886. np++;
  2887. break;
  2888. }
  2889. default: break;
  2890. }
  2891. i++;
  2892. }
  2893. // free(piece);
  2894. piece = mystrsep(&tp, 0);
  2895. }
  2896. if (np != 2) {
  2897. HUNSPELL_WARNING(stderr, "error: missing replacement table information\n");
  2898. return 1;
  2899. }
  2900. /* now parse the numrep lines to read in the remainder of the table */
  2901. char * nl;
  2902. for (int j=0; j < numrep; j++) {
  2903. if (!(nl = af->getline())) return 1;
  2904. mychomp(nl);
  2905. tp = nl;
  2906. i = 0;
  2907. reptable[j].pattern = NULL;
  2908. reptable[j].pattern2 = NULL;
  2909. piece = mystrsep(&tp, 0);
  2910. while (piece) {
  2911. if (*piece != '\0') {
  2912. switch(i) {
  2913. case 0: {
  2914. if (strncmp(piece,"REP",3) != 0) {
  2915. HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n");
  2916. numrep = 0;
  2917. // free(piece);
  2918. return 1;
  2919. }
  2920. break;
  2921. }
  2922. case 1: { reptable[j].pattern = mystrrep(mystrdup(piece),"_"," "); break; }
  2923. case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
  2924. default: break;
  2925. }
  2926. i++;
  2927. }
  2928. // free(piece);
  2929. piece = mystrsep(&tp, 0);
  2930. }
  2931. if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
  2932. HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n");
  2933. numrep = 0;
  2934. return 1;
  2935. }
  2936. }
  2937. return 0;
  2938. }
  2939. /* parse in the typical fault correcting table */
  2940. int AffixMgr::parse_phonetable(char * line, FileMgr * af)
  2941. {
  2942. if (phone) {
  2943. HUNSPELL_WARNING(stderr, "error: duplicate PHONE tables used\n");
  2944. return 1;
  2945. }
  2946. char * tp = line;
  2947. char * piece;
  2948. int i = 0;
  2949. int np = 0;
  2950. piece = mystrsep(&tp, 0);
  2951. while (piece) {
  2952. if (*piece != '\0') {
  2953. switch(i) {
  2954. case 0: { np++; break; }
  2955. case 1: {
  2956. phone = (phonetable *) malloc(sizeof(struct phonetable));
  2957. phone->num = atoi(piece);
  2958. phone->rules = NULL;
  2959. phone->utf8 = (char) utf8;
  2960. if (!phone) return 1;
  2961. if (phone->num < 1) {
  2962. HUNSPELL_WARNING(stderr, "incorrect number of entries in phonelacement table\n");
  2963. // free(piece);
  2964. return 1;
  2965. }
  2966. phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *));
  2967. if (!phone->rules) return 1;
  2968. np++;
  2969. break;
  2970. }
  2971. default: break;
  2972. }
  2973. i++;
  2974. }
  2975. // free(piece);
  2976. piece = mystrsep(&tp, 0);
  2977. }
  2978. if (np != 2) {
  2979. HUNSPELL_WARNING(stderr, "error: missing PHONE table information\n");
  2980. return 1;
  2981. }
  2982. /* now parse the phone->num lines to read in the remainder of the table */
  2983. char * nl;
  2984. for (int j=0; j < phone->num; j++) {
  2985. if (!(nl = af->getline())) return 1;
  2986. mychomp(nl);
  2987. tp = nl;
  2988. i = 0;
  2989. phone->rules[j * 2] = NULL;
  2990. phone->rules[j * 2 + 1] = NULL;
  2991. piece = mystrsep(&tp, 0);
  2992. while (piece) {
  2993. if (*piece != '\0') {
  2994. switch(i) {
  2995. case 0: {
  2996. if (strncmp(piece,"PHONE",5) != 0) {
  2997. HUNSPELL_WARNING(stderr, "error: PHONE table is corrupt\n");
  2998. phone->num = 0;
  2999. // free(piece);
  3000. return 1;
  3001. }
  3002. break;
  3003. }
  3004. case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; }
  3005. case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; }
  3006. default: break;
  3007. }
  3008. i++;
  3009. }
  3010. // free(piece);
  3011. piece = mystrsep(&tp, 0);
  3012. }
  3013. if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) {
  3014. HUNSPELL_WARNING(stderr, "error: PHONE table is corrupt\n");
  3015. phone->num = 0;
  3016. return 1;
  3017. }
  3018. }
  3019. phone->rules[phone->num * 2] = mystrdup("");
  3020. phone->rules[phone->num * 2 + 1] = mystrdup("");
  3021. init_phonet_hash(*phone);
  3022. return 0;
  3023. }
  3024. /* parse in the checkcompoundpattern table */
  3025. int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af)
  3026. {
  3027. if (numcheckcpd != 0) {
  3028. HUNSPELL_WARNING(stderr, "error: duplicate compound pattern tables used\n");
  3029. return 1;
  3030. }
  3031. char * tp = line;
  3032. char * piece;
  3033. int i = 0;
  3034. int np = 0;
  3035. piece = mystrsep(&tp, 0);
  3036. while (piece) {
  3037. if (*piece != '\0') {
  3038. switch(i) {
  3039. case 0: { np++; break; }
  3040. case 1: {
  3041. numcheckcpd = atoi(piece);
  3042. if (numcheckcpd < 1) {
  3043. HUNSPELL_WARNING(stderr, "incorrect number of entries in compound pattern table\n");
  3044. // free(piece);
  3045. return 1;
  3046. }
  3047. checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry));
  3048. if (!checkcpdtable) return 1;
  3049. np++;
  3050. break;
  3051. }
  3052. default: break;
  3053. }
  3054. i++;
  3055. }
  3056. // free(piece);
  3057. piece = mystrsep(&tp, 0);
  3058. }
  3059. if (np != 2) {
  3060. HUNSPELL_WARNING(stderr, "error: missing compound pattern table information\n");
  3061. return 1;
  3062. }
  3063. /* now parse the numcheckcpd lines to read in the remainder of the table */
  3064. char * nl;
  3065. for (int j=0; j < numcheckcpd; j++) {
  3066. if (!(nl = af->getline())) return 1;
  3067. mychomp(nl);
  3068. tp = nl;
  3069. i = 0;
  3070. checkcpdtable[j].pattern = NULL;
  3071. checkcpdtable[j].pattern2 = NULL;
  3072. piece = mystrsep(&tp, 0);
  3073. while (piece) {
  3074. if (*piece != '\0') {
  3075. switch(i) {
  3076. case 0: {
  3077. if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
  3078. HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n");
  3079. numcheckcpd = 0;
  3080. // free(piece);
  3081. return 1;
  3082. }
  3083. break;
  3084. }
  3085. case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; }
  3086. case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; }
  3087. default: break;
  3088. }
  3089. i++;
  3090. }
  3091. // free(piece);
  3092. piece = mystrsep(&tp, 0);
  3093. }
  3094. if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
  3095. HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n");
  3096. numcheckcpd = 0;
  3097. return 1;
  3098. }
  3099. }
  3100. return 0;
  3101. }
  3102. /* parse in the compound rule table */
  3103. int AffixMgr::parse_defcpdtable(char * line, FileMgr * af)
  3104. {
  3105. if (numdefcpd != 0) {
  3106. HUNSPELL_WARNING(stderr, "error: duplicate compound rule tables used\n");
  3107. return 1;
  3108. }
  3109. char * tp = line;
  3110. char * piece;
  3111. int i = 0;
  3112. int np = 0;
  3113. piece = mystrsep(&tp, 0);
  3114. while (piece) {
  3115. if (*piece != '\0') {
  3116. switch(i) {
  3117. case 0: { np++; break; }
  3118. case 1: {
  3119. numdefcpd = atoi(piece);
  3120. if (numdefcpd < 1) {
  3121. HUNSPELL_WARNING(stderr, "incorrect number of entries in compound rule table\n");
  3122. // free(piece);
  3123. return 1;
  3124. }
  3125. defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
  3126. if (!defcpdtable) return 1;
  3127. np++;
  3128. break;
  3129. }
  3130. default: break;
  3131. }
  3132. i++;
  3133. }
  3134. // free(piece);
  3135. piece = mystrsep(&tp, 0);
  3136. }
  3137. if (np != 2) {
  3138. HUNSPELL_WARNING(stderr, "error: missing compound rule table information\n");
  3139. return 1;
  3140. }
  3141. /* now parse the numdefcpd lines to read in the remainder of the table */
  3142. char * nl;
  3143. for (int j=0; j < numdefcpd; j++) {
  3144. if (!(nl = af->getline())) return 1;
  3145. mychomp(nl);
  3146. tp = nl;
  3147. i = 0;
  3148. defcpdtable[j].def = NULL;
  3149. piece = mystrsep(&tp, 0);
  3150. while (piece) {
  3151. if (*piece != '\0') {
  3152. switch(i) {
  3153. case 0: {
  3154. if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
  3155. HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n");
  3156. // free(piece);
  3157. numdefcpd = 0;
  3158. return 1;
  3159. }
  3160. break;
  3161. }
  3162. case 1: {
  3163. defcpdtable[j].len =
  3164. pHMgr->decode_flags(&(defcpdtable[j].def), piece);
  3165. break;
  3166. }
  3167. default: break;
  3168. }
  3169. i++;
  3170. }
  3171. // free(piece);
  3172. piece = mystrsep(&tp, 0);
  3173. }
  3174. if (!defcpdtable[j].len) {
  3175. HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n");
  3176. numdefcpd = 0;
  3177. return 1;
  3178. }
  3179. }
  3180. return 0;
  3181. }
  3182. /* parse in the character map table */
  3183. int AffixMgr::parse_maptable(char * line, FileMgr * af)
  3184. {
  3185. if (nummap != 0) {
  3186. HUNSPELL_WARNING(stderr, "error: duplicate MAP tables used\n");
  3187. return 1;
  3188. }
  3189. char * tp = line;
  3190. char * piece;
  3191. int i = 0;
  3192. int np = 0;
  3193. piece = mystrsep(&tp, 0);
  3194. while (piece) {
  3195. if (*piece != '\0') {
  3196. switch(i) {
  3197. case 0: { np++; break; }
  3198. case 1: {
  3199. nummap = atoi(piece);
  3200. if (nummap < 1) {
  3201. HUNSPELL_WARNING(stderr, "incorrect number of entries in map table\n");
  3202. // free(piece);
  3203. return 1;
  3204. }
  3205. maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
  3206. if (!maptable) return 1;
  3207. np++;
  3208. break;
  3209. }
  3210. default: break;
  3211. }
  3212. i++;
  3213. }
  3214. // free(piece);
  3215. piece = mystrsep(&tp, 0);
  3216. }
  3217. if (np != 2) {
  3218. HUNSPELL_WARNING(stderr, "error: missing map table information\n");
  3219. return 1;
  3220. }
  3221. /* now parse the nummap lines to read in the remainder of the table */
  3222. char * nl;
  3223. for (int j=0; j < nummap; j++) {
  3224. if (!(nl = af->getline())) return 1;
  3225. mychomp(nl);
  3226. tp = nl;
  3227. i = 0;
  3228. maptable[j].set = NULL;
  3229. maptable[j].len = 0;
  3230. piece = mystrsep(&tp, 0);
  3231. while (piece) {
  3232. if (*piece != '\0') {
  3233. switch(i) {
  3234. case 0: {
  3235. if (strncmp(piece,"MAP",3) != 0) {
  3236. HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
  3237. nummap = 0;
  3238. // free(piece);
  3239. return 1;
  3240. }
  3241. break;
  3242. }
  3243. case 1: {
  3244. maptable[j].len = 0;
  3245. maptable[j].set = NULL;
  3246. maptable[j].set_utf16 = NULL;
  3247. if (!utf8) {
  3248. maptable[j].set = mystrdup(piece);
  3249. maptable[j].len = strlen(maptable[j].set);
  3250. } else {
  3251. w_char w[MAXWORDLEN];
  3252. int n = u8_u16(w, MAXWORDLEN, piece);
  3253. if (n > 0) {
  3254. flag_qsort((unsigned short *) w, 0, n);
  3255. maptable[j].set_utf16 = (w_char *) malloc(n * sizeof(w_char));
  3256. if (!maptable[j].set_utf16) return 1;
  3257. memcpy(maptable[j].set_utf16, w, n * sizeof(w_char));
  3258. }
  3259. maptable[j].len = n;
  3260. }
  3261. break; }
  3262. default: break;
  3263. }
  3264. i++;
  3265. }
  3266. // free(piece);
  3267. piece = mystrsep(&tp, 0);
  3268. }
  3269. if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) {
  3270. HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
  3271. nummap = 0;
  3272. return 1;
  3273. }
  3274. }
  3275. return 0;
  3276. }
  3277. /* parse in the word breakpoint table */
  3278. int AffixMgr::parse_breaktable(char * line, FileMgr * af)
  3279. {
  3280. if (numbreak != 0) {
  3281. HUNSPELL_WARNING(stderr, "error: duplicate word breakpoint tables used\n");
  3282. return 1;
  3283. }
  3284. char * tp = line;
  3285. char * piece;
  3286. int i = 0;
  3287. int np = 0;
  3288. piece = mystrsep(&tp, 0);
  3289. while (piece) {
  3290. if (*piece != '\0') {
  3291. switch(i) {
  3292. case 0: { np++; break; }
  3293. case 1: {
  3294. numbreak = atoi(piece);
  3295. if (numbreak < 1) {
  3296. HUNSPELL_WARNING(stderr, "incorrect number of entries in BREAK table\n");
  3297. // free(piece);
  3298. return 1;
  3299. }
  3300. breaktable = (char **) malloc(numbreak * sizeof(char *));
  3301. if (!breaktable) return 1;
  3302. np++;
  3303. break;
  3304. }
  3305. default: break;
  3306. }
  3307. i++;
  3308. }
  3309. // free(piece);
  3310. piece = mystrsep(&tp, 0);
  3311. }
  3312. if (np != 2) {
  3313. HUNSPELL_WARNING(stderr, "error: missing word breakpoint table information\n");
  3314. return 1;
  3315. }
  3316. /* now parse the numbreak lines to read in the remainder of the table */
  3317. char * nl;
  3318. for (int j=0; j < numbreak; j++) {
  3319. if (!(nl = af->getline())) return 1;
  3320. mychomp(nl);
  3321. tp = nl;
  3322. i = 0;
  3323. piece = mystrsep(&tp, 0);
  3324. while (piece) {
  3325. if (*piece != '\0') {
  3326. switch(i) {
  3327. case 0: {
  3328. if (strncmp(piece,"BREAK",5) != 0) {
  3329. HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n");
  3330. // free(piece);
  3331. numbreak = 0;
  3332. return 1;
  3333. }
  3334. break;
  3335. }
  3336. case 1: {
  3337. breaktable[j] = mystrdup(piece);
  3338. break;
  3339. }
  3340. default: break;
  3341. }
  3342. i++;
  3343. }
  3344. // free(piece);
  3345. piece = mystrsep(&tp, 0);
  3346. }
  3347. if (!breaktable) {
  3348. HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n");
  3349. numbreak = 0;
  3350. return 1;
  3351. }
  3352. }
  3353. return 0;
  3354. }
  3355. void AffixMgr::reverse_condition(char * piece) {
  3356. int neg = 0;
  3357. for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
  3358. switch(*k) {
  3359. case '[': {
  3360. if (neg) *(k+1) = '['; else *k = ']';
  3361. break;
  3362. }
  3363. case ']': {
  3364. *k = '[';
  3365. if (neg) *(k+1) = '^';
  3366. neg = 0;
  3367. break;
  3368. }
  3369. case '^': {
  3370. if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
  3371. break;
  3372. }
  3373. default: {
  3374. if (neg) *(k+1) = *k;
  3375. }
  3376. }
  3377. }
  3378. }
  3379. int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags)
  3380. {
  3381. int numents = 0; // number of affentry structures to parse
  3382. unsigned short aflag = 0; // affix char identifier
  3383. char ff=0;
  3384. struct affentry * ptr= NULL;
  3385. struct affentry * nptr= NULL;
  3386. char * tp = line;
  3387. char * nl = NULL;
  3388. char * piece;
  3389. int i = 0;
  3390. // checking lines with bad syntax
  3391. #ifdef DEBUG
  3392. int basefieldnum = 0;
  3393. #endif
  3394. // split affix header line into pieces
  3395. int np = 0;
  3396. piece = mystrsep(&tp, 0);
  3397. while (piece) {
  3398. if (*piece != '\0') {
  3399. switch(i) {
  3400. // piece 1 - is type of affix
  3401. case 0: { np++; break; }
  3402. // piece 2 - is affix char
  3403. case 1: {
  3404. np++;
  3405. aflag = pHMgr->decode_flag(piece);
  3406. if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
  3407. ((at == 'P') && (dupflags[aflag] & dupPFX))) {
  3408. HUNSPELL_WARNING(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl);
  3409. // return 1; XXX permissive mode for bad dictionaries
  3410. }
  3411. dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
  3412. break;
  3413. }
  3414. // piece 3 - is cross product indicator
  3415. case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
  3416. // piece 4 - is number of affentries
  3417. case 3: {
  3418. np++;
  3419. numents = atoi(piece);
  3420. if (numents == 0) {
  3421. char * err = pHMgr->encode_flag(aflag);
  3422. HUNSPELL_WARNING(stderr, "error: affix %s header has incorrect entry count in line %s\n",
  3423. err, nl);
  3424. free(err);
  3425. return 1;
  3426. }
  3427. ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
  3428. if (!ptr) return 1;
  3429. ptr->opts = ff;
  3430. if (utf8) ptr->opts += aeUTF8;
  3431. if (pHMgr->is_aliasf()) ptr->opts += aeALIASF;
  3432. if (pHMgr->is_aliasm()) ptr->opts += aeALIASM;
  3433. ptr->aflag = aflag;
  3434. }
  3435. default: break;
  3436. }
  3437. i++;
  3438. }
  3439. // free(piece);
  3440. piece = mystrsep(&tp, 0);
  3441. }
  3442. // check to make sure we parsed enough pieces
  3443. if (np != 4) {
  3444. char * err = pHMgr->encode_flag(aflag);
  3445. HUNSPELL_WARNING(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl);
  3446. free(err);
  3447. free(ptr);
  3448. return 1;
  3449. }
  3450. // store away ptr to first affentry
  3451. nptr = ptr;
  3452. // now parse numents affentries for this affix
  3453. for (int j=0; j < numents; j++) {
  3454. if (!(nl = af->getline())) return 1;
  3455. mychomp(nl);
  3456. tp = nl;
  3457. i = 0;
  3458. np = 0;
  3459. // split line into pieces
  3460. piece = mystrsep(&tp, 0);
  3461. while (piece) {
  3462. if (*piece != '\0') {
  3463. switch(i) {
  3464. // piece 1 - is type
  3465. case 0: {
  3466. np++;
  3467. if (nptr != ptr) nptr->opts = ptr->opts &
  3468. (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM);
  3469. break;
  3470. }
  3471. // piece 2 - is affix char
  3472. case 1: {
  3473. np++;
  3474. if (pHMgr->decode_flag(piece) != aflag) {
  3475. char * err = pHMgr->encode_flag(aflag);
  3476. HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
  3477. HUNSPELL_WARNING(stderr, "error: possible incorrect count\n");
  3478. free(err);
  3479. // free(piece);
  3480. return 1;
  3481. }
  3482. if (nptr != ptr) nptr->aflag = ptr->aflag;
  3483. break;
  3484. }
  3485. // piece 3 - is string to strip or 0 for null
  3486. case 2: {
  3487. np++;
  3488. if (complexprefixes) {
  3489. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3490. }
  3491. nptr->strip = mystrdup(piece);
  3492. nptr->stripl = (unsigned char) strlen(nptr->strip);
  3493. if (strcmp(nptr->strip,"0") == 0) {
  3494. free(nptr->strip);
  3495. nptr->strip=mystrdup("");
  3496. nptr->stripl = 0;
  3497. }
  3498. break;
  3499. }
  3500. // piece 4 - is affix string or 0 for null
  3501. case 3: {
  3502. char * dash;
  3503. nptr->morphcode = NULL;
  3504. nptr->contclass = NULL;
  3505. nptr->contclasslen = 0;
  3506. np++;
  3507. dash = strchr(piece, '/');
  3508. if (dash) {
  3509. *dash = '\0';
  3510. if (ignorechars) {
  3511. if (utf8) {
  3512. remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
  3513. } else {
  3514. remove_ignored_chars(piece,ignorechars);
  3515. }
  3516. }
  3517. if (complexprefixes) {
  3518. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3519. }
  3520. nptr->appnd = mystrdup(piece);
  3521. if (pHMgr->is_aliasf()) {
  3522. int index = atoi(dash + 1);
  3523. nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass));
  3524. if (!nptr->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1);
  3525. } else {
  3526. nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1);
  3527. flag_qsort(nptr->contclass, 0, nptr->contclasslen);
  3528. }
  3529. *dash = '/';
  3530. havecontclass = 1;
  3531. for (unsigned short _i = 0; _i < nptr->contclasslen; _i++) {
  3532. contclasses[(nptr->contclass)[_i]] = 1;
  3533. }
  3534. } else {
  3535. if (ignorechars) {
  3536. if (utf8) {
  3537. remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
  3538. } else {
  3539. remove_ignored_chars(piece,ignorechars);
  3540. }
  3541. }
  3542. if (complexprefixes) {
  3543. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3544. }
  3545. nptr->appnd = mystrdup(piece);
  3546. }
  3547. nptr->appndl = (unsigned char) strlen(nptr->appnd);
  3548. if (strcmp(nptr->appnd,"0") == 0) {
  3549. free(nptr->appnd);
  3550. nptr->appnd=mystrdup("");
  3551. nptr->appndl = 0;
  3552. }
  3553. break;
  3554. }
  3555. // piece 5 - is the conditions descriptions
  3556. case 4: {
  3557. np++;
  3558. if (complexprefixes) {
  3559. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3560. reverse_condition(piece);
  3561. }
  3562. if (nptr->stripl && (strcmp(piece, ".") != 0) &&
  3563. redundant_condition(at, nptr->strip, nptr->stripl, piece, nl))
  3564. strcpy(piece, ".");
  3565. if (at == 'S') {
  3566. reverseword(piece);
  3567. reverse_condition(piece);
  3568. }
  3569. if (encodeit(nptr, piece)) return 1;
  3570. break;
  3571. }
  3572. case 5: {
  3573. np++;
  3574. if (pHMgr->is_aliasm()) {
  3575. int index = atoi(piece);
  3576. nptr->morphcode = pHMgr->get_aliasm(index);
  3577. } else {
  3578. if (complexprefixes) { // XXX - fix me for morph. gen.
  3579. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3580. }
  3581. // add the remaining of the line
  3582. if (*tp) {
  3583. *(tp - 1) = ' ';
  3584. tp = tp + strlen(tp);
  3585. }
  3586. nptr->morphcode = (char *) malloc(strlen(piece)+1);
  3587. strcpy(nptr->morphcode, piece);
  3588. }
  3589. break;
  3590. }
  3591. default: break;
  3592. }
  3593. i++;
  3594. }
  3595. // free(piece);
  3596. piece = mystrsep(&tp, 0);
  3597. }
  3598. // check to make sure we parsed enough pieces
  3599. if (np < 4) {
  3600. char * err = pHMgr->encode_flag(aflag);
  3601. HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
  3602. free(err);
  3603. free(ptr);
  3604. return 1;
  3605. }
  3606. #ifdef DEBUG
  3607. // detect unnecessary fields, excepting comments
  3608. if (basefieldnum) {
  3609. int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
  3610. if (fieldnum != basefieldnum)
  3611. HUNSPELL_WARNING(stderr, "warning: bad field number:\n%s\n", nl);
  3612. } else {
  3613. basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
  3614. }
  3615. #endif
  3616. nptr++;
  3617. }
  3618. // now create SfxEntry or PfxEntry objects and use links to
  3619. // build an ordered (sorted by affix string) list
  3620. nptr = ptr;
  3621. for (int k = 0; k < numents; k++) {
  3622. if (at == 'P') {
  3623. PfxEntry * pfxptr = new PfxEntry(this,nptr);
  3624. build_pfxtree((AffEntry *)pfxptr);
  3625. } else {
  3626. SfxEntry * sfxptr = new SfxEntry(this,nptr);
  3627. build_sfxtree((AffEntry *)sfxptr);
  3628. }
  3629. nptr++;
  3630. }
  3631. free(ptr);
  3632. return 0;
  3633. }
  3634. int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * warnvar) {
  3635. int condl = strlen(cond);
  3636. int i;
  3637. int j;
  3638. int neg;
  3639. int in;
  3640. if (ft == 'P') { // prefix
  3641. if (strncmp(strip, cond, condl) == 0) return 1;
  3642. if (utf8) {
  3643. } else {
  3644. for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
  3645. if (cond[j] != '[') {
  3646. if (cond[j] != strip[i]) {
  3647. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", warnvar);
  3648. }
  3649. } else {
  3650. neg = (cond[j+1] == '^') ? 1 : 0;
  3651. in = 0;
  3652. do {
  3653. j++;
  3654. if (strip[i] == cond[j]) in = 1;
  3655. } while ((j < (condl - 1)) && (cond[j] != ']'));
  3656. if (j == (condl - 1) && (cond[j] != ']')) {
  3657. HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", warnvar);
  3658. return 0;
  3659. }
  3660. if ((!neg && !in) || (neg && in)) {
  3661. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", warnvar);
  3662. return 0;
  3663. }
  3664. }
  3665. }
  3666. if (j >= condl) return 1;
  3667. }
  3668. } else { // suffix
  3669. if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
  3670. if (utf8) {
  3671. } else {
  3672. for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
  3673. if (cond[j] != ']') {
  3674. if (cond[j] != strip[i]) {
  3675. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", warnvar);
  3676. }
  3677. } else {
  3678. in = 0;
  3679. do {
  3680. j--;
  3681. if (strip[i] == cond[j]) in = 1;
  3682. } while ((j > 0) && (cond[j] != '['));
  3683. if ((j == 0) && (cond[j] != '[')) {
  3684. HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", warnvar);
  3685. return 0;
  3686. }
  3687. neg = (cond[j+1] == '^') ? 1 : 0;
  3688. if ((!neg && !in) || (neg && in)) {
  3689. HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", warnvar);
  3690. return 0;
  3691. }
  3692. }
  3693. }
  3694. if (j < 0) return 1;
  3695. }
  3696. }
  3697. return 0;
  3698. }