PageRenderTime 41ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 1ms

/Hunspell/src/hunspell/affixmgr.cxx

https://bitbucket.org/texniccenter/texniccenter
C++ | 4521 lines | 3606 code | 489 blank | 426 comment | 1549 complexity | 8e7e785e713ee59041c2cc3cb66fb85d MD5 | raw file
Possible License(s): LGPL-2.0, GPL-2.0, MPL-2.0-no-copyleft-exception, LGPL-2.1, GPL-3.0
  1. #include "license.hunspell"
  2. #include "license.myspell"
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include <stdio.h>
  6. #include <ctype.h>
  7. #include <vector>
  8. #include "affixmgr.hxx"
  9. #include "affentry.hxx"
  10. #include "langnum.hxx"
  11. #include "csutil.hxx"
  12. AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
  13. {
  14. // register hash manager and load affix data from aff file
  15. pHMgr = ptr[0];
  16. alldic = ptr;
  17. maxdic = md;
  18. keystring = NULL;
  19. trystring = NULL;
  20. encoding=NULL;
  21. csconv=NULL;
  22. utf8 = 0;
  23. complexprefixes = 0;
  24. maptable = NULL;
  25. nummap = 0;
  26. breaktable = NULL;
  27. numbreak = -1;
  28. reptable = NULL;
  29. numrep = 0;
  30. iconvtable = NULL;
  31. oconvtable = NULL;
  32. checkcpdtable = NULL;
  33. // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
  34. simplifiedcpd = 0;
  35. numcheckcpd = 0;
  36. defcpdtable = NULL;
  37. numdefcpd = 0;
  38. phone = NULL;
  39. compoundflag = FLAG_NULL; // permits word in compound forms
  40. compoundbegin = FLAG_NULL; // may be first word in compound forms
  41. compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  42. compoundend = FLAG_NULL; // may be last word in compound forms
  43. compoundroot = FLAG_NULL; // compound word signing flag
  44. compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  45. compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  46. checkcompounddup = 0; // forbid double words in compounds
  47. checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  48. checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  49. checkcompoundtriple = 0; // forbid compounds with triple letters
  50. simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
  51. forbiddenword = FORBIDDENWORD; // forbidden word signing flag
  52. nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  53. nongramsuggest = FLAG_NULL;
  54. lang = NULL; // language
  55. langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  56. needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
  57. cpdwordmax = -1; // default: unlimited wordcount in compound words
  58. cpdmin = -1; // undefined
  59. cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  60. cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  61. cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  62. cpdvowels_utf16_len=0; // vowels
  63. pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  64. sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  65. cpdsyllablenum=NULL; // syllable count incrementing flag
  66. checknum=0; // checking numbers, and word with numbers
  67. wordchars=NULL; // letters + spec. word characters
  68. wordchars_utf16=NULL; // letters + spec. word characters
  69. wordchars_utf16_len=0; // letters + spec. word characters
  70. ignorechars=NULL; // letters + spec. word characters
  71. ignorechars_utf16=NULL; // letters + spec. word characters
  72. ignorechars_utf16_len=0; // letters + spec. word characters
  73. version=NULL; // affix and dictionary file version string
  74. havecontclass=0; // flags of possible continuing classes (double affix)
  75. // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  76. // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  77. lemma_present = FLAG_NULL;
  78. circumfix = FLAG_NULL;
  79. onlyincompound = FLAG_NULL;
  80. maxngramsugs = -1; // undefined
  81. maxdiff = -1; // undefined
  82. onlymaxdiff = 0;
  83. maxcpdsugs = -1; // undefined
  84. nosplitsugs = 0;
  85. sugswithdots = 0;
  86. keepcase = 0;
  87. forceucase = 0;
  88. warn = 0;
  89. forbidwarn = 0;
  90. checksharps = 0;
  91. substandard = FLAG_NULL;
  92. fullstrip = 0;
  93. sfx = NULL;
  94. pfx = NULL;
  95. for (int i=0; i < SETSIZE; i++) {
  96. pStart[i] = NULL;
  97. sStart[i] = NULL;
  98. pFlag[i] = NULL;
  99. sFlag[i] = NULL;
  100. }
  101. for (int j=0; j < CONTSIZE; j++) {
  102. contclasses[j] = 0;
  103. }
  104. if (parse_file(affpath, key)) {
  105. HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  106. }
  107. if (cpdmin == -1) cpdmin = MINCPDLEN;
  108. }
  109. AffixMgr::~AffixMgr()
  110. {
  111. // pass through linked prefix entries and clean up
  112. for (int i=0; i < SETSIZE ;i++) {
  113. pFlag[i] = NULL;
  114. PfxEntry * ptr = pStart[i];
  115. PfxEntry * nptr = NULL;
  116. while (ptr) {
  117. nptr = ptr->getNext();
  118. delete(ptr);
  119. ptr = nptr;
  120. nptr = NULL;
  121. }
  122. }
  123. // pass through linked suffix entries and clean up
  124. for (int j=0; j < SETSIZE ; j++) {
  125. sFlag[j] = NULL;
  126. SfxEntry * ptr = sStart[j];
  127. SfxEntry * nptr = NULL;
  128. while (ptr) {
  129. nptr = ptr->getNext();
  130. delete(ptr);
  131. ptr = nptr;
  132. nptr = NULL;
  133. }
  134. sStart[j] = NULL;
  135. }
  136. if (keystring) free(keystring);
  137. keystring=NULL;
  138. if (trystring) free(trystring);
  139. trystring=NULL;
  140. if (encoding) free(encoding);
  141. encoding=NULL;
  142. if (maptable) {
  143. for (int j=0; j < nummap; j++) {
  144. for (int k=0; k < maptable[j].len; k++) {
  145. if (maptable[j].set[k]) free(maptable[j].set[k]);
  146. }
  147. free(maptable[j].set);
  148. maptable[j].set = NULL;
  149. maptable[j].len = 0;
  150. }
  151. free(maptable);
  152. maptable = NULL;
  153. }
  154. nummap = 0;
  155. if (breaktable) {
  156. for (int j=0; j < numbreak; j++) {
  157. if (breaktable[j]) free(breaktable[j]);
  158. breaktable[j] = NULL;
  159. }
  160. free(breaktable);
  161. breaktable = NULL;
  162. }
  163. numbreak = 0;
  164. if (reptable) {
  165. for (int j=0; j < numrep; j++) {
  166. free(reptable[j].pattern);
  167. free(reptable[j].pattern2);
  168. }
  169. free(reptable);
  170. reptable = NULL;
  171. }
  172. if (iconvtable) delete iconvtable;
  173. if (oconvtable) delete oconvtable;
  174. if (phone && phone->rules) {
  175. for (int j=0; j < phone->num + 1; j++) {
  176. free(phone->rules[j * 2]);
  177. free(phone->rules[j * 2 + 1]);
  178. }
  179. free(phone->rules);
  180. free(phone);
  181. phone = NULL;
  182. }
  183. if (defcpdtable) {
  184. for (int j=0; j < numdefcpd; j++) {
  185. free(defcpdtable[j].def);
  186. defcpdtable[j].def = NULL;
  187. }
  188. free(defcpdtable);
  189. defcpdtable = NULL;
  190. }
  191. numrep = 0;
  192. if (checkcpdtable) {
  193. for (int j=0; j < numcheckcpd; j++) {
  194. free(checkcpdtable[j].pattern);
  195. free(checkcpdtable[j].pattern2);
  196. free(checkcpdtable[j].pattern3);
  197. checkcpdtable[j].pattern = NULL;
  198. checkcpdtable[j].pattern2 = NULL;
  199. checkcpdtable[j].pattern3 = NULL;
  200. }
  201. free(checkcpdtable);
  202. checkcpdtable = NULL;
  203. }
  204. numcheckcpd = 0;
  205. FREE_FLAG(compoundflag);
  206. FREE_FLAG(compoundbegin);
  207. FREE_FLAG(compoundmiddle);
  208. FREE_FLAG(compoundend);
  209. FREE_FLAG(compoundpermitflag);
  210. FREE_FLAG(compoundforbidflag);
  211. FREE_FLAG(compoundroot);
  212. FREE_FLAG(forbiddenword);
  213. FREE_FLAG(nosuggest);
  214. FREE_FLAG(nongramsuggest);
  215. FREE_FLAG(needaffix);
  216. FREE_FLAG(lemma_present);
  217. FREE_FLAG(circumfix);
  218. FREE_FLAG(onlyincompound);
  219. cpdwordmax = 0;
  220. pHMgr = NULL;
  221. cpdmin = 0;
  222. cpdmaxsyllable = 0;
  223. if (cpdvowels) free(cpdvowels);
  224. if (cpdvowels_utf16) free(cpdvowels_utf16);
  225. if (cpdsyllablenum) free(cpdsyllablenum);
  226. free_utf_tbl();
  227. if (lang) free(lang);
  228. if (wordchars) free(wordchars);
  229. if (wordchars_utf16) free(wordchars_utf16);
  230. if (ignorechars) free(ignorechars);
  231. if (ignorechars_utf16) free(ignorechars_utf16);
  232. if (version) free(version);
  233. checknum=0;
  234. #ifdef MOZILLA_CLIENT
  235. delete [] csconv;
  236. #endif
  237. }
  238. // read in aff file and build up prefix and suffix entry objects
  239. int AffixMgr::parse_file(const char * affpath, const char * key)
  240. {
  241. char * line; // io buffers
  242. char ft; // affix type
  243. // checking flag duplication
  244. char dupflags[CONTSIZE];
  245. char dupflags_ini = 1;
  246. // first line indicator for removing byte order mark
  247. int firstline = 1;
  248. // open the affix file
  249. FileMgr * afflst = new FileMgr(affpath, key);
  250. if (!afflst) {
  251. HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
  252. return 1;
  253. }
  254. // step one is to parse the affix file building up the internal
  255. // affix data structures
  256. // read in each line ignoring any that do not
  257. // start with a known line type indicator
  258. while ((line = afflst->getline())) {
  259. mychomp(line);
  260. /* remove byte order mark */
  261. if (firstline) {
  262. firstline = 0;
  263. // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
  264. if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
  265. memmove(line, line+3, strlen(line+3)+1);
  266. }
  267. }
  268. /* parse in the keyboard string */
  269. if (strncmp(line,"KEY",3) == 0) {
  270. if (parse_string(line, &keystring, afflst->getlinenum())) {
  271. delete afflst;
  272. return 1;
  273. }
  274. }
  275. /* parse in the try string */
  276. if (strncmp(line,"TRY",3) == 0) {
  277. if (parse_string(line, &trystring, afflst->getlinenum())) {
  278. delete afflst;
  279. return 1;
  280. }
  281. }
  282. /* parse in the name of the character set used by the .dict and .aff */
  283. if (strncmp(line,"SET",3) == 0) {
  284. if (parse_string(line, &encoding, afflst->getlinenum())) {
  285. delete afflst;
  286. return 1;
  287. }
  288. if (strcmp(encoding, "UTF-8") == 0) {
  289. utf8 = 1;
  290. #ifndef OPENOFFICEORG
  291. #ifndef MOZILLA_CLIENT
  292. if (initialize_utf_tbl()) return 1;
  293. #endif
  294. #endif
  295. }
  296. }
  297. /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
  298. if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
  299. complexprefixes = 1;
  300. /* parse in the flag used by the controlled compound words */
  301. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  302. if (parse_flag(line, &compoundflag, afflst)) {
  303. delete afflst;
  304. return 1;
  305. }
  306. }
  307. /* parse in the flag used by compound words */
  308. if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
  309. if (complexprefixes) {
  310. if (parse_flag(line, &compoundend, afflst)) {
  311. delete afflst;
  312. return 1;
  313. }
  314. } else {
  315. if (parse_flag(line, &compoundbegin, afflst)) {
  316. delete afflst;
  317. return 1;
  318. }
  319. }
  320. }
  321. /* parse in the flag used by compound words */
  322. if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
  323. if (parse_flag(line, &compoundmiddle, afflst)) {
  324. delete afflst;
  325. return 1;
  326. }
  327. }
  328. /* parse in the flag used by compound words */
  329. if (strncmp(line,"COMPOUNDEND",11) == 0) {
  330. if (complexprefixes) {
  331. if (parse_flag(line, &compoundbegin, afflst)) {
  332. delete afflst;
  333. return 1;
  334. }
  335. } else {
  336. if (parse_flag(line, &compoundend, afflst)) {
  337. delete afflst;
  338. return 1;
  339. }
  340. }
  341. }
  342. /* parse in the data used by compound_check() method */
  343. if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
  344. if (parse_num(line, &cpdwordmax, afflst)) {
  345. delete afflst;
  346. return 1;
  347. }
  348. }
  349. /* parse in the flag sign compounds in dictionary */
  350. if (strncmp(line,"COMPOUNDROOT",12) == 0) {
  351. if (parse_flag(line, &compoundroot, afflst)) {
  352. delete afflst;
  353. return 1;
  354. }
  355. }
  356. /* parse in the flag used by compound_check() method */
  357. if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
  358. if (parse_flag(line, &compoundpermitflag, afflst)) {
  359. delete afflst;
  360. return 1;
  361. }
  362. }
  363. /* parse in the flag used by compound_check() method */
  364. if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
  365. if (parse_flag(line, &compoundforbidflag, afflst)) {
  366. delete afflst;
  367. return 1;
  368. }
  369. }
  370. if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
  371. checkcompounddup = 1;
  372. }
  373. if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
  374. checkcompoundrep = 1;
  375. }
  376. if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
  377. checkcompoundtriple = 1;
  378. }
  379. if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
  380. simplifiedtriple = 1;
  381. }
  382. if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
  383. checkcompoundcase = 1;
  384. }
  385. if (strncmp(line,"NOSUGGEST",9) == 0) {
  386. if (parse_flag(line, &nosuggest, afflst)) {
  387. delete afflst;
  388. return 1;
  389. }
  390. }
  391. if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {
  392. if (parse_flag(line, &nongramsuggest, afflst)) {
  393. delete afflst;
  394. return 1;
  395. }
  396. }
  397. /* parse in the flag used by forbidden words */
  398. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  399. if (parse_flag(line, &forbiddenword, afflst)) {
  400. delete afflst;
  401. return 1;
  402. }
  403. }
  404. /* parse in the flag used by forbidden words */
  405. if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
  406. if (parse_flag(line, &lemma_present, afflst)) {
  407. delete afflst;
  408. return 1;
  409. }
  410. }
  411. /* parse in the flag used by circumfixes */
  412. if (strncmp(line,"CIRCUMFIX",9) == 0) {
  413. if (parse_flag(line, &circumfix, afflst)) {
  414. delete afflst;
  415. return 1;
  416. }
  417. }
  418. /* parse in the flag used by fogemorphemes */
  419. if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
  420. if (parse_flag(line, &onlyincompound, afflst)) {
  421. delete afflst;
  422. return 1;
  423. }
  424. }
  425. /* parse in the flag used by `needaffixs' */
  426. if (strncmp(line,"PSEUDOROOT",10) == 0) {
  427. if (parse_flag(line, &needaffix, afflst)) {
  428. delete afflst;
  429. return 1;
  430. }
  431. }
  432. /* parse in the flag used by `needaffixs' */
  433. if (strncmp(line,"NEEDAFFIX",9) == 0) {
  434. if (parse_flag(line, &needaffix, afflst)) {
  435. delete afflst;
  436. return 1;
  437. }
  438. }
  439. /* parse in the minimal length for words in compounds */
  440. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  441. if (parse_num(line, &cpdmin, afflst)) {
  442. delete afflst;
  443. return 1;
  444. }
  445. if (cpdmin < 1) cpdmin = 1;
  446. }
  447. /* parse in the max. words and syllables in compounds */
  448. if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
  449. if (parse_cpdsyllable(line, afflst)) {
  450. delete afflst;
  451. return 1;
  452. }
  453. }
  454. /* parse in the flag used by compound_check() method */
  455. if (strncmp(line,"SYLLABLENUM",11) == 0) {
  456. if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
  457. delete afflst;
  458. return 1;
  459. }
  460. }
  461. /* parse in the flag used by the controlled compound words */
  462. if (strncmp(line,"CHECKNUM",8) == 0) {
  463. checknum=1;
  464. }
  465. /* parse in the extra word characters */
  466. if (strncmp(line,"WORDCHARS",9) == 0) {
  467. if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
  468. delete afflst;
  469. return 1;
  470. }
  471. }
  472. /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
  473. if (strncmp(line,"IGNORE",6) == 0) {
  474. if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
  475. delete afflst;
  476. return 1;
  477. }
  478. }
  479. /* parse in the typical fault correcting table */
  480. if (strncmp(line,"REP",3) == 0) {
  481. if (parse_reptable(line, afflst)) {
  482. delete afflst;
  483. return 1;
  484. }
  485. }
  486. /* parse in the input conversion table */
  487. if (strncmp(line,"ICONV",5) == 0) {
  488. if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
  489. delete afflst;
  490. return 1;
  491. }
  492. }
  493. /* parse in the input conversion table */
  494. if (strncmp(line,"OCONV",5) == 0) {
  495. if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
  496. delete afflst;
  497. return 1;
  498. }
  499. }
  500. /* parse in the phonetic translation table */
  501. if (strncmp(line,"PHONE",5) == 0) {
  502. if (parse_phonetable(line, afflst)) {
  503. delete afflst;
  504. return 1;
  505. }
  506. }
  507. /* parse in the checkcompoundpattern table */
  508. if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
  509. if (parse_checkcpdtable(line, afflst)) {
  510. delete afflst;
  511. return 1;
  512. }
  513. }
  514. /* parse in the defcompound table */
  515. if (strncmp(line,"COMPOUNDRULE",12) == 0) {
  516. if (parse_defcpdtable(line, afflst)) {
  517. delete afflst;
  518. return 1;
  519. }
  520. }
  521. /* parse in the related character map table */
  522. if (strncmp(line,"MAP",3) == 0) {
  523. if (parse_maptable(line, afflst)) {
  524. delete afflst;
  525. return 1;
  526. }
  527. }
  528. /* parse in the word breakpoints table */
  529. if (strncmp(line,"BREAK",5) == 0) {
  530. if (parse_breaktable(line, afflst)) {
  531. delete afflst;
  532. return 1;
  533. }
  534. }
  535. /* parse in the language for language specific codes */
  536. if (strncmp(line,"LANG",4) == 0) {
  537. if (parse_string(line, &lang, afflst->getlinenum())) {
  538. delete afflst;
  539. return 1;
  540. }
  541. langnum = get_lang_num(lang);
  542. }
  543. if (strncmp(line,"VERSION",7) == 0) {
  544. for(line = line + 7; *line == ' ' || *line == '\t'; line++);
  545. version = mystrdup(line);
  546. }
  547. if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
  548. if (parse_num(line, &maxngramsugs, afflst)) {
  549. delete afflst;
  550. return 1;
  551. }
  552. }
  553. if (strncmp(line,"ONLYMAXDIFF", 11) == 0)
  554. onlymaxdiff = 1;
  555. if (strncmp(line,"MAXDIFF",7) == 0) {
  556. if (parse_num(line, &maxdiff, afflst)) {
  557. delete afflst;
  558. return 1;
  559. }
  560. }
  561. if (strncmp(line,"MAXCPDSUGS",10) == 0) {
  562. if (parse_num(line, &maxcpdsugs, afflst)) {
  563. delete afflst;
  564. return 1;
  565. }
  566. }
  567. if (strncmp(line,"NOSPLITSUGS",11) == 0) {
  568. nosplitsugs=1;
  569. }
  570. if (strncmp(line,"FULLSTRIP",9) == 0) {
  571. fullstrip=1;
  572. }
  573. if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
  574. sugswithdots=1;
  575. }
  576. /* parse in the flag used by forbidden words */
  577. if (strncmp(line,"KEEPCASE",8) == 0) {
  578. if (parse_flag(line, &keepcase, afflst)) {
  579. delete afflst;
  580. return 1;
  581. }
  582. }
  583. /* parse in the flag used by `forceucase' */
  584. if (strncmp(line,"FORCEUCASE",10) == 0) {
  585. if (parse_flag(line, &forceucase, afflst)) {
  586. delete afflst;
  587. return 1;
  588. }
  589. }
  590. /* parse in the flag used by `warn' */
  591. if (strncmp(line,"WARN",4) == 0) {
  592. if (parse_flag(line, &warn, afflst)) {
  593. delete afflst;
  594. return 1;
  595. }
  596. }
  597. if (strncmp(line,"FORBIDWARN",10) == 0) {
  598. forbidwarn=1;
  599. }
  600. /* parse in the flag used by the affix generator */
  601. if (strncmp(line,"SUBSTANDARD",11) == 0) {
  602. if (parse_flag(line, &substandard, afflst)) {
  603. delete afflst;
  604. return 1;
  605. }
  606. }
  607. if (strncmp(line,"CHECKSHARPS",11) == 0) {
  608. checksharps=1;
  609. }
  610. /* parse this affix: P - prefix, S - suffix */
  611. ft = ' ';
  612. if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
  613. if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
  614. if (ft != ' ') {
  615. if (dupflags_ini) {
  616. memset(dupflags, 0, sizeof(dupflags));
  617. dupflags_ini = 0;
  618. }
  619. if (parse_affix(line, ft, afflst, dupflags)) {
  620. delete afflst;
  621. process_pfx_tree_to_list();
  622. process_sfx_tree_to_list();
  623. return 1;
  624. }
  625. }
  626. }
  627. delete afflst;
  628. // convert affix trees to sorted list
  629. process_pfx_tree_to_list();
  630. process_sfx_tree_to_list();
  631. // now we can speed up performance greatly taking advantage of the
  632. // relationship between the affixes and the idea of "subsets".
  633. // View each prefix as a potential leading subset of another and view
  634. // each suffix (reversed) as a potential trailing subset of another.
  635. // To illustrate this relationship if we know the prefix "ab" is found in the
  636. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  637. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  638. // is a subset need be examined.
  639. // The same argument goes for suffix string that are reversed.
  640. // Then to top this off why not examine the first char of the word to quickly
  641. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  642. // be leading supersets of the first character of the word (if they exist)
  643. // To take advantage of this "subset" relationship, we need to add two links
  644. // from entry. One to take next if the current prefix is found (call it nexteq)
  645. // and one to take next if the current prefix is not found (call it nextne).
  646. // Since we have built ordered lists, all that remains is to properly initialize
  647. // the nextne and nexteq pointers that relate them
  648. process_pfx_order();
  649. process_sfx_order();
  650. /* get encoding for CHECKCOMPOUNDCASE */
  651. if (!utf8) {
  652. char * enc = get_encoding();
  653. csconv = get_current_cs(enc);
  654. free(enc);
  655. enc = NULL;
  656. char expw[MAXLNLEN];
  657. if (wordchars) {
  658. strcpy(expw, wordchars);
  659. free(wordchars);
  660. } else *expw = '\0';
  661. for (int i = 0; i <= 255; i++) {
  662. if ( (csconv[i].cupper != csconv[i].clower) &&
  663. (! strchr(expw, (char) i))) {
  664. *(expw + strlen(expw) + 1) = '\0';
  665. *(expw + strlen(expw)) = (char) i;
  666. }
  667. }
  668. wordchars = mystrdup(expw);
  669. }
  670. // default BREAK definition
  671. if (numbreak == -1) {
  672. breaktable = (char **) malloc(sizeof(char *) * 3);
  673. if (!breaktable) return 1;
  674. breaktable[0] = mystrdup("-");
  675. breaktable[1] = mystrdup("^-");
  676. breaktable[2] = mystrdup("-$");
  677. if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
  678. }
  679. return 0;
  680. }
  681. // we want to be able to quickly access prefix information
  682. // both by prefix flag, and sorted by prefix string itself
  683. // so we need to set up two indexes
  684. int AffixMgr::build_pfxtree(PfxEntry* pfxptr)
  685. {
  686. PfxEntry * ptr;
  687. PfxEntry * pptr;
  688. PfxEntry * ep = pfxptr;
  689. // get the right starting points
  690. const char * key = ep->getKey();
  691. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  692. // first index by flag which must exist
  693. ptr = pFlag[flg];
  694. ep->setFlgNxt(ptr);
  695. pFlag[flg] = ep;
  696. // handle the special case of null affix string
  697. if (strlen(key) == 0) {
  698. // always inset them at head of list at element 0
  699. ptr = pStart[0];
  700. ep->setNext(ptr);
  701. pStart[0] = ep;
  702. return 0;
  703. }
  704. // now handle the normal case
  705. ep->setNextEQ(NULL);
  706. ep->setNextNE(NULL);
  707. unsigned char sp = *((const unsigned char *)key);
  708. ptr = pStart[sp];
  709. // handle the first insert
  710. if (!ptr) {
  711. pStart[sp] = ep;
  712. return 0;
  713. }
  714. // otherwise use binary tree insertion so that a sorted
  715. // list can easily be generated later
  716. pptr = NULL;
  717. for (;;) {
  718. pptr = ptr;
  719. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  720. ptr = ptr->getNextEQ();
  721. if (!ptr) {
  722. pptr->setNextEQ(ep);
  723. break;
  724. }
  725. } else {
  726. ptr = ptr->getNextNE();
  727. if (!ptr) {
  728. pptr->setNextNE(ep);
  729. break;
  730. }
  731. }
  732. }
  733. return 0;
  734. }
  735. // we want to be able to quickly access suffix information
  736. // both by suffix flag, and sorted by the reverse of the
  737. // suffix string itself; so we need to set up two indexes
  738. int AffixMgr::build_sfxtree(SfxEntry* sfxptr)
  739. {
  740. SfxEntry * ptr;
  741. SfxEntry * pptr;
  742. SfxEntry * ep = sfxptr;
  743. /* get the right starting point */
  744. const char * key = ep->getKey();
  745. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  746. // first index by flag which must exist
  747. ptr = sFlag[flg];
  748. ep->setFlgNxt(ptr);
  749. sFlag[flg] = ep;
  750. // next index by affix string
  751. // handle the special case of null affix string
  752. if (strlen(key) == 0) {
  753. // always inset them at head of list at element 0
  754. ptr = sStart[0];
  755. ep->setNext(ptr);
  756. sStart[0] = ep;
  757. return 0;
  758. }
  759. // now handle the normal case
  760. ep->setNextEQ(NULL);
  761. ep->setNextNE(NULL);
  762. unsigned char sp = *((const unsigned char *)key);
  763. ptr = sStart[sp];
  764. // handle the first insert
  765. if (!ptr) {
  766. sStart[sp] = ep;
  767. return 0;
  768. }
  769. // otherwise use binary tree insertion so that a sorted
  770. // list can easily be generated later
  771. pptr = NULL;
  772. for (;;) {
  773. pptr = ptr;
  774. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  775. ptr = ptr->getNextEQ();
  776. if (!ptr) {
  777. pptr->setNextEQ(ep);
  778. break;
  779. }
  780. } else {
  781. ptr = ptr->getNextNE();
  782. if (!ptr) {
  783. pptr->setNextNE(ep);
  784. break;
  785. }
  786. }
  787. }
  788. return 0;
  789. }
  790. // convert from binary tree to sorted list
  791. int AffixMgr::process_pfx_tree_to_list()
  792. {
  793. for (int i=1; i< SETSIZE; i++) {
  794. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  795. }
  796. return 0;
  797. }
  798. PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)
  799. {
  800. if (ptr) {
  801. nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
  802. ptr->setNext(nptr);
  803. nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
  804. }
  805. return nptr;
  806. }
  807. // convert from binary tree to sorted list
  808. int AffixMgr:: process_sfx_tree_to_list()
  809. {
  810. for (int i=1; i< SETSIZE; i++) {
  811. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  812. }
  813. return 0;
  814. }
  815. SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)
  816. {
  817. if (ptr) {
  818. nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
  819. ptr->setNext(nptr);
  820. nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
  821. }
  822. return nptr;
  823. }
  824. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  825. // using the idea of leading subsets this time
  826. int AffixMgr::process_pfx_order()
  827. {
  828. PfxEntry* ptr;
  829. // loop through each prefix list starting point
  830. for (int i=1; i < SETSIZE; i++) {
  831. ptr = pStart[i];
  832. // look through the remainder of the list
  833. // and find next entry with affix that
  834. // the current one is not a subset of
  835. // mark that as destination for NextNE
  836. // use next in list that you are a subset
  837. // of as NextEQ
  838. for (; ptr != NULL; ptr = ptr->getNext()) {
  839. PfxEntry * nptr = ptr->getNext();
  840. for (; nptr != NULL; nptr = nptr->getNext()) {
  841. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  842. }
  843. ptr->setNextNE(nptr);
  844. ptr->setNextEQ(NULL);
  845. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  846. ptr->setNextEQ(ptr->getNext());
  847. }
  848. // now clean up by adding smart search termination strings:
  849. // if you are already a superset of the previous prefix
  850. // but not a subset of the next, search can end here
  851. // so set NextNE properly
  852. ptr = pStart[i];
  853. for (; ptr != NULL; ptr = ptr->getNext()) {
  854. PfxEntry * nptr = ptr->getNext();
  855. PfxEntry * mptr = NULL;
  856. for (; nptr != NULL; nptr = nptr->getNext()) {
  857. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  858. mptr = nptr;
  859. }
  860. if (mptr) mptr->setNextNE(NULL);
  861. }
  862. }
  863. return 0;
  864. }
  865. // initialize the SfxEntry links NextEQ and NextNE to speed searching
  866. // using the idea of leading subsets this time
  867. int AffixMgr::process_sfx_order()
  868. {
  869. SfxEntry* ptr;
  870. // loop through each prefix list starting point
  871. for (int i=1; i < SETSIZE; i++) {
  872. ptr = sStart[i];
  873. // look through the remainder of the list
  874. // and find next entry with affix that
  875. // the current one is not a subset of
  876. // mark that as destination for NextNE
  877. // use next in list that you are a subset
  878. // of as NextEQ
  879. for (; ptr != NULL; ptr = ptr->getNext()) {
  880. SfxEntry * nptr = ptr->getNext();
  881. for (; nptr != NULL; nptr = nptr->getNext()) {
  882. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  883. }
  884. ptr->setNextNE(nptr);
  885. ptr->setNextEQ(NULL);
  886. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  887. ptr->setNextEQ(ptr->getNext());
  888. }
  889. // now clean up by adding smart search termination strings:
  890. // if you are already a superset of the previous suffix
  891. // but not a subset of the next, search can end here
  892. // so set NextNE properly
  893. ptr = sStart[i];
  894. for (; ptr != NULL; ptr = ptr->getNext()) {
  895. SfxEntry * nptr = ptr->getNext();
  896. SfxEntry * mptr = NULL;
  897. for (; nptr != NULL; nptr = nptr->getNext()) {
  898. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  899. mptr = nptr;
  900. }
  901. if (mptr) mptr->setNextNE(NULL);
  902. }
  903. }
  904. return 0;
  905. }
  906. // add flags to the result for dictionary debugging
  907. void AffixMgr::debugflag(char * result, unsigned short flag) {
  908. char * st = encode_flag(flag);
  909. mystrcat(result, " ", MAXLNLEN);
  910. mystrcat(result, MORPH_FLAG, MAXLNLEN);
  911. if (st) {
  912. mystrcat(result, st, MAXLNLEN);
  913. free(st);
  914. }
  915. }
  916. // calculate the character length of the condition
  917. int AffixMgr::condlen(char * st)
  918. {
  919. int l = 0;
  920. bool group = false;
  921. for(; *st; st++) {
  922. if (*st == '[') {
  923. group = true;
  924. l++;
  925. } else if (*st == ']') group = false;
  926. else if (!group && (!utf8 ||
  927. (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
  928. }
  929. return l;
  930. }
  931. int AffixMgr::encodeit(affentry &entry, char * cs)
  932. {
  933. if (strcmp(cs,".") != 0) {
  934. entry.numconds = (char) condlen(cs);
  935. strncpy(entry.c.conds, cs, MAXCONDLEN);
  936. // long condition (end of conds padded by strncpy)
  937. if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
  938. entry.opts += aeLONGCOND;
  939. entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
  940. if (!entry.c.l.conds2) return 1;
  941. }
  942. } else {
  943. entry.numconds = 0;
  944. entry.c.conds[0] = '\0';
  945. }
  946. return 0;
  947. }
  948. // return 1 if s1 is a leading subset of s2 (dots are for infixes)
  949. inline int AffixMgr::isSubset(const char * s1, const char * s2)
  950. {
  951. while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
  952. s1++;
  953. s2++;
  954. }
  955. return (*s1 == '\0');
  956. }
  957. // check word for prefixes
  958. struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
  959. const FLAG needflag)
  960. {
  961. struct hentry * rv= NULL;
  962. pfx = NULL;
  963. pfxappnd = NULL;
  964. sfxappnd = NULL;
  965. // first handle the special case of 0 length prefixes
  966. PfxEntry * pe = pStart[0];
  967. while (pe) {
  968. if (
  969. // fogemorpheme
  970. ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
  971. (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
  972. // permit prefixes in compounds
  973. ((in_compound != IN_CPD_END) || (pe->getCont() &&
  974. (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
  975. ) {
  976. // check prefix
  977. rv = pe->checkword(word, len, in_compound, needflag);
  978. if (rv) {
  979. pfx=pe; // BUG: pfx not stateless
  980. return rv;
  981. }
  982. }
  983. pe = pe->getNext();
  984. }
  985. // now handle the general case
  986. unsigned char sp = *((const unsigned char *)word);
  987. PfxEntry * pptr = pStart[sp];
  988. while (pptr) {
  989. if (isSubset(pptr->getKey(),word)) {
  990. if (
  991. // fogemorpheme
  992. ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
  993. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
  994. // permit prefixes in compounds
  995. ((in_compound != IN_CPD_END) || (pptr->getCont() &&
  996. (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
  997. ) {
  998. // check prefix
  999. rv = pptr->checkword(word, len, in_compound, needflag);
  1000. if (rv) {
  1001. pfx=pptr; // BUG: pfx not stateless
  1002. return rv;
  1003. }
  1004. }
  1005. pptr = pptr->getNextEQ();
  1006. } else {
  1007. pptr = pptr->getNextNE();
  1008. }
  1009. }
  1010. return NULL;
  1011. }
  1012. // check word for prefixes
  1013. struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
  1014. char in_compound, const FLAG needflag)
  1015. {
  1016. struct hentry * rv= NULL;
  1017. pfx = NULL;
  1018. sfxappnd = NULL;
  1019. // first handle the special case of 0 length prefixes
  1020. PfxEntry * pe = pStart[0];
  1021. while (pe) {
  1022. rv = pe->check_twosfx(word, len, in_compound, needflag);
  1023. if (rv) return rv;
  1024. pe = pe->getNext();
  1025. }
  1026. // now handle the general case
  1027. unsigned char sp = *((const unsigned char *)word);
  1028. PfxEntry * pptr = pStart[sp];
  1029. while (pptr) {
  1030. if (isSubset(pptr->getKey(),word)) {
  1031. rv = pptr->check_twosfx(word, len, in_compound, needflag);
  1032. if (rv) {
  1033. pfx = pptr;
  1034. return rv;
  1035. }
  1036. pptr = pptr->getNextEQ();
  1037. } else {
  1038. pptr = pptr->getNextNE();
  1039. }
  1040. }
  1041. return NULL;
  1042. }
  1043. // check word for prefixes
  1044. char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
  1045. const FLAG needflag)
  1046. {
  1047. char * st;
  1048. char result[MAXLNLEN];
  1049. result[0] = '\0';
  1050. pfx = NULL;
  1051. sfxappnd = NULL;
  1052. // first handle the special case of 0 length prefixes
  1053. PfxEntry * pe = pStart[0];
  1054. while (pe) {
  1055. st = pe->check_morph(word,len,in_compound, needflag);
  1056. if (st) {
  1057. mystrcat(result, st, MAXLNLEN);
  1058. free(st);
  1059. }
  1060. // if (rv) return rv;
  1061. pe = pe->getNext();
  1062. }
  1063. // now handle the general case
  1064. unsigned char sp = *((const unsigned char *)word);
  1065. PfxEntry * pptr = pStart[sp];
  1066. while (pptr) {
  1067. if (isSubset(pptr->getKey(),word)) {
  1068. st = pptr->check_morph(word,len,in_compound, needflag);
  1069. if (st) {
  1070. // fogemorpheme
  1071. if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
  1072. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
  1073. mystrcat(result, st, MAXLNLEN);
  1074. pfx = pptr;
  1075. }
  1076. free(st);
  1077. }
  1078. pptr = pptr->getNextEQ();
  1079. } else {
  1080. pptr = pptr->getNextNE();
  1081. }
  1082. }
  1083. if (*result) return mystrdup(result);
  1084. return NULL;
  1085. }
  1086. // check word for prefixes
  1087. char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
  1088. char in_compound, const FLAG needflag)
  1089. {
  1090. char * st;
  1091. char result[MAXLNLEN];
  1092. result[0] = '\0';
  1093. pfx = NULL;
  1094. sfxappnd = NULL;
  1095. // first handle the special case of 0 length prefixes
  1096. PfxEntry * pe = pStart[0];
  1097. while (pe) {
  1098. st = pe->check_twosfx_morph(word,len,in_compound, needflag);
  1099. if (st) {
  1100. mystrcat(result, st, MAXLNLEN);
  1101. free(st);
  1102. }
  1103. pe = pe->getNext();
  1104. }
  1105. // now handle the general case
  1106. unsigned char sp = *((const unsigned char *)word);
  1107. PfxEntry * pptr = pStart[sp];
  1108. while (pptr) {
  1109. if (isSubset(pptr->getKey(),word)) {
  1110. st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
  1111. if (st) {
  1112. mystrcat(result, st, MAXLNLEN);
  1113. free(st);
  1114. pfx = pptr;
  1115. }
  1116. pptr = pptr->getNextEQ();
  1117. } else {
  1118. pptr = pptr->getNextNE();
  1119. }
  1120. }
  1121. if (*result) return mystrdup(result);
  1122. return NULL;
  1123. }
  1124. // Is word a non compound with a REP substitution (see checkcompoundrep)?
  1125. int AffixMgr::cpdrep_check(const char * word, int wl)
  1126. {
  1127. char candidate[MAXLNLEN];
  1128. const char * r;
  1129. int lenr, lenp;
  1130. if ((wl < 2) || !numrep) return 0;
  1131. for (int i=0; i < numrep; i++ ) {
  1132. r = word;
  1133. lenr = strlen(reptable[i].pattern2);
  1134. lenp = strlen(reptable[i].pattern);
  1135. // search every occurence of the pattern in the word
  1136. while ((r=strstr(r, reptable[i].pattern)) != NULL) {
  1137. strcpy(candidate, word);
  1138. if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
  1139. strcpy(candidate+(r-word),reptable[i].pattern2);
  1140. strcpy(candidate+(r-word)+lenr, r+lenp);
  1141. if (candidate_check(candidate,strlen(candidate))) return 1;
  1142. r++; // search for the next letter
  1143. }
  1144. }
  1145. return 0;
  1146. }
  1147. // forbid compoundings when there are special patterns at word bound
  1148. int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char affixed)
  1149. {
  1150. int len;
  1151. for (int i = 0; i < numcheckcpd; i++) {
  1152. if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
  1153. (!r1 || !checkcpdtable[i].cond ||
  1154. (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
  1155. (!r2 || !checkcpdtable[i].cond2 ||
  1156. (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
  1157. // zero length pattern => only TESTAFF
  1158. // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
  1159. (!*(checkcpdtable[i].pattern) || (
  1160. (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
  1161. (*(checkcpdtable[i].pattern)!='0' && (len = strlen(checkcpdtable[i].pattern)) &&
  1162. strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {
  1163. return 1;
  1164. }
  1165. }
  1166. return 0;
  1167. }
  1168. // forbid compounding with neighbouring upper and lower case characters at word bounds
  1169. int AffixMgr::cpdcase_check(const char * word, int pos)
  1170. {
  1171. if (utf8) {
  1172. w_char u, w;
  1173. const char * p;
  1174. u8_u16(&u, 1, word + pos);
  1175. for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
  1176. u8_u16(&w, 1, p);
  1177. unsigned short a = (u.h << 8) + u.l;
  1178. unsigned short b = (w.h << 8) + w.l;
  1179. if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
  1180. (a != '-') && (b != '-')) return 1;
  1181. } else {
  1182. unsigned char a = *(word + pos - 1);
  1183. unsigned char b = *(word + pos);
  1184. if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  1185. }
  1186. return 0;
  1187. }
  1188. // check compound patterns
  //
  // Tests whether the component entries collected in *words, extended by `rv`
  // at index `wnum`, match one of the patterns in defcpdtable. A pattern is a
  // sequence of flags in which an item may be followed by '*' or '?'.
  //   words - in/out: address of the component array. When *words is NULL the
  //           buffer `def` is installed instead; it is uninstalled again
  //           (*words reset to NULL) if the check fails.
  //   wnum  - index at which rv is stored in the array
  //   rv    - entry of the newest component; rejected when it has no flags
  //   def   - fallback buffer used when *words is NULL; if it is NULL too,
  //           the function returns 0 immediately
  //   all   - zero: a successful partial match is enough; nonzero: the pattern
  //           must be consumed completely (trailing "x*" / "x?" items may stay
  //           unmatched)
  // Returns 1 on a match (rv stays stored at index wnum), otherwise 0 with the
  // slot at index wnum cleared.
  1189. int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
  1190. {
  1191. signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  1192. signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  1193. int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
  1194. short bt = 0;
  1195. int i, j;
  1196. int ok;
  1197. int w = 0;
  // w remembers that the fallback buffer was installed by this call
  1198. if (!*words) {
  1199. w = 1;
  1200. *words = def;
  1201. }
  1202. if (!*words) {
  1203. return 0;
  1204. }
  1205. (*words)[wnum] = rv;
  1206. // has the last word COMPOUNDRULE flag?
  1207. if (rv->alen == 0) {
  1208. (*words)[wnum] = NULL;
  1209. if (w) *words = NULL;
  1210. return 0;
  1211. }
  // quick rejection: rv must carry at least one flag that occurs (as a
  // non-metacharacter item) somewhere in some pattern
  1212. ok = 0;
  1213. for (i = 0; i < numdefcpd; i++) {
  1214. for (j = 0; j < defcpdtable[i].len; j++) {
  1215. if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
  1216. TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1;
  1217. }
  1218. }
  1219. if (ok == 0) {
  1220. (*words)[wnum] = NULL;
  1221. if (w) *words = NULL;
  1222. return 0;
  1223. }
  // full match: try every pattern against words[0..wnum]
  1224. for (i = 0; i < numdefcpd; i++) {
  1225. signed short pp = 0; // pattern position
  1226. signed short wp = 0; // "words" position
  1227. int ok2;
  1228. ok = 1;
  1229. ok2 = 1;
  1230. do {
  1231. while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
  // item followed by '*' or '?': consume greedily (at most one word for '?')
  // and record the position so the count can be reduced when backtracking
  1232. if (((pp+1) < defcpdtable[i].len) &&
  1233. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
  1234. int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
  1235. ok2 = 1;
  1236. pp+=2;
  1237. btpp[bt] = pp;
  1238. btwp[bt] = wp;
  1239. while (wp <= wend) {
  1240. if (!(*words)[wp]->alen ||
  1241. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
  1242. ok2 = 0;
  1243. break;
  1244. }
  1245. wp++;
  1246. }
  1247. if (wp <= wnum) ok2 = 0;
  1248. btnum[bt] = wp - btwp[bt];
  1249. if (btnum[bt] > 0) bt++;
  1250. if (ok2) break;
  1251. } else {
  // plain item: the current word must carry exactly this flag
  1252. ok2 = 1;
  1253. if (!(*words)[wp] || !(*words)[wp]->alen ||
  1254. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
  1255. ok = 0;
  1256. break;
  1257. }
  1258. pp++;
  1259. wp++;
  // pattern exhausted while words remain => no match on this path
  1260. if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
  1261. }
  1262. }
  // success on this path if only optional ("x*" / "x?") items remain
  1263. if (ok && ok2) {
  1264. int r = pp;
  1265. while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
  1266. ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
  1267. if (defcpdtable[i].len <= r) return 1;
  1268. }
  1269. // backtrack
  // give back one word from the most recent metacharacter; drop stack
  // entries whose count has gone below zero
  1270. if (bt) do {
  1271. ok = 1;
  1272. btnum[bt - 1]--;
  1273. pp = btpp[bt - 1];
  1274. wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
  1275. } while ((btnum[bt - 1] < 0) && --bt);
  1276. } while (bt);
  1277. if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  1278. // check zero ending
  1279. while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
  1280. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  1281. if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  1282. }
  // no pattern matched: undo the provisional store
  1283. (*words)[wnum] = NULL;
  1284. if (w) *words = NULL;
  1285. return 0;
  1286. }
  1287. inline int AffixMgr::candidate_check(const char * word, int len)
  1288. {
  1289. struct hentry * rv=NULL;
  1290. rv = lookup(word);
  1291. if (rv) return 1;
  1292. // rv = prefix_check(word,len,1);
  1293. // if (rv) return 1;
  1294. rv = affix_check(word,len);
  1295. if (rv) return 1;
  1296. return 0;
  1297. }
  1298. // calculate number of syllable for compound-checking
  1299. short AffixMgr::get_syllable(const char * word, int wlen)
  1300. {
  1301. if (cpdmaxsyllable==0) return 0;
  1302. short num=0;
  1303. if (!utf8) {
  1304. for (int i=0; i<wlen; i++) {
  1305. if (strchr(cpdvowels, word[i])) num++;
  1306. }
  1307. } else if (cpdvowels_utf16) {
  1308. w_char w[MAXWORDUTF8LEN];
  1309. int i = u8_u16(w, MAXWORDUTF8LEN, word);
  1310. for (; i > 0; i--) {
  1311. if (flag_bsearch((unsigned short *) cpdvowels_utf16,
  1312. ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
  1313. }
  1314. }
  1315. return num;
  1316. }
  1317. void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
  1318. if (utf8) {
  1319. int i;
  1320. for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
  1321. for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
  1322. }
  1323. for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
  1324. for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
  1325. }
  1326. } else {
  1327. *cmin = cpdmin;
  1328. *cmax = len - cpdmin + 1;
  1329. }
  1330. }
  1331. // check if compound word is correctly spelled
  1332. // hu_mov_rule = spec. Hungarian rule (XXX)
  //
  // Tries every split offset i in [cmin, cmax) of `word`: the part before i
  // must be acceptable as a first component, and the rest must be acceptable
  // as a last component (as a root, as an affixed form, or - recursively - as
  // a compound itself).
  //   word, len    - candidate compound and its length (i indexes its bytes)
  //   wordnum      - component counter; 0 selects the compoundbegin tests,
  //                  > 0 the compoundmiddle tests; recursion happens only
  //                  while wordnum < maxwordnum
  //   numsyllable  - running syllable count used by the cpdmaxsyllable test
  //   maxwordnum   - bound for wordnum (see above)
  //   wnum         - index of the current component in `words`
  //   words        - component array handed to defcpd_check(); when it is
  //                  non-NULL on entry only the defcpd_check based pass runs
  //   hu_mov_rule  - enables the special Hungarian path; several flag tests
  //                  are skipped when it is set
  //   is_sug       - when set, entries carrying the nosuggest flag are refused
  //   info         - optional; *info & SPELL_ORIGCAP is consulted by the
  //                  forceucase tests
  // Returns the entry of the first component (rv_first) on success and NULL
  // otherwise.
  // Side effects: overwrites the members pfx, sfx and sfxflag.
  1333. struct hentry * AffixMgr::compound_check(const char * word, int len,
  1334. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
  1335. char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)
  1336. {
  1337. int i;
  1338. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1339. struct hentry * rv = NULL;
  1340. struct hentry * rv_first;
  1341. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1342. char st [MAXWORDUTF8LEN + 4];
  1343. char ch = '\0';
  1344. int cmin;
  1345. int cmax;
  1346. int striple = 0;
  1347. int scpd = 0;
  1348. int soldi = 0;
  1349. int oldcmin = 0;
  1350. int oldcmax = 0;
  1351. int oldlen = 0;
  1352. int checkedstriple = 0;
  1353. int onlycpdrule;
  1354. int affixed = 0;
  1355. hentry ** oldwords = words;
  1356. int checked_prefix;
  // cmin / cmax: range of split offsets, derived from cpdmin
  1357. setcminmax(&cmin, &cmax, word, len);
  // NOTE(review): word is copied into the fixed-size buffer st without a
  // length check in this function - presumably callers bound the word
  // length; confirm.
  1358. strcpy(st, word);
  1359. for (i = cmin; i < cmax; i++) {
  1360. // go to end of the UTF-8 character
  1361. if (utf8) {
  1362. for (; (st[i] & 0xc0) == 0x80; i++);
  1363. if (i >= cmax) return NULL;
  1364. }
  1365. words = oldwords;
  1366. onlycpdrule = (words) ? 1 : 0;
  1367. do { // onlycpdrule loop
  1368. oldnumsyllable = numsyllable;
  1369. oldwordnum = wordnum;
  1370. checked_prefix = 0;
  1371. do { // simplified checkcompoundpattern loop
  // scpd > 0: find the next checkcpdtable entry whose pattern3 occurs at the
  // split offset and rewrite st so that pattern3 is replaced by
  // pattern + pattern2, with the split placed between the two; i, len, cmin
  // and cmax are saved so they can be restored afterwards
  1372. if (scpd > 0) {
  1373. for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
  1374. strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);
  1375. if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
  1376. strcpy(st + i, checkcpdtable[scpd-1].pattern);
  1377. soldi = i;
  1378. i += strlen(checkcpdtable[scpd-1].pattern);
  1379. strcpy(st + i, checkcpdtable[scpd-1].pattern2);
  1380. strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));
  1381. oldlen = len;
  1382. len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
  1383. oldcmin = cmin;
  1384. oldcmax = cmax;
  1385. setcminmax(&cmin, &cmax, st, len);
  1386. cmax = len - cpdmin + 1;
  1387. }
  // cut st at the split offset so that st is the first component; the
  // overwritten byte is kept in ch and put back later
  1388. ch = st[i];
  1389. st[i] = '\0';
  1390. sfx = NULL;
  1391. pfx = NULL;
  1392. // FIRST WORD
  1393. affixed = 1;
  1394. rv = lookup(st); // perhaps without prefix
  1395. // search homonym with compound flag
  1396. while ((rv) && !hu_mov_rule &&
  1397. ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1398. !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1399. (compoundbegin && !wordnum && !onlycpdrule &&
  1400. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1401. (compoundmiddle && wordnum && !words && !onlycpdrule &&
  1402. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1403. (numdefcpd && onlycpdrule &&
  1404. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1405. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) ||
  1406. (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL &&
  1407. !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)))
  1408. ) {
  1409. rv = rv->next_homonym;
  1410. }
  // affixed stays 1 only if the first component was not found as a plain
  // dictionary entry
  1411. if (rv) affixed = 0;
  1412. if (!rv) {
  1413. if (onlycpdrule) break;
  1414. if (compoundflag &&
  1415. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1416. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1417. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1418. sfx->getCont() &&
  1419. ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
  1420. sfx->getContLen())) || (compoundend &&
  1421. TESTAFF(sfx->getCont(), compoundend,
  1422. sfx->getContLen())))) {
  1423. rv = NULL;
  1424. }
  1425. }
  1426. if (rv ||
  1427. (((wordnum == 0) && compoundbegin &&
  1428. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1429. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1430. ((wordnum > 0) && compoundmiddle &&
  1431. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1432. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1433. ) checked_prefix = 1;
  1434. // else check forbiddenwords and needaffix
  1435. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1436. TESTAFF(rv->astr, needaffix, rv->alen) ||
  1437. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
  1438. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
  1439. )) {
  1440. st[i] = ch;
  1441. //continue;
  1442. break;
  1443. }
  1444. // check non_compound flag in suffix and prefix
  1445. if ((rv) && !hu_mov_rule &&
  1446. ((pfx && pfx->getCont() &&
  1447. TESTAFF(pfx->getCont(), compoundforbidflag,
  1448. pfx->getContLen())) ||
  1449. (sfx && sfx->getCont() &&
  1450. TESTAFF(sfx->getCont(), compoundforbidflag,
  1451. sfx->getContLen())))) {
  1452. rv = NULL;
  1453. }
  1454. // check compoundend flag in suffix and prefix
  1455. if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
  1456. ((pfx && pfx->getCont() &&
  1457. TESTAFF(pfx->getCont(), compoundend,
  1458. pfx->getContLen())) ||
  1459. (sfx && sfx->getCont() &&
  1460. TESTAFF(sfx->getCont(), compoundend,
  1461. sfx->getContLen())))) {
  1462. rv = NULL;
  1463. }
  1464. // check compoundmiddle flag in suffix and prefix
  1465. if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
  1466. ((pfx && pfx->getCont() &&
  1467. TESTAFF(pfx->getCont(), compoundmiddle,
  1468. pfx->getContLen())) ||
  1469. (sfx && sfx->getCont() &&
  1470. TESTAFF(sfx->getCont(), compoundmiddle,
  1471. sfx->getContLen())))) {
  1472. rv = NULL;
  1473. }
  1474. // check forbiddenwords
  1475. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1476. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
  1477. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
  1478. return NULL;
  1479. }
  1480. // increment word number, if the second root has a compoundroot flag
  1481. if ((rv) && compoundroot &&
  1482. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1483. wordnum++;
  1484. }
  1485. // first word is acceptable in compound words?
  1486. if (((rv) &&
  1487. ( checked_prefix || (words && words[wnum]) ||
  1488. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1489. ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1490. ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
  1491. // (numdefcpd && )
  1492. // LANG_hu section: spec. Hungarian rule
  1493. || ((langnum == LANG_hu) && hu_mov_rule && (
  1494. TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
  1495. TESTAFF(rv->astr, 'G', rv->alen) ||
  1496. TESTAFF(rv->astr, 'H', rv->alen)
  1497. )
  1498. )
  1499. // END of LANG_hu section
  1500. ) &&
  1501. (
  1502. // test CHECKCOMPOUNDPATTERN conditions
  1503. scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL ||
  1504. TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)
  1505. )
  1506. && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters
  1507. (word[i-1]==word[i]) && (
  1508. ((i>1) && (word[i-1]==word[i-2])) ||
  1509. ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
  1510. )
  1511. ) ||
  1512. (
  1513. checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i)
  1514. ))
  1515. )
  1516. // LANG_hu section: spec. Hungarian rule
  1517. || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
  1518. (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes
  1519. TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
  1520. TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
  1521. )
  1522. )
  1523. )
  1524. ) { // first word is ok condition
  1525. // LANG_hu section: spec. Hungarian rule
  1526. if (langnum == LANG_hu) {
  1527. // calculate syllable number of the word
  1528. numsyllable += get_syllable(st, i);
  1529. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1530. if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
  1531. }
  1532. // END of LANG_hu section
  1533. // NEXT WORD(S)
  1534. rv_first = rv;
  // un-cut st: from here on st + i is the remainder of the word
  1535. st[i] = ch;
  1536. do { // striple loop
  1537. // check simplifiedtriple
  1538. if (simplifiedtriple) {
  1539. if (striple) {
  1540. checkedstriple = 1;
  1541. i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
  1542. } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1;
  1543. }
  1544. rv = lookup((st+i)); // perhaps without prefix
  1545. // search homonym with compound flag
  1546. while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1547. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1548. (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
  1549. (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) ||
  1550. (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL &&
  1551. !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
  1552. )) {
  1553. rv = rv->next_homonym;
  1554. }
  1555. // check FORCEUCASE
  1556. if (rv && forceucase && (rv) &&
  1557. (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
  1558. if (rv && words && words[wnum + 1]) return rv_first;
  1559. oldnumsyllable2 = numsyllable;
  1560. oldwordnum2 = wordnum;
  1561. // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
  1562. if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
  1563. numsyllable--;
  1564. }
  1565. // END of LANG_hu section
  1566. // increment word number, if the second root has a compoundroot flag
  1567. if ((rv) && (compoundroot) &&
  1568. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1569. wordnum++;
  1570. }
  1571. // check forbiddenwords
  1572. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1573. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
  1574. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
  1575. // second word is acceptable, as a root?
  1576. // hungarian conventions: compounding is acceptable,
  1577. // when compound forms consist of 2 words, or if more,
  1578. // then the syllable number of root words must be 6, or lesser.
  1579. if ((rv) && (
  1580. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1581. (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
  1582. )
  1583. && (
  1584. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  1585. ((cpdmaxsyllable!=0) &&
  1586. (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable))
  1587. ) &&
  1588. (
  1589. // test CHECKCOMPOUNDPATTERN
  1590. !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv, 0)
  1591. ) &&
  1592. (
  1593. (!checkcompounddup || (rv != rv_first))
  1594. )
  1595. // test CHECKCOMPOUNDPATTERN conditions
  1596. && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
  1597. TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
  1598. )
  1599. {
  1600. // forbid compound word, if it is a non compound word with typical fault
  1601. if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
  1602. return rv_first;
  1603. }
  1604. numsyllable = oldnumsyllable2;
  1605. wordnum = oldwordnum2;
  1606. // perhaps second word has prefix or/and suffix
  1607. sfx = NULL;
  1608. sfxflag = FLAG_NULL;
  1609. rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
  1610. if (!rv && compoundend && !onlycpdrule) {
  1611. sfx = NULL;
  1612. pfx = NULL;
  1613. rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
  1614. }
  1615. if (!rv && numdefcpd && words) {
  1616. rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
  1617. if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first;
  1618. rv = NULL;
  1619. }
  1620. // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
  1621. if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
  1622. TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL;
  1623. // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
  1624. if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) rv = NULL;
  1625. // check non_compound flag in suffix and prefix
  1626. if ((rv) &&
  1627. ((pfx && pfx->getCont() &&
  1628. TESTAFF(pfx->getCont(), compoundforbidflag,
  1629. pfx->getContLen())) ||
  1630. (sfx && sfx->getCont() &&
  1631. TESTAFF(sfx->getCont(), compoundforbidflag,
  1632. sfx->getContLen())))) {
  1633. rv = NULL;
  1634. }
  1635. // check FORCEUCASE
  1636. if (rv && forceucase && (rv) &&
  1637. (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
  1638. // check forbiddenwords
  1639. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1640. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
  1641. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
  1642. // pfxappnd = prefix of word+i, or NULL
  1643. // calculate syllable number of prefix.
  1644. // hungarian convention: when syllable number of prefix is more,
  1645. // than 1, the prefix+word counts as two words.
  1646. if (langnum == LANG_hu) {
  1647. // calculate syllable number of the word
  1648. numsyllable += get_syllable(word + i, strlen(word + i));
  1649. // - affix syllable num.
  1650. // XXX only second suffix (inflections, not derivations)
  1651. if (sfxappnd) {
  1652. char * tmp = myrevstrdup(sfxappnd);
  1653. numsyllable -= get_syllable(tmp, strlen(tmp));
  1654. free(tmp);
  1655. }
  1656. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1657. if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
  1658. // increment syllable num, if last word has a SYLLABLENUM flag
  1659. // and the suffix is beginning `s'
  1660. if (cpdsyllablenum) {
  1661. switch (sfxflag) {
  1662. case 'c': { numsyllable+=2; break; }
  1663. case 'J': { numsyllable += 1; break; }
  1664. case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
  1665. }
  1666. }
  1667. }
  1668. // increment word number, if the second word has a compoundroot flag
  1669. if ((rv) && (compoundroot) &&
  1670. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1671. wordnum++;
  1672. }
  1673. // second word is acceptable, as a word with prefix or/and suffix?
  1674. // hungarian conventions: compounding is acceptable,
  1675. // when compound forms consist 2 word, otherwise
  1676. // the syllable number of root words is 6, or lesser.
  1677. if ((rv) &&
  1678. (
  1679. ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
  1680. ((cpdmaxsyllable != 0) &&
  1681. (numsyllable <= cpdmaxsyllable))
  1682. )
  1683. && (
  1684. (!checkcompounddup || (rv != rv_first))
  1685. )) {
  1686. // forbid compound word, if it is a non compound word with typical fault
  1687. if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
  1688. return rv_first;
  1689. }
  1690. numsyllable = oldnumsyllable2;
  1691. wordnum = oldwordnum2;
  1692. // perhaps second word is a compound word (recursive call)
  1693. if (wordnum < maxwordnum) {
  1694. rv = compound_check((st+i),strlen(st+i), wordnum+1,
  1695. numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info);
  1696. if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) ||
  1697. (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))) rv = NULL;
  1698. } else {
  1699. rv=NULL;
  1700. }
  1701. if (rv) {
  1702. // forbid compound word, if it is a non compound word with typical fault
  1703. if (checkcompoundrep || forbiddenword) {
  1704. struct hentry * rv2 = NULL;
  1705. if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
  1706. // check first part
  1707. if (strncmp(rv->word, word + i, rv->blen) == 0) {
  // temporarily terminate st after the first two components; the
  // overwritten byte is kept in r and restored below
  1708. char r = *(st + i + rv->blen);
  1709. *(st + i + rv->blen) = '\0';
  1710. if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) {
  1711. *(st + i + rv->blen) = r;
  1712. continue;
  1713. }
  1714. if (forbiddenword) {
  1715. rv2 = lookup(word);
  1716. if (!rv2) rv2 = affix_check(word, len);
  1717. if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
  1718. (strncmp(rv2->word, st, i + rv->blen) == 0)) {
  1719. return NULL;
  1720. }
  1721. }
  1722. *(st + i + rv->blen) = r;
  1723. }
  1724. }
  1725. return rv_first;
  1726. }
  1727. } while (striple && !checkedstriple); // end of striple loop
  // undo the i-- performed for the simplified-triple retry
  1728. if (checkedstriple) {
  1729. i++;
  1730. checkedstriple = 0;
  1731. striple = 0;
  1732. }
  1733. } // first word is ok condition
  // undo the simplified-pattern rewrite: restore i, len, cmin and cmax
  1734. if (soldi != 0) {
  1735. i = soldi;
  1736. soldi = 0;
  1737. len = oldlen;
  1738. cmin = oldcmin;
  1739. cmax = oldcmax;
  1740. }
  1741. scpd++;
  1742. } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop
  // restore the counters and the buffer before the next pass / split offset
  1743. scpd = 0;
  1744. wordnum = oldwordnum;
  1745. numsyllable = oldnumsyllable;
  1746. if (soldi != 0) {
  1747. i = soldi;
  1748. strcpy(st, word); // XXX add more optim.
  1749. soldi = 0;
  1750. } else st[i] = ch;
  // the assignment inside the condition is intentional: it switches to the
  // defcpd_check-only pass for the one extra iteration
  1751. } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
  1752. }
  1753. return NULL;
  1754. }
  1755. // check if compound word is correctly spelled
  1756. // hu_mov_rule = spec. Hungarian rule (XXX)
  1757. int AffixMgr::compound_check_morph(const char * word, int len,
  1758. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
  1759. char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
  1760. {
  1761. int i;
  1762. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1763. int ok = 0;
  1764. struct hentry * rv = NULL;
  1765. struct hentry * rv_first;
  1766. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1767. char st [MAXWORDUTF8LEN + 4];
  1768. char ch;
  1769. int checked_prefix;
  1770. char presult[MAXLNLEN];
  1771. int cmin;
  1772. int cmax;
  1773. int onlycpdrule;
  1774. int affixed = 0;
  1775. hentry ** oldwords = words;
  1776. setcminmax(&cmin, &cmax, word, len);
  1777. strcpy(st, word);
  1778. for (i = cmin; i < cmax; i++) {
  1779. oldnumsyllable = numsyllable;
  1780. oldwordnum = wordnum;
  1781. checked_prefix = 0;
  1782. // go to end of the UTF-8 character
  1783. if (utf8) {
  1784. for (; (st[i] & 0xc0) == 0x80; i++);
  1785. if (i >= cmax) return 0;
  1786. }
  1787. words = oldwords;
  1788. onlycpdrule = (words) ? 1 : 0;
  1789. do { // onlycpdrule loop
  1790. oldnumsyllable = numsyllable;
  1791. oldwordnum = wordnum;
  1792. checked_prefix = 0;
  1793. ch = st[i];
  1794. st[i] = '\0';
  1795. sfx = NULL;
  1796. // FIRST WORD
  1797. affixed = 1;
  1798. *presult = '\0';
  1799. if (partresult) mystrcat(presult, partresult, MAXLNLEN);
  1800. rv = lookup(st); // perhaps without prefix
  1801. // search homonym with compound flag
  1802. while ((rv) && !hu_mov_rule &&
  1803. ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1804. !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1805. (compoundbegin && !wordnum && !onlycpdrule &&
  1806. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1807. (compoundmiddle && wordnum && !words && !onlycpdrule &&
  1808. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1809. (numdefcpd && onlycpdrule &&
  1810. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1811. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
  1812. ))) {
  1813. rv = rv->next_homonym;
  1814. }
  1815. if (rv) affixed = 0;
  1816. if (rv) {
  1817. sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
  1818. if (!HENTRY_FIND(rv, MORPH_STEM)) {
  1819. sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st);
  1820. }
  1821. // store the pointer of the hash entry
  1822. // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
  1823. if (HENTRY_DATA(rv)) {
  1824. sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv));
  1825. }
  1826. }
  1827. if (!rv) {
  1828. if (onlycpdrule) break;
  1829. if (compoundflag &&
  1830. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1831. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1832. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1833. sfx->getCont() &&
  1834. ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
  1835. sfx->getContLen())) || (compoundend &&
  1836. TESTAFF(sfx->getCont(), compoundend,
  1837. sfx->getContLen())))) {
  1838. rv = NULL;
  1839. }
  1840. }
  1841. if (rv ||
  1842. (((wordnum == 0) && compoundbegin &&
  1843. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1844. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1845. ((wordnum > 0) && compoundmiddle &&
  1846. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1847. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1848. ) {
  1849. // char * p = prefix_check_morph(st, i, 0, compound);
  1850. char * p = NULL;
  1851. if (compoundflag) p = affix_check_morph(st, i, compoundflag);
  1852. if (!p || (*p == '\0')) {
  1853. if (p) free(p);
  1854. p = NULL;
  1855. if ((wordnum == 0) && compoundbegin) {
  1856. p = affix_check_morph(st, i, compoundbegin);
  1857. } else if ((wordnum > 0) && compoundmiddle) {
  1858. p = affix_check_morph(st, i, compoundmiddle);
  1859. }
  1860. }
  1861. if (p && (*p != '\0')) {
  1862. sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD,
  1863. MORPH_PART, st, line_uniq_app(&p, MSEP_REC));
  1864. }
  1865. if (p) free(p);
  1866. checked_prefix = 1;
  1867. }
  1868. // else check forbiddenwords
  1869. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1870. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
  1871. TESTAFF(rv->astr, needaffix, rv->alen))) {
  1872. st[i] = ch;
  1873. continue;
  1874. }
  1875. // check non_compound flag in suffix and prefix
  1876. if ((rv) && !hu_mov_rule &&
  1877. ((pfx && pfx->getCont() &&
  1878. TESTAFF(pfx->getCont(), compoundforbidflag,
  1879. pfx->getContLen())) ||
  1880. (sfx && sfx->getCont() &&
  1881. TESTAFF(sfx->getCont(), compoundforbidflag,
  1882. sfx->getContLen())))) {
  1883. continue;
  1884. }
  1885. // check compoundend flag in suffix and prefix
  1886. if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
  1887. ((pfx && pfx->getCont() &&
  1888. TESTAFF(pfx->getCont(), compoundend,
  1889. pfx->getContLen())) ||
  1890. (sfx && sfx->getCont() &&
  1891. TESTAFF(sfx->getCont(), compoundend,
  1892. sfx->getContLen())))) {
  1893. continue;
  1894. }
  1895. // check compoundmiddle flag in suffix and prefix
  1896. if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
  1897. ((pfx && pfx->getCont() &&
  1898. TESTAFF(pfx->getCont(), compoundmiddle,
  1899. pfx->getContLen())) ||
  1900. (sfx && sfx->getCont() &&
  1901. TESTAFF(sfx->getCont(), compoundmiddle,
  1902. sfx->getContLen())))) {
  1903. rv = NULL;
  1904. }
  1905. // check forbiddenwords
  1906. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen)
  1907. || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue;
  1908. // increment word number, if the second root has a compoundroot flag
  1909. if ((rv) && (compoundroot) &&
  1910. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1911. wordnum++;
  1912. }
  1913. // first word is acceptable in compound words?
  1914. if (((rv) &&
  1915. ( checked_prefix || (words && words[wnum]) ||
  1916. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1917. ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1918. ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
  1919. // LANG_hu section: spec. Hungarian rule
  1920. || ((langnum == LANG_hu) && // hu_mov_rule
  1921. hu_mov_rule && (
  1922. TESTAFF(rv->astr, 'F', rv->alen) ||
  1923. TESTAFF(rv->astr, 'G', rv->alen) ||
  1924. TESTAFF(rv->astr, 'H', rv->alen)
  1925. )
  1926. )
  1927. // END of LANG_hu section
  1928. )
  1929. && ! (( checkcompoundtriple && !words && // test triple letters
  1930. (word[i-1]==word[i]) && (
  1931. ((i>1) && (word[i-1]==word[i-2])) ||
  1932. ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
  1933. )
  1934. ) ||
  1935. (
  1936. // test CHECKCOMPOUNDPATTERN
  1937. numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affixed)
  1938. ) ||
  1939. (
  1940. checkcompoundcase && !words && cpdcase_check(word, i)
  1941. ))
  1942. )
  1943. // LANG_hu section: spec. Hungarian rule
  1944. || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
  1945. (sfx && sfx->getCont() && (
  1946. TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
  1947. TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
  1948. )
  1949. )
  1950. )
  1951. // END of LANG_hu section
  1952. ) {
  1953. // LANG_hu section: spec. Hungarian rule
  1954. if (langnum == LANG_hu) {
  1955. // calculate syllable number of the word
  1956. numsyllable += get_syllable(st, i);
  1957. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1958. if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
  1959. }
  1960. // END of LANG_hu section
  1961. // NEXT WORD(S)
  1962. rv_first = rv;
  1963. rv = lookup((word+i)); // perhaps without prefix
  1964. // search homonym with compound flag
  1965. while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1966. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1967. (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
  1968. (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
  1969. rv = rv->next_homonym;
  1970. }
  1971. if (rv && words && words[wnum + 1]) {
  1972. mystrcat(*result, presult, MAXLNLEN);
  1973. mystrcat(*result, " ", MAXLNLEN);
  1974. mystrcat(*result, MORPH_PART, MAXLNLEN);
  1975. mystrcat(*result, word+i, MAXLNLEN);
  1976. if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
  1977. if (!HENTRY_FIND(rv, MORPH_STEM)) {
  1978. mystrcat(*result, " ", MAXLNLEN);
  1979. mystrcat(*result, MORPH_STEM, MAXLNLEN);
  1980. mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
  1981. }
  1982. // store the pointer of the hash entry
  1983. // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
  1984. if (!complexprefixes && HENTRY_DATA(rv)) {
  1985. mystrcat(*result, " ", MAXLNLEN);
  1986. mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
  1987. }
  1988. mystrcat(*result, "\n", MAXLNLEN);
  1989. ok = 1;
  1990. return 0;
  1991. }
  1992. oldnumsyllable2 = numsyllable;
  1993. oldwordnum2 = wordnum;
  1994. // LANG_hu section: spec. Hungarian rule
  1995. if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
  1996. numsyllable--;
  1997. }
  1998. // END of LANG_hu section
  1999. // increment word number, if the second root has a compoundroot flag
  2000. if ((rv) && (compoundroot) &&
  2001. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  2002. wordnum++;
  2003. }
  2004. // check forbiddenwords
  2005. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  2006. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
  2007. st[i] = ch;
  2008. continue;
  2009. }
  2010. // second word is acceptable, as a root?
  2011. // hungarian conventions: compounding is acceptable,
  2012. // when compound forms consist of 2 words, or if more,
  2013. // then the syllable number of root words must be 6, or lesser.
  2014. if ((rv) && (
  2015. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  2016. (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
  2017. )
  2018. && (
  2019. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  2020. ((cpdmaxsyllable!=0) &&
  2021. (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable))
  2022. )
  2023. && (
  2024. (!checkcompounddup || (rv != rv_first))
  2025. )
  2026. )
  2027. {
  2028. // bad compound word
  2029. mystrcat(*result, presult, MAXLNLEN);
  2030. mystrcat(*result, " ", MAXLNLEN);
  2031. mystrcat(*result, MORPH_PART, MAXLNLEN);
  2032. mystrcat(*result, word+i, MAXLNLEN);
  2033. if (HENTRY_DATA(rv)) {
  2034. if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
  2035. if (! HENTRY_FIND(rv, MORPH_STEM)) {
  2036. mystrcat(*result, " ", MAXLNLEN);
  2037. mystrcat(*result, MORPH_STEM, MAXLNLEN);
  2038. mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
  2039. }
  2040. // store the pointer of the hash entry
  2041. // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
  2042. if (!complexprefixes) {
  2043. mystrcat(*result, " ", MAXLNLEN);
  2044. mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
  2045. }
  2046. }
  2047. mystrcat(*result, "\n", MAXLNLEN);
  2048. ok = 1;
  2049. }
  2050. numsyllable = oldnumsyllable2 ;
  2051. wordnum = oldwordnum2;
  2052. // perhaps second word has prefix or/and suffix
  2053. sfx = NULL;
  2054. sfxflag = FLAG_NULL;
  2055. if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
  2056. if (!rv && compoundend && !onlycpdrule) {
  2057. sfx = NULL;
  2058. pfx = NULL;
  2059. rv = affix_check((word+i),strlen(word+i), compoundend);
  2060. }
  2061. if (!rv && numdefcpd && words) {
  2062. rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
  2063. if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
  2064. char * m = NULL;
  2065. if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
  2066. if ((!m || *m == '\0') && compoundend) {
  2067. if (m) free(m);
  2068. m = affix_check_morph((word+i),strlen(word+i), compoundend);
  2069. }
  2070. mystrcat(*result, presult, MAXLNLEN);
  2071. if (m || (*m != '\0')) {
  2072. sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
  2073. MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
  2074. }
  2075. if (m) free(m);
  2076. mystrcat(*result, "\n", MAXLNLEN);
  2077. ok = 1;
  2078. }
  2079. }
  2080. // check non_compound flag in suffix and prefix
  2081. if ((rv) &&
  2082. ((pfx && pfx->getCont() &&
  2083. TESTAFF(pfx->getCont(), compoundforbidflag,
  2084. pfx->getContLen())) ||
  2085. (sfx && sfx->getCont() &&
  2086. TESTAFF(sfx->getCont(), compoundforbidflag,
  2087. sfx->getContLen())))) {
  2088. rv = NULL;
  2089. }
  2090. // check forbiddenwords
  2091. if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen) ||
  2092. TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))
  2093. && (! TESTAFF(rv->astr, needaffix, rv->alen))) {
  2094. st[i] = ch;
  2095. continue;
  2096. }
  2097. if (langnum == LANG_hu) {
  2098. // calculate syllable number of the word
  2099. numsyllable += get_syllable(word + i, strlen(word + i));
  2100. // - affix syllable num.
  2101. // XXX only second suffix (inflections, not derivations)
  2102. if (sfxappnd) {
  2103. char * tmp = myrevstrdup(sfxappnd);
  2104. numsyllable -= get_syllable(tmp, strlen(tmp));
  2105. free(tmp);
  2106. }
  2107. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  2108. if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
  2109. // increment syllable num, if last word has a SYLLABLENUM flag
  2110. // and the suffix is beginning `s'
  2111. if (cpdsyllablenum) {
  2112. switch (sfxflag) {
  2113. case 'c': { numsyllable+=2; break; }
  2114. case 'J': { numsyllable += 1; break; }
  2115. case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
  2116. }
  2117. }
  2118. }
  2119. // increment word number, if the second word has a compoundroot flag
  2120. if ((rv) && (compoundroot) &&
  2121. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  2122. wordnum++;
  2123. }
  2124. // second word is acceptable, as a word with prefix or/and suffix?
  2125. // hungarian conventions: compounding is acceptable,
  2126. // when compound forms consist 2 word, otherwise
  2127. // the syllable number of root words is 6, or lesser.
  2128. if ((rv) &&
  2129. (
  2130. ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
  2131. ((cpdmaxsyllable!=0) &&
  2132. (numsyllable <= cpdmaxsyllable))
  2133. )
  2134. && (
  2135. (!checkcompounddup || (rv != rv_first))
  2136. )) {
  2137. char * m = NULL;
  2138. if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
  2139. if ((!m || *m == '\0') && compoundend) {
  2140. if (m) free(m);
  2141. m = affix_check_morph((word+i),strlen(word+i), compoundend);
  2142. }
  2143. mystrcat(*result, presult, MAXLNLEN);
  2144. if (m && (*m != '\0')) {
  2145. sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
  2146. MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
  2147. }
  2148. if (m) free(m);
  2149. sprintf(*result + strlen(*result), "%c", MSEP_REC);
  2150. ok = 1;
  2151. }
  2152. numsyllable = oldnumsyllable2;
  2153. wordnum = oldwordnum2;
  2154. // perhaps second word is a compound word (recursive call)
  2155. if ((wordnum < maxwordnum) && (ok == 0)) {
  2156. compound_check_morph((word+i),strlen(word+i), wordnum+1,
  2157. numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
  2158. } else {
  2159. rv=NULL;
  2160. }
  2161. }
  2162. st[i] = ch;
  2163. wordnum = oldwordnum;
  2164. numsyllable = oldnumsyllable;
  2165. } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
  2166. }
  2167. return 0;
  2168. }
  2169. // return 1 if s1 (reversed) is a leading subset of end of s2
  2170. /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
  2171. {
  2172. while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
  2173. s1++;
  2174. end_of_s2--;
  2175. len--;
  2176. }
  2177. return (*s1 == '\0');
  2178. }
  2179. */
  2180. inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
  2181. {
  2182. while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
  2183. s1++;
  2184. end_of_s2--;
  2185. len--;
  2186. }
  2187. return (*s1 == '\0');
  2188. }
  2189. // check word for suffixes
  2190. struct hentry * AffixMgr::suffix_check (const char * word, int len,
  2191. int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns,
  2192. const FLAG cclass, const FLAG needflag, char in_compound)
  2193. {
  2194. struct hentry * rv = NULL;
  2195. PfxEntry* ep = ppfx;
  2196. // first handle the special case of 0 length suffixes
  2197. SfxEntry * se = sStart[0];
  2198. while (se) {
  2199. if (!cclass || se->getCont()) {
  2200. // suffixes are not allowed in beginning of compounds
  2201. if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2202. // except when signed with compoundpermitflag flag
  2203. (se->getCont() && compoundpermitflag &&
  2204. TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
  2205. // no circumfix flag in prefix and suffix
  2206. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2207. circumfix, ep->getContLen())) &&
  2208. (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
  2209. // circumfix flag in prefix AND suffix
  2210. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2211. circumfix, ep->getContLen())) &&
  2212. (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
  2213. // fogemorpheme
  2214. (in_compound ||
  2215. !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
  2216. // needaffix on prefix or first suffix
  2217. (cclass ||
  2218. !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
  2219. (ppfx && !((ep->getCont()) &&
  2220. TESTAFF(ep->getCont(), needaffix,
  2221. ep->getContLen())))
  2222. )) {
  2223. rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
  2224. needflag, (in_compound ? 0 : onlyincompound));
  2225. if (rv) {
  2226. sfx=se; // BUG: sfx not stateless
  2227. return rv;
  2228. }
  2229. }
  2230. }
  2231. se = se->getNext();
  2232. }
  2233. // now handle the general case
  2234. if (len == 0) return NULL; // FULLSTRIP
  2235. unsigned char sp= *((const unsigned char *)(word + len - 1));
  2236. SfxEntry * sptr = sStart[sp];
  2237. while (sptr) {
  2238. if (isRevSubset(sptr->getKey(), word + len - 1, len)
  2239. ) {
  2240. // suffixes are not allowed in beginning of compounds
  2241. if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2242. // except when signed with compoundpermitflag flag
  2243. (sptr->getCont() && compoundpermitflag &&
  2244. TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
  2245. // no circumfix flag in prefix and suffix
  2246. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2247. circumfix, ep->getContLen())) &&
  2248. (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
  2249. // circumfix flag in prefix AND suffix
  2250. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2251. circumfix, ep->getContLen())) &&
  2252. (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
  2253. // fogemorpheme
  2254. (in_compound ||
  2255. !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
  2256. // needaffix on prefix or first suffix
  2257. (cclass ||
  2258. !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
  2259. (ppfx && !((ep->getCont()) &&
  2260. TESTAFF(ep->getCont(), needaffix,
  2261. ep->getContLen())))
  2262. )
  2263. ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
  2264. rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
  2265. maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
  2266. if (rv) {
  2267. sfx=sptr; // BUG: sfx not stateless
  2268. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2269. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2270. return rv;
  2271. }
  2272. }
  2273. sptr = sptr->getNextEQ();
  2274. } else {
  2275. sptr = sptr->getNextNE();
  2276. }
  2277. }
  2278. return NULL;
  2279. }
  2280. // check word for two-level suffixes
  2281. struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
  2282. int sfxopts, PfxEntry * ppfx, const FLAG needflag)
  2283. {
  2284. struct hentry * rv = NULL;
  2285. // first handle the special case of 0 length suffixes
  2286. SfxEntry * se = sStart[0];
  2287. while (se) {
  2288. if (contclasses[se->getFlag()])
  2289. {
  2290. rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
  2291. if (rv) return rv;
  2292. }
  2293. se = se->getNext();
  2294. }
  2295. // now handle the general case
  2296. if (len == 0) return NULL; // FULLSTRIP
  2297. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2298. SfxEntry * sptr = sStart[sp];
  2299. while (sptr) {
  2300. if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
  2301. if (contclasses[sptr->getFlag()])
  2302. {
  2303. rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
  2304. if (rv) {
  2305. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2306. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2307. return rv;
  2308. }
  2309. }
  2310. sptr = sptr->getNextEQ();
  2311. } else {
  2312. sptr = sptr->getNextNE();
  2313. }
  2314. }
  2315. return NULL;
  2316. }
  2317. char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
  2318. int sfxopts, PfxEntry * ppfx, const FLAG needflag)
  2319. {
  2320. char result[MAXLNLEN];
  2321. char result2[MAXLNLEN];
  2322. char result3[MAXLNLEN];
  2323. char * st;
  2324. result[0] = '\0';
  2325. result2[0] = '\0';
  2326. result3[0] = '\0';
  2327. // first handle the special case of 0 length suffixes
  2328. SfxEntry * se = sStart[0];
  2329. while (se) {
  2330. if (contclasses[se->getFlag()])
  2331. {
  2332. st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
  2333. if (st) {
  2334. if (ppfx) {
  2335. if (ppfx->getMorph()) {
  2336. mystrcat(result, ppfx->getMorph(), MAXLNLEN);
  2337. mystrcat(result, " ", MAXLNLEN);
  2338. } else debugflag(result, ppfx->getFlag());
  2339. }
  2340. mystrcat(result, st, MAXLNLEN);
  2341. free(st);
  2342. if (se->getMorph()) {
  2343. mystrcat(result, " ", MAXLNLEN);
  2344. mystrcat(result, se->getMorph(), MAXLNLEN);
  2345. } else debugflag(result, se->getFlag());
  2346. mystrcat(result, "\n", MAXLNLEN);
  2347. }
  2348. }
  2349. se = se->getNext();
  2350. }
  2351. // now handle the general case
  2352. if (len == 0) return NULL; // FULLSTRIP
  2353. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2354. SfxEntry * sptr = sStart[sp];
  2355. while (sptr) {
  2356. if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
  2357. if (contclasses[sptr->getFlag()])
  2358. {
  2359. st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
  2360. if (st) {
  2361. sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
  2362. if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
  2363. strcpy(result2, st);
  2364. free(st);
  2365. result3[0] = '\0';
  2366. if (sptr->getMorph()) {
  2367. mystrcat(result3, " ", MAXLNLEN);
  2368. mystrcat(result3, sptr->getMorph(), MAXLNLEN);
  2369. } else debugflag(result3, sptr->getFlag());
  2370. strlinecat(result2, result3);
  2371. mystrcat(result2, "\n", MAXLNLEN);
  2372. mystrcat(result, result2, MAXLNLEN);
  2373. }
  2374. }
  2375. sptr = sptr->getNextEQ();
  2376. } else {
  2377. sptr = sptr->getNextNE();
  2378. }
  2379. }
  2380. if (*result) return mystrdup(result);
  2381. return NULL;
  2382. }
  2383. char * AffixMgr::suffix_check_morph(const char * word, int len,
  2384. int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
  2385. {
  2386. char result[MAXLNLEN];
  2387. struct hentry * rv = NULL;
  2388. result[0] = '\0';
  2389. PfxEntry* ep = ppfx;
  2390. // first handle the special case of 0 length suffixes
  2391. SfxEntry * se = sStart[0];
  2392. while (se) {
  2393. if (!cclass || se->getCont()) {
  2394. // suffixes are not allowed in beginning of compounds
  2395. if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2396. // except when signed with compoundpermitflag flag
  2397. (se->getCont() && compoundpermitflag &&
  2398. TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
  2399. // no circumfix flag in prefix and suffix
  2400. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2401. circumfix, ep->getContLen())) &&
  2402. (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
  2403. // circumfix flag in prefix AND suffix
  2404. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2405. circumfix, ep->getContLen())) &&
  2406. (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
  2407. // fogemorpheme
  2408. (in_compound ||
  2409. !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
  2410. // needaffix on prefix or first suffix
  2411. (cclass ||
  2412. !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
  2413. (ppfx && !((ep->getCont()) &&
  2414. TESTAFF(ep->getCont(), needaffix,
  2415. ep->getContLen())))
  2416. )
  2417. ))
  2418. rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
  2419. while (rv) {
  2420. if (ppfx) {
  2421. if (ppfx->getMorph()) {
  2422. mystrcat(result, ppfx->getMorph(), MAXLNLEN);
  2423. mystrcat(result, " ", MAXLNLEN);
  2424. } else debugflag(result, ppfx->getFlag());
  2425. }
  2426. if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
  2427. if (! HENTRY_FIND(rv, MORPH_STEM)) {
  2428. mystrcat(result, " ", MAXLNLEN);
  2429. mystrcat(result, MORPH_STEM, MAXLNLEN);
  2430. mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
  2431. }
  2432. // store the pointer of the hash entry
  2433. // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
  2434. if (!complexprefixes && HENTRY_DATA(rv)) {
  2435. mystrcat(result, " ", MAXLNLEN);
  2436. mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
  2437. }
  2438. if (se->getMorph()) {
  2439. mystrcat(result, " ", MAXLNLEN);
  2440. mystrcat(result, se->getMorph(), MAXLNLEN);
  2441. } else debugflag(result, se->getFlag());
  2442. mystrcat(result, "\n", MAXLNLEN);
  2443. rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
  2444. }
  2445. }
  2446. se = se->getNext();
  2447. }
  2448. // now handle the general case
  2449. if (len == 0) return NULL; // FULLSTRIP
  2450. unsigned char sp = *((const unsigned char *)(word + len - 1));
  2451. SfxEntry * sptr = sStart[sp];
  2452. while (sptr) {
  2453. if (isRevSubset(sptr->getKey(), word + len - 1, len)
  2454. ) {
  2455. // suffixes are not allowed in beginning of compounds
  2456. if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
  2457. // except when signed with compoundpermitflag flag
  2458. (sptr->getCont() && compoundpermitflag &&
  2459. TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
  2460. // no circumfix flag in prefix and suffix
  2461. ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
  2462. circumfix, ep->getContLen())) &&
  2463. (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
  2464. // circumfix flag in prefix AND suffix
  2465. ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
  2466. circumfix, ep->getContLen())) &&
  2467. (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
  2468. // fogemorpheme
  2469. (in_compound ||
  2470. !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
  2471. // needaffix on first suffix
  2472. (cclass || !(sptr->getCont() &&
  2473. TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))
  2474. )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
  2475. while (rv) {
  2476. if (ppfx) {
  2477. if (ppfx->getMorph()) {
  2478. mystrcat(result, ppfx->getMorph(), MAXLNLEN);
  2479. mystrcat(result, " ", MAXLNLEN);
  2480. } else debugflag(result, ppfx->getFlag());
  2481. }
  2482. if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
  2483. if (! HENTRY_FIND(rv, MORPH_STEM)) {
  2484. mystrcat(result, " ", MAXLNLEN);
  2485. mystrcat(result, MORPH_STEM, MAXLNLEN);
  2486. mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
  2487. }
  2488. // store the pointer of the hash entry
  2489. // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
  2490. if (!complexprefixes && HENTRY_DATA(rv)) {
  2491. mystrcat(result, " ", MAXLNLEN);
  2492. mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
  2493. }
  2494. if (sptr->getMorph()) {
  2495. mystrcat(result, " ", MAXLNLEN);
  2496. mystrcat(result, sptr->getMorph(), MAXLNLEN);
  2497. } else debugflag(result, sptr->getFlag());
  2498. mystrcat(result, "\n", MAXLNLEN);
  2499. rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
  2500. }
  2501. sptr = sptr->getNextEQ();
  2502. } else {
  2503. sptr = sptr->getNextNE();
  2504. }
  2505. }
  2506. if (*result) return mystrdup(result);
  2507. return NULL;
  2508. }
  2509. // check if word with affixes is correctly spelled
  2510. struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
  2511. {
  2512. struct hentry * rv= NULL;
  2513. // check all prefixes (also crossed with suffixes if allowed)
  2514. rv = prefix_check(word, len, in_compound, needflag);
  2515. if (rv) return rv;
  2516. // if still not found check all suffixes
  2517. rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
  2518. if (havecontclass) {
  2519. sfx = NULL;
  2520. pfx = NULL;
  2521. if (rv) return rv;
  2522. // if still not found check all two-level suffixes
  2523. rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
  2524. if (rv) return rv;
  2525. // if still not found check all two-level suffixes
  2526. rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
  2527. }
  2528. return rv;
  2529. }
  2530. // check if word with affixes is correctly spelled
  2531. char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
  2532. {
  2533. char result[MAXLNLEN];
  2534. char * st = NULL;
  2535. *result = '\0';
  2536. // check all prefixes (also crossed with suffixes if allowed)
  2537. st = prefix_check_morph(word, len, in_compound);
  2538. if (st) {
  2539. mystrcat(result, st, MAXLNLEN);
  2540. free(st);
  2541. }
  2542. // if still not found check all suffixes
  2543. st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
  2544. if (st) {
  2545. mystrcat(result, st, MAXLNLEN);
  2546. free(st);
  2547. }
  2548. if (havecontclass) {
  2549. sfx = NULL;
  2550. pfx = NULL;
  2551. // if still not found check all two-level suffixes
  2552. st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
  2553. if (st) {
  2554. mystrcat(result, st, MAXLNLEN);
  2555. free(st);
  2556. }
  2557. // if still not found check all two-level suffixes
  2558. st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
  2559. if (st) {
  2560. mystrcat(result, st, MAXLNLEN);
  2561. free(st);
  2562. }
  2563. }
  2564. return mystrdup(result);
  2565. }
  2566. char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap,
  2567. unsigned short al, char * morph, char * targetmorph, int level)
  2568. {
  2569. // handle suffixes
  2570. char * stemmorph;
  2571. char * stemmorphcatpos;
  2572. char mymorph[MAXLNLEN];
  2573. if (!morph) return NULL;
  2574. // check substandard flag
  2575. if (TESTAFF(ap, substandard, al)) return NULL;
  2576. if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts);
  2577. // int targetcount = get_sfxcount(targetmorph);
  2578. // use input suffix fields, if exist
  2579. if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
  2580. stemmorph = mymorph;
  2581. strcpy(stemmorph, morph);
  2582. mystrcat(stemmorph, " ", MAXLNLEN);
  2583. stemmorphcatpos = stemmorph + strlen(stemmorph);
  2584. } else {
  2585. stemmorph = morph;
  2586. stemmorphcatpos = NULL;
  2587. }
  2588. for (int i = 0; i < al; i++) {
  2589. const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
  2590. SfxEntry * sptr = sFlag[c];
  2591. while (sptr) {
  2592. if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) ||
  2593. // don't generate forms with substandard affixes
  2594. !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
  2595. if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph());
  2596. else stemmorph = (char *) sptr->getMorph();
  2597. int cmp = morphcmp(stemmorph, targetmorph);
  2598. if (cmp == 0) {
  2599. char * newword = sptr->add(ts, wl);
  2600. if (newword) {
  2601. hentry * check = pHMgr->lookup(newword); // XXX extra dic
  2602. if (!check || !check->astr ||
  2603. !(TESTAFF(check->astr, forbiddenword, check->alen) ||
  2604. TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
  2605. return newword;
  2606. }
  2607. free(newword);
  2608. }
  2609. }
  2610. // recursive call for secondary suffixes
  2611. if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
  2612. // (get_sfxcount(stemmorph) < targetcount) &&
  2613. !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
  2614. char * newword = sptr->add(ts, wl);
  2615. if (newword) {
  2616. char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(),
  2617. sptr->getContLen(), stemmorph, targetmorph, 1);
  2618. if (newword2) {
  2619. free(newword);
  2620. return newword2;
  2621. }
  2622. free(newword);
  2623. newword = NULL;
  2624. }
  2625. }
  2626. }
  2627. sptr = sptr->getFlgNxt();
  2628. }
  2629. }
  2630. return NULL;
  2631. }
  2632. int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
  2633. int wl, const unsigned short * ap, unsigned short al, char * bad, int badl,
  2634. char * phon)
  2635. {
  2636. int nh=0;
  2637. // first add root word to list
  2638. if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
  2639. (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
  2640. wlst[nh].word = mystrdup(ts);
  2641. if (!wlst[nh].word) return 0;
  2642. wlst[nh].allow = (1 == 0);
  2643. wlst[nh].orig = NULL;
  2644. nh++;
  2645. // add special phonetic version
  2646. if (phon && (nh < maxn)) {
  2647. wlst[nh].word = mystrdup(phon);
  2648. if (!wlst[nh].word) return nh - 1;
  2649. wlst[nh].allow = (1 == 0);
  2650. wlst[nh].orig = mystrdup(ts);
  2651. if (!wlst[nh].orig) return nh - 1;
  2652. nh++;
  2653. }
  2654. }
  2655. // handle suffixes
  2656. for (int i = 0; i < al; i++) {
  2657. const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
  2658. SfxEntry * sptr = sFlag[c];
  2659. while (sptr) {
  2660. if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
  2661. (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
  2662. // check needaffix flag
  2663. !(sptr->getCont() && ((needaffix &&
  2664. TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
  2665. (circumfix &&
  2666. TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
  2667. (onlyincompound &&
  2668. TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
  2669. ) {
  2670. char * newword = sptr->add(ts, wl);
  2671. if (newword) {
  2672. if (nh < maxn) {
  2673. wlst[nh].word = newword;
  2674. wlst[nh].allow = sptr->allowCross();
  2675. wlst[nh].orig = NULL;
  2676. nh++;
  2677. // add special phonetic version
  2678. if (phon && (nh < maxn)) {
  2679. char st[MAXWORDUTF8LEN];
  2680. strcpy(st, phon);
  2681. strcat(st, sptr->getKey());
  2682. reverseword(st + strlen(phon));
  2683. wlst[nh].word = mystrdup(st);
  2684. if (!wlst[nh].word) return nh - 1;
  2685. wlst[nh].allow = (1 == 0);
  2686. wlst[nh].orig = mystrdup(newword);
  2687. if (!wlst[nh].orig) return nh - 1;
  2688. nh++;
  2689. }
  2690. } else {
  2691. free(newword);
  2692. }
  2693. }
  2694. }
  2695. sptr = sptr->getFlgNxt();
  2696. }
  2697. }
  2698. int n = nh;
  2699. // handle cross products of prefixes and suffixes
  2700. for (int j=1;j<n ;j++)
  2701. if (wlst[j].allow) {
  2702. for (int k = 0; k < al; k++) {
  2703. const unsigned char c = (unsigned char) (ap[k] & 0x00FF);
  2704. PfxEntry * cptr = pFlag[c];
  2705. while (cptr) {
  2706. if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
  2707. (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
  2708. int l1 = strlen(wlst[j].word);
  2709. char * newword = cptr->add(wlst[j].word, l1);
  2710. if (newword) {
  2711. if (nh < maxn) {
  2712. wlst[nh].word = newword;
  2713. wlst[nh].allow = cptr->allowCross();
  2714. wlst[nh].orig = NULL;
  2715. nh++;
  2716. } else {
  2717. free(newword);
  2718. }
  2719. }
  2720. }
  2721. cptr = cptr->getFlgNxt();
  2722. }
  2723. }
  2724. }
  2725. // now handle pure prefixes
  2726. for (int m = 0; m < al; m ++) {
  2727. const unsigned char c = (unsigned char) (ap[m] & 0x00FF);
  2728. PfxEntry * ptr = pFlag[c];
  2729. while (ptr) {
  2730. if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
  2731. (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
  2732. // check needaffix flag
  2733. !(ptr->getCont() && ((needaffix &&
  2734. TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
  2735. (circumfix &&
  2736. TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
  2737. (onlyincompound &&
  2738. TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
  2739. ) {
  2740. char * newword = ptr->add(ts, wl);
  2741. if (newword) {
  2742. if (nh < maxn) {
  2743. wlst[nh].word = newword;
  2744. wlst[nh].allow = ptr->allowCross();
  2745. wlst[nh].orig = NULL;
  2746. nh++;
  2747. } else {
  2748. free(newword);
  2749. }
  2750. }
  2751. }
  2752. ptr = ptr->getFlgNxt();
  2753. }
  2754. }
  2755. return nh;
  2756. }
  2757. // return length of replacing table
  2758. int AffixMgr::get_numrep() const
  2759. {
  2760. return numrep;
  2761. }
  2762. // return replacing table
  2763. struct replentry * AffixMgr::get_reptable() const
  2764. {
  2765. if (! reptable ) return NULL;
  2766. return reptable;
  2767. }
  2768. // return iconv table
  2769. RepList * AffixMgr::get_iconvtable() const
  2770. {
  2771. if (! iconvtable ) return NULL;
  2772. return iconvtable;
  2773. }
  2774. // return oconv table
  2775. RepList * AffixMgr::get_oconvtable() const
  2776. {
  2777. if (! oconvtable ) return NULL;
  2778. return oconvtable;
  2779. }
  2780. // return replacing table
  2781. struct phonetable * AffixMgr::get_phonetable() const
  2782. {
  2783. if (! phone ) return NULL;
  2784. return phone;
  2785. }
  2786. // return length of character map table
  2787. int AffixMgr::get_nummap() const
  2788. {
  2789. return nummap;
  2790. }
  2791. // return character map table
  2792. struct mapentry * AffixMgr::get_maptable() const
  2793. {
  2794. if (! maptable ) return NULL;
  2795. return maptable;
  2796. }
  2797. // return length of word break table
  2798. int AffixMgr::get_numbreak() const
  2799. {
  2800. return numbreak;
  2801. }
  2802. // return character map table
  2803. char ** AffixMgr::get_breaktable() const
  2804. {
  2805. if (! breaktable ) return NULL;
  2806. return breaktable;
  2807. }
  2808. // return text encoding of dictionary
  2809. char * AffixMgr::get_encoding()
  2810. {
  2811. if (! encoding ) encoding = mystrdup(SPELL_ENCODING);
  2812. return mystrdup(encoding);
  2813. }
  2814. // return text encoding of dictionary
  2815. int AffixMgr::get_langnum() const
  2816. {
  2817. return langnum;
  2818. }
  2819. // return double prefix option
  2820. int AffixMgr::get_complexprefixes() const
  2821. {
  2822. return complexprefixes;
  2823. }
  2824. // return FULLSTRIP option
  2825. int AffixMgr::get_fullstrip() const
  2826. {
  2827. return fullstrip;
  2828. }
  2829. FLAG AffixMgr::get_keepcase() const
  2830. {
  2831. return keepcase;
  2832. }
  2833. FLAG AffixMgr::get_forceucase() const
  2834. {
  2835. return forceucase;
  2836. }
  2837. FLAG AffixMgr::get_warn() const
  2838. {
  2839. return warn;
  2840. }
  2841. int AffixMgr::get_forbidwarn() const
  2842. {
  2843. return forbidwarn;
  2844. }
  2845. int AffixMgr::get_checksharps() const
  2846. {
  2847. return checksharps;
  2848. }
  2849. char * AffixMgr::encode_flag(unsigned short aflag) const
  2850. {
  2851. return pHMgr->encode_flag(aflag);
  2852. }
  2853. // return the preferred ignore string for suggestions
  2854. char * AffixMgr::get_ignore() const
  2855. {
  2856. if (!ignorechars) return NULL;
  2857. return ignorechars;
  2858. }
  2859. // return the preferred ignore string for suggestions
  2860. unsigned short * AffixMgr::get_ignore_utf16(int * len) const
  2861. {
  2862. *len = ignorechars_utf16_len;
  2863. return ignorechars_utf16;
  2864. }
  2865. // return the keyboard string for suggestions
  2866. char * AffixMgr::get_key_string()
  2867. {
  2868. if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING);
  2869. return mystrdup(keystring);
  2870. }
  2871. // return the preferred try string for suggestions
  2872. char * AffixMgr::get_try_string() const
  2873. {
  2874. if (! trystring ) return NULL;
  2875. return mystrdup(trystring);
  2876. }
  2877. // return the preferred try string for suggestions
  2878. const char * AffixMgr::get_wordchars() const
  2879. {
  2880. return wordchars;
  2881. }
  2882. unsigned short * AffixMgr::get_wordchars_utf16(int * len) const
  2883. {
  2884. *len = wordchars_utf16_len;
  2885. return wordchars_utf16;
  2886. }
  2887. // is there compounding?
  2888. int AffixMgr::get_compound() const
  2889. {
  2890. return compoundflag || compoundbegin || numdefcpd;
  2891. }
  2892. // return the compound words control flag
  2893. FLAG AffixMgr::get_compoundflag() const
  2894. {
  2895. return compoundflag;
  2896. }
  2897. // return the forbidden words control flag
  2898. FLAG AffixMgr::get_forbiddenword() const
  2899. {
  2900. return forbiddenword;
  2901. }
  2902. // return the forbidden words control flag
  2903. FLAG AffixMgr::get_nosuggest() const
  2904. {
  2905. return nosuggest;
  2906. }
  2907. // return the forbidden words control flag
  2908. FLAG AffixMgr::get_nongramsuggest() const
  2909. {
  2910. return nongramsuggest;
  2911. }
  2912. // return the forbidden words flag modify flag
  2913. FLAG AffixMgr::get_needaffix() const
  2914. {
  2915. return needaffix;
  2916. }
  2917. // return the onlyincompound flag
  2918. FLAG AffixMgr::get_onlyincompound() const
  2919. {
  2920. return onlyincompound;
  2921. }
  2922. // return the compound word signal flag
  2923. FLAG AffixMgr::get_compoundroot() const
  2924. {
  2925. return compoundroot;
  2926. }
  2927. // return the compound begin signal flag
  2928. FLAG AffixMgr::get_compoundbegin() const
  2929. {
  2930. return compoundbegin;
  2931. }
  2932. // return the value of checknum
  2933. int AffixMgr::get_checknum() const
  2934. {
  2935. return checknum;
  2936. }
  2937. // return the value of prefix
  2938. const char * AffixMgr::get_prefix() const
  2939. {
  2940. if (pfx) return pfx->getKey();
  2941. return NULL;
  2942. }
  2943. // return the value of suffix
  2944. const char * AffixMgr::get_suffix() const
  2945. {
  2946. return sfxappnd;
  2947. }
  2948. // return the value of suffix
  2949. const char * AffixMgr::get_version() const
  2950. {
  2951. return version;
  2952. }
  2953. // return lemma_present flag
  2954. FLAG AffixMgr::get_lemma_present() const
  2955. {
  2956. return lemma_present;
  2957. }
  2958. // utility method to look up root words in hash table
  2959. struct hentry * AffixMgr::lookup(const char * word)
  2960. {
  2961. int i;
  2962. struct hentry * he = NULL;
  2963. for (i = 0; i < *maxdic && !he; i++) {
  2964. he = (alldic[i])->lookup(word);
  2965. }
  2966. return he;
  2967. }
  2968. // return the value of suffix
  2969. int AffixMgr::have_contclass() const
  2970. {
  2971. return havecontclass;
  2972. }
  2973. // return utf8
  2974. int AffixMgr::get_utf8() const
  2975. {
  2976. return utf8;
  2977. }
  2978. int AffixMgr::get_maxngramsugs(void) const
  2979. {
  2980. return maxngramsugs;
  2981. }
  2982. int AffixMgr::get_maxcpdsugs(void) const
  2983. {
  2984. return maxcpdsugs;
  2985. }
  2986. int AffixMgr::get_maxdiff(void) const
  2987. {
  2988. return maxdiff;
  2989. }
  2990. int AffixMgr::get_onlymaxdiff(void) const
  2991. {
  2992. return onlymaxdiff;
  2993. }
  2994. // return nosplitsugs
  2995. int AffixMgr::get_nosplitsugs(void) const
  2996. {
  2997. return nosplitsugs;
  2998. }
  2999. // return sugswithdots
  3000. int AffixMgr::get_sugswithdots(void) const
  3001. {
  3002. return sugswithdots;
  3003. }
  3004. /* parse flag */
  3005. int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) {
  3006. char * s = NULL;
  3007. if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
  3008. HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
  3009. return 1;
  3010. }
  3011. if (parse_string(line, &s, af->getlinenum())) return 1;
  3012. *out = pHMgr->decode_flag(s);
  3013. free(s);
  3014. return 0;
  3015. }
  3016. /* parse num */
  3017. int AffixMgr::parse_num(char * line, int * out, FileMgr * af) {
  3018. char * s = NULL;
  3019. if (*out != -1) {
  3020. HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
  3021. return 1;
  3022. }
  3023. if (parse_string(line, &s, af->getlinenum())) return 1;
  3024. *out = atoi(s);
  3025. free(s);
  3026. return 0;
  3027. }
  3028. /* parse in the max syllablecount of compound words and */
  3029. int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af)
  3030. {
  3031. char * tp = line;
  3032. char * piece;
  3033. int i = 0;
  3034. int np = 0;
  3035. w_char w[MAXWORDLEN];
  3036. piece = mystrsep(&tp, 0);
  3037. while (piece) {
  3038. if (*piece != '\0') {
  3039. switch(i) {
  3040. case 0: { np++; break; }
  3041. case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
  3042. case 2: {
  3043. if (!utf8) {
  3044. cpdvowels = mystrdup(piece);
  3045. } else {
  3046. int n = u8_u16(w, MAXWORDLEN, piece);
  3047. if (n > 0) {
  3048. flag_qsort((unsigned short *) w, 0, n);
  3049. cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
  3050. if (!cpdvowels_utf16) return 1;
  3051. memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
  3052. }
  3053. cpdvowels_utf16_len = n;
  3054. }
  3055. np++;
  3056. break;
  3057. }
  3058. default: break;
  3059. }
  3060. i++;
  3061. }
  3062. piece = mystrsep(&tp, 0);
  3063. }
  3064. if (np < 2) {
  3065. HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum());
  3066. return 1;
  3067. }
  3068. if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
  3069. return 0;
  3070. }
  3071. /* parse in the typical fault correcting table */
  3072. int AffixMgr::parse_reptable(char * line, FileMgr * af)
  3073. {
  3074. if (numrep != 0) {
  3075. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  3076. return 1;
  3077. }
  3078. char * tp = line;
  3079. char * piece;
  3080. int i = 0;
  3081. int np = 0;
  3082. piece = mystrsep(&tp, 0);
  3083. while (piece) {
  3084. if (*piece != '\0') {
  3085. switch(i) {
  3086. case 0: { np++; break; }
  3087. case 1: {
  3088. numrep = atoi(piece);
  3089. if (numrep < 1) {
  3090. HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
  3091. return 1;
  3092. }
  3093. reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
  3094. if (!reptable) return 1;
  3095. np++;
  3096. break;
  3097. }
  3098. default: break;
  3099. }
  3100. i++;
  3101. }
  3102. piece = mystrsep(&tp, 0);
  3103. }
  3104. if (np != 2) {
  3105. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3106. return 1;
  3107. }
  3108. /* now parse the numrep lines to read in the remainder of the table */
  3109. char * nl;
  3110. for (int j=0; j < numrep; j++) {
  3111. if (!(nl = af->getline())) return 1;
  3112. mychomp(nl);
  3113. tp = nl;
  3114. i = 0;
  3115. reptable[j].pattern = NULL;
  3116. reptable[j].pattern2 = NULL;
  3117. piece = mystrsep(&tp, 0);
  3118. while (piece) {
  3119. if (*piece != '\0') {
  3120. switch(i) {
  3121. case 0: {
  3122. if (strncmp(piece,"REP",3) != 0) {
  3123. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3124. numrep = 0;
  3125. return 1;
  3126. }
  3127. break;
  3128. }
  3129. case 1: {
  3130. if (*piece == '^') reptable[j].start = true; else reptable[j].start = false;
  3131. reptable[j].pattern = mystrrep(mystrdup(piece + int(reptable[j].start)),"_"," ");
  3132. int lr = strlen(reptable[j].pattern) - 1;
  3133. if (reptable[j].pattern[lr] == '$') {
  3134. reptable[j].end = true;
  3135. reptable[j].pattern[lr] = '\0';
  3136. } else reptable[j].end = false;
  3137. break;
  3138. }
  3139. case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
  3140. default: break;
  3141. }
  3142. i++;
  3143. }
  3144. piece = mystrsep(&tp, 0);
  3145. }
  3146. if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
  3147. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3148. numrep = 0;
  3149. return 1;
  3150. }
  3151. }
  3152. return 0;
  3153. }
  3154. /* parse in the typical fault correcting table */
  3155. int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword)
  3156. {
  3157. if (*rl) {
  3158. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  3159. return 1;
  3160. }
  3161. char * tp = line;
  3162. char * piece;
  3163. int i = 0;
  3164. int np = 0;
  3165. int numrl = 0;
  3166. piece = mystrsep(&tp, 0);
  3167. while (piece) {
  3168. if (*piece != '\0') {
  3169. switch(i) {
  3170. case 0: { np++; break; }
  3171. case 1: {
  3172. numrl = atoi(piece);
  3173. if (numrl < 1) {
  3174. HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
  3175. return 1;
  3176. }
  3177. *rl = new RepList(numrl);
  3178. if (!*rl) return 1;
  3179. np++;
  3180. break;
  3181. }
  3182. default: break;
  3183. }
  3184. i++;
  3185. }
  3186. piece = mystrsep(&tp, 0);
  3187. }
  3188. if (np != 2) {
  3189. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3190. return 1;
  3191. }
  3192. /* now parse the num lines to read in the remainder of the table */
  3193. char * nl;
  3194. for (int j=0; j < numrl; j++) {
  3195. if (!(nl = af->getline())) return 1;
  3196. mychomp(nl);
  3197. tp = nl;
  3198. i = 0;
  3199. char * pattern = NULL;
  3200. char * pattern2 = NULL;
  3201. piece = mystrsep(&tp, 0);
  3202. while (piece) {
  3203. if (*piece != '\0') {
  3204. switch(i) {
  3205. case 0: {
  3206. if (strncmp(piece, keyword, sizeof(keyword)) != 0) {
  3207. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3208. delete *rl;
  3209. *rl = NULL;
  3210. return 1;
  3211. }
  3212. break;
  3213. }
  3214. case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; }
  3215. case 2: {
  3216. pattern2 = mystrrep(mystrdup(piece),"_"," ");
  3217. break;
  3218. }
  3219. default: break;
  3220. }
  3221. i++;
  3222. }
  3223. piece = mystrsep(&tp, 0);
  3224. }
  3225. if (!pattern || !pattern2) {
  3226. if (pattern)
  3227. free(pattern);
  3228. if (pattern2)
  3229. free(pattern2);
  3230. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3231. return 1;
  3232. }
  3233. (*rl)->add(pattern, pattern2);
  3234. }
  3235. return 0;
  3236. }
  3237. /* parse in the typical fault correcting table */
  3238. int AffixMgr::parse_phonetable(char * line, FileMgr * af)
  3239. {
  3240. if (phone) {
  3241. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  3242. return 1;
  3243. }
  3244. char * tp = line;
  3245. char * piece;
  3246. int i = 0;
  3247. int np = 0;
  3248. piece = mystrsep(&tp, 0);
  3249. while (piece) {
  3250. if (*piece != '\0') {
  3251. switch(i) {
  3252. case 0: { np++; break; }
  3253. case 1: {
  3254. phone = (phonetable *) malloc(sizeof(struct phonetable));
  3255. if (!phone) return 1;
  3256. phone->num = atoi(piece);
  3257. phone->rules = NULL;
  3258. phone->utf8 = (char) utf8;
  3259. if (phone->num < 1) {
  3260. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
  3261. return 1;
  3262. }
  3263. phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *));
  3264. if (!phone->rules) {
  3265. free(phone);
  3266. phone = NULL;
  3267. return 1;
  3268. }
  3269. np++;
  3270. break;
  3271. }
  3272. default: break;
  3273. }
  3274. i++;
  3275. }
  3276. piece = mystrsep(&tp, 0);
  3277. }
  3278. if (np != 2) {
  3279. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3280. return 1;
  3281. }
  3282. /* now parse the phone->num lines to read in the remainder of the table */
  3283. char * nl;
  3284. for (int j=0; j < phone->num; j++) {
  3285. if (!(nl = af->getline())) return 1;
  3286. mychomp(nl);
  3287. tp = nl;
  3288. i = 0;
  3289. phone->rules[j * 2] = NULL;
  3290. phone->rules[j * 2 + 1] = NULL;
  3291. piece = mystrsep(&tp, 0);
  3292. while (piece) {
  3293. if (*piece != '\0') {
  3294. switch(i) {
  3295. case 0: {
  3296. if (strncmp(piece,"PHONE",5) != 0) {
  3297. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3298. phone->num = 0;
  3299. return 1;
  3300. }
  3301. break;
  3302. }
  3303. case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; }
  3304. case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; }
  3305. default: break;
  3306. }
  3307. i++;
  3308. }
  3309. piece = mystrsep(&tp, 0);
  3310. }
  3311. if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) {
  3312. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3313. phone->num = 0;
  3314. return 1;
  3315. }
  3316. }
  3317. phone->rules[phone->num * 2] = mystrdup("");
  3318. phone->rules[phone->num * 2 + 1] = mystrdup("");
  3319. init_phonet_hash(*phone);
  3320. return 0;
  3321. }
  3322. /* parse in the checkcompoundpattern table */
  3323. int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af)
  3324. {
  3325. if (numcheckcpd != 0) {
  3326. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  3327. return 1;
  3328. }
  3329. char * tp = line;
  3330. char * piece;
  3331. int i = 0;
  3332. int np = 0;
  3333. piece = mystrsep(&tp, 0);
  3334. while (piece) {
  3335. if (*piece != '\0') {
  3336. switch(i) {
  3337. case 0: { np++; break; }
  3338. case 1: {
  3339. numcheckcpd = atoi(piece);
  3340. if (numcheckcpd < 1) {
  3341. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
  3342. return 1;
  3343. }
  3344. checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry));
  3345. if (!checkcpdtable) return 1;
  3346. np++;
  3347. break;
  3348. }
  3349. default: break;
  3350. }
  3351. i++;
  3352. }
  3353. piece = mystrsep(&tp, 0);
  3354. }
  3355. if (np != 2) {
  3356. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3357. return 1;
  3358. }
  3359. /* now parse the numcheckcpd lines to read in the remainder of the table */
  3360. char * nl;
  3361. for (int j=0; j < numcheckcpd; j++) {
  3362. if (!(nl = af->getline())) return 1;
  3363. mychomp(nl);
  3364. tp = nl;
  3365. i = 0;
  3366. checkcpdtable[j].pattern = NULL;
  3367. checkcpdtable[j].pattern2 = NULL;
  3368. checkcpdtable[j].pattern3 = NULL;
  3369. checkcpdtable[j].cond = FLAG_NULL;
  3370. checkcpdtable[j].cond2 = FLAG_NULL;
  3371. piece = mystrsep(&tp, 0);
  3372. while (piece) {
  3373. if (*piece != '\0') {
  3374. switch(i) {
  3375. case 0: {
  3376. if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
  3377. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3378. numcheckcpd = 0;
  3379. return 1;
  3380. }
  3381. break;
  3382. }
  3383. case 1: {
  3384. checkcpdtable[j].pattern = mystrdup(piece);
  3385. char * p = strchr(checkcpdtable[j].pattern, '/');
  3386. if (p) {
  3387. *p = '\0';
  3388. checkcpdtable[j].cond = pHMgr->decode_flag(p + 1);
  3389. }
  3390. break; }
  3391. case 2: {
  3392. checkcpdtable[j].pattern2 = mystrdup(piece);
  3393. char * p = strchr(checkcpdtable[j].pattern2, '/');
  3394. if (p) {
  3395. *p = '\0';
  3396. checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1);
  3397. }
  3398. break;
  3399. }
  3400. case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; }
  3401. default: break;
  3402. }
  3403. i++;
  3404. }
  3405. piece = mystrsep(&tp, 0);
  3406. }
  3407. if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
  3408. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3409. numcheckcpd = 0;
  3410. return 1;
  3411. }
  3412. }
  3413. return 0;
  3414. }
  3415. /* parse in the compound rule table */
  3416. int AffixMgr::parse_defcpdtable(char * line, FileMgr * af)
  3417. {
  3418. if (numdefcpd != 0) {
  3419. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  3420. return 1;
  3421. }
  3422. char * tp = line;
  3423. char * piece;
  3424. int i = 0;
  3425. int np = 0;
  3426. piece = mystrsep(&tp, 0);
  3427. while (piece) {
  3428. if (*piece != '\0') {
  3429. switch(i) {
  3430. case 0: { np++; break; }
  3431. case 1: {
  3432. numdefcpd = atoi(piece);
  3433. if (numdefcpd < 1) {
  3434. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
  3435. return 1;
  3436. }
  3437. defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
  3438. if (!defcpdtable) return 1;
  3439. np++;
  3440. break;
  3441. }
  3442. default: break;
  3443. }
  3444. i++;
  3445. }
  3446. piece = mystrsep(&tp, 0);
  3447. }
  3448. if (np != 2) {
  3449. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3450. return 1;
  3451. }
  3452. /* now parse the numdefcpd lines to read in the remainder of the table */
  3453. char * nl;
  3454. for (int j=0; j < numdefcpd; j++) {
  3455. if (!(nl = af->getline())) return 1;
  3456. mychomp(nl);
  3457. tp = nl;
  3458. i = 0;
  3459. defcpdtable[j].def = NULL;
  3460. piece = mystrsep(&tp, 0);
  3461. while (piece) {
  3462. if (*piece != '\0') {
  3463. switch(i) {
  3464. case 0: {
  3465. if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
  3466. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3467. numdefcpd = 0;
  3468. return 1;
  3469. }
  3470. break;
  3471. }
  3472. case 1: { // handle parenthesized flags
  3473. if (strchr(piece, '(')) {
  3474. defcpdtable[j].def = (FLAG *) malloc(strlen(piece) * sizeof(FLAG));
  3475. defcpdtable[j].len = 0;
  3476. int end = 0;
  3477. FLAG * conv;
  3478. while (!end) {
  3479. char * par = piece + 1;
  3480. while (*par != '(' && *par != ')' && *par != '\0') par++;
  3481. if (*par == '\0') end = 1; else *par = '\0';
  3482. if (*piece == '(') piece++;
  3483. if (*piece == '*' || *piece == '?') {
  3484. defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece;
  3485. } else if (*piece != '\0') {
  3486. int l = pHMgr->decode_flags(&conv, piece, af);
  3487. for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k];
  3488. free(conv);
  3489. }
  3490. piece = par + 1;
  3491. }
  3492. } else {
  3493. defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af);
  3494. }
  3495. break;
  3496. }
  3497. default: break;
  3498. }
  3499. i++;
  3500. }
  3501. piece = mystrsep(&tp, 0);
  3502. }
  3503. if (!defcpdtable[j].len) {
  3504. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3505. numdefcpd = 0;
  3506. return 1;
  3507. }
  3508. }
  3509. return 0;
  3510. }
  3511. /* parse in the character map table */
  3512. int AffixMgr::parse_maptable(char * line, FileMgr * af)
  3513. {
  3514. if (nummap != 0) {
  3515. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  3516. return 1;
  3517. }
  3518. char * tp = line;
  3519. char * piece;
  3520. int i = 0;
  3521. int np = 0;
  3522. piece = mystrsep(&tp, 0);
  3523. while (piece) {
  3524. if (*piece != '\0') {
  3525. switch(i) {
  3526. case 0: { np++; break; }
  3527. case 1: {
  3528. nummap = atoi(piece);
  3529. if (nummap < 1) {
  3530. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
  3531. return 1;
  3532. }
  3533. maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
  3534. if (!maptable) return 1;
  3535. np++;
  3536. break;
  3537. }
  3538. default: break;
  3539. }
  3540. i++;
  3541. }
  3542. piece = mystrsep(&tp, 0);
  3543. }
  3544. if (np != 2) {
  3545. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3546. return 1;
  3547. }
  3548. /* now parse the nummap lines to read in the remainder of the table */
  3549. char * nl;
  3550. for (int j=0; j < nummap; j++) {
  3551. if (!(nl = af->getline())) return 1;
  3552. mychomp(nl);
  3553. tp = nl;
  3554. i = 0;
  3555. maptable[j].set = NULL;
  3556. maptable[j].len = 0;
  3557. piece = mystrsep(&tp, 0);
  3558. while (piece) {
  3559. if (*piece != '\0') {
  3560. switch(i) {
  3561. case 0: {
  3562. if (strncmp(piece,"MAP",3) != 0) {
  3563. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3564. nummap = 0;
  3565. return 1;
  3566. }
  3567. break;
  3568. }
  3569. case 1: {
  3570. int setn = 0;
  3571. maptable[j].len = strlen(piece);
  3572. maptable[j].set = (char **) malloc(maptable[j].len * sizeof(char*));
  3573. if (!maptable[j].set) return 1;
  3574. for (int k = 0; k < maptable[j].len; k++) {
  3575. int chl = 1;
  3576. int chb = k;
  3577. if (piece[k] == '(') {
  3578. char * parpos = strchr(piece + k, ')');
  3579. if (parpos != NULL) {
  3580. chb = k + 1;
  3581. chl = (int)(parpos - piece) - k - 1;
  3582. k = k + chl + 1;
  3583. }
  3584. } else {
  3585. if (utf8 && (piece[k] & 0xc0) == 0xc0) {
  3586. for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++);
  3587. chl = k - chb;
  3588. k--;
  3589. }
  3590. }
  3591. maptable[j].set[setn] = (char *) malloc(chl + 1);
  3592. if (!maptable[j].set[setn]) return 1;
  3593. strncpy(maptable[j].set[setn], piece + chb, chl);
  3594. maptable[j].set[setn][chl] = '\0';
  3595. setn++;
  3596. }
  3597. maptable[j].len = setn;
  3598. break; }
  3599. default: break;
  3600. }
  3601. i++;
  3602. }
  3603. piece = mystrsep(&tp, 0);
  3604. }
  3605. if (!maptable[j].set || !maptable[j].len) {
  3606. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3607. nummap = 0;
  3608. return 1;
  3609. }
  3610. }
  3611. return 0;
  3612. }
  3613. /* parse in the word breakpoint table */
  3614. int AffixMgr::parse_breaktable(char * line, FileMgr * af)
  3615. {
  3616. if (numbreak > -1) {
  3617. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  3618. return 1;
  3619. }
  3620. char * tp = line;
  3621. char * piece;
  3622. int i = 0;
  3623. int np = 0;
  3624. piece = mystrsep(&tp, 0);
  3625. while (piece) {
  3626. if (*piece != '\0') {
  3627. switch(i) {
  3628. case 0: { np++; break; }
  3629. case 1: {
  3630. numbreak = atoi(piece);
  3631. if (numbreak < 0) {
  3632. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
  3633. return 1;
  3634. }
  3635. if (numbreak == 0) return 0;
  3636. breaktable = (char **) malloc(numbreak * sizeof(char *));
  3637. if (!breaktable) return 1;
  3638. np++;
  3639. break;
  3640. }
  3641. default: break;
  3642. }
  3643. i++;
  3644. }
  3645. piece = mystrsep(&tp, 0);
  3646. }
  3647. if (np != 2) {
  3648. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3649. return 1;
  3650. }
  3651. /* now parse the numbreak lines to read in the remainder of the table */
  3652. char * nl;
  3653. for (int j=0; j < numbreak; j++) {
  3654. if (!(nl = af->getline())) return 1;
  3655. mychomp(nl);
  3656. tp = nl;
  3657. i = 0;
  3658. piece = mystrsep(&tp, 0);
  3659. while (piece) {
  3660. if (*piece != '\0') {
  3661. switch(i) {
  3662. case 0: {
  3663. if (strncmp(piece,"BREAK",5) != 0) {
  3664. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3665. numbreak = 0;
  3666. return 1;
  3667. }
  3668. break;
  3669. }
  3670. case 1: {
  3671. breaktable[j] = mystrdup(piece);
  3672. break;
  3673. }
  3674. default: break;
  3675. }
  3676. i++;
  3677. }
  3678. piece = mystrsep(&tp, 0);
  3679. }
  3680. if (!breaktable) {
  3681. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  3682. numbreak = 0;
  3683. return 1;
  3684. }
  3685. }
  3686. return 0;
  3687. }
  3688. void AffixMgr::reverse_condition(char * piece) {
  3689. int neg = 0;
  3690. for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
  3691. switch(*k) {
  3692. case '[': {
  3693. if (neg) *(k+1) = '['; else *k = ']';
  3694. break;
  3695. }
  3696. case ']': {
  3697. *k = '[';
  3698. if (neg) *(k+1) = '^';
  3699. neg = 0;
  3700. break;
  3701. }
  3702. case '^': {
  3703. if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
  3704. break;
  3705. }
  3706. default: {
  3707. if (neg) *(k+1) = *k;
  3708. }
  3709. }
  3710. }
  3711. }
  3712. int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags)
  3713. {
  3714. int numents = 0; // number of affentry structures to parse
  3715. unsigned short aflag = 0; // affix char identifier
  3716. char ff=0;
  3717. std::vector<affentry> affentries;
  3718. char * tp = line;
  3719. char * nl = line;
  3720. char * piece;
  3721. int i = 0;
  3722. // checking lines with bad syntax
  3723. #ifdef DEBUG
  3724. int basefieldnum = 0;
  3725. #endif
  3726. // split affix header line into pieces
  3727. int np = 0;
  3728. piece = mystrsep(&tp, 0);
  3729. while (piece) {
  3730. if (*piece != '\0') {
  3731. switch(i) {
  3732. // piece 1 - is type of affix
  3733. case 0: { np++; break; }
  3734. // piece 2 - is affix char
  3735. case 1: {
  3736. np++;
  3737. aflag = pHMgr->decode_flag(piece);
  3738. if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
  3739. ((at == 'P') && (dupflags[aflag] & dupPFX))) {
  3740. HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",
  3741. af->getlinenum());
  3742. // return 1; XXX permissive mode for bad dictionaries
  3743. }
  3744. dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
  3745. break;
  3746. }
  3747. // piece 3 - is cross product indicator
  3748. case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
  3749. // piece 4 - is number of affentries
  3750. case 3: {
  3751. np++;
  3752. numents = atoi(piece);
  3753. if (numents == 0) {
  3754. char * err = pHMgr->encode_flag(aflag);
  3755. if (err) {
  3756. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
  3757. af->getlinenum());
  3758. free(err);
  3759. }
  3760. return 1;
  3761. }
  3762. affentries.resize(numents);
  3763. affentries[0].opts = ff;
  3764. if (utf8) affentries[0].opts += aeUTF8;
  3765. if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF;
  3766. if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM;
  3767. affentries[0].aflag = aflag;
  3768. }
  3769. default: break;
  3770. }
  3771. i++;
  3772. }
  3773. piece = mystrsep(&tp, 0);
  3774. }
  3775. // check to make sure we parsed enough pieces
  3776. if (np != 4) {
  3777. char * err = pHMgr->encode_flag(aflag);
  3778. if (err) {
  3779. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  3780. free(err);
  3781. }
  3782. return 1;
  3783. }
  3784. // now parse numents affentries for this affix
  3785. std::vector<affentry>::iterator start = affentries.begin();
  3786. std::vector<affentry>::iterator end = affentries.end();
  3787. for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
  3788. if (!(nl = af->getline())) return 1;
  3789. mychomp(nl);
  3790. tp = nl;
  3791. i = 0;
  3792. np = 0;
  3793. // split line into pieces
  3794. piece = mystrsep(&tp, 0);
  3795. while (piece) {
  3796. if (*piece != '\0') {
  3797. switch(i) {
  3798. // piece 1 - is type
  3799. case 0: {
  3800. np++;
  3801. if (entry != start) entry->opts = start->opts &
  3802. (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM);
  3803. break;
  3804. }
  3805. // piece 2 - is affix char
  3806. case 1: {
  3807. np++;
  3808. if (pHMgr->decode_flag(piece) != aflag) {
  3809. char * err = pHMgr->encode_flag(aflag);
  3810. if (err) {
  3811. HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
  3812. af->getlinenum(), err);
  3813. free(err);
  3814. }
  3815. return 1;
  3816. }
  3817. if (entry != start) entry->aflag = start->aflag;
  3818. break;
  3819. }
  3820. // piece 3 - is string to strip or 0 for null
  3821. case 2: {
  3822. np++;
  3823. if (complexprefixes) {
  3824. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3825. }
  3826. entry->strip = mystrdup(piece);
  3827. entry->stripl = (unsigned char) strlen(entry->strip);
  3828. if (strcmp(entry->strip,"0") == 0) {
  3829. free(entry->strip);
  3830. entry->strip=mystrdup("");
  3831. entry->stripl = 0;
  3832. }
  3833. break;
  3834. }
  3835. // piece 4 - is affix string or 0 for null
  3836. case 3: {
  3837. char * dash;
  3838. entry->morphcode = NULL;
  3839. entry->contclass = NULL;
  3840. entry->contclasslen = 0;
  3841. np++;
  3842. dash = strchr(piece, '/');
  3843. if (dash) {
  3844. *dash = '\0';
  3845. if (ignorechars) {
  3846. if (utf8) {
  3847. remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
  3848. } else {
  3849. remove_ignored_chars(piece,ignorechars);
  3850. }
  3851. }
  3852. if (complexprefixes) {
  3853. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3854. }
  3855. entry->appnd = mystrdup(piece);
  3856. if (pHMgr->is_aliasf()) {
  3857. int index = atoi(dash + 1);
  3858. entry->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(entry->contclass), af);
  3859. if (!entry->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1);
  3860. } else {
  3861. entry->contclasslen = (unsigned short) pHMgr->decode_flags(&(entry->contclass), dash + 1, af);
  3862. flag_qsort(entry->contclass, 0, entry->contclasslen);
  3863. }
  3864. *dash = '/';
  3865. havecontclass = 1;
  3866. for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
  3867. contclasses[(entry->contclass)[_i]] = 1;
  3868. }
  3869. } else {
  3870. if (ignorechars) {
  3871. if (utf8) {
  3872. remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
  3873. } else {
  3874. remove_ignored_chars(piece,ignorechars);
  3875. }
  3876. }
  3877. if (complexprefixes) {
  3878. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3879. }
  3880. entry->appnd = mystrdup(piece);
  3881. }
  3882. entry->appndl = (unsigned char) strlen(entry->appnd);
  3883. if (strcmp(entry->appnd,"0") == 0) {
  3884. free(entry->appnd);
  3885. entry->appnd=mystrdup("");
  3886. entry->appndl = 0;
  3887. }
  3888. break;
  3889. }
  3890. // piece 5 - is the conditions descriptions
  3891. case 4: {
  3892. np++;
  3893. if (complexprefixes) {
  3894. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3895. reverse_condition(piece);
  3896. }
  3897. if (entry->stripl && (strcmp(piece, ".") != 0) &&
  3898. redundant_condition(at, entry->strip, entry->stripl, piece, af->getlinenum()))
  3899. strcpy(piece, ".");
  3900. if (at == 'S') {
  3901. reverseword(piece);
  3902. reverse_condition(piece);
  3903. }
  3904. if (encodeit(*entry, piece)) return 1;
  3905. break;
  3906. }
  3907. case 5: {
  3908. np++;
  3909. if (pHMgr->is_aliasm()) {
  3910. int index = atoi(piece);
  3911. entry->morphcode = pHMgr->get_aliasm(index);
  3912. } else {
  3913. if (complexprefixes) { // XXX - fix me for morph. gen.
  3914. if (utf8) reverseword_utf(piece); else reverseword(piece);
  3915. }
  3916. // add the remaining of the line
  3917. if (*tp) {
  3918. *(tp - 1) = ' ';
  3919. tp = tp + strlen(tp);
  3920. }
  3921. entry->morphcode = mystrdup(piece);
  3922. if (!entry->morphcode) return 1;
  3923. }
  3924. break;
  3925. }
  3926. default: break;
  3927. }
  3928. i++;
  3929. }
  3930. piece = mystrsep(&tp, 0);
  3931. }
  3932. // check to make sure we parsed enough pieces
  3933. if (np < 4) {
  3934. char * err = pHMgr->encode_flag(aflag);
  3935. if (err) {
  3936. HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
  3937. af->getlinenum(), err);
  3938. free(err);
  3939. }
  3940. return 1;
  3941. }
  3942. #ifdef DEBUG
  3943. // detect unnecessary fields, excepting comments
  3944. if (basefieldnum) {
  3945. int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
  3946. if (fieldnum != basefieldnum)
  3947. HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum());
  3948. } else {
  3949. basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
  3950. }
  3951. #endif
  3952. }
  3953. // now create SfxEntry or PfxEntry objects and use links to
  3954. // build an ordered (sorted by affix string) list
  3955. for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
  3956. if (at == 'P') {
  3957. PfxEntry * pfxptr = new PfxEntry(this,&(*entry));
  3958. build_pfxtree(pfxptr);
  3959. } else {
  3960. SfxEntry * sfxptr = new SfxEntry(this,&(*entry));
  3961. build_sfxtree(sfxptr);
  3962. }
  3963. }
  3964. return 0;
  3965. }
  3966. int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) {
  3967. int condl = strlen(cond);
  3968. int i;
  3969. int j;
  3970. int neg;
  3971. int in;
  3972. if (ft == 'P') { // prefix
  3973. if (strncmp(strip, cond, condl) == 0) return 1;
  3974. if (utf8) {
  3975. } else {
  3976. for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
  3977. if (cond[j] != '[') {
  3978. if (cond[j] != strip[i]) {
  3979. HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
  3980. return 0;
  3981. }
  3982. } else {
  3983. neg = (cond[j+1] == '^') ? 1 : 0;
  3984. in = 0;
  3985. do {
  3986. j++;
  3987. if (strip[i] == cond[j]) in = 1;
  3988. } while ((j < (condl - 1)) && (cond[j] != ']'));
  3989. if (j == (condl - 1) && (cond[j] != ']')) {
  3990. HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum, cond);
  3991. return 0;
  3992. }
  3993. if ((!neg && !in) || (neg && in)) {
  3994. HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
  3995. return 0;
  3996. }
  3997. }
  3998. }
  3999. if (j >= condl) return 1;
  4000. }
  4001. } else { // suffix
  4002. if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
  4003. if (utf8) {
  4004. } else {
  4005. for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
  4006. if (cond[j] != ']') {
  4007. if (cond[j] != strip[i]) {
  4008. HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
  4009. return 0;
  4010. }
  4011. } else {
  4012. in = 0;
  4013. do {
  4014. j--;
  4015. if (strip[i] == cond[j]) in = 1;
  4016. } while ((j > 0) && (cond[j] != '['));
  4017. if ((j == 0) && (cond[j] != '[')) {
  4018. HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n%s\n", linenum, cond);
  4019. return 0;
  4020. }
  4021. neg = (cond[j+1] == '^') ? 1 : 0;
  4022. if ((!neg && !in) || (neg && in)) {
  4023. HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
  4024. return 0;
  4025. }
  4026. }
  4027. }
  4028. if (j < 0) return 1;
  4029. }
  4030. }
  4031. return 0;
  4032. }