PageRenderTime 66ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/enchant-1.6.0/src/myspell/affixmgr.cxx

#
C++ | 4115 lines | 3262 code | 417 blank | 436 comment | 1325 complexity | 159738d71666c8f2bd4397deb25c4301 MD5 | raw file
Possible License(s): LGPL-2.1, MPL-2.0-no-copyleft-exception

Large files are truncated, but you can click here to view the full file

  1. #include "license.hunspell"
  2. #include "license.myspell"
  3. #ifndef MOZILLA_CLIENT
  4. #include <cstdlib>
  5. #include <cstring>
  6. #include <cctype>
  7. #include <cstdio>
  8. #else
  9. #include <stdlib.h>
  10. #include <string.h>
  11. #include <stdio.h>
  12. #include <ctype.h>
  13. #endif
  14. #include "affixmgr.hxx"
  15. #include "affentry.hxx"
  16. #include "langnum.hxx"
  17. #include "csutil.hxx"
  18. #ifndef MOZILLA_CLIENT
  19. #ifndef WIN32
  20. using namespace std;
  21. #endif
  22. #endif
  23. AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
  24. {
  25. // register hash manager and load affix data from aff file
  26. pHMgr = ptr[0];
  27. alldic = ptr;
  28. maxdic = md;
  29. keystring = NULL;
  30. trystring = NULL;
  31. encoding=NULL;
  32. utf8 = 0;
  33. complexprefixes = 0;
  34. maptable = NULL;
  35. nummap = 0;
  36. breaktable = NULL;
  37. numbreak = 0;
  38. reptable = NULL;
  39. numrep = 0;
  40. checkcpdtable = NULL;
  41. numcheckcpd = 0;
  42. defcpdtable = NULL;
  43. numdefcpd = 0;
  44. phone = NULL;
  45. compoundflag = FLAG_NULL; // permits word in compound forms
  46. compoundbegin = FLAG_NULL; // may be first word in compound forms
  47. compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  48. compoundend = FLAG_NULL; // may be last word in compound forms
  49. compoundroot = FLAG_NULL; // compound word signing flag
  50. compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  51. compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  52. checkcompounddup = 0; // forbid double words in compounds
  53. checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  54. checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  55. checkcompoundtriple = 0; // forbid compounds with triple letters
  56. forbiddenword = FORBIDDENWORD; // forbidden word signing flag
  57. nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  58. lang = NULL; // language
  59. langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  60. needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
  61. cpdwordmax = -1; // default: unlimited wordcount in compound words
  62. cpdmin = -1; // undefined
  63. cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  64. cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  65. cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  66. cpdvowels_utf16_len=0; // vowels
  67. pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  68. sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  69. cpdsyllablenum=NULL; // syllable count incrementing flag
  70. checknum=0; // checking numbers, and word with numbers
  71. wordchars=NULL; // letters + spec. word characters
  72. wordchars_utf16=NULL; // letters + spec. word characters
  73. wordchars_utf16_len=0; // letters + spec. word characters
  74. ignorechars=NULL; // letters + spec. word characters
  75. ignorechars_utf16=NULL; // letters + spec. word characters
  76. ignorechars_utf16_len=0; // letters + spec. word characters
  77. version=NULL; // affix and dictionary file version string
  78. havecontclass=0; // flags of possible continuing classes (double affix)
  79. // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  80. // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  81. lemma_present = FLAG_NULL;
  82. circumfix = FLAG_NULL;
  83. onlyincompound = FLAG_NULL;
  84. flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file
  85. maxngramsugs = -1; // undefined
  86. nosplitsugs = 0;
  87. sugswithdots = 0;
  88. keepcase = 0;
  89. checksharps = 0;
  90. substandard = FLAG_NULL;
  91. derived = NULL; // XXX not threadsafe variable for experimental stemming
  92. sfx = NULL;
  93. pfx = NULL;
  94. for (int i=0; i < SETSIZE; i++) {
  95. pStart[i] = NULL;
  96. sStart[i] = NULL;
  97. pFlag[i] = NULL;
  98. sFlag[i] = NULL;
  99. }
  100. for (int j=0; j < CONTSIZE; j++) {
  101. contclasses[j] = 0;
  102. }
  103. if (parse_file(affpath, key)) {
  104. HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  105. }
  106. if (cpdmin == -1) cpdmin = MINCPDLEN;
  107. }
  108. AffixMgr::~AffixMgr()
  109. {
  110. // pass through linked prefix entries and clean up
  111. for (int i=0; i < SETSIZE ;i++) {
  112. pFlag[i] = NULL;
  113. PfxEntry * ptr = (PfxEntry *)pStart[i];
  114. PfxEntry * nptr = NULL;
  115. while (ptr) {
  116. nptr = ptr->getNext();
  117. delete(ptr);
  118. ptr = nptr;
  119. nptr = NULL;
  120. }
  121. }
  122. // pass through linked suffix entries and clean up
  123. for (int j=0; j < SETSIZE ; j++) {
  124. sFlag[j] = NULL;
  125. SfxEntry * ptr = (SfxEntry *)sStart[j];
  126. SfxEntry * nptr = NULL;
  127. while (ptr) {
  128. nptr = ptr->getNext();
  129. delete(ptr);
  130. ptr = nptr;
  131. nptr = NULL;
  132. }
  133. sStart[j] = NULL;
  134. }
  135. if (keystring) free(keystring);
  136. keystring=NULL;
  137. if (trystring) free(trystring);
  138. trystring=NULL;
  139. if (encoding) free(encoding);
  140. encoding=NULL;
  141. if (maptable) {
  142. for (int j=0; j < nummap; j++) {
  143. if (maptable[j].set) free(maptable[j].set);
  144. if (maptable[j].set_utf16) free(maptable[j].set_utf16);
  145. maptable[j].set = NULL;
  146. maptable[j].len = 0;
  147. }
  148. free(maptable);
  149. maptable = NULL;
  150. }
  151. nummap = 0;
  152. if (breaktable) {
  153. for (int j=0; j < numbreak; j++) {
  154. if (breaktable[j]) free(breaktable[j]);
  155. breaktable[j] = NULL;
  156. }
  157. free(breaktable);
  158. breaktable = NULL;
  159. }
  160. numbreak = 0;
  161. if (reptable) {
  162. for (int j=0; j < numrep; j++) {
  163. free(reptable[j].pattern);
  164. free(reptable[j].pattern2);
  165. }
  166. free(reptable);
  167. reptable = NULL;
  168. }
  169. if (phone && phone->rules) {
  170. for (int j=0; j < phone->num + 1; j++) {
  171. free(phone->rules[j * 2]);
  172. free(phone->rules[j * 2 + 1]);
  173. }
  174. free(phone->rules);
  175. free(phone);
  176. phone = NULL;
  177. }
  178. if (defcpdtable) {
  179. for (int j=0; j < numdefcpd; j++) {
  180. free(defcpdtable[j].def);
  181. defcpdtable[j].def = NULL;
  182. }
  183. free(defcpdtable);
  184. defcpdtable = NULL;
  185. }
  186. numrep = 0;
  187. if (checkcpdtable) {
  188. for (int j=0; j < numcheckcpd; j++) {
  189. free(checkcpdtable[j].pattern);
  190. free(checkcpdtable[j].pattern2);
  191. checkcpdtable[j].pattern = NULL;
  192. checkcpdtable[j].pattern2 = NULL;
  193. }
  194. free(checkcpdtable);
  195. checkcpdtable = NULL;
  196. }
  197. numcheckcpd = 0;
  198. FREE_FLAG(compoundflag);
  199. FREE_FLAG(compoundbegin);
  200. FREE_FLAG(compoundmiddle);
  201. FREE_FLAG(compoundend);
  202. FREE_FLAG(compoundpermitflag);
  203. FREE_FLAG(compoundforbidflag);
  204. FREE_FLAG(compoundroot);
  205. FREE_FLAG(forbiddenword);
  206. FREE_FLAG(nosuggest);
  207. FREE_FLAG(needaffix);
  208. FREE_FLAG(lemma_present);
  209. FREE_FLAG(circumfix);
  210. FREE_FLAG(onlyincompound);
  211. cpdwordmax = 0;
  212. pHMgr = NULL;
  213. cpdmin = 0;
  214. cpdmaxsyllable = 0;
  215. if (cpdvowels) free(cpdvowels);
  216. if (cpdvowels_utf16) free(cpdvowels_utf16);
  217. if (cpdsyllablenum) free(cpdsyllablenum);
  218. free_utf_tbl();
  219. if (lang) free(lang);
  220. if (wordchars) free(wordchars);
  221. if (wordchars_utf16) free(wordchars_utf16);
  222. if (ignorechars) free(ignorechars);
  223. if (ignorechars_utf16) free(ignorechars_utf16);
  224. if (version) free(version);
  225. if (derived) free(derived);
  226. checknum=0;
  227. }
  228. // read in aff file and build up prefix and suffix entry objects
  229. int AffixMgr::parse_file(const char * affpath, const char * key)
  230. {
  231. char * line; // io buffers
  232. char ft; // affix type
  233. // checking flag duplication
  234. char dupflags[CONTSIZE];
  235. char dupflags_ini = 1;
  236. // first line indicator for removing byte order mark
  237. int firstline = 1;
  238. // open the affix file
  239. FileMgr * afflst = new FileMgr(affpath, key);
  240. if (!afflst) {
  241. HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
  242. return 1;
  243. }
  244. // step one is to parse the affix file building up the internal
  245. // affix data structures
  246. // read in each line ignoring any that do not
  247. // start with a known line type indicator
  248. while ((line = afflst->getline())) {
  249. mychomp(line);
  250. /* remove byte order mark */
  251. if (firstline) {
  252. firstline = 0;
  253. if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
  254. memmove(line, line+3, strlen(line+3)+1);
  255. HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
  256. }
  257. }
  258. /* parse in the keyboard string */
  259. if (strncmp(line,"KEY",3) == 0) {
  260. if (parse_string(line, &keystring, "KEY")) {
  261. delete afflst;
  262. return 1;
  263. }
  264. }
  265. /* parse in the try string */
  266. if (strncmp(line,"TRY",3) == 0) {
  267. if (parse_string(line, &trystring, "TRY")) {
  268. delete afflst;
  269. return 1;
  270. }
  271. }
  272. /* parse in the name of the character set used by the .dict and .aff */
  273. if (strncmp(line,"SET",3) == 0) {
  274. if (parse_string(line, &encoding, "SET")) {
  275. delete afflst;
  276. return 1;
  277. }
  278. if (strcmp(encoding, "UTF-8") == 0) {
  279. utf8 = 1;
  280. #ifndef OPENOFFICEORG
  281. #ifndef MOZILLA_CLIENT
  282. if (initialize_utf_tbl()) return 1;
  283. #endif
  284. #endif
  285. }
  286. }
  287. /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
  288. if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
  289. complexprefixes = 1;
  290. /* parse in the flag used by the controlled compound words */
  291. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  292. if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) {
  293. delete afflst;
  294. return 1;
  295. }
  296. }
  297. /* parse in the flag used by compound words */
  298. if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
  299. if (complexprefixes) {
  300. if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) {
  301. delete afflst;
  302. return 1;
  303. }
  304. } else {
  305. if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) {
  306. delete afflst;
  307. return 1;
  308. }
  309. }
  310. }
  311. /* parse in the flag used by compound words */
  312. if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
  313. if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) {
  314. delete afflst;
  315. return 1;
  316. }
  317. }
  318. /* parse in the flag used by compound words */
  319. if (strncmp(line,"COMPOUNDEND",11) == 0) {
  320. if (complexprefixes) {
  321. if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) {
  322. delete afflst;
  323. return 1;
  324. }
  325. } else {
  326. if (parse_flag(line, &compoundend, "COMPOUNDEND")) {
  327. delete afflst;
  328. return 1;
  329. }
  330. }
  331. }
  332. /* parse in the data used by compound_check() method */
  333. if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
  334. if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) {
  335. delete afflst;
  336. return 1;
  337. }
  338. }
  339. /* parse in the flag sign compounds in dictionary */
  340. if (strncmp(line,"COMPOUNDROOT",12) == 0) {
  341. if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) {
  342. delete afflst;
  343. return 1;
  344. }
  345. }
  346. /* parse in the flag used by compound_check() method */
  347. if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
  348. if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) {
  349. delete afflst;
  350. return 1;
  351. }
  352. }
  353. /* parse in the flag used by compound_check() method */
  354. if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
  355. if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) {
  356. delete afflst;
  357. return 1;
  358. }
  359. }
  360. if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
  361. checkcompounddup = 1;
  362. }
  363. if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
  364. checkcompoundrep = 1;
  365. }
  366. if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
  367. checkcompoundtriple = 1;
  368. }
  369. if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
  370. checkcompoundcase = 1;
  371. }
  372. if (strncmp(line,"NOSUGGEST",9) == 0) {
  373. if (parse_flag(line, &nosuggest, "NOSUGGEST")) {
  374. delete afflst;
  375. return 1;
  376. }
  377. }
  378. /* parse in the flag used by forbidden words */
  379. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  380. if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) {
  381. delete afflst;
  382. return 1;
  383. }
  384. }
  385. /* parse in the flag used by forbidden words */
  386. if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
  387. if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) {
  388. delete afflst;
  389. return 1;
  390. }
  391. }
  392. /* parse in the flag used by circumfixes */
  393. if (strncmp(line,"CIRCUMFIX",9) == 0) {
  394. if (parse_flag(line, &circumfix, "CIRCUMFIX")) {
  395. delete afflst;
  396. return 1;
  397. }
  398. }
  399. /* parse in the flag used by fogemorphemes */
  400. if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
  401. if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) {
  402. delete afflst;
  403. return 1;
  404. }
  405. }
  406. /* parse in the flag used by `needaffixs' */
  407. if (strncmp(line,"PSEUDOROOT",10) == 0) {
  408. if (parse_flag(line, &needaffix, "PSEUDOROOT")) {
  409. delete afflst;
  410. return 1;
  411. }
  412. }
  413. /* parse in the flag used by `needaffixs' */
  414. if (strncmp(line,"NEEDAFFIX",9) == 0) {
  415. if (parse_flag(line, &needaffix, "NEEDAFFIX")) {
  416. delete afflst;
  417. return 1;
  418. }
  419. }
  420. /* parse in the minimal length for words in compounds */
  421. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  422. if (parse_num(line, &cpdmin, "COMPOUNDMIN")) {
  423. delete afflst;
  424. return 1;
  425. }
  426. if (cpdmin < 1) cpdmin = 1;
  427. }
  428. /* parse in the max. words and syllables in compounds */
  429. if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
  430. if (parse_cpdsyllable(line)) {
  431. delete afflst;
  432. return 1;
  433. }
  434. }
  435. /* parse in the flag used by compound_check() method */
  436. if (strncmp(line,"SYLLABLENUM",11) == 0) {
  437. if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) {
  438. delete afflst;
  439. return 1;
  440. }
  441. }
  442. /* parse in the flag used by the controlled compound words */
  443. if (strncmp(line,"CHECKNUM",8) == 0) {
  444. checknum=1;
  445. }
  446. /* parse in the extra word characters */
  447. if (strncmp(line,"WORDCHARS",9) == 0) {
  448. if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) {
  449. delete afflst;
  450. return 1;
  451. }
  452. }
  453. /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
  454. if (strncmp(line,"IGNORE",6) == 0) {
  455. if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
  456. delete afflst;
  457. return 1;
  458. }
  459. }
  460. /* parse in the typical fault correcting table */
  461. if (strncmp(line,"REP",3) == 0) {
  462. if (parse_reptable(line, afflst)) {
  463. delete afflst;
  464. return 1;
  465. }
  466. }
  467. /* parse in the phonetic translation table */
  468. if (strncmp(line,"PHONE",5) == 0) {
  469. if (parse_phonetable(line, afflst)) {
  470. delete afflst;
  471. return 1;
  472. }
  473. }
  474. /* parse in the checkcompoundpattern table */
  475. if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
  476. if (parse_checkcpdtable(line, afflst)) {
  477. delete afflst;
  478. return 1;
  479. }
  480. }
  481. /* parse in the defcompound table */
  482. if (strncmp(line,"COMPOUNDRULE",12) == 0) {
  483. if (parse_defcpdtable(line, afflst)) {
  484. delete afflst;
  485. return 1;
  486. }
  487. }
  488. /* parse in the related character map table */
  489. if (strncmp(line,"MAP",3) == 0) {
  490. if (parse_maptable(line, afflst)) {
  491. delete afflst;
  492. return 1;
  493. }
  494. }
  495. /* parse in the word breakpoints table */
  496. if (strncmp(line,"BREAK",5) == 0) {
  497. if (parse_breaktable(line, afflst)) {
  498. delete afflst;
  499. return 1;
  500. }
  501. }
  502. /* parse in the language for language specific codes */
  503. if (strncmp(line,"LANG",4) == 0) {
  504. if (parse_string(line, &lang, "LANG")) {
  505. delete afflst;
  506. return 1;
  507. }
  508. langnum = get_lang_num(lang);
  509. }
  510. if (strncmp(line,"VERSION",7) == 0) {
  511. if (parse_string(line, &version, "VERSION")) {
  512. delete afflst;
  513. return 1;
  514. }
  515. }
  516. if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
  517. if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) {
  518. delete afflst;
  519. return 1;
  520. }
  521. }
  522. if (strncmp(line,"NOSPLITSUGS",11) == 0) {
  523. nosplitsugs=1;
  524. }
  525. if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
  526. sugswithdots=1;
  527. }
  528. /* parse in the flag used by forbidden words */
  529. if (strncmp(line,"KEEPCASE",8) == 0) {
  530. if (parse_flag(line, &keepcase, "KEEPCASE")) {
  531. delete afflst;
  532. return 1;
  533. }
  534. }
  535. /* parse in the flag used by the affix generator */
  536. if (strncmp(line,"SUBSTANDARD",11) == 0) {
  537. if (parse_flag(line, &substandard, "SUBSTANDARD")) {
  538. delete afflst;
  539. return 1;
  540. }
  541. }
  542. if (strncmp(line,"CHECKSHARPS",11) == 0) {
  543. checksharps=1;
  544. }
  545. /* parse this affix: P - prefix, S - suffix */
  546. ft = ' ';
  547. if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
  548. if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
  549. if (ft != ' ') {
  550. if (dupflags_ini) {
  551. for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0;
  552. dupflags_ini = 0;
  553. }
  554. if (parse_affix(line, ft, afflst, dupflags)) {
  555. delete afflst;
  556. process_pfx_tree_to_list();
  557. process_sfx_tree_to_list();
  558. return 1;
  559. }
  560. }
  561. }
  562. delete afflst;
  563. // convert affix trees to sorted list
  564. process_pfx_tree_to_list();
  565. process_sfx_tree_to_list();
  566. // now we can speed up performance greatly taking advantage of the
  567. // relationship between the affixes and the idea of "subsets".
  568. // View each prefix as a potential leading subset of another and view
  569. // each suffix (reversed) as a potential trailing subset of another.
  570. // To illustrate this relationship if we know the prefix "ab" is found in the
  571. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  572. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  573. // is a subset need be examined.
  574. // The same argument goes for suffix string that are reversed.
  575. // Then to top this off why not examine the first char of the word to quickly
  576. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  577. // be leading supersets of the first character of the word (if they exist)
  578. // To take advantage of this "subset" relationship, we need to add two links
  579. // from entry. One to take next if the current prefix is found (call it nexteq)
  580. // and one to take next if the current prefix is not found (call it nextne).
  581. // Since we have built ordered lists, all that remains is to properly intialize
  582. // the nextne and nexteq pointers that relate them
  583. process_pfx_order();
  584. process_sfx_order();
  585. /* get encoding for CHECKCOMPOUNDCASE */
  586. char * enc = get_encoding();
  587. csconv = get_current_cs(enc);
  588. free(enc);
  589. enc = NULL;
  590. #ifdef WINSHELL
  591. char expw[MAXLNLEN];
  592. if (wordchars) {
  593. strcpy(expw, wordchars);
  594. free(wordchars);
  595. } else *expw = '\0';
  596. for (int i = 0; i <= 255; i++) {
  597. if ( (csconv[i].cupper != csconv[i].clower) &&
  598. (! strchr(expw, (char) i))) {
  599. *(expw + strlen(expw) + 1) = '\0';
  600. *(expw + strlen(expw)) = (char) i;
  601. }
  602. }
  603. wordchars = mystrdup(expw);
  604. #endif
  605. // temporary BREAK definition for German dash handling (OOo issue 64400)
  606. if ((langnum == LANG_de) && (!breaktable)) {
  607. breaktable = (char **) malloc(sizeof(char *));
  608. if (!breaktable) return 1;
  609. breaktable[0] = mystrdup("-");
  610. numbreak = 1;
  611. }
  612. return 0;
  613. }
  614. // we want to be able to quickly access prefix information
  615. // both by prefix flag, and sorted by prefix string itself
  616. // so we need to set up two indexes
  617. int AffixMgr::build_pfxtree(AffEntry* pfxptr)
  618. {
  619. PfxEntry * ptr;
  620. PfxEntry * pptr;
  621. PfxEntry * ep = (PfxEntry*) pfxptr;
  622. // get the right starting points
  623. const char * key = ep->getKey();
  624. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  625. // first index by flag which must exist
  626. ptr = (PfxEntry*)pFlag[flg];
  627. ep->setFlgNxt(ptr);
  628. pFlag[flg] = (AffEntry *) ep;
  629. // handle the special case of null affix string
  630. if (strlen(key) == 0) {
  631. // always inset them at head of list at element 0
  632. ptr = (PfxEntry*)pStart[0];
  633. ep->setNext(ptr);
  634. pStart[0] = (AffEntry*)ep;
  635. return 0;
  636. }
  637. // now handle the normal case
  638. ep->setNextEQ(NULL);
  639. ep->setNextNE(NULL);
  640. unsigned char sp = *((const unsigned char *)key);
  641. ptr = (PfxEntry*)pStart[sp];
  642. // handle the first insert
  643. if (!ptr) {
  644. pStart[sp] = (AffEntry*)ep;
  645. return 0;
  646. }
  647. // otherwise use binary tree insertion so that a sorted
  648. // list can easily be generated later
  649. pptr = NULL;
  650. for (;;) {
  651. pptr = ptr;
  652. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  653. ptr = ptr->getNextEQ();
  654. if (!ptr) {
  655. pptr->setNextEQ(ep);
  656. break;
  657. }
  658. } else {
  659. ptr = ptr->getNextNE();
  660. if (!ptr) {
  661. pptr->setNextNE(ep);
  662. break;
  663. }
  664. }
  665. }
  666. return 0;
  667. }
  668. // we want to be able to quickly access suffix information
  669. // both by suffix flag, and sorted by the reverse of the
  670. // suffix string itself; so we need to set up two indexes
  671. int AffixMgr::build_sfxtree(AffEntry* sfxptr)
  672. {
  673. SfxEntry * ptr;
  674. SfxEntry * pptr;
  675. SfxEntry * ep = (SfxEntry *) sfxptr;
  676. /* get the right starting point */
  677. const char * key = ep->getKey();
  678. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  679. // first index by flag which must exist
  680. ptr = (SfxEntry*)sFlag[flg];
  681. ep->setFlgNxt(ptr);
  682. sFlag[flg] = (AffEntry *) ep;
  683. // next index by affix string
  684. // handle the special case of null affix string
  685. if (strlen(key) == 0) {
  686. // always inset them at head of list at element 0
  687. ptr = (SfxEntry*)sStart[0];
  688. ep->setNext(ptr);
  689. sStart[0] = (AffEntry*)ep;
  690. return 0;
  691. }
  692. // now handle the normal case
  693. ep->setNextEQ(NULL);
  694. ep->setNextNE(NULL);
  695. unsigned char sp = *((const unsigned char *)key);
  696. ptr = (SfxEntry*)sStart[sp];
  697. // handle the first insert
  698. if (!ptr) {
  699. sStart[sp] = (AffEntry*)ep;
  700. return 0;
  701. }
  702. // otherwise use binary tree insertion so that a sorted
  703. // list can easily be generated later
  704. pptr = NULL;
  705. for (;;) {
  706. pptr = ptr;
  707. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  708. ptr = ptr->getNextEQ();
  709. if (!ptr) {
  710. pptr->setNextEQ(ep);
  711. break;
  712. }
  713. } else {
  714. ptr = ptr->getNextNE();
  715. if (!ptr) {
  716. pptr->setNextNE(ep);
  717. break;
  718. }
  719. }
  720. }
  721. return 0;
  722. }
  723. // convert from binary tree to sorted list
  724. int AffixMgr::process_pfx_tree_to_list()
  725. {
  726. for (int i=1; i< SETSIZE; i++) {
  727. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  728. }
  729. return 0;
  730. }
  731. AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
  732. {
  733. if (ptr) {
  734. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
  735. ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
  736. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
  737. }
  738. return nptr;
  739. }
  740. // convert from binary tree to sorted list
  741. int AffixMgr:: process_sfx_tree_to_list()
  742. {
  743. for (int i=1; i< SETSIZE; i++) {
  744. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  745. }
  746. return 0;
  747. }
  748. AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
  749. {
  750. if (ptr) {
  751. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
  752. ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
  753. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
  754. }
  755. return nptr;
  756. }
  757. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  758. // using the idea of leading subsets this time
  759. int AffixMgr::process_pfx_order()
  760. {
  761. PfxEntry* ptr;
  762. // loop through each prefix list starting point
  763. for (int i=1; i < SETSIZE; i++) {
  764. ptr = (PfxEntry*)pStart[i];
  765. // look through the remainder of the list
  766. // and find next entry with affix that
  767. // the current one is not a subset of
  768. // mark that as destination for NextNE
  769. // use next in list that you are a subset
  770. // of as NextEQ
  771. for (; ptr != NULL; ptr = ptr->getNext()) {
  772. PfxEntry * nptr = ptr->getNext();
  773. for (; nptr != NULL; nptr = nptr->getNext()) {
  774. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  775. }
  776. ptr->setNextNE(nptr);
  777. ptr->setNextEQ(NULL);
  778. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  779. ptr->setNextEQ(ptr->getNext());
  780. }
  781. // now clean up by adding smart search termination strings:
  782. // if you are already a superset of the previous prefix
  783. // but not a subset of the next, search can end here
  784. // so set NextNE properly
  785. ptr = (PfxEntry *) pStart[i];
  786. for (; ptr != NULL; ptr = ptr->getNext()) {
  787. PfxEntry * nptr = ptr->getNext();
  788. PfxEntry * mptr = NULL;
  789. for (; nptr != NULL; nptr = nptr->getNext()) {
  790. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  791. mptr = nptr;
  792. }
  793. if (mptr) mptr->setNextNE(NULL);
  794. }
  795. }
  796. return 0;
  797. }
  798. // initialize the SfxEntry links NextEQ and NextNE to speed searching
  799. // using the idea of leading subsets this time
  800. int AffixMgr::process_sfx_order()
  801. {
  802. SfxEntry* ptr;
  803. // loop through each prefix list starting point
  804. for (int i=1; i < SETSIZE; i++) {
  805. ptr = (SfxEntry *) sStart[i];
  806. // look through the remainder of the list
  807. // and find next entry with affix that
  808. // the current one is not a subset of
  809. // mark that as destination for NextNE
  810. // use next in list that you are a subset
  811. // of as NextEQ
  812. for (; ptr != NULL; ptr = ptr->getNext()) {
  813. SfxEntry * nptr = ptr->getNext();
  814. for (; nptr != NULL; nptr = nptr->getNext()) {
  815. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  816. }
  817. ptr->setNextNE(nptr);
  818. ptr->setNextEQ(NULL);
  819. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  820. ptr->setNextEQ(ptr->getNext());
  821. }
  822. // now clean up by adding smart search termination strings:
  823. // if you are already a superset of the previous suffix
  824. // but not a subset of the next, search can end here
  825. // so set NextNE properly
  826. ptr = (SfxEntry *) sStart[i];
  827. for (; ptr != NULL; ptr = ptr->getNext()) {
  828. SfxEntry * nptr = ptr->getNext();
  829. SfxEntry * mptr = NULL;
  830. for (; nptr != NULL; nptr = nptr->getNext()) {
  831. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  832. mptr = nptr;
  833. }
  834. if (mptr) mptr->setNextNE(NULL);
  835. }
  836. }
  837. return 0;
  838. }
  839. // add flags to the result for dictionary debugging
  840. void AffixMgr::debugflag(char * result, unsigned short flag) {
  841. char * st = encode_flag(flag);
  842. strcat(result, " ");
  843. strcat(result, MORPH_FLAG);
  844. strcat(result, st);
  845. free(st);
  846. }
  847. // calculate the character length of the condition
  848. int AffixMgr::condlen(char * st)
  849. {
  850. int l = 0;
  851. bool group = false;
  852. for(; *st; st++) {
  853. if (*st == '[') {
  854. group = true;
  855. l++;
  856. } else if (*st == ']') group = false;
  857. else if (!group && (!utf8 ||
  858. (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
  859. }
  860. return l;
  861. }
  862. int AffixMgr::encodeit(struct affentry * ptr, char * cs)
  863. {
  864. if (strcmp(cs,".") != 0) {
  865. ptr->numconds = (char) condlen(cs);
  866. strncpy(ptr->c.conds, cs, MAXCONDLEN);
  867. // long condition (end of conds padded by strncpy)
  868. if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
  869. ptr->opts += aeLONGCOND;
  870. ptr->c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
  871. }
  872. } else {
  873. ptr->numconds = 0;
  874. ptr->c.conds[0] = '\0';
  875. }
  876. return 0;
  877. }
  878. // return 1 if s1 is a leading subset of s2 (dots are for infixes)
  879. inline int AffixMgr::isSubset(const char * s1, const char * s2)
  880. {
  881. while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
  882. s1++;
  883. s2++;
  884. }
  885. return (*s1 == '\0');
  886. }
  887. // check word for prefixes
  888. struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
  889. const FLAG needflag)
  890. {
  891. struct hentry * rv= NULL;
  892. pfx = NULL;
  893. pfxappnd = NULL;
  894. sfxappnd = NULL;
  895. // first handle the special case of 0 length prefixes
  896. PfxEntry * pe = (PfxEntry *) pStart[0];
  897. while (pe) {
  898. if (
  899. // fogemorpheme
  900. ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
  901. (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
  902. // permit prefixes in compounds
  903. ((in_compound != IN_CPD_END) || (pe->getCont() &&
  904. (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
  905. ) {
  906. // check prefix
  907. rv = pe->checkword(word, len, in_compound, needflag);
  908. if (rv) {
  909. pfx=(AffEntry *)pe; // BUG: pfx not stateless
  910. return rv;
  911. }
  912. }
  913. pe = pe->getNext();
  914. }
  915. // now handle the general case
  916. unsigned char sp = *((const unsigned char *)word);
  917. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  918. while (pptr) {
  919. if (isSubset(pptr->getKey(),word)) {
  920. if (
  921. // fogemorpheme
  922. ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
  923. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
  924. // permit prefixes in compounds
  925. ((in_compound != IN_CPD_END) || (pptr->getCont() &&
  926. (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
  927. ) {
  928. // check prefix
  929. rv = pptr->checkword(word, len, in_compound, needflag);
  930. if (rv) {
  931. pfx=(AffEntry *)pptr; // BUG: pfx not stateless
  932. return rv;
  933. }
  934. }
  935. pptr = pptr->getNextEQ();
  936. } else {
  937. pptr = pptr->getNextNE();
  938. }
  939. }
  940. return NULL;
  941. }
  942. // check word for prefixes
  943. struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
  944. char in_compound, const FLAG needflag)
  945. {
  946. struct hentry * rv= NULL;
  947. pfx = NULL;
  948. sfxappnd = NULL;
  949. // first handle the special case of 0 length prefixes
  950. PfxEntry * pe = (PfxEntry *) pStart[0];
  951. while (pe) {
  952. rv = pe->check_twosfx(word, len, in_compound, needflag);
  953. if (rv) return rv;
  954. pe = pe->getNext();
  955. }
  956. // now handle the general case
  957. unsigned char sp = *((const unsigned char *)word);
  958. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  959. while (pptr) {
  960. if (isSubset(pptr->getKey(),word)) {
  961. rv = pptr->check_twosfx(word, len, in_compound, needflag);
  962. if (rv) {
  963. pfx = (AffEntry *)pptr;
  964. return rv;
  965. }
  966. pptr = pptr->getNextEQ();
  967. } else {
  968. pptr = pptr->getNextNE();
  969. }
  970. }
  971. return NULL;
  972. }
  973. // check word for prefixes
  974. char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
  975. const FLAG needflag)
  976. {
  977. char * st;
  978. char result[MAXLNLEN];
  979. result[0] = '\0';
  980. pfx = NULL;
  981. sfxappnd = NULL;
  982. // first handle the special case of 0 length prefixes
  983. PfxEntry * pe = (PfxEntry *) pStart[0];
  984. while (pe) {
  985. st = pe->check_morph(word,len,in_compound, needflag);
  986. if (st) {
  987. strcat(result, st);
  988. free(st);
  989. }
  990. // if (rv) return rv;
  991. pe = pe->getNext();
  992. }
  993. // now handle the general case
  994. unsigned char sp = *((const unsigned char *)word);
  995. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  996. while (pptr) {
  997. if (isSubset(pptr->getKey(),word)) {
  998. st = pptr->check_morph(word,len,in_compound, needflag);
  999. if (st) {
  1000. // fogemorpheme
  1001. if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
  1002. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
  1003. strcat(result, st);
  1004. pfx = (AffEntry *)pptr;
  1005. }
  1006. free(st);
  1007. }
  1008. pptr = pptr->getNextEQ();
  1009. } else {
  1010. pptr = pptr->getNextNE();
  1011. }
  1012. }
  1013. if (*result) return mystrdup(result);
  1014. return NULL;
  1015. }
  1016. // check word for prefixes
  1017. char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
  1018. char in_compound, const FLAG needflag)
  1019. {
  1020. char * st;
  1021. char result[MAXLNLEN];
  1022. result[0] = '\0';
  1023. pfx = NULL;
  1024. sfxappnd = NULL;
  1025. // first handle the special case of 0 length prefixes
  1026. PfxEntry * pe = (PfxEntry *) pStart[0];
  1027. while (pe) {
  1028. st = pe->check_twosfx_morph(word,len,in_compound, needflag);
  1029. if (st) {
  1030. strcat(result, st);
  1031. free(st);
  1032. }
  1033. pe = pe->getNext();
  1034. }
  1035. // now handle the general case
  1036. unsigned char sp = *((const unsigned char *)word);
  1037. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  1038. while (pptr) {
  1039. if (isSubset(pptr->getKey(),word)) {
  1040. st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
  1041. if (st) {
  1042. strcat(result, st);
  1043. free(st);
  1044. pfx = (AffEntry *)pptr;
  1045. }
  1046. pptr = pptr->getNextEQ();
  1047. } else {
  1048. pptr = pptr->getNextNE();
  1049. }
  1050. }
  1051. if (*result) return mystrdup(result);
  1052. return NULL;
  1053. }
  1054. // Is word a non compound with a REP substitution (see checkcompoundrep)?
  1055. int AffixMgr::cpdrep_check(const char * word, int wl)
  1056. {
  1057. char candidate[MAXLNLEN];
  1058. const char * r;
  1059. int lenr, lenp;
  1060. if ((wl < 2) || !numrep) return 0;
  1061. for (int i=0; i < numrep; i++ ) {
  1062. r = word;
  1063. lenr = strlen(reptable[i].pattern2);
  1064. lenp = strlen(reptable[i].pattern);
  1065. // search every occurence of the pattern in the word
  1066. while ((r=strstr(r, reptable[i].pattern)) != NULL) {
  1067. strcpy(candidate, word);
  1068. if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
  1069. strcpy(candidate+(r-word),reptable[i].pattern2);
  1070. strcpy(candidate+(r-word)+lenr, r+lenp);
  1071. if (candidate_check(candidate,strlen(candidate))) return 1;
  1072. r++; // search for the next letter
  1073. }
  1074. }
  1075. return 0;
  1076. }
  1077. // forbid compoundings when there are special patterns at word bound
  1078. int AffixMgr::cpdpat_check(const char * word, int pos)
  1079. {
  1080. int len;
  1081. for (int i = 0; i < numcheckcpd; i++) {
  1082. if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
  1083. (len = strlen(checkcpdtable[i].pattern)) && (pos > len) &&
  1084. (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1;
  1085. }
  1086. return 0;
  1087. }
  1088. // forbid compounding with neighbouring upper and lower case characters at word bounds
  1089. int AffixMgr::cpdcase_check(const char * word, int pos)
  1090. {
  1091. if (utf8) {
  1092. w_char u, w;
  1093. const char * p;
  1094. u8_u16(&u, 1, word + pos);
  1095. for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
  1096. u8_u16(&w, 1, p);
  1097. unsigned short a = (u.h << 8) + u.l;
  1098. unsigned short b = (w.h << 8) + w.l;
  1099. if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b))) return 1;
  1100. } else {
  1101. unsigned char a = *(word + pos - 1);
  1102. unsigned char b = *(word + pos);
  1103. if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  1104. }
  1105. return 0;
  1106. }
  1107. // check compound patterns
  1108. int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
  1109. {
  1110. signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  1111. signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  1112. int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
  1113. short bt = 0;
  1114. int i;
  1115. int ok;
  1116. int w = 0;
  1117. if (!*words) {
  1118. w = 1;
  1119. *words = def;
  1120. }
  1121. (*words)[wnum] = rv;
  1122. for (i = 0; i < numdefcpd; i++) {
  1123. signed short pp = 0; // pattern position
  1124. signed short wp = 0; // "words" position
  1125. int ok2;
  1126. ok = 1;
  1127. ok2 = 1;
  1128. do {
  1129. while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
  1130. if (((pp+1) < defcpdtable[i].len) &&
  1131. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
  1132. int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
  1133. ok2 = 1;
  1134. pp+=2;
  1135. btpp[bt] = pp;
  1136. btwp[bt] = wp;
  1137. while (wp <= wend) {
  1138. if (!(*words)[wp]->alen ||
  1139. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
  1140. ok2 = 0;
  1141. break;
  1142. }
  1143. wp++;
  1144. }
  1145. if (wp <= wnum) ok2 = 0;
  1146. btnum[bt] = wp - btwp[bt];
  1147. if (btnum[bt] > 0) bt++;
  1148. if (ok2) break;
  1149. } else {
  1150. ok2 = 1;
  1151. if (!(*words)[wp] || !(*words)[wp]->alen ||
  1152. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
  1153. ok = 0;
  1154. break;
  1155. }
  1156. pp++;
  1157. wp++;
  1158. if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
  1159. }
  1160. }
  1161. if (ok && ok2) {
  1162. int r = pp;
  1163. while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
  1164. ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
  1165. if (defcpdtable[i].len <= r) return 1;
  1166. }
  1167. // backtrack
  1168. if (bt) do {
  1169. ok = 1;
  1170. btnum[bt - 1]--;
  1171. pp = btpp[bt - 1];
  1172. wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
  1173. } while ((btnum[bt - 1] < 0) && --bt);
  1174. } while (bt);
  1175. if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  1176. // check zero ending
  1177. while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
  1178. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  1179. if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  1180. }
  1181. (*words)[wnum] = NULL;
  1182. if (w) *words = NULL;
  1183. return 0;
  1184. }
  1185. inline int AffixMgr::candidate_check(const char * word, int len)
  1186. {
  1187. struct hentry * rv=NULL;
  1188. rv = lookup(word);
  1189. if (rv) return 1;
  1190. // rv = prefix_check(word,len,1);
  1191. // if (rv) return 1;
  1192. rv = affix_check(word,len);
  1193. if (rv) return 1;
  1194. return 0;
  1195. }
  1196. // calculate number of syllable for compound-checking
  1197. short AffixMgr::get_syllable(const char * word, int wlen)
  1198. {
  1199. if (cpdmaxsyllable==0) return 0;
  1200. short num=0;
  1201. if (!utf8) {
  1202. for (int i=0; i<wlen; i++) {
  1203. if (strchr(cpdvowels, word[i])) num++;
  1204. }
  1205. } else if (cpdvowels_utf16) {
  1206. w_char w[MAXWORDUTF8LEN];
  1207. int i = u8_u16(w, MAXWORDUTF8LEN, word);
  1208. for (; i > 0; i--) {
  1209. if (flag_bsearch((unsigned short *) cpdvowels_utf16,
  1210. ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
  1211. }
  1212. }
  1213. return num;
  1214. }
  1215. // check if compound word is correctly spelled
  1216. // hu_mov_rule = spec. Hungarian rule (XXX)
  1217. struct hentry * AffixMgr::compound_check(const char * word, int len,
  1218. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
  1219. char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0)
  1220. {
  1221. int i;
  1222. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1223. int oldcmpdstemnum = 0;
  1224. struct hentry * rv = NULL;
  1225. struct hentry * rv_first;
  1226. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1227. char st [MAXWORDUTF8LEN + 4];
  1228. char ch;
  1229. int cmin;
  1230. int cmax;
  1231. int checked_prefix;
  1232. #ifdef HUNSTEM
  1233. if (cmpdstemnum) {
  1234. if (wordnum == 0) {
  1235. *cmpdstemnum = 1;
  1236. } else {
  1237. (*cmpdstemnum)++;
  1238. }
  1239. }
  1240. #endif
  1241. if (utf8) {
  1242. for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
  1243. cmin++;
  1244. for (; (word[cmin] & 0xc0) == 0x80; cmin++);
  1245. }
  1246. for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
  1247. cmax--;
  1248. for (; (word[cmax] & 0xc0) == 0x80; cmax--);
  1249. }
  1250. } else {
  1251. cmin = cpdmin;
  1252. cmax = len - cpdmin + 1;
  1253. }
  1254. strcpy(st, word);
  1255. for (i = cmin; i < cmax; i++) {
  1256. oldnumsyllable = numsyllable;
  1257. oldwordnum = wordnum;
  1258. checked_prefix = 0;
  1259. // go to end of the UTF-8 character
  1260. if (utf8) {
  1261. for (; (st[i] & 0xc0) == 0x80; i++);
  1262. if (i >= cmax) return NULL;
  1263. }
  1264. ch = st[i];
  1265. st[i] = '\0';
  1266. sfx = NULL;
  1267. pfx = NULL;
  1268. // FIRST WORD
  1269. rv = lookup(st); // perhaps without prefix
  1270. // search homonym with compound flag
  1271. while ((rv) && !hu_mov_rule &&
  1272. ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1273. !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1274. (compoundbegin && !wordnum &&
  1275. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1276. (compoundmiddle && wordnum && !words &&
  1277. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1278. (numdefcpd &&
  1279. ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
  1280. (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
  1281. ))) {
  1282. rv = rv->next_homonym;
  1283. }
  1284. if (!rv) {
  1285. if (compoundflag &&
  1286. !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
  1287. if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
  1288. FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
  1289. ((SfxEntry*)sfx)->getCont() &&
  1290. ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1291. ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
  1292. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1293. ((SfxEntry*)sfx)->getContLen())))) {
  1294. rv = NULL;
  1295. }
  1296. }
  1297. if (rv ||
  1298. (((wordnum == 0) && compoundbegin &&
  1299. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1300. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
  1301. ((wordnum > 0) && compoundmiddle &&
  1302. ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
  1303. (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
  1304. ) checked_prefix = 1;
  1305. // else check forbiddenwords and needaffix
  1306. } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1307. TESTAFF(rv->astr, needaffix, rv->alen) ||
  1308. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
  1309. )) {
  1310. st[i] = ch;
  1311. continue;
  1312. }
  1313. // check non_compound flag in suffix and prefix
  1314. if ((rv) && !hu_mov_rule &&
  1315. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1316. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
  1317. ((PfxEntry*)pfx)->getContLen())) ||
  1318. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1319. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
  1320. ((SfxEntry*)sfx)->getContLen())))) {
  1321. rv = NULL;
  1322. }
  1323. // check compoundend flag in suffix and prefix
  1324. if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
  1325. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1326. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
  1327. ((PfxEntry*)pfx)->getContLen())) ||
  1328. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1329. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
  1330. ((SfxEntry*)sfx)->getContLen())))) {
  1331. rv = NULL;
  1332. }
  1333. // check compoundmiddle flag in suffix and prefix
  1334. if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
  1335. ((pfx && ((PfxEntry*)pfx)->getCont() &&
  1336. TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
  1337. ((PfxEntry*)pfx)->getContLen())) ||
  1338. (sfx && ((SfxEntry*)sfx)->getCont() &&
  1339. TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
  1340. ((SfxEntry*)sfx)->getContLen())))) {
  1341. rv = NULL;
  1342. }
  1343. // check forbiddenwords
  1344. if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
  1345. (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
  1346. return NULL;
  1347. }
  1348. // increment word number, if the second root has a compoundroot flag
  1349. if ((rv) && compoundroot &&
  1350. (TESTAFF(rv->astr, compoundroot, rv->alen))) {
  1351. wordnum++;
  1352. }
  1353. // first word is acceptable in compound words?
  1354. if (((rv) &&
  1355. ( checked_prefix || (words && words[wnum]) ||
  1356. (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1357. ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1358. ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
  1359. // (numdefcpd && )
  1360. // LANG_hu section: spec. Hungarian rule
  1361. || ((langnum == LANG_hu) && hu_mov_rule && (
  1362. TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
  1363. TESTAFF(rv->astr, 'G', rv->alen) ||
  1364. TESTAFF(rv->astr, 'H', rv->alen)
  1365. )
  1366. )
  1367. // END of LANG_hu section
  1368. )
  1369. && ! (( checkcompoundtriple && // test triple letters
  1370. (word[i-1]==word[i]) && (
  1371. ((i>1) && (word[i-1]==word[i-2])) ||
  1372. ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
  1373. )
  1374. ) ||
  1375. (
  1376. // test CHECKCOMPOUNDPATTERN
  1377. numcheckcpd && cpdpat_check(word, i)
  1378. ) ||
  1379. (
  1380. checkcompoundcase && cpdcase_check(word, i)
  1381. ))
  1382. )
  1383. // LANG_hu section: spec. Hungarian rule
  1384. || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
  1385. (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes
  1386. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
  1387. TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
  1388. )
  1389. )
  1390. )
  1391. // END of LANG_hu section
  1392. ) {
  1393. // LANG_hu section: spec. Hungarian rule
  1394. if (langnum == LANG_hu) {
  1395. // calculate syllable number of the word
  1396. numsyllable += get_syllable(st, i);
  1397. // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
  1398. if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
  1399. }
  1400. // END of LANG_hu section
  1401. #ifdef HUNSTEM
  1402. if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i;
  1403. #endif
  1404. // NEXT WORD(S)
  1405. rv_first = rv;
  1406. rv = lookup((word+i)); // perhaps without prefix
  1407. // …

Large files files are truncated, but you can click here to view the full file