PageRenderTime 36ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 1ms

/extensions/spellcheck/hunspell/src/affixmgr.cpp

http://github.com/zpao/v8monkey
C++ | 4575 lines | 3604 code | 489 blank | 482 comment | 1549 complexity | 240bb4bb862d7846dbcab5744698c2eb MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-3.0, AGPL-1.0, LGPL-2.1, BSD-3-Clause, GPL-2.0, JSON, Apache-2.0, 0BSD

Large files are truncated, but you can click here to view the full file

  1. /******* BEGIN LICENSE BLOCK *******
  2. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  3. *
  4. * The contents of this file are subject to the Mozilla Public License Version
  5. * 1.1 (the "License"); you may not use this file except in compliance with
  6. * the License. You may obtain a copy of the License at
  7. * http://www.mozilla.org/MPL/
  8. *
  9. * Software distributed under the License is distributed on an "AS IS" basis,
  10. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11. * for the specific language governing rights and limitations under the
  12. * License.
  13. *
  14. * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
  15. * and László Németh (Hunspell). Portions created by the Initial Developers
  16. * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
  17. *
  18. * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
  19. * David Einstein (deinst@world.std.com)
  20. * László Németh (nemethl@gyorsposta.hu)
  21. * Caolan McNamara (caolanm@redhat.com)
  22. * Davide Prina
  23. * Giuseppe Modugno
  24. * Gianluca Turconi
  25. * Simon Brouwer
  26. * Noll Janos
  27. * Biro Arpad
  28. * Goldman Eleonora
  29. * Sarlos Tamas
  30. * Bencsath Boldizsar
  31. * Halacsy Peter
  32. * Dvornik Laszlo
  33. * Gefferth Andras
  34. * Nagy Viktor
  35. * Varga Daniel
  36. * Chris Halls
  37. * Rene Engelhard
  38. * Bram Moolenaar
  39. * Dafydd Jones
  40. * Harri Pitkanen
  41. * Andras Timar
  42. * Tor Lillqvist
  43. *
  44. * Alternatively, the contents of this file may be used under the terms of
  45. * either the GNU General Public License Version 2 or later (the "GPL"), or
  46. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  47. * in which case the provisions of the GPL or the LGPL are applicable instead
  48. * of those above. If you wish to allow use of your version of this file only
  49. * under the terms of either the GPL or the LGPL, and not to allow others to
  50. * use your version of this file under the terms of the MPL, indicate your
  51. * decision by deleting the provisions above and replace them with the notice
  52. * and other provisions required by the GPL or the LGPL. If you do not delete
  53. * the provisions above, a recipient may use your version of this file under
  54. * the terms of any one of the MPL, the GPL or the LGPL.
  55. *
  56. ******* END LICENSE BLOCK *******/
  57. #include <stdlib.h>
  58. #include <string.h>
  59. #include <stdio.h>
  60. #include <ctype.h>
  61. #include <vector>
  62. #include "affixmgr.hxx"
  63. #include "affentry.hxx"
  64. #include "langnum.hxx"
  65. #include "csutil.hxx"
  66. AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
  67. {
  68. // register hash manager and load affix data from aff file
  69. pHMgr = ptr[0];
  70. alldic = ptr;
  71. maxdic = md;
  72. keystring = NULL;
  73. trystring = NULL;
  74. encoding=NULL;
  75. csconv=NULL;
  76. utf8 = 0;
  77. complexprefixes = 0;
  78. maptable = NULL;
  79. nummap = 0;
  80. breaktable = NULL;
  81. numbreak = -1;
  82. reptable = NULL;
  83. numrep = 0;
  84. iconvtable = NULL;
  85. oconvtable = NULL;
  86. checkcpdtable = NULL;
  87. // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
  88. simplifiedcpd = 0;
  89. numcheckcpd = 0;
  90. defcpdtable = NULL;
  91. numdefcpd = 0;
  92. phone = NULL;
  93. compoundflag = FLAG_NULL; // permits word in compound forms
  94. compoundbegin = FLAG_NULL; // may be first word in compound forms
  95. compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  96. compoundend = FLAG_NULL; // may be last word in compound forms
  97. compoundroot = FLAG_NULL; // compound word signing flag
  98. compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  99. compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  100. checkcompounddup = 0; // forbid double words in compounds
  101. checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  102. checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  103. checkcompoundtriple = 0; // forbid compounds with triple letters
  104. simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
  105. forbiddenword = FORBIDDENWORD; // forbidden word signing flag
  106. nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  107. nongramsuggest = FLAG_NULL;
  108. lang = NULL; // language
  109. langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  110. needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
  111. cpdwordmax = -1; // default: unlimited wordcount in compound words
  112. cpdmin = -1; // undefined
  113. cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  114. cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  115. cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  116. cpdvowels_utf16_len=0; // vowels
  117. pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  118. sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  119. cpdsyllablenum=NULL; // syllable count incrementing flag
  120. checknum=0; // checking numbers, and word with numbers
  121. wordchars=NULL; // letters + spec. word characters
  122. wordchars_utf16=NULL; // letters + spec. word characters
  123. wordchars_utf16_len=0; // letters + spec. word characters
  124. ignorechars=NULL; // letters + spec. word characters
  125. ignorechars_utf16=NULL; // letters + spec. word characters
  126. ignorechars_utf16_len=0; // letters + spec. word characters
  127. version=NULL; // affix and dictionary file version string
  128. havecontclass=0; // flags of possible continuing classes (double affix)
  129. // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  130. // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  131. lemma_present = FLAG_NULL;
  132. circumfix = FLAG_NULL;
  133. onlyincompound = FLAG_NULL;
  134. maxngramsugs = -1; // undefined
  135. maxdiff = -1; // undefined
  136. onlymaxdiff = 0;
  137. maxcpdsugs = -1; // undefined
  138. nosplitsugs = 0;
  139. sugswithdots = 0;
  140. keepcase = 0;
  141. forceucase = 0;
  142. warn = 0;
  143. forbidwarn = 0;
  144. checksharps = 0;
  145. substandard = FLAG_NULL;
  146. fullstrip = 0;
  147. sfx = NULL;
  148. pfx = NULL;
  149. for (int i=0; i < SETSIZE; i++) {
  150. pStart[i] = NULL;
  151. sStart[i] = NULL;
  152. pFlag[i] = NULL;
  153. sFlag[i] = NULL;
  154. }
  155. for (int j=0; j < CONTSIZE; j++) {
  156. contclasses[j] = 0;
  157. }
  158. if (parse_file(affpath, key)) {
  159. HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  160. }
  161. if (cpdmin == -1) cpdmin = MINCPDLEN;
  162. }
  163. AffixMgr::~AffixMgr()
  164. {
  165. // pass through linked prefix entries and clean up
  166. for (int i=0; i < SETSIZE ;i++) {
  167. pFlag[i] = NULL;
  168. PfxEntry * ptr = pStart[i];
  169. PfxEntry * nptr = NULL;
  170. while (ptr) {
  171. nptr = ptr->getNext();
  172. delete(ptr);
  173. ptr = nptr;
  174. nptr = NULL;
  175. }
  176. }
  177. // pass through linked suffix entries and clean up
  178. for (int j=0; j < SETSIZE ; j++) {
  179. sFlag[j] = NULL;
  180. SfxEntry * ptr = sStart[j];
  181. SfxEntry * nptr = NULL;
  182. while (ptr) {
  183. nptr = ptr->getNext();
  184. delete(ptr);
  185. ptr = nptr;
  186. nptr = NULL;
  187. }
  188. sStart[j] = NULL;
  189. }
  190. if (keystring) free(keystring);
  191. keystring=NULL;
  192. if (trystring) free(trystring);
  193. trystring=NULL;
  194. if (encoding) free(encoding);
  195. encoding=NULL;
  196. if (maptable) {
  197. for (int j=0; j < nummap; j++) {
  198. for (int k=0; k < maptable[j].len; k++) {
  199. if (maptable[j].set[k]) free(maptable[j].set[k]);
  200. }
  201. free(maptable[j].set);
  202. maptable[j].set = NULL;
  203. maptable[j].len = 0;
  204. }
  205. free(maptable);
  206. maptable = NULL;
  207. }
  208. nummap = 0;
  209. if (breaktable) {
  210. for (int j=0; j < numbreak; j++) {
  211. if (breaktable[j]) free(breaktable[j]);
  212. breaktable[j] = NULL;
  213. }
  214. free(breaktable);
  215. breaktable = NULL;
  216. }
  217. numbreak = 0;
  218. if (reptable) {
  219. for (int j=0; j < numrep; j++) {
  220. free(reptable[j].pattern);
  221. free(reptable[j].pattern2);
  222. }
  223. free(reptable);
  224. reptable = NULL;
  225. }
  226. if (iconvtable) delete iconvtable;
  227. if (oconvtable) delete oconvtable;
  228. if (phone && phone->rules) {
  229. for (int j=0; j < phone->num + 1; j++) {
  230. free(phone->rules[j * 2]);
  231. free(phone->rules[j * 2 + 1]);
  232. }
  233. free(phone->rules);
  234. free(phone);
  235. phone = NULL;
  236. }
  237. if (defcpdtable) {
  238. for (int j=0; j < numdefcpd; j++) {
  239. free(defcpdtable[j].def);
  240. defcpdtable[j].def = NULL;
  241. }
  242. free(defcpdtable);
  243. defcpdtable = NULL;
  244. }
  245. numrep = 0;
  246. if (checkcpdtable) {
  247. for (int j=0; j < numcheckcpd; j++) {
  248. free(checkcpdtable[j].pattern);
  249. free(checkcpdtable[j].pattern2);
  250. free(checkcpdtable[j].pattern3);
  251. checkcpdtable[j].pattern = NULL;
  252. checkcpdtable[j].pattern2 = NULL;
  253. checkcpdtable[j].pattern3 = NULL;
  254. }
  255. free(checkcpdtable);
  256. checkcpdtable = NULL;
  257. }
  258. numcheckcpd = 0;
  259. FREE_FLAG(compoundflag);
  260. FREE_FLAG(compoundbegin);
  261. FREE_FLAG(compoundmiddle);
  262. FREE_FLAG(compoundend);
  263. FREE_FLAG(compoundpermitflag);
  264. FREE_FLAG(compoundforbidflag);
  265. FREE_FLAG(compoundroot);
  266. FREE_FLAG(forbiddenword);
  267. FREE_FLAG(nosuggest);
  268. FREE_FLAG(nongramsuggest);
  269. FREE_FLAG(needaffix);
  270. FREE_FLAG(lemma_present);
  271. FREE_FLAG(circumfix);
  272. FREE_FLAG(onlyincompound);
  273. cpdwordmax = 0;
  274. pHMgr = NULL;
  275. cpdmin = 0;
  276. cpdmaxsyllable = 0;
  277. if (cpdvowels) free(cpdvowels);
  278. if (cpdvowels_utf16) free(cpdvowels_utf16);
  279. if (cpdsyllablenum) free(cpdsyllablenum);
  280. free_utf_tbl();
  281. if (lang) free(lang);
  282. if (wordchars) free(wordchars);
  283. if (wordchars_utf16) free(wordchars_utf16);
  284. if (ignorechars) free(ignorechars);
  285. if (ignorechars_utf16) free(ignorechars_utf16);
  286. if (version) free(version);
  287. checknum=0;
  288. #ifdef MOZILLA_CLIENT
  289. delete [] csconv;
  290. #endif
  291. }
  292. // read in aff file and build up prefix and suffix entry objects
  293. int AffixMgr::parse_file(const char * affpath, const char * key)
  294. {
  295. char * line; // io buffers
  296. char ft; // affix type
  297. // checking flag duplication
  298. char dupflags[CONTSIZE];
  299. char dupflags_ini = 1;
  300. // first line indicator for removing byte order mark
  301. int firstline = 1;
  302. // open the affix file
  303. FileMgr * afflst = new FileMgr(affpath, key);
  304. if (!afflst) {
  305. HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
  306. return 1;
  307. }
  308. // step one is to parse the affix file building up the internal
  309. // affix data structures
  310. // read in each line ignoring any that do not
  311. // start with a known line type indicator
  312. while ((line = afflst->getline())) {
  313. mychomp(line);
  314. /* remove byte order mark */
  315. if (firstline) {
  316. firstline = 0;
  317. // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
  318. if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
  319. memmove(line, line+3, strlen(line+3)+1);
  320. }
  321. }
  322. /* parse in the keyboard string */
  323. if (strncmp(line,"KEY",3) == 0) {
  324. if (parse_string(line, &keystring, afflst->getlinenum())) {
  325. delete afflst;
  326. return 1;
  327. }
  328. }
  329. /* parse in the try string */
  330. if (strncmp(line,"TRY",3) == 0) {
  331. if (parse_string(line, &trystring, afflst->getlinenum())) {
  332. delete afflst;
  333. return 1;
  334. }
  335. }
  336. /* parse in the name of the character set used by the .dict and .aff */
  337. if (strncmp(line,"SET",3) == 0) {
  338. if (parse_string(line, &encoding, afflst->getlinenum())) {
  339. delete afflst;
  340. return 1;
  341. }
  342. if (strcmp(encoding, "UTF-8") == 0) {
  343. utf8 = 1;
  344. #ifndef OPENOFFICEORG
  345. #ifndef MOZILLA_CLIENT
  346. if (initialize_utf_tbl()) return 1;
  347. #endif
  348. #endif
  349. }
  350. }
  351. /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
  352. if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
  353. complexprefixes = 1;
  354. /* parse in the flag used by the controlled compound words */
  355. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  356. if (parse_flag(line, &compoundflag, afflst)) {
  357. delete afflst;
  358. return 1;
  359. }
  360. }
  361. /* parse in the flag used by compound words */
  362. if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
  363. if (complexprefixes) {
  364. if (parse_flag(line, &compoundend, afflst)) {
  365. delete afflst;
  366. return 1;
  367. }
  368. } else {
  369. if (parse_flag(line, &compoundbegin, afflst)) {
  370. delete afflst;
  371. return 1;
  372. }
  373. }
  374. }
  375. /* parse in the flag used by compound words */
  376. if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
  377. if (parse_flag(line, &compoundmiddle, afflst)) {
  378. delete afflst;
  379. return 1;
  380. }
  381. }
  382. /* parse in the flag used by compound words */
  383. if (strncmp(line,"COMPOUNDEND",11) == 0) {
  384. if (complexprefixes) {
  385. if (parse_flag(line, &compoundbegin, afflst)) {
  386. delete afflst;
  387. return 1;
  388. }
  389. } else {
  390. if (parse_flag(line, &compoundend, afflst)) {
  391. delete afflst;
  392. return 1;
  393. }
  394. }
  395. }
  396. /* parse in the data used by compound_check() method */
  397. if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
  398. if (parse_num(line, &cpdwordmax, afflst)) {
  399. delete afflst;
  400. return 1;
  401. }
  402. }
  403. /* parse in the flag sign compounds in dictionary */
  404. if (strncmp(line,"COMPOUNDROOT",12) == 0) {
  405. if (parse_flag(line, &compoundroot, afflst)) {
  406. delete afflst;
  407. return 1;
  408. }
  409. }
  410. /* parse in the flag used by compound_check() method */
  411. if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
  412. if (parse_flag(line, &compoundpermitflag, afflst)) {
  413. delete afflst;
  414. return 1;
  415. }
  416. }
  417. /* parse in the flag used by compound_check() method */
  418. if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
  419. if (parse_flag(line, &compoundforbidflag, afflst)) {
  420. delete afflst;
  421. return 1;
  422. }
  423. }
  424. if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
  425. checkcompounddup = 1;
  426. }
  427. if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
  428. checkcompoundrep = 1;
  429. }
  430. if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
  431. checkcompoundtriple = 1;
  432. }
  433. if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
  434. simplifiedtriple = 1;
  435. }
  436. if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
  437. checkcompoundcase = 1;
  438. }
  439. if (strncmp(line,"NOSUGGEST",9) == 0) {
  440. if (parse_flag(line, &nosuggest, afflst)) {
  441. delete afflst;
  442. return 1;
  443. }
  444. }
  445. if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {
  446. if (parse_flag(line, &nongramsuggest, afflst)) {
  447. delete afflst;
  448. return 1;
  449. }
  450. }
  451. /* parse in the flag used by forbidden words */
  452. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  453. if (parse_flag(line, &forbiddenword, afflst)) {
  454. delete afflst;
  455. return 1;
  456. }
  457. }
  458. /* parse in the flag used by forbidden words */
  459. if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
  460. if (parse_flag(line, &lemma_present, afflst)) {
  461. delete afflst;
  462. return 1;
  463. }
  464. }
  465. /* parse in the flag used by circumfixes */
  466. if (strncmp(line,"CIRCUMFIX",9) == 0) {
  467. if (parse_flag(line, &circumfix, afflst)) {
  468. delete afflst;
  469. return 1;
  470. }
  471. }
  472. /* parse in the flag used by fogemorphemes */
  473. if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
  474. if (parse_flag(line, &onlyincompound, afflst)) {
  475. delete afflst;
  476. return 1;
  477. }
  478. }
  479. /* parse in the flag used by `needaffixs' */
  480. if (strncmp(line,"PSEUDOROOT",10) == 0) {
  481. if (parse_flag(line, &needaffix, afflst)) {
  482. delete afflst;
  483. return 1;
  484. }
  485. }
  486. /* parse in the flag used by `needaffixs' */
  487. if (strncmp(line,"NEEDAFFIX",9) == 0) {
  488. if (parse_flag(line, &needaffix, afflst)) {
  489. delete afflst;
  490. return 1;
  491. }
  492. }
  493. /* parse in the minimal length for words in compounds */
  494. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  495. if (parse_num(line, &cpdmin, afflst)) {
  496. delete afflst;
  497. return 1;
  498. }
  499. if (cpdmin < 1) cpdmin = 1;
  500. }
  501. /* parse in the max. words and syllables in compounds */
  502. if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
  503. if (parse_cpdsyllable(line, afflst)) {
  504. delete afflst;
  505. return 1;
  506. }
  507. }
  508. /* parse in the flag used by compound_check() method */
  509. if (strncmp(line,"SYLLABLENUM",11) == 0) {
  510. if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
  511. delete afflst;
  512. return 1;
  513. }
  514. }
  515. /* parse in the flag used by the controlled compound words */
  516. if (strncmp(line,"CHECKNUM",8) == 0) {
  517. checknum=1;
  518. }
  519. /* parse in the extra word characters */
  520. if (strncmp(line,"WORDCHARS",9) == 0) {
  521. if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
  522. delete afflst;
  523. return 1;
  524. }
  525. }
  526. /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
  527. if (strncmp(line,"IGNORE",6) == 0) {
  528. if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
  529. delete afflst;
  530. return 1;
  531. }
  532. }
  533. /* parse in the typical fault correcting table */
  534. if (strncmp(line,"REP",3) == 0) {
  535. if (parse_reptable(line, afflst)) {
  536. delete afflst;
  537. return 1;
  538. }
  539. }
  540. /* parse in the input conversion table */
  541. if (strncmp(line,"ICONV",5) == 0) {
  542. if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
  543. delete afflst;
  544. return 1;
  545. }
  546. }
  547. /* parse in the input conversion table */
  548. if (strncmp(line,"OCONV",5) == 0) {
  549. if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
  550. delete afflst;
  551. return 1;
  552. }
  553. }
  554. /* parse in the phonetic translation table */
  555. if (strncmp(line,"PHONE",5) == 0) {
  556. if (parse_phonetable(line, afflst)) {
  557. delete afflst;
  558. return 1;
  559. }
  560. }
  561. /* parse in the checkcompoundpattern table */
  562. if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
  563. if (parse_checkcpdtable(line, afflst)) {
  564. delete afflst;
  565. return 1;
  566. }
  567. }
  568. /* parse in the defcompound table */
  569. if (strncmp(line,"COMPOUNDRULE",12) == 0) {
  570. if (parse_defcpdtable(line, afflst)) {
  571. delete afflst;
  572. return 1;
  573. }
  574. }
  575. /* parse in the related character map table */
  576. if (strncmp(line,"MAP",3) == 0) {
  577. if (parse_maptable(line, afflst)) {
  578. delete afflst;
  579. return 1;
  580. }
  581. }
  582. /* parse in the word breakpoints table */
  583. if (strncmp(line,"BREAK",5) == 0) {
  584. if (parse_breaktable(line, afflst)) {
  585. delete afflst;
  586. return 1;
  587. }
  588. }
  589. /* parse in the language for language specific codes */
  590. if (strncmp(line,"LANG",4) == 0) {
  591. if (parse_string(line, &lang, afflst->getlinenum())) {
  592. delete afflst;
  593. return 1;
  594. }
  595. langnum = get_lang_num(lang);
  596. }
  597. if (strncmp(line,"VERSION",7) == 0) {
  598. for(line = line + 7; *line == ' ' || *line == '\t'; line++);
  599. version = mystrdup(line);
  600. }
  601. if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
  602. if (parse_num(line, &maxngramsugs, afflst)) {
  603. delete afflst;
  604. return 1;
  605. }
  606. }
  607. if (strncmp(line,"ONLYMAXDIFF", 11) == 0)
  608. onlymaxdiff = 1;
  609. if (strncmp(line,"MAXDIFF",7) == 0) {
  610. if (parse_num(line, &maxdiff, afflst)) {
  611. delete afflst;
  612. return 1;
  613. }
  614. }
  615. if (strncmp(line,"MAXCPDSUGS",10) == 0) {
  616. if (parse_num(line, &maxcpdsugs, afflst)) {
  617. delete afflst;
  618. return 1;
  619. }
  620. }
  621. if (strncmp(line,"NOSPLITSUGS",11) == 0) {
  622. nosplitsugs=1;
  623. }
  624. if (strncmp(line,"FULLSTRIP",9) == 0) {
  625. fullstrip=1;
  626. }
  627. if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
  628. sugswithdots=1;
  629. }
  630. /* parse in the flag used by forbidden words */
  631. if (strncmp(line,"KEEPCASE",8) == 0) {
  632. if (parse_flag(line, &keepcase, afflst)) {
  633. delete afflst;
  634. return 1;
  635. }
  636. }
  637. /* parse in the flag used by `forceucase' */
  638. if (strncmp(line,"FORCEUCASE",10) == 0) {
  639. if (parse_flag(line, &forceucase, afflst)) {
  640. delete afflst;
  641. return 1;
  642. }
  643. }
  644. /* parse in the flag used by `warn' */
  645. if (strncmp(line,"WARN",4) == 0) {
  646. if (parse_flag(line, &warn, afflst)) {
  647. delete afflst;
  648. return 1;
  649. }
  650. }
  651. if (strncmp(line,"FORBIDWARN",10) == 0) {
  652. forbidwarn=1;
  653. }
  654. /* parse in the flag used by the affix generator */
  655. if (strncmp(line,"SUBSTANDARD",11) == 0) {
  656. if (parse_flag(line, &substandard, afflst)) {
  657. delete afflst;
  658. return 1;
  659. }
  660. }
  661. if (strncmp(line,"CHECKSHARPS",11) == 0) {
  662. checksharps=1;
  663. }
  664. /* parse this affix: P - prefix, S - suffix */
  665. ft = ' ';
  666. if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
  667. if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
  668. if (ft != ' ') {
  669. if (dupflags_ini) {
  670. memset(dupflags, 0, sizeof(dupflags));
  671. dupflags_ini = 0;
  672. }
  673. if (parse_affix(line, ft, afflst, dupflags)) {
  674. delete afflst;
  675. process_pfx_tree_to_list();
  676. process_sfx_tree_to_list();
  677. return 1;
  678. }
  679. }
  680. }
  681. delete afflst;
  682. // convert affix trees to sorted list
  683. process_pfx_tree_to_list();
  684. process_sfx_tree_to_list();
  685. // now we can speed up performance greatly taking advantage of the
  686. // relationship between the affixes and the idea of "subsets".
  687. // View each prefix as a potential leading subset of another and view
  688. // each suffix (reversed) as a potential trailing subset of another.
  689. // To illustrate this relationship if we know the prefix "ab" is found in the
  690. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  691. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  692. // is a subset need be examined.
  693. // The same argument goes for suffix string that are reversed.
  694. // Then to top this off why not examine the first char of the word to quickly
  695. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  696. // be leading supersets of the first character of the word (if they exist)
  697. // To take advantage of this "subset" relationship, we need to add two links
  698. // from entry. One to take next if the current prefix is found (call it nexteq)
  699. // and one to take next if the current prefix is not found (call it nextne).
  700. // Since we have built ordered lists, all that remains is to properly initialize
  701. // the nextne and nexteq pointers that relate them
  702. process_pfx_order();
  703. process_sfx_order();
  704. /* get encoding for CHECKCOMPOUNDCASE */
  705. if (!utf8) {
  706. char * enc = get_encoding();
  707. csconv = get_current_cs(enc);
  708. free(enc);
  709. enc = NULL;
  710. char expw[MAXLNLEN];
  711. if (wordchars) {
  712. strcpy(expw, wordchars);
  713. free(wordchars);
  714. } else *expw = '\0';
  715. for (int i = 0; i <= 255; i++) {
  716. if ( (csconv[i].cupper != csconv[i].clower) &&
  717. (! strchr(expw, (char) i))) {
  718. *(expw + strlen(expw) + 1) = '\0';
  719. *(expw + strlen(expw)) = (char) i;
  720. }
  721. }
  722. wordchars = mystrdup(expw);
  723. }
  724. // default BREAK definition
  725. if (numbreak == -1) {
  726. breaktable = (char **) malloc(sizeof(char *) * 3);
  727. if (!breaktable) return 1;
  728. breaktable[0] = mystrdup("-");
  729. breaktable[1] = mystrdup("^-");
  730. breaktable[2] = mystrdup("-$");
  731. if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
  732. }
  733. return 0;
  734. }
  735. // we want to be able to quickly access prefix information
  736. // both by prefix flag, and sorted by prefix string itself
  737. // so we need to set up two indexes
  738. int AffixMgr::build_pfxtree(PfxEntry* pfxptr)
  739. {
  740. PfxEntry * ptr;
  741. PfxEntry * pptr;
  742. PfxEntry * ep = pfxptr;
  743. // get the right starting points
  744. const char * key = ep->getKey();
  745. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  746. // first index by flag which must exist
  747. ptr = pFlag[flg];
  748. ep->setFlgNxt(ptr);
  749. pFlag[flg] = ep;
  750. // handle the special case of null affix string
  751. if (strlen(key) == 0) {
  752. // always inset them at head of list at element 0
  753. ptr = pStart[0];
  754. ep->setNext(ptr);
  755. pStart[0] = ep;
  756. return 0;
  757. }
  758. // now handle the normal case
  759. ep->setNextEQ(NULL);
  760. ep->setNextNE(NULL);
  761. unsigned char sp = *((const unsigned char *)key);
  762. ptr = pStart[sp];
  763. // handle the first insert
  764. if (!ptr) {
  765. pStart[sp] = ep;
  766. return 0;
  767. }
  768. // otherwise use binary tree insertion so that a sorted
  769. // list can easily be generated later
  770. pptr = NULL;
  771. for (;;) {
  772. pptr = ptr;
  773. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  774. ptr = ptr->getNextEQ();
  775. if (!ptr) {
  776. pptr->setNextEQ(ep);
  777. break;
  778. }
  779. } else {
  780. ptr = ptr->getNextNE();
  781. if (!ptr) {
  782. pptr->setNextNE(ep);
  783. break;
  784. }
  785. }
  786. }
  787. return 0;
  788. }
  789. // we want to be able to quickly access suffix information
  790. // both by suffix flag, and sorted by the reverse of the
  791. // suffix string itself; so we need to set up two indexes
  792. int AffixMgr::build_sfxtree(SfxEntry* sfxptr)
  793. {
  794. SfxEntry * ptr;
  795. SfxEntry * pptr;
  796. SfxEntry * ep = sfxptr;
  797. /* get the right starting point */
  798. const char * key = ep->getKey();
  799. const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
  800. // first index by flag which must exist
  801. ptr = sFlag[flg];
  802. ep->setFlgNxt(ptr);
  803. sFlag[flg] = ep;
  804. // next index by affix string
  805. // handle the special case of null affix string
  806. if (strlen(key) == 0) {
  807. // always inset them at head of list at element 0
  808. ptr = sStart[0];
  809. ep->setNext(ptr);
  810. sStart[0] = ep;
  811. return 0;
  812. }
  813. // now handle the normal case
  814. ep->setNextEQ(NULL);
  815. ep->setNextNE(NULL);
  816. unsigned char sp = *((const unsigned char *)key);
  817. ptr = sStart[sp];
  818. // handle the first insert
  819. if (!ptr) {
  820. sStart[sp] = ep;
  821. return 0;
  822. }
  823. // otherwise use binary tree insertion so that a sorted
  824. // list can easily be generated later
  825. pptr = NULL;
  826. for (;;) {
  827. pptr = ptr;
  828. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  829. ptr = ptr->getNextEQ();
  830. if (!ptr) {
  831. pptr->setNextEQ(ep);
  832. break;
  833. }
  834. } else {
  835. ptr = ptr->getNextNE();
  836. if (!ptr) {
  837. pptr->setNextNE(ep);
  838. break;
  839. }
  840. }
  841. }
  842. return 0;
  843. }
  844. // convert from binary tree to sorted list
  845. int AffixMgr::process_pfx_tree_to_list()
  846. {
  847. for (int i=1; i< SETSIZE; i++) {
  848. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  849. }
  850. return 0;
  851. }
  852. PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)
  853. {
  854. if (ptr) {
  855. nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
  856. ptr->setNext(nptr);
  857. nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
  858. }
  859. return nptr;
  860. }
  861. // convert from binary tree to sorted list
  862. int AffixMgr:: process_sfx_tree_to_list()
  863. {
  864. for (int i=1; i< SETSIZE; i++) {
  865. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  866. }
  867. return 0;
  868. }
  869. SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)
  870. {
  871. if (ptr) {
  872. nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
  873. ptr->setNext(nptr);
  874. nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
  875. }
  876. return nptr;
  877. }
  878. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  879. // using the idea of leading subsets this time
  880. int AffixMgr::process_pfx_order()
  881. {
  882. PfxEntry* ptr;
  883. // loop through each prefix list starting point
  884. for (int i=1; i < SETSIZE; i++) {
  885. ptr = pStart[i];
  886. // look through the remainder of the list
  887. // and find next entry with affix that
  888. // the current one is not a subset of
  889. // mark that as destination for NextNE
  890. // use next in list that you are a subset
  891. // of as NextEQ
  892. for (; ptr != NULL; ptr = ptr->getNext()) {
  893. PfxEntry * nptr = ptr->getNext();
  894. for (; nptr != NULL; nptr = nptr->getNext()) {
  895. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  896. }
  897. ptr->setNextNE(nptr);
  898. ptr->setNextEQ(NULL);
  899. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  900. ptr->setNextEQ(ptr->getNext());
  901. }
  902. // now clean up by adding smart search termination strings:
  903. // if you are already a superset of the previous prefix
  904. // but not a subset of the next, search can end here
  905. // so set NextNE properly
  906. ptr = pStart[i];
  907. for (; ptr != NULL; ptr = ptr->getNext()) {
  908. PfxEntry * nptr = ptr->getNext();
  909. PfxEntry * mptr = NULL;
  910. for (; nptr != NULL; nptr = nptr->getNext()) {
  911. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  912. mptr = nptr;
  913. }
  914. if (mptr) mptr->setNextNE(NULL);
  915. }
  916. }
  917. return 0;
  918. }
  919. // initialize the SfxEntry links NextEQ and NextNE to speed searching
  920. // using the idea of leading subsets this time
  921. int AffixMgr::process_sfx_order()
  922. {
  923. SfxEntry* ptr;
  924. // loop through each prefix list starting point
  925. for (int i=1; i < SETSIZE; i++) {
  926. ptr = sStart[i];
  927. // look through the remainder of the list
  928. // and find next entry with affix that
  929. // the current one is not a subset of
  930. // mark that as destination for NextNE
  931. // use next in list that you are a subset
  932. // of as NextEQ
  933. for (; ptr != NULL; ptr = ptr->getNext()) {
  934. SfxEntry * nptr = ptr->getNext();
  935. for (; nptr != NULL; nptr = nptr->getNext()) {
  936. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  937. }
  938. ptr->setNextNE(nptr);
  939. ptr->setNextEQ(NULL);
  940. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  941. ptr->setNextEQ(ptr->getNext());
  942. }
  943. // now clean up by adding smart search termination strings:
  944. // if you are already a superset of the previous suffix
  945. // but not a subset of the next, search can end here
  946. // so set NextNE properly
  947. ptr = sStart[i];
  948. for (; ptr != NULL; ptr = ptr->getNext()) {
  949. SfxEntry * nptr = ptr->getNext();
  950. SfxEntry * mptr = NULL;
  951. for (; nptr != NULL; nptr = nptr->getNext()) {
  952. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  953. mptr = nptr;
  954. }
  955. if (mptr) mptr->setNextNE(NULL);
  956. }
  957. }
  958. return 0;
  959. }
  960. // add flags to the result for dictionary debugging
  961. void AffixMgr::debugflag(char * result, unsigned short flag) {
  962. char * st = encode_flag(flag);
  963. mystrcat(result, " ", MAXLNLEN);
  964. mystrcat(result, MORPH_FLAG, MAXLNLEN);
  965. if (st) {
  966. mystrcat(result, st, MAXLNLEN);
  967. free(st);
  968. }
  969. }
  970. // calculate the character length of the condition
  971. int AffixMgr::condlen(char * st)
  972. {
  973. int l = 0;
  974. bool group = false;
  975. for(; *st; st++) {
  976. if (*st == '[') {
  977. group = true;
  978. l++;
  979. } else if (*st == ']') group = false;
  980. else if (!group && (!utf8 ||
  981. (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
  982. }
  983. return l;
  984. }
  985. int AffixMgr::encodeit(affentry &entry, char * cs)
  986. {
  987. if (strcmp(cs,".") != 0) {
  988. entry.numconds = (char) condlen(cs);
  989. strncpy(entry.c.conds, cs, MAXCONDLEN);
  990. // long condition (end of conds padded by strncpy)
  991. if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
  992. entry.opts += aeLONGCOND;
  993. entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
  994. if (!entry.c.l.conds2) return 1;
  995. }
  996. } else {
  997. entry.numconds = 0;
  998. entry.c.conds[0] = '\0';
  999. }
  1000. return 0;
  1001. }
  1002. // return 1 if s1 is a leading subset of s2 (dots are for infixes)
  1003. inline int AffixMgr::isSubset(const char * s1, const char * s2)
  1004. {
  1005. while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
  1006. s1++;
  1007. s2++;
  1008. }
  1009. return (*s1 == '\0');
  1010. }
  1011. // check word for prefixes
  1012. struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
  1013. const FLAG needflag)
  1014. {
  1015. struct hentry * rv= NULL;
  1016. pfx = NULL;
  1017. pfxappnd = NULL;
  1018. sfxappnd = NULL;
  1019. // first handle the special case of 0 length prefixes
  1020. PfxEntry * pe = pStart[0];
  1021. while (pe) {
  1022. if (
  1023. // fogemorpheme
  1024. ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
  1025. (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
  1026. // permit prefixes in compounds
  1027. ((in_compound != IN_CPD_END) || (pe->getCont() &&
  1028. (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
  1029. ) {
  1030. // check prefix
  1031. rv = pe->checkword(word, len, in_compound, needflag);
  1032. if (rv) {
  1033. pfx=pe; // BUG: pfx not stateless
  1034. return rv;
  1035. }
  1036. }
  1037. pe = pe->getNext();
  1038. }
  1039. // now handle the general case
  1040. unsigned char sp = *((const unsigned char *)word);
  1041. PfxEntry * pptr = pStart[sp];
  1042. while (pptr) {
  1043. if (isSubset(pptr->getKey(),word)) {
  1044. if (
  1045. // fogemorpheme
  1046. ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
  1047. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
  1048. // permit prefixes in compounds
  1049. ((in_compound != IN_CPD_END) || (pptr->getCont() &&
  1050. (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
  1051. ) {
  1052. // check prefix
  1053. rv = pptr->checkword(word, len, in_compound, needflag);
  1054. if (rv) {
  1055. pfx=pptr; // BUG: pfx not stateless
  1056. return rv;
  1057. }
  1058. }
  1059. pptr = pptr->getNextEQ();
  1060. } else {
  1061. pptr = pptr->getNextNE();
  1062. }
  1063. }
  1064. return NULL;
  1065. }
  1066. // check word for prefixes
  1067. struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
  1068. char in_compound, const FLAG needflag)
  1069. {
  1070. struct hentry * rv= NULL;
  1071. pfx = NULL;
  1072. sfxappnd = NULL;
  1073. // first handle the special case of 0 length prefixes
  1074. PfxEntry * pe = pStart[0];
  1075. while (pe) {
  1076. rv = pe->check_twosfx(word, len, in_compound, needflag);
  1077. if (rv) return rv;
  1078. pe = pe->getNext();
  1079. }
  1080. // now handle the general case
  1081. unsigned char sp = *((const unsigned char *)word);
  1082. PfxEntry * pptr = pStart[sp];
  1083. while (pptr) {
  1084. if (isSubset(pptr->getKey(),word)) {
  1085. rv = pptr->check_twosfx(word, len, in_compound, needflag);
  1086. if (rv) {
  1087. pfx = pptr;
  1088. return rv;
  1089. }
  1090. pptr = pptr->getNextEQ();
  1091. } else {
  1092. pptr = pptr->getNextNE();
  1093. }
  1094. }
  1095. return NULL;
  1096. }
  1097. // check word for prefixes
  1098. char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
  1099. const FLAG needflag)
  1100. {
  1101. char * st;
  1102. char result[MAXLNLEN];
  1103. result[0] = '\0';
  1104. pfx = NULL;
  1105. sfxappnd = NULL;
  1106. // first handle the special case of 0 length prefixes
  1107. PfxEntry * pe = pStart[0];
  1108. while (pe) {
  1109. st = pe->check_morph(word,len,in_compound, needflag);
  1110. if (st) {
  1111. mystrcat(result, st, MAXLNLEN);
  1112. free(st);
  1113. }
  1114. // if (rv) return rv;
  1115. pe = pe->getNext();
  1116. }
  1117. // now handle the general case
  1118. unsigned char sp = *((const unsigned char *)word);
  1119. PfxEntry * pptr = pStart[sp];
  1120. while (pptr) {
  1121. if (isSubset(pptr->getKey(),word)) {
  1122. st = pptr->check_morph(word,len,in_compound, needflag);
  1123. if (st) {
  1124. // fogemorpheme
  1125. if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
  1126. (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
  1127. mystrcat(result, st, MAXLNLEN);
  1128. pfx = pptr;
  1129. }
  1130. free(st);
  1131. }
  1132. pptr = pptr->getNextEQ();
  1133. } else {
  1134. pptr = pptr->getNextNE();
  1135. }
  1136. }
  1137. if (*result) return mystrdup(result);
  1138. return NULL;
  1139. }
  1140. // check word for prefixes
  1141. char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
  1142. char in_compound, const FLAG needflag)
  1143. {
  1144. char * st;
  1145. char result[MAXLNLEN];
  1146. result[0] = '\0';
  1147. pfx = NULL;
  1148. sfxappnd = NULL;
  1149. // first handle the special case of 0 length prefixes
  1150. PfxEntry * pe = pStart[0];
  1151. while (pe) {
  1152. st = pe->check_twosfx_morph(word,len,in_compound, needflag);
  1153. if (st) {
  1154. mystrcat(result, st, MAXLNLEN);
  1155. free(st);
  1156. }
  1157. pe = pe->getNext();
  1158. }
  1159. // now handle the general case
  1160. unsigned char sp = *((const unsigned char *)word);
  1161. PfxEntry * pptr = pStart[sp];
  1162. while (pptr) {
  1163. if (isSubset(pptr->getKey(),word)) {
  1164. st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
  1165. if (st) {
  1166. mystrcat(result, st, MAXLNLEN);
  1167. free(st);
  1168. pfx = pptr;
  1169. }
  1170. pptr = pptr->getNextEQ();
  1171. } else {
  1172. pptr = pptr->getNextNE();
  1173. }
  1174. }
  1175. if (*result) return mystrdup(result);
  1176. return NULL;
  1177. }
  1178. // Is word a non compound with a REP substitution (see checkcompoundrep)?
  1179. int AffixMgr::cpdrep_check(const char * word, int wl)
  1180. {
  1181. char candidate[MAXLNLEN];
  1182. const char * r;
  1183. int lenr, lenp;
  1184. if ((wl < 2) || !numrep) return 0;
  1185. for (int i=0; i < numrep; i++ ) {
  1186. r = word;
  1187. lenr = strlen(reptable[i].pattern2);
  1188. lenp = strlen(reptable[i].pattern);
  1189. // search every occurence of the pattern in the word
  1190. while ((r=strstr(r, reptable[i].pattern)) != NULL) {
  1191. strcpy(candidate, word);
  1192. if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
  1193. strcpy(candidate+(r-word),reptable[i].pattern2);
  1194. strcpy(candidate+(r-word)+lenr, r+lenp);
  1195. if (candidate_check(candidate,strlen(candidate))) return 1;
  1196. r++; // search for the next letter
  1197. }
  1198. }
  1199. return 0;
  1200. }
  1201. // forbid compoundings when there are special patterns at word bound
  1202. int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char affixed)
  1203. {
  1204. int len;
  1205. for (int i = 0; i < numcheckcpd; i++) {
  1206. if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
  1207. (!r1 || !checkcpdtable[i].cond ||
  1208. (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
  1209. (!r2 || !checkcpdtable[i].cond2 ||
  1210. (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
  1211. // zero length pattern => only TESTAFF
  1212. // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
  1213. (!*(checkcpdtable[i].pattern) || (
  1214. (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
  1215. (*(checkcpdtable[i].pattern)!='0' && (len = strlen(checkcpdtable[i].pattern)) &&
  1216. strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {
  1217. return 1;
  1218. }
  1219. }
  1220. return 0;
  1221. }
  1222. // forbid compounding with neighbouring upper and lower case characters at word bounds
  1223. int AffixMgr::cpdcase_check(const char * word, int pos)
  1224. {
  1225. if (utf8) {
  1226. w_char u, w;
  1227. const char * p;
  1228. u8_u16(&u, 1, word + pos);
  1229. for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
  1230. u8_u16(&w, 1, p);
  1231. unsigned short a = (u.h << 8) + u.l;
  1232. unsigned short b = (w.h << 8) + w.l;
  1233. if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
  1234. (a != '-') && (b != '-')) return 1;
  1235. } else {
  1236. unsigned char a = *(word + pos - 1);
  1237. unsigned char b = *(word + pos);
  1238. if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  1239. }
  1240. return 0;
  1241. }
  1242. // check compound patterns
  1243. int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
  1244. {
  1245. signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  1246. signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  1247. int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
  1248. short bt = 0;
  1249. int i, j;
  1250. int ok;
  1251. int w = 0;
  1252. if (!*words) {
  1253. w = 1;
  1254. *words = def;
  1255. }
  1256. if (!*words) {
  1257. return 0;
  1258. }
  1259. (*words)[wnum] = rv;
  1260. // has the last word COMPOUNDRULE flag?
  1261. if (rv->alen == 0) {
  1262. (*words)[wnum] = NULL;
  1263. if (w) *words = NULL;
  1264. return 0;
  1265. }
  1266. ok = 0;
  1267. for (i = 0; i < numdefcpd; i++) {
  1268. for (j = 0; j < defcpdtable[i].len; j++) {
  1269. if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
  1270. TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1;
  1271. }
  1272. }
  1273. if (ok == 0) {
  1274. (*words)[wnum] = NULL;
  1275. if (w) *words = NULL;
  1276. return 0;
  1277. }
  1278. for (i = 0; i < numdefcpd; i++) {
  1279. signed short pp = 0; // pattern position
  1280. signed short wp = 0; // "words" position
  1281. int ok2;
  1282. ok = 1;
  1283. ok2 = 1;
  1284. do {
  1285. while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
  1286. if (((pp+1) < defcpdtable[i].len) &&
  1287. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
  1288. int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
  1289. ok2 = 1;
  1290. pp+=2;
  1291. btpp[bt] = pp;
  1292. btwp[bt] = wp;
  1293. while (wp <= wend) {
  1294. if (!(*words)[wp]->alen ||
  1295. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
  1296. ok2 = 0;
  1297. break;
  1298. }
  1299. wp++;
  1300. }
  1301. if (wp <= wnum) ok2 = 0;
  1302. btnum[bt] = wp - btwp[bt];
  1303. if (btnum[bt] > 0) bt++;
  1304. if (ok2) break;
  1305. } else {
  1306. ok2 = 1;
  1307. if (!(*words)[wp] || !(*words)[wp]->alen ||
  1308. !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
  1309. ok = 0;
  1310. break;
  1311. }
  1312. pp++;
  1313. wp++;
  1314. if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
  1315. }
  1316. }
  1317. if (ok && ok2) {
  1318. int r = pp;
  1319. while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
  1320. ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
  1321. if (defcpdtable[i].len <= r) return 1;
  1322. }
  1323. // backtrack
  1324. if (bt) do {
  1325. ok = 1;
  1326. btnum[bt - 1]--;
  1327. pp = btpp[bt - 1];
  1328. wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
  1329. } while ((btnum[bt - 1] < 0) && --bt);
  1330. } while (bt);
  1331. if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  1332. // check zero ending
  1333. while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
  1334. ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  1335. if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  1336. }
  1337. (*words)[wnum] = NULL;
  1338. if (w) *words = NULL;
  1339. return 0;
  1340. }
  1341. inline int AffixMgr::candidate_check(const char * word, int len)
  1342. {
  1343. struct hentry * rv=NULL;
  1344. rv = lookup(word);
  1345. if (rv) return 1;
  1346. // rv = prefix_check(word,len,1);
  1347. // if (rv) return 1;
  1348. rv = affix_check(word,len);
  1349. if (rv) return 1;
  1350. return 0;
  1351. }
  1352. // calculate number of syllable for compound-checking
  1353. short AffixMgr::get_syllable(const char * word, int wlen)
  1354. {
  1355. if (cpdmaxsyllable==0) return 0;
  1356. short num=0;
  1357. if (!utf8) {
  1358. for (int i=0; i<wlen; i++) {
  1359. if (strchr(cpdvowels, word[i])) num++;
  1360. }
  1361. } else if (cpdvowels_utf16) {
  1362. w_char w[MAXWORDUTF8LEN];
  1363. int i = u8_u16(w, MAXWORDUTF8LEN, word);
  1364. for (; i > 0; i--) {
  1365. if (flag_bsearch((unsigned short *) cpdvowels_utf16,
  1366. ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
  1367. }
  1368. }
  1369. return num;
  1370. }
  1371. void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
  1372. if (utf8) {
  1373. int i;
  1374. for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
  1375. for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
  1376. }
  1377. for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
  1378. for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
  1379. }
  1380. } else {
  1381. *cmin = cpdmin;
  1382. *cmax = len - cpdmin + 1;
  1383. }
  1384. }
  1385. // check if compound word is correctly spelled
  1386. // hu_mov_rule = spec. Hungarian rule (XXX)
  1387. struct hentry * AffixMgr::compound_check(const char * word, int len,
  1388. short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
  1389. char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)
  1390. {
  1391. int i;
  1392. short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  1393. struct hentry * rv = NULL;
  1394. struct hentry * rv_first;
  1395. struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
  1396. char st [MAXWORDUTF8LEN + 4];
  1397. char ch = '\0';
  1398. int cmin;
  1399. int cmax;
  1400. int striple = 0;
  1401. int scpd = 0;
  1402. int soldi = 0;
  1403. int oldcmin = 0;
  1404. int oldcmax = 0;
  1405. int oldlen = 0;
  1406. int checkedstriple = 0;
  1407. int onlycpdrule;
  1408. int affixed = 0;
  1409. hentry ** oldwords = words;
  1410. int checked_prefix;
  1411. setcminmax(&cmin, &cmax, word, len);
  1412. strcpy(st, word);
  1413. for (i = cmin; i < cmax; i++) {
  1414. // go to end of the UTF-8 character
  1415. if (utf8) {
  1416. for (; (st[i] & 0xc0) == 0x80; i++);
  1417. if (i >= cmax) return NULL;
  1418. }
  1419. words = oldwords;
  1420. onlycpdrule = (words) ? 1 : 0;
  1421. do { // onlycpdrule loop
  1422. oldnumsyllable = numsyllable;
  1423. oldwordnum = wordnum;
  1424. checked_prefix = 0;
  1425. do { // simplified checkcompoundpattern loop
  1426. if (scpd > 0) {
  1427. for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
  1428. strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);
  1429. if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
  1430. strcpy(st + i, checkcpdtable[scpd-1].pattern);
  1431. soldi = i;
  1432. i += strlen(checkcpdtable[scpd-1].pattern);
  1433. strcpy(st + i, checkcpdtable[scpd-1].pattern2);
  1434. strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));
  1435. oldlen = len;
  1436. len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
  1437. oldcmin = cmin;
  1438. oldcmax = cmax;
  1439. setcminmax(&cmin, &cmax, st, len);
  1440. cmax = len - cpdmin + 1;
  1441. }
  1442. ch = st[i];
  1443. st[i] = '\0';
  1444. sfx = NULL;
  1445. pfx = NULL;
  1446. // FIRST WORD
  1447. affixed = 1;
  1448. rv = lookup(st); // perhaps without prefix
  1449. // search homonym with compound flag
  1450. while ((rv) && !hu_mov_rule &&
  1451. ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
  1452. !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
  1453. (compoundbegin && !wordnum && !onlycpdrule &&
  1454. TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
  1455. (compoundmiddle && wordnum && !words && !onlycpdrule &&
  1456. TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
  1457. (numdefcpd && onlycpdrul…

Large files files are truncated, but you can click here to view the full file