PageRenderTime 47ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/PROJECTS_ROOT/SmartWires/SystemUtils/myspell/affixmgr.cxx

http://wiredplane-wintools.googlecode.com/
C++ | 1304 lines | 954 code | 194 blank | 156 comment | 254 complexity | 30ea2c2c4d96626dc75d4d895fb897dc MD5 | raw file
Possible License(s): GPL-2.0, Unlicense
  1. #include "license.readme"
  2. #include <cstdlib>
  3. #include <cstring>
  4. #include <cstdio>
  5. #include "affixmgr.hxx"
  6. #include "affentry.hxx"
  7. //using namespace std;
  8. // First some base level utility routines
  9. extern void mychomp(char * s);
  10. extern char * mystrdup(const char * s);
  11. extern char * myrevstrdup(const char * s);
  12. extern char * mystrsep(char ** sptr, const char delim);
  13. extern int isSubset(const char * s1, const char * s2);
  14. extern int isRevSubset(const char * s1, const char * end_of_s2, int len_s2);
  15. AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
  16. {
  17. // register hash manager and load affix data from aff file
  18. pHMgr = ptr;
  19. trystring = NULL;
  20. encoding=NULL;
  21. reptable = NULL;
  22. numrep = 0;
  23. maptable = NULL;
  24. nummap = 0;
  25. compound=NULL;
  26. nosplitsugs= (0==1);
  27. cpdmin = 3; // default value
  28. for (int i=0; i < SETSIZE; i++) {
  29. pStart[i] = NULL;
  30. sStart[i] = NULL;
  31. pFlag[i] = NULL;
  32. sFlag[i] = NULL;
  33. }
  34. if (parse_file(affpath)) {
  35. fprintf(stderr,"Failure loading aff file %s\n",affpath);
  36. fflush(stderr);
  37. }
  38. }
  39. AffixMgr::~AffixMgr()
  40. {
  41. // pass through linked prefix entries and clean up
  42. for (int i=0; i < SETSIZE ;i++) {
  43. pFlag[i] = NULL;
  44. PfxEntry * ptr = (PfxEntry *)pStart[i];
  45. PfxEntry * nptr = NULL;
  46. while (ptr) {
  47. nptr = ptr->getNext();
  48. delete(ptr);
  49. ptr = nptr;
  50. nptr = NULL;
  51. }
  52. }
  53. // pass through linked suffix entries and clean up
  54. for (int j=0; j < SETSIZE ; j++) {
  55. sFlag[j] = NULL;
  56. SfxEntry * ptr = (SfxEntry *)sStart[j];
  57. SfxEntry * nptr = NULL;
  58. while (ptr) {
  59. nptr = ptr->getNext();
  60. delete(ptr);
  61. ptr = nptr;
  62. nptr = NULL;
  63. }
  64. }
  65. if (trystring) free(trystring);
  66. trystring=NULL;
  67. if (encoding) free(encoding);
  68. encoding=NULL;
  69. if (maptable) {
  70. for (int j=0; j < nummap; j++) {
  71. free(maptable[j].set);
  72. maptable[j].set = NULL;
  73. maptable[j].len = 0;
  74. }
  75. free(maptable);
  76. maptable = NULL;
  77. }
  78. nummap = 0;
  79. if (reptable) {
  80. for (int j=0; j < numrep; j++) {
  81. free(reptable[j].pattern);
  82. free(reptable[j].replacement);
  83. reptable[j].pattern = NULL;
  84. reptable[j].replacement = NULL;
  85. }
  86. free(reptable);
  87. reptable = NULL;
  88. }
  89. numrep = 0;
  90. if (compound) free(compound);
  91. compound=NULL;
  92. pHMgr = NULL;
  93. cpdmin = 0;
  94. }
  95. // read in aff file and build up prefix and suffix entry objects
  96. int AffixMgr::parse_file(const char * affpath)
  97. {
  98. // io buffers
  99. char line[MAXLNLEN+1];
  100. // affix type
  101. char ft;
  102. // open the affix file
  103. FILE * afflst;
  104. afflst = fopen(affpath,"r");
  105. if (!afflst) {
  106. fprintf(stderr,"Error - could not open affix description file %s\n",affpath);
  107. return 1;
  108. }
  109. // step one is to parse the affix file building up the internal
  110. // affix data structures
  111. // read in each line ignoring any that do not
  112. // start with a known line type indicator
  113. while (fgets(line,MAXLNLEN,afflst)) {
  114. mychomp(line);
  115. /* parse in the try string */
  116. if (strncmp(line,"TRY",3) == 0) {
  117. if (parse_try(line)) {
  118. return 1;
  119. }
  120. }
  121. /* parse in the name of the character set used by the .dict and .aff */
  122. if (strncmp(line,"SET",3) == 0) {
  123. if (parse_set(line)) {
  124. return 1;
  125. }
  126. }
  127. /* parse in the flag used by the controlled compound words */
  128. if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
  129. if (parse_cpdflag(line)) {
  130. return 1;
  131. }
  132. }
  133. /* parse in the flag used by the controlled compound words */
  134. if (strncmp(line,"COMPOUNDMIN",11) == 0) {
  135. if (parse_cpdmin(line)) {
  136. return 1;
  137. }
  138. }
  139. /* parse in the typical fault correcting table */
  140. if (strncmp(line,"REP",3) == 0) {
  141. if (parse_reptable(line, afflst)) {
  142. return 1;
  143. }
  144. }
  145. /* parse in the related character map table */
  146. if (strncmp(line,"MAP",3) == 0) {
  147. if (parse_maptable(line, afflst)) {
  148. return 1;
  149. }
  150. }
  151. // parse this affix: P - prefix, S - suffix
  152. ft = ' ';
  153. if (strncmp(line,"PFX",3) == 0) ft = 'P';
  154. if (strncmp(line,"SFX",3) == 0) ft = 'S';
  155. if (ft != ' ') {
  156. if (parse_affix(line, ft, afflst)) {
  157. return 1;
  158. }
  159. }
  160. // handle NOSPLITSUGS
  161. if (strncmp(line,"NOSPLITSUGS",11) == 0)
  162. nosplitsugs=(0==0);
  163. }
  164. fclose(afflst);
  165. // convert affix trees to sorted list
  166. process_pfx_tree_to_list();
  167. process_sfx_tree_to_list();
  168. // now we can speed up performance greatly taking advantage of the
  169. // relationship between the affixes and the idea of "subsets".
  170. // View each prefix as a potential leading subset of another and view
  171. // each suffix (reversed) as a potential trailing subset of another.
  172. // To illustrate this relationship if we know the prefix "ab" is found in the
  173. // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  174. // Furthermore is "ab" is not present then none of the prefixes that "ab" is
  175. // is a subset need be examined.
  176. // The same argument goes for suffix string that are reversed.
  177. // Then to top this off why not examine the first char of the word to quickly
  178. // limit the set of prefixes to examine (i.e. the prefixes to examine must
  179. // be leading supersets of the first character of the word (if they exist)
  180. // To take advantage of this "subset" relationship, we need to add two links
  181. // from entry. One to take next if the current prefix is found (call it nexteq)
  182. // and one to take next if the current prefix is not found (call it nextne).
  183. // Since we have built ordered lists, all that remains is to properly intialize
  184. // the nextne and nexteq pointers that relate them
  185. process_pfx_order();
  186. process_sfx_order();
  187. return 0;
  188. }
  189. // we want to be able to quickly access prefix information
  190. // both by prefix flag, and sorted by prefix string itself
  191. // so we need to set up two indexes
  192. int AffixMgr::build_pfxtree(AffEntry* pfxptr)
  193. {
  194. PfxEntry * ptr;
  195. PfxEntry * pptr;
  196. PfxEntry * ep = (PfxEntry*) pfxptr;
  197. // get the right starting points
  198. const char * key = ep->getKey();
  199. const unsigned char flg = ep->getFlag();
  200. // first index by flag which must exist
  201. ptr = (PfxEntry*)pFlag[flg];
  202. ep->setFlgNxt(ptr);
  203. pFlag[flg] = (AffEntry *) ep;
  204. // handle the special case of null affix string
  205. if (strlen(key) == 0) {
  206. // always inset them at head of list at element 0
  207. ptr = (PfxEntry*)pStart[0];
  208. ep->setNext(ptr);
  209. pStart[0] = (AffEntry*)ep;
  210. return 0;
  211. }
  212. // now handle the normal case
  213. ep->setNextEQ(NULL);
  214. ep->setNextNE(NULL);
  215. unsigned char sp = *((const unsigned char *)key);
  216. ptr = (PfxEntry*)pStart[sp];
  217. // handle the first insert
  218. if (!ptr) {
  219. pStart[sp] = (AffEntry*)ep;
  220. return 0;
  221. }
  222. // otherwise use binary tree insertion so that a sorted
  223. // list can easily be generated later
  224. pptr = NULL;
  225. for (;;) {
  226. pptr = ptr;
  227. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  228. ptr = ptr->getNextEQ();
  229. if (!ptr) {
  230. pptr->setNextEQ(ep);
  231. break;
  232. }
  233. } else {
  234. ptr = ptr->getNextNE();
  235. if (!ptr) {
  236. pptr->setNextNE(ep);
  237. break;
  238. }
  239. }
  240. }
  241. return 0;
  242. }
  243. // we want to be able to quickly access suffix information
  244. // both by suffix flag, and sorted by the reverse of the
  245. // suffix string itself; so we need to set up two indexes
  246. int AffixMgr::build_sfxtree(AffEntry* sfxptr)
  247. {
  248. SfxEntry * ptr;
  249. SfxEntry * pptr;
  250. SfxEntry * ep = (SfxEntry *) sfxptr;
  251. /* get the right starting point */
  252. const char * key = ep->getKey();
  253. const unsigned char flg = ep->getFlag();
  254. // first index by flag which must exist
  255. ptr = (SfxEntry*)sFlag[flg];
  256. ep->setFlgNxt(ptr);
  257. sFlag[flg] = (AffEntry *) ep;
  258. // next index by affix string
  259. // handle the special case of null affix string
  260. if (strlen(key) == 0) {
  261. // always inset them at head of list at element 0
  262. ptr = (SfxEntry*)sStart[0];
  263. ep->setNext(ptr);
  264. sStart[0] = (AffEntry*)ep;
  265. return 0;
  266. }
  267. // now handle the normal case
  268. ep->setNextEQ(NULL);
  269. ep->setNextNE(NULL);
  270. unsigned char sp = *((const unsigned char *)key);
  271. ptr = (SfxEntry*)sStart[sp];
  272. // handle the first insert
  273. if (!ptr) {
  274. sStart[sp] = (AffEntry*)ep;
  275. return 0;
  276. }
  277. // otherwise use binary tree insertion so that a sorted
  278. // list can easily be generated later
  279. pptr = NULL;
  280. for (;;) {
  281. pptr = ptr;
  282. if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
  283. ptr = ptr->getNextEQ();
  284. if (!ptr) {
  285. pptr->setNextEQ(ep);
  286. break;
  287. }
  288. } else {
  289. ptr = ptr->getNextNE();
  290. if (!ptr) {
  291. pptr->setNextNE(ep);
  292. break;
  293. }
  294. }
  295. }
  296. return 0;
  297. }
  298. // convert from binary tree to sorted list
  299. int AffixMgr::process_pfx_tree_to_list()
  300. {
  301. for (int i=1; i< SETSIZE; i++) {
  302. pStart[i] = process_pfx_in_order(pStart[i],NULL);
  303. }
  304. return 0;
  305. }
  306. AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
  307. {
  308. if (ptr) {
  309. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
  310. ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
  311. nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
  312. }
  313. return nptr;
  314. }
  315. // convert from binary tree to sorted list
  316. int AffixMgr:: process_sfx_tree_to_list()
  317. {
  318. for (int i=1; i< SETSIZE; i++) {
  319. sStart[i] = process_sfx_in_order(sStart[i],NULL);
  320. }
  321. return 0;
  322. }
  323. AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
  324. {
  325. if (ptr) {
  326. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
  327. ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
  328. nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
  329. }
  330. return nptr;
  331. }
  332. // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
  333. // using the idea of leading subsets this time
  334. int AffixMgr::process_pfx_order()
  335. {
  336. PfxEntry* ptr;
  337. // loop through each prefix list starting point
  338. for (int i=1; i < SETSIZE; i++) {
  339. ptr = (PfxEntry*)pStart[i];
  340. // look through the remainder of the list
  341. // and find next entry with affix that
  342. // the current one is not a subset of
  343. // mark that as destination for NextNE
  344. // use next in list that you are a subset
  345. // of as NextEQ
  346. for (; ptr != NULL; ptr = ptr->getNext()) {
  347. PfxEntry * nptr = ptr->getNext();
  348. for (; nptr != NULL; nptr = nptr->getNext()) {
  349. if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
  350. }
  351. ptr->setNextNE(nptr);
  352. ptr->setNextEQ(NULL);
  353. if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
  354. ptr->setNextEQ(ptr->getNext());
  355. }
  356. // now clean up by adding smart search termination strings:
  357. // if you are already a superset of the previous prefix
  358. // but not a subset of the next, search can end here
  359. // so set NextNE properly
  360. ptr = (PfxEntry *) pStart[i];
  361. for (; ptr != NULL; ptr = ptr->getNext()) {
  362. PfxEntry * nptr = ptr->getNext();
  363. PfxEntry * mptr = NULL;
  364. for (; nptr != NULL; nptr = nptr->getNext()) {
  365. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  366. mptr = nptr;
  367. }
  368. if (mptr) mptr->setNextNE(NULL);
  369. }
  370. }
  371. return 0;
  372. }
  373. // reinitialize the SfxEntry links NextEQ and NextNE to speed searching
  374. // using the idea of leading subsets this time
  375. int AffixMgr::process_sfx_order()
  376. {
  377. SfxEntry* ptr;
  378. // loop through each prefix list starting point
  379. for (int i=1; i < SETSIZE; i++) {
  380. ptr = (SfxEntry *) sStart[i];
  381. // look through the remainder of the list
  382. // and find next entry with affix that
  383. // the current one is not a subset of
  384. // mark that as destination for NextNE
  385. // use next in list that you are a subset
  386. // of as NextEQ
  387. for (; ptr != NULL; ptr = ptr->getNext()) {
  388. SfxEntry * nptr = ptr->getNext();
  389. for (; nptr != NULL; nptr = nptr->getNext()) {
  390. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  391. }
  392. ptr->setNextNE(nptr);
  393. ptr->setNextEQ(NULL);
  394. if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
  395. ptr->setNextEQ(ptr->getNext());
  396. }
  397. // now clean up by adding smart search termination strings:
  398. // if you are already a superset of the previous suffix
  399. // but not a subset of the next, search can end here
  400. // so set NextNE properly
  401. ptr = (SfxEntry *) sStart[i];
  402. for (; ptr != NULL; ptr = ptr->getNext()) {
  403. SfxEntry * nptr = ptr->getNext();
  404. SfxEntry * mptr = NULL;
  405. for (; nptr != NULL; nptr = nptr->getNext()) {
  406. if (! isSubset(ptr->getKey(),nptr->getKey())) break;
  407. mptr = nptr;
  408. }
  409. if (mptr) mptr->setNextNE(NULL);
  410. }
  411. }
  412. return 0;
  413. }
  414. // takes aff file condition string and creates the
  415. // conds array - please see the appendix at the end of the
  416. // file affentry.cxx which describes what is going on here
  417. // in much more detail
  418. void AffixMgr::encodeit(struct affentry * ptr, char * cs)
  419. {
  420. unsigned char c;
  421. int i, j, k;
  422. unsigned char mbr[MAXLNLEN];
  423. // now clear the conditions array */
  424. for (i=0;i<SETSIZE;i++) ptr->conds[i] = (unsigned char) 0;
  425. // now parse the string to create the conds array */
  426. int nc = strlen(cs);
  427. int neg = 0; // complement indicator
  428. int grp = 0; // group indicator
  429. int n = 0; // number of conditions
  430. int ec = 0; // end condition indicator
  431. int nm = 0; // number of member in group
  432. // if no condition just return
  433. if (strcmp(cs,".")==0) {
  434. ptr->numconds = 0;
  435. return;
  436. }
  437. i = 0;
  438. while (i < nc) {
  439. c = *((unsigned char *)(cs + i));
  440. // start group indicator
  441. if (c == '[') {
  442. grp = 1;
  443. c = 0;
  444. }
  445. // complement flag
  446. if ((grp == 1) && (c == '^')) {
  447. neg = 1;
  448. c = 0;
  449. }
  450. // end goup indicator
  451. if (c == ']') {
  452. ec = 1;
  453. c = 0;
  454. }
  455. // add character of group to list
  456. if ((grp == 1) && (c != 0)) {
  457. *(mbr + nm) = c;
  458. nm++;
  459. c = 0;
  460. }
  461. // end of condition
  462. if (c != 0) {
  463. ec = 1;
  464. }
  465. if (ec) {
  466. if (grp == 1) {
  467. if (neg == 0) {
  468. // set the proper bits in the condition array vals for those chars
  469. for (j=0;j<nm;j++) {
  470. k = (unsigned int) mbr[j];
  471. ptr->conds[k] = ptr->conds[k] | (1 << n);
  472. }
  473. } else {
  474. // complement so set all of them and then unset indicated ones
  475. for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
  476. for (j=0;j<nm;j++) {
  477. k = (unsigned int) mbr[j];
  478. ptr->conds[k] = ptr->conds[k] & ~(1 << n);
  479. }
  480. }
  481. neg = 0;
  482. grp = 0;
  483. nm = 0;
  484. } else {
  485. // not a group so just set the proper bit for this char
  486. // but first handle special case of . inside condition
  487. if (c == '.') {
  488. // wild card character so set them all
  489. for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
  490. } else {
  491. ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n);
  492. }
  493. }
  494. n++;
  495. ec = 0;
  496. }
  497. i++;
  498. }
  499. ptr->numconds = n;
  500. return;
  501. }
  502. // check word for prefixes
  503. struct hentry * AffixMgr::prefix_check (const char * word, int len)
  504. {
  505. struct hentry * rv= NULL;
  506. // first handle the special case of 0 length prefixes
  507. PfxEntry * pe = (PfxEntry *) pStart[0];
  508. while (pe) {
  509. rv = pe->check(word,len);
  510. if (rv) return rv;
  511. pe = pe->getNext();
  512. }
  513. // now handle the general case
  514. unsigned char sp = *((const unsigned char *)word);
  515. PfxEntry * pptr = (PfxEntry *)pStart[sp];
  516. while (pptr) {
  517. if (isSubset(pptr->getKey(),word)) {
  518. rv = pptr->check(word,len);
  519. if (rv) return rv;
  520. pptr = pptr->getNextEQ();
  521. } else {
  522. pptr = pptr->getNextNE();
  523. }
  524. }
  525. return NULL;
  526. }
  527. // check if compound word is correctly spelled
  528. struct hentry * AffixMgr::compound_check (const char * word, int len, char compound_flag)
  529. {
  530. int i;
  531. struct hentry * rv= NULL;
  532. char * st;
  533. char ch;
  534. // handle case of string too short to be a piece of a compound word
  535. if (len < cpdmin) return NULL;
  536. st = mystrdup(word);
  537. for (i=cpdmin; i < (len - (cpdmin-1)); i++) {
  538. ch = st[i];
  539. st[i] = '\0';
  540. rv = lookup(st);
  541. if (!rv) rv = affix_check(st,i);
  542. if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
  543. rv = lookup((word+i));
  544. if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
  545. free(st);
  546. return rv;
  547. }
  548. rv = affix_check((word+i),strlen(word+i));
  549. if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
  550. free(st);
  551. return rv;
  552. }
  553. rv = compound_check((word+i),strlen(word+i),compound_flag);
  554. if (rv) {
  555. free(st);
  556. return rv;
  557. }
  558. }
  559. st[i] = ch;
  560. }
  561. free(st);
  562. return NULL;
  563. }
  564. // check word for suffixes
  565. struct hentry * AffixMgr::suffix_check (const char * word, int len,
  566. int sfxopts, AffEntry * ppfx)
  567. {
  568. struct hentry * rv = NULL;
  569. // first handle the special case of 0 length suffixes
  570. SfxEntry * se = (SfxEntry *) sStart[0];
  571. while (se) {
  572. rv = se->check(word,len, sfxopts, ppfx);
  573. if (rv) return rv;
  574. se = se->getNext();
  575. }
  576. // now handle the general case
  577. unsigned char sp = *((const unsigned char *)(word + len - 1));
  578. SfxEntry * sptr = (SfxEntry *) sStart[sp];
  579. while (sptr) {
  580. if (isRevSubset(sptr->getKey(),(word+len-1), len)) {
  581. rv = sptr->check(word,len, sfxopts, ppfx);
  582. if (rv) {
  583. return rv;
  584. }
  585. sptr = sptr->getNextEQ();
  586. } else {
  587. sptr = sptr->getNextNE();
  588. }
  589. }
  590. return NULL;
  591. }
  592. // check if word with affixes is correctly spelled
  593. struct hentry * AffixMgr::affix_check (const char * word, int len)
  594. {
  595. struct hentry * rv= NULL;
  596. // check all prefixes (also crossed with suffixes if allowed)
  597. rv = prefix_check(word, len);
  598. if (rv) return rv;
  599. // if still not found check all suffixes
  600. rv = suffix_check(word, len, 0, NULL);
  601. return rv;
  602. }
  603. int AffixMgr::expand_rootword(struct guessword * wlst, int maxn,
  604. const char * ts, int wl, const char * ap, int al)
  605. {
  606. int nh=0;
  607. // first add root word to list
  608. if (nh < maxn) {
  609. wlst[nh].word = mystrdup(ts);
  610. wlst[nh].allow = (1 == 0);
  611. nh++;
  612. }
  613. // handle suffixes
  614. for (int i = 0; i < al; i++) {
  615. unsigned char c = (unsigned char) ap[i];
  616. SfxEntry * sptr = (SfxEntry *)sFlag[c];
  617. while (sptr) {
  618. char * newword = sptr->add(ts, wl);
  619. if (newword) {
  620. if (nh < maxn) {
  621. wlst[nh].word = newword;
  622. wlst[nh].allow = sptr->allowCross();
  623. nh++;
  624. } else {
  625. free(newword);
  626. }
  627. }
  628. sptr = (SfxEntry *)sptr ->getFlgNxt();
  629. }
  630. }
  631. int n = nh;
  632. // handle cross products of prefixes and suffixes
  633. for (int j=1;j<n ;j++)
  634. if (wlst[j].allow) {
  635. for (int k = 0; k < al; k++) {
  636. unsigned char c = (unsigned char) ap[k];
  637. PfxEntry * cptr = (PfxEntry *) pFlag[c];
  638. while (cptr) {
  639. if (cptr->allowCross()) {
  640. int l1 = strlen(wlst[j].word);
  641. char * newword = cptr->add(wlst[j].word, l1);
  642. if (newword) {
  643. if (nh < maxn) {
  644. wlst[nh].word = newword;
  645. wlst[nh].allow = cptr->allowCross();
  646. nh++;
  647. } else {
  648. free(newword);
  649. }
  650. }
  651. }
  652. cptr = (PfxEntry *)cptr ->getFlgNxt();
  653. }
  654. }
  655. }
  656. // now handle pure prefixes
  657. for (int m = 0; m < al; m ++) {
  658. unsigned char c = (unsigned char) ap[m];
  659. PfxEntry * ptr = (PfxEntry *) pFlag[c];
  660. while (ptr) {
  661. char * newword = ptr->add(ts, wl);
  662. if (newword) {
  663. if (nh < maxn) {
  664. wlst[nh].word = newword;
  665. wlst[nh].allow = ptr->allowCross();
  666. nh++;
  667. } else {
  668. free(newword);
  669. }
  670. }
  671. ptr = (PfxEntry *)ptr ->getFlgNxt();
  672. }
  673. }
  674. return nh;
  675. }
  676. // return length of replacing table
  677. int AffixMgr::get_numrep()
  678. {
  679. return numrep;
  680. }
  681. // return replacing table
  682. struct replentry * AffixMgr::get_reptable()
  683. {
  684. if (! reptable ) return NULL;
  685. return reptable;
  686. }
  687. // return length of character map table
  688. int AffixMgr::get_nummap()
  689. {
  690. return nummap;
  691. }
  692. // return character map table
  693. struct mapentry * AffixMgr::get_maptable()
  694. {
  695. if (! maptable ) return NULL;
  696. return maptable;
  697. }
  698. // return text encoding of dictionary
  699. char * AffixMgr::get_encoding()
  700. {
  701. if (! encoding ) {
  702. encoding = mystrdup("ISO8859-1");
  703. }
  704. return mystrdup(encoding);
  705. }
  706. // return the preferred try string for suggestions
  707. char * AffixMgr::get_try_string()
  708. {
  709. if (! trystring ) return NULL;
  710. return mystrdup(trystring);
  711. }
  712. // return the compound words control flag
  713. char * AffixMgr::get_compound()
  714. {
  715. if (! compound ) return NULL;
  716. return mystrdup(compound);
  717. }
  718. // utility method to look up root words in hash table
  719. struct hentry * AffixMgr::lookup(const char * word)
  720. {
  721. if (! pHMgr) return NULL;
  722. return pHMgr->lookup(word);
  723. }
  724. // return nosplitsugs
  725. bool AffixMgr::get_nosplitsugs(void)
  726. {
  727. return nosplitsugs;
  728. }
  729. /* parse in the try string */
  730. int AffixMgr::parse_try(char * line)
  731. {
  732. if (trystring) {
  733. fprintf(stderr,"error: duplicate TRY strings\n");
  734. return 1;
  735. }
  736. char * tp = line;
  737. char * piece;
  738. int i = 0;
  739. int np = 0;
  740. while ((piece=mystrsep(&tp,' '))) {
  741. if (*piece != '\0') {
  742. switch(i) {
  743. case 0: { np++; break; }
  744. case 1: { trystring = mystrdup(piece); np++; break; }
  745. default: break;
  746. }
  747. i++;
  748. }
  749. free(piece);
  750. }
  751. if (np != 2) {
  752. fprintf(stderr,"error: missing TRY information\n");
  753. return 1;
  754. }
  755. return 0;
  756. }
  757. /* parse in the name of the character set used by the .dict and .aff */
  758. int AffixMgr::parse_set(char * line)
  759. {
  760. if (encoding) {
  761. fprintf(stderr,"error: duplicate SET strings\n");
  762. return 1;
  763. }
  764. char * tp = line;
  765. char * piece;
  766. int i = 0;
  767. int np = 0;
  768. while ((piece=mystrsep(&tp,' '))) {
  769. if (*piece != '\0') {
  770. switch(i) {
  771. case 0: { np++; break; }
  772. case 1: { encoding = mystrdup(piece); np++; break; }
  773. default: break;
  774. }
  775. i++;
  776. }
  777. free(piece);
  778. }
  779. if (np != 2) {
  780. fprintf(stderr,"error: missing SET information\n");
  781. return 1;
  782. }
  783. return 0;
  784. }
  785. /* parse in the flag used by the controlled compound words */
  786. int AffixMgr::parse_cpdflag(char * line)
  787. {
  788. if (compound) {
  789. fprintf(stderr,"error: duplicate compound flags used\n");
  790. return 1;
  791. }
  792. char * tp = line;
  793. char * piece;
  794. int i = 0;
  795. int np = 0;
  796. while ((piece=mystrsep(&tp,' '))) {
  797. if (*piece != '\0') {
  798. switch(i) {
  799. case 0: { np++; break; }
  800. case 1: { compound = mystrdup(piece); np++; break; }
  801. default: break;
  802. }
  803. i++;
  804. }
  805. free(piece);
  806. }
  807. if (np != 2) {
  808. fprintf(stderr,"error: missing compound flag information\n");
  809. return 1;
  810. }
  811. return 0;
  812. }
  813. /* parse in the min compound word length */
  814. int AffixMgr::parse_cpdmin(char * line)
  815. {
  816. char * tp = line;
  817. char * piece;
  818. int i = 0;
  819. int np = 0;
  820. while ((piece=mystrsep(&tp,' '))) {
  821. if (*piece != '\0') {
  822. switch(i) {
  823. case 0: { np++; break; }
  824. case 1: { cpdmin = atoi(piece); np++; break; }
  825. default: break;
  826. }
  827. i++;
  828. }
  829. free(piece);
  830. }
  831. if (np != 2) {
  832. fprintf(stderr,"error: missing compound min information\n");
  833. return 1;
  834. }
  835. if ((cpdmin < 1) || (cpdmin > 50)) cpdmin = 3;
  836. return 0;
  837. }
  838. /* parse in the typical fault correcting table */
  839. int AffixMgr::parse_reptable(char * line, FILE * af)
  840. {
  841. if (numrep != 0) {
  842. fprintf(stderr,"error: duplicate REP tables used\n");
  843. return 1;
  844. }
  845. char * tp = line;
  846. char * piece;
  847. int i = 0;
  848. int np = 0;
  849. while ((piece=mystrsep(&tp,' '))) {
  850. if (*piece != '\0') {
  851. switch(i) {
  852. case 0: { np++; break; }
  853. case 1: {
  854. numrep = atoi(piece);
  855. if (numrep < 1) {
  856. fprintf(stderr,"incorrect number of entries in replacement table\n");
  857. free(piece);
  858. return 1;
  859. }
  860. reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
  861. np++;
  862. break;
  863. }
  864. default: break;
  865. }
  866. i++;
  867. }
  868. free(piece);
  869. }
  870. if (np != 2) {
  871. fprintf(stderr,"error: missing replacement table information\n");
  872. return 1;
  873. }
  874. /* now parse the numrep lines to read in the remainder of the table */
  875. char * nl = line;
  876. for (int j=0; j < numrep; j++) {
  877. fgets(nl,MAXLNLEN,af);
  878. mychomp(nl);
  879. tp = nl;
  880. i = 0;
  881. reptable[j].pattern = NULL;
  882. reptable[j].replacement = NULL;
  883. while ((piece=mystrsep(&tp,' '))) {
  884. if (*piece != '\0') {
  885. switch(i) {
  886. case 0: {
  887. if (strncmp(piece,"REP",3) != 0) {
  888. fprintf(stderr,"error: replacement table is corrupt\n");
  889. free(piece);
  890. return 1;
  891. }
  892. break;
  893. }
  894. case 1: { reptable[j].pattern = mystrdup(piece); break; }
  895. case 2: { reptable[j].replacement = mystrdup(piece); break; }
  896. default: break;
  897. }
  898. i++;
  899. }
  900. free(piece);
  901. }
  902. if ((!(reptable[j].pattern)) || (!(reptable[j].replacement))) {
  903. fprintf(stderr,"error: replacement table is corrupt\n");
  904. return 1;
  905. }
  906. }
  907. return 0;
  908. }
  909. /* parse in the character map table */
  910. int AffixMgr::parse_maptable(char * line, FILE * af)
  911. {
  912. if (nummap != 0) {
  913. fprintf(stderr,"error: duplicate MAP tables used\n");
  914. return 1;
  915. }
  916. char * tp = line;
  917. char * piece;
  918. int i = 0;
  919. int np = 0;
  920. while ((piece=mystrsep(&tp,' '))) {
  921. if (*piece != '\0') {
  922. switch(i) {
  923. case 0: { np++; break; }
  924. case 1: {
  925. nummap = atoi(piece);
  926. if (nummap < 1) {
  927. fprintf(stderr,"incorrect number of entries in map table\n");
  928. free(piece);
  929. return 1;
  930. }
  931. maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
  932. np++;
  933. break;
  934. }
  935. default: break;
  936. }
  937. i++;
  938. }
  939. free(piece);
  940. }
  941. if (np != 2) {
  942. fprintf(stderr,"error: missing map table information\n");
  943. return 1;
  944. }
  945. /* now parse the nummap lines to read in the remainder of the table */
  946. char * nl = line;
  947. for (int j=0; j < nummap; j++) {
  948. fgets(nl,MAXLNLEN,af);
  949. mychomp(nl);
  950. tp = nl;
  951. i = 0;
  952. maptable[j].set = NULL;
  953. maptable[j].len = 0;
  954. while ((piece=mystrsep(&tp,' '))) {
  955. if (*piece != '\0') {
  956. switch(i) {
  957. case 0: {
  958. if (strncmp(piece,"MAP",3) != 0) {
  959. fprintf(stderr,"error: map table is corrupt\n");
  960. free(piece);
  961. return 1;
  962. }
  963. break;
  964. }
  965. case 1: { maptable[j].set = mystrdup(piece);
  966. maptable[j].len = strlen(maptable[j].set);
  967. break; }
  968. default: break;
  969. }
  970. i++;
  971. }
  972. free(piece);
  973. }
  974. if ((!(maptable[j].set)) || (!(maptable[j].len))) {
  975. fprintf(stderr,"error: map table is corrupt\n");
  976. return 1;
  977. }
  978. }
  979. return 0;
  980. }
  981. int AffixMgr::parse_affix(char * line, const char at, FILE * af)
  982. {
  983. int numents = 0; // number of affentry structures to parse
  984. char achar='\0'; // affix char identifier
  985. short ff=0;
  986. struct affentry * ptr= NULL;
  987. struct affentry * nptr= NULL;
  988. char * tp = line;
  989. char * nl = line;
  990. char * piece;
  991. int i = 0;
  992. // split affix header line into pieces
  993. int np = 0;
  994. while ((piece=mystrsep(&tp,' '))) {
  995. if (*piece != '\0') {
  996. switch(i) {
  997. // piece 1 - is type of affix
  998. case 0: { np++; break; }
  999. // piece 2 - is affix char
  1000. case 1: { np++; achar = *piece; break; }
  1001. // piece 3 - is cross product indicator
  1002. case 2: { np++; if (*piece == 'Y') ff = XPRODUCT; break; }
  1003. // piece 4 - is number of affentries
  1004. case 3: {
  1005. np++;
  1006. numents = atoi(piece);
  1007. ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
  1008. ptr->xpflg = ff;
  1009. ptr->achar = achar;
  1010. break;
  1011. }
  1012. default: break;
  1013. }
  1014. i++;
  1015. }
  1016. free(piece);
  1017. }
  1018. // check to make sure we parsed enough pieces
  1019. if (np != 4) {
  1020. fprintf(stderr, "error: affix %c header has insufficient data in line %s\n",achar,nl);
  1021. free(ptr);
  1022. return 1;
  1023. }
  1024. // store away ptr to first affentry
  1025. nptr = ptr;
  1026. // now parse numents affentries for this affix
  1027. for (int j=0; j < numents; j++) {
  1028. fgets(nl,MAXLNLEN,af);
  1029. mychomp(nl);
  1030. tp = nl;
  1031. i = 0;
  1032. np = 0;
  1033. // split line into pieces
  1034. while ((piece=mystrsep(&tp,' '))) {
  1035. if (*piece != '\0') {
  1036. switch(i) {
  1037. // piece 1 - is type
  1038. case 0: {
  1039. np++;
  1040. if (nptr != ptr) nptr->xpflg = ptr->xpflg;
  1041. break;
  1042. }
  1043. // piece 2 - is affix char
  1044. case 1: {
  1045. np++;
  1046. if (*piece != achar) {
  1047. fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
  1048. fprintf(stderr, "error: possible incorrect count\n");
  1049. free(piece);
  1050. return 1;
  1051. }
  1052. if (nptr != ptr) nptr->achar = ptr->achar;
  1053. break;
  1054. }
  1055. // piece 3 - is string to strip or 0 for null
  1056. case 2: {
  1057. np++;
  1058. nptr->strip = mystrdup(piece);
  1059. nptr->stripl = strlen(nptr->strip);
  1060. if (strcmp(nptr->strip,"0") == 0) {
  1061. free(nptr->strip);
  1062. nptr->strip=mystrdup("");
  1063. nptr->stripl = 0;
  1064. }
  1065. break;
  1066. }
  1067. // piece 4 - is affix string or 0 for null
  1068. case 3: {
  1069. np++;
  1070. nptr->appnd = mystrdup(piece);
  1071. nptr->appndl = strlen(nptr->appnd);
  1072. if (strcmp(nptr->appnd,"0") == 0) {
  1073. free(nptr->appnd);
  1074. nptr->appnd=mystrdup("");
  1075. nptr->appndl = 0;
  1076. }
  1077. break;
  1078. }
  1079. // piece 5 - is the conditions descriptions
  1080. case 4: { np++; encodeit(nptr,piece); }
  1081. default: break;
  1082. }
  1083. i++;
  1084. }
  1085. free(piece);
  1086. }
  1087. // check to make sure we parsed enough pieces
  1088. if (np != 5) {
  1089. fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
  1090. free(ptr);
  1091. return 1;
  1092. }
  1093. nptr++;
  1094. }
  1095. // now create SfxEntry or PfxEntry objects and use links to
  1096. // build an ordered (sorted by affix string) list
  1097. nptr = ptr;
  1098. for (int k = 0; k < numents; k++) {
  1099. if (at == 'P') {
  1100. PfxEntry * pfxptr = new PfxEntry(this,nptr);
  1101. build_pfxtree((AffEntry *)pfxptr);
  1102. } else {
  1103. SfxEntry * sfxptr = new SfxEntry(this,nptr);
  1104. build_sfxtree((AffEntry *)sfxptr);
  1105. }
  1106. nptr++;
  1107. }
  1108. free(ptr);
  1109. return 0;
  1110. }