PageRenderTime 52ms CodeModel.GetById 10ms RepoModel.GetById 1ms app.codeStats 0ms

/extensions/spellcheck/hunspell/src/hashmgr.cpp

http://github.com/zpao/v8monkey
C++ | 982 lines | 814 code | 61 blank | 107 comment | 284 complexity | 368381e8b15b43ed8ac649b3dd0ff7c9 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-3.0, AGPL-1.0, LGPL-2.1, BSD-3-Clause, GPL-2.0, JSON, Apache-2.0, 0BSD
  1. /******* BEGIN LICENSE BLOCK *******
  2. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  3. *
  4. * The contents of this file are subject to the Mozilla Public License Version
  5. * 1.1 (the "License"); you may not use this file except in compliance with
  6. * the License. You may obtain a copy of the License at
  7. * http://www.mozilla.org/MPL/
  8. *
  9. * Software distributed under the License is distributed on an "AS IS" basis,
  10. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11. * for the specific language governing rights and limitations under the
  12. * License.
  13. *
  14. * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
  15. * and László Németh (Hunspell). Portions created by the Initial Developers
  16. * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
  17. *
  18. * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
  19. * David Einstein (deinst@world.std.com)
  20. * László Németh (nemethl@gyorsposta.hu)
  21. * Caolan McNamara (caolanm@redhat.com)
  22. * Davide Prina
  23. * Giuseppe Modugno
  24. * Gianluca Turconi
  25. * Simon Brouwer
  26. * Noll Janos
  27. * Biro Arpad
  28. * Goldman Eleonora
  29. * Sarlos Tamas
  30. * Bencsath Boldizsar
  31. * Halacsy Peter
  32. * Dvornik Laszlo
  33. * Gefferth Andras
  34. * Nagy Viktor
  35. * Varga Daniel
  36. * Chris Halls
  37. * Rene Engelhard
  38. * Bram Moolenaar
  39. * Dafydd Jones
  40. * Harri Pitkanen
  41. * Andras Timar
  42. * Tor Lillqvist
  43. *
  44. * Alternatively, the contents of this file may be used under the terms of
  45. * either the GNU General Public License Version 2 or later (the "GPL"), or
  46. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  47. * in which case the provisions of the GPL or the LGPL are applicable instead
  48. * of those above. If you wish to allow use of your version of this file only
  49. * under the terms of either the GPL or the LGPL, and not to allow others to
  50. * use your version of this file under the terms of the MPL, indicate your
  51. * decision by deleting the provisions above and replace them with the notice
  52. * and other provisions required by the GPL or the LGPL. If you do not delete
  53. * the provisions above, a recipient may use your version of this file under
  54. * the terms of any one of the MPL, the GPL or the LGPL.
  55. *
  56. ******* END LICENSE BLOCK *******/
  57. #include <stdlib.h>
  58. #include <string.h>
  59. #include <stdio.h>
  60. #include <ctype.h>
  61. #include "hashmgr.hxx"
  62. #include "csutil.hxx"
  63. #include "atypes.hxx"
  64. // build a hash table from a munched word list
  65. HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
  66. {
  67. tablesize = 0;
  68. tableptr = NULL;
  69. flag_mode = FLAG_CHAR;
  70. complexprefixes = 0;
  71. utf8 = 0;
  72. langnum = 0;
  73. lang = NULL;
  74. enc = NULL;
  75. csconv = 0;
  76. ignorechars = NULL;
  77. ignorechars_utf16 = NULL;
  78. ignorechars_utf16_len = 0;
  79. numaliasf = 0;
  80. aliasf = NULL;
  81. numaliasm = 0;
  82. aliasm = NULL;
  83. forbiddenword = FORBIDDENWORD; // forbidden word signing flag
  84. load_config(apath, key);
  85. int ec = load_tables(tpath, key);
  86. if (ec) {
  87. /* error condition - what should we do here */
  88. HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
  89. if (tableptr) {
  90. free(tableptr);
  91. tableptr = NULL;
  92. }
  93. tablesize = 0;
  94. }
  95. }
  96. HashMgr::~HashMgr()
  97. {
  98. if (tableptr) {
  99. // now pass through hash table freeing up everything
  100. // go through column by column of the table
  101. for (int i=0; i < tablesize; i++) {
  102. struct hentry * pt = tableptr[i];
  103. struct hentry * nt = NULL;
  104. while(pt) {
  105. nt = pt->next;
  106. if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr);
  107. free(pt);
  108. pt = nt;
  109. }
  110. }
  111. free(tableptr);
  112. }
  113. tablesize = 0;
  114. if (aliasf) {
  115. for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
  116. free(aliasf);
  117. aliasf = NULL;
  118. if (aliasflen) {
  119. free(aliasflen);
  120. aliasflen = NULL;
  121. }
  122. }
  123. if (aliasm) {
  124. for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
  125. free(aliasm);
  126. aliasm = NULL;
  127. }
  128. #ifndef OPENOFFICEORG
  129. #ifndef MOZILLA_CLIENT
  130. if (utf8) free_utf_tbl();
  131. #endif
  132. #endif
  133. if (enc) free(enc);
  134. if (lang) free(lang);
  135. if (ignorechars) free(ignorechars);
  136. if (ignorechars_utf16) free(ignorechars_utf16);
  137. #ifdef MOZILLA_CLIENT
  138. delete [] csconv;
  139. #endif
  140. }
  141. // lookup a root word in the hashtable
  142. struct hentry * HashMgr::lookup(const char *word) const
  143. {
  144. struct hentry * dp;
  145. if (tableptr) {
  146. dp = tableptr[hash(word)];
  147. if (!dp) return NULL;
  148. for ( ; dp != NULL; dp = dp->next) {
  149. if (strcmp(word, dp->word) == 0) return dp;
  150. }
  151. }
  152. return NULL;
  153. }
  154. // add a word to the hash table (private)
  155. int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,
  156. int al, const char * desc, bool onlyupcase)
  157. {
  158. bool upcasehomonym = false;
  159. int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;
  160. // variable-length hash record with word and optional fields
  161. struct hentry* hp =
  162. (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl);
  163. if (!hp) return 1;
  164. char * hpw = hp->word;
  165. strcpy(hpw, word);
  166. if (ignorechars != NULL) {
  167. if (utf8) {
  168. remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);
  169. } else {
  170. remove_ignored_chars(hpw, ignorechars);
  171. }
  172. }
  173. if (complexprefixes) {
  174. if (utf8) reverseword_utf(hpw); else reverseword(hpw);
  175. }
  176. int i = hash(hpw);
  177. hp->blen = (unsigned char) wbl;
  178. hp->clen = (unsigned char) wcl;
  179. hp->alen = (short) al;
  180. hp->astr = aff;
  181. hp->next = NULL;
  182. hp->next_homonym = NULL;
  183. // store the description string or its pointer
  184. if (desc) {
  185. hp->var = H_OPT;
  186. if (aliasm) {
  187. hp->var += H_OPT_ALIASM;
  188. store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));
  189. } else {
  190. strcpy(hpw + wbl + 1, desc);
  191. if (complexprefixes) {
  192. if (utf8) reverseword_utf(HENTRY_DATA(hp));
  193. else reverseword(HENTRY_DATA(hp));
  194. }
  195. }
  196. if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON;
  197. } else hp->var = 0;
  198. struct hentry * dp = tableptr[i];
  199. if (!dp) {
  200. tableptr[i] = hp;
  201. return 0;
  202. }
  203. while (dp->next != NULL) {
  204. if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
  205. // remove hidden onlyupcase homonym
  206. if (!onlyupcase) {
  207. if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
  208. free(dp->astr);
  209. dp->astr = hp->astr;
  210. dp->alen = hp->alen;
  211. free(hp);
  212. return 0;
  213. } else {
  214. dp->next_homonym = hp;
  215. }
  216. } else {
  217. upcasehomonym = true;
  218. }
  219. }
  220. dp=dp->next;
  221. }
  222. if (strcmp(hp->word, dp->word) == 0) {
  223. // remove hidden onlyupcase homonym
  224. if (!onlyupcase) {
  225. if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
  226. free(dp->astr);
  227. dp->astr = hp->astr;
  228. dp->alen = hp->alen;
  229. free(hp);
  230. return 0;
  231. } else {
  232. dp->next_homonym = hp;
  233. }
  234. } else {
  235. upcasehomonym = true;
  236. }
  237. }
  238. if (!upcasehomonym) {
  239. dp->next = hp;
  240. } else {
  241. // remove hidden onlyupcase homonym
  242. if (hp->astr) free(hp->astr);
  243. free(hp);
  244. }
  245. return 0;
  246. }
  247. int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl,
  248. unsigned short * flags, int al, char * dp, int captype)
  249. {
  250. // add inner capitalized forms to handle the following allcap forms:
  251. // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
  252. // Allcaps with suffixes: CIA's -> CIA'S
  253. if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
  254. ((captype == ALLCAP) && (flags != NULL))) &&
  255. !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) {
  256. unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1));
  257. if (!flags2) return 1;
  258. if (al) memcpy(flags2, flags, al * sizeof(unsigned short));
  259. flags2[al] = ONLYUPCASEFLAG;
  260. if (utf8) {
  261. char st[BUFSIZE];
  262. w_char w[BUFSIZE];
  263. int wlen = u8_u16(w, BUFSIZE, word);
  264. mkallsmall_utf(w, wlen, langnum);
  265. mkallcap_utf(w, 1, langnum);
  266. u16_u8(st, BUFSIZE, w, wlen);
  267. return add_word(st,wbl,wcl,flags2,al+1,dp, true);
  268. } else {
  269. mkallsmall(word, csconv);
  270. mkinitcap(word, csconv);
  271. return add_word(word,wbl,wcl,flags2,al+1,dp, true);
  272. }
  273. }
  274. return 0;
  275. }
  276. // detect captype and modify word length for UTF-8 encoding
  277. int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) {
  278. int len;
  279. if (utf8) {
  280. w_char dest_utf[BUFSIZE];
  281. len = u8_u16(dest_utf, BUFSIZE, word);
  282. *captype = get_captype_utf8(dest_utf, len, langnum);
  283. } else {
  284. len = wbl;
  285. *captype = get_captype((char *) word, len, csconv);
  286. }
  287. return len;
  288. }
  289. // remove word (personal dictionary function for standalone applications)
  290. int HashMgr::remove(const char * word)
  291. {
  292. struct hentry * dp = lookup(word);
  293. while (dp) {
  294. if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
  295. unsigned short * flags =
  296. (unsigned short *) malloc(sizeof(short) * (dp->alen + 1));
  297. if (!flags) return 1;
  298. for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i];
  299. flags[dp->alen] = forbiddenword;
  300. dp->astr = flags;
  301. dp->alen++;
  302. flag_qsort(flags, 0, dp->alen);
  303. }
  304. dp = dp->next_homonym;
  305. }
  306. return 0;
  307. }
  308. /* remove forbidden flag to add a personal word to the hash */
  309. int HashMgr::remove_forbidden_flag(const char * word) {
  310. struct hentry * dp = lookup(word);
  311. if (!dp) return 1;
  312. while (dp) {
  313. if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
  314. if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic.
  315. else {
  316. unsigned short * flags2 =
  317. (unsigned short *) malloc(sizeof(short) * (dp->alen - 1));
  318. if (!flags2) return 1;
  319. int i, j = 0;
  320. for (i = 0; i < dp->alen; i++) {
  321. if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i];
  322. }
  323. dp->alen--;
  324. dp->astr = flags2; // XXX allowed forbidden words
  325. }
  326. }
  327. dp = dp->next_homonym;
  328. }
  329. return 0;
  330. }
  331. // add a custom dic. word to the hash table (public)
  332. int HashMgr::add(const char * word)
  333. {
  334. unsigned short * flags = NULL;
  335. int al = 0;
  336. if (remove_forbidden_flag(word)) {
  337. int captype;
  338. int wbl = strlen(word);
  339. int wcl = get_clen_and_captype(word, wbl, &captype);
  340. add_word(word, wbl, wcl, flags, al, NULL, false);
  341. return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype);
  342. }
  343. return 0;
  344. }
  345. int HashMgr::add_with_affix(const char * word, const char * example)
  346. {
  347. // detect captype and modify word length for UTF-8 encoding
  348. struct hentry * dp = lookup(example);
  349. remove_forbidden_flag(word);
  350. if (dp && dp->astr) {
  351. int captype;
  352. int wbl = strlen(word);
  353. int wcl = get_clen_and_captype(word, wbl, &captype);
  354. if (aliasf) {
  355. add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);
  356. } else {
  357. unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short));
  358. if (flags) {
  359. memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
  360. add_word(word, wbl, wcl, flags, dp->alen, NULL, false);
  361. } else return 1;
  362. }
  363. return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype);
  364. }
  365. return 1;
  366. }
  367. // walk the hash table entry by entry - null at end
  368. // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
  369. struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
  370. {
  371. if (hp && hp->next != NULL) return hp->next;
  372. for (col++; col < tablesize; col++) {
  373. if (tableptr[col]) return tableptr[col];
  374. }
  375. // null at end and reset to start
  376. col = -1;
  377. return NULL;
  378. }
  379. // load a munched word list and build a hash table on the fly
  380. int HashMgr::load_tables(const char * tpath, const char * key)
  381. {
  382. int al;
  383. char * ap;
  384. char * dp;
  385. char * dp2;
  386. unsigned short * flags;
  387. char * ts;
  388. // open dictionary file
  389. FileMgr * dict = new FileMgr(tpath, key);
  390. if (dict == NULL) return 1;
  391. // first read the first line of file to get hash table size */
  392. if (!(ts = dict->getline())) {
  393. HUNSPELL_WARNING(stderr, "error: empty dic file\n");
  394. delete dict;
  395. return 2;
  396. }
  397. mychomp(ts);
  398. /* remove byte order mark */
  399. if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) {
  400. memmove(ts, ts+3, strlen(ts+3)+1);
  401. // warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions
  402. }
  403. tablesize = atoi(ts);
  404. if (tablesize == 0) {
  405. HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n");
  406. delete dict;
  407. return 4;
  408. }
  409. tablesize = tablesize + 5 + USERWORD;
  410. if ((tablesize %2) == 0) tablesize++;
  411. // allocate the hash table
  412. tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *));
  413. if (! tableptr) {
  414. delete dict;
  415. return 3;
  416. }
  417. for (int i=0; i<tablesize; i++) tableptr[i] = NULL;
  418. // loop through all words on much list and add to hash
  419. // table and create word and affix strings
  420. while ((ts = dict->getline())) {
  421. mychomp(ts);
  422. // split each line into word and morphological description
  423. dp = ts;
  424. while ((dp = strchr(dp, ':'))) {
  425. if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) {
  426. for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--);
  427. if (dp < ts) { // missing word
  428. dp = NULL;
  429. } else {
  430. *(dp + 1) = '\0';
  431. dp = dp + 2;
  432. }
  433. break;
  434. }
  435. dp++;
  436. }
  437. // tabulator is the old morphological field separator
  438. dp2 = strchr(ts, '\t');
  439. if (dp2 && (!dp || dp2 < dp)) {
  440. *dp2 = '\0';
  441. dp = dp2 + 1;
  442. }
  443. // split each line into word and affix char strings
  444. // "\/" signs slash in words (not affix separator)
  445. // "/" at beginning of the line is word character (not affix separator)
  446. ap = strchr(ts,'/');
  447. while (ap) {
  448. if (ap == ts) {
  449. ap++;
  450. continue;
  451. } else if (*(ap - 1) != '\\') break;
  452. // replace "\/" with "/"
  453. for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
  454. ap = strchr(ap,'/');
  455. }
  456. if (ap) {
  457. *ap = '\0';
  458. if (aliasf) {
  459. int index = atoi(ap + 1);
  460. al = get_aliasf(index, &flags, dict);
  461. if (!al) {
  462. HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum());
  463. *ap = '\0';
  464. }
  465. } else {
  466. al = decode_flags(&flags, ap + 1, dict);
  467. if (al == -1) {
  468. HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
  469. delete dict;
  470. return 6;
  471. }
  472. flag_qsort(flags, 0, al);
  473. }
  474. } else {
  475. al = 0;
  476. ap = NULL;
  477. flags = NULL;
  478. }
  479. int captype;
  480. int wbl = strlen(ts);
  481. int wcl = get_clen_and_captype(ts, wbl, &captype);
  482. // add the word and its index plus its capitalized form optionally
  483. if (add_word(ts,wbl,wcl,flags,al,dp, false) ||
  484. add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {
  485. delete dict;
  486. return 5;
  487. }
  488. }
  489. delete dict;
  490. return 0;
  491. }
  492. // the hash function is a simple load and rotate
  493. // algorithm borrowed
  494. int HashMgr::hash(const char * word) const
  495. {
  496. long hv = 0;
  497. for (int i=0; i < 4 && *word != 0; i++)
  498. hv = (hv << 8) | (*word++);
  499. while (*word != 0) {
  500. ROTATE(hv,ROTATE_LEN);
  501. hv ^= (*word++);
  502. }
  503. return (unsigned long) hv % tablesize;
  504. }
  505. int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {
  506. int len;
  507. if (*flags == '\0') {
  508. *result = NULL;
  509. return 0;
  510. }
  511. switch (flag_mode) {
  512. case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
  513. len = strlen(flags);
  514. if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum());
  515. len /= 2;
  516. *result = (unsigned short *) malloc(len * sizeof(short));
  517. if (!*result) return -1;
  518. for (int i = 0; i < len; i++) {
  519. (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1];
  520. }
  521. break;
  522. }
  523. case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
  524. int i;
  525. len = 1;
  526. char * src = flags;
  527. unsigned short * dest;
  528. char * p;
  529. for (p = flags; *p; p++) {
  530. if (*p == ',') len++;
  531. }
  532. *result = (unsigned short *) malloc(len * sizeof(short));
  533. if (!*result) return -1;
  534. dest = *result;
  535. for (p = flags; *p; p++) {
  536. if (*p == ',') {
  537. i = atoi(src);
  538. if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
  539. af->getlinenum(), i, DEFAULTFLAGS - 1);
  540. *dest = (unsigned short) i;
  541. if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
  542. src = p + 1;
  543. dest++;
  544. }
  545. }
  546. i = atoi(src);
  547. if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
  548. af->getlinenum(), i, DEFAULTFLAGS - 1);
  549. *dest = (unsigned short) i;
  550. if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
  551. break;
  552. }
  553. case FLAG_UNI: { // UTF-8 characters
  554. w_char w[BUFSIZE/2];
  555. len = u8_u16(w, BUFSIZE/2, flags);
  556. *result = (unsigned short *) malloc(len * sizeof(short));
  557. if (!*result) return -1;
  558. memcpy(*result, w, len * sizeof(short));
  559. break;
  560. }
  561. default: { // Ispell's one-character flags (erfg -> e r f g)
  562. unsigned short * dest;
  563. len = strlen(flags);
  564. *result = (unsigned short *) malloc(len * sizeof(short));
  565. if (!*result) return -1;
  566. dest = *result;
  567. for (unsigned char * p = (unsigned char *) flags; *p; p++) {
  568. *dest = (unsigned short) *p;
  569. dest++;
  570. }
  571. }
  572. }
  573. return len;
  574. }
  575. unsigned short HashMgr::decode_flag(const char * f) {
  576. unsigned short s = 0;
  577. int i;
  578. switch (flag_mode) {
  579. case FLAG_LONG:
  580. s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
  581. break;
  582. case FLAG_NUM:
  583. i = atoi(f);
  584. if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1);
  585. s = (unsigned short) i;
  586. break;
  587. case FLAG_UNI:
  588. u8_u16((w_char *) &s, 1, f);
  589. break;
  590. default:
  591. s = (unsigned short) *((unsigned char *)f);
  592. }
  593. if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
  594. return s;
  595. }
  596. char * HashMgr::encode_flag(unsigned short f) {
  597. unsigned char ch[10];
  598. if (f==0) return mystrdup("(NULL)");
  599. if (flag_mode == FLAG_LONG) {
  600. ch[0] = (unsigned char) (f >> 8);
  601. ch[1] = (unsigned char) (f - ((f >> 8) << 8));
  602. ch[2] = '\0';
  603. } else if (flag_mode == FLAG_NUM) {
  604. sprintf((char *) ch, "%d", f);
  605. } else if (flag_mode == FLAG_UNI) {
  606. u16_u8((char *) &ch, 10, (w_char *) &f, 1);
  607. } else {
  608. ch[0] = (unsigned char) (f);
  609. ch[1] = '\0';
  610. }
  611. return mystrdup((char *) ch);
  612. }
  613. // read in aff file and set flag mode
  614. int HashMgr::load_config(const char * affpath, const char * key)
  615. {
  616. char * line; // io buffers
  617. int firstline = 1;
  618. // open the affix file
  619. FileMgr * afflst = new FileMgr(affpath, key);
  620. if (!afflst) {
  621. HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
  622. return 1;
  623. }
  624. // read in each line ignoring any that do not
  625. // start with a known line type indicator
  626. while ((line = afflst->getline())) {
  627. mychomp(line);
  628. /* remove byte order mark */
  629. if (firstline) {
  630. firstline = 0;
  631. if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1);
  632. }
  633. /* parse in the try string */
  634. if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
  635. if (flag_mode != FLAG_CHAR) {
  636. HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum());
  637. }
  638. if (strstr(line, "long")) flag_mode = FLAG_LONG;
  639. if (strstr(line, "num")) flag_mode = FLAG_NUM;
  640. if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
  641. if (flag_mode == FLAG_CHAR) {
  642. HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum());
  643. }
  644. }
  645. if (strncmp(line,"FORBIDDENWORD",13) == 0) {
  646. char * st = NULL;
  647. if (parse_string(line, &st, afflst->getlinenum())) {
  648. delete afflst;
  649. return 1;
  650. }
  651. forbiddenword = decode_flag(st);
  652. free(st);
  653. }
  654. if (strncmp(line, "SET", 3) == 0) {
  655. if (parse_string(line, &enc, afflst->getlinenum())) {
  656. delete afflst;
  657. return 1;
  658. }
  659. if (strcmp(enc, "UTF-8") == 0) {
  660. utf8 = 1;
  661. #ifndef OPENOFFICEORG
  662. #ifndef MOZILLA_CLIENT
  663. initialize_utf_tbl();
  664. #endif
  665. #endif
  666. } else csconv = get_current_cs(enc);
  667. }
  668. if (strncmp(line, "LANG", 4) == 0) {
  669. if (parse_string(line, &lang, afflst->getlinenum())) {
  670. delete afflst;
  671. return 1;
  672. }
  673. langnum = get_lang_num(lang);
  674. }
  675. /* parse in the ignored characters (for example, Arabic optional diacritics characters */
  676. if (strncmp(line,"IGNORE",6) == 0) {
  677. if (parse_array(line, &ignorechars, &ignorechars_utf16,
  678. &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
  679. delete afflst;
  680. return 1;
  681. }
  682. }
  683. if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
  684. if (parse_aliasf(line, afflst)) {
  685. delete afflst;
  686. return 1;
  687. }
  688. }
  689. if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
  690. if (parse_aliasm(line, afflst)) {
  691. delete afflst;
  692. return 1;
  693. }
  694. }
  695. if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
  696. if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
  697. }
  698. if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING);
  699. delete afflst;
  700. return 0;
  701. }
  702. /* parse in the ALIAS table */
  703. int HashMgr::parse_aliasf(char * line, FileMgr * af)
  704. {
  705. if (numaliasf != 0) {
  706. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  707. return 1;
  708. }
  709. char * tp = line;
  710. char * piece;
  711. int i = 0;
  712. int np = 0;
  713. piece = mystrsep(&tp, 0);
  714. while (piece) {
  715. if (*piece != '\0') {
  716. switch(i) {
  717. case 0: { np++; break; }
  718. case 1: {
  719. numaliasf = atoi(piece);
  720. if (numaliasf < 1) {
  721. numaliasf = 0;
  722. aliasf = NULL;
  723. aliasflen = NULL;
  724. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
  725. return 1;
  726. }
  727. aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
  728. aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
  729. if (!aliasf || !aliasflen) {
  730. numaliasf = 0;
  731. if (aliasf) free(aliasf);
  732. if (aliasflen) free(aliasflen);
  733. aliasf = NULL;
  734. aliasflen = NULL;
  735. return 1;
  736. }
  737. np++;
  738. break;
  739. }
  740. default: break;
  741. }
  742. i++;
  743. }
  744. piece = mystrsep(&tp, 0);
  745. }
  746. if (np != 2) {
  747. numaliasf = 0;
  748. free(aliasf);
  749. free(aliasflen);
  750. aliasf = NULL;
  751. aliasflen = NULL;
  752. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  753. return 1;
  754. }
  755. /* now parse the numaliasf lines to read in the remainder of the table */
  756. char * nl;
  757. for (int j=0; j < numaliasf; j++) {
  758. if (!(nl = af->getline())) return 1;
  759. mychomp(nl);
  760. tp = nl;
  761. i = 0;
  762. aliasf[j] = NULL;
  763. aliasflen[j] = 0;
  764. piece = mystrsep(&tp, 0);
  765. while (piece) {
  766. if (*piece != '\0') {
  767. switch(i) {
  768. case 0: {
  769. if (strncmp(piece,"AF",2) != 0) {
  770. numaliasf = 0;
  771. free(aliasf);
  772. free(aliasflen);
  773. aliasf = NULL;
  774. aliasflen = NULL;
  775. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  776. return 1;
  777. }
  778. break;
  779. }
  780. case 1: {
  781. aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece, af);
  782. flag_qsort(aliasf[j], 0, aliasflen[j]);
  783. break;
  784. }
  785. default: break;
  786. }
  787. i++;
  788. }
  789. piece = mystrsep(&tp, 0);
  790. }
  791. if (!aliasf[j]) {
  792. free(aliasf);
  793. free(aliasflen);
  794. aliasf = NULL;
  795. aliasflen = NULL;
  796. numaliasf = 0;
  797. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  798. return 1;
  799. }
  800. }
  801. return 0;
  802. }
  803. int HashMgr::is_aliasf() {
  804. return (aliasf != NULL);
  805. }
  806. int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) {
  807. if ((index > 0) && (index <= numaliasf)) {
  808. *fvec = aliasf[index - 1];
  809. return aliasflen[index - 1];
  810. }
  811. HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->getlinenum(), index);
  812. *fvec = NULL;
  813. return 0;
  814. }
  815. /* parse morph alias definitions */
  816. int HashMgr::parse_aliasm(char * line, FileMgr * af)
  817. {
  818. if (numaliasm != 0) {
  819. HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
  820. return 1;
  821. }
  822. char * tp = line;
  823. char * piece;
  824. int i = 0;
  825. int np = 0;
  826. piece = mystrsep(&tp, 0);
  827. while (piece) {
  828. if (*piece != '\0') {
  829. switch(i) {
  830. case 0: { np++; break; }
  831. case 1: {
  832. numaliasm = atoi(piece);
  833. if (numaliasm < 1) {
  834. HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
  835. return 1;
  836. }
  837. aliasm = (char **) malloc(numaliasm * sizeof(char *));
  838. if (!aliasm) {
  839. numaliasm = 0;
  840. return 1;
  841. }
  842. np++;
  843. break;
  844. }
  845. default: break;
  846. }
  847. i++;
  848. }
  849. piece = mystrsep(&tp, 0);
  850. }
  851. if (np != 2) {
  852. numaliasm = 0;
  853. free(aliasm);
  854. aliasm = NULL;
  855. HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
  856. return 1;
  857. }
  858. /* now parse the numaliasm lines to read in the remainder of the table */
  859. char * nl = line;
  860. for (int j=0; j < numaliasm; j++) {
  861. if (!(nl = af->getline())) return 1;
  862. mychomp(nl);
  863. tp = nl;
  864. i = 0;
  865. aliasm[j] = NULL;
  866. piece = mystrsep(&tp, ' ');
  867. while (piece) {
  868. if (*piece != '\0') {
  869. switch(i) {
  870. case 0: {
  871. if (strncmp(piece,"AM",2) != 0) {
  872. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  873. numaliasm = 0;
  874. free(aliasm);
  875. aliasm = NULL;
  876. return 1;
  877. }
  878. break;
  879. }
  880. case 1: {
  881. // add the remaining of the line
  882. if (*tp) {
  883. *(tp - 1) = ' ';
  884. tp = tp + strlen(tp);
  885. }
  886. if (complexprefixes) {
  887. if (utf8) reverseword_utf(piece);
  888. else reverseword(piece);
  889. }
  890. aliasm[j] = mystrdup(piece);
  891. if (!aliasm[j]) {
  892. numaliasm = 0;
  893. free(aliasm);
  894. aliasm = NULL;
  895. return 1;
  896. }
  897. break; }
  898. default: break;
  899. }
  900. i++;
  901. }
  902. piece = mystrsep(&tp, ' ');
  903. }
  904. if (!aliasm[j]) {
  905. numaliasm = 0;
  906. free(aliasm);
  907. aliasm = NULL;
  908. HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
  909. return 1;
  910. }
  911. }
  912. return 0;
  913. }
  914. int HashMgr::is_aliasm() {
  915. return (aliasm != NULL);
  916. }
  917. char * HashMgr::get_aliasm(int index) {
  918. if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
  919. HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
  920. return NULL;
  921. }