PageRenderTime 56ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/ext/hunspell/mythes.cxx

http://tortoisesvn.googlecode.com/
C++ | 356 lines | 251 code | 65 blank | 40 comment | 61 complexity | bc23b7033f83269dbbc204059ca9a243 MD5 | raw file
Possible License(s): CC-BY-SA-3.0, GPL-3.0, MPL-2.0-no-copyleft-exception, GPL-2.0, LGPL-2.0, LGPL-2.1, BSD-3-Clause, Apache-2.0, LGPL-3.0
  1. #include <stdio.h>
  2. #include <string.h>
  3. #include <stdlib.h>
  4. #include <errno.h>
  5. #include "mythes.hxx"
  6. MyThes::MyThes(const char* idxpath, const char * datpath)
  7. {
  8. nw = 0;
  9. encoding = NULL;
  10. list = NULL;
  11. offst = NULL;
  12. if (thInitialize(idxpath, datpath) != 1) {
  13. fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath);
  14. fflush(stderr);
  15. if (encoding) free((void*)encoding);
  16. if (list) free((void*)list);
  17. if (offst) free((void*)offst);
  18. // did not initialize properly - throw exception?
  19. }
  20. }
  21. MyThes::~MyThes()
  22. {
  23. if (thCleanup() != 1) {
  24. /* did not cleanup properly - throw exception? */
  25. }
  26. if (encoding) free((void*)encoding);
  27. encoding = NULL;
  28. list = NULL;
  29. offst = NULL;
  30. }
  31. int MyThes::thInitialize(const char* idxpath, const char* datpath)
  32. {
  33. // open the index file
  34. FILE * pifile = fopen(idxpath,"r");
  35. if (!pifile) {
  36. return 0;
  37. }
  38. char * wrd = (char *)calloc(1, MAX_WD_LEN);
  39. // parse in encoding and index size */
  40. int len = readLine(pifile,wrd,MAX_WD_LEN);
  41. encoding = mystrdup(wrd);
  42. len = readLine(pifile,wrd,MAX_WD_LEN);
  43. int idxsz = atoi(wrd);
  44. // now allocate list, offst for the given size
  45. list = (char**)calloc(idxsz,sizeof(char*));
  46. offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int));
  47. if ( (!(list)) || (!(offst)) ) {
  48. fprintf(stderr,"Error - bad memory allocation\n");
  49. fflush(stderr);
  50. free((void *)wrd);
  51. fclose(pifile);
  52. return 0;
  53. }
  54. // now parse the remaining lines of the index
  55. len = readLine(pifile,wrd,MAX_WD_LEN);
  56. while (len > 0)
  57. {
  58. int np = mystr_indexOfChar(wrd,'|');
  59. if (nw < idxsz) {
  60. if (np >= 0) {
  61. *(wrd+np) = '\0';
  62. list[nw] = (char *)calloc(1,(np+1));
  63. memcpy((list[nw]),wrd,np);
  64. offst[nw] = atoi(wrd+np+1);
  65. nw++;
  66. }
  67. }
  68. len = readLine(pifile,wrd,MAX_WD_LEN);
  69. }
  70. free((void *)wrd);
  71. fclose(pifile);
  72. pifile=NULL;
  73. /* next open the data file */
  74. pdfile = fopen(datpath,"r");
  75. return pdfile ? 1 : 0;
  76. }
  77. int MyThes::thCleanup()
  78. {
  79. /* first close the data file */
  80. if (pdfile) {
  81. fclose(pdfile);
  82. pdfile=NULL;
  83. }
  84. /* now free up all the allocated strings on the list */
  85. for (int i=0; i < nw; i++)
  86. {
  87. if (list[i]) {
  88. free(list[i]);
  89. list[i] = 0;
  90. }
  91. }
  92. if (list) free((void*)list);
  93. if (offst) free((void*)offst);
  94. nw = 0;
  95. return 1;
  96. }
  97. // lookup text in index and count of meanings and a list of meaning entries
  98. // with each entry having a synonym count and pointer to an
  99. // array of char * (i.e the synonyms)
  100. //
  101. // note: calling routine should call CleanUpAfterLookup with the original
  102. // meaning point and count to properly deallocate memory
  103. int MyThes::Lookup(const char * pText, int len, mentry** pme)
  104. {
  105. *pme = NULL;
  106. // handle the case of missing file or file related errors
  107. if (! pdfile) return 0;
  108. long offset = 0;
  109. /* copy search word and make sure null terminated */
  110. char * wrd = (char *) calloc(1,(len+1));
  111. memcpy(wrd,pText,len);
  112. /* find it in the list */
  113. int idx = binsearch(wrd,list,nw);
  114. free(wrd);
  115. if (idx < 0) return 0;
  116. // now seek to the offset
  117. offset = (long) offst[idx];
  118. int rc = fseek(pdfile,offset,SEEK_SET);
  119. if (rc) {
  120. return 0;
  121. }
  122. // grab the count of the number of meanings
  123. // and allocate a list of meaning entries
  124. char * buf = NULL;
  125. buf = (char *) malloc( MAX_LN_LEN );
  126. if (!buf) return 0;
  127. readLine(pdfile, buf, (MAX_LN_LEN-1));
  128. int np = mystr_indexOfChar(buf,'|');
  129. if (np < 0) {
  130. free(buf);
  131. return 0;
  132. }
  133. int nmeanings = atoi(buf+np+1);
  134. *pme = (mentry*) malloc( nmeanings * sizeof(mentry) );
  135. if (!(*pme)) {
  136. free(buf);
  137. return 0;
  138. }
  139. // now read in each meaning and parse it to get defn, count and synonym lists
  140. mentry* pm = *(pme);
  141. char dfn[MAX_WD_LEN];
  142. for (int j = 0; j < nmeanings; j++) {
  143. readLine(pdfile, buf, (MAX_LN_LEN-1));
  144. pm->count = 0;
  145. pm->psyns = NULL;
  146. pm->defn = NULL;
  147. // store away the part of speech for later use
  148. char * p = buf;
  149. char * pos = NULL;
  150. np = mystr_indexOfChar(p,'|');
  151. if (np >= 0) {
  152. *(buf+np) = '\0';
  153. pos = mystrdup(p);
  154. p = p + np + 1;
  155. } else {
  156. pos = mystrdup("");
  157. }
  158. // count the number of fields in the remaining line
  159. int nf = 1;
  160. char * d = p;
  161. np = mystr_indexOfChar(d,'|');
  162. while ( np >= 0 ) {
  163. nf++;
  164. d = d + np + 1;
  165. np = mystr_indexOfChar(d,'|');
  166. }
  167. pm->count = nf;
  168. pm->psyns = (char **) malloc(nf*sizeof(char*));
  169. // fill in the synonym list
  170. d = p;
  171. for (int j = 0; j < nf; j++) {
  172. np = mystr_indexOfChar(d,'|');
  173. if (np > 0) {
  174. *(d+np) = '\0';
  175. pm->psyns[j] = mystrdup(d);
  176. d = d + np + 1;
  177. } else {
  178. pm->psyns[j] = mystrdup(d);
  179. }
  180. }
  181. // add pos to first synonym to create the definition
  182. int k = strlen(pos);
  183. int m = strlen(pm->psyns[0]);
  184. if ((k+m) < (MAX_WD_LEN - 1)) {
  185. strncpy(dfn,pos,k);
  186. *(dfn+k) = ' ';
  187. strncpy((dfn+k+1),(pm->psyns[0]),m+1);
  188. pm->defn = mystrdup(dfn);
  189. } else {
  190. pm->defn = mystrdup(pm->psyns[0]);
  191. }
  192. free(pos);
  193. pm++;
  194. }
  195. free(buf);
  196. return nmeanings;
  197. }
  198. void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings)
  199. {
  200. if (nmeanings == 0) return;
  201. if ((*pme) == NULL) return;
  202. mentry * pm = *pme;
  203. for (int i = 0; i < nmeanings; i++) {
  204. int count = pm->count;
  205. for (int j = 0; j < count; j++) {
  206. if (pm->psyns[j]) free(pm->psyns[j]);
  207. pm->psyns[j] = NULL;
  208. }
  209. if (pm->psyns) free(pm->psyns);
  210. pm->psyns = NULL;
  211. if (pm->defn) free(pm->defn);
  212. pm->defn = NULL;
  213. pm->count = 0;
  214. pm++;
  215. }
  216. pm = *pme;
  217. free(pm);
  218. *pme = NULL;
  219. return;
  220. }
  221. // read a line of text from a text file stripping
  222. // off the line terminator and replacing it with
  223. // a null string terminator.
  224. // returns: -1 on error or the number of characters in
  225. // in the returning string
  226. // A maximum of nc characters will be returned
  227. int MyThes::readLine(FILE * pf, char * buf, int nc)
  228. {
  229. if (fgets(buf,nc,pf)) {
  230. mychomp(buf);
  231. return strlen(buf);
  232. }
  233. return -1;
  234. }
  235. // performs a binary search on null terminated character
  236. // strings
  237. //
  238. // returns: -1 on not found
  239. // index of wrd in the list[]
  240. int MyThes::binsearch(char * sw, char* list[], int nlst)
  241. {
  242. int lp, up, mp, j, indx;
  243. lp = 0;
  244. up = nlst-1;
  245. indx = -1;
  246. if (strcmp(sw,list[lp]) < 0) return -1;
  247. if (strcmp(sw,list[up]) > 0) return -1;
  248. while (indx < 0 ) {
  249. mp = (int)((lp+up) >> 1);
  250. j = strcmp(sw,list[mp]);
  251. if ( j > 0) {
  252. lp = mp + 1;
  253. } else if (j < 0 ) {
  254. up = mp - 1;
  255. } else {
  256. indx = mp;
  257. }
  258. if (lp > up) return -1;
  259. }
  260. return indx;
  261. }
  262. char * MyThes::get_th_encoding()
  263. {
  264. if (encoding) return encoding;
  265. return NULL;
  266. }
  267. // string duplication routine
  268. char * MyThes::mystrdup(const char * p)
  269. {
  270. int sl = strlen(p) + 1;
  271. char * d = (char *)malloc(sl);
  272. if (d) {
  273. memcpy(d,p,sl);
  274. return d;
  275. }
  276. return NULL;
  277. }
  278. // remove cross-platform text line end characters
  279. void MyThes::mychomp(char * s)
  280. {
  281. int k = strlen(s);
  282. if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
  283. if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
  284. }
  285. // return index of char in string
  286. int MyThes::mystr_indexOfChar(const char * d, int c)
  287. {
  288. char * p = strchr((char *)d,c);
  289. if (p) return (int)(p-d);
  290. return -1;
  291. }