PageRenderTime 26ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/mythes-1.2.3/mythes.cxx

#
C++ | 372 lines | 270 code | 63 blank | 39 comment | 62 complexity | 793c41e1b09c407981cdbc613f64d5ea MD5 | raw file
  1. #include "COPYING"
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <stdlib.h>
  5. #include <errno.h>
  6. #include <limits>
  7. #include <vector>
  8. #include "mythes.hxx"
  9. MyThes::MyThes(const char* idxpath, const char * datpath)
  10. {
  11. nw = 0;
  12. encoding = NULL;
  13. list = NULL;
  14. offst = NULL;
  15. pdfile = NULL;
  16. if (thInitialize(idxpath, datpath) != 1) {
  17. fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath);
  18. fflush(stderr);
  19. thCleanup();
  20. // did not initialize properly - throw exception?
  21. }
  22. }
  23. MyThes::~MyThes()
  24. {
  25. thCleanup();
  26. }
  27. int MyThes::thInitialize(const char* idxpath, const char* datpath)
  28. {
  29. // open the index file
  30. FILE * pifile = fopen(idxpath,"r");
  31. if (!pifile) {
  32. return 0;
  33. }
  34. // parse in encoding and index size */
  35. std::vector<char> buffer(MAX_WD_LEN);
  36. char * wrd = &buffer[0];
  37. int len = readLine(pifile,wrd,MAX_WD_LEN);
  38. encoding = mystrdup(wrd);
  39. len = readLine(pifile,wrd,MAX_WD_LEN);
  40. int idxsz = atoi(wrd);
  41. if (idxsz <= 0 || idxsz > std::numeric_limits<ssize_t>::max() / sizeof(sizeof(char*))) {
  42. fprintf(stderr,"Error - bad index %d\n", idxsz);
  43. fclose(pifile);
  44. return 0;
  45. }
  46. // now allocate list, offst for the given size
  47. list = (char**) calloc(idxsz,sizeof(char*));
  48. offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int));
  49. if ( (!(list)) || (!(offst)) ) {
  50. fprintf(stderr,"Error - bad memory allocation\n");
  51. fclose(pifile);
  52. return 0;
  53. }
  54. // now parse the remaining lines of the index
  55. len = readLine(pifile,wrd,MAX_WD_LEN);
  56. while (len > 0)
  57. {
  58. int np = mystr_indexOfChar(wrd,'|');
  59. if (nw < idxsz) {
  60. if (np >= 0) {
  61. *(wrd+np) = '\0';
  62. list[nw] = (char *)calloc(1,(np+1));
  63. if (!list[nw]) {
  64. fprintf(stderr,"Error - bad memory allocation\n");
  65. fflush(stderr);
  66. fclose(pifile);
  67. return 0;
  68. }
  69. memcpy((list[nw]),wrd,np);
  70. offst[nw] = atoi(wrd+np+1);
  71. nw++;
  72. }
  73. }
  74. len = readLine(pifile,wrd,MAX_WD_LEN);
  75. }
  76. fclose(pifile);
  77. /* next open the data file */
  78. pdfile = fopen(datpath,"r");
  79. if (!pdfile) {
  80. return 0;
  81. }
  82. return 1;
  83. }
  84. void MyThes::thCleanup()
  85. {
  86. /* first close the data file */
  87. if (pdfile) {
  88. fclose(pdfile);
  89. pdfile=NULL;
  90. }
  91. if (list)
  92. {
  93. /* now free up all the allocated strings on the list */
  94. for (int i=0; i < nw; i++)
  95. {
  96. if (list[i]) {
  97. free(list[i]);
  98. list[i] = 0;
  99. }
  100. }
  101. free((void*)list);
  102. }
  103. if (encoding) free((void*)encoding);
  104. if (offst) free((void*)offst);
  105. encoding = NULL;
  106. list = NULL;
  107. offst = NULL;
  108. nw = 0;
  109. }
  110. // lookup text in index and count of meanings and a list of meaning entries
  111. // with each entry having a synonym count and pointer to an
  112. // array of char * (i.e the synonyms)
  113. //
  114. // note: calling routine should call CleanUpAfterLookup with the original
  115. // meaning point and count to properly deallocate memory
  116. int MyThes::Lookup(const char * pText, int len, mentry** pme)
  117. {
  118. *pme = NULL;
  119. // handle the case of missing file or file related errors
  120. if (! pdfile) return 0;
  121. long offset = 0;
  122. /* copy search word and make sure null terminated */
  123. std::vector<char> buffer(len+1);
  124. char * wrd = &buffer[0];
  125. memcpy(wrd,pText,len);
  126. /* find it in the list */
  127. int idx = nw > 0 ? binsearch(wrd,list,nw) : -1;
  128. if (idx < 0) return 0;
  129. // now seek to the offset
  130. offset = (long) offst[idx];
  131. int rc = fseek(pdfile,offset,SEEK_SET);
  132. if (rc) {
  133. return 0;
  134. }
  135. // grab the count of the number of meanings
  136. // and allocate a list of meaning entries
  137. char * buf = NULL;
  138. buf = (char *) malloc( MAX_LN_LEN );
  139. if (!buf) return 0;
  140. readLine(pdfile, buf, (MAX_LN_LEN-1));
  141. int np = mystr_indexOfChar(buf,'|');
  142. if (np < 0) {
  143. free(buf);
  144. return 0;
  145. }
  146. int nmeanings = atoi(buf+np+1);
  147. if (nmeanings < 0 || nmeanings > std::numeric_limits<ssize_t>::max() / sizeof(mentry))
  148. nmeanings = 0;
  149. *pme = (mentry*)(nmeanings ? malloc(nmeanings * sizeof(mentry)) : NULL);
  150. if (!(*pme)) {
  151. free(buf);
  152. return 0;
  153. }
  154. // now read in each meaning and parse it to get defn, count and synonym lists
  155. mentry* pm = *(pme);
  156. char dfn[MAX_WD_LEN];
  157. for (int j = 0; j < nmeanings; j++) {
  158. readLine(pdfile, buf, (MAX_LN_LEN-1));
  159. pm->count = 0;
  160. pm->psyns = NULL;
  161. pm->defn = NULL;
  162. // store away the part of speech for later use
  163. char * p = buf;
  164. char * pos = NULL;
  165. np = mystr_indexOfChar(p,'|');
  166. if (np >= 0) {
  167. *(buf+np) = '\0';
  168. pos = mystrdup(p);
  169. p = p + np + 1;
  170. } else {
  171. pos = mystrdup("");
  172. }
  173. // count the number of fields in the remaining line
  174. int nf = 1;
  175. char * d = p;
  176. np = mystr_indexOfChar(d,'|');
  177. while ( np >= 0 ) {
  178. nf++;
  179. d = d + np + 1;
  180. np = mystr_indexOfChar(d,'|');
  181. }
  182. pm->count = nf;
  183. pm->psyns = (char **) malloc(nf*sizeof(char*));
  184. // fill in the synonym list
  185. d = p;
  186. for (int jj = 0; jj < nf; jj++)
  187. {
  188. np = mystr_indexOfChar(d,'|');
  189. if (np > 0)
  190. {
  191. *(d+np) = '\0';
  192. pm->psyns[jj] = mystrdup(d);
  193. d = d + np + 1;
  194. }
  195. else
  196. {
  197. pm->psyns[jj] = mystrdup(d);
  198. }
  199. }
  200. // add pos to first synonym to create the definition
  201. int k = strlen(pos);
  202. int m = strlen(pm->psyns[0]);
  203. if ((k+m) < (MAX_WD_LEN - 1)) {
  204. strncpy(dfn,pos,k);
  205. *(dfn+k) = ' ';
  206. strncpy((dfn+k+1),(pm->psyns[0]),m+1);
  207. pm->defn = mystrdup(dfn);
  208. } else {
  209. pm->defn = mystrdup(pm->psyns[0]);
  210. }
  211. free(pos);
  212. pm++;
  213. }
  214. free(buf);
  215. return nmeanings;
  216. }
  217. void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings)
  218. {
  219. if (nmeanings == 0) return;
  220. if ((*pme) == NULL) return;
  221. mentry * pm = *pme;
  222. for (int i = 0; i < nmeanings; i++) {
  223. int count = pm->count;
  224. for (int j = 0; j < count; j++) {
  225. if (pm->psyns[j]) free(pm->psyns[j]);
  226. pm->psyns[j] = NULL;
  227. }
  228. if (pm->psyns) free(pm->psyns);
  229. pm->psyns = NULL;
  230. if (pm->defn) free(pm->defn);
  231. pm->defn = NULL;
  232. pm->count = 0;
  233. pm++;
  234. }
  235. pm = *pme;
  236. free(pm);
  237. *pme = NULL;
  238. return;
  239. }
  240. // read a line of text from a text file stripping
  241. // off the line terminator and replacing it with
  242. // a null string terminator.
  243. // returns: -1 on error or the number of characters in
  244. // in the returning string
  245. // A maximum of nc characters will be returned
  246. int MyThes::readLine(FILE * pf, char * buf, int nc)
  247. {
  248. if (fgets(buf,nc,pf)) {
  249. mychomp(buf);
  250. return strlen(buf);
  251. }
  252. return -1;
  253. }
  254. // performs a binary search on null terminated character
  255. // strings
  256. //
  257. // returns: -1 on not found
  258. // index of wrd in the list[]
  259. int MyThes::binsearch(char * sw, char* _list[], int nlst)
  260. {
  261. int lp, up, mp, j, indx;
  262. lp = 0;
  263. up = nlst-1;
  264. indx = -1;
  265. if (strcmp(sw,_list[lp]) < 0) return -1;
  266. if (strcmp(sw,_list[up]) > 0) return -1;
  267. while (indx < 0 ) {
  268. mp = (int)((lp+up) >> 1);
  269. j = strcmp(sw,_list[mp]);
  270. if ( j > 0) {
  271. lp = mp + 1;
  272. } else if (j < 0 ) {
  273. up = mp - 1;
  274. } else {
  275. indx = mp;
  276. }
  277. if (lp > up) return -1;
  278. }
  279. return indx;
  280. }
  281. char * MyThes::get_th_encoding()
  282. {
  283. return encoding;
  284. }
  285. // string duplication routine
  286. char * MyThes::mystrdup(const char * s)
  287. {
  288. char * d = NULL;
  289. if (s) {
  290. int sl = strlen(s)+1;
  291. d = (char *) malloc(sl);
  292. if (d) memcpy(d,s,sl);
  293. }
  294. return d;
  295. }
  296. // remove cross-platform text line end characters
  297. void MyThes::mychomp(char * s)
  298. {
  299. int k = strlen(s);
  300. if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
  301. if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
  302. }
  303. // return index of char in string
  304. int MyThes::mystr_indexOfChar(const char * d, int c)
  305. {
  306. char * p = strchr((char *)d,c);
  307. if (p) return (int)(p-d);
  308. return -1;
  309. }