/LZ/index.c

https://github.com/peper/pizza · C · 245 lines · 212 code · 19 blank · 14 comment · 8 complexity · 563637b6ab030d603306a8322b1d62b7 MD5 · raw file

  1. // Indexing module
  2. #include "trie.h"
  3. #include "lztrie.h"
  4. #include "nodemap.h"
  5. #include "revtrie.h"
  6. #include "lzindex.h"
  7. #include <math.h>
  8. // creates lztrie over a text that ends with the terminator symbol s
  9. // it also allocates *ids, which the caller receives
  // Shared timing state for the optional build report; it only exists when
  // the file is compiled with INDEXREPORT defined.
  #ifdef INDEXREPORT
  struct tms time;   // buffer filled by times() before each measurement
  clock_t t1;        // user-time reading taken when the current phase began
  clock_t t2;        // user-time reading taken when the current phase ended
  uint ticks;        // clock ticks per second, read from sysconf(_SC_CLK_TCK)
  #endif
  15. lztrie buildLZTrie(byte *text, uint **ids, byte s)
  16. {
  17. trie T;
  18. uint n;
  19. uint *parent;
  20. byte *letters;
  21. lztrie LZT;
  22. // first creates a full trie T
  23. #ifdef INDEXREPORT
  24. ticks= sysconf(_SC_CLK_TCK);
  25. times(&time); t1 = time.tms_utime;
  26. printf (" Building LZTrie...\n"); fflush(stdout);
  27. printf (" Building normal trie...\n"); fflush(stdout);
  28. #endif
  29. T = createTrie();
  30. do {
  31. text = insertTrie(T,text);
  32. }
  33. while (text[-1]!=s);
  34. // now compresses it
  35. #ifdef INDEXREPORT
  36. times(&time); t2 = time.tms_utime;
  37. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  38. t1 = t2;
  39. printf (" Representing with parentheses, letters and ids...\n"); fflush(stdout);
  40. #endif
  41. n = T->nid;
  42. parent = malloc (((2*n+W-1)/W)*sizeof(uint));
  43. letters = malloc (n*sizeof(byte));
  44. *ids = malloc (n*sizeof(uint));
  45. representTrie (T,parent,letters,*ids);
  46. #ifdef INDEXREPORT
  47. times(&time); t2 = time.tms_utime;
  48. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  49. t1 = t2;
  50. printf (" Freing trie...\n"); fflush(stdout);
  51. #endif
  52. destroyTrie(T);
  53. #ifdef INDEXREPORT
  54. times(&time); t2 = time.tms_utime;
  55. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  56. t1 = t2;
  57. printf (" Creating compressed trie...\n"); fflush(stdout);
  58. #endif
  59. LZT = createLZTrie (parent,letters,*ids,n);
  60. #ifdef INDEXREPORT
  61. times(&time); t2 = time.tms_utime;
  62. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  63. t1 = t2;
  64. printf (" End of LZTrie\n"); fflush(stdout);
  65. #endif
  66. return LZT;
  67. }
  68. // builds Map from LZTrie and ids, which gets freed
  69. // it also writes the maximum depth of the trie
  70. nodemap buildMap (lztrie T, uint *ids, uint *maxdepth)
  71. { nodemap M;
  72. uint *map;
  73. trieNode i;
  74. uint n,j,depth,mdepth;
  75. #ifdef INDEXREPORT
  76. times(&time); t1 = time.tms_utime;
  77. printf (" Building Map...\n"); fflush(stdout);
  78. printf (" Computing indexes...\n"); fflush(stdout);
  79. #endif
  80. n = T->n;
  81. map = malloc (n*sizeof(uint));
  82. map[0] = ROOT; depth = mdepth = 0;
  83. i = ROOT;
  84. for (j=1;j<n;j++)
  85. { i = nextLZTrie (T,i,&depth);
  86. if (depth > mdepth) mdepth = depth;
  87. map[ids[j]] = i;
  88. }
  89. free (ids);
  90. #ifdef INDEXREPORT
  91. times(&time); t2 = time.tms_utime;
  92. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  93. t1 = t2;
  94. printf (" Creating nodemap...\n"); fflush(stdout);
  95. #endif
  96. M = createNodemap (map,n,n);
  97. free (map);
  98. *maxdepth = mdepth;
  99. #ifdef INDEXREPORT
  100. times(&time); t2 = time.tms_utime;
  101. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  102. t1 = t2;
  103. printf (" End of Map\n"); fflush(stdout);
  104. #endif
  105. return M;
  106. }
  107. // builds reverse trie from LZTrie, Map, and maximum LZTrie depth
  108. // returns reverse ids
  109. revtrie buildRevTrie (lztrie T, nodemap M, uint maxdepth, uint **ids)
  110. { byte *str;
  111. uint n,depth,j;
  112. trieNode i;
  113. trie RT;
  114. uint *parent, *emptybmap;
  115. revtrie CRT;
  116. // first create a full trie RT
  117. #ifdef INDEXREPORT
  118. times(&time); t1 = time.tms_utime;
  119. printf (" Building RevTrie...\n"); fflush(stdout);
  120. printf (" Creating full trie...\n"); fflush(stdout);
  121. #endif
  122. str = malloc (maxdepth*sizeof(byte));
  123. RT = createTrie();
  124. i = ROOT; depth = 0;
  125. for (j=1;j<T->n;j++)
  126. { i = nextLZTrie (T,i,&depth);
  127. str[maxdepth-depth] = letterLZTrie (T,i);
  128. insertstringTrie (RT,str+maxdepth-depth,depth,idLZTrie(T,i));
  129. }
  130. free (str);
  131. // now compresses it
  132. #ifdef INDEXREPORT
  133. times(&time); t2 = time.tms_utime;
  134. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  135. t1 = t2;
  136. printf (" Representing with parentheses and ids...\n"); fflush(stdout);
  137. #endif
  138. n = RT->nid;
  139. parent = malloc (((2*n+W-1)/W)*sizeof(uint));
  140. *ids = malloc (n*sizeof(uint));
  141. representTrie (RT,parent,NULL,*ids);
  142. #ifdef INDEXREPORT
  143. times(&time); t2 = time.tms_utime;
  144. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  145. t1 = t2;
  146. printf (" Freeing trie...\n"); fflush(stdout);
  147. #endif
  148. destroyTrie(RT);
  149. #ifdef INDEXREPORT
  150. times(&time); t2 = time.tms_utime;
  151. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  152. t1 = t2;
  153. printf (" Creating compressed trie...\n"); fflush(stdout);
  154. #endif
  155. CRT = createRevTrie(parent,T,M,*ids,n);
  156. #ifdef INDEXREPORT
  157. times(&time); t2 = time.tms_utime;
  158. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  159. t1 = t2;
  160. printf (" End of RevTrie...\n"); fflush(stdout);
  161. #endif
  162. return CRT;
  163. }
  164. // builds Map from RevTrie and ids, which gets freed
  165. nodemap buildRMap (revtrie T, uint *ids)
  166. { nodemap M;
  167. uint *map;
  168. trieNode i;
  169. uint j, n;
  170. #ifdef INDEXREPORT
  171. times(&time); t1 = time.tms_utime;
  172. printf (" Building RMap...\n"); fflush(stdout);
  173. printf (" Computing indexes...\n"); fflush(stdout);
  174. #endif
  175. n = T->n;
  176. map = malloc (n*sizeof(uint));
  177. map[0] = ROOT;
  178. i = ROOT;
  179. for (j=1;j<n;j++)
  180. { i = nextRevTrie (T,i);
  181. map[ids[j]] = i; // when equality, the innermost gets the mapping
  182. }
  183. free (ids);
  184. #ifdef INDEXREPORT
  185. times(&time); t2 = time.tms_utime;
  186. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  187. t1 = t2;
  188. printf (" Creating nodemap...\n"); fflush(stdout);
  189. #endif
  190. M = createNodemap (map,n,n);
  191. free (map);
  192. #ifdef INDEXREPORT
  193. times(&time); t2 = time.tms_utime;
  194. printf (" User time: %f secs\n",(t2-t1)/(float)ticks); fflush(stdout);
  195. t1 = t2;
  196. printf (" End of Map\n"); fflush(stdout);
  197. #endif
  198. return M;
  199. }
  200. byte selectSymbol(byte *text, ulong length)
  201. {
  202. ulong i;
  203. byte s;
  204. bool *A = calloc(256, sizeof(bool));;
  205. for (i=0;i<length;i++) A[text[i]]= true;
  206. for (s=0;s<256;s++)
  207. if (!A[s]) break;
  208. return s;
  209. }
  210. // creates lzindex over a null-terminated text
  211. int build_index(byte *text, ulong length, char *build_options, void **index)
  212. {
  213. lzindex *I;
  214. uint *ids,maxdepth;
  215. I = malloc(sizeof(lzindex));
  216. text[length] = selectSymbol(text, length);
  217. // build index
  218. I->fwdtrie = buildLZTrie(text,&ids,text[length]);
  219. I->map = buildMap(I->fwdtrie,ids,&maxdepth);
  220. I->bwdtrie = buildRevTrie(I->fwdtrie,I->map,maxdepth,&ids);
  221. I->rmap = buildRMap(I->bwdtrie,ids);
  222. I->TPos = createPosition(I->fwdtrie, length, I->map);
  223. I->u = length;
  224. *index = I; // return index
  225. return 0; // no errors yet
  226. }