/vendor/gems/ferret-0.11.4/ext/q_term.c

https://github.com/ekcell/lovdbyless · C · 337 lines · 260 code · 51 blank · 26 comment · 30 complexity · 21ff35111e16c3fa07d7bf4bdf5b2d9d MD5 · raw file

  1. #include <string.h>
  2. #include "search.h"
  /* Convenience casts from the generic Query / Scorer pointers used by the
   * function tables to the term-specific structures handled in this file. */
  3. #define TQ(query) ((TermQuery *)(query))
  4. #define TSc(scorer) ((TermScorer *)(scorer))
  5. /***************************************************************************
  6. *
  7. * TermScorer
  8. *
  9. ***************************************************************************/
  /* SCORE_CACHE_SIZE: number of entries in TermScorer.score_cache; term
   * frequencies below this value are scored from the cache in tsc_score.
   * TDE_READ_SIZE: capacity of the docs/freqs buffers and the maximum count
   * requested from the TermDocEnum on each refill in tsc_next. */
  10. #define SCORE_CACHE_SIZE 32
  11. #define TDE_READ_SIZE 32
  /* Scorer state for a single term. */
  12. typedef struct TermScorer
  13. {
  14. Scorer super; /* generic scorer part; TSc() casts a Scorer * to this struct */
  15. int docs[TDE_READ_SIZE]; /* buffered document numbers filled by tde->read */
  16. int freqs[TDE_READ_SIZE]; /* term frequencies parallel to docs[] */
  17. int pointer; /* index of the current entry in docs[]/freqs[] */
  18. int pointer_max; /* count returned by the last tde->read (or 1 after a skip) */
  19. float score_cache[SCORE_CACHE_SIZE]; /* sim_tf(i) * weight_value for i < SCORE_CACHE_SIZE */
  20. Weight *weight; /* weight this scorer was created from; queried in tsc_explain */
  21. TermDocEnum *tde; /* postings enumerator; closed in tsc_destroy */
  22. uchar *norms; /* encoded field norms, indexed by document number */
  23. float weight_value; /* copy of weight->value taken in tsc_new */
  24. } TermScorer;
  25. static float tsc_score(Scorer *self)
  26. {
  27. TermScorer *ts = TSc(self);
  28. int freq = ts->freqs[ts->pointer];
  29. float score;
  30. /* compute tf(f)*weight */
  31. if (freq < SCORE_CACHE_SIZE) { /* check cache */
  32. score = ts->score_cache[freq]; /* cache hit */
  33. }
  34. else {
  35. /* cache miss */
  36. score = sim_tf(self->similarity, (float)freq) * ts->weight_value;
  37. }
  38. /* normalize for field */
  39. score *= sim_decode_norm(self->similarity, ts->norms[self->doc]);
  40. return score;
  41. }
  42. static bool tsc_next(Scorer *self)
  43. {
  44. TermScorer *ts = TSc(self);
  45. ts->pointer++;
  46. if (ts->pointer >= ts->pointer_max) {
  47. /* refill buffer */
  48. ts->pointer_max = ts->tde->read(ts->tde, ts->docs, ts->freqs,
  49. TDE_READ_SIZE);
  50. if (ts->pointer_max != 0) {
  51. ts->pointer = 0;
  52. }
  53. else {
  54. return false;
  55. }
  56. }
  57. self->doc = ts->docs[ts->pointer];
  58. return true;
  59. }
  60. static bool tsc_skip_to(Scorer *self, int doc_num)
  61. {
  62. TermScorer *ts = TSc(self);
  63. TermDocEnum *tde = ts->tde;
  64. /* first scan in cache */
  65. while (++(ts->pointer) < ts->pointer_max) {
  66. if (ts->docs[ts->pointer] >= doc_num) {
  67. self->doc = ts->docs[ts->pointer];
  68. return true;
  69. }
  70. }
  71. /* not found in cache, seek underlying stream */
  72. if (tde->skip_to(tde, doc_num)) {
  73. ts->pointer_max = 1;
  74. ts->pointer = 0;
  75. ts->docs[0] = self->doc = tde->doc_num(tde);
  76. ts->freqs[0] = tde->freq(tde);
  77. return true;
  78. }
  79. else {
  80. return false;
  81. }
  82. }
  83. static Explanation *tsc_explain(Scorer *self, int doc_num)
  84. {
  85. TermScorer *ts = TSc(self);
  86. Query *query = ts->weight->get_query(ts->weight);
  87. int tf = 0;
  88. tsc_skip_to(self, doc_num);
  89. if (self->doc == doc_num) {
  90. tf = ts->freqs[ts->pointer];
  91. }
  92. return expl_new(sim_tf(self->similarity, (float)tf),
  93. "tf(term_freq(%s:%s)=%d)",
  94. TQ(query)->field, TQ(query)->term, tf);
  95. }
  96. static void tsc_destroy(Scorer *self)
  97. {
  98. TSc(self)->tde->close(TSc(self)->tde);
  99. scorer_destroy_i(self);
  100. }
  101. static Scorer *tsc_new(Weight *weight, TermDocEnum *tde, uchar *norms)
  102. {
  103. int i;
  104. Scorer *self = scorer_new(TermScorer, weight->similarity);
  105. TSc(self)->weight = weight;
  106. TSc(self)->tde = tde;
  107. TSc(self)->norms = norms;
  108. TSc(self)->weight_value = weight->value;
  109. for (i = 0; i < SCORE_CACHE_SIZE; i++) {
  110. TSc(self)->score_cache[i]
  111. = sim_tf(self->similarity, (float)i) * TSc(self)->weight_value;
  112. }
  113. self->score = &tsc_score;
  114. self->next = &tsc_next;
  115. self->skip_to = &tsc_skip_to;
  116. self->explain = &tsc_explain;
  117. self->destroy = &tsc_destroy;
  118. return self;
  119. }
  120. /***************************************************************************
  121. *
  122. * TermWeight
  123. *
  124. ***************************************************************************/
  125. static Scorer *tw_scorer(Weight *self, IndexReader *ir)
  126. {
  127. TermQuery *tq = TQ(self->query);
  128. TermDocEnum *tde = ir_term_docs_for(ir, tq->field, tq->term);
  129. if (!tde) {
  130. return NULL;
  131. }
  132. return tsc_new(self, tde, ir_get_norms(ir, tq->field));
  133. }
  134. static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
  135. {
  136. Explanation *qnorm_expl;
  137. Explanation *field_expl;
  138. Scorer *scorer;
  139. Explanation *tf_expl;
  140. uchar *field_norms;
  141. float field_norm;
  142. Explanation *field_norm_expl;
  143. char *query_str = self->query->to_s(self->query, "");
  144. TermQuery *tq = TQ(self->query);
  145. char *term = tq->term;
  146. char *field = tq->field;
  147. Explanation *expl = expl_new(0.0, "weight(%s in %d), product of:",
  148. query_str, doc_num);
  149. /* We need two of these as it's included in both the query explanation
  150. * and the field explanation */
  151. Explanation *idf_expl1 = expl_new(self->idf, "idf(doc_freq=%d)",
  152. ir_doc_freq(ir, field, term));
  153. Explanation *idf_expl2 = expl_new(self->idf, "idf(doc_freq=%d)",
  154. ir_doc_freq(ir, field, term));
  155. /* explain query weight */
  156. Explanation *query_expl = expl_new(0.0, "query_weight(%s), product of:",
  157. query_str);
  158. free(query_str);
  159. if (self->query->boost != 1.0) {
  160. expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
  161. }
  162. expl_add_detail(query_expl, idf_expl1);
  163. qnorm_expl = expl_new(self->qnorm, "query_norm");
  164. expl_add_detail(query_expl, qnorm_expl);
  165. query_expl->value = self->query->boost
  166. * idf_expl1->value * qnorm_expl->value;
  167. expl_add_detail(expl, query_expl);
  168. /* explain field weight */
  169. field_expl = expl_new(0.0, "field_weight(%s:%s in %d), product of:",
  170. field, term, doc_num);
  171. scorer = self->scorer(self, ir);
  172. tf_expl = scorer->explain(scorer, doc_num);
  173. scorer->destroy(scorer);
  174. expl_add_detail(field_expl, tf_expl);
  175. expl_add_detail(field_expl, idf_expl2);
  176. field_norms = ir_get_norms(ir, field);
  177. field_norm = (field_norms
  178. ? sim_decode_norm(self->similarity, field_norms[doc_num])
  179. : (float)0.0);
  180. field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
  181. field, doc_num);
  182. expl_add_detail(field_expl, field_norm_expl);
  183. field_expl->value = tf_expl->value * idf_expl2->value
  184. * field_norm_expl->value;
  185. /* combine them */
  186. if (query_expl->value == 1.0) {
  187. expl_destroy(expl);
  188. return field_expl;
  189. } else {
  190. expl->value = (query_expl->value * field_expl->value);
  191. expl_add_detail(expl, field_expl);
  192. return expl;
  193. }
  194. }
  195. static char *tw_to_s(Weight *self)
  196. {
  197. return strfmt("TermWeight(%f)", self->value);
  198. }
  199. static Weight *tw_new(Query *query, Searcher *searcher)
  200. {
  201. Weight *self = w_new(Weight, query);
  202. self->scorer = &tw_scorer;
  203. self->explain = &tw_explain;
  204. self->to_s = &tw_to_s;
  205. self->similarity = query->get_similarity(query, searcher);
  206. self->idf = sim_idf(self->similarity,
  207. searcher->doc_freq(searcher,
  208. TQ(query)->field,
  209. TQ(query)->term),
  210. searcher->max_doc(searcher)); /* compute idf */
  211. return self;
  212. }
  213. /***************************************************************************
  214. *
  215. * TermQuery
  216. *
  217. ***************************************************************************/
  218. static void tq_destroy(Query *self)
  219. {
  220. free(TQ(self)->term);
  221. free(TQ(self)->field);
  222. q_destroy_i(self);
  223. }
  224. static char *tq_to_s(Query *self, const char *field)
  225. {
  226. size_t flen = strlen(TQ(self)->field);
  227. size_t tlen = strlen(TQ(self)->term);
  228. char *buffer = ALLOC_N(char, 34 + flen + tlen);
  229. char *b = buffer;
  230. if (strcmp(field, TQ(self)->field) != 0) {
  231. memcpy(b, TQ(self)->field, sizeof(char) * flen);
  232. b[flen] = ':';
  233. b += flen + 1;
  234. }
  235. memcpy(b, TQ(self)->term, tlen);
  236. b += tlen;
  237. *b = 0;
  238. if (self->boost != 1.0) {
  239. *b = '^';
  240. dbl_to_s(b+1, self->boost);
  241. }
  242. return buffer;
  243. }
  244. static void tq_extract_terms(Query *self, HashSet *terms)
  245. {
  246. hs_add(terms, term_new(TQ(self)->field, TQ(self)->term));
  247. }
  248. static unsigned long tq_hash(Query *self)
  249. {
  250. return str_hash(TQ(self)->term) ^ str_hash(TQ(self)->field);
  251. }
  252. static int tq_eq(Query *self, Query *o)
  253. {
  254. return (strcmp(TQ(self)->term, TQ(o)->term) == 0)
  255. && (strcmp(TQ(self)->field, TQ(o)->field) == 0);
  256. }
  257. static MatchVector *tq_get_matchv_i(Query *self, MatchVector *mv,
  258. TermVector *tv)
  259. {
  260. if (strcmp(tv->field, TQ(self)->field) == 0) {
  261. int i;
  262. TVTerm *tv_term = tv_get_tv_term(tv, TQ(self)->term);
  263. if (tv_term) {
  264. for (i = 0; i < tv_term->freq; i++) {
  265. int pos = tv_term->positions[i];
  266. matchv_add(mv, pos, pos);
  267. }
  268. }
  269. }
  270. return mv;
  271. }
  272. Query *tq_new(const char *field, const char *term)
  273. {
  274. Query *self = q_new(TermQuery);
  275. TQ(self)->field = estrdup(field);
  276. TQ(self)->term = estrdup(term);
  277. self->type = TERM_QUERY;
  278. self->extract_terms = &tq_extract_terms;
  279. self->to_s = &tq_to_s;
  280. self->hash = &tq_hash;
  281. self->eq = &tq_eq;
  282. self->destroy_i = &tq_destroy;
  283. self->create_weight_i = &tw_new;
  284. self->get_matchv_i = &tq_get_matchv_i;
  285. return self;
  286. }