PageRenderTime 63ms CodeModel.GetById 39ms RepoModel.GetById 0ms app.codeStats 0ms

/src/whoosh/scoring.py

https://github.com/pombredanne/whoosh
Python | 360 lines | 268 code | 23 blank | 69 comment | 3 complexity | f26af94a531477da5d0fb7e4e80350eb MD5 | raw file
  1. #===============================================================================
  2. # Copyright 2008 Matt Chaput
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #===============================================================================
  16. """This module contains classes for scoring (and sorting) search results.
  17. """
  18. from __future__ import division
  19. from array import array
  20. from math import log, pi
  21. import weakref
  22. class Weighting(object):
  23. """Abstract base class for weighting objects. A weighting
  24. object implements a scoring algorithm.
  25. Concrete subclasses must implement the score() method, which
  26. returns a score given a term and a document in which that term
  27. appears.
  28. """
  29. #self.doc_count = searcher.doc_count_all()
  30. #self.max_doc_freq = ix.max_doc_freq()
  31. #self.unique_term_count = ix.unique_term_count()
  32. #self.avg_doc_length = self.index_length / self.doc_count
  33. def __init__(self):
  34. self._idf_cache = {}
  35. def idf(self, searcher, fieldnum, text):
  36. """Calculates the Inverse Document Frequency of the
  37. current term. Subclasses may want to override this.
  38. """
  39. cache = self._idf_cache
  40. term = (fieldnum, text)
  41. if term in cache: return cache[term]
  42. df = searcher.doc_frequency(fieldnum, text)
  43. idf = log(searcher.doc_count_all() / (df + 1)) + 1.0
  44. cache[term] = idf
  45. return idf
  46. def avg_field_length(self, searcher, fieldnum):
  47. """Returns the average length of the field per document.
  48. (i.e. total field length / total number of documents)
  49. """
  50. return searcher.field_length(fieldnum) / searcher.doc_count_all()
  51. def fl_over_avfl(self, searcher, docnum, fieldnum):
  52. """Returns the length of the current field in the current
  53. document divided by the average length of the field
  54. across all documents. This is used by some scoring algorithms.
  55. """
  56. return searcher.doc_field_length(docnum, fieldnum) / self.avg_field_length(searcher, fieldnum)
  57. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  58. """Returns the score for a given term in the given document.
  59. :searcher: the searcher doing the scoring.
  60. :fieldnum: the field number of the term being scored.
  61. :text: the text of the term being scored.
  62. :docnum: the doc number of the document being scored.
  63. :weight: the frequency * boost of the term in this document.
  64. :QTF: the frequency of the term in the query.
  65. """
  66. raise NotImplementedError
  67. # Scoring classes
  68. class BM25F(Weighting):
  69. """Generates a BM25F score.
  70. """
  71. def __init__(self, B = 0.75, K1 = 1.2, field_B = None):
  72. """B and K1 are free parameters, see the BM25 literature.
  73. field_B can be a dictionary mapping fieldnums to field-specific B values.
  74. field_boost can be a dictionary mapping fieldnums to field boost factors.
  75. """
  76. super(self.__class__, self).__init__()
  77. self.K1 = K1
  78. self.B = B
  79. if field_B is None: field_B = {}
  80. self._field_B = field_B
  81. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  82. if not searcher.scorable(fieldnum): return weight
  83. B = self._field_B.get(fieldnum, self.B)
  84. avl = self.avg_field_length(searcher, fieldnum)
  85. idf = self.idf(searcher, fieldnum, text)
  86. l = searcher.doc_field_length(docnum, fieldnum)
  87. w = weight / ((1 - B) + B * (l / avl))
  88. return idf * (w / (self.K1 + w))
  89. # The following scoring algorithms are translated from classes in
  90. # the Terrier search engine's uk.ac.gla.terrier.matching.models package.
  91. class Cosine(Weighting):
  92. """A cosine vector-space scoring algorithm similar to Lucene's.
  93. """
  94. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  95. idf = self.idf(searcher, fieldnum, text)
  96. DTW = (1.0 + log(weight)) * idf
  97. QMF = 1.0 # TODO: Fix this
  98. QTW = ((0.5 + (0.5 * QTF / QMF))) * idf
  99. return DTW * QTW
  100. class DFree(Weighting):
  101. """The DFree probabilistic weighting algorithm, translated into Python
  102. from Terrier's Java implementation.
  103. """
  104. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  105. if not searcher.scorable(fieldnum): return weight
  106. fieldlen = searcher.doc_field_length(docnum, fieldnum)
  107. prior = weight / fieldlen
  108. post = (weight + 1.0) / fieldlen
  109. invprior = searcher.field_length(fieldnum) / searcher.frequency(fieldnum, text)
  110. norm = weight * log(post / prior, 2)
  111. return QTF\
  112. * norm\
  113. * (weight * (- log(prior * invprior, 2))
  114. + (weight + 1.0) * (+ log(post * invprior, 2)) + 0.5 * log(post/prior, 2))
  115. class DLH13(Weighting):
  116. """The DLH13 probabilistic weighting algorithm, translated into Python
  117. from Terrier's Java implementation.
  118. """
  119. def __init__(self, k = 0.5):
  120. super(self.__class__, self).__init__()
  121. self.k = k
  122. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  123. if not searcher.scorable(fieldnum): return weight
  124. k = self.k
  125. dl = searcher.doc_field_length(docnum, fieldnum)
  126. f = weight / dl
  127. tc = searcher.frequency(fieldnum, text)
  128. dc = searcher.doc_count_all()
  129. avl = self.avg_field_length(searcher, fieldnum)
  130. return QTF * (weight * log((weight * avl / dl) * (dc / tc), 2) + 0.5 * log(2.0 * pi * weight * (1.0 - f))) / (weight + k)
  131. class Hiemstra_LM(Weighting):
  132. """The Hiemstra LM probabilistic weighting algorithm, translated into Python
  133. from Terrier's Java implementation.
  134. """
  135. def __init__(self, c = 0.15):
  136. super(self.__class__, self).__init__()
  137. self.c = c
  138. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  139. if not searcher.scorable(fieldnum): return weight
  140. c = self.c
  141. tc = searcher.frequency(fieldnum, text)
  142. dl = searcher.doc_field_length(docnum, fieldnum)
  143. return log(1 + (c * weight * searcher.field_length(fieldnum)) / ((1 - c) * tc * dl))
  144. class InL2(Weighting):
  145. """The InL2 LM probabilistic weighting algorithm, translated into Python
  146. from Terrier's Java implementation.
  147. """
  148. def __init__(self, c = 1.0):
  149. super(self.__class__, self).__init__()
  150. self.c = c
  151. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  152. if not searcher.scorable(fieldnum): return weight
  153. dl = searcher.doc_field_length(docnum, fieldnum)
  154. TF = weight * log(1.0 + (self.c * self.avg_field_length(searcher, fieldnum)) / dl)
  155. norm = 1.0 / (TF + 1.0)
  156. df = searcher.doc_frequency(fieldnum, text)
  157. idf_dfr = log((searcher.doc_count_all() + 1) / (df + 0.5), 2)
  158. return TF * idf_dfr * QTF * norm
  159. class TF_IDF(Weighting):
  160. """Instead of doing any real scoring, this simply returns tf * idf.
  161. """
  162. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  163. return weight * self.idf(searcher, fieldnum, text)
  164. class Frequency(Weighting):
  165. """Instead of doing any real scoring, simply returns the
  166. term frequency. This may be useful when you don't care about
  167. normalization and weighting.
  168. """
  169. def score(self, searcher, fieldnum, text, docnum, weight, QTF = 1):
  170. return searcher.frequency(fieldnum, text)
  171. # Sorting classes
  172. class Sorter(object):
  173. """Abstract base class for sorter objects. See the 'sortedby'
  174. keyword argument to searching.Searcher.search().
  175. Concrete subclasses must implement the order() method, which
  176. takes a sequence of doc numbers and returns it sorted.
  177. """
  178. def order(self, searcher, docnums, reverse = False):
  179. """Returns a sorted list of document numbers.
  180. """
  181. raise NotImplementedError
  182. class NullSorter(Sorter):
  183. """Sorter that does nothing."""
  184. def order(self, searcher, docnums, reverse = False):
  185. """Returns docnums as-is. The 'reverse' keyword is ignored."""
  186. return docnums
  187. class FieldSorter(Sorter):
  188. """Used by searching.Searcher to sort document results based on the
  189. value of an indexed field, rather than score. See the 'sortedby'
  190. keyword argument to searching.Searcher.search().
  191. This object creates a cache of document orders for the given field.
  192. Creating the cache may make the first sorted search of a field
  193. seem slow, but subsequent sorted searches of the same field will
  194. be much faster.
  195. """
  196. def __init__(self, fieldname, missingfirst = False):
  197. """
  198. :fieldname: The name of the field to sort by.
  199. :missingfirst: Place documents which don't have the given
  200. field first in the sorted results. The default is to put those
  201. documents last (after all documents that have the given field).
  202. """
  203. self.fieldname = fieldname
  204. self.missingfirst = missingfirst
  205. self._searcher = None
  206. self._cache = None
  207. def _make_cache(self, searcher):
  208. # Is this searcher already cached?
  209. if self._cache and self._searcher and self._searcher() is searcher:
  210. return
  211. fieldnum = searcher.fieldname_to_num(self.fieldname)
  212. # Create an array of an int for every document in the index.
  213. N = searcher.doc_count_all()
  214. if self.missingfirst:
  215. default = -1
  216. else:
  217. default = N + 1
  218. cache = array("i", [default] * N)
  219. # For every document containing every term in the field, set
  220. # its array value to the term's (inherently sorted) position.
  221. i = -1
  222. for i, word in enumerate(searcher.lexicon(fieldnum)):
  223. for docnum, _ in searcher.postings(fieldnum, word):
  224. cache[docnum] = i
  225. self.limit = i
  226. self._cache = cache
  227. self._searcher = weakref.ref(searcher, self._delete_cache)
  228. def _delete_cache(self, obj):
  229. # Callback function, called by the weakref implementation when
  230. # the searcher we're using to do the ordering goes away.
  231. self._cache = self._searcher = None
  232. def order(self, searcher, docnums, reverse = False):
  233. """Takes a sequence of docnums (as produced by query.docs()) and
  234. returns a list of docnums sorted by the field values.
  235. """
  236. self._make_cache(searcher)
  237. return sorted(docnums,
  238. key = self._cache.__getitem__,
  239. reverse = reverse)
  240. class MultiFieldSorter(FieldSorter):
  241. """Used by searching.Searcher to sort document results based on the
  242. value of an indexed field, rather than score. See the 'sortedby'
  243. keyword argument to searching.Searcher.search().
  244. This sorter uses multiple fields, so if for two documents the first
  245. field has the same value, it will use the second field to sort them,
  246. and so on.
  247. """
  248. def __init__(self, fieldnames, missingfirst = False):
  249. """
  250. :fieldnames: A list of field names to sort by.
  251. :missingfirst: Place documents which don't have the given
  252. field first in the sorted results. The default is to put those
  253. documents last (after all documents that have the given field).
  254. """
  255. self.fieldnames = fieldnames
  256. self.sorters = [FieldSorter(fn)
  257. for fn in fieldnames]
  258. self.missingfirst = missingfirst
  259. def order(self, searcher, docnums, reverse = False):
  260. sorters = self.sorters
  261. missingfirst = self.missingfirst
  262. for s in sorters:
  263. s._make_cache(searcher, missingfirst)
  264. return sorted(docnums,
  265. key = lambda x: tuple((s._cache[x] for s in sorters)),
  266. reverse = reverse)