PageRenderTime 59ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 0ms

/src/whoosh/query/terms.py

https://bitbucket.org/mkr/whoosh
Python | 494 lines | 458 code | 8 blank | 28 comment | 1 complexity | 5d5cfbedea097c5c48777488571b4a50 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from __future__ import division
  28. import copy, fnmatch, re
  29. from collections import defaultdict
  30. from whoosh import matching
  31. from whoosh.analysis import Token
  32. from whoosh.compat import bytes_type, text_type, u
  33. from whoosh.lang.morph_en import variations
  34. from whoosh.query import qcore
  35. class Term(qcore.Query):
  36. """Matches documents containing the given term (fieldname+text pair).
  37. >>> Term("content", u"render")
  38. """
  39. __inittypes__ = dict(fieldname=str, text=text_type, boost=float)
  40. def __init__(self, fieldname, text, boost=1.0):
  41. self.fieldname = fieldname
  42. self.text = text
  43. self.boost = boost
  44. def __eq__(self, other):
  45. return (other
  46. and self.__class__ is other.__class__
  47. and self.fieldname == other.fieldname
  48. and self.text == other.text
  49. and self.boost == other.boost)
  50. def __repr__(self):
  51. r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
  52. if self.boost != 1.0:
  53. r += ", boost=%s" % self.boost
  54. r += ")"
  55. return r
  56. def __unicode__(self):
  57. t = u("%s:%s") % (self.fieldname, self.text)
  58. if self.boost != 1:
  59. t += u("^") + text_type(self.boost)
  60. return t
  61. __str__ = __unicode__
  62. def __hash__(self):
  63. return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
  64. def has_terms(self):
  65. return True
  66. def tokens(self, boost=1.0):
  67. yield Token(fieldname=self.fieldname, text=self.text,
  68. boost=boost * self.boost, startchar=self.startchar,
  69. endchar=self.endchar, chars=True)
  70. def terms(self, phrases=False):
  71. if self.field():
  72. yield (self.field(), self.text)
  73. def replace(self, fieldname, oldtext, newtext):
  74. q = copy.copy(self)
  75. if q.fieldname == fieldname and q.text == oldtext:
  76. q.text = newtext
  77. return q
  78. def estimate_size(self, ixreader):
  79. fieldname = self.fieldname
  80. if fieldname not in ixreader.schema:
  81. return 0
  82. field = ixreader.schema[fieldname]
  83. text = field.to_bytes(self.text)
  84. return ixreader.doc_frequency(fieldname, text)
  85. def matcher(self, searcher, context=None):
  86. fieldname = self.fieldname
  87. text = self.text
  88. if fieldname not in searcher.schema:
  89. return matching.NullMatcher()
  90. field = searcher.schema[fieldname]
  91. text = field.to_bytes(text)
  92. if (self.fieldname, text) in searcher.reader():
  93. if context is None:
  94. w = searcher.weighting
  95. else:
  96. w = context.weighting
  97. m = searcher.postings(self.fieldname, text, weighting=w)
  98. if self.boost != 1.0:
  99. m = matching.WrappingMatcher(m, boost=self.boost)
  100. return m
  101. else:
  102. return matching.NullMatcher()
class MultiTerm(qcore.Query):
    """Abstract base class for queries that operate on multiple terms in the
    same field.

    Subclasses implement ``_btexts()`` to enumerate the encoded terms the
    query expands to; this base class builds matchers and size estimates
    from that expansion.
    """

    # Above this many expanded terms, matcher() abandons the Or-query
    # strategy and materializes all matching documents into one ListMatcher.
    TOO_MANY_CLAUSES = 1024
    # When True, all matched documents get the same score (self.boost)
    # instead of summed per-term weights.
    constantscore = False

    def _btexts(self, ixreader):
        # Subclasses must yield the encoded (bytes) terms this query expands
        # to in the given reader.
        raise NotImplementedError(self.__class__.__name__)

    def expanded_terms(self, ixreader):
        # Yield (fieldname, encoded-term) pairs for every expanded term.
        fieldname = self.field()
        if fieldname:
            for btext in self._btexts(ixreader):
                yield (fieldname, btext)

    def tokens(self, boost=1.0):
        # NOTE(review): self.text/self.boost/self.startchar/self.endchar are
        # not defined on this base class; presumably subclasses or the query
        # parser provide them -- confirm before relying on this method.
        yield Token(fieldname=self.fieldname, text=self.text,
                    boost=boost * self.boost, startchar=self.startchar,
                    endchar=self.endchar, chars=True)

    def simplify(self, ixreader):
        # Rewrite this query as a Term/Or over only the expanded terms that
        # actually exist in the index.
        if self.fieldname not in ixreader.schema:
            return qcore.NullQuery()
        field = ixreader.schema[self.fieldname]

        existing = []
        # sorted(set(...)) deduplicates and gives a deterministic order
        for btext in sorted(set(self._btexts(ixreader))):
            text = field.from_bytes(btext)
            existing.append(Term(self.fieldname, text, boost=self.boost))

        if len(existing) == 1:
            return existing[0]
        elif existing:
            from whoosh.query import Or
            return Or(existing)
        else:
            # NOTE(review): returned without calling, unlike the branch above
            # -- presumably NullQuery is a callable singleton so both forms
            # are equivalent; confirm against whoosh.query.qcore.
            return qcore.NullQuery

    def estimate_size(self, ixreader):
        # Upper bound: sum of document frequencies of all expanded terms
        # (documents containing several terms are counted multiple times).
        return sum(ixreader.doc_frequency(self.fieldname, text)
                   for text in self._btexts(ixreader))

    def estimate_min_size(self, ixreader):
        # Lower bound: the smallest document frequency among expanded terms.
        # NOTE(review): raises ValueError if _btexts() yields nothing.
        return min(ixreader.doc_frequency(self.fieldname, text)
                   for text in self._btexts(ixreader))

    def matcher(self, searcher, context=None):
        """Expand this query against the index and return a matcher over the
        union of the expanded terms.
        """
        fieldname = self.fieldname
        constantscore = self.constantscore

        reader = searcher.reader()
        qs = [Term(fieldname, word) for word in self._btexts(reader)]
        if not qs:
            return matching.NullMatcher()

        if len(qs) == 1:
            # If there's only one term, just use it
            q = qs[0]

        elif constantscore or len(qs) > self.TOO_MANY_CLAUSES:
            # If there's so many clauses that an Or search would take forever,
            # trade memory for time and just find all the matching docs and
            # serve them as one ListMatcher
            fmt = searcher.schema[fieldname].format
            doc_to_values = defaultdict(list)
            doc_to_weights = defaultdict(float)
            # Drain each term's matcher, accumulating per-document values and
            # (unless constant-scoring) summed weights.
            for q in qs:
                m = q.matcher(searcher)
                while m.is_active():
                    docnum = m.id()
                    doc_to_values[docnum].append(m.value())
                    if not constantscore:
                        doc_to_weights[docnum] += m.weight()
                    m.next()

            docnums = sorted(doc_to_values.keys())
            # This is a list of lists of value strings -- ListMatcher will
            # actually do the work of combining multiple values if the user
            # asks for them
            values = [doc_to_values[docnum] for docnum in docnums]

            kwargs = {"values": values, "format": fmt}
            if constantscore:
                kwargs["all_weights"] = self.boost
            else:
                kwargs["weights"] = [doc_to_weights[docnum]
                                     for docnum in docnums]

            #return matching.ListMatcher(docnums, term=term, **kwargs)
            return matching.ListMatcher(docnums, **kwargs)

        else:
            # The default case: Or the terms together
            from whoosh.query import Or
            q = Or(qs)

        m = q.matcher(searcher, context)
        #m = matching.SingleTermMatcher(m, term)
        return m
  186. class PatternQuery(MultiTerm):
  187. """An intermediate base class for common methods of Prefix and Wildcard.
  188. """
  189. __inittypes__ = dict(fieldname=str, text=text_type, boost=float)
  190. def __init__(self, fieldname, text, boost=1.0, constantscore=True):
  191. self.fieldname = fieldname
  192. self.text = text
  193. self.boost = boost
  194. self.constantscore = constantscore
  195. def __eq__(self, other):
  196. return (other and self.__class__ is other.__class__
  197. and self.fieldname == other.fieldname
  198. and self.text == other.text and self.boost == other.boost
  199. and self.constantscore == other.constantscore)
  200. def __repr__(self):
  201. r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
  202. if self.boost != 1:
  203. r += ", boost=%s" % self.boost
  204. r += ")"
  205. return r
  206. def __hash__(self):
  207. return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
  208. ^ hash(self.constantscore))
  209. def _get_pattern(self):
  210. raise NotImplementedError
  211. def _find_prefix(self, text):
  212. # Subclasses/instances should set the SPECIAL_CHARS attribute to a set
  213. # of characters that mark the end of the literal prefix
  214. specialchars = self.SPECIAL_CHARS
  215. for i, char in enumerate(text):
  216. if char in specialchars:
  217. break
  218. return text[:i]
  219. def _btexts(self, ixreader):
  220. field = ixreader.schema[self.fieldname]
  221. exp = re.compile(self._get_pattern())
  222. prefix = self._find_prefix(self.text)
  223. if prefix:
  224. candidates = ixreader.expand_prefix(self.fieldname, prefix)
  225. else:
  226. candidates = ixreader.lexicon(self.fieldname)
  227. from_bytes = field.from_bytes
  228. for btext in candidates:
  229. text = from_bytes(btext)
  230. if exp.match(text):
  231. yield btext
  232. class Prefix(PatternQuery):
  233. """Matches documents that contain any terms that start with the given text.
  234. >>> # Match documents containing words starting with 'comp'
  235. >>> Prefix("content", u"comp")
  236. """
  237. def __unicode__(self):
  238. return "%s:%s*" % (self.fieldname, self.text)
  239. __str__ = __unicode__
  240. def _btexts(self, ixreader):
  241. return ixreader.expand_prefix(self.fieldname, self.text)
  242. class Wildcard(PatternQuery):
  243. """Matches documents that contain any terms that match a "glob" pattern.
  244. See the Python ``fnmatch`` module for information about globs.
  245. >>> Wildcard("content", u"in*f?x")
  246. """
  247. SPECIAL_CHARS = frozenset("*?")
  248. def __unicode__(self):
  249. return "%s:%s" % (self.fieldname, self.text)
  250. __str__ = __unicode__
  251. def _get_pattern(self):
  252. return fnmatch.translate(self.text)
  253. def normalize(self):
  254. # If there are no wildcard characters in this "wildcard", turn it into
  255. # a simple Term
  256. text = self.text
  257. if text == "*":
  258. from whoosh.query import Every
  259. return Every(self.fieldname, boost=self.boost)
  260. if "*" not in text and "?" not in text:
  261. # If no wildcard chars, convert to a normal term.
  262. return Term(self.fieldname, self.text, boost=self.boost)
  263. elif ("?" not in text and text.endswith("*")
  264. and text.find("*") == len(text) - 1):
  265. # If the only wildcard char is an asterisk at the end, convert to a
  266. # Prefix query.
  267. return Prefix(self.fieldname, self.text[:-1], boost=self.boost)
  268. else:
  269. return self
  270. # _btexts() implemented in PatternQuery
class Regex(PatternQuery):
    """Matches documents that contain any terms that match a regular
    expression. See the Python ``re`` module for information about regular
    expressions.
    """

    # Characters that can begin regex syntax; anything before the first of
    # these in the pattern is treated as a literal prefix. NOTE(review): "|"
    # is not in this set -- it is handled separately in _find_prefix below.
    SPECIAL_CHARS = frozenset("{}()[].?*+^$\\")

    def __unicode__(self):
        return '%s:r"%s"' % (self.fieldname, self.text)

    __str__ = __unicode__

    def _get_pattern(self):
        # The query text is itself the regex pattern
        return self.text

    def _find_prefix(self, text):
        # Return the literal prefix of the pattern usable for index-prefix
        # expansion; "" means the whole lexicon must be scanned.
        if "|" in text:
            # A top-level alternation can make any prefix optional, so give up
            return ""
        # A leading start anchor does not affect the literal prefix
        if text.startswith("^"):
            text = text[1:]
        elif text.startswith("\\A"):
            text = text[2:]

        prefix = PatternQuery._find_prefix(self, text)

        lp = len(prefix)
        if lp < len(text) and text[lp] in "*?":
            # we stripped something starting from * or ? - they both MAY mean
            # "0 times". As we had stripped starting from FIRST special char,
            # that implies there were only ordinary chars left of it. Thus,
            # the very last of them is not part of the real prefix:
            # NOTE(review): other zero-repetition quantifiers such as {0,n}
            # are not handled here and could in principle over-narrow the
            # candidate set -- confirm whether such patterns are expected.
            prefix = prefix[:-1]
        return prefix

    # _btexts() implemented in PatternQuery
  299. class ExpandingTerm(MultiTerm):
  300. """Intermediate base class for queries such as FuzzyTerm and Variations
  301. that expand into multiple queries, but come from a single term.
  302. """
  303. def has_terms(self):
  304. return True
  305. def terms(self, phrases=False):
  306. if self.field():
  307. yield (self.field(), self.text)
  308. class FuzzyTerm(ExpandingTerm):
  309. """Matches documents containing words similar to the given term.
  310. """
  311. __inittypes__ = dict(fieldname=str, text=text_type, boost=float,
  312. maxdist=float, prefixlength=int)
  313. def __init__(self, fieldname, text, boost=1.0, maxdist=1,
  314. prefixlength=1, constantscore=True):
  315. """
  316. :param fieldname: The name of the field to search.
  317. :param text: The text to search for.
  318. :param boost: A boost factor to apply to scores of documents matching
  319. this query.
  320. :param maxdist: The maximum edit distance from the given text.
  321. :param prefixlength: The matched terms must share this many initial
  322. characters with 'text'. For example, if text is "light" and
  323. prefixlength is 2, then only terms starting with "li" are checked
  324. for similarity.
  325. """
  326. self.fieldname = fieldname
  327. self.text = text
  328. self.boost = boost
  329. self.maxdist = maxdist
  330. self.prefixlength = prefixlength
  331. self.constantscore = constantscore
  332. def __eq__(self, other):
  333. return (other and self.__class__ is other.__class__
  334. and self.fieldname == other.fieldname
  335. and self.text == other.text
  336. and self.maxdist == other.maxdist
  337. and self.prefixlength == other.prefixlength
  338. and self.boost == other.boost
  339. and self.constantscore == other.constantscore)
  340. def __repr__(self):
  341. r = "%s(%r, %r, boost=%f, maxdist=%d, prefixlength=%d)"
  342. return r % (self.__class__.__name__, self.fieldname, self.text,
  343. self.boost, self.maxdist, self.prefixlength)
  344. def __unicode__(self):
  345. r = u("%s:%s") % (self.fieldname, self.text) + u("~")
  346. if self.maxdist > 1:
  347. r += u("%d") % self.maxdist
  348. if self.boost != 1.0:
  349. r += u("^%f") % self.boost
  350. return r
  351. __str__ = __unicode__
  352. def __hash__(self):
  353. return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
  354. ^ hash(self.maxdist) ^ hash(self.prefixlength)
  355. ^ hash(self.constantscore))
  356. def _btexts(self, ixreader):
  357. return ixreader.terms_within(self.fieldname, self.text, self.maxdist,
  358. prefix=self.prefixlength)
  359. class Variations(ExpandingTerm):
  360. """Query that automatically searches for morphological variations of the
  361. given word in the same field.
  362. """
  363. def __init__(self, fieldname, text, boost=1.0):
  364. self.fieldname = fieldname
  365. self.text = text
  366. self.boost = boost
  367. def __repr__(self):
  368. r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
  369. if self.boost != 1:
  370. r += ", boost=%s" % self.boost
  371. r += ")"
  372. return r
  373. def __eq__(self, other):
  374. return (other and self.__class__ is other.__class__
  375. and self.fieldname == other.fieldname
  376. and self.text == other.text and self.boost == other.boost)
  377. def __hash__(self):
  378. return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
  379. def _btexts(self, ixreader):
  380. fieldname = self.fieldname
  381. to_bytes = ixreader.schema[fieldname].to_bytes
  382. for word in variations(self.text):
  383. btext = to_bytes(word)
  384. if (fieldname, btext) in ixreader:
  385. yield btext
  386. def __unicode__(self):
  387. return u("%s:<%s>") % (self.fieldname, self.text)
  388. __str__ = __unicode__
  389. def replace(self, fieldname, oldtext, newtext):
  390. q = copy.copy(self)
  391. if q.fieldname == fieldname and q.text == oldtext:
  392. q.text = newtext
  393. return q