PageRenderTime 520ms CodeModel.GetById 6ms RepoModel.GetById 1ms app.codeStats 0ms

/bangkokhotel/lib/python2.5/site-packages/whoosh/sorting.py

https://bitbucket.org/luisrodriguez/bangkokhotel
Python | 1021 lines | 989 code | 5 blank | 27 comment | 2 complexity | c03a82012e1aa2971dbab480b7acfad7 MD5 | raw file
  1. # Copyright 2011 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from array import array
  28. from collections import defaultdict
  29. from whoosh.compat import string_type, u, xrange, iteritems
  30. from whoosh.fields import DEFAULT_LONG
  31. from whoosh.support.times import (long_to_datetime, datetime_to_long,
  32. timedelta_to_usecs)
  33. # Faceting objects
  34. class FacetType(object):
  35. """Base class for "facets", aspects that can be sorted/faceted.
  36. """
  37. maptype = None
  38. def categorizer(self, global_searcher):
  39. """Returns a :class:`Categorizer` corresponding to this facet.
  40. :param global_searcher: A parent searcher. You can use this searcher if
  41. you need global document ID references.
  42. """
  43. raise NotImplementedError
  44. def map(self, default=None):
  45. t = self.maptype
  46. if t is None:
  47. t = default
  48. if t is None:
  49. return OrderedList()
  50. elif type(t) is type:
  51. return t()
  52. else:
  53. return t
  54. def default_name(self):
  55. return "facet"
  56. class Categorizer(object):
  57. """Base class for categorizer objects which compute a key value for a
  58. document based on certain criteria, for use in sorting/faceting.
  59. Categorizers are created by FacetType objects through the
  60. :meth:`FacetType.categorizer` method. The
  61. :class:`whoosh.searching.Searcher` object passed to the ``categorizer``
  62. method may be a composite searcher (that is, wrapping a multi-reader), but
  63. categorizers are always run **per-segment**, with segment-relative document
  64. numbers.
  65. The collector will call a categorizer's ``set_searcher`` method as it
  66. searches each segment to let the cateogorizer set up whatever segment-
  67. specific data it needs.
  68. ``Collector.allow_overlap`` should be ``True`` if the caller can use the
  69. ``keys_for`` method instead of ``key_for`` to group documents into
  70. potentially overlapping groups. The default is ``False``.
  71. If a categorizer subclass can categorize the document using only the
  72. document number, it should set ``Collector.requires_matcher`` to ``False``
  73. (this is the default) and NOT USE the given matcher in the ``key_for`` or
  74. ``keys_for`` methods, since in that case ``segment_docnum`` is not
  75. guaranteed to be consistent with the given matcher. If a categorizer
  76. subclass needs to access information on the matcher, it should set
  77. ``requires_matcher`` to ``True``. This will prevent the caller from using
  78. optimizations that might leave the matcher in an inconsistent state.
  79. """
  80. allow_overlap = False
  81. requires_matcher = False
  82. def set_searcher(self, segment_searcher, docoffset):
  83. """Called by the collector when the collector moves to a new segment.
  84. The ``segment_searcher`` will be atomic. The ``docoffset`` is the
  85. offset of the segment's document numbers relative to the entire index.
  86. You can use the offset to get absolute index docnums by adding the
  87. offset to segment-relative docnums.
  88. """
  89. pass
  90. def key_for(self, matcher, segment_docnum):
  91. """Returns a key for the current match.
  92. :param matcher: a :class:`whoosh.matching.Matcher` object. If
  93. ``self.requires_matcher`` is ``False``, DO NOT use this object,
  94. since it may be inconsistent. Use the given ``segment_docnum``
  95. instead.
  96. :param segment_docnum: the segment-relative document number of the
  97. current match.
  98. """
  99. # Backwards compatibility
  100. if hasattr(self, "key_for_id"):
  101. return self.key_for_id(segment_docnum)
  102. elif hasattr(self, "key_for_matcher"):
  103. return self.key_for_matcher(matcher)
  104. raise NotImplementedError(self.__class__)
  105. def keys_for(self, matcher, segment_docnum):
  106. """Yields a series of keys for the current match.
  107. This method will be called instead of ``key_for`` if
  108. ``self.allow_overlap`` is ``True``.
  109. :param matcher: a :class:`whoosh.matching.Matcher` object. If
  110. ``self.requires_matcher`` is ``False``, DO NOT use this object,
  111. since it may be inconsistent. Use the given ``segment_docnum``
  112. instead.
  113. :param segment_docnum: the segment-relative document number of the
  114. current match.
  115. """
  116. # Backwards compatibility
  117. if hasattr(self, "keys_for_id"):
  118. return self.keys_for_id(segment_docnum)
  119. raise NotImplementedError(self.__class__)
  120. def key_to_name(self, key):
  121. """Returns a representation of the key to be used as a dictionary key
  122. in faceting. For example, the sorting key for date fields is a large
  123. integer; this method translates it into a ``datetime`` object to make
  124. the groupings clearer.
  125. """
  126. return key
  127. class FieldFacet(FacetType):
  128. """Sorts/facest by the contents of a field.
  129. For example, to sort by the contents of the "path" field in reverse order,
  130. and facet by the contents of the "tag" field::
  131. paths = FieldFacet("path", reverse=True)
  132. tags = FieldFacet("tag")
  133. results = searcher.search(myquery, sortedby=paths, groupedby=tags)
  134. This facet returns different categorizers based on the field type.
  135. """
  136. def __init__(self, fieldname, reverse=False, allow_overlap=False,
  137. maptype=None):
  138. """
  139. :param fieldname: the name of the field to sort/facet on.
  140. :param reverse: if True, when sorting, reverse the sort order of this
  141. facet.
  142. :param allow_overlap: if True, when grouping, allow documents to appear
  143. in multiple groups when they have multiple terms in the field.
  144. """
  145. self.fieldname = fieldname
  146. self.reverse = reverse
  147. self.allow_overlap = allow_overlap
  148. self.maptype = maptype
  149. def default_name(self):
  150. return self.fieldname
  151. def categorizer(self, global_searcher):
  152. from whoosh.fields import NUMERIC, DATETIME
  153. # The searcher we're passed here may wrap a multireader, but the
  154. # actual key functions will always be called per-segment following a
  155. # Categorizer.set_searcher method call
  156. fieldname = self.fieldname
  157. field = None
  158. if fieldname in global_searcher.schema:
  159. field = global_searcher.schema[fieldname]
  160. hascache = global_searcher.reader().supports_caches()
  161. if self.allow_overlap:
  162. return self.OverlappingFieldCategorizer(fieldname)
  163. elif hascache and isinstance(field, DATETIME):
  164. # Return a subclass of NumericFieldCategorizer that formats dates
  165. return self.DateFieldCategorizer(fieldname, self.reverse)
  166. elif hascache and isinstance(field, NUMERIC):
  167. # Numeric fields are naturally reversible
  168. return self.NumericFieldCategorizer(fieldname, self.reverse)
  169. elif hascache and not self.reverse:
  170. # Straightforward: use the field cache to sort/categorize
  171. return self.FieldCategorizer(fieldname)
  172. else:
  173. # If the reader does not support field caches or we need to
  174. # reverse-sort a string field, we need to do more work
  175. return self.NoCacheFieldCategorizer(global_searcher, fieldname,
  176. self.reverse)
  177. class FieldCategorizer(Categorizer):
  178. """Categorizer for regular, unreversed fields. Just uses the
  179. fieldcache to get the keys.
  180. """
  181. def __init__(self, fieldname):
  182. self.fieldname = fieldname
  183. def set_searcher(self, segment_searcher, docoffset):
  184. r = segment_searcher.reader()
  185. self.fieldcache = r.fieldcache(self.fieldname)
  186. def key_for(self, matcher, docid):
  187. return self.fieldcache.key_for(docid)
  188. def key_to_name(self, key):
  189. if key == u('\uFFFF'):
  190. return None
  191. else:
  192. return key
  193. class NumericFieldCategorizer(Categorizer):
  194. """Categorizer for numeric fields, which are naturally reversible.
  195. """
  196. def __init__(self, fieldname, reverse):
  197. self.fieldname = fieldname
  198. self.reverse = reverse
  199. def set_searcher(self, segment_searcher, docoffset):
  200. r = segment_searcher.reader()
  201. fieldobj = segment_searcher.schema[self.fieldname]
  202. self.default = fieldobj.sortable_default()
  203. self.fieldcache = r.fieldcache(self.fieldname)
  204. def key_for(self, matcher, docid):
  205. value = self.fieldcache.key_for(docid)
  206. if self.reverse:
  207. return 0 - value
  208. else:
  209. return value
  210. def key_to_name(self, key):
  211. if key == self.default:
  212. return None
  213. else:
  214. return key
  215. class DateFieldCategorizer(NumericFieldCategorizer):
  216. """Categorizer for date fields. Same as NumericFieldCategorizer, but
  217. converts the numeric keys back to dates for better labels.
  218. """
  219. def key_to_name(self, key):
  220. if key == DEFAULT_LONG:
  221. return None
  222. else:
  223. return long_to_datetime(key)
  224. class NoCacheFieldCategorizer(Categorizer):
  225. """This object builds an array caching the order of all documents
  226. according to the field, then uses the cached order as a numeric key.
  227. This is useful when a field cache is not available, and also for
  228. reversed fields (since field cache keys for non- numeric fields are
  229. arbitrary data, it's not possible to "negate" them to reverse the sort
  230. order).
  231. """
  232. def __init__(self, global_searcher, fieldname, reverse):
  233. # Cache the relative positions of all docs with the given field
  234. # across the entire index
  235. reader = global_searcher.reader()
  236. dc = reader.doc_count_all()
  237. fieldobj = global_searcher.schema[fieldname]
  238. self.values = []
  239. self.array = array("i", [dc + 1] * dc)
  240. # sortable_values() returns an iterator of (actual_term,
  241. # sortable_value) pairs
  242. tvs = fieldobj.sortable_values(reader, fieldname)
  243. for i, (t, v) in enumerate(tvs):
  244. self.values.append(v)
  245. if reverse:
  246. i = dc - i
  247. # Get global docids from global reader
  248. postings = reader.postings(fieldname, t)
  249. for docid in postings.all_ids():
  250. self.array[docid] = i
  251. if reverse:
  252. self.values.reverse()
  253. def set_searcher(self, segment_searcher, docoffset):
  254. self.docoffset = docoffset
  255. def key_for(self, matcher, docid):
  256. arry = self.array
  257. offset = self.docoffset
  258. global_id = offset + docid
  259. assert docid >= 0
  260. assert global_id < len(arry), ("%s + %s >= %s"
  261. % (docid, offset, len(arry)))
  262. return arry[global_id]
  263. def key_to_name(self, key):
  264. return self.values[key]
  265. class OverlappingFieldCategorizer(Categorizer):
  266. allow_overlap = True
  267. def __init__(self, fieldname):
  268. self.fieldname = fieldname
  269. self.use_vectors = False
  270. def set_searcher(self, segment_searcher, docoffset):
  271. fieldname = self.fieldname
  272. dc = segment_searcher.doc_count_all()
  273. field = segment_searcher.schema[fieldname]
  274. reader = segment_searcher.reader()
  275. if field.vector:
  276. # If the field was indexed with term vectors, use the vectors
  277. # to get the list of values in each matched document
  278. self.use_vectors = True
  279. self.segment_searcher = segment_searcher
  280. else:
  281. # Otherwise, cache the values in each document in a huge list
  282. # of lists
  283. self.use_vectors = False
  284. self.lists = [[] for _ in xrange(dc)]
  285. for t, _ in field.sortable_values(reader, fieldname):
  286. postings = reader.postings(fieldname, t)
  287. for docid in postings.all_ids():
  288. self.lists[docid].append(t)
  289. def keys_for(self, matcher, docid):
  290. if self.use_vectors:
  291. try:
  292. v = self.segment_searcher.vector(docid, self.fieldname)
  293. return list(v.all_ids())
  294. except KeyError:
  295. return None
  296. else:
  297. return self.lists[docid] or None
  298. def key_for(self, matcher, docid):
  299. if self.use_vectors:
  300. try:
  301. v = self.segment_searcher.vector(docid, self.fieldname)
  302. return v.id()
  303. except KeyError:
  304. return None
  305. else:
  306. ls = self.lists[docid]
  307. if ls:
  308. return ls[0]
  309. else:
  310. return None
  311. class QueryFacet(FacetType):
  312. """Sorts/facets based on the results of a series of queries.
  313. """
  314. def __init__(self, querydict, other=None, allow_overlap=False,
  315. maptype=None):
  316. """
  317. :param querydict: a dictionary mapping keys to
  318. :class:`whoosh.query.Query` objects.
  319. :param other: the key to use for documents that don't match any of the
  320. queries.
  321. """
  322. self.querydict = querydict
  323. self.other = other
  324. self.maptype = maptype
  325. def categorizer(self, global_searcher):
  326. return self.QueryCategorizer(self.querydict, self.other)
  327. class QueryCategorizer(Categorizer):
  328. def __init__(self, querydict, other, allow_overlap=False):
  329. self.querydict = querydict
  330. self.other = other
  331. self.allow_overlap = allow_overlap
  332. def set_searcher(self, segment_searcher, offset):
  333. self.docsets = {}
  334. for qname, q in self.querydict.items():
  335. docset = set(q.docs(segment_searcher))
  336. if docset:
  337. self.docsets[qname] = docset
  338. self.offset = offset
  339. def key_for(self, matcher, docid):
  340. for qname in self.docsets:
  341. if docid in self.docsets[qname]:
  342. return qname
  343. return self.other
  344. def keys_for(self, matcher, docid):
  345. found = False
  346. for qname in self.docsets:
  347. if docid in self.docsets[qname]:
  348. yield qname
  349. found = True
  350. if not found:
  351. yield None
  352. class RangeFacet(QueryFacet):
  353. """Sorts/facets based on numeric ranges. For textual ranges, use
  354. :class:`QueryFacet`.
  355. For example, to facet the "price" field into $100 buckets, up to $1000::
  356. prices = RangeFacet("price", 0, 1000, 100)
  357. results = searcher.search(myquery, groupedby=prices)
  358. The ranges/buckets are always **inclusive** at the start and **exclusive**
  359. at the end.
  360. """
  361. def __init__(self, fieldname, start, end, gap, hardend=False,
  362. maptype=None):
  363. """
  364. :param fieldname: the numeric field to sort/facet on.
  365. :param start: the start of the entire range.
  366. :param end: the end of the entire range.
  367. :param gap: the size of each "bucket" in the range. This can be a
  368. sequence of sizes. For example, ``gap=[1,5,10]`` will use 1 as the
  369. size of the first bucket, 5 as the size of the second bucket, and
  370. 10 as the size of all subsequent buckets.
  371. :param hardend: if True, the end of the last bucket is clamped to the
  372. value of ``end``. If False (the default), the last bucket is always
  373. ``gap`` sized, even if that means the end of the last bucket is
  374. after ``end``.
  375. """
  376. self.fieldname = fieldname
  377. self.start = start
  378. self.end = end
  379. self.gap = gap
  380. self.hardend = hardend
  381. self.maptype = maptype
  382. self._queries()
  383. def default_name(self):
  384. return self.fieldname
  385. def _rangetype(self):
  386. from whoosh import query
  387. return query.NumericRange
  388. def _range_name(self, startval, endval):
  389. return (startval, endval)
  390. def _queries(self):
  391. if not self.gap:
  392. raise Exception("No gap secified (%r)" % self.gap)
  393. if isinstance(self.gap, (list, tuple)):
  394. gaps = self.gap
  395. gapindex = 0
  396. else:
  397. gaps = [self.gap]
  398. gapindex = -1
  399. rangetype = self._rangetype()
  400. self.querydict = {}
  401. cstart = self.start
  402. while cstart < self.end:
  403. thisgap = gaps[gapindex]
  404. if gapindex >= 0:
  405. gapindex += 1
  406. if gapindex == len(gaps):
  407. gapindex = -1
  408. cend = cstart + thisgap
  409. if self.hardend:
  410. cend = min(self.end, cend)
  411. rangename = self._range_name(cstart, cend)
  412. q = rangetype(self.fieldname, cstart, cend, endexcl=True)
  413. self.querydict[rangename] = q
  414. cstart = cend
  415. def categorizer(self, global_searcher):
  416. return QueryFacet(self.querydict).categorizer(global_searcher)
  417. class DateRangeFacet(RangeFacet):
  418. """Sorts/facets based on date ranges. This is the same as RangeFacet
  419. except you are expected to use ``daterange`` objects as the start and end
  420. of the range, and ``timedelta`` or ``relativedelta`` objects as the gap(s),
  421. and it generates :class:`~whoosh.query.DateRange` queries instead of
  422. :class:`~whoosh.query.TermRange` queries.
  423. For example, to facet a "birthday" range into 5 year buckets::
  424. from datetime import datetime
  425. from whoosh.support.relativedelta import relativedelta
  426. startdate = datetime(1920, 0, 0)
  427. enddate = datetime.now()
  428. gap = relativedelta(years=5)
  429. bdays = DateRangeFacet("birthday", startdate, enddate, gap)
  430. results = searcher.search(myquery, groupedby=bdays)
  431. The ranges/buckets are always **inclusive** at the start and **exclusive**
  432. at the end.
  433. """
  434. def _rangetype(self):
  435. from whoosh import query
  436. return query.DateRange
  437. class ScoreFacet(FacetType):
  438. """Uses a document's score as a sorting criterion.
  439. For example, to sort by the ``tag`` field, and then within that by relative
  440. score::
  441. tag_score = MultiFacet(["tag", ScoreFacet()])
  442. results = searcher.search(myquery, sortedby=tag_score)
  443. """
  444. def categorizer(self, global_searcher):
  445. return self.ScoreCategorizer(global_searcher)
  446. class ScoreCategorizer(Categorizer):
  447. requires_matcher = True
  448. def __init__(self, global_searcher):
  449. w = global_searcher.weighting
  450. self.use_final = w.use_final
  451. if w.use_final:
  452. self.final = w.final
  453. def set_searcher(self, segment_searcher, offset):
  454. self.segment_searcher = segment_searcher
  455. def key_for(self, matcher, docid):
  456. score = matcher.score()
  457. if self.use_final:
  458. score = self.final(self.segment_searcher, docid, score)
  459. # Negate the score so higher values sort first
  460. return 0 - score
  461. class FunctionFacet(FacetType):
  462. """Lets you pass an arbitrary function that will compute the key. This may
  463. be easier than subclassing FacetType and Categorizer to set up the desired
  464. behavior.
  465. The function is called with the arguments ``(searcher, docid)``, where the
  466. ``searcher`` may be a composite searcher, and the ``docid`` is an absolute
  467. index document number (not segment-relative).
  468. For example, to use the number of words in the document's "content" field
  469. as the sorting/faceting key::
  470. fn = lambda s, docid: s.doc_field_length(docid, "content")
  471. lengths = FunctionFacet(fn)
  472. """
  473. def __init__(self, fn, maptype=None):
  474. self.fn = fn
  475. self.maptype = maptype
  476. def categorizer(self, global_searcher):
  477. return self.FunctionCategorizer(global_searcher, self.fn)
  478. class FunctionCategorizer(Categorizer):
  479. def __init__(self, global_searcher, fn):
  480. self.global_searcher = global_searcher
  481. self.fn = fn
  482. def set_searcher(self, segment_searcher, docoffset):
  483. self.offset = docoffset
  484. def key_for(self, matcher, docid):
  485. return self.fn(self.global_searcher, docid + self.offset)
  486. class StoredFieldFacet(FacetType):
  487. """Lets you sort/group using the value in an unindexed, stored field (e.g.
  488. STORED). This is usually slower than using an indexed field.
  489. For fields where the stored value is a space-separated list of keywords,
  490. (e.g. ``"tag1 tag2 tag3"``), you can use the ``allow_overlap`` keyword
  491. argument to allow overlapped faceting on the result of calling the
  492. ``split()`` method on the field value (or calling a custom split function
  493. if one is supplied).
  494. """
  495. def __init__(self, fieldname, allow_overlap=False, split_fn=None,
  496. maptype=None):
  497. """
  498. :param fieldname: the name of the stored field.
  499. :param allow_overlap: if True, when grouping, allow documents to appear
  500. in multiple groups when they have multiple terms in the field. The
  501. categorizer uses ``string.split()`` or the custom ``split_fn`` to
  502. convert the stored value into a list of facet values.
  503. :param split_fn: a custom function to split a stored field value into
  504. multiple facet values when ``allow_overlap`` is True. If not
  505. supplied, the categorizer simply calls the value's ``split()``
  506. method.
  507. """
  508. self.fieldname = fieldname
  509. self.allow_overlap = allow_overlap
  510. self.split_fn = None
  511. self.maptype = maptype
  512. def default_name(self):
  513. return self.fieldname
  514. def categorizer(self, global_searcher):
  515. return self.StoredFieldCategorizer(self.fieldname, self.allow_overlap,
  516. self.split_fn)
  517. class StoredFieldCategorizer(Categorizer):
  518. def __init__(self, fieldname, allow_overlap, split_fn):
  519. self.fieldname = fieldname
  520. self.allow_overlap = allow_overlap
  521. self.split_fn = split_fn
  522. def set_searcher(self, segment_searcher, docoffset):
  523. self.segment_searcher = segment_searcher
  524. def keys_for(self, matcher, docid):
  525. d = self.segment_searcher.stored_fields(docid)
  526. value = d.get(self.fieldname)
  527. if self.split_fn:
  528. return self.split_fn(value)
  529. else:
  530. return value.split()
  531. def key_for(self, matcher, docid):
  532. d = self.segment_searcher.stored_fields(docid)
  533. return d.get(self.fieldname)
  534. class MultiFacet(FacetType):
  535. """Sorts/facets by the combination of multiple "sub-facets".
  536. For example, to sort by the value of the "tag" field, and then (for
  537. documents where the tag is the same) by the value of the "path" field::
  538. facet = MultiFacet(FieldFacet("tag"), FieldFacet("path")
  539. results = searcher.search(myquery, sortedby=facet)
  540. As a shortcut, you can use strings to refer to field names, and they will
  541. be assumed to be field names and turned into FieldFacet objects::
  542. facet = MultiFacet("tag", "path")
  543. You can also use the ``add_*`` methods to add criteria to the multifacet::
  544. facet = MultiFacet()
  545. facet.add_field("tag")
  546. facet.add_field("path", reverse=True)
  547. facet.add_query({"a-m": TermRange("name", "a", "m"),
  548. "n-z": TermRange("name", "n", "z")})
  549. """
  550. def __init__(self, items=None, maptype=None):
  551. self.facets = []
  552. if items:
  553. for item in items:
  554. self._add(item)
  555. self.maptype = maptype
  556. @classmethod
  557. def from_sortedby(cls, sortedby):
  558. multi = cls()
  559. if isinstance(sortedby, string_type):
  560. multi._add(sortedby)
  561. elif (isinstance(sortedby, (list, tuple))
  562. or hasattr(sortedby, "__iter__")):
  563. for item in sortedby:
  564. multi._add(item)
  565. else:
  566. multi._add(sortedby)
  567. return multi
  568. def _add(self, item):
  569. if isinstance(item, FacetType):
  570. self.add_facet(item)
  571. elif isinstance(item, string_type):
  572. self.add_field(item)
  573. else:
  574. raise Exception("Don't know what to do with facet %r" % (item,))
  575. def add_field(self, fieldname, reverse=False):
  576. self.facets.append(FieldFacet(fieldname, reverse=reverse))
  577. return self
  578. def add_query(self, querydict, other=None, allow_overlap=False):
  579. self.facets.append(QueryFacet(querydict, other=other,
  580. allow_overlap=allow_overlap))
  581. return self
  582. def add_score(self):
  583. self.facets.append(ScoreFacet())
  584. return self
  585. def add_facet(self, facet):
  586. if not isinstance(facet, FacetType):
  587. raise Exception()
  588. self.facets.append(facet)
  589. return self
  590. def categorizer(self, global_searcher):
  591. if not self.facets:
  592. raise Exception("No facets")
  593. elif len(self.facets) == 1:
  594. catter = self.facets[0].categorizer(global_searcher)
  595. else:
  596. catter = self.MultiCategorizer([facet.categorizer(global_searcher)
  597. for facet in self.facets])
  598. return catter
  599. class MultiCategorizer(Categorizer):
  600. def __init__(self, catters):
  601. self.catters = catters
  602. @property
  603. def requires_matcher(self):
  604. return any(c.requires_matcher for c in self.catters)
  605. def set_searcher(self, segment_searcher, docoffset):
  606. for catter in self.catters:
  607. catter.set_searcher(segment_searcher, docoffset)
  608. def key_for(self, matcher, docid):
  609. return tuple(catter.key_for(matcher, docid)
  610. for catter in self.catters)
  611. class Facets(object):
  612. """Maps facet names to :class:`FacetType` objects, for creating multiple
  613. groupings of documents.
  614. For example, to group by tag, and **also** group by price range::
  615. facets = Facets()
  616. facets.add_field("tag")
  617. facets.add_facet("price", RangeFacet("price", 0, 1000, 100))
  618. results = searcher.search(myquery, groupedby=facets)
  619. tag_groups = results.groups("tag")
  620. price_groups = results.groups("price")
  621. (To group by the combination of multiple facets, use :class:`MultiFacet`.)
  622. """
  623. def __init__(self, x=None):
  624. self.facets = {}
  625. if x:
  626. self.add_facets(x)
  627. @classmethod
  628. def from_groupedby(cls, groupedby):
  629. facets = cls()
  630. if isinstance(groupedby, (cls, dict)):
  631. facets.add_facets(groupedby)
  632. elif isinstance(groupedby, string_type):
  633. facets.add_field(groupedby)
  634. elif isinstance(groupedby, FacetType):
  635. facets.add_facet(groupedby.default_name(), groupedby)
  636. elif isinstance(groupedby, (list, tuple)):
  637. for item in groupedby:
  638. facets.add_facets(cls.from_groupedby(item))
  639. else:
  640. raise Exception("Don't know what to do with groupedby=%r"
  641. % groupedby)
  642. return facets
  643. def names(self):
  644. """Returns an iterator of the facet names in this object.
  645. """
  646. return iter(self.facets)
  647. def items(self):
  648. """Returns a list of (facetname, facetobject) tuples for the facets in
  649. this object.
  650. """
  651. return self.facets.items()
  652. def add_field(self, fieldname, **kwargs):
  653. """Adds a :class:`FieldFacet` for the given field name (the field name
  654. is automatically used as the facet name).
  655. """
  656. self.facets[fieldname] = FieldFacet(fieldname, **kwargs)
  657. return self
  658. def add_query(self, name, querydict, **kwargs):
  659. """Adds a :class:`QueryFacet` under the given ``name``.
  660. :param name: a name for the facet.
  661. :param querydict: a dictionary mapping keys to
  662. :class:`whoosh.query.Query` objects.
  663. """
  664. self.facets[name] = QueryFacet(querydict, **kwargs)
  665. return self
  666. def add_facet(self, name, facet):
  667. """Adds a :class:`FacetType` object under the given ``name``.
  668. """
  669. if not isinstance(facet, FacetType):
  670. raise Exception("%r:%r is not a facet" % (name, facet))
  671. self.facets[name] = facet
  672. return self
  673. def add_facets(self, facets, replace=True):
  674. """Adds the contents of the given ``Facets`` or ``dict`` object to this
  675. object.
  676. """
  677. if not isinstance(facets, (dict, Facets)):
  678. raise Exception("%r is not a Facets object or dict" % facets)
  679. for name, facet in facets.items():
  680. if replace or name not in self.facets:
  681. self.facets[name] = facet
  682. return self
  683. # Objects for holding facet groups
  684. class FacetMap(object):
  685. """Base class for objects holding the results of grouping search results by
  686. a Facet. Use an object's ``as_dict()`` method to access the results.
  687. You can pass a subclass of this to the ``maptype`` keyword argument when
  688. creating a ``FacetType`` object to specify what information the facet
  689. should record about the group. For example::
  690. # Record each document in each group in its sorted order
  691. myfacet = FieldFacet("size", maptype=OrderedList)
  692. # Record only the count of documents in each group
  693. myfacet = FieldFacet("size", maptype=Count)
  694. """
  695. def add(self, groupname, docid, sortkey):
  696. """Adds a document to the facet results.
  697. :param groupname: the name of the group to add this document to.
  698. :param docid: the document number of the document to add.
  699. :param sortkey: a value representing the sort position of the document
  700. in the full results.
  701. """
  702. raise NotImplementedError
  703. def as_dict(self):
  704. """Returns a dictionary object mapping group names to
  705. implementation-specific values. For example, the value might be a list
  706. of document numbers, or a integer representing the number of documents
  707. in the group.
  708. """
  709. raise NotImplementedError
  710. class OrderedList(FacetMap):
  711. """Stores a list of document numbers for each group, in the same order as
  712. they appear in the search results.
  713. The ``as_dict`` method returns a dictionary mapping group names to lists
  714. of document numbers.
  715. """
  716. def __init__(self):
  717. self.dict = defaultdict(list)
  718. def __repr__(self):
  719. return "<%s %r>" % (self.__class__.__name__, self.dict)
  720. def add(self, groupname, docid, sortkey):
  721. self.dict[groupname].append((sortkey, docid))
  722. def as_dict(self):
  723. d = {}
  724. for key, items in iteritems(self.dict):
  725. d[key] = [docnum for _, docnum in sorted(items)]
  726. return d
  727. class UnorderedList(FacetMap):
  728. """Stores a list of document numbers for each group, in arbitrary order.
  729. This is slightly faster and uses less memory than
  730. :class:`OrderedListResult` if you don't care about the ordering of the
  731. documents within groups.
  732. The ``as_dict`` method returns a dictionary mapping group names to lists
  733. of document numbers.
  734. """
  735. def __init__(self):
  736. self.dict = defaultdict(list)
  737. def __repr__(self):
  738. return "<%s %r>" % (self.__class__.__name__, self.dict)
  739. def add(self, groupname, docid, sortkey):
  740. self.dict[groupname].append(docid)
  741. def as_dict(self):
  742. return dict(self.dict)
  743. class Count(FacetMap):
  744. """Stores the number of documents in each group.
  745. The ``as_dict`` method returns a dictionary mapping group names to
  746. integers.
  747. """
  748. def __init__(self):
  749. self.dict = defaultdict(int)
  750. def __repr__(self):
  751. return "<%s %r>" % (self.__class__.__name__, self.dict)
  752. def add(self, groupname, docid, sortkey):
  753. self.dict[groupname] += 1
  754. def as_dict(self):
  755. return dict(self.dict)
  756. class Best(FacetMap):
  757. """Stores the "best" document in each group (that is, the one with the
  758. highest sort key).
  759. The ``as_dict`` method returns a dictionary mapping group names to
  760. docnument numbers.
  761. """
  762. def __init__(self):
  763. self.bestids = {}
  764. self.bestkeys = {}
  765. def __repr__(self):
  766. return "<%s %r>" % (self.__class__.__name__, self.bestids)
  767. def add(self, groupname, docid, sortkey):
  768. if groupname not in self.bestids or sortkey < self.bestkeys[groupname]:
  769. self.bestids[groupname] = docid
  770. self.bestkeys[groupname] = sortkey
  771. def as_dict(self):
  772. return self.bestids
  773. #
  774. #
  775. #
  776. #
  777. # Legacy sorting object
  778. class Sorter(object):
  779. """This is a legacy interface. The functionality of the Sorter object was
  780. moved into the :class:`FacetType` classes in Whoosh 2.0. The old Sorter API
  781. is still supported for backwards-compatibility, but it simply forwards to
  782. the regular searching API.
  783. See :doc:`/facets` for information on the new API.
  784. """
  785. def __init__(self, searcher):
  786. self.searcher = searcher
  787. self.multi = MultiFacet()
  788. def add_field(self, fieldname, reverse=False):
  789. self.multi.add_field(fieldname, reverse=reverse)
  790. def sort_query(self, q, limit=None, reverse=False, filter=None, mask=None,
  791. groupedby=None):
  792. return self.searcher.search(q, sortedby=self.multi, limit=limit,
  793. reverse=reverse, filter=filter, mask=mask,
  794. groupedby=groupedby)