/bangkokhotel/lib/python2.5/site-packages/whoosh/sorting.py
Python | 1021 lines | 989 code | 5 blank | 27 comment | 2 complexity | c03a82012e1aa2971dbab480b7acfad7 MD5 | raw file
- # Copyright 2011 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from array import array
- from collections import defaultdict
- from whoosh.compat import string_type, u, xrange, iteritems
- from whoosh.fields import DEFAULT_LONG
- from whoosh.support.times import (long_to_datetime, datetime_to_long,
- timedelta_to_usecs)
- # Faceting objects
class FacetType(object):
    """Base class for "facets", aspects that can be sorted/faceted.

    Subclasses must override :meth:`categorizer`. They may set ``maptype``
    (as a class attribute or per instance) to control what :meth:`map`
    returns.
    """

    # What ``map()`` hands back: a class (instantiated on every call), a
    # ready-made object (returned as-is), or None to defer to the ``default``
    # argument of ``map()``.
    maptype = None

    def categorizer(self, global_searcher):
        """Returns a :class:`Categorizer` corresponding to this facet.

        :param global_searcher: A parent searcher. You can use this searcher if
            you need global document ID references.
        :raises NotImplementedError: always, in this base class.
        """

        raise NotImplementedError

    def map(self, default=None):
        """Returns the container object selected for this facet.

        ``self.maptype`` takes precedence; ``default`` is only consulted when
        ``self.maptype`` is ``None``. If both are ``None`` a new
        ``OrderedList`` is returned (``OrderedList`` is not defined in this
        class; it is expected to be available at module level).

        :param default: fallback class or object to use when this facet has
            no ``maptype`` of its own.
        :returns: a new instance if the selected value is a class, otherwise
            the selected value itself.
        """

        t = self.maptype
        if t is None:
            t = default

        if t is None:
            return OrderedList()
        elif isinstance(t, type):
            # isinstance (rather than ``type(t) is type``) also recognizes
            # classes built by a custom metaclass, so they are instantiated
            # instead of being returned as the bare class object.
            return t()
        else:
            return t

    def default_name(self):
        """Returns the name to use for this facet when none is supplied."""

        return "facet"
class Categorizer(object):
    """Base class for categorizer objects, which compute a key value for a
    document based on certain criteria, for use in sorting/faceting.

    Categorizers are created by FacetType objects through the
    :meth:`FacetType.categorizer` method. The
    :class:`whoosh.searching.Searcher` object passed to the ``categorizer``
    method may be a composite searcher (that is, wrapping a multi-reader), but
    categorizers are always run **per-segment**, with segment-relative
    document numbers.

    The collector will call a categorizer's ``set_searcher`` method as it
    searches each segment to let the categorizer set up whatever
    segment-specific data it needs.

    ``allow_overlap`` should be ``True`` if the caller can use the
    ``keys_for`` method instead of ``key_for`` to group documents into
    potentially overlapping groups. The default is ``False``.

    If a subclass can categorize a document using only the document number, it
    should leave ``requires_matcher`` set to ``False`` (the default) and must
    NOT use the matcher passed to ``key_for``/``keys_for``, because in that
    case ``segment_docnum`` is not guaranteed to be consistent with the given
    matcher. If a subclass needs information from the matcher, it should set
    ``requires_matcher`` to ``True``. This prevents the caller from using
    optimizations that might leave the matcher in an inconsistent state.
    """

    allow_overlap = False
    requires_matcher = False

    def set_searcher(self, segment_searcher, docoffset):
        """Called by the collector when the collector moves to a new segment.

        The base implementation does nothing.

        :param segment_searcher: the searcher for the new segment; it will be
            atomic.
        :param docoffset: the offset of the segment's document numbers
            relative to the entire index. Add it to a segment-relative docnum
            to get an absolute index docnum.
        """

        pass

    def key_for(self, matcher, segment_docnum):
        """Returns a key for the current match.

        :param matcher: a :class:`whoosh.matching.Matcher` object. If
            ``self.requires_matcher`` is ``False``, DO NOT use this object,
            since it may be inconsistent. Use the given ``segment_docnum``
            instead.
        :param segment_docnum: the segment-relative document number of the
            current match.
        :raises NotImplementedError: if the subclass provides neither of the
            legacy ``key_for_id`` / ``key_for_matcher`` methods.
        """

        # Backwards compatibility: delegate to the older method names when a
        # subclass still defines them, preferring the docnum-based one.
        if hasattr(self, "key_for_id"):
            return self.key_for_id(segment_docnum)
        if not hasattr(self, "key_for_matcher"):
            raise NotImplementedError(self.__class__)
        return self.key_for_matcher(matcher)

    def keys_for(self, matcher, segment_docnum):
        """Yields a series of keys for the current match.

        This method will be called instead of ``key_for`` if
        ``self.allow_overlap`` is ``True``.

        :param matcher: a :class:`whoosh.matching.Matcher` object. If
            ``self.requires_matcher`` is ``False``, DO NOT use this object,
            since it may be inconsistent. Use the given ``segment_docnum``
            instead.
        :param segment_docnum: the segment-relative document number of the
            current match.
        :raises NotImplementedError: if the subclass does not provide the
            legacy ``keys_for_id`` method.
        """

        # Backwards compatibility: delegate to the older method name.
        if not hasattr(self, "keys_for_id"):
            raise NotImplementedError(self.__class__)
        return self.keys_for_id(segment_docnum)

    def key_to_name(self, key):
        """Returns a representation of the key to be used as a dictionary key
        in faceting. For example, the sorting key for date fields is a large
        integer; a date categorizer can translate it into a ``datetime``
        object to make the groupings clearer.

        The base implementation returns ``key`` unchanged.
        """

        return key
- class FieldFacet(FacetType):
- """Sorts/facets by the contents of a field.
-
- For example, to sort by the contents of the "path" field in reverse order,
- and facet by the contents of the "tag" field::
-
- paths = FieldFacet("path", reverse=True)
- tags = FieldFacet("tag")
- results = searcher.search(myquery, sortedby=paths, groupedby=tags)
-
- This facet returns different categorizers based on the field type.
- """
- def __init__(self, fieldname, reverse=False, allow_overlap=False,
- maptype=None):
- """
- :param fieldname: the name of the field to sort/facet on.
- :param reverse: if True, when sorting, reverse the sort order of this
- facet.
- :param allow_overlap: if True, when grouping, allow documents to appear
- in multiple groups when they have multiple terms in the field.
- """
- self.fieldname = fieldname
- self.reverse = reverse
- self.allow_overlap = allow_overlap
- self.maptype = maptype
- def default_name(self):
- return self.fieldname
def categorizer(self, global_searcher):
    """Returns the categorizer appropriate for this facet's field.

    The choice is made in this order:

    * ``allow_overlap`` set: ``OverlappingFieldCategorizer``.
    * reader supports caches and the field is ``DATETIME``:
      ``DateFieldCategorizer``.
    * reader supports caches and the field is ``NUMERIC``:
      ``NumericFieldCategorizer``.
    * reader supports caches and the sort is not reversed:
      ``FieldCategorizer``.
    * anything else: ``NoCacheFieldCategorizer``.

    :param global_searcher: the parent searcher; its schema and reader
        are consulted to pick the categorizer.
    """

    from whoosh.fields import NUMERIC, DATETIME

    # The searcher we're passed here may wrap a multireader, but the
    # actual key functions will always be called per-segment following a
    # Categorizer.set_searcher method call
    name = self.fieldname
    # A field missing from the schema is treated as "no known type"
    fieldtype = (global_searcher.schema[name]
                 if name in global_searcher.schema else None)
    cache_ok = global_searcher.reader().supports_caches()

    if self.allow_overlap:
        return self.OverlappingFieldCategorizer(name)

    if cache_ok:
        if isinstance(fieldtype, DATETIME):
            # A subclass of NumericFieldCategorizer that formats dates
            return self.DateFieldCategorizer(name, self.reverse)
        if isinstance(fieldtype, NUMERIC):
            # Numeric fields are naturally reversible
            return self.NumericFieldCategorizer(name, self.reverse)
        if not self.reverse:
            # Straightforward: use the field cache to sort/categorize
            return self.FieldCategorizer(name)

    # Either the reader does not support field caches or we need to
    # reverse-sort a string field, so we need to do more work
    return self.NoCacheFieldCategorizer(global_searcher, name,
                                        self.reverse)
class FieldCategorizer(Categorizer):
    """Categorizer for regular, unreversed fields. Keys come straight
    from the segment reader's field cache.
    """

    def __init__(self, fieldname):
        """
        :param fieldname: the name of the field whose cache supplies keys.
        """

        self.fieldname = fieldname

    def set_searcher(self, segment_searcher, docoffset):
        """Fetches the field cache for the new segment. ``docoffset`` is
        not used.
        """

        self.fieldcache = segment_searcher.reader().fieldcache(self.fieldname)

    def key_for(self, matcher, docid):
        """Returns the field cache's key for ``docid``. The matcher is not
        used.
        """

        return self.fieldcache.key_for(docid)

    def key_to_name(self, key):
        """Returns ``None`` for the ``\\uFFFF`` key and the key itself
        otherwise.
        """

        # NOTE(review): \uFFFF is presumably the cache's marker for
        # documents with no value in the field -- confirm.
        return None if key == u('\uFFFF') else key
- class NumericFieldCategorizer(Categorizer):
- """Categorizer for numeric fields, which are naturally reversible.