/src/whoosh/index.py
Python | 737 lines | 681 code | 13 blank | 43 comment | 3 complexity | bc2ad6ff469e3fdce8121bec25b87e78 MD5 | raw file
Possible License(s): Apache-2.0
- # Copyright 2007 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """Contains the main functions/classes for creating, maintaining, and using
- an index.
- """
- from __future__ import division
- import os.path, re, sys
- from time import time, sleep
- from whoosh import __version__
- from whoosh.compat import xrange
- from whoosh.filedb.structfile import ChecksumFile
- from whoosh.legacy import toc_loaders
- from whoosh.compat import dumps, loads, string_type
- from whoosh.fields import ensure_schema
- from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
- _DEF_INDEX_NAME = "MAIN"
- _CURRENT_TOC_VERSION = -111
- # Exceptions
class LockError(Exception):
    """Exception type for lock-related failures.

    Nothing in the visible part of this module raises it; it is defined here
    so other code can raise and catch it.
    """
class IndexError(Exception):
    """Base class for the index errors defined in this module.

    Note that inside this module the name shadows the builtin ``IndexError``;
    this class derives directly from ``Exception``, not from ``LookupError``.
    """
class IndexVersionError(IndexError):
    """Signals that an index uses a format the running version of Whoosh
    cannot read, i.e. the index being opened is not backward or forward
    compatible with this code.

    :param msg: human-readable description of the problem.
    :param version: the format version number that could not be read; kept
        on the instance as ``version``.
    :param release: optional release information; kept on the instance as
        ``release`` (defaults to None).
    """

    def __init__(self, msg, version, release=None):
        super(IndexVersionError, self).__init__(msg)
        self.release = release
        self.version = version
class OutOfDateError(IndexError):
    """Signals an attempt to commit changes to an index through an object
    that does not represent the index's latest generation.
    """
class EmptyIndexError(IndexError):
    """Signals an attempt to work with an index that has no indexed terms.

    In this module it is raised by ``TOC.read`` when no TOC file can be found
    for the requested index name.
    """
- # Convenience functions
def create_in(dirname, schema, indexname=None):
    """Creates an index inside a directory, building the ``FileStorage``
    object for the directory on the caller's behalf.

    :param dirname: path string of the directory that will hold the index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the
        fields of the index.
    :param indexname: name of the index to create. Any false value (None,
        empty string) selects the default index name; a name is only needed
        when several indexes share one storage object.
    :returns: :class:`Index`
    """

    # Imported locally, as in the rest of this module's convenience functions
    from whoosh.filedb.filestore import FileStorage

    name = indexname or _DEF_INDEX_NAME
    return FileIndex.create(FileStorage(dirname), schema, name)
def open_dir(dirname, indexname=None, readonly=False, schema=None):
    """Opens an existing index stored in a directory, building the
    ``FileStorage`` object for the directory on the caller's behalf.

    :param dirname: path string of the directory containing the index.
    :param indexname: name of the index to open. None selects the default
        index name; a name is only needed when several indexes share one
        storage object.
    :param readonly: passed through to the ``FileStorage`` constructor.
    :param schema: passed through to the :class:`FileIndex` constructor.
    :returns: :class:`FileIndex`
    """

    from whoosh.filedb.filestore import FileStorage

    name = indexname
    if name is None:
        name = _DEF_INDEX_NAME
    return FileIndex(FileStorage(dirname, readonly=readonly),
                     schema=schema, indexname=name)
def exists_in(dirname, indexname=None):
    """Tells whether a directory contains a Whoosh index.

    :param dirname: the file path of a directory.
    :param indexname: the name of the index. If None, the default index name
        is used.
    :returns: True if the directory exists and an index with a generation
        greater than -1 can be opened in it, otherwise False.
    """

    # A missing path cannot contain an index
    if not os.path.exists(dirname):
        return False

    try:
        index = open_dir(dirname, indexname=indexname)
        found = index.latest_generation() > -1
    except EmptyIndexError:
        # Opening failed because there is no index under this name
        found = False
    return found
def exists(storage, indexname=None):
    """Deprecated; call ``storage.index_exists()`` directly instead.

    :param storage: a store.Storage object.
    :param indexname: the name of the index; handed to
        ``storage.index_exists`` unchanged (including None).
    :returns: whatever ``storage.index_exists(indexname)`` returns.
    """

    found = storage.index_exists(indexname)
    return found
def version_in(dirname, indexname=None):
    """Reports the versions recorded for the index stored in a directory.

    The result is a tuple of (release_version, format_version).
    release_version is the release version number of the Whoosh code that
    created the index -- e.g. (0, 1, 24) -- and format_version is the version
    number of the on-disk format used for the index -- e.g. -102.

    Avoid attaching significance to the second number (the index version).
    It is only a version number for the TOC file and probably should not have
    been exposed in a public interface. The most reliable way to find out
    whether this version of Whoosh can open an index is to try to open it and
    see whether a ``whoosh.index.IndexVersionError`` exception is raised.

    The same values are available on an Index object as ``Index.release`` and
    ``Index.version``.

    :param dirname: the file path of a directory containing an index.
    :param indexname: the name of the index. If None, the default index name
        is used.
    :returns: ((major_ver, minor_ver, build_ver), format_ver)
    """

    from whoosh.filedb.filestore import FileStorage

    # Delegate to the storage-based variant
    return version(FileStorage(dirname), indexname=indexname)
def version(storage, indexname=None):
    """Reports the versions recorded for an index in a storage object.

    The result is a tuple of (release_version, format_version).
    release_version is the release version number of the Whoosh code that
    created the index -- e.g. (0, 1, 24) -- and format_version is the version
    number of the on-disk format used for the index -- e.g. -102.

    Avoid attaching significance to the second number (the index version).
    It is only a version number for the TOC file and probably should not have
    been exposed in a public interface. The most reliable way to find out
    whether this version of Whoosh can open an index is to try to open it and
    see whether a ``whoosh.index.IndexVersionError`` exception is raised.

    The same values are available on an Index object as ``Index.release`` and
    ``Index.version``.

    :param storage: a store.Storage object.
    :param indexname: the name of the index. If None, the default index name
        is used.
    :returns: ((major_ver, minor_ver, build_ver), format_ver). When opening
        the index raises ``IndexVersionError``, the release part is None and
        the format part is the version carried by the exception.
    """

    name = indexname
    if name is None:
        name = _DEF_INDEX_NAME

    try:
        opened = storage.open_index(name)
        return (opened.release, opened.version)
    except IndexVersionError:
        # The index is unreadable by this code, but the exception still
        # carries the format version that was found
        error = sys.exc_info()[1]
        return (None, error.version)
- # Index base class
class Index(object):
    """Base class representing an indexed collection of documents.

    :meth:`reader`, :meth:`writer` and :meth:`is_empty` raise
    ``NotImplementedError`` here and must be supplied by a concrete backend;
    the remaining methods are defaults built on top of them.
    """

    def close(self):
        """Closes any open resources held by the Index object itself. This
        may not close all resources being used everywhere, for example by a
        Searcher object. The base implementation does nothing.
        """

    def add_field(self, fieldname, fieldspec):
        """Adds a field to the index's schema by opening a writer, adding the
        field through it, and committing.

        :param fieldname: the name of the field to add.
        :param fieldspec: an instantiated :class:`whoosh.fields.FieldType`
            object.
        """

        writer = self.writer()
        writer.add_field(fieldname, fieldspec)
        writer.commit()

    def remove_field(self, fieldname):
        """Removes the named field from the index's schema by opening a
        writer, removing the field through it, and committing.

        Depending on the backend implementation, this may or may not actually
        remove existing data for the field from the index. Optimizing the
        index should always clear out existing data for a removed field.

        :param fieldname: the name of the field to remove.
        """

        writer = self.writer()
        writer.remove_field(fieldname)
        writer.commit()

    def latest_generation(self):
        """Returns the generation number of the latest generation of this
        index, or -1 if the backend doesn't support versioning. The base
        implementation always returns -1.
        """

        return -1

    def refresh(self):
        """Returns a new Index object representing the latest generation
        of this index (if this object is the latest generation, or the
        backend doesn't support versioning, returns self). The base
        implementation always returns self.

        :returns: :class:`Index`
        """

        return self

    def up_to_date(self):
        """Returns True if this object represents the latest generation of
        this index, and False if someone else has updated the index since
        this object was opened. The base implementation always returns True.
        """

        return True

    def last_modified(self):
        """Returns the last modified time of the index, or -1 if the backend
        doesn't support last-modified times. The base implementation always
        returns -1.
        """

        return -1

    def is_empty(self):
        """Returns True if this index is empty (that is, it has never had any
        documents successfully written to it). Must be implemented by
        subclasses.
        """

        raise NotImplementedError

    def optimize(self):
        """Optimizes this index, if necessary. The base implementation does
        nothing.
        """

    def doc_count_all(self):
        """Returns the total number of documents, DELETED OR UNDELETED,
        in this index. A reader is opened for the call and closed afterwards.
        """

        rdr = self.reader()
        try:
            total = rdr.doc_count_all()
        finally:
            rdr.close()
        return total

    def doc_count(self):
        """Returns the total number of UNDELETED documents in this index. A
        reader is opened for the call and closed afterwards.
        """

        rdr = self.reader()
        try:
            total = rdr.doc_count()
        finally:
            rdr.close()
        return total

    def searcher(self, **kwargs):
        """Returns a Searcher object for this index, built on a fresh reader.
        Keyword arguments are passed to the Searcher object's constructor.

        :rtype: :class:`whoosh.searching.Searcher`
        """

        from whoosh.searching import Searcher

        rdr = self.reader()
        return Searcher(rdr, fromindex=self, **kwargs)

    def field_length(self, fieldname):
        """Returns the total length of the field across all documents. A
        reader is opened for the call and closed afterwards.

        :param fieldname: the name of the field to measure.
        """

        rdr = self.reader()
        try:
            length = rdr.field_length(fieldname)
        finally:
            rdr.close()
        return length

    def max_field_length(self, fieldname):
        """Returns the maximum length of the field across all documents. A
        reader is opened for the call and closed afterwards.

        :param fieldname: the name of the field to measure.
        """

        rdr = self.reader()
        try:
            length = rdr.max_field_length(fieldname)
        finally:
            rdr.close()
        return length

    def reader(self, reuse=None):
        """Returns an IndexReader object for this index. Must be implemented
        by subclasses.

        :param reuse: an existing reader. Some implementations may recycle
            resources from this existing reader to create the new reader.
            Note that any resources in the "recycled" reader that are not
            used by the new reader will be CLOSED, so you CANNOT use it
            afterward.
        :rtype: :class:`whoosh.reading.IndexReader`
        """

        raise NotImplementedError

    def writer(self, **kwargs):
        """Returns an IndexWriter object for this index. Must be implemented
        by subclasses.

        :rtype: :class:`whoosh.writing.IndexWriter`
        """

        raise NotImplementedError

    def delete_by_term(self, fieldname, text, searcher=None):
        """Opens a writer, calls its ``delete_by_term`` method with the given
        arguments, and commits.

        :param fieldname: forwarded to the writer's ``delete_by_term``.
        :param text: forwarded to the writer's ``delete_by_term``.
        :param searcher: forwarded to the writer as the ``searcher`` keyword
            argument.
        """

        writer = self.writer()
        writer.delete_by_term(fieldname, text, searcher=searcher)
        writer.commit()

    def delete_by_query(self, q, searcher=None):
        """Opens a writer, calls its ``delete_by_query`` method with the
        given arguments, and commits.

        :param q: forwarded to the writer's ``delete_by_query``.
        :param searcher: forwarded to the writer as the ``searcher`` keyword
            argument.
        """

        writer = self.writer()
        writer.delete_by_query(q, searcher=searcher)
        writer.commit()
- # Codec-based index implementation
def clean_files(storage, indexname, gen, segments):
    """Attempts to remove index files that are no longer in use (called when
    a new generation is created).

    If existing Index and/or reader objects have the files open, they may not
    be deleted immediately (i.e. on Windows) but will probably be deleted
    eventually by a later call to clean_files.

    :param storage: the storage object; iterated for file names and asked to
        delete the stale ones.
    :param indexname: the name of the index whose files are examined.
    :param gen: the generation number to keep; TOC files of any other
        generation are removed.
    :param segments: the segment objects to keep; segment files whose name
        is not among their ``segment_id()`` values are removed.
    """

    live_segments = set(seg.segment_id() for seg in segments)
    toc_re = TOC._pattern(indexname)
    seg_re = TOC._segment_pattern(indexname)

    # First pass: decide which files are stale, without touching storage
    stale = set()
    for fname in storage:
        # Dot-files are never considered part of the index
        if fname.startswith("."):
            continue

        toc_match = toc_re.match(fname)
        seg_match = seg_re.match(fname)
        if toc_match:
            is_stale = int(toc_match.group(1)) != gen
        elif seg_match:
            is_stale = seg_match.group(1) not in live_segments
        else:
            is_stale = False

        if is_stale:
            stale.add(fname)

    # Second pass: best-effort deletion
    for fname in stale:
        try:
            storage.delete_file(fname)
        except OSError:
            # Another process still has this file open, I guess
            pass
class FileIndex(Index):
    """:class:`Index` implementation backed by a storage object. The current
    state of the index (schema, segments, generation, versions) is re-read
    from the latest TOC file through :class:`TOC` whenever it is needed.

    add_field, remove_field, refresh, up_to_date and searcher are inherited
    unchanged from :class:`Index`.
    """

    def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME):
        """Opens the named index in the given storage.

        :param storage: a ``whoosh.filedb.filestore.Storage`` instance.
        :param schema: optional schema; when truthy it is normalized with
            ``ensure_schema`` and passed along every time the TOC is read.
        :param indexname: the name of the index; must be a string.
        :raises ValueError: if ``storage`` is not a Storage object or
            ``indexname`` is not a string.
        """

        from whoosh.filedb.filestore import Storage

        if not isinstance(storage, Storage):
            raise ValueError("%r is not a Storage object" % storage)
        if not isinstance(indexname, string_type):
            raise ValueError("indexname %r is not a string" % indexname)

        if schema:
            checked_schema = ensure_schema(schema)
        else:
            checked_schema = schema

        self.storage = storage
        self._schema = checked_schema
        self.indexname = indexname

        # Try reading the TOC to see if it's possible; any error raised by
        # TOC.read propagates to the caller
        TOC.read(self.storage, self.indexname, schema=self._schema)

    @classmethod
    def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
        """Writes a fresh TOC for ``indexname`` into ``storage`` via
        ``TOC.create`` and returns an instance of this class opened on it.
        """

        TOC.create(storage, schema, indexname)
        return cls(storage, schema, indexname)

    def __repr__(self):
        parts = (self.__class__.__name__, self.storage, self.indexname)
        return "%s(%r, %r)" % parts

    def close(self):
        """Does nothing; this object keeps no open resources of its own."""

    def latest_generation(self):
        """Returns the highest generation number found among this index's
        TOC files in storage, as computed by ``TOC._latest_generation``.
        """

        return TOC._latest_generation(self.storage, self.indexname)

    def last_modified(self):
        """Returns what the storage reports as the modification time of the
        latest generation's TOC file.
        """

        newest = self.latest_generation()
        tocname = TOC._filename(self.indexname, newest)
        return self.storage.file_modified(tocname)

    def is_empty(self):
        """Returns True if the current TOC lists no segments."""

        segments = self._read_toc().segments
        return len(segments) == 0

    def optimize(self, **kwargs):
        """Opens a writer (keyword arguments are passed to :meth:`writer`)
        and commits it with ``optimize=True``.
        """

        writer = self.writer(**kwargs)
        writer.commit(optimize=True)

    def writer(self, procs=1, **kwargs):
        """Returns a writer for this index.

        :param procs: when greater than 1, a ``whoosh.multiproc.MpWriter``
            is created with this value as its ``procs`` argument; otherwise
            a ``whoosh.writing.SegmentWriter`` is created.
        :param kwargs: passed to the writer's constructor.
        """

        if procs > 1:
            from whoosh.multiproc import MpWriter

            return MpWriter(self, procs=procs, **kwargs)

        from whoosh.writing import SegmentWriter

        return SegmentWriter(self, **kwargs)

    def lock(self, name):
        """Returns a lock object that you can try to call acquire() on to
        lock the index. The lock is obtained from the storage under the name
        ``<indexname>_<name>``.
        """

        lockname = self.indexname + "_" + name
        return self.storage.lock(lockname)

    def _read_toc(self):
        # Reads the latest TOC for this index from storage
        return TOC.read(self.storage, self.indexname, schema=self._schema)

    def _segments(self):
        # Segment list of the latest TOC
        toc = self._read_toc()
        return toc.segments

    def _current_schema(self):
        # Schema of the latest TOC
        toc = self._read_toc()
        return toc.schema

    @property
    def schema(self):
        """The schema from the latest TOC (re-read on every access)."""
        return self._current_schema()

    @property
    def release(self):
        """The release recorded in the latest TOC (re-read on every access).
        """
        toc = self._read_toc()
        return toc.release

    @property
    def version(self):
        """The format version recorded in the latest TOC (re-read on every
        access).
        """
        toc = self._read_toc()
        return toc.version

    @classmethod
    def _reader(cls, storage, schema, segments, generation, reuse=None):
        # Returns a reader for the given segments, possibly reusing already
        # opened readers taken from ``reuse``. Any reader left in the
        # "reusable" dictionary when this method exits is closed.

        from whoosh.reading import SegmentReader, MultiReader, EmptyReader

        reusable = {}
        try:
            if len(segments) == 0:
                # This index has no segments! Return an EmptyReader object,
                # which simply returns empty or zero to every method
                return EmptyReader(schema)

            if reuse:
                # Put all atomic readers in a dictionary keyed by their
                # generation, so we can re-use them if possible.
                # NOTE(review): the dictionary is keyed by generation() but
                # is looked up by segment_id() in open_segment() below.
                # Unless those values coincide, no reader is ever reused and
                # every leaf reader of ``reuse`` gets closed in the finally
                # clause -- confirm the intent before changing this.
                leaves = [leaf for leaf, _ in reuse.leaf_readers()]
                reusable = dict((leaf.generation(), leaf) for leaf in leaves)

            # Opens a reader for one segment, taking it out of "reusable"
            # when a matching entry exists so it is not closed on exit
            def open_segment(segment):
                key = segment.segment_id()
                if key in reusable:
                    return reusable.pop(key)
                return SegmentReader(storage, schema, segment,
                                     generation=generation)

            if len(segments) == 1:
                # This index has one segment, so return a SegmentReader
                # object for the segment
                return open_segment(segments[0])

            # This index has multiple segments, so create a list of
            # SegmentReaders for the segments, then composite them with a
            # MultiReader
            subreaders = [open_segment(segment) for segment in segments]
            return MultiReader(subreaders, generation=generation)
        finally:
            for leftover in reusable.values():
                leftover.close()

    def reader(self, reuse=None):
        """Returns a reader for the latest generation of this index.

        The TOC is read and a reader is built from it. If that raises
        ``IOError`` the whole operation is retried, up to 10 attempts in
        total with a 0.05 second pause between attempts; the last error is
        re-raised when every attempt fails.

        :param reuse: an existing reader whose leaf readers may be recycled;
            see :meth:`Index.reader`.
        """

        max_attempts = 10
        for attempt in range(1, max_attempts + 1):
            try:
                # Read the information from the TOC file
                toc = self._read_toc()
                return self._reader(self.storage, toc.schema, toc.segments,
                                    toc.generation, reuse=reuse)
            except IOError:
                # Presume that we got a "file not found error" because a
                # writer deleted one of the files just as we were trying to
                # open it, and so retry a few times before actually raising
                # the exception
                error = sys.exc_info()[1]
                if attempt >= max_attempts:
                    raise error
                sleep(0.05)
- # TOC class
class TOC(object):
    """Object representing the state of the index after a commit. Essentially
    a container for the index's schema and the list of segment objects, plus
    the class methods that locate, read and write TOC files in a storage.
    """

    def __init__(self, schema, segments, generation,
                 version=_CURRENT_TOC_VERSION, release=__version__):
        """
        :param schema: the index's schema.
        :param segments: the list of segment objects.
        :param generation: the generation number of this TOC.
        :param version: the TOC format version number.
        :param release: the Whoosh release version tuple.
        """

        self.schema = schema
        self.segments = segments
        self.generation = generation
        self.version = version
        self.release = release

    @classmethod
    def _filename(cls, indexname, gen):
        # Name of the TOC file for the given index name and generation
        return "_%s_%s.toc" % (indexname, gen)

    @classmethod
    def _pattern(cls, indexname):
        # Compiled regex matching this index's TOC file names; group 1 is
        # the generation number. The index name is escaped and the dot is
        # matched literally so only genuine "_<name>_<gen>.toc" names match.
        return re.compile("^_%s_([0-9]+)[.]toc$" % re.escape(indexname))

    @classmethod
    def _segment_pattern(cls, indexname):
        # Compiled regex matching this index's segment file names; group 1
        # is the "<name>_<id>" part before the first dot. The index name is
        # escaped so regex metacharacters in it are matched literally.
        return re.compile("(%s_[0-9a-z]+)[.][A-Za-z0-9_.]+"
                          % re.escape(indexname))

    @classmethod
    def _latest_generation(cls, storage, indexname):
        """Returns the highest generation number among the TOC files for
        ``indexname`` in ``storage``, or -1 if there are none.
        """

        pattern = cls._pattern(indexname)
        mx = -1
        for filename in storage:
            m = pattern.match(filename)
            if m:
                mx = max(int(m.group(1)), mx)
        return mx

    @classmethod
    def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
        """Deletes every file in ``storage`` whose name starts with
        ``_<indexname>_`` and writes a generation 0 TOC with no segments.

        :param storage: the storage object to write into.
        :param schema: the schema for the new index (normalized with
            ``ensure_schema``).
        :param indexname: the name of the index to create.
        """

        schema = ensure_schema(schema)

        # Clear existing files. The names are collected before anything is
        # deleted (as clean_files does) so the storage is never modified
        # while it is being iterated.
        prefix = "_%s_" % indexname
        doomed = [filename for filename in storage
                  if filename.startswith(prefix)]
        for filename in doomed:
            storage.delete_file(filename)

        # Write a TOC file with an empty list of segments
        toc = cls(schema, [], 0)
        toc.write(storage, indexname)

    @classmethod
    def _read_preamble(cls, stream):
        """Reads and validates the sanity-check header of a TOC stream.

        :returns: a ``(toc_version, release)`` tuple, where ``release`` is a
            tuple of three numbers, e.g. (2, 4, 1).
        :raises IndexError: (this module's ``IndexError``) if a recorded
            data type size differs from this platform's, or the marker
            integer is misread.
        """

        # Check that the number of bytes per data type are the same on this
        # platform as where the index was created, otherwise it'll be crazy
        def check_size(name, target):
            sz = stream.read_varint()
            if sz != target:
                raise IndexError("Index was created on different architecture:"
                                 " saved %s = %s, this computer = %s"
                                 % (name, sz, target))

        check_size("int", _INT_SIZE)
        check_size("long", _LONG_SIZE)
        check_size("float", _FLOAT_SIZE)

        # A known constant written by _write_preamble; reading back anything
        # else means the bytes are being interpreted differently
        if stream.read_int() != -12345:
            raise IndexError("Number misread: byte order problem")

        # Index format version
        toc_version = stream.read_int()
        # Whoosh release version, e.g. (2, 4, 1)
        release = (stream.read_varint(), stream.read_varint(),
                   stream.read_varint())
        return toc_version, release

    @classmethod
    def _write_preamble(cls, stream):
        """Writes the header that :meth:`_read_preamble` validates: the three
        data type sizes, the marker integer, the current TOC format version,
        and the first three components of the running Whoosh version.
        """

        stream.write_varint(_INT_SIZE)
        stream.write_varint(_LONG_SIZE)
        stream.write_varint(_FLOAT_SIZE)
        stream.write_int(-12345)
        stream.write_int(_CURRENT_TOC_VERSION)
        for num in __version__[:3]:
            stream.write_varint(num)

    @classmethod
    def read(cls, storage, indexname, gen=None, schema=None):
        """Reads a TOC file from storage and returns it as a :class:`TOC`.

        :param storage: the storage object to read from.
        :param indexname: the name of the index.
        :param gen: the generation to read; if None the latest generation
            found in storage is used.
        :param schema: optional schema, handed to the loader function.
        :raises EmptyIndexError: if ``gen`` is None and storage holds no TOC
            file for ``indexname``.
        :raises IndexVersionError: if the TOC has a format version that is
            neither current nor covered by ``toc_loaders``.
        :raises Exception: if the checksum stored in the file does not match
            the checksum of the data read.
        """

        if gen is None:
            gen = cls._latest_generation(storage, indexname)
            if gen < 0:
                raise EmptyIndexError("Index %r does not exist in %r"
                                      % (indexname, storage))

        tocfilename = cls._filename(indexname, gen)
        stream = storage.open_file(tocfilename)
        stream = ChecksumFile(stream)
        try:
            # Do general sanity checks at the beginning and read the version
            # numbers
            toc_version, release = cls._read_preamble(stream)

            if toc_version == _CURRENT_TOC_VERSION:
                loader = cls._read_info
            elif toc_version in toc_loaders:
                # There's a backwards-compatible loader function for this
                # version, use it to load the rest of the TOC
                loader = toc_loaders[toc_version]
            else:
                raise IndexVersionError("Can't read format %s" % toc_version,
                                        toc_version)
            schema, segments = loader(stream, gen, schema, toc_version)

            # The checksum of everything read so far must equal the value
            # stored at the end of the file
            file_check = stream.checksum()
            orig_check = stream.read_uint()
            if file_check != orig_check:
                raise Exception("TOC checksum does not match %d != %d"
                                % (file_check, orig_check))
        finally:
            # Close the file on failure as well as on success, so a bad or
            # unreadable TOC does not leave an open file handle behind
            stream.close()

        return cls(schema, segments, gen, version=toc_version, release=release)

    @classmethod
    def _read_info(cls, stream, gen, schema, version):
        """Reads the schema and the segment list from a TOC stream in the
        current format. ``gen`` and ``version`` are accepted so the signature
        matches how loaders are called from :meth:`read`; they are not used
        here.

        :returns: a ``(schema, segments)`` tuple. If a truthy ``schema`` was
            passed in, it is returned instead of the stored one.
        """

        # NOTE: the schema and segments are stored as pickles, and unpickling
        # data from an untrusted source is unsafe -- only open indexes that
        # come from a trusted location.

        # Read the pickled schema bytes (always, to advance the stream)
        pick = stream.read_string()
        # If the user passed a schema, use it, otherwise unpickle the schema
        # we just read
        if not schema:
            schema = loads(pick)

        # Read the list of segments
        numsegments = stream.read_varint()
        segments = []
        for _ in xrange(numsegments):
            # Each segment pickle is preceded by its type name, which is
            # not needed here but must be consumed to reach the pickle
            stream.read_string()
            segments.append(loads(stream.read_string()))
        return schema, segments

    def write(self, storage, indexname):
        """Writes this TOC to storage as the TOC file for ``indexname`` and
        this object's generation.

        The data is written to a temporary file which is then renamed to the
        final name. If writing fails, the stream is closed, the error
        propagates, and no rename takes place.

        :param storage: the storage object to write into.
        :param indexname: the name of the index.
        """

        schema = ensure_schema(self.schema)
        schema.clean()

        # Use a temporary file for atomic write
        tocfilename = self._filename(indexname, self.generation)
        tempfilename = '%s.%s' % (tocfilename, time())
        stream = storage.create_file(tempfilename)
        stream = ChecksumFile(stream)
        try:
            # Write the sanity checks and version numbers
            self._write_preamble(stream)

            # Write pickles as strings to allow them to be skipped
            stream.write_string(dumps(schema, -1))
            # Write the list of segments
            stream.write_varint(len(self.segments))
            for segment in self.segments:
                # Write the segment's module and class name before the
                # pickle to possibly allow later versions to load the
                # segment differently based on the class (for backwards
                # compatibility)
                segtype = segment.__class__
                typename = "%s.%s" % (segtype.__module__, segtype.__name__)
                stream.write_string(typename.encode("latin1"))
                stream.write_string(dumps(segment, -1))

            # Trailing checksum, verified by read()
            stream.write_uint(stream.checksum())
        finally:
            # Always release the file handle, even if a write failed
            stream.close()

        storage.rename_file(tempfilename, tocfilename, safe=True)