fileindex.py | searchcode

/lib/whoosh/filedb/fileindex.py

https://bitbucket.org/xkjq/wikidpad_svn
Python | 452 lines | 327 code | 71 blank | 54 comment | 37 complexity | be39e493b9ad7461ccfffd8cef1e9653 MD5 | raw file
Possible License(s): LGPL-2.1

#===============================================================================

# Copyright 2009 Matt Chaput

# 

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

# 

#    http://www.apache.org/licenses/LICENSE-2.0

# 

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

#===============================================================================



import cPickle, os, re

from bisect import bisect_right

from time import time

from threading import Lock



from whoosh import __version__

from whoosh.fields import Schema

from whoosh.index import Index

from whoosh.index import EmptyIndexError, IndexVersionError

from whoosh.index import _DEF_INDEX_NAME

from whoosh.store import Storage, LockError

from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE





_INDEX_VERSION = -110





# TOC read/write functions



def _toc_filename(indexname, gen):

    return "_%s_%s.toc" % (indexname, gen)



def _toc_pattern(indexname):

    """Returns a regular expression object that matches TOC filenames.

    name is the name of the index.

    """



    return re.compile("^_%s_([0-9]+).toc$" % indexname)



def _segment_pattern(indexname):

    """Returns a regular expression object that matches segment filenames.

    name is the name of the index.

    """



    return re.compile("(_%s_[0-9]+).(%s)" % (indexname,

                                             Segment.EXTENSIONS.values()))





def _latest_generation(storage, indexname):

    pattern = _toc_pattern(indexname)



    max = -1

    for filename in storage:

        m = pattern.match(filename)

        if m:

            num = int(m.group(1))

            if num > max: max = num

    return max





def _create_index(storage, schema, indexname=_DEF_INDEX_NAME):

    # Clear existing files

    prefix = "_%s_" % indexname

    for filename in storage:

        if filename.startswith(prefix):

            storage.delete_file(filename)

    

    # Write a TOC file with an empty list of segments

    _write_toc(storage, schema, indexname, 0, 0, [])





def _write_toc(storage, schema, indexname, gen, segment_counter, segments):

    schema.clean()



    # Use a temporary file for atomic write.

    tocfilename = _toc_filename(indexname, gen)

    tempfilename = '%s.%s' % (tocfilename, time())

    stream = storage.create_file(tempfilename)



    stream.write_varint(_INT_SIZE)

    stream.write_varint(_LONG_SIZE)

    stream.write_varint(_FLOAT_SIZE)

    stream.write_int(-12345)



    stream.write_int(_INDEX_VERSION)

    for num in __version__[:3]:

        stream.write_varint(num)



    stream.write_string(cPickle.dumps(schema, -1))

    stream.write_int(gen)

    stream.write_int(segment_counter)

    stream.write_pickle(segments)

    stream.close()



    # Rename temporary file to the proper filename

    storage.rename_file(tempfilename, tocfilename, safe=True)





class Toc(object):

    def __init__(self, **kwargs):

        for name, value in kwargs.iteritems():

            setattr(self, name, value)

        



def _read_toc(storage, schema, indexname):

    gen = _latest_generation(storage, indexname)

    if gen < 0:

        raise EmptyIndexError("Index %r does not exist in %r" % (indexname, storage))

    

    # Read the content of this index from the .toc file.

    tocfilename = _toc_filename(indexname, gen)

    stream = storage.open_file(tocfilename)



    def check_size(name, target):

        sz = stream.read_varint()

        if sz != target:

            raise IndexError("Index was created on different architecture:"

                             " saved %s = %s, this computer = %s" % (name, sz, target))



    check_size("int", _INT_SIZE)

    check_size("long", _LONG_SIZE)

    check_size("float", _FLOAT_SIZE)



    if not stream.read_int() == -12345:

        raise IndexError("Number misread: byte order problem")



    version = stream.read_int()

    if version != _INDEX_VERSION:

        raise IndexVersionError("Can't read format %s" % version, version)

    release = (stream.read_varint(), stream.read_varint(), stream.read_varint())

    

    # If the user supplied a schema object with the constructor, don't load

    # the pickled schema from the saved index.

    if schema:

        stream.skip_string()

    else:

        schema = cPickle.loads(stream.read_string())

    

    # Generation

    index_gen = stream.read_int()

    assert gen == index_gen

    

    segment_counter = stream.read_int()

    segments = stream.read_pickle()

    

    stream.close()

    return Toc(version=version, release=release, schema=schema,

               segment_counter=segment_counter, segments=segments,

               generation=gen)





def _next_segment_name(self):

    #Returns the name of the next segment in sequence.

    if self.segment_num_lock is None:

        self.segment_num_lock = Lock()

    

    if self.segment_num_lock.acquire():

        try:

            self.segment_counter += 1

            return 

        finally:

            self.segment_num_lock.release()

    else:

        raise LockError





def _clean_files(storage, indexname, gen, segments):

    # Attempts to remove unused index files (called when a new generation

    # is created). If existing Index and/or reader objects have the files

    # open, they may not be deleted immediately (i.e. on Windows) but will

    # probably be deleted eventually by a later call to clean_files.



    current_segment_names = set(s.name for s in segments)



    tocpattern = _toc_pattern(indexname)

    segpattern = _segment_pattern(indexname)



    todelete = set()

    for filename in storage:

        tocm = tocpattern.match(filename)

        segm = segpattern.match(filename)

        if tocm:

            if int(tocm.group(1)) != gen:

                todelete.add(filename)

        elif segm:

            name = segm.group(1)

            if name not in current_segment_names:

                todelete.add(filename)

    

    for filename in todelete:

        try:

            storage.delete_file(filename)

        except OSError:

            # Another process still has this file open

            pass





# Index placeholder object



class FileIndex(Index):

    def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME):

        if not isinstance(storage, Storage):

            raise ValueError("%r is not a Storage object" % storage)

        if schema is not None and not isinstance(schema, Schema):

            raise ValueError("%r is not a Schema object" % schema)

        if not isinstance(indexname, (str, unicode)):

            raise ValueError("indexname %r is not a string" % indexname)

        

        self.storage = storage

        self._schema = schema

        self.indexname = indexname

        

        # Try reading the TOC to see if it's possible

        _read_toc(self.storage, self._schema, self.indexname)



    def __repr__(self):

        return "%s(%r, %r)" % (self.__class__.__name__,

                               self.storage, self.indexname)



    def close(self):

        pass



    # add_field

    # remove_field

    

    def latest_generation(self):

        return _latest_generation(self.storage, self.indexname)

    

    # refresh

    # up_to_date

    

    def last_modified(self):

        gen = self.latest_generation()

        filename = _toc_filename(self.indexname, gen)

        return self.storage.file_modified(filename)



    def is_empty(self):

        info = _read_toc(self.storage, self.schema, self.indexname)

        return len(info.segments) == 0

    

    def optimize(self):

        w = self.writer()

        w.commit(optimize=True)



    # searcher

    

    def writer(self, **kwargs):

        from whoosh.filedb.filewriting import SegmentWriter

        return SegmentWriter(self, **kwargs)



    def lock(self, name):

        """Returns a lock object that you can try to call acquire() on to

        lock the index.

        """

        

        return self.storage.lock(self.indexname + "_" + name)



    def _read_toc(self):

        return _read_toc(self.storage, self._schema, self.indexname)



    def _segments(self):

        return self._read_toc().segments

    

    def _current_schema(self):

        return self._read_toc().schema

    

    @property

    def schema(self):

        return self._current_schema()



    def reader(self):

        lock = self.lock("READLOCK")

        lock.acquire(True)

        try:

            info = self._read_toc()

            

            from whoosh.filedb.filereading import SegmentReader

            if len(info.segments) == 0:

                from whoosh.reading import EmptyReader

                return EmptyReader(info.schema)

            elif len(info.segments) == 1:

                return SegmentReader(self.storage, info.schema,

                                     info.segments[0], info.generation)

            else:

                from whoosh.reading import MultiReader

                readers = [SegmentReader(self.storage, info.schema, segment, -2)

                           for segment in info.segments]

                return MultiReader(readers, info.generation)

        finally:

            lock.release()





class Segment(object):

    """Do not instantiate this object directly. It is used by the Index object

    to hold information about a segment. A list of objects of this class are

    pickled as part of the TOC file.

    

    The TOC file stores a minimal amount of information -- mostly a list of

    Segment objects. Segments are the real reverse indexes. Having multiple

    segments allows quick incremental indexing: just create a new segment for

    the new documents, and have the index overlay the new segment over previous

    ones for purposes of reading/search. "Optimizing" the index combines the

    contents of existing segments into one (removing any deleted documents

    along the way).

    """



    EXTENSIONS = {"fieldlengths": "fln", "storedfields": "sto",

                  "termsindex": "trm", "termposts": "pst",

                  "vectorindex": "vec", "vectorposts": "vps"}

    

    def __init__(self, name, doccount, fieldlength_totals, fieldlength_maxes,

                 deleted=None):

        """

        :param name: The name of the segment (the Index object computes this

            from its name and the generation).

        :param doccount: The maximum document number in the segment.

        :param term_count: Total count of all terms in all documents.

        :param fieldlength_totals: A dictionary mapping field numbers to the

            total number of terms in that field across all documents in the

            segment.

        :param deleted: A set of deleted document numbers, or None if no

            deleted documents exist in this segment.

        """



        assert isinstance(name, basestring)

        assert isinstance(doccount, (int, long))

        assert fieldlength_totals is None or isinstance(fieldlength_totals, dict), "fl_totals=%r" % fieldlength_totals

        assert fieldlength_maxes is None or isinstance(fieldlength_maxes, dict), "fl_maxes=%r" % fieldlength_maxes

        

        self.name = name

        self.doccount = doccount

        self.fieldlength_totals = fieldlength_totals

        self.fieldlength_maxes = fieldlength_maxes

        self.deleted = deleted

        

        self._filenames = set()

        for attr, ext in self.EXTENSIONS.iteritems():

            fname = "%s.%s" % (self.name, ext)

            setattr(self, attr + "_filename", fname)

            self._filenames.add(fname)



    def __repr__(self):

        return "%s(%r)" % (self.__class__.__name__, self.name)



    def copy(self):

        if self.deleted:

            deleted = set(self.deleted)

        else:

            deleted = None

        return Segment(self.name, self.doccount, self.fieldlength_totals,

                       self.fieldlength_maxes, deleted)



    def filenames(self):

        return self._filenames



    def doc_count_all(self):

        """

        :returns: the total number of documents, DELETED OR UNDELETED, in this

            segment.

        """

        return self.doccount



    def doc_count(self):

        """

        :returns: the number of (undeleted) documents in this segment.

        """

        return self.doccount - self.deleted_count()



    def has_deletions(self):

        """

        :returns: True if any documents in this segment are deleted.

        """

        return self.deleted_count() > 0



    def deleted_count(self):

        """

        :returns: the total number of deleted documents in this segment.

        """

        if self.deleted is None: return 0

        return len(self.deleted)



    def field_length(self, fieldname, default=0):

        """Returns the total number of terms in the given field across all

        documents in this segment.

        """

        return self.fieldlength_totals.get(fieldname, default)



    def max_field_length(self, fieldname, default=0):

        """Returns the maximum length of the given field in any of the

        documents in the segment.

        """

        return self.fieldlength_maxes.get(fieldname, default)



    def delete_document(self, docnum, delete=True):

        """Deletes the given document number. The document is not actually

        removed from the index until it is optimized.



        :param docnum: The document number to delete.

        :param delete: If False, this undeletes a deleted document.

        """



        if delete:

            if self.deleted is None:

                self.deleted = set()

            elif docnum in self.deleted:

                raise KeyError("Document %s in segment %r is already deleted"

                               % (docnum, self.name))



            self.deleted.add(docnum)

        else:

            if self.deleted is None or docnum not in self.deleted:

                raise KeyError("Document %s is not deleted" % docnum)



            self.deleted.clear(docnum)



    def is_deleted(self, docnum):

        """:returns: True if the given document number is deleted."""



        if self.deleted is None: return False

        return docnum in self.deleted