
/bangkokhotel/lib/python2.5/site-packages/whoosh/filedb/filewriting.py

https://bitbucket.org/luisrodriguez/bangkokhotel
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from __future__ import with_statement
from bisect import bisect_right

from whoosh.fields import UnknownFieldError
from whoosh.store import LockError
from whoosh.support.filelock import try_for
from whoosh.support.externalsort import SortingPool
from whoosh.util import fib
from whoosh.writing import IndexWriter, IndexingError


# Merge policies

# A merge policy is a callable that takes the SegmentWriter object and the
# current segment list (not including the segment being written), and returns
# an updated segment list (not including the segment being written).

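# For illustration only (this helper is not part of the original module), a
# hypothetical custom policy with the same signature might merge just the
# segments that fall below a fixed document count and keep the rest:

def MERGE_UNDER_1000(writer, segments):
    """Illustrative policy: merge segments with fewer than 1000 documents
    into the current writer and leave larger segments unchanged.
    """

    from whoosh.filedb.filereading import SegmentReader

    newsegments = []
    for seg in segments:
        if seg.doc_count_all() < 1000:
            # Small segment: read it back and feed it into this writer
            reader = SegmentReader(writer.storage, writer.schema, seg)
            writer.add_reader(reader)
            reader.close()
        else:
            # Large segment: keep it in the index as-is
            newsegments.append(seg)
    return newsegments

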
def NO_MERGE(writer, segments):
    """This policy does not merge any existing segments.
    """
    return segments


def MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the fibonacci sequence.
    """

    from whoosh.filedb.filereading import SegmentReader

    newsegments = []
    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0
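    # Walk the segments from smallest to largest: the i-th segment is merged
    # into this writer as long as the running document total stays below
    # fib(i + 5); once the total passes that threshold, the remaining (larger)
    # segments are kept as they are.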
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count
            if total_docs < fib(i + 5):
                reader = SegmentReader(writer.storage, writer.schema, seg)
                writer.add_reader(reader)
                reader.close()
            else:
                newsegments.append(seg)
    return newsegments


def OPTIMIZE(writer, segments):
    """This policy merges all existing segments.
    """

    from whoosh.filedb.filereading import SegmentReader

    for seg in segments:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(reader)
        reader.close()
    return []


class PostingPool(SortingPool):
    # Subclass whoosh.support.externalsort.SortingPool to use knowledge of
    # postings to set run size in bytes instead of items

    def __init__(self, limitmb=128, **kwargs):
        SortingPool.__init__(self, **kwargs)
        self.limit = limitmb * 1024 * 1024
        self.currentsize = 0

    def add(self, item):
        # item = (fieldname, text, docnum, weight, valuestring)
        size = (28 + 4 * 5  # tuple = 28 + 4 * length
                + 21 + len(item[0])  # fieldname = str = 21 + length
                + 26 + len(item[1]) * 2  # text = unicode = 26 + 2 * length
                + 18  # docnum = long = 18
                + 16  # weight = float = 16
                + 21 + len(item[4] or ''))  # valuestring
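        # Rough per-posting memory estimate; for example, a posting like
        # ('title', u'hello', 3, 1.0, '') works out to roughly
        # 48 + 26 + 36 + 18 + 16 + 21 = 165 bytes under this scheme.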
        self.currentsize += size
        if self.currentsize > self.limit:
            self.save()
        self.current.append(item)

    def iter_postings(self):
        # This is just an alias for items() to be consistent with the
        # iter_postings()/add_postings() interface of a lot of other classes
        return self.items()

    def save(self):
        SortingPool.save(self)
        self.currentsize = 0


def renumber_postings(reader, startdoc, docmap):
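    # Yield the reader's postings with every document number remapped: through
    # docmap when one is given (renumbering around deleted documents),
    # otherwise by offsetting it by startdoc.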
    for fieldname, text, docnum, weight, value in reader.iter_postings():
        newdoc = docmap[docnum] if docmap else startdoc + docnum
        yield (fieldname, text, newdoc, weight, value)


# Writer object

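# A SegmentWriter is normally obtained through the index rather than
# constructed directly. A typical (illustrative) round trip looks like:
#
#     writer = ix.writer()
#     writer.add_document(title=u"First document", content=u"Hello there")
#     writer.commit()
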
class SegmentWriter(IndexWriter):
    def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True,
                 limitmb=128, docbase=0, codec=None, compound=True, **kwargs):
        # Lock the index
        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay):
                raise LockError

        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self.codec = codec

        # Get info from the index
        self.storage = ix.storage
        self.indexname = ix.indexname
        info = ix._read_toc()
        self.generation = info.generation + 1
        self.schema = info.schema
        self.segments = info.segments
        self.docnum = self.docbase = docbase
        self._setup_doc_offsets()

        # Internals
        self.compound = compound
        poolprefix = "whoosh_%s_" % self.indexname
        self.pool = PostingPool(limitmb=limitmb, prefix=poolprefix)
        newsegment = self.newsegment = codec.new_segment(self.storage,
                                                         self.indexname)
        self.is_closed = False
        self._added = False

        # Set up writers
        self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
        self.fieldwriter = codec.field_writer(self.storage, newsegment)

    def __repr__(self):
        return "<%s %r>" % (self.__class__.__name__, self.newsegment)

    def _setup_doc_offsets(self):
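        # Record the base (starting) document number of each existing segment
        # so a global document number can be mapped back to a particular
        # segment and a segment-local document number.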
        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

    def _check_state(self):
        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec, **kwargs):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs)

    def remove_field(self, fieldname):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)

    def _document_segment(self, docnum):
        # Returns the position in self.segments of the segment containing the
        # given document number.
        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        # Returns an (index.Segment, segment_docnum) pair for the segment
        # containing the given document number.

        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset

    def has_deletions(self):
        """
        Returns True if this index has documents that are marked deleted but
        haven't been optimized out of the index yet.
        """

        return any(s.has_deletions() for s in self.segments)

    def delete_document(self, docnum, delete=True):
        self._check_state()
        if docnum >= sum(seg.doccount for seg in self.segments):
            raise IndexingError("No document ID %r in this index" % docnum)
        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete=delete)

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in the index.
        """

        return sum(s.deleted_count() for s in self.segments)

    def is_deleted(self, docnum):
        segment, segdocnum = self._segment_and_docnum(docnum)
        return segment.is_deleted(segdocnum)

    def reader(self, reuse=None):
        from whoosh.filedb.fileindex import FileIndex

        self._check_state()
        return FileIndex._reader(self.storage, self.schema, self.segments,
                                 self.generation, reuse=reuse)

    def iter_postings(self):
        return self.pool.iter_postings()

    def add_postings(self, lengths, items, startdoc, docmap):
        # items = (fieldname, text, docnum, weight, valuestring) ...
        schema = self.schema

        # Make a generator to strip out deleted fields and renumber the docs
        # before passing them down to the field writer
        def gen():
            for fieldname, text, docnum, weight, valuestring in items:
                if fieldname not in schema:
                    continue
                if docmap is not None:
                    newdoc = docmap[docnum]
                else:
                    newdoc = startdoc + docnum
                yield (fieldname, text, newdoc, weight, valuestring)

        self.fieldwriter.add_postings(schema, lengths, gen())

    def _make_docmap(self, reader, newdoc):
        # If the reader has deletions, make a dictionary mapping the docnums
        # of undeleted documents to new sequential docnums starting at newdoc
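        # For example, if docnums 1 and 3 are deleted and newdoc is 10, the
        # docmap comes out as {0: 10, 2: 11, 4: 12, ...}.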
        hasdel = reader.has_deletions()
        if hasdel:
            docmap = {}
            for docnum in reader.all_doc_ids():
                if reader.is_deleted(docnum):
                    continue
                docmap[docnum] = newdoc
                newdoc += 1
        else:
            docmap = None
            newdoc += reader.doc_count_all()
        # Return the map and the new lowest unused document number
        return docmap, newdoc

    def _merge_per_doc(self, reader, docmap):
        schema = self.schema
        newdoc = self.docnum
        perdocwriter = self.perdocwriter
        sharedfields = set(schema.names()) & set(reader.schema.names())

        for docnum in reader.all_doc_ids():
            # Skip deleted documents
            if docmap and docnum not in docmap:
                continue
            # Renumber around deletions
            if docmap:
                newdoc = docmap[docnum]

            # Get the stored fields
            d = reader.stored_fields(docnum)
            # Start a new document in the writer
            perdocwriter.start_doc(newdoc)
            # For each field in the document, copy its stored value,
            # length, and vectors (if any) to the writer
            for fieldname in sharedfields:
                field = schema[fieldname]
                length = (reader.doc_field_length(docnum, fieldname, 0)
                          if field.scorable else 0)
                perdocwriter.add_field(fieldname, field, d.get(fieldname),
                                       length)
                if field.vector and reader.has_vector(docnum, fieldname):
                    v = reader.vector(docnum, fieldname)
                    perdocwriter.add_vector_matcher(fieldname, field, v)
            # Finish the new document
            perdocwriter.finish_doc()
            newdoc += 1

    def _merge_fields(self, reader, docmap):
        # Add inverted index postings to the pool, renumbering document number
        # references as necessary
        add_post = self.pool.add
        # Note: iter_postings() only yields postings for undeleted docs
        for p in renumber_postings(reader, self.docnum, docmap):
            add_post(p)

    def add_reader(self, reader):
        self._check_state()

        # Make a docnum map to renumber around deleted documents
        docmap, newdoc = self._make_docmap(reader, self.docnum)
        # Add per-document values
        self._merge_per_doc(reader, docmap)
        # Add field postings
        self._merge_fields(reader, docmap)

        self.docnum = newdoc
        self._added = True

    def _check_fields(self, schema, fieldnames):
        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("No field named %r in %s"
                                        % (name, schema))

    def add_document(self, **fields):
        self._check_state()
        perdocwriter = self.perdocwriter
        schema = self.schema
        docnum = self.docnum
        add_post = self.pool.add

        docboost = self._doc_boost(fields)
        fieldnames = sorted([name for name in fields.keys()
                             if not name.startswith("_")])
        self._check_fields(schema, fieldnames)

        perdocwriter.start_doc(docnum)
        # For each field...
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            length = 0
            if field.indexed:
                # TODO: Method for adding progressive field values, ie
                # setting start_pos/start_char?
                fieldboost = self._field_boost(fields, fieldname, docboost)
                # Ask the field to return a list of
                # (text, frequency, weight, valuestring) tuples
                items = field.index(value)
                # Only store the length if the field is marked scorable
                scorable = field.scorable
                # Add the terms to the pool
                for text, freq, weight, valuestring in items:
                    # assert text != ""
                    weight *= fieldboost
                    if scorable:
                        length += freq
                    add_post((fieldname, text, docnum, weight, valuestring))

            if field.separate_spelling():
                # For fields which use different tokens for spelling, insert
                # fake postings for the spellable words, where docnum=None
                # means "this is a spelling word"

                # TODO: think of something less hacktacular
                for text in field.spellable_words(value):
                    add_post((fieldname, text, None, None, None))

            vformat = field.vector
            if vformat:
                analyzer = field.analyzer
                vitems = sorted(vformat.word_values(value, analyzer,
                                                    mode="index"))
                perdocwriter.add_vector_items(fieldname, field, vitems)

            # Figure out what value to store for this field
            storedval = None
            if field.stored:
                storedkey = "_stored_%s" % fieldname
                if storedkey in fields:
                    storedval = fields.get(storedkey)
                else:
                    storedval = value

            # Add the stored value and length for this field to the per-
            # document writer
            perdocwriter.add_field(fieldname, field, storedval, length)
        perdocwriter.finish_doc()
        self._added = True
        self.docnum += 1

    def doc_count(self):
        return self.docnum - self.docbase

    def get_segment(self):
        newsegment = self.newsegment
        newsegment.doccount = self.doc_count()
        return newsegment

    def _merge_segments(self, mergetype, optimize, merge):
        if mergetype:
            pass
        elif optimize:
            mergetype = OPTIMIZE
        elif not merge:
            mergetype = NO_MERGE
        else:
            mergetype = MERGE_SMALL

        # Call the merge policy function. The policy may choose to merge
        # other segments into this writer's pool
        return mergetype(self, self.segments)

    def _flush_segment(self):
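        # Feed the pooled (externally sorted) postings to the field writer,
        # together with the per-document field lengths gathered so far.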
        lengths = self.perdocwriter.lengths_reader()
        postings = self.pool.iter_postings()
        self.fieldwriter.add_postings(self.schema, lengths, postings)

    def _close_segment(self):
        self.perdocwriter.close()
        self.fieldwriter.close()
        self.pool.cleanup()

    def _assemble_segment(self):
        if self.compound:
            # Assemble the segment files into a compound file
            newsegment = self.get_segment()
            newsegment.create_compound_file(self.storage)
            newsegment.compound = True

    def _commit_toc(self, segments):
        # Write a new TOC with the new segment list (and delete old files)
        self.codec.commit_toc(self.storage, self.indexname, self.schema,
                              segments, self.generation)

    def _finish(self):
        if self.writelock:
            self.writelock.release()
        self.is_closed = True
        #self.storage.close()

    def _partial_segment(self):
        # For use by a parent multiprocessing writer: Closes out the segment
        # but leaves the pool files intact so the parent can access them
        self._check_state()
        self.perdocwriter.close()
        self.fieldwriter.close()
        # Don't call self.pool.cleanup()! We want to grab the pool files.
        return self.get_segment()

    def commit(self, mergetype=None, optimize=False, merge=True):
        """Finishes writing and saves all additions and changes to disk.

        There are four possible ways to use this method::

            # Merge small segments but leave large segments, trying to
            # balance fast commits with fast searching:
            writer.commit()

            # Merge all segments into a single segment:
            writer.commit(optimize=True)

            # Don't merge any existing segments:
            writer.commit(merge=False)

            # Use a custom merge function
            writer.commit(mergetype=my_merge_function)

        :param mergetype: a custom merge function taking a Writer object and
            segment list as arguments, and returning a new segment list. If you
            supply a ``mergetype`` function, the values of the ``optimize`` and
            ``merge`` arguments are ignored.
        :param optimize: if True, all existing segments are merged with the
            documents you've added to this writer (and the value of the
            ``merge`` argument is ignored).
        :param merge: if False, do not merge small segments.
        """

        self._check_state()
        try:
            # Merge old segments if necessary
            finalsegments = self._merge_segments(mergetype, optimize, merge)
            if self._added:
                # Finish writing segment
                self._flush_segment()
                # Close segment files
                self._close_segment()
                # Assemble compound segment if necessary
                self._assemble_segment()

                # Add the new segment to the list of remaining segments
                # returned by the merge policy function
                finalsegments.append(self.get_segment())
            else:
                # Close segment files
                self._close_segment()
            # Write TOC
            self._commit_toc(finalsegments)
        finally:
            # Final cleanup
            self._finish()

    def cancel(self):
        self._check_state()
        self._close_segment()
        self._finish()


# Retroactively add spelling files to an existing index

def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), commit the writer after adding the
        spelling files, without merging any segments.
    """

    from whoosh.filedb.filereading import SegmentReader
    from whoosh.support import dawg

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        r = SegmentReader(storage, schema, segment)
        f = segment.create_file(storage, ".dag")
        gw = dawg.GraphWriter(f)
        for fieldname in fieldnames:
            gw.start_field(fieldname)
            for word in r.lexicon(fieldname):
                gw.insert(word)
            gw.finish_field()
        gw.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)