# whoosh/filedb/filewriting.py
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from __future__ import with_statement
from bisect import bisect_right

from whoosh.fields import UnknownFieldError
from whoosh.store import LockError
from whoosh.support.filelock import try_for
from whoosh.support.externalsort import SortingPool
from whoosh.util import fib
from whoosh.writing import IndexWriter, IndexingError


# Merge policies

# A merge policy is a callable that takes the SegmentWriter object and the
# current segment list (not including the segment being written), and returns
# an updated segment list (not including the segment being written).
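
# A custom policy can be passed to SegmentWriter.commit(mergetype=...). The
# sketch below is illustrative only (it is not part of the original module):
# it folds any segment under an arbitrary document-count threshold into the
# writer, mirroring the structure of MERGE_SMALL below.
#
#     def MERGE_UNDER_1000(writer, segments):
#         from whoosh.filedb.filereading import SegmentReader
#         kept = []
#         for seg in segments:
#             if seg.doc_count_all() < 1000:
#                 reader = SegmentReader(writer.storage, writer.schema, seg)
#                 writer.add_reader(reader)
#                 reader.close()
#             else:
#                 kept.append(seg)
#         return kept
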
def NO_MERGE(writer, segments):
    """This policy does not merge any existing segments.
    """
    return segments


def MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the Fibonacci sequence.
    """

    from whoosh.filedb.filereading import SegmentReader

    newsegments = []
    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count
            if total_docs < fib(i + 5):
                reader = SegmentReader(writer.storage, writer.schema, seg)
                writer.add_reader(reader)
                reader.close()
            else:
                newsegments.append(seg)
    return newsegments


def OPTIMIZE(writer, segments):
    """This policy merges all existing segments.
    """

    from whoosh.filedb.filereading import SegmentReader

    for seg in segments:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(reader)
        reader.close()
    return []


class PostingPool(SortingPool):
    # Subclass whoosh.support.externalsort.SortingPool to use knowledge of
    # postings to set run size in bytes instead of items

    def __init__(self, limitmb=128, **kwargs):
        SortingPool.__init__(self, **kwargs)
        self.limit = limitmb * 1024 * 1024
        self.currentsize = 0

    def add(self, item):
        # item = (fieldname, text, docnum, weight, valuestring)
        size = (28 + 4 * 5  # tuple = 28 + 4 * length
                + 21 + len(item[0])  # fieldname = str = 21 + length
                + 26 + len(item[1]) * 2  # text = unicode = 26 + 2 * length
                + 18  # docnum = long = 18
                + 16  # weight = float = 16
                + 21 + len(item[4] or ''))  # valuestring
        self.currentsize += size
        if self.currentsize > self.limit:
            self.save()
        self.current.append(item)

    def iter_postings(self):
        # This is just an alias for items() to be consistent with the
        # iter_postings()/add_postings() interface of a lot of other classes
        return self.items()

    def save(self):
        SortingPool.save(self)
        self.currentsize = 0

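
# Illustrative sketch (not from the original source) of how the writer uses
# this pool: postings are added one at a time while documents are indexed;
# when the estimated in-memory size passes limitmb, add() spills a sorted run
# to a temporary file, and iter_postings() later merges the runs back into a
# single sorted stream.
#
#     pool = PostingPool(limitmb=64, prefix="whoosh_example_")
#     pool.add(("content", u"hello", 0, 1.0, None))
#     pool.add(("content", u"world", 0, 1.0, None))
#     for posting in pool.iter_postings():
#         print(posting)
#     pool.cleanup()
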
def renumber_postings(reader, startdoc, docmap):
    for fieldname, text, docnum, weight, value in reader.iter_postings():
        newdoc = docmap[docnum] if docmap else startdoc + docnum
        yield (fieldname, text, newdoc, weight, value)


# Writer object

class SegmentWriter(IndexWriter):
    def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True,
                 limitmb=128, docbase=0, codec=None, compound=True, **kwargs):
        # Lock the index
        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay):
                raise LockError

        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self.codec = codec

        # Get info from the index
        self.storage = ix.storage
        self.indexname = ix.indexname
        info = ix._read_toc()
        self.generation = info.generation + 1
        self.schema = info.schema
        self.segments = info.segments
        self.docnum = self.docbase = docbase
        self._setup_doc_offsets()

        # Internals
        self.compound = compound
        poolprefix = "whoosh_%s_" % self.indexname
        self.pool = PostingPool(limitmb=limitmb, prefix=poolprefix)
        newsegment = self.newsegment = codec.new_segment(self.storage,
                                                         self.indexname)
        self.is_closed = False
        self._added = False

        # Set up writers
        self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
        self.fieldwriter = codec.field_writer(self.storage, newsegment)

    def __repr__(self):
        return "<%s %r>" % (self.__class__.__name__, self.newsegment)

    def _setup_doc_offsets(self):
        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

    def _check_state(self):
        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec, **kwargs):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs)

    def remove_field(self, fieldname):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)

    def _document_segment(self, docnum):
        # Returns the index (into self.segments) of the segment containing
        # the given document number.
        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        # Returns an (index.Segment, segment_docnum) pair for the segment
        # containing the given document number.
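        # For example, with doc offsets [0, 10, 25], global docnum 12 falls in
        # segment 1 (bisect_right returns 2, minus 1), so this returns
        # (self.segments[1], 12 - 10) == (segment, 2).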

        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset

    def has_deletions(self):
        """
        Returns True if this index has documents that are marked deleted but
        haven't been optimized out of the index yet.
        """

        return any(s.has_deletions() for s in self.segments)

    def delete_document(self, docnum, delete=True):
        self._check_state()
        if docnum >= sum(seg.doccount for seg in self.segments):
            raise IndexingError("No document ID %r in this index" % docnum)
        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete=delete)

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in the index.
        """

        return sum(s.deleted_count() for s in self.segments)

    def is_deleted(self, docnum):
        segment, segdocnum = self._segment_and_docnum(docnum)
        return segment.is_deleted(segdocnum)

    def reader(self, reuse=None):
        from whoosh.filedb.fileindex import FileIndex

        self._check_state()
        return FileIndex._reader(self.storage, self.schema, self.segments,
                                 self.generation, reuse=reuse)

    def iter_postings(self):
        return self.pool.iter_postings()

    def add_postings(self, lengths, items, startdoc, docmap):
        # items = (fieldname, text, docnum, weight, valuestring) ...
        schema = self.schema

        # Make a generator to strip out deleted fields and renumber the docs
        # before passing them down to the field writer
        def gen():
            for fieldname, text, docnum, weight, valuestring in items:
                if fieldname not in schema:
                    continue
                if docmap is not None:
                    newdoc = docmap[docnum]
                else:
                    newdoc = startdoc + docnum
                yield (fieldname, text, newdoc, weight, valuestring)

        self.fieldwriter.add_postings(schema, lengths, gen())

    def _make_docmap(self, reader, newdoc):
        # If the reader has deletions, make a dictionary mapping the docnums
        # of undeleted documents to new sequential docnums starting at newdoc
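        # For example, if docs 1 and 3 of a five-document segment are deleted
        # and newdoc starts at 10, the map is {0: 10, 2: 11, 4: 12} and the
        # returned "next unused" docnum is 13.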
        hasdel = reader.has_deletions()
        if hasdel:
            docmap = {}
            for docnum in reader.all_doc_ids():
                if reader.is_deleted(docnum):
                    continue
                docmap[docnum] = newdoc
                newdoc += 1
        else:
            docmap = None
            newdoc += reader.doc_count_all()
        # Return the map and the new lowest unused document number
        return docmap, newdoc

    def _merge_per_doc(self, reader, docmap):
        schema = self.schema
        newdoc = self.docnum
        perdocwriter = self.perdocwriter
        sharedfields = set(schema.names()) & set(reader.schema.names())

        for docnum in reader.all_doc_ids():
            # Skip deleted documents
            if docmap and docnum not in docmap:
                continue
            # Renumber around deletions
            if docmap:
                newdoc = docmap[docnum]

            # Get the stored fields
            d = reader.stored_fields(docnum)
            # Start a new document in the writer
            perdocwriter.start_doc(newdoc)
            # For each field in the document, copy its stored value,
            # length, and vectors (if any) to the writer
            for fieldname in sharedfields:
                field = schema[fieldname]
                length = (reader.doc_field_length(docnum, fieldname, 0)
                          if field.scorable else 0)
                perdocwriter.add_field(fieldname, field, d.get(fieldname),
                                       length)
                if field.vector and reader.has_vector(docnum, fieldname):
                    v = reader.vector(docnum, fieldname)
                    perdocwriter.add_vector_matcher(fieldname, field, v)
            # Finish the new document
            perdocwriter.finish_doc()
            newdoc += 1

    def _merge_fields(self, reader, docmap):
        # Add inverted index postings to the pool, renumbering document number
        # references as necessary
        add_post = self.pool.add
        # Note: iter_postings() only yields postings for undeleted docs
        for p in renumber_postings(reader, self.docnum, docmap):
            add_post(p)

    def add_reader(self, reader):
        self._check_state()

        # Make a docnum map to renumber around deleted documents
        docmap, newdoc = self._make_docmap(reader, self.docnum)
        # Add per-document values
        self._merge_per_doc(reader, docmap)
        # Add field postings
        self._merge_fields(reader, docmap)

        self.docnum = newdoc
        self._added = True

    def _check_fields(self, schema, fieldnames):
        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("No field named %r in %s"
                                        % (name, schema))

    def add_document(self, **fields):
        self._check_state()
        perdocwriter = self.perdocwriter
        schema = self.schema
        docnum = self.docnum
        add_post = self.pool.add

        docboost = self._doc_boost(fields)
        fieldnames = sorted([name for name in fields.keys()
                             if not name.startswith("_")])
        self._check_fields(schema, fieldnames)

        perdocwriter.start_doc(docnum)
        # For each field...
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            length = 0
            if field.indexed:
                # TODO: Method for adding progressive field values, i.e.
                # setting start_pos/start_char?
                fieldboost = self._field_boost(fields, fieldname, docboost)
                # Ask the field to return a list of (text, frequency, weight,
                # valuestring) tuples for this value
                items = field.index(value)
                # Only store the length if the field is marked scorable
                scorable = field.scorable
                # Add the terms to the pool
                for text, freq, weight, valuestring in items:
                    weight *= fieldboost
                    if scorable:
                        length += freq
                    add_post((fieldname, text, docnum, weight, valuestring))

            if field.separate_spelling():
                # For fields which use different tokens for spelling, insert
                # fake postings for the spellable words, where docnum=None
                # means "this is a spelling word"

                # TODO: think of something less hacktacular
                for text in field.spellable_words(value):
                    add_post((fieldname, text, None, None, None))

            vformat = field.vector
            if vformat:
                analyzer = field.analyzer
                vitems = sorted(vformat.word_values(value, analyzer,
                                                    mode="index"))
                perdocwriter.add_vector_items(fieldname, field, vitems)

            # Figure out what value to store for this field
            storedval = None
            if field.stored:
                storedkey = "_stored_%s" % fieldname
                if storedkey in fields:
                    storedval = fields.get(storedkey)
                else:
                    storedval = value

            # Add the stored value and length for this field to the
            # per-document writer
            perdocwriter.add_field(fieldname, field, storedval, length)
        perdocwriter.finish_doc()
        self._added = True
        self.docnum += 1

    def doc_count(self):
        return self.docnum - self.docbase

    def get_segment(self):
        newsegment = self.newsegment
        newsegment.doccount = self.doc_count()
        return newsegment

    def _merge_segments(self, mergetype, optimize, merge):
        if mergetype:
            pass
        elif optimize:
            mergetype = OPTIMIZE
        elif not merge:
            mergetype = NO_MERGE
        else:
            mergetype = MERGE_SMALL

        # Call the merge policy function. The policy may choose to merge
        # other segments into this writer's pool
        return mergetype(self, self.segments)

    def _flush_segment(self):
        lengths = self.perdocwriter.lengths_reader()
        postings = self.pool.iter_postings()
        self.fieldwriter.add_postings(self.schema, lengths, postings)

    def _close_segment(self):
        self.perdocwriter.close()
        self.fieldwriter.close()
        self.pool.cleanup()

    def _assemble_segment(self):
        if self.compound:
            # Assemble the segment files into a compound file
            newsegment = self.get_segment()
            newsegment.create_compound_file(self.storage)
            newsegment.compound = True

    def _commit_toc(self, segments):
        # Write a new TOC with the new segment list (and delete old files)
        self.codec.commit_toc(self.storage, self.indexname, self.schema,
                              segments, self.generation)

    def _finish(self):
        if self.writelock:
            self.writelock.release()
        self.is_closed = True
        #self.storage.close()

    def _partial_segment(self):
        # For use by a parent multiprocessing writer: Closes out the segment
        # but leaves the pool files intact so the parent can access them
        self._check_state()
        self.perdocwriter.close()
        self.fieldwriter.close()
        # Don't call self.pool.cleanup()! We want to grab the pool files.
        return self.get_segment()

    def commit(self, mergetype=None, optimize=False, merge=True):
        """Finishes writing and saves all additions and changes to disk.

        There are four possible ways to use this method::

            # Merge small segments but leave large segments, trying to
            # balance fast commits with fast searching:
            writer.commit()

            # Merge all segments into a single segment:
            writer.commit(optimize=True)

            # Don't merge any existing segments:
            writer.commit(merge=False)

            # Use a custom merge function:
            writer.commit(mergetype=my_merge_function)

        :param mergetype: a custom merge function taking a Writer object and
            segment list as arguments, and returning a new segment list. If
            you supply a ``mergetype`` function, the values of the
            ``optimize`` and ``merge`` arguments are ignored.
        :param optimize: if True, all existing segments are merged with the
            documents you've added to this writer (and the value of the
            ``merge`` argument is ignored).
        :param merge: if False, do not merge small segments.
        """

        self._check_state()
        try:
            # Merge old segments if necessary
            finalsegments = self._merge_segments(mergetype, optimize, merge)
            if self._added:
                # Finish writing segment
                self._flush_segment()
                # Close segment files
                self._close_segment()
                # Assemble compound segment if necessary
                self._assemble_segment()

                # Add the new segment to the list of remaining segments
                # returned by the merge policy function
                finalsegments.append(self.get_segment())
            else:
                # Close segment files
                self._close_segment()
            # Write TOC
            self._commit_toc(finalsegments)
        finally:
            # Final cleanup
            self._finish()

    def cancel(self):
        self._check_state()
        self._close_segment()
        self._finish()

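# Illustrative usage sketch (not part of the original module): the typical
# round trip through SegmentWriter for a filedb index, assuming the index's
# schema defines the "title" and "content" fields. ix.writer() returns a
# SegmentWriter, and commit() applies the MERGE_SMALL policy unless told
# otherwise.
#
#     from whoosh import index
#     ix = index.open_dir("indexdir")
#     w = ix.writer()
#     w.add_document(title=u"Hello", content=u"Some body text")
#     w.commit()  # or commit(optimize=True), commit(merge=False), ...

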
# Retroactively add spelling files to an existing index

def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), commit the writer (without merging
        segments) after the word graph files are written.
    """

    from whoosh.filedb.filereading import SegmentReader
    from whoosh.support import dawg

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        r = SegmentReader(storage, schema, segment)
        f = segment.create_file(storage, ".dag")
        gw = dawg.GraphWriter(f)
        for fieldname in fieldnames:
            gw.start_field(fieldname)
            for word in r.lexicon(fieldname):
                gw.insert(word)
            gw.finish_field()
        gw.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)