/bangkokhotel/lib/python2.5/site-packages/whoosh/filedb/filewriting.py

https://bitbucket.org/luisrodriguez/bangkokhotel

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from __future__ import with_statement
from bisect import bisect_right

from whoosh.fields import UnknownFieldError
from whoosh.store import LockError
from whoosh.support.filelock import try_for
from whoosh.support.externalsort import SortingPool
from whoosh.util import fib
from whoosh.writing import IndexWriter, IndexingError

# Merge policies

# A merge policy is a callable that takes the SegmentWriter object and the
# current segment list (not including the segment being written), and returns
# an updated segment list (not including the segment being written).
def NO_MERGE(writer, segments):
    """This policy does not merge any existing segments.
    """

    return segments


def MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the fibonacci sequence.
    """

    from whoosh.filedb.filereading import SegmentReader

    newsegments = []
    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count
            if total_docs < fib(i + 5):
                reader = SegmentReader(writer.storage, writer.schema, seg)
                writer.add_reader(reader)
                reader.close()
            else:
                newsegments.append(seg)
    return newsegments


def OPTIMIZE(writer, segments):
    """This policy merges all existing segments.
    """

    from whoosh.filedb.filereading import SegmentReader

    for seg in segments:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(reader)
        reader.close()
    return []
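

# The following is an illustrative sketch, not part of the original module: a
# user-defined merge policy follows the same (writer, segments) interface as
# the policies above. The function name and the 100-document threshold are
# hypothetical examples.
def MERGE_UNDER_THRESHOLD_EXAMPLE(writer, segments, threshold=100):
    from whoosh.filedb.filereading import SegmentReader

    kept = []
    for seg in segments:
        if seg.doc_count_all() < threshold:
            # Fold the small segment's contents into the writer
            reader = SegmentReader(writer.storage, writer.schema, seg)
            writer.add_reader(reader)
            reader.close()
        else:
            # Leave larger segments in place
            kept.append(seg)
    return kept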


class PostingPool(SortingPool):
    # Subclass whoosh.support.externalsort.SortingPool to use knowledge of
    # postings to set run size in bytes instead of items

    def __init__(self, limitmb=128, **kwargs):
        SortingPool.__init__(self, **kwargs)
        self.limit = limitmb * 1024 * 1024
        self.currentsize = 0

    def add(self, item):
        # item = (fieldname, text, docnum, weight, valuestring)
        size = (28 + 4 * 5  # tuple = 28 + 4 * length
                + 21 + len(item[0])  # fieldname = str = 21 + length
                + 26 + len(item[1]) * 2  # text = unicode = 26 + 2 * length
                + 18  # docnum = long = 18
                + 16  # weight = float = 16
                + 21 + len(item[4] or ''))  # valuestring
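        # Rough worked example of the estimate above, for a hypothetical
        # posting ("title", u"hello", 0, 1.0, None):
        # 48 + 26 + 36 + 18 + 16 + 21 = 165 bytes.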
        self.currentsize += size
        if self.currentsize > self.limit:
            self.save()
        self.current.append(item)

    def iter_postings(self):
        # This is just an alias for items() to be consistent with the
        # iter_postings()/add_postings() interface of a lot of other classes
        return self.items()

    def save(self):
        SortingPool.save(self)
        self.currentsize = 0


def renumber_postings(reader, startdoc, docmap):
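    # For example, with startdoc=100 and docmap=None, a posting for docnum 3
    # is renumbered to 103; with docmap={3: 42} it becomes 42 instead.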
    for fieldname, text, docnum, weight, value in reader.iter_postings():
        newdoc = docmap[docnum] if docmap else startdoc + docnum
        yield (fieldname, text, newdoc, weight, value)


# Writer object

class SegmentWriter(IndexWriter):
    def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True,
                 limitmb=128, docbase=0, codec=None, compound=True, **kwargs):
        # Lock the index
        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay):
                raise LockError

        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self.codec = codec

        # Get info from the index
        self.storage = ix.storage
        self.indexname = ix.indexname
        info = ix._read_toc()
        self.generation = info.generation + 1
        self.schema = info.schema
        self.segments = info.segments
        self.docnum = self.docbase = docbase
        self._setup_doc_offsets()

        # Internals
        self.compound = compound
        poolprefix = "whoosh_%s_" % self.indexname
        self.pool = PostingPool(limitmb=limitmb, prefix=poolprefix)
        newsegment = self.newsegment = codec.new_segment(self.storage,
                                                         self.indexname)
        self.is_closed = False
        self._added = False

        # Set up writers
        self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
        self.fieldwriter = codec.field_writer(self.storage, newsegment)

    def __repr__(self):
        return "<%s %r>" % (self.__class__.__name__, self.newsegment)

    def _setup_doc_offsets(self):
        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

    def _check_state(self):
        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec, **kwargs):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs)

    def remove_field(self, fieldname):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)
    def _document_segment(self, docnum):
        # Returns the position in self.segments of the segment containing the
        # given document number.
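        # For example, if the existing segments hold 10, 5, and 8 documents,
        # _doc_offsets is [0, 10, 15], and bisect_right([0, 10, 15], 12) - 1
        # == 1, so document 12 lives in the second segment.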
        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        # Returns an (index.Segment, segment_docnum) pair for the segment
        # containing the given document number.
        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset

    def has_deletions(self):
        """
        Returns True if this index has documents that are marked deleted but
        haven't been optimized out of the index yet.
        """

        return any(s.has_deletions() for s in self.segments)

    def delete_document(self, docnum, delete=True):
        self._check_state()
        if docnum >= sum(seg.doccount for seg in self.segments):
            raise IndexingError("No document ID %r in this index" % docnum)

        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete=delete)

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in the index.
        """

        return sum(s.deleted_count() for s in self.segments)

    def is_deleted(self, docnum):
        segment, segdocnum = self._segment_and_docnum(docnum)
        return segment.is_deleted(segdocnum)

    def reader(self, reuse=None):
        from whoosh.filedb.fileindex import FileIndex

        self._check_state()
        return FileIndex._reader(self.storage, self.schema, self.segments,
                                 self.generation, reuse=reuse)

    def iter_postings(self):
        return self.pool.iter_postings()

    def add_postings(self, lengths, items, startdoc, docmap):
        # items = (fieldname, text, docnum, weight, valuestring) ...
        schema = self.schema

        # Make a generator to strip out deleted fields and renumber the docs
        # before passing them down to the field writer
        def gen():
            for fieldname, text, docnum, weight, valuestring in items:
                if fieldname not in schema:
                    continue
                if docmap is not None:
                    newdoc = docmap[docnum]
                else:
                    newdoc = startdoc + docnum
                yield (fieldname, text, newdoc, weight, valuestring)

        self.fieldwriter.add_postings(schema, lengths, gen())

    def _make_docmap(self, reader, newdoc):
        # If the reader has deletions, make a dictionary mapping the docnums
        # of undeleted documents to new sequential docnums starting at newdoc
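        # For example, if the reader holds docnums 0-3 and docnum 1 is
        # deleted, _make_docmap(reader, 10) returns ({0: 10, 2: 11, 3: 12}, 13).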
        hasdel = reader.has_deletions()
        if hasdel:
            docmap = {}
            for docnum in reader.all_doc_ids():
                if reader.is_deleted(docnum):
                    continue
                docmap[docnum] = newdoc
                newdoc += 1
        else:
            docmap = None
            newdoc += reader.doc_count_all()

        # Return the map and the new lowest unused document number
        return docmap, newdoc

    def _merge_per_doc(self, reader, docmap):
        schema = self.schema
        newdoc = self.docnum
        perdocwriter = self.perdocwriter
        sharedfields = set(schema.names()) & set(reader.schema.names())

        for docnum in reader.all_doc_ids():
            # Skip deleted documents
            if docmap and docnum not in docmap:
                continue
            # Renumber around deletions
            if docmap:
                newdoc = docmap[docnum]

            # Get the stored fields
            d = reader.stored_fields(docnum)
            # Start a new document in the writer
            perdocwriter.start_doc(newdoc)
            # For each field in the document, copy its stored value,
            # length, and vectors (if any) to the writer
            for fieldname in sharedfields:
                field = schema[fieldname]
                length = (reader.doc_field_length(docnum, fieldname, 0)
                          if field.scorable else 0)
                perdocwriter.add_field(fieldname, field, d.get(fieldname),
                                       length)
                if field.vector and reader.has_vector(docnum, fieldname):
                    v = reader.vector(docnum, fieldname)
                    perdocwriter.add_vector_matcher(fieldname, field, v)
            # Finish the new document
            perdocwriter.finish_doc()
            newdoc += 1

    def _merge_fields(self, reader, docmap):
        # Add inverted index postings to the pool, renumbering document number
        # references as necessary
        add_post = self.pool.add

        # Note: iter_postings() only yields postings for undeleted docs
        for p in renumber_postings(reader, self.docnum, docmap):
            add_post(p)

    def add_reader(self, reader):
        self._check_state()

        # Make a docnum map to renumber around deleted documents
        docmap, newdoc = self._make_docmap(reader, self.docnum)

        # Add per-document values
        self._merge_per_doc(reader, docmap)
        # Add field postings
        self._merge_fields(reader, docmap)

        self.docnum = newdoc
        self._added = True

    def _check_fields(self, schema, fieldnames):
        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("No field named %r in %s"
                                        % (name, schema))

    def add_document(self, **fields):
        self._check_state()
        perdocwriter = self.perdocwriter
        schema = self.schema
        docnum = self.docnum
        add_post = self.pool.add

        docboost = self._doc_boost(fields)
        fieldnames = sorted([name for name in fields.keys()
                             if not name.startswith("_")])
        self._check_fields(schema, fieldnames)

        perdocwriter.start_doc(docnum)

        # For each field...
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            length = 0
            if field.indexed:
                # TODO: Method for adding progressive field values, ie
                # setting start_pos/start_char?
                fieldboost = self._field_boost(fields, fieldname, docboost)
                # Ask the field to return a list of
                # (text, frequency, weight, valuestring) tuples
                items = field.index(value)
                # Only store the length if the field is marked scorable
                scorable = field.scorable
                # Add the terms to the pool
                for text, freq, weight, valuestring in items:
                    #assert w != ""
                    weight *= fieldboost
                    if scorable:
                        length += freq
                    add_post((fieldname, text, docnum, weight, valuestring))

            if field.separate_spelling():
                # For fields which use different tokens for spelling, insert
                # fake postings for the spellable words, where docnum=None
                # means "this is a spelling word"
                # TODO: think of something less hacktacular
                for text in field.spellable_words(value):
                    add_post((fieldname, text, None, None, None))

            vformat = field.vector
            if vformat:
                analyzer = field.analyzer
                vitems = sorted(vformat.word_values(value, analyzer,
                                                    mode="index"))
                perdocwriter.add_vector_items(fieldname, field, vitems)

            # Figure out what value to store for this field
            storedval = None
            if field.stored:
                storedkey = "_stored_%s" % fieldname
                if storedkey in fields:
                    storedval = fields.get(storedkey)
                else:
                    storedval = value

            # Add the stored value and length for this field to the per-
            # document writer
            perdocwriter.add_field(fieldname, field, storedval, length)

        perdocwriter.finish_doc()
        self._added = True
        self.docnum += 1
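
    # A typical indexing sequence from the caller's side might look like the
    # following sketch (field names and values are hypothetical):
    #
    #   w = myindex.writer()
    #   w.add_document(title=u"First document", content=u"hello there",
    #                  _stored_content=u"stored instead of the indexed value")
    #   w.commit()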

    def doc_count(self):
        return self.docnum - self.docbase

    def get_segment(self):
        newsegment = self.newsegment
        newsegment.doccount = self.doc_count()
        return newsegment

    def _merge_segments(self, mergetype, optimize, merge):
        if mergetype:
            pass
        elif optimize:
            mergetype = OPTIMIZE
        elif not merge:
            mergetype = NO_MERGE
        else:
            mergetype = MERGE_SMALL

        # Call the merge policy function. The policy may choose to merge
        # other segments into this writer's pool
        return mergetype(self, self.segments)

    def _flush_segment(self):
        lengths = self.perdocwriter.lengths_reader()
        postings = self.pool.iter_postings()
        self.fieldwriter.add_postings(self.schema, lengths, postings)

    def _close_segment(self):
        self.perdocwriter.close()
        self.fieldwriter.close()
        self.pool.cleanup()

    def _assemble_segment(self):
        if self.compound:
            # Assemble the segment files into a compound file
            newsegment = self.get_segment()
            newsegment.create_compound_file(self.storage)
            newsegment.compound = True

    def _commit_toc(self, segments):
        # Write a new TOC with the new segment list (and delete old files)
        self.codec.commit_toc(self.storage, self.indexname, self.schema,
                              segments, self.generation)

    def _finish(self):
        if self.writelock:
            self.writelock.release()
        self.is_closed = True
        #self.storage.close()

    def _partial_segment(self):
        # For use by a parent multiprocessing writer: Closes out the segment
        # but leaves the pool files intact so the parent can access them
        self._check_state()
        self.perdocwriter.close()
        self.fieldwriter.close()
        # Don't call self.pool.cleanup()! We want to grab the pool files.
        return self.get_segment()

    def commit(self, mergetype=None, optimize=False, merge=True):
        """Finishes writing and saves all additions and changes to disk.

        There are four possible ways to use this method::

            # Merge small segments but leave large segments, trying to
            # balance fast commits with fast searching:
            writer.commit()

            # Merge all segments into a single segment:
            writer.commit(optimize=True)

            # Don't merge any existing segments:
            writer.commit(merge=False)

            # Use a custom merge function
            writer.commit(mergetype=my_merge_function)

        :param mergetype: a custom merge function taking a Writer object and
            segment list as arguments, and returning a new segment list. If you
            supply a ``mergetype`` function, the values of the ``optimize`` and
            ``merge`` arguments are ignored.
        :param optimize: if True, all existing segments are merged with the
            documents you've added to this writer (and the value of the
            ``merge`` argument is ignored).
        :param merge: if False, do not merge small segments.
        """

        self._check_state()
        try:
            # Merge old segments if necessary
            finalsegments = self._merge_segments(mergetype, optimize, merge)
            if self._added:
                # Finish writing segment
                self._flush_segment()
                # Close segment files
                self._close_segment()
                # Assemble compound segment if necessary
                self._assemble_segment()
                # Add the new segment to the list of remaining segments
                # returned by the merge policy function
                finalsegments.append(self.get_segment())
            else:
                # Close segment files
                self._close_segment()
            # Write TOC
            self._commit_toc(finalsegments)
        finally:
            # Final cleanup
            self._finish()

    def cancel(self):
        self._check_state()
        self._close_segment()
        self._finish()


# Retroactively add spelling files to an existing index

def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), commit the writer after the word
        graphs are written.
    """

    from whoosh.filedb.filereading import SegmentReader
    from whoosh.support import dawg

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        r = SegmentReader(storage, schema, segment)
        f = segment.create_file(storage, ".dag")
        gw = dawg.GraphWriter(f)
        for fieldname in fieldnames:
            gw.start_field(fieldname)
            for word in r.lexicon(fieldname):
                gw.insert(word)
            gw.finish_field()
        gw.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)