/bangkokhotel/lib/python2.5/site-packages/whoosh/codec/whoosh2.py

https://bitbucket.org/luisrodriguez/bangkokhotel · Python

# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from array import array
from collections import defaultdict
from struct import Struct

from whoosh.compat import (loads, dumps, xrange, iteritems, itervalues, b,
                           bytes_type, string_type, integer_types)
from whoosh.codec import base
from whoosh.codec.base import (minimize_ids, deminimize_ids, minimize_weights,
                               deminimize_weights, minimize_values,
                               deminimize_values)
from whoosh.filedb.fileindex import TOC, clean_files
from whoosh.filedb.filetables import CodedOrderedWriter, CodedOrderedReader
from whoosh.matching import ListMatcher
from whoosh.reading import TermNotFound
from whoosh.store import Storage
from whoosh.support.dawg import GraphWriter, GraphReader
from whoosh.system import (pack_ushort, pack_long, unpack_ushort, unpack_long,
                           _INT_SIZE, _LONG_SIZE)
from whoosh.util import byte_to_length, length_to_byte, utf8encode, utf8decode


# Standard codec top-level object

class W2Codec(base.Codec):
    TERMS_EXT = ".trm"  # Term index
    POSTS_EXT = ".pst"  # Term postings
    DAWG_EXT = ".dag"  # Spelling graph file
    LENGTHS_EXT = ".fln"  # Field lengths file
    VECTOR_EXT = ".vec"  # Vector index
    VPOSTS_EXT = ".vps"  # Vector postings
    STORED_EXT = ".sto"  # Stored fields file

    def __init__(self, blocklimit=128, compression=3, loadlengths=False,
                 inlinelimit=1):
        self.blocklimit = blocklimit
        self.compression = compression
        self.loadlengths = loadlengths
        self.inlinelimit = inlinelimit

    # Per-document value writer
    def per_document_writer(self, storage, segment):
        return W2PerDocWriter(storage, segment, blocklimit=self.blocklimit,
                              compression=self.compression)

    # Inverted index writer
    def field_writer(self, storage, segment):
        return W2FieldWriter(storage, segment, blocklimit=self.blocklimit,
                             compression=self.compression,
                             inlinelimit=self.inlinelimit)

    # Readers
    def terms_reader(self, storage, segment):
        tifile = segment.open_file(storage, self.TERMS_EXT)
        postfile = segment.open_file(storage, self.POSTS_EXT)
        return W2TermsReader(tifile, postfile)

    def lengths_reader(self, storage, segment):
        flfile = segment.open_file(storage, self.LENGTHS_EXT)
        doccount = segment.doc_count_all()

        # Check the first byte of the file to see if it's an old format
        firstbyte = flfile.read(1)
        flfile.seek(0)
        if firstbyte != b("~"):
            from whoosh.codec.legacy import load_old_lengths
            lengths = load_old_lengths(InMemoryLengths(), flfile, doccount)
        elif self.loadlengths:
            lengths = InMemoryLengths.from_file(flfile, doccount)
        else:
            lengths = OnDiskLengths(flfile, doccount)
        return lengths

    def vector_reader(self, storage, segment):
        vifile = segment.open_file(storage, self.VECTOR_EXT)
        postfile = segment.open_file(storage, self.VPOSTS_EXT)
        return W2VectorReader(vifile, postfile)

    def stored_fields_reader(self, storage, segment):
        sffile = segment.open_file(storage, self.STORED_EXT)
        return StoredFieldReader(sffile)

    def graph_reader(self, storage, segment):
        dawgfile = segment.open_file(storage, self.DAWG_EXT)
        return GraphReader(dawgfile)

    # Segments and generations
    def new_segment(self, storage, indexname):
        return W2Segment(indexname)

    def commit_toc(self, storage, indexname, schema, segments, generation,
                   clean=True):
        toc = TOC(schema, segments, generation)
        toc.write(storage, indexname)
        # Delete leftover files
        if clean:
            clean_files(storage, indexname, generation, segments)
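
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): roughly how a
# higher-level Whoosh writer wires the factories above together. The
# `storage`, `segment`, `schema`, `indexname`, and `generation` names below
# are assumed placeholders; the exact call sequence varies between Whoosh
# versions.
#
#   codec = W2Codec(blocklimit=128, compression=3)
#   perdoc = codec.per_document_writer(storage, segment)   # .sto/.fln/.vec
#   fwriter = codec.field_writer(storage, segment)         # .trm/.pst/.dag
#   ... add documents and postings ...
#   perdoc.close()
#   fwriter.close()
#   codec.commit_toc(storage, indexname, schema, [segment], generation)
# ---------------------------------------------------------------------------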


# Per-document value writer

class W2PerDocWriter(base.PerDocumentWriter):
    def __init__(self, storage, segment, blocklimit=128, compression=3):
        if not isinstance(blocklimit, int):
            raise ValueError
        self.storage = storage
        self.segment = segment
        self.blocklimit = blocklimit
        self.compression = compression
        self.doccount = 0

        sffile = segment.create_file(storage, W2Codec.STORED_EXT)
        self.stored = StoredFieldWriter(sffile)
        self.storedfields = None

        self.lengths = InMemoryLengths()

        # We'll wait to create the vector files until someone actually tries
        # to add a vector
        self.vindex = self.vpostfile = None

    def _make_vector_files(self):
        vifile = self.segment.create_file(self.storage, W2Codec.VECTOR_EXT)
        self.vindex = VectorWriter(vifile)
        self.vpostfile = self.segment.create_file(self.storage,
                                                  W2Codec.VPOSTS_EXT)

    def start_doc(self, docnum):
        self.docnum = docnum
        self.storedfields = {}
        self.doccount = max(self.doccount, docnum + 1)

    def add_field(self, fieldname, fieldobj, value, length):
        if length:
            self.lengths.add(self.docnum, fieldname, length)
        if value is not None:
            self.storedfields[fieldname] = value

    def _new_block(self, vformat):
        postingsize = vformat.posting_size
        return W2Block(postingsize, stringids=True)

    def add_vector_items(self, fieldname, fieldobj, items):
        if self.vindex is None:
            self._make_vector_files()

        # items = (text, freq, weight, valuestring) ...
        postfile = self.vpostfile
        blocklimit = self.blocklimit
        block = self._new_block(fieldobj.vector)

        startoffset = postfile.tell()
        postfile.write(block.magic)  # Magic number
        blockcount = 0
        postfile.write_uint(0)  # Placeholder for block count

        countdown = blocklimit
        for text, _, weight, valuestring in items:
            block.add(text, weight, valuestring)
            countdown -= 1
            if countdown == 0:
                block.to_file(postfile, compression=self.compression)
                block = self._new_block(fieldobj.vector)
                blockcount += 1
                countdown = blocklimit
        # If there are leftover items in the current block, write them out
        if block:
            block.to_file(postfile, compression=self.compression)
            blockcount += 1

        # Seek back to the start of this list of posting blocks and write the
        # number of blocks
        postfile.flush()
        here = postfile.tell()
        postfile.seek(startoffset + 4)
        postfile.write_uint(blockcount)
        postfile.seek(here)

        # Add to the index
        self.vindex.add((self.docnum, fieldname), startoffset)

    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
        def readitems():
            while vmatcher.is_active():
                text = vmatcher.id()
                weight = vmatcher.weight()
                valuestring = vmatcher.value()
                yield (text, None, weight, valuestring)
                vmatcher.next()
        self.add_vector_items(fieldname, fieldobj, readitems())

    def finish_doc(self):
        self.stored.add(self.storedfields)
        self.storedfields = None

    def lengths_reader(self):
        return self.lengths

    def close(self):
        if self.storedfields is not None:
            self.stored.add(self.storedfields)
        self.stored.close()
        flfile = self.segment.create_file(self.storage, W2Codec.LENGTHS_EXT)
        self.lengths.to_file(flfile, self.doccount)
        if self.vindex:
            self.vindex.close()
            self.vpostfile.close()
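
# Illustrative sketch (not part of the original module): the per-document
# write cycle driven by a higher-level writer. The `storage`, `segment`, and
# `fieldobj` names are assumed placeholders; the vector items follow the
# (text, freq, weight, valuestring) layout noted in add_vector_items() above.
#
#   pdw = W2PerDocWriter(storage, segment)
#   pdw.start_doc(0)
#   pdw.add_field("title", fieldobj, u"hello world", 2)   # stored value + length
#   pdw.add_vector_items("title", fieldobj,
#                        [(u"hello", 1, 1.0, b("")), (u"world", 1, 1.0, b(""))])
#   pdw.finish_doc()
#   pdw.close()   # flushes stored fields, field lengths, and any vector files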


# Inverted index writer

class W2FieldWriter(base.FieldWriter):
    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        assert isinstance(storage, Storage)
        assert isinstance(segment, base.Segment)
        assert isinstance(blocklimit, int)
        assert isinstance(compression, int)
        assert isinstance(inlinelimit, int)

        self.storage = storage
        self.segment = segment
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False

        tifile = segment.create_file(storage, W2Codec.TERMS_EXT)
        self.termsindex = TermIndexWriter(tifile)
        self.postfile = segment.create_file(storage, W2Codec.POSTS_EXT)

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit
        self.block = None
        self.terminfo = None
        self._infield = False

    def _make_dawg_files(self):
        dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT)
        self.dawg = GraphWriter(dawgfile)

    def _new_block(self):
        return W2Block(self.format.posting_size)

    def _reset_block(self):
        self.block = self._new_block()

    def _write_block(self):
        self.terminfo.add_block(self.block)
        self.block.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        postfile = self.postfile
        self._reset_block()

        # Magic number
        self.startoffset = postfile.tell()
        postfile.write(W2Block.magic)
        # Placeholder for block count
        self.blockcount = 0
        postfile.write_uint(0)

    def start_field(self, fieldname, fieldobj):
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()
        self._dawgfield = False
        if self.spelling or fieldobj.separate_spelling():
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.start_field(fieldname)
            self._dawgfield = True
        self._infield = True

    def start_term(self, text):
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            self.dawg.insert(text)
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        self.block.add(docnum, weight, valuestring, length)
        if len(self.block) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert(text)

    def finish_term(self):
        block = self.block
        if block is None:
            raise Exception("Called finish_term when not in a block")

        terminfo = self.terminfo
        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
            # Inline the single block
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                # Write the current unfinished block to disk
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks
            postfile = self.postfile
            postfile.flush()
            here = postfile.tell()
            postfile.seek(self.startoffset + 4)
            postfile.write_uint(self.blockcount)
            postfile.seek(here)

            self.block = None
            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def finish_field(self):
        if not self._infield:
            raise Exception("Called finish_field before start_field")
        self._infield = False

        if self._dawgfield:
            self.dawg.finish_field()
            self._dawgfield = False

    def close(self):
        self.termsindex.close()
        self.postfile.close()
        if self.dawg is not None:
            self.dawg.close()
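
# Illustrative sketch (not part of the original module): the inverted-index
# write cycle with assumed placeholder postings. Fields and terms should be
# added in sorted order, since the underlying CodedOrderedWriter expects its
# keys in increasing order.
#
#   fw = W2FieldWriter(storage, segment)
#   fw.start_field("content", fieldobj)
#   fw.start_term(u"hello")
#   fw.add(0, 1.0, b(""), 2)   # docnum, weight, valuestring, field length
#   fw.add(7, 2.0, b(""), 5)
#   fw.finish_term()
#   fw.finish_field()
#   fw.close()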


# Matcher

class PostingMatcher(base.BlockPostingMatcher):
    def __init__(self, postfile, startoffset, fmt, scorer=None, term=None,
                 stringids=False):
        self.postfile = postfile
        self.startoffset = startoffset
        self.format = fmt
        self.scorer = scorer
        self._term = term
        self.stringids = stringids

        postfile.seek(startoffset)
        magic = postfile.read(4)
        if magic != W2Block.magic:
            from whoosh.codec.legacy import old_block_type
            self.blockclass = old_block_type(magic)
        else:
            self.blockclass = W2Block

        self.blockcount = postfile.read_uint()
        self.baseoffset = postfile.tell()

        self._active = True
        self.currentblock = -1
        self._next_block()

    def is_active(self):
        return self._active

    def _read_block(self, offset):
        pf = self.postfile
        pf.seek(offset)
        return self.blockclass.from_file(pf, self.format.posting_size,
                                         stringids=self.stringids)

    def _consume_block(self):
        self.block.read_ids()
        self.block.read_weights()
        self.i = 0

    def _next_block(self, consume=True):
        if not (self.currentblock < self.blockcount):
            raise Exception("No next block")

        self.currentblock += 1
        if self.currentblock == self.blockcount:
            self._active = False
            return

        if self.currentblock == 0:
            pos = self.baseoffset
        else:
            pos = self.block.nextoffset

        self.block = self._read_block(pos)
        if consume:
            self._consume_block()

    def _skip_to_block(self, targetfn):
        skipped = 0
        while self._active and targetfn():
            self._next_block(consume=False)
            skipped += 1
        if self._active:
            self._consume_block()
        return skipped

    def score(self):
        return self.scorer.score(self)


# Tables

# Term index

class TermIndexWriter(CodedOrderedWriter):
    def __init__(self, dbfile):
        super(TermIndexWriter, self).__init__(dbfile)
        self.fieldcounter = 0
        self.fieldmap = {}

    def keycoder(self, key):
        # Encode term
        fieldmap = self.fieldmap
        fieldname, text = key

        if fieldname in fieldmap:
            fieldnum = fieldmap[fieldname]
        else:
            fieldnum = self.fieldcounter
            fieldmap[fieldname] = fieldnum
            self.fieldcounter += 1

        key = pack_ushort(fieldnum) + utf8encode(text)[0]
        return key

    def valuecoder(self, terminfo):
        return terminfo.to_string()

    def close(self):
        self._write_hashes()
        dbfile = self.dbfile

        dbfile.write_uint(len(self.index))
        for n in self.index:
            dbfile.write_long(n)
        dbfile.write_pickle(self.fieldmap)

        self._write_directory()
        self.dbfile.close()


class PostingIndexBase(CodedOrderedReader):
    # Shared base class for terms index and vector index readers

    def __init__(self, dbfile, postfile):
        CodedOrderedReader.__init__(self, dbfile)
        self.postfile = postfile

        dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
        self.fieldmap = dbfile.read_pickle()
        self.names = [None] * len(self.fieldmap)
        for name, num in iteritems(self.fieldmap):
            self.names[num] = name

    def close(self):
        CodedOrderedReader.close(self)
        self.postfile.close()


class W2TermsReader(PostingIndexBase):
    # Implements whoosh.codec.base.TermsReader

    def terminfo(self, fieldname, text):
        return self[fieldname, text]

    def matcher(self, fieldname, text, format_, scorer=None):
        # Note this does not filter out deleted documents; a higher level is
        # expected to wrap this matcher to eliminate deleted docs
        pf = self.postfile

        term = (fieldname, text)
        try:
            terminfo = self[term]
        except KeyError:
            raise TermNotFound("No term %s:%r" % (fieldname, text))

        p = terminfo.postings
        if isinstance(p, integer_types):
            # terminfo.postings is an offset into the posting file
            pr = PostingMatcher(pf, p, format_, scorer=scorer, term=term)
        else:
            # terminfo.postings is an inlined tuple of (ids, weights, values)
            docids, weights, values = p
            pr = ListMatcher(docids, weights, values, format_, scorer=scorer,
                             term=term, terminfo=terminfo)
        return pr

    def keycoder(self, key):
        fieldname, text = key
        fnum = self.fieldmap.get(fieldname, 65535)
        return pack_ushort(fnum) + utf8encode(text)[0]

    def keydecoder(self, v):
        assert isinstance(v, bytes_type)
        return (self.names[unpack_ushort(v[:2])[0]], utf8decode(v[2:])[0])

    def valuedecoder(self, v):
        assert isinstance(v, bytes_type)
        return base.FileTermInfo.from_string(v)

    def frequency(self, key):
        datapos = self.range_for_key(key)[0]
        return base.FileTermInfo.read_weight(self.dbfile, datapos)

    def doc_frequency(self, key):
        datapos = self.range_for_key(key)[0]
        return base.FileTermInfo.read_doc_freq(self.dbfile, datapos)
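
# Illustrative sketch (not part of the original module): looking up a term
# and stepping through its postings. `tifile`, `postfile`, and `fieldobj` are
# assumed placeholders; id()/weight()/next() come from the matcher interface
# in whoosh.matching.
#
#   tr = W2TermsReader(tifile, postfile)
#   ti = tr.terminfo("content", u"hello")            # term statistics
#   m = tr.matcher("content", u"hello", fieldobj.format)
#   while m.is_active():
#       docnum, weight = m.id(), m.weight()
#       m.next()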


# Vectors

# docnum, fieldnum
_vectorkey_struct = Struct("!IH")


class VectorWriter(TermIndexWriter):
    def keycoder(self, key):
        fieldmap = self.fieldmap
        docnum, fieldname = key

        if fieldname in fieldmap:
            fieldnum = fieldmap[fieldname]
        else:
            fieldnum = self.fieldcounter
            fieldmap[fieldname] = fieldnum
            self.fieldcounter += 1

        return _vectorkey_struct.pack(docnum, fieldnum)

    def valuecoder(self, offset):
        return pack_long(offset)


class W2VectorReader(PostingIndexBase):
    # Implements whoosh.codec.base.VectorReader

    def matcher(self, docnum, fieldname, format_):
        pf = self.postfile
        offset = self[(docnum, fieldname)]
        pr = PostingMatcher(pf, offset, format_, stringids=True)
        return pr

    def keycoder(self, key):
        return _vectorkey_struct.pack(key[0], self.fieldmap[key[1]])

    def keydecoder(self, v):
        docnum, fieldnum = _vectorkey_struct.unpack(v)
        return (docnum, self.names[fieldnum])

    def valuedecoder(self, v):
        return unpack_long(v)[0]


# Field lengths

class LengthsBase(base.LengthsReader):
    magic = b("~LN1")

    def __init__(self):
        self.starts = {}
        self.totals = {}
        self.minlens = {}
        self.maxlens = {}

    def _read_header(self, dbfile, doccount):
        first = dbfile.read(4)  # Magic
        assert first == self.magic
        version = dbfile.read_int()  # Version number
        assert version == 1

        dc = dbfile.read_uint()  # Number of documents saved
        if doccount is None:
            doccount = dc
        assert dc == doccount, "read=%s argument=%s" % (dc, doccount)
        self._count = doccount

        fieldcount = dbfile.read_ushort()  # Number of fields
        # Read per-field info
        for i in xrange(fieldcount):
            fieldname = dbfile.read_string().decode('utf-8')
            self.totals[fieldname] = dbfile.read_long()
            self.minlens[fieldname] = byte_to_length(dbfile.read_byte())
            self.maxlens[fieldname] = byte_to_length(dbfile.read_byte())
            self.starts[fieldname] = i * doccount

        # Add header length to per-field offsets
        eoh = dbfile.tell()  # End of header
        for fieldname in self.starts:
            self.starts[fieldname] += eoh

    def doc_count_all(self):
        return self._count

    def field_length(self, fieldname):
        return self.totals.get(fieldname, 0)

    def min_field_length(self, fieldname):
        return self.minlens.get(fieldname, 0)

    def max_field_length(self, fieldname):
        return self.maxlens.get(fieldname, 0)


class InMemoryLengths(LengthsBase):
    def __init__(self):
        LengthsBase.__init__(self)
        self.totals = defaultdict(int)
        self.lengths = {}
        self._count = 0

    # IO

    def to_file(self, dbfile, doccount):
        self._pad_arrays(doccount)

        fieldnames = list(self.lengths.keys())
        dbfile.write(self.magic)
        dbfile.write_int(1)  # Format version number
        dbfile.write_uint(doccount)  # Number of documents
        dbfile.write_ushort(len(self.lengths))  # Number of fields

        # Write per-field info
        for fieldname in fieldnames:
            dbfile.write_string(fieldname.encode('utf-8'))  # Fieldname
            dbfile.write_long(self.field_length(fieldname))
            dbfile.write_byte(length_to_byte(self.min_field_length(fieldname)))
            dbfile.write_byte(length_to_byte(self.max_field_length(fieldname)))

        # Write byte arrays
        for fieldname in fieldnames:
            dbfile.write_array(self.lengths[fieldname])

        dbfile.close()

    @classmethod
    def from_file(cls, dbfile, doccount=None):
        obj = cls()
        obj._read_header(dbfile, doccount)
        for fieldname, start in iteritems(obj.starts):
            obj.lengths[fieldname] = dbfile.get_array(start, "B", obj._count)
        dbfile.close()
        return obj

    # Get

    def doc_field_length(self, docnum, fieldname, default=0):
        try:
            arry = self.lengths[fieldname]
        except KeyError:
            return default
        if docnum >= len(arry):
            return default
        return byte_to_length(arry[docnum])

    # Min/max cache setup -- not meant to be called while adding

    def _minmax(self, fieldname, op, cache):
        if fieldname in cache:
            return cache[fieldname]
        else:
            ls = self.lengths[fieldname]
            if ls:
                result = byte_to_length(op(ls))
            else:
                result = 0

            cache[fieldname] = result
            return result

    def min_field_length(self, fieldname):
        return self._minmax(fieldname, min, self.minlens)

    def max_field_length(self, fieldname):
        return self._minmax(fieldname, max, self.maxlens)

    # Add

    def _create_field(self, fieldname, docnum):
        dc = max(self._count, docnum + 1)
        self.lengths[fieldname] = array("B", (0 for _ in xrange(dc)))
        self._count = dc

    def _pad_arrays(self, doccount):
        # Pad out arrays to full length
        for fieldname in self.lengths.keys():
            arry = self.lengths[fieldname]
            if len(arry) < doccount:
                for _ in xrange(doccount - len(arry)):
                    arry.append(0)
        self._count = doccount

    def add(self, docnum, fieldname, length):
        lengths = self.lengths
        if length:
            if fieldname not in lengths:
                self._create_field(fieldname, docnum)

            arry = self.lengths[fieldname]
            count = docnum + 1
            if len(arry) < count:
                for _ in xrange(count - len(arry)):
                    arry.append(0)
            if count > self._count:
                self._count = count

            byte = length_to_byte(length)
            arry[docnum] = byte
            self.totals[fieldname] += length

    def add_other(self, other):
        lengths = self.lengths
        totals = self.totals
        doccount = self._count
        for fname in other.lengths:
            if fname not in lengths:
                lengths[fname] = array("B")
        self._pad_arrays(doccount)

        for fname in other.lengths:
            lengths[fname].extend(other.lengths[fname])
        self._count = doccount + other._count
        self._pad_arrays(self._count)

        for fname in other.totals:
            totals[fname] += other.totals[fname]


class OnDiskLengths(LengthsBase):
    def __init__(self, dbfile, doccount=None):
        LengthsBase.__init__(self)
        self.dbfile = dbfile
        self._read_header(dbfile, doccount)

    def doc_field_length(self, docnum, fieldname, default=0):
        try:
            start = self.starts[fieldname]
        except KeyError:
            return default
        return byte_to_length(self.dbfile.get_byte(start + docnum))

    def close(self):
        self.dbfile.close()
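
# Both lengths implementations store one byte per (field, document) pair,
# using the lossy logarithmic length <-> byte mapping from whoosh.util (the
# functions imported at the top of this module). Illustrative sketch (not
# part of the original module); the recovered value is an approximation of
# the original length, not an exact round trip:
#
#   code = length_to_byte(1000)     # a single byte code in 0..255
#   approx = byte_to_length(code)   # a length close to, but not exactly, 1000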


# Stored fields

_stored_pointer_struct = Struct("!qI")  # offset, length
stored_pointer_size = _stored_pointer_struct.size
pack_stored_pointer = _stored_pointer_struct.pack
unpack_stored_pointer = _stored_pointer_struct.unpack


class StoredFieldWriter(object):
    def __init__(self, dbfile):
        self.dbfile = dbfile
        self.length = 0
        self.directory = []

        self.dbfile.write_long(0)
        self.dbfile.write_uint(0)

        self.names = []
        self.name_map = {}

    def add(self, vdict):
        f = self.dbfile
        names = self.names
        name_map = self.name_map

        vlist = [None] * len(names)
        for k, v in iteritems(vdict):
            if k in name_map:
                vlist[name_map[k]] = v
            else:
                name_map[k] = len(names)
                names.append(k)
                vlist.append(v)

        vstring = dumps(tuple(vlist), -1)[2:-1]
        self.length += 1
        self.directory.append(pack_stored_pointer(f.tell(), len(vstring)))
        f.write(vstring)

    def add_reader(self, sfreader):
        add = self.add
        for vdict in sfreader:
            add(vdict)

    def close(self):
        f = self.dbfile
        dirpos = f.tell()
        f.write_pickle(self.names)
        for pair in self.directory:
            f.write(pair)
        f.flush()
        f.seek(0)
        f.write_long(dirpos)
        f.write_uint(self.length)
        f.close()


class StoredFieldReader(object):
    def __init__(self, dbfile):
        self.dbfile = dbfile

        dbfile.seek(0)
        dirpos = dbfile.read_long()
        self.length = dbfile.read_uint()
        self.basepos = dbfile.tell()

        dbfile.seek(dirpos)

        nameobj = dbfile.read_pickle()
        if isinstance(nameobj, dict):
            # Previous versions stored the list of names as a map of names to
            # positions... it seemed to make sense at the time...
            self.names = [None] * len(nameobj)
            for name, pos in iteritems(nameobj):
                self.names[pos] = name
        else:
            self.names = nameobj
        self.directory_offset = dbfile.tell()

    def close(self):
        self.dbfile.close()

    def __iter__(self):
        dbfile = self.dbfile
        names = self.names
        lengths = array("I")

        dbfile.seek(self.directory_offset)
        for i in xrange(self.length):
            dbfile.seek(_LONG_SIZE, 1)
            lengths.append(dbfile.read_uint())

        dbfile.seek(self.basepos)
        for length in lengths:
            vlist = loads(dbfile.read(length) + b("."))
            vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                         if vlist[i] is not None)
            yield vdict

    def __getitem__(self, num):
        if num > self.length - 1:
            raise IndexError("Tried to get document %s, file has %s"
                             % (num, self.length))

        dbfile = self.dbfile
        start = self.directory_offset + num * stored_pointer_size
        dbfile.seek(start)
        ptr = dbfile.read(stored_pointer_size)
        if len(ptr) != stored_pointer_size:
            raise Exception("Error reading %r @%s %s < %s"
                            % (dbfile, start, len(ptr), stored_pointer_size))
        position, length = unpack_stored_pointer(ptr)

        dbfile.seek(position)
        vlist = loads(dbfile.read(length) + b("."))

        names = self.names
        # Recreate a dictionary by putting the field names and values back
        # together by position. We can't just use dict(zip(...)) because we
        # want to filter out the None values.
        vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                     if vlist[i] is not None)
        return vdict
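
# On-disk layout of the stored-fields (.sto) file, as written by
# StoredFieldWriter above (descriptive note, not part of the original module):
#
#   long   offset of the directory (patched in by close())
#   uint   number of documents
#   ...    one pickled tuple of field values per document, in docnum order
#   at the directory offset:
#     pickled list of field names
#     one (long offset, uint length) pointer per document
#
# StoredFieldReader.__getitem__() seeks into the pointer table to fetch a
# single document; __iter__() streams the value tuples in docnum order.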


# Segment object

class W2Segment(base.Segment):
    def __init__(self, indexname, doccount=0, segid=None, deleted=None):
        """
        :param indexname: The name of the index this segment belongs to (the
            Index object computes the segment's file names from this and the
            generation).
        :param doccount: The maximum document number in the segment.
        :param deleted: A set of deleted document numbers, or None if no
            deleted documents exist in this segment.
        """
        assert isinstance(indexname, string_type)
        self.indexname = indexname
        assert isinstance(doccount, integer_types)
        self.doccount = doccount
        self.segid = self._random_id() if segid is None else segid
        self.deleted = deleted
        self.compound = False

    def codec(self, **kwargs):
        return W2Codec(**kwargs)

    def doc_count_all(self):
        return self.doccount

    def doc_count(self):
        return self.doccount - self.deleted_count()

    def has_deletions(self):
        return self.deleted_count() > 0

    def deleted_count(self):
        if self.deleted is None:
            return 0
        return len(self.deleted)

    def delete_document(self, docnum, delete=True):
        if delete:
            if self.deleted is None:
                self.deleted = set()
            self.deleted.add(docnum)
        elif self.deleted is not None and docnum in self.deleted:
            # set.clear() takes no argument; discard() removes just this
            # document from the deleted set
            self.deleted.discard(docnum)

    def is_deleted(self, docnum):
        if self.deleted is None:
            return False
        return docnum in self.deleted


# Posting blocks

class W2Block(base.BlockBase):
    magic = b("Blk3")

    infokeys = ("count", "maxid", "maxweight", "minlength", "maxlength",
                "idcode", "compression", "idslen", "weightslen")

    def to_file(self, postfile, compression=3):
        ids = self.ids
        idcode, idstring = minimize_ids(ids, self.stringids, compression)
        wtstring = minimize_weights(self.weights, compression)
        vstring = minimize_values(self.postingsize, self.values, compression)

        info = (len(ids), ids[-1], self.maxweight,
                length_to_byte(self.minlength), length_to_byte(self.maxlength),
                idcode, compression, len(idstring), len(wtstring))
        infostring = dumps(info, -1)

        # Offset to next block
        postfile.write_uint(len(infostring) + len(idstring) + len(wtstring)
                            + len(vstring))
        # Block contents
        postfile.write(infostring)
        postfile.write(idstring)
        postfile.write(wtstring)
        postfile.write(vstring)

    @classmethod
    def from_file(cls, postfile, postingsize, stringids=False):
        block = cls(postingsize, stringids=stringids)
        block.postfile = postfile

        delta = postfile.read_uint()
        block.nextoffset = postfile.tell() + delta
        info = postfile.read_pickle()
        block.dataoffset = postfile.tell()

        for key, value in zip(cls.infokeys, info):
            if key in ("minlength", "maxlength"):
                value = byte_to_length(value)
            setattr(block, key, value)

        return block

    def read_ids(self):
        offset = self.dataoffset
        self.postfile.seek(offset)
        idstring = self.postfile.read(self.idslen)
        ids = deminimize_ids(self.idcode, self.count, idstring,
                             self.compression)
        self.ids = ids
        return ids

    def read_weights(self):
        if self.weightslen == 0:
            weights = [1.0] * self.count
        else:
            offset = self.dataoffset + self.idslen
            self.postfile.seek(offset)
            wtstring = self.postfile.read(self.weightslen)
            weights = deminimize_weights(self.count, wtstring,
                                         self.compression)
        self.weights = weights
        return weights

    def read_values(self):
        postingsize = self.postingsize
        if postingsize == 0:
            values = [None] * self.count
        else:
            offset = self.dataoffset + self.idslen + self.weightslen
            self.postfile.seek(offset)
            vstring = self.postfile.read(self.nextoffset - offset)
            values = deminimize_values(postingsize, self.count, vstring,
                                       self.compression)
        self.values = values
        return values
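
# On-disk layout of a posting block, as written by W2Block.to_file() above
# (descriptive note, not part of the original module):
#
#   uint   number of bytes to the next block (info + ids + weights + values)
#   bytes  pickled info tuple, in infokeys order: (count, maxid, maxweight,
#          minlength byte, maxlength byte, idcode, compression, idslen,
#          weightslen)
#   bytes  minimized doc ids       (idslen bytes)
#   bytes  minimized weights       (weightslen bytes)
#   bytes  minimized values        (remaining bytes up to the next block)
#
# A complete posting list (see _start_blocklist() and add_vector_items()
# above) is the 4-byte magic "Blk3", a uint block count patched in after the
# fact, and then this structure repeated once per block.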