/bangkokhotel/lib/python2.5/site-packages/whoosh/codec/base.py

https://bitbucket.org/luisrodriguez/bangkokhotel

# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
  27. """
  28. This module contains base classes/interfaces for "codec" objects.
  29. """
  30. import random
  31. from array import array
  32. from struct import Struct, pack
  33. from bisect import bisect_right
  34. from whoosh.compat import (loads, dumps, b, bytes_type, string_type, xrange,
  35. array_frombytes, array_tobytes)
  36. from whoosh.filedb.compound import CompoundStorage
  37. from whoosh.matching import Matcher, ReadTooFar
  38. from whoosh.reading import TermInfo
  39. from whoosh.spans import Span
  40. from whoosh.system import (_INT_SIZE, _FLOAT_SIZE, pack_long, unpack_long,
  41. IS_LITTLE)
  42. from whoosh.util import byte_to_length, length_to_byte
  43. try:
  44. from zlib import compress, decompress
  45. can_compress = True
  46. except ImportError:
  47. can_compress = False

# Base classes

class Codec(object):
    # Per document value writer
    def per_document_writer(self, storage, segment):
        raise NotImplementedError

    # Inverted index writer
    def field_writer(self, storage, segment):
        raise NotImplementedError

    # Readers

    def terms_reader(self, storage, segment):
        raise NotImplementedError

    def lengths_reader(self, storage, segment):
        raise NotImplementedError

    def vector_reader(self, storage, segment):
        raise NotImplementedError

    def stored_fields_reader(self, storage, segment):
        raise NotImplementedError

    def graph_reader(self, storage, segment):
        raise NotImplementedError

    # Segments and generations

    def new_segment(self, storage, indexname):
        raise NotImplementedError

    def commit_toc(self, storage, indexname, schema, segments, generation):
        raise NotImplementedError
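
# Illustrative sketch (not part of the original module): a concrete codec
# simply hands back writer/reader objects wired to its on-disk format. The
# My* names below are hypothetical placeholders; the real implementation
# ships with Whoosh in the whoosh.codec package.
#
#   class MyCodec(Codec):
#       def per_document_writer(self, storage, segment):
#           return MyPerDocumentWriter(storage, segment)
#
#       def field_writer(self, storage, segment):
#           return MyFieldWriter(storage, segment)
#
#       def terms_reader(self, storage, segment):
#           return MyTermsReader(storage, segment)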

# Writer classes

class PerDocumentWriter(object):
    def start_doc(self, docnum):
        raise NotImplementedError

    def add_field(self, fieldname, fieldobj, value, length):
        raise NotImplementedError

    def add_vector_items(self, fieldname, fieldobj, items):
        raise NotImplementedError

    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
        def readitems():
            while vmatcher.is_active():
                text = vmatcher.id()
                weight = vmatcher.weight()
                valuestring = vmatcher.value()
                yield (text, None, weight, valuestring)
                vmatcher.next()
        self.add_vector_items(fieldname, fieldobj, readitems())

    def finish_doc(self):
        pass

    def lengths_reader(self):
        raise NotImplementedError

class FieldWriter(object):
    def add_postings(self, schema, lengths, items):
        start_field = self.start_field
        start_term = self.start_term
        add = self.add
        finish_term = self.finish_term
        finish_field = self.finish_field

        # items = (fieldname, text, docnum, weight, valuestring) ...
        lastfn = None
        lasttext = None
        dfl = lengths.doc_field_length
        for fieldname, text, docnum, weight, valuestring in items:
            # Items where docnum is None indicate words that should be added
            # to the spelling graph
            if docnum is None and (fieldname != lastfn or text != lasttext):
                self.add_spell_word(fieldname, text)
                lastfn = fieldname
                lasttext = text
                continue

            # This comparison is so convoluted because Python 3 removed the
            # ability to compare a string to None
            if ((lastfn is not None and fieldname < lastfn)
                or (fieldname == lastfn and lasttext is not None
                    and text < lasttext)):
                raise Exception("Postings are out of order: %r:%s .. %r:%s" %
                                (lastfn, lasttext, fieldname, text))

            if fieldname != lastfn or text != lasttext:
                if lasttext is not None:
                    finish_term()
                if fieldname != lastfn:
                    if lastfn is not None:
                        finish_field()
                    start_field(fieldname, schema[fieldname])
                    lastfn = fieldname
                start_term(text)
                lasttext = text
            length = dfl(docnum, fieldname)
            add(docnum, weight, valuestring, length)
        if lasttext is not None:
            finish_term()
            finish_field()

    def start_field(self, fieldname, fieldobj):
        raise NotImplementedError

    def start_term(self, text):
        raise NotImplementedError

    def add(self, docnum, weight, valuestring, length):
        raise NotImplementedError

    def add_spell_word(self, fieldname, text):
        raise NotImplementedError

    def finish_term(self):
        raise NotImplementedError

    def finish_field(self):
        pass

    def close(self):
        pass
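
# Illustrative sketch (not part of the original module): add_postings()
# consumes (fieldname, text, docnum, weight, valuestring) tuples that must
# already be sorted by field name, then term text, then document number.
# Tuples whose docnum is None are routed to add_spell_word() instead of the
# inverted index:
#
#   items = [("content", u"apple", 0, 1.5, ''),
#            ("content", u"apple", 4, 1.0, ''),
#            ("content", u"banana", 2, 1.0, ''),
#            ("title", u"apple", 0, 1.0, '')]
#   fwriter.add_postings(schema, lengths, iter(items))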

# Reader classes

class TermsReader(object):
    def __contains__(self, term):
        raise NotImplementedError

    def terms(self):
        raise NotImplementedError

    def terms_from(self, fieldname, prefix):
        raise NotImplementedError

    def items(self):
        raise NotImplementedError

    def items_from(self, fieldname, prefix):
        raise NotImplementedError

    def terminfo(self, fieldname, text):
        raise NotImplementedError

    def frequency(self, fieldname, text):
        return self.terminfo(fieldname, text).weight()

    def doc_frequency(self, fieldname, text):
        return self.terminfo(fieldname, text).doc_frequency()

    def graph_reader(self, fieldname, text):
        raise NotImplementedError

    def matcher(self, fieldname, text, format_, scorer=None):
        raise NotImplementedError

    def close(self):
        pass

class VectorReader(object):
    def __contains__(self, key):
        raise NotImplementedError

    def matcher(self, docnum, fieldname, format_):
        raise NotImplementedError


class LengthsReader(object):
    def doc_count_all(self):
        raise NotImplementedError

    def doc_field_length(self, docnum, fieldname, default=0):
        raise NotImplementedError

    def field_length(self, fieldname):
        raise NotImplementedError

    def min_field_length(self, fieldname):
        raise NotImplementedError

    def max_field_length(self, fieldname):
        raise NotImplementedError

    def close(self):
        pass

class MultiLengths(LengthsReader):
    def __init__(self, lengths, offset=0):
        self.lengths = []
        self.doc_offsets = []
        self._count = 0
        for lr in lengths:
            if lr.doc_count_all():
                self.lengths.append(lr)
                self.doc_offsets.append(self._count)
                self._count += lr.doc_count_all()
        self.is_closed = False

    def _document_reader(self, docnum):
        return max(0, bisect_right(self.doc_offsets, docnum) - 1)

    def _reader_and_docnum(self, docnum):
        lnum = self._document_reader(docnum)
        offset = self.doc_offsets[lnum]
        return lnum, docnum - offset

    def doc_count_all(self):
        return self._count

    def doc_field_length(self, docnum, fieldname, default=0):
        x, y = self._reader_and_docnum(docnum)
        return self.lengths[x].doc_field_length(y, fieldname, default=default)

    def min_field_length(self):
        return min(lr.min_field_length() for lr in self.lengths)

    def max_field_length(self):
        return max(lr.max_field_length() for lr in self.lengths)

    def close(self):
        for lr in self.lengths:
            lr.close()
        self.is_closed = True
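
# Illustrative sketch (not part of the original module): how the bisect-based
# mapping above resolves a global document number. With sub-readers holding
# 10, 15 and 5 documents, doc_offsets is [0, 10, 25], so global docnum 12
# belongs to reader 1 as local docnum 2:
#
#   >>> from bisect import bisect_right
#   >>> bisect_right([0, 10, 25], 12) - 1
#   1
#   >>> 12 - [0, 10, 25][1]
#   2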

class StoredFieldsReader(object):
    def __iter__(self):
        raise NotImplementedError

    def __getitem__(self, docnum):
        raise NotImplementedError

    def cell(self, docnum, fieldname):
        # Relies on a get() method supplied by the concrete subclass
        fielddict = self.get(docnum)
        return fielddict.get(fieldname)

    def column(self, fieldname):
        for fielddict in self:
            yield fielddict.get(fieldname)

    def close(self):
        pass

# File posting matcher middleware

class FilePostingMatcher(Matcher):
    # Subclasses need to set
    #   self._term -- (fieldname, text) or None
    #   self.scorer -- a Scorer object or None
    #   self.format -- Format object for the posting values

    def __repr__(self):
        return "%s(%r, %r, %s)" % (self.__class__.__name__,
                                   str(self.postfile), self.term(),
                                   self.is_active())

    def term(self):
        return self._term

    def items_as(self, astype):
        decoder = self.format.decoder(astype)
        for id, value in self.all_items():
            yield (id, decoder(value))

    def supports(self, astype):
        return self.format.supports(astype)

    def value_as(self, astype):
        decoder = self.format.decoder(astype)
        return decoder(self.value())

    def spans(self):
        if self.supports("characters"):
            return [Span(pos, startchar=startchar, endchar=endchar)
                    for pos, startchar, endchar
                    in self.value_as("characters")]
        elif self.supports("positions"):
            return [Span(pos) for pos in self.value_as("positions")]
        else:
            raise Exception("Field does not support positions (%r)"
                            % self._term)

    def supports_block_quality(self):
        return self.scorer and self.scorer.supports_block_quality()

    def max_quality(self):
        # Call the scorer's max_quality() method rather than returning the
        # bound method object itself
        return self.scorer.max_quality()

    def block_quality(self):
        return self.scorer.block_quality(self)
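
# Illustrative sketch (not part of the original module): spans() adapts the
# stored posting value into Span objects, so for a matcher on a field whose
# format supports "characters":
#
#   >>> m.value_as("characters")
#   [(0, 0, 5), (3, 17, 22)]
#   >>> [(s.start, s.startchar, s.endchar) for s in m.spans()]
#   [(0, 0, 5), (3, 17, 22)]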

class BlockPostingMatcher(FilePostingMatcher):
    # Subclasses need to set
    #   self.block -- BlockBase object for the current block
    #   self.i -- Numerical index to the current place in the block
    # And implement
    #   _read_block()
    #   _next_block()
    #   _skip_to_block()

    def id(self):
        return self.block.ids[self.i]

    def weight(self):
        weights = self.block.weights
        if not weights:
            weights = self.block.read_weights()
        return weights[self.i]

    def value(self):
        values = self.block.values
        if values is None:
            values = self.block.read_values()
        return values[self.i]

    def all_ids(self):
        nextoffset = self.baseoffset
        for _ in xrange(self.blockcount):
            block = self._read_block(nextoffset)
            nextoffset = block.nextoffset
            ids = block.read_ids()
            for id in ids:
                yield id

    def next(self):
        if self.i == self.block.count - 1:
            self._next_block()
            return True
        else:
            self.i += 1
            return False

    def skip_to(self, id):
        if not self.is_active():
            raise ReadTooFar

        i = self.i
        # If we're already in the block with the target ID, do nothing
        if id <= self.block.ids[i]:
            return

        # Skip to the block that would contain the target ID
        if id > self.block.maxid:
            self._skip_to_block(lambda: id > self.block.maxid)
        if not self.is_active():
            return

        # Iterate through the IDs in the block until we find or pass the
        # target
        ids = self.block.ids
        i = self.i
        while ids[i] < id:
            i += 1
            if i == len(ids):
                self._active = False
                return
        self.i = i

    def skip_to_quality(self, minquality):
        bq = self.block_quality
        if bq() > minquality:
            return 0
        return self._skip_to_block(lambda: bq() <= minquality)

    def block_min_length(self):
        return self.block.min_length()

    def block_max_length(self):
        return self.block.max_length()

    def block_max_weight(self):
        return self.block.max_weight()

    def block_max_wol(self):
        return self.block.max_wol()
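
# Illustrative sketch (not part of the original module): the block_* methods
# let a caller skip whole blocks that cannot contain a competitive hit,
# along the lines of:
#
#   if m.supports_block_quality() and m.block_quality() <= minquality:
#       m.skip_to_quality(minquality)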

# File TermInfo

NO_ID = 0xffffffff


class FileTermInfo(TermInfo):
    # Freq, Doc freq, min len, max length, max weight, unused, min ID, max ID
    struct = Struct("!fIBBffII")

    def __init__(self, *args, **kwargs):
        self.postings = None
        if "postings" in kwargs:
            self.postings = kwargs["postings"]
            del kwargs["postings"]
        TermInfo.__init__(self, *args, **kwargs)

    # filedb specific methods

    def add_block(self, block):
        self._weight += sum(block.weights)
        self._df += len(block)

        ml = block.min_length()
        if self._minlength is None:
            self._minlength = ml
        else:
            self._minlength = min(self._minlength, ml)

        self._maxlength = max(self._maxlength, block.max_length())
        self._maxweight = max(self._maxweight, block.max_weight())
        if self._minid is None:
            self._minid = block.ids[0]
        self._maxid = block.ids[-1]

    def to_string(self):
        # Encode the lengths as 0-255 values
        ml = 0 if self._minlength is None else length_to_byte(self._minlength)
        xl = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band NO_ID constant so they can be
        # stored as unsigned ints
        mid = NO_ID if self._minid is None else self._minid
        xid = NO_ID if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight,
                              0, mid, xid)

        if isinstance(self.postings, tuple):
            # Postings are inlined - dump them using the pickle protocol
            isinlined = 1
            st += dumps(self.postings, -1)[2:-1]
        else:
            # Append postings pointer as long to end of term info bytes
            isinlined = 0
            # It's possible for a term info to not have a pointer to postings
            # on disk, in which case postings will be None. Convert a None
            # value to -1 so it can be stored as a long.
            p = -1 if self.postings is None else self.postings
            st += pack_long(p)

        # Prepend byte indicating whether the postings are inlined to the term
        # info bytes
        return pack("B", isinlined) + st

    @classmethod
    def from_string(cls, s):
        assert isinstance(s, bytes_type)

        if isinstance(s, string_type):
            hbyte = ord(s[0])  # Python 2.x - str
        else:
            hbyte = s[0]  # Python 3 - bytes

        if hbyte < 2:
            st = cls.struct
            # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID
            w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1])
            mid = None if mid == NO_ID else mid
            xid = None if xid == NO_ID else xid
            # Postings
            pstr = s[st.size + 1:]
            if hbyte == 0:
                p = unpack_long(pstr)[0]
            else:
                p = loads(pstr + b("."))
        else:
            # Old format was encoded as a variable length pickled tuple
            v = loads(s + b("."))
            if len(v) == 1:
                w = df = 1
                p = v[0]
            elif len(v) == 2:
                w = df = v[1]
                p = v[0]
            else:
                w, p, df = v
            # Fake values for stats which weren't stored before
            ml = 1
            xl = 255
            xw = 999999999
            mid = -1
            xid = -1

        ml = byte_to_length(ml)
        xl = byte_to_length(xl)
        obj = cls(w, df, ml, xl, xw, mid, xid)
        obj.postings = p
        return obj

    @classmethod
    def read_weight(cls, dbfile, datapos):
        return dbfile.get_float(datapos + 1)

    @classmethod
    def read_doc_freq(cls, dbfile, datapos):
        return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE)

    @classmethod
    def read_min_and_max_length(cls, dbfile, datapos):
        lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
        ml = byte_to_length(dbfile.get_byte(lenpos))
        xl = byte_to_length(dbfile.get_byte(lenpos + 1))
        return ml, xl

    @classmethod
    def read_max_weight(cls, dbfile, datapos):
        weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2
        return dbfile.get_float(weightspos)
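
# Illustrative sketch (not part of the original module): the byte layout
# implied by the flag byte plus the "!fIBBffII" struct above, which the
# read_* classmethods use to seek straight to single fields (assuming the
# usual 4-byte float/int sizes):
#
#   +0   inlined flag (B): 0 = postings pointer, 1 = inlined postings
#   +1   weight (f)
#   +5   doc frequency (I)
#   +9   min length byte (B)
#   +10  max length byte (B)
#   +11  max weight (f)
#   +15  unused float (f), then min ID (I) and max ID (I)
#   ...  then either pickled postings or a packed long postings pointer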

# Segment base class

class Segment(object):
    """Do not instantiate this object directly. It is used by the Index
    object to hold information about a segment. A list of objects of this
    class is pickled as part of the TOC file.

    The TOC file stores a minimal amount of information -- mostly a list of
    Segment objects. Segments are the real reverse indexes. Having multiple
    segments allows quick incremental indexing: just create a new segment
    for the new documents, and have the index overlay the new segment over
    previous ones for purposes of reading/search. "Optimizing" the index
    combines the contents of existing segments into one (removing any
    deleted documents along the way).
    """

    # These must be valid separate characters in CASE-INSENSITIVE filenames
    IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
    # Extension for compound segment files
    COMPOUND_EXT = ".seg"

    # self.indexname
    # self.segid

    @classmethod
    def _random_id(cls, size=12):
        return "".join(random.choice(cls.IDCHARS) for _ in xrange(size))

    def __repr__(self):
        return "<%s %s>" % (self.__class__.__name__,
                            getattr(self, "segid", ""))

    def codec(self):
        raise NotImplementedError

    def segment_id(self):
        if hasattr(self, "name"):
            # Old segment class
            return self.name
        else:
            return "%s_%s" % (self.indexname, self.segid)

    def is_compound(self):
        if not hasattr(self, "compound"):
            return False
        return self.compound

    # File convenience methods

    def make_filename(self, ext):
        return "%s%s" % (self.segment_id(), ext)

    def list_files(self, storage):
        prefix = "%s." % self.segment_id()
        return [name for name in storage.list() if name.startswith(prefix)]

    def create_file(self, storage, ext, **kwargs):
        """Convenience method to create a new file in the given storage named
        with this segment's ID and the given extension. Any keyword arguments
        are passed to the storage's create_file method.
        """
        fname = self.make_filename(ext)
        return storage.create_file(fname, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        """Convenience method to open a file in the given storage named with
        this segment's ID and the given extension. Any keyword arguments are
        passed to the storage's open_file method.
        """
        fname = self.make_filename(ext)
        return storage.open_file(fname, **kwargs)

    def create_compound_file(self, storage):
        segfiles = self.list_files(storage)
        assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles)
        cfile = self.create_file(storage, self.COMPOUND_EXT)
        CompoundStorage.assemble(cfile, storage, segfiles)
        for name in segfiles:
            storage.delete_file(name)

    def open_compound_file(self, storage):
        name = self.make_filename(self.COMPOUND_EXT)
        return CompoundStorage(storage, name)

    # Abstract methods dealing with document counts and deletions

    def doc_count_all(self):
        """
        Returns the total number of documents, DELETED OR UNDELETED, in this
        segment.
        """
        raise NotImplementedError

    def doc_count(self):
        """
        :returns: the number of (undeleted) documents in this segment.
        """
        raise NotImplementedError

    def has_deletions(self):
        """
        :returns: True if any documents in this segment are deleted.
        """
        raise NotImplementedError

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in this segment.
        """
        raise NotImplementedError

    def delete_document(self, docnum, delete=True):
        """Deletes the given document number. The document is not actually
        removed from the index until it is optimized.

        :param docnum: The document number to delete.
        :param delete: If False, this undeletes a deleted document.
        """
        raise NotImplementedError

    def is_deleted(self, docnum):
        """:returns: True if the given document number is deleted."""
        raise NotImplementedError
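
# Illustrative sketch (not part of the original module): segment files are
# named from the segment ID plus an extension, so an index named "MAIN" with
# the random segid "a1b2c3d4e5f6" keeps its files under names such as
# "MAIN_a1b2c3d4e5f6.xyz" (the ".xyz" extension here is hypothetical), and
# create_compound_file() folds them all into "MAIN_a1b2c3d4e5f6.seg".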

# Posting block format

class BlockBase(object):
    def __init__(self, postingsize, stringids=False):
        self.postingsize = postingsize
        self.stringids = stringids
        self.ids = [] if stringids else array("I")
        self.weights = array("f")
        self.values = None

        self.minlength = None
        self.maxlength = 0
        self.maxweight = 0

    def __len__(self):
        return len(self.ids)

    def __nonzero__(self):
        return bool(self.ids)

    def min_id(self):
        if self.ids:
            return self.ids[0]
        else:
            raise IndexError

    def max_id(self):
        if self.ids:
            return self.ids[-1]
        else:
            raise IndexError

    def min_length(self):
        return self.minlength

    def max_length(self):
        return self.maxlength

    def max_weight(self):
        return self.maxweight

    def add(self, id_, weight, valuestring, length=None):
        self.ids.append(id_)
        self.weights.append(weight)
        if weight > self.maxweight:
            self.maxweight = weight
        if valuestring:
            if self.values is None:
                self.values = []
            self.values.append(valuestring)
        if length:
            if self.minlength is None or length < self.minlength:
                self.minlength = length
            if length > self.maxlength:
                self.maxlength = length

    def to_file(self, postfile):
        raise NotImplementedError
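
def _example_blockbase():
    # Illustrative sketch (not part of the original module): a block
    # accumulates postings and tracks per-block statistics as it goes.
    block = BlockBase(postingsize=0)
    block.add(1, 2.0, '', length=5)
    block.add(7, 1.0, '', length=3)
    assert (block.min_id(), block.max_id()) == (1, 7)
    assert block.max_weight() == 2.0
    assert (block.min_length(), block.max_length()) == (3, 5)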

# Utility functions

def minimize_ids(arry, stringids, compression=0):
    amax = arry[-1]

    if stringids:
        typecode = ''
        string = dumps(arry)
    else:
        typecode = arry.typecode
        if amax <= 255:
            typecode = "B"
        elif amax <= 65535:
            typecode = "H"
        if typecode != arry.typecode:
            arry = array(typecode, iter(arry))
        if not IS_LITTLE:
            arry.byteswap()
        string = array_tobytes(arry)
    if compression:
        string = compress(string, compression)
    return (typecode, string)


def deminimize_ids(typecode, count, string, compression=0):
    if compression:
        string = decompress(string)
    if typecode == '':
        return loads(string)
    else:
        arry = array(typecode)
        array_frombytes(arry, string)
        if not IS_LITTLE:
            arry.byteswap()
        return arry
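
def _example_minimize_ids():
    # Illustrative sketch (not part of the original module): ID arrays whose
    # maximum value fits in a smaller typecode are downcast before being
    # serialized, and deminimize_ids() reverses the process.
    ids = array("I", [1, 5, 200])
    typecode, string = minimize_ids(ids, stringids=False)
    assert typecode == "B"  # every ID fits in a single byte
    assert list(deminimize_ids(typecode, len(ids), string)) == [1, 5, 200]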

def minimize_weights(weights, compression=0):
    if all(w == 1.0 for w in weights):
        string = b("")
    else:
        if not IS_LITTLE:
            weights.byteswap()
        string = array_tobytes(weights)
    if string and compression:
        string = compress(string, compression)
    return string


def deminimize_weights(count, string, compression=0):
    if not string:
        return array("f", (1.0 for _ in xrange(count)))
    if compression:
        string = decompress(string)
    arry = array("f")
    array_frombytes(arry, string)
    if not IS_LITTLE:
        arry.byteswap()
    return arry
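
def _example_minimize_weights():
    # Illustrative sketch (not part of the original module): the common case
    # of all-1.0 weights is stored as an empty string and regenerated from
    # the posting count on read.
    string = minimize_weights(array("f", [1.0, 1.0, 1.0]))
    assert string == b("")
    assert list(deminimize_weights(3, string)) == [1.0, 1.0, 1.0]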

def minimize_values(postingsize, values, compression=0):
    if postingsize < 0:
        string = dumps(values, -1)[2:]
    elif postingsize == 0:
        string = b('')
    else:
        string = b('').join(values)
    if string and compression:
        string = compress(string, compression)
    return string


def deminimize_values(postingsize, count, string, compression=0):
    if compression:
        string = decompress(string)

    if postingsize < 0:
        return loads(string)
    elif postingsize == 0:
        return [None] * count
    else:
        return [string[i:i + postingsize]
                for i in xrange(0, len(string), postingsize)]
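
def _example_minimize_values():
    # Illustrative sketch (not part of the original module): fixed-size
    # posting values are simply concatenated and re-split on the fixed
    # width; postingsize == 0 means "no value" and postingsize < 0 means
    # pickled variable-size values.
    values = [b("ab"), b("cd"), b("ef")]
    string = minimize_values(2, values)
    assert deminimize_values(2, 3, string) == values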