/bangkokhotel/lib/python2.5/site-packages/whoosh/support/numlists.py

https://bitbucket.org/luisrodriguez/bangkokhotel · Python · 315 lines · 227 code · 62 blank · 26 comment · 37 complexity · 515e1b1705ae5671bd7cba174cffdd94 MD5 · raw file

  1. from array import array
  2. from whoosh.compat import xrange
  3. from whoosh.system import pack_byte, unpack_byte
  4. from whoosh.system import pack_ushort_le, unpack_ushort_le
  5. from whoosh.system import pack_uint_le, unpack_uint_le
  6. def delta_encode(nums):
  7. base = 0
  8. for n in nums:
  9. yield n - base
  10. base = n
  11. def delta_decode(nums):
  12. base = 0
  13. for n in nums:
  14. base += n
  15. yield base
  16. class NumberEncoding(object):
  17. def write_deltas(self, f, numbers):
  18. return self.write_nums(f, delta_encode(numbers))
  19. def read_deltas(self, f, n):
  20. return delta_decode(self.read_nums(f, n))
  21. def get(self, f, pos, i):
  22. f.seek(pos)
  23. for n in self.read_nums(i + 1):
  24. pass
  25. return n
  26. # Fixed width encodings
  27. class FixedEncoding(NumberEncoding):
  28. @classmethod
  29. def write_nums(cls, f, numbers):
  30. _encode = cls._encode
  31. for n in numbers:
  32. f.write(_encode(n))
  33. @classmethod
  34. def read_nums(cls, f, n):
  35. _decode = cls._decode
  36. for _ in xrange(n):
  37. f.write(_decode(n))
  38. @classmethod
  39. def get(cls, f, pos, i):
  40. f.seek(pos + i * cls.size)
  41. return cls._decode(f.read(cls.size))
  42. class ByteEncoding(FixedEncoding):
  43. size = 1
  44. maxint = 255
  45. _encode = pack_byte
  46. _decode = unpack_byte
  47. class UShortEncoding(FixedEncoding):
  48. size = 2
  49. maxint = 2 ** 16 - 1
  50. _encode = pack_ushort_le
  51. _decode = unpack_ushort_le
  52. class UIntEncoding(FixedEncoding):
  53. size = 4
  54. maxint = 2 ** 32 - 1
  55. _encode = pack_uint_le
  56. _decode = unpack_uint_le
  57. # High-bit encoded variable-length integer
  58. class Varints(NumberEncoding):
  59. maxint = None
  60. @staticmethod
  61. def write_nums(f, numbers):
  62. for n in numbers:
  63. f.write_varint(n)
  64. @staticmethod
  65. def read_nums(f, n):
  66. for _ in xrange(n):
  67. yield f.read_varint()
  68. # Simple16 algorithm for storing arrays of positive integers (usually delta
  69. # encoded lists of sorted integers)
  70. #
  71. # 1. http://www2008.org/papers/pdf/p387-zhangA.pdf
  72. # 2. http://www2009.org/proceedings/pdf/p401.pdf
  73. class Simple16(NumberEncoding):
  74. # The maximum possible integer value Simple16 can encode is < 2^28.
  75. # Therefore, in order to use Simple16, the application must have its own
  76. # code to encode numbers in the range of [2^28, 2^32). A simple way is just
  77. # write those numbers as 32-bit integers (that is, no compression for very
  78. # big numbers).
  79. _numsize = 16
  80. _bitsize = 28
  81. maxint = 2 ** _bitsize - 1
  82. # Number of stored numbers per code
  83. _num = [28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1]
  84. # Number of bits for each number per code
  85. _bits = [
  86. (1,) * 28,
  87. (2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
  88. (1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1),
  89. (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
  90. (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
  91. (4, 3, 3, 3, 3, 3, 3, 3, 3),
  92. (3, 4, 4, 4, 4, 3, 3, 3),
  93. (4, 4, 4, 4, 4, 4, 4),
  94. (5, 5, 5, 5, 4, 4),
  95. (4, 4, 5, 5, 5, 5),
  96. (6, 6, 6, 5, 5),
  97. (5, 5, 6, 6, 6),
  98. (7, 7, 7, 7),
  99. (10, 9, 9),
  100. (14, 14),
  101. (28,),
  102. ]
  103. @classmethod
  104. def write_nums(cls, f, numbers):
  105. _compress = cls._compress
  106. i = 0
  107. while i < len(numbers):
  108. value, taken = _compress(numbers, i, len(numbers) - i)
  109. f.write_uint_le(value)
  110. i += taken
  111. @classmethod
  112. def _compress(cls, inarray, inoffset, n):
  113. _numsize = cls._numsize
  114. _bitsize = cls._bitsize
  115. _num = cls._num
  116. _bits = cls._bits
  117. for key in xrange(_numsize):
  118. value = key << _bitsize
  119. num = _num[key] if _num[key] < n else n
  120. bits = 0
  121. j = 0
  122. while j < num and inarray[inoffset + j] < (1 << _bits[key][j]):
  123. x = inarray[inoffset + j]
  124. value |= x << bits
  125. bits += _bits[key][j]
  126. j += 1
  127. if j == num:
  128. return value, num
  129. raise Exception
  130. @classmethod
  131. def read_nums(cls, f, n):
  132. _decompress = cls._decompress
  133. i = 0
  134. while i < n:
  135. value = unpack_uint_le(f.read(4))[0]
  136. for v in _decompress(value, n - i):
  137. yield v
  138. i += 1
  139. @classmethod
  140. def _decompress(cls, value, n):
  141. _numsize = cls._numsize
  142. _bitsize = cls._bitsize
  143. _num = cls._num
  144. _bits = cls._bits
  145. key = value >> _bitsize
  146. num = _num[key] if _num[key] < n else n
  147. bits = 0
  148. for j in xrange(num):
  149. v = value >> bits
  150. yield v & (0xffffffff >> (32 - _bits[key][j]))
  151. bits += _bits[key][j]
  152. @classmethod
  153. def get(cls, f, pos, i):
  154. f.seek(pos)
  155. base = 0
  156. value = unpack_uint_le(f.read(4))
  157. key = value >> cls._bitsize
  158. num = cls._num[key]
  159. while i > base + num:
  160. base += num
  161. value = unpack_uint_le(f.read(4))
  162. key = value >> cls._bitsize
  163. num = cls._num[key]
  164. offset = i - base
  165. if offset:
  166. value = value >> sum(cls._bits[key][:offset])
  167. return value & (2 ** cls._bits[key][offset] - 1)
  168. # Google Packed Ints algorithm: a set of four numbers is preceded by a "key"
  169. # byte, which encodes how many bytes each of the next four integers use
  170. # (stored in the byte as four 2-bit numbers)
  171. class GInts(NumberEncoding):
  172. maxint = 2 ** 32 - 1
  173. # Number of future bytes to expect after a "key" byte value of N -- used to
  174. # skip ahead from a key byte
  175. _lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6,
  176. 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9,
  177. 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
  178. 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7,
  179. 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
  180. 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11,
  181. 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9,
  182. 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
  183. 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
  184. 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
  185. 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
  186. 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11,
  187. 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16])
  188. @staticmethod
  189. def key_to_sizes(key):
  190. """Returns a list of the sizes of the next four numbers given a key
  191. byte.
  192. """
  193. return [(key >> (i * 2) & 3) + 1 for i in xrange(4)]
  194. @classmethod
  195. def write_nums(cls, f, numbers):
  196. buf = array("B")
  197. count = 0
  198. key = 0
  199. for v in numbers:
  200. shift = count * 2
  201. if v < 256:
  202. buf.append(v)
  203. elif v < 65536:
  204. key |= 1 << shift
  205. buf.extend(pack_ushort_le(v))
  206. elif v < 16777216:
  207. key |= 2 << shift
  208. buf.extend(pack_uint_le(v)[:3])
  209. else:
  210. key |= 3 << shift
  211. buf.extend(pack_uint_le(v))
  212. count += 1
  213. if count == 4:
  214. f.write_byte(key)
  215. f.write(buf)
  216. count = 0
  217. key = 0
  218. del buf[:] # Clear the buffer
  219. # Write out leftovers in the buffer
  220. if count:
  221. f.write_byte(key)
  222. f.write(buf)
  223. @classmethod
  224. def read_nums(cls, f, n):
  225. """Read N integers from the bytes stream dbfile. Expects that the file
  226. starts at a key byte.
  227. """
  228. count = 0
  229. for _ in xrange(n):
  230. if count == 0:
  231. key = f.read_byte()
  232. code = key >> (count * 2) & 3
  233. if code == 0:
  234. yield f.read_byte()
  235. elif code == 1:
  236. yield f.read_ushort_le()
  237. elif code == 2:
  238. yield unpack_uint_le(f.read(3) + "\x00")[0]
  239. else:
  240. yield f.read_uint_le()
  241. count = (count + 1) % 4
  242. @classmethod
  243. def get(cls, f, pos, i):
  244. f.seek(pos)
  245. base = 0
  246. key = f.read_byte()
  247. while i > base + 4:
  248. base += 4
  249. f.seek(cls._lens[key], 1)
  250. key = f.read_byte()
  251. for n in cls.read_nums((i + 1) - base):
  252. pass
  253. return n