PageRenderTime 52ms CodeModel.GetById 24ms app.highlight 26ms RepoModel.GetById 0ms app.codeStats 0ms

/bangkokhotel/lib/python2.5/site-packages/whoosh/support/numlists.py

https://bitbucket.org/luisrodriguez/bangkokhotel
Python | 315 lines | 227 code | 62 blank | 26 comment | 37 complexity | 515e1b1705ae5671bd7cba174cffdd94 MD5 | raw file
  1from array import array
  2
  3from whoosh.compat import xrange
  4from whoosh.system import pack_byte, unpack_byte
  5from whoosh.system import pack_ushort_le, unpack_ushort_le
  6from whoosh.system import pack_uint_le, unpack_uint_le
  7
  8
  9def delta_encode(nums):
 10    base = 0
 11    for n in nums:
 12        yield n - base
 13        base = n
 14
 15
 16def delta_decode(nums):
 17    base = 0
 18    for n in nums:
 19        base += n
 20        yield base
 21
 22
 23class NumberEncoding(object):
 24    def write_deltas(self, f, numbers):
 25        return self.write_nums(f, delta_encode(numbers))
 26
 27    def read_deltas(self, f, n):
 28        return delta_decode(self.read_nums(f, n))
 29
 30    def get(self, f, pos, i):
 31        f.seek(pos)
 32        for n in self.read_nums(i + 1):
 33            pass
 34        return n
 35
 36
 37# Fixed width encodings
 38
 39class FixedEncoding(NumberEncoding):
 40    @classmethod
 41    def write_nums(cls, f, numbers):
 42        _encode = cls._encode
 43
 44        for n in numbers:
 45            f.write(_encode(n))
 46
 47    @classmethod
 48    def read_nums(cls, f, n):
 49        _decode = cls._decode
 50
 51        for _ in xrange(n):
 52            f.write(_decode(n))
 53
 54    @classmethod
 55    def get(cls, f, pos, i):
 56        f.seek(pos + i * cls.size)
 57        return cls._decode(f.read(cls.size))
 58
 59
 60class ByteEncoding(FixedEncoding):
 61    size = 1
 62    maxint = 255
 63    _encode = pack_byte
 64    _decode = unpack_byte
 65
 66
 67class UShortEncoding(FixedEncoding):
 68    size = 2
 69    maxint = 2 ** 16 - 1
 70    _encode = pack_ushort_le
 71    _decode = unpack_ushort_le
 72
 73
 74class UIntEncoding(FixedEncoding):
 75    size = 4
 76    maxint = 2 ** 32 - 1
 77    _encode = pack_uint_le
 78    _decode = unpack_uint_le
 79
 80
 81# High-bit encoded variable-length integer
 82
 83class Varints(NumberEncoding):
 84    maxint = None
 85
 86    @staticmethod
 87    def write_nums(f, numbers):
 88        for n in numbers:
 89            f.write_varint(n)
 90
 91    @staticmethod
 92    def read_nums(f, n):
 93        for _ in xrange(n):
 94            yield f.read_varint()
 95
 96
 97# Simple16 algorithm for storing arrays of positive integers (usually delta
 98# encoded lists of sorted integers)
 99#
100# 1. http://www2008.org/papers/pdf/p387-zhangA.pdf
101# 2. http://www2009.org/proceedings/pdf/p401.pdf
102
class Simple16(NumberEncoding):
    """Packs runs of small non-negative integers into 32-bit words.

    The top 4 bits of each word hold a "key" (0-15) and the remaining 28 bits
    hold the numbers. The key selects a row of ``_bits``, which gives the bit
    width of each number stored in the word.
    """

    # The maximum possible integer value Simple16 can encode is < 2^28.
    # Therefore, in order to use Simple16, the application must have its own
    # code to encode numbers in the range of [2^28, 2^32). A simple way is just
    # write those numbers as 32-bit integers (that is, no compression for very
    # big numbers).
    _numsize = 16
    _bitsize = 28
    maxint = 2 ** _bitsize - 1

    # Number of stored numbers per code
    _num = [28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1]
    # Number of bits for each number per code
    _bits = [
        (1,) * 28,
        (2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
        (1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1),
        (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
        (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
        (4, 3, 3, 3, 3, 3, 3, 3, 3),
        (3, 4, 4, 4, 4, 3, 3, 3),
        (4, 4, 4, 4, 4, 4, 4),
        (5, 5, 5, 5, 4, 4),
        (4, 4, 5, 5, 5, 5),
        (6, 6, 6, 5, 5),
        (5, 5, 6, 6, 6),
        (7, 7, 7, 7),
        (10, 9, 9),
        (14, 14),
        (28,),
    ]

    @classmethod
    def write_nums(cls, f, numbers):
        """Write ``numbers`` to ``f`` as a series of packed 32-bit words.

        ``numbers`` may be any iterable. Anything that is not a list, tuple
        or array is copied into a list first, because packing needs ``len()``
        and indexing (the inherited ``write_deltas`` passes a generator).

        :raises ValueError: if a number cannot be encoded (see ``_compress``).
        """
        if not isinstance(numbers, (list, tuple, array)):
            numbers = list(numbers)

        _compress = cls._compress
        total = len(numbers)

        i = 0
        while i < total:
            value, taken = _compress(numbers, i, total - i)
            f.write_uint_le(value)
            i += taken

    @classmethod
    def _compress(cls, inarray, inoffset, n):
        """Pack as many numbers as possible into one word.

        Tries each key in order and uses the first one whose bit widths fit
        the next numbers of ``inarray``, starting at ``inoffset``, where ``n``
        is the count of numbers remaining.

        :returns: a ``(word, count)`` tuple, where ``count`` is how many
            numbers were packed into ``word``.
        :raises ValueError: if no key fits, which means the number at
            ``inoffset`` is negative or larger than ``maxint``.
        """
        _bitsize = cls._bitsize
        _num = cls._num
        _bits = cls._bits

        for key in xrange(cls._numsize):
            widths = _bits[key]
            # The last word of a sequence may hold fewer numbers than the
            # key allows
            num = min(_num[key], n)
            value = key << _bitsize
            shift = 0

            j = 0
            while j < num:
                x = inarray[inoffset + j]
                width = widths[j]
                # Negative numbers would corrupt the neighbouring bit fields
                if x < 0 or x >= (1 << width):
                    break
                value |= x << shift
                shift += width
                j += 1

            if j == num:
                return value, num

        # Key 15 stores a single 28-bit number, so getting here means the
        # first number itself is out of range.
        raise ValueError("Simple16 can only store integers from 0 to %d, "
                         "got %r" % (cls.maxint, inarray[inoffset]))

    @classmethod
    def read_nums(cls, f, n):
        """Yield ``n`` numbers decoded from consecutive 32-bit words read
        from the current position of ``f``.
        """
        _decompress = cls._decompress

        i = 0
        while i < n:
            value = unpack_uint_le(f.read(4))[0]
            for v in _decompress(value, n - i):
                yield v
                i += 1

    @classmethod
    def _decompress(cls, value, n):
        """Yield the numbers packed in the word ``value``, stopping after at
        most ``n`` of them.
        """
        key = value >> cls._bitsize
        widths = cls._bits[key]
        num = min(cls._num[key], n)

        shift = 0
        for j in xrange(num):
            width = widths[j]
            yield (value >> shift) & ((1 << width) - 1)
            shift += width

    @classmethod
    def get(cls, f, pos, i):
        """Return the ``i``-th (0-based) number of the sequence whose first
        word is at file position ``pos``.

        Words are skipped using only their key, so only the word containing
        the wanted number is unpacked.

        :raises IndexError: if ``i`` is negative.
        """
        if i < 0:
            raise IndexError("index must not be negative: %r" % (i,))

        f.seek(pos)
        base = 0
        value = unpack_uint_le(f.read(4))[0]
        key = value >> cls._bitsize
        num = cls._num[key]
        # A word holds indices base .. base + num - 1, so move on whenever
        # i lies at or beyond base + num
        while i >= base + num:
            base += num
            value = unpack_uint_le(f.read(4))[0]
            key = value >> cls._bitsize
            num = cls._num[key]

        widths = cls._bits[key]
        offset = i - base
        if offset:
            value = value >> sum(widths[:offset])
        return value & (2 ** widths[offset] - 1)
212
213
214# Google Packed Ints algorithm: a set of four numbers is preceded by a "key"
215# byte, which encodes how many bytes each of the next four integers use
216# (stored in the byte as four 2-bit numbers)
217
class GInts(NumberEncoding):
    """Stores numbers in groups of up to four, each group preceded by a key
    byte.

    The key byte holds four 2-bit codes (lowest bits first); code ``c`` means
    the corresponding number is stored in ``c + 1`` bytes.
    """

    maxint = 2 ** 32 - 1

    # Number of future bytes to expect after a "key" byte value of N -- used to
    # skip ahead from a key byte
    _lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6,
    7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9,
    10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
    12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7,
    8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
    11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11,
    12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9,
    10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
    12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
    13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
    12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
    11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11,
    12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16])

    @staticmethod
    def key_to_sizes(key):
        """Returns a list of the sizes of the next four numbers given a key
        byte.
        """

        return [(key >> (i * 2) & 3) + 1 for i in xrange(4)]

    @classmethod
    def write_nums(cls, f, numbers):
        """Write ``numbers`` to ``f`` in groups of four, each group preceded
        by its key byte. A final group of fewer than four numbers is written
        the same way.
        """
        buf = array("B")
        count = 0
        key = 0
        for v in numbers:
            # Position of this number's 2-bit code inside the key byte
            shift = count * 2
            if v < 256:
                buf.append(v)
            elif v < 65536:
                key |= 1 << shift
                buf.extend(pack_ushort_le(v))
            elif v < 16777216:
                key |= 2 << shift
                # Keep only the first three bytes of the 4-byte packing
                buf.extend(pack_uint_le(v)[:3])
            else:
                key |= 3 << shift
                buf.extend(pack_uint_le(v))

            count += 1
            if count == 4:
                f.write_byte(key)
                f.write(buf)
                count = 0
                key = 0
                del buf[:]  # Clear the buffer

        # Write out leftovers in the buffer
        if count:
            f.write_byte(key)
            f.write(buf)

    @classmethod
    def read_nums(cls, f, n):
        """Read N integers from the bytes stream dbfile. Expects that the file
        starts at a key byte.
        """

        count = 0

        for _ in xrange(n):
            if count == 0:
                key = f.read_byte()
            code = key >> (count * 2) & 3
            if code == 0:
                yield f.read_byte()
            elif code == 1:
                yield f.read_ushort_le()
            elif code == 2:
                # Pad the three stored bytes to four before unpacking.
                # NOTE(review): the str literal assumes f.read() returns str;
                # confirm this if the module must also run where it returns
                # bytes.
                yield unpack_uint_le(f.read(3) + "\x00")[0]
            else:
                yield f.read_uint_le()

            count = (count + 1) % 4

    @classmethod
    def get(cls, f, pos, i):
        """Return the ``i``-th (0-based) number of the sequence whose first
        key byte is at file position ``pos``.

        Whole groups before the one containing ``i`` are skipped using the
        ``_lens`` table, then the remaining numbers are read sequentially.

        :raises IndexError: if ``i`` is negative.
        """
        if i < 0:
            raise IndexError("index must not be negative: %r" % (i,))

        f.seek(pos)
        base = 0
        # A group holds indices base .. base + 3. Skip groups until the one
        # containing i, leaving the file positioned ON that group's key
        # byte, which is where read_nums() expects to start.
        while i >= base + 4:
            key = f.read_byte()
            f.seek(cls._lens[key], 1)
            base += 4

        n = None
        for n in cls.read_nums(f, (i + 1) - base):
            pass
        return n
313
314
315