PageRenderTime 55ms CodeModel.GetById 18ms app.highlight 33ms RepoModel.GetById 1ms app.codeStats 0ms

/bangkokhotel/lib/python2.5/site-packages/whoosh/filedb/filetables.py

https://bitbucket.org/luisrodriguez/bangkokhotel
Python | 548 lines | 488 code | 1 blank | 59 comment | 0 complexity | 08d30a00bb22cb741b7afcf3d5a2375a MD5 | raw file
  1# Copyright 2009 Matt Chaput. All rights reserved.
  2#
  3# Redistribution and use in source and binary forms, with or without
  4# modification, are permitted provided that the following conditions are met:
  5#
  6#    1. Redistributions of source code must retain the above copyright notice,
  7#       this list of conditions and the following disclaimer.
  8#
  9#    2. Redistributions in binary form must reproduce the above copyright
 10#       notice, this list of conditions and the following disclaimer in the
 11#       documentation and/or other materials provided with the distribution.
 12#
 13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
 14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 23#
 24# The views and conclusions contained in the software and documentation are
 25# those of the authors and should not be interpreted as representing official
 26# policies, either expressed or implied, of Matt Chaput.
 27
 28"""This module defines writer and reader classes for a fast, immutable
 29on-disk key-value database format. The current format is based heavily on
 30D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
 31"""
 32
 33from binascii import crc32
 34from collections import defaultdict
 35from hashlib import md5  #@UnresolvedImport
 36from struct import Struct
 37
 38from whoosh.compat import long_type, xrange, b, bytes_type
 39from whoosh.system import _INT_SIZE, _LONG_SIZE
 40
 41
_4GB = 4 * 1024 * 1024 * 1024  # 2 ** 32; offsets at or above this need more than 32 bits
 43
 44
 45def cdb_hash(key):
 46    h = long_type(5381)
 47    for c in key:
 48        h = (h + (h << 5)) & 0xffffffff ^ ord(c)
 49    return h
 50
 51
 52def md5_hash(key):
 53    return int(md5(key).hexdigest(), 16) & 0xffffffff
 54
 55
 56def crc_hash(key):
 57    return crc32(key) & 0xffffffff
 58
 59
# Key-hash functions selectable by the "hashtype" byte stored in the header.
# NOTE(review): index 0 is the builtin hash(), which is not stable across
# interpreter runs or builds -- presumably only usable in-process; confirm
# before relying on hashtype 0 for persisted files.
hash_functions = (hash, cdb_hash, md5_hash, crc_hash)

_header_entry_struct = Struct("!qI")  # Position, number of slots
header_entry_size = _header_entry_struct.size
pack_header_entry = _header_entry_struct.pack
unpack_header_entry = _header_entry_struct.unpack

_lengths_struct = Struct("!II")  # Length of key, length of data
lengths_size = _lengths_struct.size
pack_lengths = _lengths_struct.pack
unpack_lengths = _lengths_struct.unpack
 71
 72
 73# Table classes
 74
 75class HashWriter(object):
 76    def __init__(self, dbfile, format=1, hashtype=2):
 77        self.dbfile = dbfile
 78        self.format = format
 79        self.hashtype = hashtype
 80
 81        if format:
 82            dbfile.write(b("HASH"))
 83            self.header_size = 16 + 256 * header_entry_size
 84            _pointer_struct = Struct("!Iq")  # Hash value, position
 85        else:
 86            # Old format
 87            self.header_size = 256 * header_entry_size
 88            _pointer_struct = Struct("!qq")  # Hash value, position
 89            self.hashtype = 0
 90
 91        self.hash_func = hash_functions[self.hashtype]
 92        self.pointer_size = _pointer_struct.size
 93        self.pack_pointer = _pointer_struct.pack
 94
 95        # Seek past the first "header_size" bytes of the file... we'll come
 96        # back here to write the header later
 97        dbfile.seek(self.header_size)
 98        # Store the directory of hashed values
 99        self.hashes = defaultdict(list)
100
101    def add_all(self, items):
102        dbfile = self.dbfile
103        hash_func = self.hash_func
104        hashes = self.hashes
105        pos = dbfile.tell()
106        write = dbfile.write
107
108        for key, value in items:
109            if not isinstance(key, bytes_type):
110                raise TypeError("Key %r should be bytes" % key)
111            if not isinstance(value, bytes_type):
112                raise TypeError("Value %r should be bytes" % value)
113            write(pack_lengths(len(key), len(value)))
114            write(key)
115            write(value)
116
117            h = hash_func(key)
118            hashes[h & 255].append((h, pos))
119            pos += lengths_size + len(key) + len(value)
120
121    def add(self, key, value):
122        self.add_all(((key, value),))
123
124    def _write_hashes(self):
125        dbfile = self.dbfile
126        hashes = self.hashes
127        directory = self.directory = []
128
129        pos = dbfile.tell()
130        for i in xrange(0, 256):
131            entries = hashes[i]
132            numslots = 2 * len(entries)
133            directory.append((pos, numslots))
134
135            null = (0, 0)
136            hashtable = [null] * numslots
137            for hashval, position in entries:
138                n = (hashval >> 8) % numslots
139                while hashtable[n] != null:
140                    n = (n + 1) % numslots
141                hashtable[n] = (hashval, position)
142
143            write = dbfile.write
144            for hashval, position in hashtable:
145                write(self.pack_pointer(hashval, position))
146                pos += self.pointer_size
147
148        dbfile.flush()
149        self._end_of_hashes = dbfile.tell()
150
151    def _write_directory(self):
152        dbfile = self.dbfile
153        directory = self.directory
154
155        dbfile.seek(4)
156        if self.format:
157            dbfile.write_byte(self.hashtype)
158            dbfile.write(b("\x00\x00\x00"))  # Unused
159            dbfile.write_long(self._end_of_hashes)
160
161        for position, numslots in directory:
162            dbfile.write(pack_header_entry(position, numslots))
163
164        dbfile.flush()
165        assert dbfile.tell() == self.header_size
166
167    def close(self):
168        self._write_hashes()
169        self._write_directory()
170        self.dbfile.close()
171
172
class HashReader(object):
    """Reader for the on-disk hash file format written by HashWriter."""

    def __init__(self, dbfile):
        """
        :param dbfile: a file-like object (a whoosh StructFile) open for
            reading.
        """
        self.dbfile = dbfile

        dbfile.seek(0)
        magic = dbfile.read(4)
        if magic == b("HASH"):
            self.format = 1
            self.header_size = 16 + 256 * header_entry_size
            _pointer_struct = Struct("!Iq")  # Hash value, position
            self.hashtype = dbfile.read_byte()
            dbfile.read(3)  # Unused
            self._end_of_hashes = dbfile.read_long()
            assert self._end_of_hashes >= self.header_size
        else:
            # Old format: no magic, no extended header, 64-bit hash values
            self.format = self.hashtype = 0
            self.header_size = 256 * header_entry_size
            _pointer_struct = Struct("!qq")  # Hash value, position

        self.hash_func = hash_functions[self.hashtype]
        # Directory: 256 (hash table position, slot count) pairs
        self.buckets = []
        for _ in xrange(256):
            he = unpack_header_entry(dbfile.read(header_entry_size))
            self.buckets.append(he)
        # The first hash table starts where the records end
        self._start_of_hashes = self.buckets[0][0]

        self.pointer_size = _pointer_struct.size
        self.unpack_pointer = _pointer_struct.unpack

        self.is_closed = False

    def close(self):
        """Close the underlying file.

        :raises Exception: if the reader is already closed.
        """
        if self.is_closed:
            raise Exception("Tried to close %r twice" % self)
        self.dbfile.close()
        self.is_closed = True

    def read(self, position, length):
        """Return ``length`` bytes read at absolute file ``position``."""
        self.dbfile.seek(position)
        return self.dbfile.read(length)

    def _ranges(self, pos=None):
        # Yield (keypos, keylen, datapos, datalen) for each record in file
        # order, starting at *pos* (default: the first record)
        if pos is None:
            pos = self.header_size
        eod = self._start_of_hashes
        read = self.read
        while pos < eod:
            keylen, datalen = unpack_lengths(read(pos, lengths_size))
            keypos = pos + lengths_size
            datapos = pos + lengths_size + keylen
            pos = datapos + datalen
            yield (keypos, keylen, datapos, datalen)

    def __iter__(self):
        return iter(self.items())

    def items(self):
        """Yield ``(key, value)`` pairs (as bytes) in file order."""
        read = self.read
        for keypos, keylen, datapos, datalen in self._ranges():
            key = read(keypos, keylen)
            value = read(datapos, datalen)
            yield (key, value)

    def keys(self):
        """Yield keys (as bytes) in file order."""
        read = self.read
        for keypos, keylen, _, _ in self._ranges():
            yield read(keypos, keylen)

    def values(self):
        """Yield values (as bytes) in file order."""
        read = self.read
        for _, _, datapos, datalen in self._ranges():
            yield read(datapos, datalen)

    def __getitem__(self, key):
        # Returns the first stored value for the key; duplicate keys are
        # possible, use all() for every value
        for data in self.all(key):
            return data
        raise KeyError(key)

    def get(self, key, default=None):
        """Return the first value for ``key``, or ``default`` if absent."""
        for data in self.all(key):
            return data
        return default

    def all(self, key):
        """Yield every stored value for ``key`` (keys may repeat)."""
        read = self.read
        for datapos, datalen in self.ranges_for_key(key):
            yield read(datapos, datalen)

    def __contains__(self, key):
        for _ in self.ranges_for_key(key):
            return True
        return False

    def _hashtable_info(self, keyhash):
        # Return (directory_position, number_of_hash_entries)
        return self.buckets[keyhash & 255]

    def _key_position(self, key):
        # Return the record position stored in the home slot for *key*.
        # NOTE(review): unlike ranges_for_key(), this does not probe past the
        # home slot, so a colliding key may yield another record's position
        # -- confirm callers tolerate this.
        keyhash = self.hash_func(key)
        hpos, hslots = self._hashtable_info(keyhash)
        if not hslots:
            raise KeyError(key)
        # Slots are pointer_size bytes wide. (This previously used
        # header_entry_size, which only coincidentally equals pointer_size
        # in format 1 and is wrong for the old format's 16-byte slots.)
        slotpos = hpos + (((keyhash >> 8) % hslots) * self.pointer_size)

        # Skip the hash-value field at the front of the slot to read the
        # position field in its last _LONG_SIZE bytes
        return self.dbfile.get_long(slotpos + (self.pointer_size - _LONG_SIZE))

    def _key_at(self, pos):
        # Return the key bytes of the record starting at *pos*
        keylen = self.dbfile.get_uint(pos)
        return self.read(pos + lengths_size, keylen)

    def ranges_for_key(self, key):
        """Yield ``(datapos, datalen)`` for every record matching ``key``.

        :raises TypeError: if ``key`` is not a bytes object.
        """
        read = self.read
        pointer_size = self.pointer_size
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        keyhash = self.hash_func(key)
        hpos, hslots = self._hashtable_info(keyhash)
        if not hslots:
            return

        # Start at the home slot and probe linearly; an empty slot
        # (position 0) means the key is not present
        slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size)
        for _ in xrange(hslots):
            slothash, pos = self.unpack_pointer(read(slotpos, pointer_size))
            if not pos:
                return

            slotpos += pointer_size
            # If we reach the end of the hashtable, wrap around
            if slotpos == hpos + (hslots * pointer_size):
                slotpos = hpos

            if slothash == keyhash:
                # Hashes match; verify the actual key bytes
                keylen, datalen = unpack_lengths(read(pos, lengths_size))
                if keylen == len(key):
                    if key == read(pos + lengths_size, keylen):
                        yield (pos + lengths_size + keylen, datalen)

    def range_for_key(self, key):
        """Return the first ``(datapos, datalen)`` for ``key``.

        :raises KeyError: if the key is not present.
        """
        for item in self.ranges_for_key(key):
            return item
        raise KeyError(key)

    def end_of_hashes(self):
        """Return the file position just past the last hash table."""
        if self.format:
            return self._end_of_hashes
        else:
            # Old format has no stored pointer; compute it from the last
            # directory entry
            lastpos, lastnum = self.buckets[255]
            return lastpos + lastnum * self.pointer_size
322
323
class OrderedHashWriter(HashWriter):
    """A HashWriter that requires keys to be added in strictly increasing
    order, and records the file position of every key so an
    OrderedHashReader can binary-search them later.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # File positions of the records, in key order
        self.index = []
        # Highest key added so far
        self.lastkey = None

    def add_all(self, items):
        """Add ``(key, value)`` pairs; keys must be strictly increasing
        bytes objects.

        :raises TypeError: if a key or value is not bytes.
        :raises ValueError: if a key is not greater than the previous one.
        """
        outfile = self.dbfile
        buckets = self.hashes
        hashfn = self.hash_func
        offset = outfile.tell()
        emit = outfile.write

        positions = self.index
        previous = self.lastkey or b('')

        for key, value in items:
            if not isinstance(key, bytes_type):
                raise TypeError("Key %r should be bytes" % key)
            if not isinstance(value, bytes_type):
                raise TypeError("Value %r should be bytes" % value)
            if key <= previous:
                raise ValueError("Keys must increase: %r .. %r" % (previous, key))
            previous = key

            positions.append(offset)
            emit(pack_lengths(len(key), len(value)))
            emit(key)
            emit(value)

            hashval = hashfn(key)
            buckets[hashval & 255].append((hashval, offset))

            offset += lengths_size + len(key) + len(value)

        self.lastkey = previous

    def close(self):
        """Write the hash tables, the key-position index, and the header,
        then close the file.
        """
        self._write_hashes()
        outfile = self.dbfile

        # The index: a count followed by that many long positions
        outfile.write_uint(len(self.index))
        for position in self.index:
            outfile.write_long(position)

        self._write_directory()
        self.dbfile.close()
371
372
class OrderedHashReader(HashReader):
    """A HashReader for files written by OrderedHashWriter; adds
    closest-key lookup and ordered iteration starting from a given key.
    """

    def __init__(self, dbfile):
        HashReader.__init__(self, dbfile)
        # The key-position index sits just past the hash tables: a count
        # followed by that many long file positions, in key order
        dbfile.seek(self.end_of_hashes())
        self.length = dbfile.read_uint()
        self.indexbase = dbfile.tell()

    def _closest_key(self, key):
        # Binary-search the position index for the first key >= *key*;
        # return its record position, or None if all keys are smaller
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        dbfile = self.dbfile
        indexbase = self.indexbase
        key_at = self._key_at
        lo, hi = 0, self.length
        while lo < hi:
            mid = (lo + hi) // 2
            if key_at(dbfile.get_long(indexbase + mid * _LONG_SIZE)) < key:
                lo = mid + 1
            else:
                hi = mid
        if lo == self.length:
            return None
        return dbfile.get_long(indexbase + lo * _LONG_SIZE)

    def closest_key(self, key):
        """Return the first stored key >= ``key``, or None if there is
        none.
        """
        position = self._closest_key(key)
        if position is None:
            return None
        return self._key_at(position)

    def _ranges_from(self, key):
        # Like _ranges(), but starting at the first key >= *key*
        start = self._closest_key(key)
        if start is None:
            return
        for rng in self._ranges(pos=start):
            yield rng

    def items_from(self, key):
        """Yield ``(key, value)`` pairs starting at the first key >=
        ``key``.
        """
        fetch = self.read
        for keypos, keylen, datapos, datalen in self._ranges_from(key):
            yield (fetch(keypos, keylen), fetch(datapos, datalen))

    def keys_from(self, key):
        """Yield keys starting at the first key >= ``key``."""
        fetch = self.read
        for keypos, keylen, _, _ in self._ranges_from(key):
            yield fetch(keypos, keylen)

    def values_from(self, key):
        """Yield values starting at the first record whose key >= ``key``."""
        fetch = self.read
        for _, _, datapos, datalen in self._ranges_from(key):
            yield fetch(datapos, datalen)
429
430
class CodedHashWriter(HashWriter):
    """Abstract HashWriter subclass that encodes keys and values before
    storing them. Subclasses must implement ``keycoder`` and ``valuecoder``.
    """

    def __init__(self, dbfile):
        parent = super(CodedHashWriter, self)
        parent.__init__(dbfile)
        # Keep a reference to the raw add() so our override can delegate
        self._add = parent.add

    def add(self, key, data):
        """Encode ``key`` and ``data`` and add the pair to the file."""
        coded_key = self.keycoder(key)
        coded_data = self.valuecoder(data)
        self._add(coded_key, coded_data)
442
443
class CodedHashReader(HashReader):
    """Abstract HashReader subclass that decodes keys and values on the way
    out. Subclasses must implement ``keycoder``, ``keydecoder`` and
    ``valuedecoder``.
    """

    def __init__(self, dbfile):
        parent = super(CodedHashReader, self)
        parent.__init__(dbfile)
        # Keep references to the raw (undecoded) accessors so the
        # decoding overrides below can delegate to them
        self._items = parent.items
        self._keys = parent.keys
        self._get = parent.get
        self._getitem = parent.__getitem__
        self._contains = parent.__contains__

    def __getitem__(self, key):
        coded = self.keycoder(key)
        return self.valuedecoder(self._getitem(coded))

    def __contains__(self, key):
        return self._contains(self.keycoder(key))

    def get(self, key, default=None):
        """Return the decoded value for ``key``; when the key is missing the
        decoder is applied to ``default``.
        """
        coded = self.keycoder(key)
        return self.valuedecoder(self._get(coded, default))

    def items(self):
        """Yield decoded ``(key, value)`` pairs in file order."""
        decode_key = self.keydecoder
        decode_val = self.valuedecoder
        for rawkey, rawval in self._items():
            yield (decode_key(rawkey), decode_val(rawval))

    def keys(self):
        """Yield decoded keys in file order."""
        decode_key = self.keydecoder
        for rawkey in self._keys():
            yield decode_key(rawkey)
479
480
class CodedOrderedWriter(OrderedHashWriter):
    """Abstract OrderedHashWriter subclass that encodes keys and values
    before storing them. Subclasses must implement ``keycoder`` and
    ``valuecoder``.
    """

    def __init__(self, dbfile):
        parent = super(CodedOrderedWriter, self)
        parent.__init__(dbfile)
        # Keep a reference to the raw add() so our override can delegate
        self._add = parent.add

    def add(self, key, data):
        """Encode ``key`` and ``data`` and add the pair to the file."""
        coded_key = self.keycoder(key)
        coded_data = self.valuecoder(data)
        self._add(coded_key, coded_data)
491
492
class CodedOrderedReader(OrderedHashReader):
    """Abstract OrderedHashReader subclass that decodes keys and values on
    the way out. Subclasses must implement ``keycoder``, ``keydecoder``
    and ``valuedecoder``.
    """

    def __init__(self, dbfile):
        OrderedHashReader.__init__(self, dbfile)

    def __getitem__(self, key):
        coded = self.keycoder(key)
        return self.valuedecoder(OrderedHashReader.__getitem__(self, coded))

    def __contains__(self, key):
        try:
            coded = self.keycoder(key)
        except KeyError:
            # The coder itself may signal an unrepresentable key
            return False
        return OrderedHashReader.__contains__(self, coded)

    def get(self, key, default=None):
        """Return the decoded value for ``key``; when the key is missing the
        decoder is applied to ``default``.
        """
        coded = self.keycoder(key)
        return self.valuedecoder(OrderedHashReader.get(self, coded, default))

    def items(self):
        """Yield decoded ``(key, value)`` pairs in file order."""
        decode_key = self.keydecoder
        decode_val = self.valuedecoder
        for rawkey, rawval in OrderedHashReader.items(self):
            yield (decode_key(rawkey), decode_val(rawval))

    def items_from(self, key):
        """Yield decoded pairs starting at the first key >= ``key``."""
        startkey = self.keycoder(key)
        decode_key = self.keydecoder
        decode_val = self.valuedecoder
        for rawkey, rawval in OrderedHashReader.items_from(self, startkey):
            yield (decode_key(rawkey), decode_val(rawval))

    def keys(self):
        """Yield decoded keys in file order."""
        decode_key = self.keydecoder
        for rawkey in OrderedHashReader.keys(self):
            yield decode_key(rawkey)

    def keys_from(self, key):
        """Yield decoded keys starting at the first key >= ``key``."""
        decode_key = self.keydecoder
        startkey = self.keycoder(key)
        for rawkey in OrderedHashReader.keys_from(self, startkey):
            yield decode_key(rawkey)

    def range_for_key(self, key):
        """Return ``(datapos, datalen)`` for the encoded form of ``key``."""
        return OrderedHashReader.range_for_key(self, self.keycoder(key))

    def values(self):
        """Yield decoded values in file order."""
        decode_val = self.valuedecoder
        for rawval in OrderedHashReader.values(self):
            yield decode_val(rawval)
545
546
547
548