/bangkokhotel/lib/python2.5/site-packages/whoosh/filedb/filetables.py
Python | 548 lines | 488 code | 1 blank | 59 comment | 0 complexity | 08d30a00bb22cb741b7afcf3d5a2375a MD5 | raw file
1# Copyright 2009 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6# 1. Redistributions of source code must retain the above copyright notice,
7# this list of conditions and the following disclaimer.
8#
9# 2. Redistributions in binary form must reproduce the above copyright
10# notice, this list of conditions and the following disclaimer in the
11# documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28"""This module defines writer and reader classes for a fast, immutable
29on-disk key-value database format. The current format is based heavily on
30D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
31"""
32
33from binascii import crc32
34from collections import defaultdict
35from hashlib import md5 #@UnresolvedImport
36from struct import Struct
37
38from whoosh.compat import long_type, xrange, b, bytes_type
39from whoosh.system import _INT_SIZE, _LONG_SIZE
40
41
# 4 GiB boundary; offsets at/over this size need 64-bit ("q") fields
_4GB = 4 * 1024 * 1024 * 1024
43
44
def cdb_hash(key):
    """Hash ``key`` with D. J. Bernstein's CDB hash function (h = h * 33
    XOR c, starting from 5381), masked to an unsigned 32-bit value.

    Works with byte strings on both Python 2 (iteration yields 1-char
    strings) and Python 3 (iteration yields ints); the original ``ord(c)``
    call raised TypeError for Python 3 bytes.

    :param key: the (byte) string to hash.
    :return: an int in the range 0..2**32-1.
    """
    # No long() coercion needed: the mask keeps h within 32 bits, and
    # Python 2 auto-promotes to long on overflow anyway.
    h = 5381
    for c in key:
        if not isinstance(c, int):
            c = ord(c)  # Python 2: bytes iterate as 1-char strings
        h = (h + (h << 5)) & 0xffffffff ^ c
    return h
50
51
def md5_hash(key):
    """Return the low 32 bits of the MD5 digest of ``key`` as an int."""
    digest = md5(key).hexdigest()
    return int(digest, 16) & 0xffffffff
54
55
def crc_hash(key):
    """Hash ``key`` with CRC-32, normalized to an unsigned 32-bit int.

    The mask matters on Python 2, where crc32() can return negative
    values.
    """
    return 0xffffffff & crc32(key)
58
59
# Available hash functions, indexed by the "hashtype" byte stored in the
# file header (0 = builtin hash, 1 = CDB, 2 = MD5, 3 = CRC-32)
hash_functions = (hash, cdb_hash, md5_hash, crc_hash)

# One directory entry per bucket in the file header
_header_entry_struct = Struct("!qI")  # Position, number of slots
header_entry_size = _header_entry_struct.size
pack_header_entry = _header_entry_struct.pack
unpack_header_entry = _header_entry_struct.unpack

# Length prefix written before each key/value record
_lengths_struct = Struct("!II")  # Length of key, length of data
lengths_size = _lengths_struct.size
pack_lengths = _lengths_struct.pack
unpack_lengths = _lengths_struct.unpack
71
72
73# Table classes
74
class HashWriter(object):
    """Writes an immutable CDB-style hash file of key/value byte strings.

    Layout: a reserved header at the start of the file, then all
    key/value records, then 256 hash tables. close() seeks back and
    fills in the header with a directory of (table position, slot count)
    entries.
    """

    def __init__(self, dbfile, format=1, hashtype=2):
        """
        :param dbfile: the file to write to; must support seek/tell plus
            the write_byte/write_long extensions used in
            _write_directory.
        :param format: 1 (default) writes the new "HASH"-tagged header;
            0 writes the old directory-only header.
        :param hashtype: index into ``hash_functions`` selecting the hash
            (forced to 0 for the old format).
        """
        self.dbfile = dbfile
        self.format = format
        self.hashtype = hashtype

        if format:
            dbfile.write(b("HASH"))
            # 4-byte magic + type byte + 3 unused + long end-of-hashes
            # offset, then the 256-entry directory
            self.header_size = 16 + 256 * header_entry_size
            _pointer_struct = Struct("!Iq")  # Hash value, position
        else:
            # Old format
            self.header_size = 256 * header_entry_size
            _pointer_struct = Struct("!qq")  # Hash value, position
            self.hashtype = 0

        self.hash_func = hash_functions[self.hashtype]
        self.pointer_size = _pointer_struct.size
        self.pack_pointer = _pointer_struct.pack

        # Seek past the first "header_size" bytes of the file... we'll come
        # back here to write the header later
        dbfile.seek(self.header_size)
        # Store the directory of hashed values
        # (maps hash & 255 -> list of (hash, record position) pairs)
        self.hashes = defaultdict(list)

    def add_all(self, items):
        """Append a sequence of (key, value) byte-string pairs, recording
        each key's hash and file position for the tables written later.

        :param items: an iterable of (key, value) pairs of bytes.
        :raises TypeError: if a key or value is not bytes.
        """
        dbfile = self.dbfile
        hash_func = self.hash_func
        hashes = self.hashes
        pos = dbfile.tell()
        write = dbfile.write

        for key, value in items:
            if not isinstance(key, bytes_type):
                raise TypeError("Key %r should be bytes" % key)
            if not isinstance(value, bytes_type):
                raise TypeError("Value %r should be bytes" % value)
            # Record = packed (key length, value length) + key + value
            write(pack_lengths(len(key), len(value)))
            write(key)
            write(value)

            h = hash_func(key)
            # Bucket by the low byte of the hash; tables use the rest
            hashes[h & 255].append((h, pos))
            pos += lengths_size + len(key) + len(value)

    def add(self, key, value):
        """Append a single (key, value) byte-string pair."""
        self.add_all(((key, value),))

    def _write_hashes(self):
        """Write the 256 hash tables after the records, remembering each
        table's (position, slot count) for the header directory."""
        dbfile = self.dbfile
        hashes = self.hashes
        directory = self.directory = []

        pos = dbfile.tell()
        for i in xrange(0, 256):
            entries = hashes[i]
            # Tables are kept half full so linear probes stay short
            numslots = 2 * len(entries)
            directory.append((pos, numslots))

            null = (0, 0)
            hashtable = [null] * numslots
            for hashval, position in entries:
                # Open addressing: start at (hash >> 8) % slots, probe
                # linearly (wrapping) for a free slot
                n = (hashval >> 8) % numslots
                while hashtable[n] != null:
                    n = (n + 1) % numslots
                hashtable[n] = (hashval, position)

            write = dbfile.write
            for hashval, position in hashtable:
                write(self.pack_pointer(hashval, position))
                pos += self.pointer_size

        dbfile.flush()
        self._end_of_hashes = dbfile.tell()

    def _write_directory(self):
        """Seek back into the header and write the hash type byte, the
        end-of-hashes offset (new format only), and the 256 directory
        entries."""
        dbfile = self.dbfile
        directory = self.directory

        # NOTE(review): this seeks to 4 even for the old format (where no
        # 4-byte magic was written), so the old-format directory starts at
        # offset 4 and the assert below would fail; HashReader likewise
        # reads buckets only after consuming 4 magic bytes — confirm the
        # old-format path is still exercised anywhere.
        dbfile.seek(4)
        if self.format:
            dbfile.write_byte(self.hashtype)
            dbfile.write(b("\x00\x00\x00"))  # Unused
            dbfile.write_long(self._end_of_hashes)

        for position, numslots in directory:
            dbfile.write(pack_header_entry(position, numslots))

        dbfile.flush()
        assert dbfile.tell() == self.header_size

    def close(self):
        """Write the hash tables and the header, then close the file."""
        self._write_hashes()
        self._write_directory()
        self.dbfile.close()
171
172
class HashReader(object):
    """Reader for the immutable on-disk hash file written by HashWriter.

    Keys and values are raw byte strings. A lookup hashes the key, finds
    the bucket's hash table through the 256-entry directory read from the
    header, and linear-probes the table for matching entries.
    """

    def __init__(self, dbfile):
        """
        :param dbfile: a file object positioned at the start of a hash
            file; must support seek/read plus the read_byte/read_long/
            get_long extensions used below.
        """
        self.dbfile = dbfile

        dbfile.seek(0)
        magic = dbfile.read(4)
        if magic == b("HASH"):
            # New format: magic, hash type byte, 3 unused bytes, and the
            # absolute offset of the end of the hash tables
            self.format = 1
            self.header_size = 16 + 256 * header_entry_size
            _pointer_struct = Struct("!Iq")  # Hash value, position
            self.hashtype = dbfile.read_byte()
            dbfile.read(3)  # Unused
            self._end_of_hashes = dbfile.read_long()
            assert self._end_of_hashes >= self.header_size
        else:
            # Old format: no magic; always uses the builtin hash (type 0)
            self.format = self.hashtype = 0
            self.header_size = 256 * header_entry_size
            _pointer_struct = Struct("!qq")  # Hash value, position

        self.hash_func = hash_functions[self.hashtype]
        # 256 directory entries of (table position, number of slots)
        self.buckets = []
        for _ in xrange(256):
            he = unpack_header_entry(dbfile.read(header_entry_size))
            self.buckets.append(he)
        # Records end where the first hash table begins
        self._start_of_hashes = self.buckets[0][0]

        self.pointer_size = _pointer_struct.size
        self.unpack_pointer = _pointer_struct.unpack

        self.is_closed = False

    def close(self):
        """Close the underlying file.

        :raises Exception: if the reader was already closed.
        """
        if self.is_closed:
            raise Exception("Tried to close %r twice" % self)
        self.dbfile.close()
        self.is_closed = True

    def read(self, position, length):
        """Return ``length`` bytes read from absolute ``position``."""
        self.dbfile.seek(position)
        return self.dbfile.read(length)

    def _ranges(self, pos=None):
        """Yield (keypos, keylen, datapos, datalen) for every record,
        scanning sequentially from ``pos`` (default: the first record)
        up to the start of the hash tables."""
        if pos is None:
            pos = self.header_size
        eod = self._start_of_hashes
        read = self.read
        while pos < eod:
            keylen, datalen = unpack_lengths(read(pos, lengths_size))
            keypos = pos + lengths_size
            datapos = pos + lengths_size + keylen
            pos = datapos + datalen
            yield (keypos, keylen, datapos, datalen)

    def __iter__(self):
        return iter(self.items())

    def items(self):
        """Yield (key, value) byte-string pairs in file order."""
        read = self.read
        for keypos, keylen, datapos, datalen in self._ranges():
            key = read(keypos, keylen)
            value = read(datapos, datalen)
            yield (key, value)

    def keys(self):
        """Yield key byte strings in file order."""
        read = self.read
        for keypos, keylen, _, _ in self._ranges():
            yield read(keypos, keylen)

    def values(self):
        """Yield value byte strings in file order."""
        read = self.read
        for _, _, datapos, datalen in self._ranges():
            yield read(datapos, datalen)

    def __getitem__(self, key):
        """Return the first value stored for ``key``.

        :raises KeyError: if the key is not in the file.
        """
        for data in self.all(key):
            return data
        raise KeyError(key)

    def get(self, key, default=None):
        """Return the first value stored for ``key``, or ``default``."""
        for data in self.all(key):
            return data
        return default

    def all(self, key):
        """Yield every value stored under ``key`` (duplicates allowed)."""
        read = self.read
        for datapos, datalen in self.ranges_for_key(key):
            yield read(datapos, datalen)

    def __contains__(self, key):
        for _ in self.ranges_for_key(key):
            return True
        return False

    def _hashtable_info(self, keyhash):
        # Return (directory_position, number_of_hash_entries)
        return self.buckets[keyhash & 255]

    def _key_position(self, key):
        """Return the record position stored in the slot ``key`` maps to
        directly (no collision probing is done here).

        :raises KeyError: if the key's bucket is empty.
        """
        keyhash = self.hash_func(key)
        hpos, hslots = self._hashtable_info(keyhash)
        if not hslots:
            raise KeyError(key)
        # BUG FIX: slots are pointer-sized (hash, position) records, not
        # header entries. The previous header_entry_size/_INT_SIZE math
        # only happened to be right for the new format, where both
        # records are 12 bytes with a 4-byte hash; the old !qq format has
        # 16-byte slots with an 8-byte hash. Skip the hash part of the
        # slot to read the position, consistent with ranges_for_key().
        slotpos = hpos + (((keyhash >> 8) % hslots) * self.pointer_size)
        return self.dbfile.get_long(slotpos + self.pointer_size
                                    - _LONG_SIZE)

    def _key_at(self, pos):
        """Return the key bytes of the record starting at ``pos``."""
        keylen = self.dbfile.get_uint(pos)
        return self.read(pos + lengths_size, keylen)

    def ranges_for_key(self, key):
        """Yield (datapos, datalen) for every record whose key equals
        ``key``, probing the key's hash table linearly with wraparound.

        :raises TypeError: if ``key`` is not bytes.
        """
        read = self.read
        pointer_size = self.pointer_size
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        keyhash = self.hash_func(key)
        hpos, hslots = self._hashtable_info(keyhash)
        if not hslots:
            return

        slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size)
        for _ in xrange(hslots):
            slothash, pos = self.unpack_pointer(read(slotpos, pointer_size))
            # An empty slot (position 0) ends the probe chain
            if not pos:
                return

            slotpos += pointer_size
            # If we reach the end of the hashtable, wrap around
            if slotpos == hpos + (hslots * pointer_size):
                slotpos = hpos

            if slothash == keyhash:
                # Hash matches; confirm the actual key bytes match too
                keylen, datalen = unpack_lengths(read(pos, lengths_size))
                if keylen == len(key):
                    if key == read(pos + lengths_size, keylen):
                        yield (pos + lengths_size + keylen, datalen)

    def range_for_key(self, key):
        """Return the first (datapos, datalen) for ``key``.

        :raises KeyError: if the key is not in the file.
        """
        for item in self.ranges_for_key(key):
            return item
        raise KeyError(key)

    def end_of_hashes(self):
        """Return the file offset just past the last hash table (stored
        in the header in the new format; computed from the last directory
        entry in the old format)."""
        if self.format:
            return self._end_of_hashes
        else:
            lastpos, lastnum = self.buckets[255]
            return lastpos + lastnum * self.pointer_size
322
323
class OrderedHashWriter(HashWriter):
    """HashWriter that requires keys to be added in strictly increasing
    order, and additionally writes an index of record positions (used by
    OrderedHashReader for binary search) between the hash tables and the
    header directory."""

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # File positions of records, in key order
        self.index = []
        # Last key added, used to enforce ascending order across calls
        self.lastkey = None

    def add_all(self, items):
        """Append (key, value) byte-string pairs whose keys are strictly
        greater than every previously added key.

        :raises TypeError: if a key or value is not bytes.
        :raises ValueError: if a key is not strictly increasing.
        """
        dbfile = self.dbfile
        hashes = self.hashes
        hash_func = self.hash_func
        pos = dbfile.tell()
        write = dbfile.write

        index = self.index
        lk = self.lastkey or b('')

        for key, value in items:
            if not isinstance(key, bytes_type):
                raise TypeError("Key %r should be bytes" % key)
            if not isinstance(value, bytes_type):
                raise TypeError("Value %r should be bytes" % value)
            if key <= lk:
                raise ValueError("Keys must increase: %r .. %r" % (lk, key))
            lk = key

            # Remember where this record starts for the ordered index
            index.append(pos)
            write(pack_lengths(len(key), len(value)))
            write(key)
            write(value)

            h = hash_func(key)
            hashes[h & 255].append((h, pos))

            pos += lengths_size + len(key) + len(value)

        self.lastkey = lk

    def close(self):
        """Write the hash tables, then the ordered index (count followed
        by record positions), then the header directory, and close."""
        self._write_hashes()
        dbfile = self.dbfile

        dbfile.write_uint(len(self.index))
        for n in self.index:
            dbfile.write_long(n)

        self._write_directory()
        self.dbfile.close()
371
372
class OrderedHashReader(HashReader):
    """HashReader for files written by OrderedHashWriter; uses the stored
    key-order index to support ordered scans starting at a given key."""

    def __init__(self, dbfile):
        HashReader.__init__(self, dbfile)
        # The ordered index sits right after the hash tables: a uint
        # count followed by that many long record positions
        dbfile.seek(self.end_of_hashes())
        self.length = dbfile.read_uint()
        self.indexbase = dbfile.tell()

    def _closest_key(self, key):
        """Return the file position of the first record whose key is
        >= ``key``, or None if every stored key is smaller.

        :raises TypeError: if ``key`` is not bytes.
        """
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        dbfile = self.dbfile
        indexbase = self.indexbase
        # Binary search over the ordered index of record positions
        lo, hi = 0, self.length
        while lo < hi:
            mid = (lo + hi) // 2
            midpos = dbfile.get_long(indexbase + mid * _LONG_SIZE)
            if self._key_at(midpos) < key:
                lo = mid + 1
            else:
                hi = mid
        if lo == self.length:
            return None
        return dbfile.get_long(indexbase + lo * _LONG_SIZE)

    def closest_key(self, key):
        """Return the first stored key >= ``key``, or None."""
        pos = self._closest_key(key)
        if pos is None:
            return None
        return self._key_at(pos)

    def _ranges_from(self, key):
        """Yield (keypos, keylen, datapos, datalen) for every record from
        the first key >= ``key`` through the end of the records."""
        start = self._closest_key(key)
        if start is None:
            return
        for rng in self._ranges(pos=start):
            yield rng

    def items_from(self, key):
        """Yield (key, value) pairs in order from the first key >= ``key``."""
        read = self.read
        for kpos, klen, dpos, dlen in self._ranges_from(key):
            yield (read(kpos, klen), read(dpos, dlen))

    def keys_from(self, key):
        """Yield keys in order from the first key >= ``key``."""
        read = self.read
        for kpos, klen, _, _ in self._ranges_from(key):
            yield read(kpos, klen)

    def values_from(self, key):
        """Yield values in key order from the first key >= ``key``."""
        read = self.read
        for _, _, dpos, dlen in self._ranges_from(key):
            yield read(dpos, dlen)
429
430
class CodedHashWriter(HashWriter):
    # Abstract base class; subclasses must implement keycoder() and
    # valuecoder() to serialize keys and values to bytes.

    def __init__(self, dbfile):
        parent = super(CodedHashWriter, self)
        parent.__init__(dbfile)
        # Bound reference to the raw bytes-only add()
        self._add = parent.add

    def add(self, key, data):
        """Encode ``key`` and ``data`` and store them with the inherited
        bytes-level add()."""
        self._add(self.keycoder(key), self.valuecoder(data))
442
443
class CodedHashReader(HashReader):
    # Abstract base class; subclasses must implement keycoder(),
    # keydecoder() and valuedecoder() to translate between app-level
    # keys/values and the raw bytes stored in the file.

    def __init__(self, dbfile):
        parent = super(CodedHashReader, self)
        parent.__init__(dbfile)

        # Bound references to the raw (bytes-level) parent operations
        self._items = parent.items
        self._keys = parent.keys
        self._get = parent.get
        self._getitem = parent.__getitem__
        self._contains = parent.__contains__

    def __getitem__(self, key):
        """Look up the encoded key and return the decoded value."""
        raw = self._getitem(self.keycoder(key))
        return self.valuedecoder(raw)

    def __contains__(self, key):
        return self._contains(self.keycoder(key))

    def get(self, key, default=None):
        """Like __getitem__ but returns ``default`` for a missing key.
        Note that the default is passed through valuedecoder() as well."""
        raw = self._get(self.keycoder(key), default)
        return self.valuedecoder(raw)

    def items(self):
        """Yield decoded (key, value) pairs in file order."""
        kd, vd = self.keydecoder, self.valuedecoder
        for rawkey, rawval in self._items():
            yield (kd(rawkey), vd(rawval))

    def keys(self):
        """Yield decoded keys in file order."""
        kd = self.keydecoder
        for rawkey in self._keys():
            yield kd(rawkey)
479
480
class CodedOrderedWriter(OrderedHashWriter):
    # Abstract base class; subclasses must implement keycoder() and
    # valuecoder() to serialize keys and values to bytes.

    def __init__(self, dbfile):
        parent = super(CodedOrderedWriter, self)
        parent.__init__(dbfile)
        # Bound reference to the raw bytes-only, order-enforcing add()
        self._add = parent.add

    def add(self, key, data):
        """Encode ``key`` and ``data`` and store them with the inherited
        ordered bytes-level add()."""
        self._add(self.keycoder(key), self.valuecoder(data))
491
492
class CodedOrderedReader(OrderedHashReader):
    # Abstract base class; subclasses must implement keycoder(),
    # keydecoder(), and valuedecoder().

    def __init__(self, dbfile):
        OrderedHashReader.__init__(self, dbfile)

    def __getitem__(self, key):
        """Look up the encoded key and return the decoded value."""
        raw = OrderedHashReader.__getitem__(self, self.keycoder(key))
        return self.valuedecoder(raw)

    def __contains__(self, key):
        # keycoder may legitimately raise KeyError for keys that cannot
        # exist in this file; treat that as "not present"
        try:
            codedkey = self.keycoder(key)
        except KeyError:
            return False
        return OrderedHashReader.__contains__(self, codedkey)

    def get(self, key, default=None):
        """Like __getitem__ but returns ``default`` for a missing key.
        Note that the default is passed through valuedecoder() as well."""
        raw = OrderedHashReader.get(self, self.keycoder(key), default)
        return self.valuedecoder(raw)

    def items(self):
        """Yield decoded (key, value) pairs in key order."""
        kd, vd = self.keydecoder, self.valuedecoder
        for rawkey, rawval in OrderedHashReader.items(self):
            yield (kd(rawkey), vd(rawval))

    def items_from(self, key):
        """Yield decoded (key, value) pairs from the first key >= ``key``."""
        start = self.keycoder(key)
        kd, vd = self.keydecoder, self.valuedecoder
        for rawkey, rawval in OrderedHashReader.items_from(self, start):
            yield (kd(rawkey), vd(rawval))

    def keys(self):
        """Yield decoded keys in key order."""
        kd = self.keydecoder
        for rawkey in OrderedHashReader.keys(self):
            yield kd(rawkey)

    def keys_from(self, key):
        """Yield decoded keys from the first key >= ``key``."""
        kd = self.keydecoder
        start = self.keycoder(key)
        for rawkey in OrderedHashReader.keys_from(self, start):
            yield kd(rawkey)

    def range_for_key(self, key):
        """Return (datapos, datalen) for the encoded key; raises KeyError
        if it is not present."""
        return OrderedHashReader.range_for_key(self, self.keycoder(key))

    def values(self):
        """Yield decoded values in key order."""
        vd = self.valuedecoder
        for rawval in OrderedHashReader.values(self):
            yield vd(rawval)
545
546
547
548