/bangkokhotel/lib/python2.5/site-packages/whoosh/filedb/filetables.py

https://bitbucket.org/luisrodriguez/bangkokhotel · Python · 548 lines · 488 code · 1 blank · 59 comment · 0 complexity · 08d30a00bb22cb741b7afcf3d5a2375a MD5 · raw file

  1. # Copyright 2009 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """This module defines writer and reader classes for a fast, immutable
  28. on-disk key-value database format. The current format is based heavily on
  29. D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
  30. """
  31. from binascii import crc32
  32. from collections import defaultdict
  33. from hashlib import md5 #@UnresolvedImport
  34. from struct import Struct
  35. from whoosh.compat import long_type, xrange, b, bytes_type
  36. from whoosh.system import _INT_SIZE, _LONG_SIZE
  37. _4GB = 4 * 1024 * 1024 * 1024
  38. def cdb_hash(key):
  39. h = long_type(5381)
  40. for c in key:
  41. h = (h + (h << 5)) & 0xffffffff ^ ord(c)
  42. return h
  43. def md5_hash(key):
  44. return int(md5(key).hexdigest(), 16) & 0xffffffff
  45. def crc_hash(key):
  46. return crc32(key) & 0xffffffff
  47. hash_functions = (hash, cdb_hash, md5_hash, crc_hash)
  48. _header_entry_struct = Struct("!qI") # Position, number of slots
  49. header_entry_size = _header_entry_struct.size
  50. pack_header_entry = _header_entry_struct.pack
  51. unpack_header_entry = _header_entry_struct.unpack
  52. _lengths_struct = Struct("!II") # Length of key, length of data
  53. lengths_size = _lengths_struct.size
  54. pack_lengths = _lengths_struct.pack
  55. unpack_lengths = _lengths_struct.unpack
  56. # Table classes
  57. class HashWriter(object):
  58. def __init__(self, dbfile, format=1, hashtype=2):
  59. self.dbfile = dbfile
  60. self.format = format
  61. self.hashtype = hashtype
  62. if format:
  63. dbfile.write(b("HASH"))
  64. self.header_size = 16 + 256 * header_entry_size
  65. _pointer_struct = Struct("!Iq") # Hash value, position
  66. else:
  67. # Old format
  68. self.header_size = 256 * header_entry_size
  69. _pointer_struct = Struct("!qq") # Hash value, position
  70. self.hashtype = 0
  71. self.hash_func = hash_functions[self.hashtype]
  72. self.pointer_size = _pointer_struct.size
  73. self.pack_pointer = _pointer_struct.pack
  74. # Seek past the first "header_size" bytes of the file... we'll come
  75. # back here to write the header later
  76. dbfile.seek(self.header_size)
  77. # Store the directory of hashed values
  78. self.hashes = defaultdict(list)
  79. def add_all(self, items):
  80. dbfile = self.dbfile
  81. hash_func = self.hash_func
  82. hashes = self.hashes
  83. pos = dbfile.tell()
  84. write = dbfile.write
  85. for key, value in items:
  86. if not isinstance(key, bytes_type):
  87. raise TypeError("Key %r should be bytes" % key)
  88. if not isinstance(value, bytes_type):
  89. raise TypeError("Value %r should be bytes" % value)
  90. write(pack_lengths(len(key), len(value)))
  91. write(key)
  92. write(value)
  93. h = hash_func(key)
  94. hashes[h & 255].append((h, pos))
  95. pos += lengths_size + len(key) + len(value)
  96. def add(self, key, value):
  97. self.add_all(((key, value),))
  98. def _write_hashes(self):
  99. dbfile = self.dbfile
  100. hashes = self.hashes
  101. directory = self.directory = []
  102. pos = dbfile.tell()
  103. for i in xrange(0, 256):
  104. entries = hashes[i]
  105. numslots = 2 * len(entries)
  106. directory.append((pos, numslots))
  107. null = (0, 0)
  108. hashtable = [null] * numslots
  109. for hashval, position in entries:
  110. n = (hashval >> 8) % numslots
  111. while hashtable[n] != null:
  112. n = (n + 1) % numslots
  113. hashtable[n] = (hashval, position)
  114. write = dbfile.write
  115. for hashval, position in hashtable:
  116. write(self.pack_pointer(hashval, position))
  117. pos += self.pointer_size
  118. dbfile.flush()
  119. self._end_of_hashes = dbfile.tell()
  120. def _write_directory(self):
  121. dbfile = self.dbfile
  122. directory = self.directory
  123. dbfile.seek(4)
  124. if self.format:
  125. dbfile.write_byte(self.hashtype)
  126. dbfile.write(b("\x00\x00\x00")) # Unused
  127. dbfile.write_long(self._end_of_hashes)
  128. for position, numslots in directory:
  129. dbfile.write(pack_header_entry(position, numslots))
  130. dbfile.flush()
  131. assert dbfile.tell() == self.header_size
  132. def close(self):
  133. self._write_hashes()
  134. self._write_directory()
  135. self.dbfile.close()
  136. class HashReader(object):
  137. def __init__(self, dbfile):
  138. self.dbfile = dbfile
  139. dbfile.seek(0)
  140. magic = dbfile.read(4)
  141. if magic == b("HASH"):
  142. self.format = 1
  143. self.header_size = 16 + 256 * header_entry_size
  144. _pointer_struct = Struct("!Iq") # Hash value, position
  145. self.hashtype = dbfile.read_byte()
  146. dbfile.read(3) # Unused
  147. self._end_of_hashes = dbfile.read_long()
  148. assert self._end_of_hashes >= self.header_size
  149. else:
  150. # Old format
  151. self.format = self.hashtype = 0
  152. self.header_size = 256 * header_entry_size
  153. _pointer_struct = Struct("!qq") # Hash value, position
  154. self.hash_func = hash_functions[self.hashtype]
  155. self.buckets = []
  156. for _ in xrange(256):
  157. he = unpack_header_entry(dbfile.read(header_entry_size))
  158. self.buckets.append(he)
  159. self._start_of_hashes = self.buckets[0][0]
  160. self.pointer_size = _pointer_struct.size
  161. self.unpack_pointer = _pointer_struct.unpack
  162. self.is_closed = False
  163. def close(self):
  164. if self.is_closed:
  165. raise Exception("Tried to close %r twice" % self)
  166. self.dbfile.close()
  167. self.is_closed = True
  168. def read(self, position, length):
  169. self.dbfile.seek(position)
  170. return self.dbfile.read(length)
  171. def _ranges(self, pos=None):
  172. if pos is None:
  173. pos = self.header_size
  174. eod = self._start_of_hashes
  175. read = self.read
  176. while pos < eod:
  177. keylen, datalen = unpack_lengths(read(pos, lengths_size))
  178. keypos = pos + lengths_size
  179. datapos = pos + lengths_size + keylen
  180. pos = datapos + datalen
  181. yield (keypos, keylen, datapos, datalen)
  182. def __iter__(self):
  183. return iter(self.items())
  184. def items(self):
  185. read = self.read
  186. for keypos, keylen, datapos, datalen in self._ranges():
  187. key = read(keypos, keylen)
  188. value = read(datapos, datalen)
  189. yield (key, value)
  190. def keys(self):
  191. read = self.read
  192. for keypos, keylen, _, _ in self._ranges():
  193. yield read(keypos, keylen)
  194. def values(self):
  195. read = self.read
  196. for _, _, datapos, datalen in self._ranges():
  197. yield read(datapos, datalen)
  198. def __getitem__(self, key):
  199. for data in self.all(key):
  200. return data
  201. raise KeyError(key)
  202. def get(self, key, default=None):
  203. for data in self.all(key):
  204. return data
  205. return default
  206. def all(self, key):
  207. read = self.read
  208. for datapos, datalen in self.ranges_for_key(key):
  209. yield read(datapos, datalen)
  210. def __contains__(self, key):
  211. for _ in self.ranges_for_key(key):
  212. return True
  213. return False
  214. def _hashtable_info(self, keyhash):
  215. # Return (directory_position, number_of_hash_entries)
  216. return self.buckets[keyhash & 255]
  217. def _key_position(self, key):
  218. keyhash = self.hash_func(key)
  219. hpos, hslots = self._hashtable_info(keyhash)
  220. if not hslots:
  221. raise KeyError(key)
  222. slotpos = hpos + (((keyhash >> 8) % hslots) * header_entry_size)
  223. return self.dbfile.get_long(slotpos + _INT_SIZE)
  224. def _key_at(self, pos):
  225. keylen = self.dbfile.get_uint(pos)
  226. return self.read(pos + lengths_size, keylen)
  227. def ranges_for_key(self, key):
  228. read = self.read
  229. pointer_size = self.pointer_size
  230. if not isinstance(key, bytes_type):
  231. raise TypeError("Key %r should be bytes" % key)
  232. keyhash = self.hash_func(key)
  233. hpos, hslots = self._hashtable_info(keyhash)
  234. if not hslots:
  235. return
  236. slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size)
  237. for _ in xrange(hslots):
  238. slothash, pos = self.unpack_pointer(read(slotpos, pointer_size))
  239. if not pos:
  240. return
  241. slotpos += pointer_size
  242. # If we reach the end of the hashtable, wrap around
  243. if slotpos == hpos + (hslots * pointer_size):
  244. slotpos = hpos
  245. if slothash == keyhash:
  246. keylen, datalen = unpack_lengths(read(pos, lengths_size))
  247. if keylen == len(key):
  248. if key == read(pos + lengths_size, keylen):
  249. yield (pos + lengths_size + keylen, datalen)
  250. def range_for_key(self, key):
  251. for item in self.ranges_for_key(key):
  252. return item
  253. raise KeyError(key)
  254. def end_of_hashes(self):
  255. if self.format:
  256. return self._end_of_hashes
  257. else:
  258. lastpos, lastnum = self.buckets[255]
  259. return lastpos + lastnum * self.pointer_size
  260. class OrderedHashWriter(HashWriter):
  261. def __init__(self, dbfile):
  262. HashWriter.__init__(self, dbfile)
  263. self.index = []
  264. self.lastkey = None
  265. def add_all(self, items):
  266. dbfile = self.dbfile
  267. hashes = self.hashes
  268. hash_func = self.hash_func
  269. pos = dbfile.tell()
  270. write = dbfile.write
  271. index = self.index
  272. lk = self.lastkey or b('')
  273. for key, value in items:
  274. if not isinstance(key, bytes_type):
  275. raise TypeError("Key %r should be bytes" % key)
  276. if not isinstance(value, bytes_type):
  277. raise TypeError("Value %r should be bytes" % value)
  278. if key <= lk:
  279. raise ValueError("Keys must increase: %r .. %r" % (lk, key))
  280. lk = key
  281. index.append(pos)
  282. write(pack_lengths(len(key), len(value)))
  283. write(key)
  284. write(value)
  285. h = hash_func(key)
  286. hashes[h & 255].append((h, pos))
  287. pos += lengths_size + len(key) + len(value)
  288. self.lastkey = lk
  289. def close(self):
  290. self._write_hashes()
  291. dbfile = self.dbfile
  292. dbfile.write_uint(len(self.index))
  293. for n in self.index:
  294. dbfile.write_long(n)
  295. self._write_directory()
  296. self.dbfile.close()
  297. class OrderedHashReader(HashReader):
  298. def __init__(self, dbfile):
  299. HashReader.__init__(self, dbfile)
  300. dbfile.seek(self.end_of_hashes())
  301. self.length = dbfile.read_uint()
  302. self.indexbase = dbfile.tell()
  303. def _closest_key(self, key):
  304. dbfile = self.dbfile
  305. key_at = self._key_at
  306. indexbase = self.indexbase
  307. lo = 0
  308. hi = self.length
  309. if not isinstance(key, bytes_type):
  310. raise TypeError("Key %r should be bytes" % key)
  311. while lo < hi:
  312. mid = (lo + hi) // 2
  313. midkey = key_at(dbfile.get_long(indexbase + mid * _LONG_SIZE))
  314. if midkey < key:
  315. lo = mid + 1
  316. else:
  317. hi = mid
  318. #i = max(0, mid - 1)
  319. if lo == self.length:
  320. return None
  321. return dbfile.get_long(indexbase + lo * _LONG_SIZE)
  322. def closest_key(self, key):
  323. pos = self._closest_key(key)
  324. if pos is None:
  325. return None
  326. return self._key_at(pos)
  327. def _ranges_from(self, key):
  328. #read = self.read
  329. pos = self._closest_key(key)
  330. if pos is None:
  331. return
  332. for x in self._ranges(pos=pos):
  333. yield x
  334. def items_from(self, key):
  335. read = self.read
  336. for keypos, keylen, datapos, datalen in self._ranges_from(key):
  337. yield (read(keypos, keylen), read(datapos, datalen))
  338. def keys_from(self, key):
  339. read = self.read
  340. for keypos, keylen, _, _ in self._ranges_from(key):
  341. yield read(keypos, keylen)
  342. def values_from(self, key):
  343. read = self.read
  344. for _, _, datapos, datalen in self._ranges_from(key):
  345. yield read(datapos, datalen)
  346. class CodedHashWriter(HashWriter):
  347. # Abstract base class, subclass must implement keycoder and valuecoder
  348. def __init__(self, dbfile):
  349. sup = super(CodedHashWriter, self)
  350. sup.__init__(dbfile)
  351. self._add = sup.add
  352. def add(self, key, data):
  353. self._add(self.keycoder(key), self.valuecoder(data))
  354. class CodedHashReader(HashReader):
  355. # Abstract base class, subclass must implement keycoder, keydecoder and
  356. # valuecoder
  357. def __init__(self, dbfile):
  358. sup = super(CodedHashReader, self)
  359. sup.__init__(dbfile)
  360. self._items = sup.items
  361. self._keys = sup.keys
  362. self._get = sup.get
  363. self._getitem = sup.__getitem__
  364. self._contains = sup.__contains__
  365. def __getitem__(self, key):
  366. k = self.keycoder(key)
  367. return self.valuedecoder(self._getitem(k))
  368. def __contains__(self, key):
  369. return self._contains(self.keycoder(key))
  370. def get(self, key, default=None):
  371. k = self.keycoder(key)
  372. return self.valuedecoder(self._get(k, default))
  373. def items(self):
  374. kd = self.keydecoder
  375. vd = self.valuedecoder
  376. for key, value in self._items():
  377. yield (kd(key), vd(value))
  378. def keys(self):
  379. kd = self.keydecoder
  380. for k in self._keys():
  381. yield kd(k)
  382. class CodedOrderedWriter(OrderedHashWriter):
  383. # Abstract base class, subclasses must implement keycoder and valuecoder
  384. def __init__(self, dbfile):
  385. sup = super(CodedOrderedWriter, self)
  386. sup.__init__(dbfile)
  387. self._add = sup.add
  388. def add(self, key, data):
  389. self._add(self.keycoder(key), self.valuecoder(data))
  390. class CodedOrderedReader(OrderedHashReader):
  391. # Abstract base class, subclasses must implement keycoder, keydecoder,
  392. # and valuedecoder
  393. def __init__(self, dbfile):
  394. OrderedHashReader.__init__(self, dbfile)
  395. def __getitem__(self, key):
  396. k = self.keycoder(key)
  397. return self.valuedecoder(OrderedHashReader.__getitem__(self, k))
  398. def __contains__(self, key):
  399. try:
  400. codedkey = self.keycoder(key)
  401. except KeyError:
  402. return False
  403. return OrderedHashReader.__contains__(self, codedkey)
  404. def get(self, key, default=None):
  405. k = self.keycoder(key)
  406. return self.valuedecoder(OrderedHashReader.get(self, k, default))
  407. def items(self):
  408. kd = self.keydecoder
  409. vd = self.valuedecoder
  410. for key, value in OrderedHashReader.items(self):
  411. yield (kd(key), vd(value))
  412. def items_from(self, key):
  413. fromkey = self.keycoder(key)
  414. kd = self.keydecoder
  415. vd = self.valuedecoder
  416. for key, value in OrderedHashReader.items_from(self, fromkey):
  417. yield (kd(key), vd(value))
  418. def keys(self):
  419. kd = self.keydecoder
  420. for k in OrderedHashReader.keys(self):
  421. yield kd(k)
  422. def keys_from(self, key):
  423. kd = self.keydecoder
  424. for k in OrderedHashReader.keys_from(self, self.keycoder(key)):
  425. yield kd(k)
  426. def range_for_key(self, key):
  427. return OrderedHashReader.range_for_key(self, self.keycoder(key))
  428. def values(self):
  429. vd = self.valuedecoder
  430. for v in OrderedHashReader.values(self):
  431. yield vd(v)