/tools/genome_diversity/cdblib.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 230 lines · 132 code · 47 blank · 51 comment · 36 complexity · 755bdbfdc604256b5329cc01d487b875 MD5 · raw file

  1. #!/usr/bin/env python2.5
  2. '''
  3. Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables
  4. that efficiently handle many keys, while remaining space-efficient.
  5. http://cr.yp.to/cdb.html
  6. When generated databases are only used with Python code, consider using hash()
  7. rather than djb_hash() for a tidy speedup.
  8. '''
  9. from _struct import Struct
  10. from itertools import chain
  11. def py_djb_hash(s):
  12. '''Return the value of DJB's hash function for the given 8-bit string.'''
  13. h = 5381
  14. for c in s:
  15. h = (((h << 5) + h) ^ ord(c)) & 0xffffffff
  16. return h
  17. try:
  18. from _cdblib import djb_hash
  19. except ImportError:
  20. djb_hash = py_djb_hash
  21. read_2_le4 = Struct('<LL').unpack
  22. write_2_le4 = Struct('<LL').pack
  23. class Reader(object):
  24. '''A dictionary-like object for reading a Constant Database accessed
  25. through a string or string-like sequence, such as mmap.mmap().'''
  26. def __init__(self, data, hashfn=djb_hash):
  27. '''Create an instance reading from a sequence and using hashfn to hash
  28. keys.'''
  29. if len(data) < 2048:
  30. raise IOError('CDB too small')
  31. self.data = data
  32. self.hashfn = hashfn
  33. self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)]
  34. self.table_start = min(p[0] for p in self.index)
  35. # Assume load load factor is 0.5 like official CDB.
  36. self.length = sum(p[1] >> 1 for p in self.index)
  37. def iteritems(self):
  38. '''Like dict.iteritems(). Items are returned in insertion order.'''
  39. pos = 2048
  40. while pos < self.table_start:
  41. klen, dlen = read_2_le4(self.data[pos:pos+8])
  42. pos += 8
  43. key = self.data[pos:pos+klen]
  44. pos += klen
  45. data = self.data[pos:pos+dlen]
  46. pos += dlen
  47. yield key, data
  48. def items(self):
  49. '''Like dict.items().'''
  50. return list(self.iteritems())
  51. def iterkeys(self):
  52. '''Like dict.iterkeys().'''
  53. return (p[0] for p in self.iteritems())
  54. __iter__ = iterkeys
  55. def itervalues(self):
  56. '''Like dict.itervalues().'''
  57. return (p[1] for p in self.iteritems())
  58. def keys(self):
  59. '''Like dict.keys().'''
  60. return [p[0] for p in self.iteritems()]
  61. def values(self):
  62. '''Like dict.values().'''
  63. return [p[1] for p in self.iteritems()]
  64. def __getitem__(self, key):
  65. '''Like dict.__getitem__().'''
  66. value = self.get(key)
  67. if value is None:
  68. raise KeyError(key)
  69. return value
  70. def has_key(self, key):
  71. '''Return True if key exists in the database.'''
  72. return self.get(key) is not None
  73. __contains__ = has_key
  74. def __len__(self):
  75. '''Return the number of records in the database.'''
  76. return self.length
  77. def gets(self, key):
  78. '''Yield values for key in insertion order.'''
  79. # Truncate to 32 bits and remove sign.
  80. h = self.hashfn(key) & 0xffffffff
  81. start, nslots = self.index[h & 0xff]
  82. if nslots:
  83. end = start + (nslots << 3)
  84. slot_off = start + (((h >> 8) % nslots) << 3)
  85. for pos in chain(xrange(slot_off, end, 8),
  86. xrange(start, slot_off, 8)):
  87. rec_h, rec_pos = read_2_le4(self.data[pos:pos+8])
  88. if not rec_h:
  89. break
  90. elif rec_h == h:
  91. klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8])
  92. rec_pos += 8
  93. if self.data[rec_pos:rec_pos+klen] == key:
  94. rec_pos += klen
  95. yield self.data[rec_pos:rec_pos+dlen]
  96. def get(self, key, default=None):
  97. '''Get the first value for key, returning default if missing.'''
  98. # Avoid exception catch when handling default case; much faster.
  99. return chain(self.gets(key), (default,)).next()
  100. def getint(self, key, default=None, base=0):
  101. '''Get the first value for key converted it to an int, returning
  102. default if missing.'''
  103. value = self.get(key, default)
  104. if value is not default:
  105. return int(value, base)
  106. return value
  107. def getints(self, key, base=0):
  108. '''Yield values for key in insertion order after converting to int.'''
  109. return (int(v, base) for v in self.gets(key))
  110. def getstring(self, key, default=None, encoding='utf-8'):
  111. '''Get the first value for key decoded as unicode, returning default if
  112. not found.'''
  113. value = self.get(key, default)
  114. if value is not default:
  115. return value.decode(encoding)
  116. return value
  117. def getstrings(self, key, encoding='utf-8'):
  118. '''Yield values for key in insertion order after decoding as
  119. unicode.'''
  120. return (v.decode(encoding) for v in self.gets(key))
  121. class Writer(object):
  122. '''Object for building new Constant Databases, and writing them to a
  123. seekable file-like object.'''
  124. def __init__(self, fp, hashfn=djb_hash):
  125. '''Create an instance writing to a file-like object, using hashfn to
  126. hash keys.'''
  127. self.fp = fp
  128. self.hashfn = hashfn
  129. fp.write('\x00' * 2048)
  130. self._unordered = [[] for i in xrange(256)]
  131. def put(self, key, value=''):
  132. '''Write a string key/value pair to the output file.'''
  133. assert type(key) is str and type(value) is str
  134. pos = self.fp.tell()
  135. self.fp.write(write_2_le4(len(key), len(value)))
  136. self.fp.write(key)
  137. self.fp.write(value)
  138. h = self.hashfn(key) & 0xffffffff
  139. self._unordered[h & 0xff].append((h, pos))
  140. def puts(self, key, values):
  141. '''Write more than one value for the same key to the output file.
  142. Equivalent to calling put() in a loop.'''
  143. for value in values:
  144. self.put(key, value)
  145. def putint(self, key, value):
  146. '''Write an integer as a base-10 string associated with the given key
  147. to the output file.'''
  148. self.put(key, str(value))
  149. def putints(self, key, values):
  150. '''Write zero or more integers for the same key to the output file.
  151. Equivalent to calling putint() in a loop.'''
  152. self.puts(key, (str(value) for value in values))
  153. def putstring(self, key, value, encoding='utf-8'):
  154. '''Write a unicode string associated with the given key to the output
  155. file after encoding it as UTF-8 or the given encoding.'''
  156. self.put(key, unicode.encode(value, encoding))
  157. def putstrings(self, key, values, encoding='utf-8'):
  158. '''Write zero or more unicode strings to the output file. Equivalent to
  159. calling putstring() in a loop.'''
  160. self.puts(key, (unicode.encode(value, encoding) for value in values))
  161. def finalize(self):
  162. '''Write the final hash tables to the output file, and write out its
  163. index. The output file remains open upon return.'''
  164. index = []
  165. for tbl in self._unordered:
  166. length = len(tbl) << 1
  167. ordered = [(0, 0)] * length
  168. for pair in tbl:
  169. where = (pair[0] >> 8) % length
  170. for i in chain(xrange(where, length), xrange(0, where)):
  171. if not ordered[i][0]:
  172. ordered[i] = pair
  173. break
  174. index.append((self.fp.tell(), length))
  175. for pair in ordered:
  176. self.fp.write(write_2_le4(*pair))
  177. self.fp.seek(0)
  178. for pair in index:
  179. self.fp.write(write_2_le4(*pair))
  180. self.fp = None # prevent double finalize()