/bangkokhotel/lib/python2.5/site-packages/whoosh/util.py

https://bitbucket.org/luisrodriguez/bangkokhotel · Python · 671 lines · 452 code · 85 blank · 134 comment · 45 complexity · 40649f8c29d80c1e7e4a6102934b9b0c MD5 · raw file

  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """Miscellaneous utility functions and classes.
  28. """
  29. from __future__ import with_statement
  30. import codecs
  31. import re
  32. import sys
  33. import time
  34. from array import array
  35. from bisect import insort, bisect_left, bisect_right
  36. from copy import copy
  37. from functools import wraps
  38. from struct import pack, unpack
  39. from threading import Lock
  40. from whoosh.compat import xrange, u, b, string_type
  41. from whoosh.compat import array_tobytes
  42. from whoosh.system import pack_ushort_le, pack_uint_le
  43. from whoosh.system import unpack_ushort_le, unpack_uint_le
  44. if sys.platform == 'win32':
  45. now = time.clock
  46. else:
  47. now = time.time
  48. # Note: these functions return a tuple of (text, length), so when you call
  49. # them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0]
  50. utf8encode = codecs.getencoder("utf-8")
  51. utf8decode = codecs.getdecoder("utf-8")
  52. #utf16encode = codecs.getencoder("utf-16-be")
  53. #utf16decode = codecs.getdecoder("utf-16-be")
  54. #utf32encode = codecs.getencoder("utf-32-be")
  55. #utf32decode = codecs.getdecoder("utf-32-be")
  56. # Functions
  57. def make_binary_tree(fn, args, **kwargs):
  58. """Takes a function/class that takes two positional arguments and a list of
  59. arguments and returns a binary tree of results/instances.
  60. >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3])
  61. UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3))
  62. Any keyword arguments given to this function are passed to the class
  63. initializer.
  64. """
  65. count = len(args)
  66. if not count:
  67. raise ValueError("Called make_binary_tree with empty list")
  68. elif count == 1:
  69. return args[0]
  70. half = count // 2
  71. return fn(make_binary_tree(fn, args[:half], **kwargs),
  72. make_binary_tree(fn, args[half:], **kwargs), **kwargs)
  73. def make_weighted_tree(fn, ls, **kwargs):
  74. """Takes a function/class that takes two positional arguments and a list of
  75. (weight, argument) tuples and returns a huffman-like weighted tree of
  76. results/instances.
  77. """
  78. if not ls:
  79. raise ValueError("Called make_weighted_tree with empty list")
  80. ls.sort()
  81. while len(ls) > 1:
  82. a = ls.pop(0)
  83. b = ls.pop(0)
  84. insort(ls, (a[0] + b[0], fn(a[1], b[1])))
  85. return ls[0][1]
  86. # Varint cache
  87. # Build a cache of the varint byte sequences for the first N integers, so we
  88. # don't have to constantly recalculate them on the fly. This makes a small but
  89. # noticeable difference.
  90. def _varint(i):
  91. a = array("B")
  92. while (i & ~0x7F) != 0:
  93. a.append((i & 0x7F) | 0x80)
  94. i = i >> 7
  95. a.append(i)
  96. return array_tobytes(a)
  97. _varint_cache_size = 512
  98. _varint_cache = []
  99. for i in xrange(0, _varint_cache_size):
  100. _varint_cache.append(_varint(i))
  101. _varint_cache = tuple(_varint_cache)
  102. def varint(i):
  103. """Encodes the given integer into a string of the minimum number of bytes.
  104. """
  105. if i < len(_varint_cache):
  106. return _varint_cache[i]
  107. return _varint(i)
  108. def varint_to_int(vi):
  109. b = ord(vi[0])
  110. p = 1
  111. i = b & 0x7f
  112. shift = 7
  113. while b & 0x80 != 0:
  114. b = ord(vi[p])
  115. p += 1
  116. i |= (b & 0x7F) << shift
  117. shift += 7
  118. return i
  119. def signed_varint(i):
  120. """Zig-zag encodes a signed integer into a varint.
  121. """
  122. if i >= 0:
  123. return varint(i << 1)
  124. return varint((i << 1) ^ (~0))
  125. def decode_signed_varint(i):
  126. """Zig-zag decodes an integer value.
  127. """
  128. if not i & 1:
  129. return i >> 1
  130. return (i >> 1) ^ (~0)
  131. def read_varint(readfn):
  132. """
  133. Reads a variable-length encoded integer.
  134. :param readfn: a callable that reads a given number of bytes,
  135. like file.read().
  136. """
  137. b = ord(readfn(1))
  138. i = b & 0x7F
  139. shift = 7
  140. while b & 0x80 != 0:
  141. b = ord(readfn(1))
  142. i |= (b & 0x7F) << shift
  143. shift += 7
  144. return i
  145. # Fibonacci function
  146. _fib_cache = {}
  147. def fib(n):
  148. """Returns the nth value in the Fibonacci sequence.
  149. """
  150. if n <= 2:
  151. return n
  152. if n in _fib_cache:
  153. return _fib_cache[n]
  154. result = fib(n - 1) + fib(n - 2)
  155. _fib_cache[n] = result
  156. return result
  157. # Float-to-byte encoding/decoding
  158. def float_to_byte(value, mantissabits=5, zeroexp=2):
  159. """Encodes a floating point number in a single byte.
  160. """
  161. # Assume int size == float size
  162. fzero = (63 - zeroexp) << mantissabits
  163. bits = unpack("i", pack("f", value))[0]
  164. smallfloat = bits >> (24 - mantissabits)
  165. if smallfloat < fzero:
  166. # Map negative numbers and 0 to 0
  167. # Map underflow to next smallest non-zero number
  168. if bits <= 0:
  169. result = chr(0)
  170. else:
  171. result = chr(1)
  172. elif smallfloat >= fzero + 0x100:
  173. # Map overflow to largest number
  174. result = chr(255)
  175. else:
  176. result = chr(smallfloat - fzero)
  177. return b(result)
  178. def byte_to_float(b, mantissabits=5, zeroexp=2):
  179. """Decodes a floating point number stored in a single byte.
  180. """
  181. if type(b) is not int:
  182. b = ord(b)
  183. if b == 0:
  184. return 0.0
  185. bits = (b & 0xff) << (24 - mantissabits)
  186. bits += (63 - zeroexp) << 24
  187. return unpack("f", pack("i", bits))[0]
  188. # Length-to-byte approximation functions
  189. # Old implementation:
  190. #def length_to_byte(length):
  191. # """Returns a logarithmic approximation of the given number, in the range
  192. # 0-255. The approximation has high precision at the low end (e.g.
  193. # 1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers
  194. # equal to or greater than 108116 all approximate to 255.
  195. #
  196. # This is useful for storing field lengths, where the general case is small
  197. # documents and very large documents are more rare.
  198. # """
  199. #
  200. # # This encoding formula works up to 108116 -> 255, so if the length is
  201. # # equal to or greater than that limit, just return 255.
  202. # if length >= 108116:
  203. # return 255
  204. #
  205. # # The parameters of this formula where chosen heuristically so that low
  206. # # numbers would approximate closely, and the byte range 0-255 would cover
  207. # # a decent range of document lengths (i.e. 1 to ~100000).
  208. # return int(round(log((length / 27.0) + 1, 1.033)))
  209. #def _byte_to_length(n):
  210. # return int(round((pow(1.033, n) - 1) * 27))
  211. #_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256)))
  212. #byte_to_length = _b2l_cache.__getitem__
  213. # New implementation
  214. # Instead of computing the actual formula to get the byte for any given length,
  215. # precompute the length associated with each byte, and use bisect to find the
  216. # nearest value. This gives quite a large speed-up.
  217. #
# Note that this does not give all the same answers as the old, "real"
# implementation. bisect_left returns the index of the first table entry that
# is greater than or equal to the length, so a length that falls between two
# entries always maps to the larger one (it "rounds up"), while the old
# implementation would "round up" or "round down" depending on the input.
# Since this is a fairly gross approximation anyway, I don't think it matters
# much.
  223. # Values generated using the formula from the "old" implementation above
# 256 ascending length values. The index of an entry is the byte that
# represents that length; the last entry (index 255) is 106374, the same
# constant length_to_byte uses as its saturation point.
_length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14,
16, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52,
54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114,
119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211,
219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373,
386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645,
667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066,
1103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641,
1696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516,
2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852,
3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889,
6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995,
9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868,
13294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403,
19011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314,
27183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620,
38863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780,
55556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876,
79414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374])
  243. def length_to_byte(length):
  244. if length is None:
  245. return 0
  246. if length >= 106374:
  247. return 255
  248. else:
  249. return bisect_left(_length_byte_cache, length)
  250. byte_to_length = _length_byte_cache.__getitem__
  251. # Prefix encoding functions
  252. def first_diff(a, b):
  253. """Returns the position of the first differing character in the strings
  254. a and b. For example, first_diff('render', 'rending') == 4. This function
  255. limits the return value to 255 so the difference can be encoded in a single
  256. byte.
  257. """
  258. i = 0
  259. for i in xrange(0, len(a)):
  260. if a[i] != b[i] or i == 255:
  261. break
  262. return i
  263. def prefix_encode(a, b):
  264. """Compresses string b as an integer (encoded in a byte) representing
  265. the prefix it shares with a, followed by the suffix encoded as UTF-8.
  266. """
  267. i = first_diff(a, b)
  268. return chr(i) + b[i:].encode("utf8")
  269. def prefix_encode_all(ls):
  270. """Compresses the given list of (unicode) strings by storing each string
  271. (except the first one) as an integer (encoded in a byte) representing
  272. the prefix it shares with its predecessor, followed by the suffix encoded
  273. as UTF-8.
  274. """
  275. last = u('')
  276. for w in ls:
  277. i = first_diff(last, w)
  278. yield chr(i) + w[i:].encode("utf8")
  279. last = w
  280. def prefix_decode_all(ls):
  281. """Decompresses a list of strings compressed by prefix_encode().
  282. """
  283. last = u('')
  284. for w in ls:
  285. i = ord(w[0])
  286. decoded = last[:i] + w[1:].decode("utf8")
  287. yield decoded
  288. last = decoded
  289. # Natural key sorting function
  290. _nkre = re.compile(r"\D+|\d+", re.UNICODE)
  291. def _nkconv(i):
  292. try:
  293. return int(i)
  294. except ValueError:
  295. return i.lower()
  296. def natural_key(s):
  297. """Converts string ``s`` into a tuple that will sort "naturally" (i.e.,
  298. ``name5`` will come before ``name10`` and ``1`` will come before ``A``).
  299. This function is designed to be used as the ``key`` argument to sorting
  300. functions.
  301. :param s: the str/unicode string to convert.
  302. :rtype: tuple
  303. """
  304. # Use _nkre to split the input string into a sequence of
  305. # digit runs and non-digit runs. Then use _nkconv() to convert
  306. # the digit runs into ints and the non-digit runs to lowercase.
  307. return tuple(_nkconv(m) for m in _nkre.findall(s))
  308. # Mixins and decorators
  309. class ClosableMixin(object):
  310. """Mix-in for classes with a close() method to allow them to be used as a
  311. context manager.
  312. """
  313. def __enter__(self):
  314. return self
  315. def __exit__(self, *exc_info):
  316. self.close()
  317. def protected(func):
  318. """Decorator for storage-access methods. This decorator (a) checks if the
  319. object has already been closed, and (b) synchronizes on a threading lock.
  320. The parent object must have 'is_closed' and '_sync_lock' attributes.
  321. """
  322. @wraps(func)
  323. def protected_wrapper(self, *args, **kwargs):
  324. if self.is_closed:
  325. raise Exception("%r has been closed" % self)
  326. with self._sync_lock:
  327. return func(self, *args, **kwargs)
  328. return protected_wrapper
  329. def synchronized(func):
  330. """Decorator for storage-access methods, which synchronizes on a threading
  331. lock. The parent object must have 'is_closed' and '_sync_lock' attributes.
  332. """
  333. @wraps(func)
  334. def synchronized_wrapper(self, *args, **kwargs):
  335. with self._sync_lock:
  336. return func(self, *args, **kwargs)
  337. return synchronized_wrapper
  338. def unbound_cache(func):
  339. """Caching decorator with an unbounded cache size.
  340. """
  341. cache = {}
  342. @wraps(func)
  343. def caching_wrapper(*args):
  344. try:
  345. return cache[args]
  346. except KeyError:
  347. result = func(*args)
  348. cache[args] = result
  349. return result
  350. return caching_wrapper
  351. def lru_cache(maxsize=100):
  352. """Double-barrel least-recently-used cache decorator. This is a simple
  353. LRU algorithm that keeps a primary and secondary dict. Keys are checked
  354. in the primary dict, and then the secondary. Once the primary dict fills
  355. up, the secondary dict is cleared and the two dicts are swapped.
  356. This function duplicates (more-or-less) the protocol of the
  357. ``functools.lru_cache`` decorator in the Python 3.2 standard library.
  358. Arguments to the cached function must be hashable.
  359. View the cache statistics named tuple (hits, misses, maxsize, currsize)
  360. with f.cache_info(). Clear the cache and statistics with f.cache_clear().
  361. Access the underlying function with f.__wrapped__.
  362. """
  363. def decorating_function(user_function):
  364. # Cache1, Cache2, Pointer, Hits, Misses
  365. stats = [{}, {}, 0, 0, 0]
  366. @wraps(user_function)
  367. def wrapper(*args):
  368. ptr = stats[2]
  369. a = stats[ptr]
  370. b = stats[not ptr]
  371. key = args
  372. if key in a:
  373. stats[3] += 1 # Hit
  374. return a[key]
  375. elif key in b:
  376. stats[3] += 1 # Hit
  377. return b[key]
  378. else:
  379. stats[4] += 1 # Miss
  380. result = user_function(*args)
  381. a[key] = result
  382. if len(a) >= maxsize:
  383. stats[2] = not ptr
  384. b.clear()
  385. return result
  386. def cache_info():
  387. """Report cache statistics"""
  388. return (stats[3], stats[4], maxsize, len(stats[0]) + len(stats[1]))
  389. def cache_clear():
  390. """Clear the cache and cache statistics"""
  391. stats[0].clear()
  392. stats[1].clear()
  393. stats[3] = stats[4] = 0
  394. wrapper.cache_info = cache_info
  395. wrapper.cache_clear = cache_clear
  396. return wrapper
  397. return decorating_function
  398. def clockface_lru_cache(maxsize=100):
  399. """Least-recently-used cache decorator.
  400. This function duplicates (more-or-less) the protocol of the
  401. ``functools.lru_cache`` decorator in the Python 3.2 standard library, but
  402. uses the clock face LRU algorithm instead of an ordered dictionary.
  403. If *maxsize* is set to None, the LRU features are disabled and the cache
  404. can grow without bound.
  405. Arguments to the cached function must be hashable.
  406. View the cache statistics named tuple (hits, misses, maxsize, currsize)
  407. with f.cache_info(). Clear the cache and statistics with f.cache_clear().
  408. Access the underlying function with f.__wrapped__.
  409. """
  410. def decorating_function(user_function):
  411. stats = [0, 0, 0] # hits, misses, hand
  412. data = {}
  413. if maxsize:
  414. # The keys at each point on the clock face
  415. clock_keys = [None] * maxsize
  416. # The "referenced" bits at each point on the clock face
  417. clock_refs = array("B", (0 for _ in xrange(maxsize)))
  418. lock = Lock()
  419. @wraps(user_function)
  420. def wrapper(*args):
  421. key = args
  422. try:
  423. with lock:
  424. pos, result = data[key]
  425. # The key is in the cache. Set the key's reference bit
  426. clock_refs[pos] = 1
  427. # Record a cache hit
  428. stats[0] += 1
  429. except KeyError:
  430. # Compute the value
  431. result = user_function(*args)
  432. with lock:
  433. # Current position of the clock hand
  434. hand = stats[2]
  435. # Remember to stop here after a full revolution
  436. end = hand
  437. # Sweep around the clock looking for a position with
  438. # the reference bit off
  439. while True:
  440. hand = (hand + 1) % maxsize
  441. current_ref = clock_refs[hand]
  442. if current_ref:
  443. # This position's "referenced" bit is set. Turn
  444. # the bit off and move on.
  445. clock_refs[hand] = 0
  446. elif not current_ref or hand == end:
  447. # We've either found a position with the
  448. # "reference" bit off or reached the end of the
  449. # circular cache. So we'll replace this
  450. # position with the new key
  451. current_key = clock_keys[hand]
  452. if current_key in data:
  453. del data[current_key]
  454. clock_keys[hand] = key
  455. clock_refs[hand] = 1
  456. break
  457. # Put the key and result in the cache
  458. data[key] = (hand, result)
  459. # Save the new hand position
  460. stats[2] = hand
  461. # Record a cache miss
  462. stats[1] += 1
  463. return result
  464. else:
  465. @wraps(user_function)
  466. def wrapper(*args):
  467. key = args
  468. try:
  469. result = data[key]
  470. stats[0] += 1
  471. except KeyError:
  472. result = user_function(*args)
  473. data[key] = result
  474. stats[1] += 1
  475. return result
  476. def cache_info():
  477. """Report cache statistics"""
  478. return (stats[0], stats[1], maxsize, len(data))
  479. def cache_clear():
  480. """Clear the cache and cache statistics"""
  481. data.clear()
  482. stats[0] = stats[1] = stats[2] = 0
  483. for i in xrange(maxsize):
  484. clock_keys[i] = None
  485. clock_refs[i] = 0
  486. wrapper.cache_info = cache_info
  487. wrapper.cache_clear = cache_clear
  488. return wrapper
  489. return decorating_function
  490. def find_object(name, blacklist=None, whitelist=None):
  491. """Imports and returns an object given a fully qualified name.
  492. >>> find_object("whoosh.analysis.StopFilter")
  493. <class 'whoosh.analysis.StopFilter'>
  494. """
  495. if blacklist:
  496. for pre in blacklist:
  497. if name.startswith(pre):
  498. raise TypeError("%r: can't instantiate names starting with %r"
  499. % (name, pre))
  500. if whitelist:
  501. passes = False
  502. for pre in whitelist:
  503. if name.startswith(pre):
  504. passes = True
  505. break
  506. if not passes:
  507. raise TypeError("Can't instantiate %r" % name)
  508. lastdot = name.rfind(".")
  509. assert lastdot > -1, "Name %r must be fully qualified" % name
  510. modname = name[:lastdot]
  511. clsname = name[lastdot + 1:]
  512. mod = __import__(modname, fromlist=[clsname])
  513. cls = getattr(mod, clsname)
  514. return cls
  515. def rcompile(pattern, flags=0, verbose=False):
  516. """A wrapper for re.compile that checks whether "pattern" is a regex object
  517. or a string to be compiled, and automatically adds the re.UNICODE flag.
  518. """
  519. if not isinstance(pattern, string_type):
  520. # If it's not a string, assume it's already a compiled pattern
  521. return pattern
  522. if verbose:
  523. flags |= re.VERBOSE
  524. return re.compile(pattern, re.UNICODE | flags)