/bangkokhotel/lib/python2.5/site-packages/whoosh/util.py
Python | 671 lines | 466 code | 72 blank | 133 comment | 36 complexity | 40649f8c29d80c1e7e4a6102934b9b0c MD5 | raw file
1# Copyright 2007 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6# 1. Redistributions of source code must retain the above copyright notice,
7# this list of conditions and the following disclaimer.
8#
9# 2. Redistributions in binary form must reproduce the above copyright
10# notice, this list of conditions and the following disclaimer in the
11# documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28"""Miscellaneous utility functions and classes.
29"""
30
31from __future__ import with_statement
32import codecs
33import re
34import sys
35import time
36from array import array
37from bisect import insort, bisect_left, bisect_right
38from copy import copy
39from functools import wraps
40from struct import pack, unpack
41from threading import Lock
42
43from whoosh.compat import xrange, u, b, string_type
44from whoosh.compat import array_tobytes
45from whoosh.system import pack_ushort_le, pack_uint_le
46from whoosh.system import unpack_ushort_le, unpack_uint_le
47
48
# Pick the most precise wall-clock timer for this platform: time.clock on
# Windows, time.time elsewhere.
# NOTE(review): time.clock was removed in Python 3.8 -- confirm the supported
# interpreter range before relying on the win32 branch.
now = time.clock if sys.platform == 'win32' else time.time
53
54
# Note: these functions return a tuple of (text, length), so when you call
# them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0]

utf8encode = codecs.getencoder("utf-8")
utf8decode = codecs.getdecoder("utf-8")

# Alternate fixed-width encodings, currently unused:
#utf16encode = codecs.getencoder("utf-16-be")
#utf16decode = codecs.getdecoder("utf-16-be")
#utf32encode = codecs.getencoder("utf-32-be")
#utf32decode = codecs.getdecoder("utf-32-be")
65
66
67# Functions
68
def make_binary_tree(fn, args, **kwargs):
    """Takes a function/class that takes two positional arguments and a list of
    arguments and returns a binary tree of results/instances.

    >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3])
    UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3))

    Any keyword arguments given to this function are passed to the class
    initializer.

    :raises ValueError: if ``args`` is empty.
    """

    if not args:
        raise ValueError("Called make_binary_tree with empty list")
    if len(args) == 1:
        # A single argument is its own (leaf) tree
        return args[0]

    # Recursively build balanced subtrees from each half of the list
    mid = len(args) // 2
    left = make_binary_tree(fn, args[:mid], **kwargs)
    right = make_binary_tree(fn, args[mid:], **kwargs)
    return fn(left, right, **kwargs)
89
90
def make_weighted_tree(fn, ls, **kwargs):
    """Takes a function/class that takes two positional arguments and a list of
    (weight, argument) tuples and returns a huffman-like weighted tree of
    results/instances.

    The two lightest entries are repeatedly combined with ``fn`` until a
    single tree remains.

    :param fn: a callable taking two positional arguments (the subtrees to
        combine).
    :param ls: a non-empty list of ``(weight, argument)`` tuples.
    :raises ValueError: if ``ls`` is empty.
    """

    if not ls:
        raise ValueError("Called make_weighted_tree with empty list")

    # Decorate each entry with a unique, monotonically increasing serial
    # number so that ties in weight never fall through to comparing the
    # argument objects themselves, which may not define an ordering (e.g.
    # matcher instances on Python 3 would raise TypeError).
    serial = len(ls)
    items = sorted((weight, n, arg) for n, (weight, arg) in enumerate(ls))
    while len(items) > 1:
        w1, _, t1 = items.pop(0)
        w2, _, t2 = items.pop(0)
        # Combine the two lightest subtrees; the merged node's weight is the
        # sum of its children's weights
        insort(items, (w1 + w2, serial, fn(t1, t2)))
        serial += 1
    return items[0][2]
106
107
108# Varint cache
109
110# Build a cache of the varint byte sequences for the first N integers, so we
111# don't have to constantly recalculate them on the fly. This makes a small but
112# noticeable difference.
113
114def _varint(i):
115 a = array("B")
116 while (i & ~0x7F) != 0:
117 a.append((i & 0x7F) | 0x80)
118 i = i >> 7
119 a.append(i)
120 return array_tobytes(a)
121
122
# Precompute the encoded form of every small integer once at import time;
# small values dominate in practice, so this avoids re-encoding them on
# every call.
_varint_cache_size = 512
_varint_cache = tuple(_varint(n) for n in xrange(_varint_cache_size))
128
129
def varint(i):
    """Encodes the given integer into a string of the minimum number of bytes,
    using the precomputed table for small values.
    """

    if i < len(_varint_cache):
        # Common case: small value, already encoded at import time
        return _varint_cache[i]
    return _varint(i)
136
137
def varint_to_int(vi):
    """Decodes a variable-length encoded integer from the byte string ``vi``
    and returns it.

    :param vi: encoded bytes as produced by :func:`varint`.
    """

    # Indexing a Python 2 str yields a 1-char string, but indexing Python 3
    # bytes yields an int -- handle both (same pattern as byte_to_float).
    b = vi[0]
    if not isinstance(b, int):
        b = ord(b)
    p = 1
    i = b & 0x7f
    shift = 7
    while b & 0x80 != 0:
        # High bit set means another payload byte follows
        b = vi[p]
        if not isinstance(b, int):
            b = ord(b)
        p += 1
        i |= (b & 0x7F) << shift
        shift += 7
    return i
149
150
def signed_varint(i):
    """Zig-zag encodes a signed integer into a varint.
    """

    # Zig-zag maps n >= 0 to 2n and n < 0 to -2n - 1, so small magnitudes
    # of either sign encode to few bytes.
    zigzag = (i << 1) if i >= 0 else ((i << 1) ^ (~0))
    return varint(zigzag)
158
159
def decode_signed_varint(i):
    """Zig-zag decodes an integer value.
    """

    magnitude = i >> 1
    # Odd values encode negative numbers
    return magnitude ^ (~0) if i & 1 else magnitude
167
168
def read_varint(readfn):
    """
    Reads a variable-length encoded integer.

    :param readfn: a callable that reads a given number of bytes,
        like file.read().
    """

    result = 0
    shift = 0
    while True:
        # ord() accepts both a 1-char str (Py2) and 1-byte bytes (Py3)
        byte = ord(readfn(1))
        result |= (byte & 0x7F) << shift
        if not byte & 0x80:
            # High bit clear: this was the last byte of the value
            break
        shift += 7
    return result
186
187
188# Fibonacci function
189
# Memo table shared by every call to fib()
_fib_cache = {}


def fib(n):
    """Returns the nth value in the Fibonacci sequence.

    NOTE(review): this is a shifted variant -- fib(1) == 1, fib(2) == 2,
    fib(3) == 3, fib(4) == 5, ... -- because values of n <= 2 are returned
    as-is.  Callers appear to rely on this exact sequence; confirm before
    "fixing" it to the textbook definition.
    """

    if n <= 2:
        return n
    try:
        return _fib_cache[n]
    except KeyError:
        result = fib(n - 1) + fib(n - 2)
        _fib_cache[n] = result
        return result
204
205
206# Float-to-byte encoding/decoding
207
def float_to_byte(value, mantissabits=5, zeroexp=2):
    """Encodes a floating point number in a single byte.

    The float is reduced to ``mantissabits`` bits of mantissa plus a small
    biased exponent; negatives and zero clamp to 0 and overflow clamps
    to 255.
    """

    # Reinterpret the float's IEEE 754 bit pattern as an int
    # (assumes int size == float size)
    fzero = (63 - zeroexp) << mantissabits
    bits = unpack("i", pack("f", value))[0]
    smallfloat = bits >> (24 - mantissabits)

    if smallfloat < fzero:
        # Negative numbers and 0 map to 0; positive underflow maps to the
        # next smallest non-zero code
        code = 0 if bits <= 0 else 1
    elif smallfloat >= fzero + 0x100:
        # Overflow maps to the largest code
        code = 255
    else:
        code = smallfloat - fzero
    return b(chr(code))
230
231
def byte_to_float(b, mantissabits=5, zeroexp=2):
    """Decodes a floating point number stored in a single byte.
    """

    # Accept either an int code or a 1-character byte/str
    code = b if type(b) is int else ord(b)
    if code == 0:
        return 0.0

    # Rebuild the IEEE 754 bit pattern: shift the mantissa back into place
    # and re-apply the exponent bias, then reinterpret as a float
    bits = ((code & 0xff) << (24 - mantissabits)) + ((63 - zeroexp) << 24)
    return unpack("f", pack("i", bits))[0]
243
244
245# Length-to-byte approximation functions
246
247# Old implementation:
248
249#def length_to_byte(length):
250# """Returns a logarithmic approximation of the given number, in the range
251# 0-255. The approximation has high precision at the low end (e.g.
252# 1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers
253# equal to or greater than 108116 all approximate to 255.
254#
255# This is useful for storing field lengths, where the general case is small
256# documents and very large documents are more rare.
257# """
258#
259# # This encoding formula works up to 108116 -> 255, so if the length is
260# # equal to or greater than that limit, just return 255.
261# if length >= 108116:
262# return 255
263#
264# # The parameters of this formula where chosen heuristically so that low
265# # numbers would approximate closely, and the byte range 0-255 would cover
266# # a decent range of document lengths (i.e. 1 to ~100000).
267# return int(round(log((length / 27.0) + 1, 1.033)))
268#def _byte_to_length(n):
269# return int(round((pow(1.033, n) - 1) * 27))
270#_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256)))
271#byte_to_length = _b2l_cache.__getitem__
272
273# New implementation
274
275# Instead of computing the actual formula to get the byte for any given length,
276# precompute the length associated with each byte, and use bisect to find the
277# nearest value. This gives quite a large speed-up.
278#
279# Note that this does not give all the same answers as the old, "real"
280# implementation since this implementation always "rounds down" (thanks to the
281# bisect_left) while the old implementation would "round up" or "round down"
282# depending on the input. Since this is a fairly gross approximation anyway,
283# I don't think it matters much.
284
285# Values generated using the formula from the "old" implementation above
_length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14,
16, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52,
54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114,
119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211,
219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373,
386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645,
667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066,
1103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641,
1696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516,
2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852,
3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889,
6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995,
9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868,
13294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403,
19011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314,
27183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620,
38863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780,
55556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876,
79414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374])


def length_to_byte(length):
    """Approximates a field length as a single byte code (0-255) by bisecting
    the precomputed table of representative lengths.

    ``None`` is treated as 0; lengths at or beyond the largest table entry
    (106374) all map to 255.
    """
    if length is None:
        return 0
    if length >= 106374:
        return 255
    # Index of the nearest precomputed length
    return bisect_left(_length_byte_cache, length)


def byte_to_length(n):
    """Returns the approximate document length represented by the byte
    code ``n`` (0-255).
    """
    return _length_byte_cache[n]
316
317
318# Prefix encoding functions
319
def first_diff(a, b):
    """Returns the position of the first differing character in the strings
    a and b. For example, first_diff('render', 'rending') == 4. This function
    limits the return value to 255 so the difference can be encoded in a single
    byte.

    If one string is a prefix of the other, the length of the shorter string
    (capped at 255) is returned.
    """

    # The previous for-loop version returned len(a) - 1 when a was a prefix
    # of b (losing one character of shared prefix) and raised IndexError when
    # b was shorter than a. This bounded scan handles both correctly.
    i = 0
    while i < 255 and i < len(a) and i < len(b) and a[i] == b[i]:
        i += 1
    return i
332
333
def prefix_encode(a, b):
    """Compresses string b as an integer (encoded in a byte) representing
    the prefix it shares with a, followed by the suffix encoded as UTF-8.

    :param a: the reference string (usually b's predecessor in sorted order).
    :param b: the string to encode relative to ``a``.

    NOTE(review): ``chr(i) + <bytes>`` only works on Python 2, where the
    UTF-8-encoded suffix is a str; on Python 3 this concatenation would raise
    TypeError. Confirm this code path runs under Python 2 only.
    """
    i = first_diff(a, b)
    return chr(i) + b[i:].encode("utf8")
340
341
def prefix_encode_all(ls):
    """Compresses the given list of (unicode) strings by storing each string
    (except the first one) as an integer (encoded in a byte) representing
    the prefix it shares with its predecessor, followed by the suffix encoded
    as UTF-8.

    :param ls: an iterable of (unicode) strings, assumed to be in sorted
        order so that neighbors share long prefixes.

    NOTE(review): as with prefix_encode(), ``chr(i) + <bytes>`` is valid on
    Python 2 only; on Python 3 this would raise TypeError.
    """

    # Each string is encoded against the one before it; the first string is
    # encoded against the empty string (i.e. stored whole).
    last = u('')
    for w in ls:
        i = first_diff(last, w)
        yield chr(i) + w[i:].encode("utf8")
        last = w
354
355
def prefix_decode_all(ls):
    """Decompresses a list of strings compressed by prefix_encode().

    :param ls: an iterable of encoded strings, in the same order they were
        produced by prefix_encode_all().

    NOTE(review): ``ord(w[0])`` assumes Python 2 str semantics (indexing
    yields a 1-char string); on Python 3 bytes, ``w[0]`` is already an int
    and ord() would raise TypeError.
    """

    # Rebuild each string from its predecessor's prefix plus its own suffix
    last = u('')
    for w in ls:
        i = ord(w[0])
        decoded = last[:i] + w[1:].decode("utf8")
        yield decoded
        last = decoded
366
367
368# Natural key sorting function
369
# Matches alternating runs of non-digits and digits
_nkre = re.compile(r"\D+|\d+", re.UNICODE)


def _nkconv(i):
    """Converts a digit run to an int; lowercases any other run."""
    try:
        return int(i)
    except ValueError:
        return i.lower()


def natural_key(s):
    """Converts string ``s`` into a tuple that will sort "naturally" (i.e.,
    ``name5`` will come before ``name10`` and ``1`` will come before ``A``).
    This function is designed to be used as the ``key`` argument to sorting
    functions.

    :param s: the str/unicode string to convert.
    :rtype: tuple
    """

    # Split the string into digit and non-digit runs, then normalize each
    # run (ints for digits, lowercase for everything else)
    return tuple(map(_nkconv, _nkre.findall(s)))
394
395
396# Mixins and decorators
397
class ClosableMixin(object):
    """Mix-in for classes with a close() method to allow them to be used as a
    context manager.
    """

    def __enter__(self):
        # Entering the context simply hands back the object itself
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always close on exit, whether or not an exception occurred
        self.close()
408
409
def protected(func):
    """Decorator for storage-access methods. This decorator (a) checks if the
    object has already been closed, and (b) synchronizes on a threading lock.
    The parent object must have 'is_closed' and '_sync_lock' attributes.
    """

    @wraps(func)
    def protected_wrapper(self, *args, **kwargs):
        # Refuse to operate on an object that has already been closed
        if self.is_closed:
            raise Exception("%r has been closed" % self)
        # Serialize access through the parent object's lock
        self._sync_lock.acquire()
        try:
            return func(self, *args, **kwargs)
        finally:
            self._sync_lock.release()

    return protected_wrapper
424
425
def synchronized(func):
    """Decorator for storage-access methods, which synchronizes on a threading
    lock. The parent object must have 'is_closed' and '_sync_lock' attributes.
    """

    @wraps(func)
    def synchronized_wrapper(self, *args, **kwargs):
        # Serialize access through the parent object's lock
        self._sync_lock.acquire()
        try:
            return func(self, *args, **kwargs)
        finally:
            self._sync_lock.release()

    return synchronized_wrapper
437
438
def unbound_cache(func):
    """Caching decorator with an unbounded cache size.
    """

    # Private sentinel distinguishes "not cached" from a cached None
    _missing = object()
    cache = {}

    @wraps(func)
    def caching_wrapper(*args):
        value = cache.get(args, _missing)
        if value is _missing:
            value = func(*args)
            cache[args] = value
        return value

    return caching_wrapper
455
456
def lru_cache(maxsize=100):
    """Double-barrel least-recently-used cache decorator. This is a simple
    LRU algorithm that keeps a primary and secondary dict. Keys are checked
    in the primary dict, and then the secondary. Once the primary dict fills
    up, the secondary dict is cleared and the two dicts are swapped.

    This function duplicates (more-or-less) the protocol of the
    ``functools.lru_cache`` decorator in the Python 3.2 standard library.

    Arguments to the cached function must be hashable.

    View the cache statistics named tuple (hits, misses, maxsize, currsize)
    with f.cache_info(). Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.
    """

    def decorating_function(user_function):
        # Mutable closure state: [dict A, dict B, pointer, hits, misses].
        # A list is used instead of nonlocal for Python 2 compatibility.
        state = [{}, {}, 0, 0, 0]

        @wraps(user_function)
        def wrapper(*args):
            ptr = state[2]
            primary = state[ptr]
            secondary = state[not ptr]

            # Check the primary dict first, then the secondary
            if args in primary:
                state[3] += 1  # Hit
                return primary[args]
            if args in secondary:
                state[3] += 1  # Hit
                return secondary[args]

            state[4] += 1  # Miss
            result = user_function(*args)
            primary[args] = result
            if len(primary) >= maxsize:
                # Primary is full: swap the barrels and empty the old one
                state[2] = not ptr
                secondary.clear()
            return result

        def cache_info():
            """Report cache statistics"""
            return (state[3], state[4], maxsize, len(state[0]) + len(state[1]))

        def cache_clear():
            """Clear the cache and cache statistics"""
            state[0].clear()
            state[1].clear()
            state[3] = state[4] = 0

        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear

        return wrapper
    return decorating_function
514
515
def clockface_lru_cache(maxsize=100):
    """Least-recently-used cache decorator.

    This function duplicates (more-or-less) the protocol of the
    ``functools.lru_cache`` decorator in the Python 3.2 standard library, but
    uses the clock face LRU algorithm instead of an ordered dictionary.

    If *maxsize* is set to None, the LRU features are disabled and the cache
    can grow without bound.

    Arguments to the cached function must be hashable.

    View the cache statistics named tuple (hits, misses, maxsize, currsize)
    with f.cache_info(). Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.
    """

    def decorating_function(user_function):

        stats = [0, 0, 0]  # hits, misses, hand
        data = {}

        if maxsize:
            # The keys at each point on the clock face
            clock_keys = [None] * maxsize
            # The "referenced" bits at each point on the clock face
            clock_refs = array("B", (0 for _ in xrange(maxsize)))
            lock = Lock()

            @wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    with lock:
                        pos, result = data[key]
                        # The key is in the cache. Set the key's reference bit
                        clock_refs[pos] = 1
                        # Record a cache hit
                        stats[0] += 1
                except KeyError:
                    # Compute the value
                    result = user_function(*args)
                    with lock:
                        # Current position of the clock hand
                        hand = stats[2]
                        # Remember to stop here after a full revolution
                        end = hand
                        # Sweep around the clock looking for a position with
                        # the reference bit off
                        while True:
                            hand = (hand + 1) % maxsize
                            current_ref = clock_refs[hand]
                            if current_ref:
                                # This position's "referenced" bit is set. Turn
                                # the bit off and move on.
                                clock_refs[hand] = 0
                            elif not current_ref or hand == end:
                                # We've either found a position with the
                                # "reference" bit off or reached the end of the
                                # circular cache. So we'll replace this
                                # position with the new key
                                current_key = clock_keys[hand]
                                if current_key in data:
                                    del data[current_key]
                                clock_keys[hand] = key
                                clock_refs[hand] = 1
                                break
                        # Put the key and result in the cache
                        data[key] = (hand, result)
                        # Save the new hand position
                        stats[2] = hand
                        # Record a cache miss
                        stats[1] += 1
                return result

        else:
            # Unbounded mode: a plain dict cache with hit/miss counting
            @wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    result = data[key]
                    stats[0] += 1
                except KeyError:
                    result = user_function(*args)
                    data[key] = result
                    stats[1] += 1
                return result

        def cache_info():
            """Report cache statistics"""
            return (stats[0], stats[1], maxsize, len(data))

        def cache_clear():
            """Clear the cache and cache statistics"""
            data.clear()
            stats[0] = stats[1] = stats[2] = 0
            # The clock structures only exist in bounded mode; previously this
            # loop ran unconditionally and crashed when maxsize was None.
            if maxsize:
                for i in xrange(maxsize):
                    clock_keys[i] = None
                    clock_refs[i] = 0

        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return wrapper

    return decorating_function
621
622
def find_object(name, blacklist=None, whitelist=None):
    """Imports and returns an object given a fully qualified name.

    >>> find_object("whoosh.analysis.StopFilter")
    <class 'whoosh.analysis.StopFilter'>

    :param name: fully qualified dotted name of the object to import.
    :param blacklist: optional list of name prefixes that must not be
        instantiated.
    :param whitelist: optional list of name prefixes; if given, ``name``
        must start with one of them.
    :raises TypeError: if the name is blacklisted or not whitelisted.
    """

    # Reject explicitly forbidden prefixes first
    if blacklist:
        for pre in blacklist:
            if name.startswith(pre):
                raise TypeError("%r: can't instantiate names starting with %r"
                                % (name, pre))
    # If a whitelist is given, the name must match at least one prefix
    if whitelist:
        if not any(name.startswith(pre) for pre in whitelist):
            raise TypeError("Can't instantiate %r" % name)

    lastdot = name.rfind(".")
    assert lastdot > -1, "Name %r must be fully qualified" % name

    modname = name[:lastdot]
    clsname = name[lastdot + 1:]
    mod = __import__(modname, fromlist=[clsname])
    return getattr(mod, clsname)
653
654
def rcompile(pattern, flags=0, verbose=False):
    """A wrapper for re.compile that checks whether "pattern" is a regex object
    or a string to be compiled, and automatically adds the re.UNICODE flag.
    """

    if isinstance(pattern, string_type):
        if verbose:
            flags |= re.VERBOSE
        return re.compile(pattern, re.UNICODE | flags)
    # Anything that isn't a string is assumed to be an already-compiled
    # pattern object and is returned unchanged
    return pattern
666
667
668
669
670
671