PageRenderTime 69ms CodeModel.GetById 15ms app.highlight 47ms RepoModel.GetById 1ms app.codeStats 0ms

/bangkokhotel/lib/python2.5/site-packages/whoosh/util.py

https://bitbucket.org/luisrodriguez/bangkokhotel
Python | 671 lines | 466 code | 72 blank | 133 comment | 36 complexity | 40649f8c29d80c1e7e4a6102934b9b0c MD5 | raw file
  1# Copyright 2007 Matt Chaput. All rights reserved.
  2#
  3# Redistribution and use in source and binary forms, with or without
  4# modification, are permitted provided that the following conditions are met:
  5#
  6#    1. Redistributions of source code must retain the above copyright notice,
  7#       this list of conditions and the following disclaimer.
  8#
  9#    2. Redistributions in binary form must reproduce the above copyright
 10#       notice, this list of conditions and the following disclaimer in the
 11#       documentation and/or other materials provided with the distribution.
 12#
 13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
 14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 23#
 24# The views and conclusions contained in the software and documentation are
 25# those of the authors and should not be interpreted as representing official
 26# policies, either expressed or implied, of Matt Chaput.
 27
 28"""Miscellaneous utility functions and classes.
 29"""
 30
 31from __future__ import with_statement
 32import codecs
 33import re
 34import sys
 35import time
 36from array import array
 37from bisect import insort, bisect_left, bisect_right
 38from copy import copy
 39from functools import wraps
 40from struct import pack, unpack
 41from threading import Lock
 42
 43from whoosh.compat import xrange, u, b, string_type
 44from whoosh.compat import array_tobytes
 45from whoosh.system import pack_ushort_le, pack_uint_le
 46from whoosh.system import unpack_ushort_le, unpack_uint_le
 47
 48
 49if sys.platform == 'win32':
 50    now = time.clock
 51else:
 52    now = time.time
 53
 54
 55# Note: these functions return a tuple of (text, length), so when you call
 56# them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0]
 57
 58utf8encode = codecs.getencoder("utf-8")
 59utf8decode = codecs.getdecoder("utf-8")
 60
 61#utf16encode = codecs.getencoder("utf-16-be")
 62#utf16decode = codecs.getdecoder("utf-16-be")
 63#utf32encode = codecs.getencoder("utf-32-be")
 64#utf32decode = codecs.getdecoder("utf-32-be")
 65
 66
 67# Functions
 68
 69def make_binary_tree(fn, args, **kwargs):
 70    """Takes a function/class that takes two positional arguments and a list of
 71    arguments and returns a binary tree of results/instances.
 72    
 73    >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3])
 74    UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3))
 75    
 76    Any keyword arguments given to this function are passed to the class
 77    initializer.
 78    """
 79
 80    count = len(args)
 81    if not count:
 82        raise ValueError("Called make_binary_tree with empty list")
 83    elif count == 1:
 84        return args[0]
 85
 86    half = count // 2
 87    return fn(make_binary_tree(fn, args[:half], **kwargs),
 88              make_binary_tree(fn, args[half:], **kwargs), **kwargs)
 89
 90
 91def make_weighted_tree(fn, ls, **kwargs):
 92    """Takes a function/class that takes two positional arguments and a list of
 93    (weight, argument) tuples and returns a huffman-like weighted tree of
 94    results/instances.
 95    """
 96
 97    if not ls:
 98        raise ValueError("Called make_weighted_tree with empty list")
 99
100    ls.sort()
101    while len(ls) > 1:
102        a = ls.pop(0)
103        b = ls.pop(0)
104        insort(ls, (a[0] + b[0], fn(a[1], b[1])))
105    return ls[0][1]
106
107
108# Varint cache
109
110# Build a cache of the varint byte sequences for the first N integers, so we
111# don't have to constantly recalculate them on the fly. This makes a small but
112# noticeable difference.
113
def _varint(i):
    """Encodes the non-negative integer ``i`` in variable-length form.

    The value is emitted seven bits at a time, least-significant group first.
    Every byte except the last has its high bit (0x80) set to signal that more
    bytes follow. The collected bytes are converted with ``array_tobytes``.

    :param i: the integer to encode; must be >= 0.
    :raises ValueError: if ``i`` is negative.
    """

    if i < 0:
        # Right-shifting a negative int never reaches zero, so without this
        # guard the loop below would run forever, growing the array
        raise ValueError("Can't encode negative number %r as a varint" % (i,))

    a = array("B")
    while (i & ~0x7F) != 0:
        # More than seven bits remain: emit the low seven with the
        # continuation bit set
        a.append((i & 0x7F) | 0x80)
        i = i >> 7
    a.append(i)
    return array_tobytes(a)
121
122
# Number of integers (0 .. size - 1) whose encodings are precomputed
_varint_cache_size = 512
# Immutable lookup table: _varint_cache[n] is the encoding of n
_varint_cache = tuple(_varint(n) for n in xrange(0, _varint_cache_size))
128
129
def varint(i):
    """Encodes the given integer into a string of the minimum number of bytes.

    Encodings of small integers are served from a precomputed table; larger
    values are encoded on the fly.

    :param i: the integer to encode; must be >= 0 (use ``signed_varint`` for
        values that may be negative).
    :raises ValueError: if ``i`` is negative.
    """
    if i < 0:
        # A negative number would otherwise be used as a negative index into
        # the cache tuple and silently return the encoding of a different
        # number
        raise ValueError("Can't encode negative number %r as a varint" % (i,))
    if i < len(_varint_cache):
        return _varint_cache[i]
    return _varint(i)
136
137
def varint_to_int(vi):
    """Decodes a variable-length encoded integer from the start of ``vi``.

    :param vi: an indexable byte string (Python 2 ``str`` or Python 3
        ``bytes``) beginning with a varint. Bytes after the end of the varint
        are ignored.
    :returns: the decoded integer.
    :raises IndexError: if ``vi`` is empty or ends before the varint does.
    """

    def byte_at(pos):
        # Indexing Python 3 bytes already yields an int; a Python 2 str (or a
        # text string) yields a one-character string that needs ord()
        value = vi[pos]
        if not isinstance(value, int):
            value = ord(value)
        return value

    current = byte_at(0)
    p = 1
    i = current & 0x7f
    shift = 7
    # The high bit of each byte says whether another byte follows
    while current & 0x80 != 0:
        current = byte_at(p)
        p += 1
        i |= (current & 0x7F) << shift
        shift += 7
    return i
149
150
def signed_varint(i):
    """Zig-zag encodes a signed integer and returns it as a varint.

    Non-negative numbers map to even values and negative numbers to odd
    values, so the number handed to ``varint`` is never negative.
    """

    zigzag = i << 1
    if i < 0:
        # XOR with all one-bits, i.e. bitwise complement
        zigzag = zigzag ^ (~0)
    return varint(zigzag)
158
159
def decode_signed_varint(i):
    """Zig-zag decodes an integer value.

    Even inputs decode to non-negative numbers (``i / 2``); odd inputs decode
    to negative numbers.
    """

    half = i >> 1
    if i & 1:
        # Odd: flip every bit of the halved value to recover the negative
        return half ^ (~0)
    return half
167
168
def read_varint(readfn):
    """
    Reads a variable-length encoded integer, one byte at a time.

    :param readfn: a callable that reads a given number of bytes,
        like file.read(). It is always called with the argument 1.
    :returns: the decoded integer.
    """

    value = 0
    shift = 0
    while True:
        byte = ord(readfn(1))
        # The low seven bits are payload, least-significant group first
        value |= (byte & 0x7F) << shift
        if byte & 0x80 == 0:
            # High bit clear: this was the last byte
            return value
        shift += 7
186
187
188# Fibonacci function
189
# Memo of already computed values, keyed by n
_fib_cache = {}


def fib(n):
    """Returns the nth value of a Fibonacci-style sequence.

    Note the indexing used here: any ``n <= 2`` is returned unchanged, so the
    sequence runs fib(0) == 0, fib(1) == 1, fib(2) == 2, fib(3) == 3,
    fib(4) == 5, fib(5) == 8, ... (each later value is the sum of the two
    before it).

    :param n: an integer index.
    """

    if n <= 2:
        return n
    if n in _fib_cache:
        return _fib_cache[n]

    # Build the value iteratively instead of recursing, so a large n on a
    # cold cache cannot exceed the interpreter's recursion limit
    previous, current = 1, 2
    k = 2
    while k < n:
        previous, current = current, previous + current
        k += 1
        _fib_cache[k] = current
    return current
204
205
206# Float-to-byte encoding/decoding
207
def float_to_byte(value, mantissabits=5, zeroexp=2):
    """Encodes a floating point number in a single byte.

    The 32-bit pattern of ``value`` is truncated to ``mantissabits`` mantissa
    bits and rebased so that the smallest representable exponent maps to 0.
    Negative numbers and zero encode as 0, positive underflow as 1 and
    overflow as 255.

    :param value: the number to encode.
    :param mantissabits: how many mantissa bits to keep.
    :param zeroexp: the exponent offset that maps to the zero byte.
    """

    # Assume int size == float size

    zero_point = (63 - zeroexp) << mantissabits
    raw_bits = unpack("i", pack("f", value))[0]
    scaled = raw_bits >> (24 - mantissabits)

    if scaled >= zero_point + 0x100:
        # Too large for one byte: clamp to the largest code
        code = 255
    elif scaled >= zero_point:
        code = scaled - zero_point
    elif raw_bits <= 0:
        # Negative numbers and 0 map to 0
        code = 0
    else:
        # Positive underflow maps to the smallest non-zero code
        code = 1
    return b(chr(code))
230
231
def byte_to_float(b, mantissabits=5, zeroexp=2):
    """Decodes a floating point number stored in a single byte.

    :param b: the encoded value, either as an int or as a one-character
        byte string.
    :param mantissabits: the number of mantissa bits used when encoding.
    :param zeroexp: the exponent offset used when encoding.
    :returns: the decoded float; the zero byte always decodes to 0.0.
    """
    code = b if type(b) is int else ord(b)
    if code == 0:
        return 0.0

    # Rebuild a 32-bit float bit pattern: shift the stored bits back into
    # place and restore the exponent bias removed by the encoder
    pattern = ((code & 0xff) << (24 - mantissabits)) + ((63 - zeroexp) << 24)
    return unpack("f", pack("i", pattern))[0]
243
244
245# Length-to-byte approximation functions
246
247# Old implementation:
248
249#def length_to_byte(length):
250#    """Returns a logarithmic approximation of the given number, in the range
251#    0-255. The approximation has high precision at the low end (e.g.
252#    1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers
253#    equal to or greater than 108116 all approximate to 255.
254#
255#    This is useful for storing field lengths, where the general case is small
256#    documents and very large documents are more rare.
257#    """
258#
259#    # This encoding formula works up to 108116 -> 255, so if the length is
260#    # equal to or greater than that limit, just return 255.
261#    if length >= 108116:
262#        return 255
263#
#    # The parameters of this formula were chosen heuristically so that low
265#    # numbers would approximate closely, and the byte range 0-255 would cover
266#    # a decent range of document lengths (i.e. 1 to ~100000).
267#    return int(round(log((length / 27.0) + 1, 1.033)))
268#def _byte_to_length(n):
269#    return int(round((pow(1.033, n) - 1) * 27))
270#_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256)))
271#byte_to_length = _b2l_cache.__getitem__
272
273# New implementation
274
275# Instead of computing the actual formula to get the byte for any given length,
276# precompute the length associated with each byte, and use bisect to find the
277# nearest value. This gives quite a large speed-up.
278#
# Note that this does not give all the same answers as the old, "real"
# implementation: bisect_left returns the index of the first table entry that
# is greater than or equal to the length, so a length that falls between two
# table entries always "rounds up" to the larger one, while the old
# implementation would "round up" or "round down" depending on the input.
# Since this is a fairly gross approximation anyway, I don't think it matters
# much.
284
285# Values generated using the formula from the "old" implementation above
# Lookup table of 256 ascending lengths; the index of an entry is the byte
# that stands for that length
_length_byte_cache = array('i', [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17,
    18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47,
    49, 52, 54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97,
    101, 106, 110, 114, 119, 124, 129, 134, 139, 145, 150, 156, 162, 169,
    175, 182, 189, 196, 203, 211, 219, 227, 235, 244, 253, 262, 271, 281,
    291, 302, 313, 324, 336, 348, 360, 373, 386, 399, 414, 428, 443, 459,
    475, 491, 508, 526, 544, 563, 583, 603, 623, 645, 667, 690, 714, 738,
    763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066, 1103, 1140,
    1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641, 1696,
    1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516,
    2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728,
    3852, 3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517,
    5700, 5889, 6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158,
    8428, 8707, 8995, 9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671,
    12057, 12456, 12868, 13294, 13733, 14187, 14656, 15141, 15641, 16159,
    16693, 17244, 17814, 18403, 19011, 19640, 20289, 20959, 21652, 22367,
    23106, 23869, 24658, 25472, 26314, 27183, 28081, 29009, 29967, 30957,
    31979, 33035, 34126, 35254, 36418, 37620, 38863, 40146, 41472, 42841,
    44256, 45717, 47227, 48786, 50397, 52061, 53780, 55556, 57390, 59285,
    61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876, 79414, 82035,
    84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374])


def length_to_byte(length):
    """Returns the byte (0-255) that approximates the given length.

    ``None`` maps to 0 and any length at or beyond the last table entry maps
    to 255. Otherwise the result is the index of the first table entry that is
    greater than or equal to ``length``.
    """
    if length is None:
        return 0
    return 255 if length >= 106374 else bisect_left(_length_byte_cache,
                                                    length)

# Inverse lookup: maps a byte back to the length stored at that table index
byte_to_length = _length_byte_cache.__getitem__
316
317
318# Prefix encoding functions
319
def first_diff(a, b):
    """Returns the position of the first differing character in the strings
    a and b, i.e. the length of their common prefix. For example,
    first_diff('render', 'rending') == 4. This function limits the return
    value to 255 so the difference can be encoded in a single byte.

    If one string is a prefix of the other, the length of the shorter string
    (capped at 255) is returned.
    """

    # Never look past the end of the shorter string or past the 255 cap
    limit = min(len(a), len(b), 255)
    i = 0
    while i < limit and a[i] == b[i]:
        i += 1
    return i
332
333
def prefix_encode(a, b):
    """Compresses string b as an integer (encoded in a byte) representing
    the prefix it shares with a, followed by the suffix encoded as UTF-8.

    :param a: the reference string.
    :param b: the string to compress relative to ``a``.
    :returns: a byte string: one length byte followed by the UTF-8 suffix.
    """
    i = first_diff(a, b)
    # pack("B", i) yields a single byte on both Python 2 and 3, whereas
    # chr(i) is a text string on Python 3 and cannot be joined to the
    # encoded suffix
    return pack("B", i) + b[i:].encode("utf8")
340
341
def prefix_encode_all(ls):
    """Compresses the given list of (unicode) strings by storing each string
    (except the first one) as an integer (encoded in a byte) representing
    the prefix it shares with its predecessor, followed by the suffix encoded
    as UTF-8.

    This is a generator yielding one byte string per input string. The first
    string is encoded relative to the empty string, so its prefix byte is 0.
    """

    last = u('')
    for w in ls:
        i = first_diff(last, w)
        # pack("B", i) yields a single byte on both Python 2 and 3, whereas
        # chr(i) is a text string on Python 3 and cannot be joined to the
        # encoded suffix
        yield pack("B", i) + w[i:].encode("utf8")
        last = w
354
355
def prefix_decode_all(ls):
    """Decompresses a list of strings compressed by prefix_encode().

    This is a generator yielding one decoded (unicode) string per input item.
    Each item's first byte is the number of leading characters to copy from
    the previously decoded string; the rest is the UTF-8 encoded suffix.
    """

    last = u('')
    for w in ls:
        i = w[0]
        if not isinstance(i, int):
            # Python 2 str indexing gives a one-character string; Python 3
            # bytes indexing already gives the integer value
            i = ord(i)
        decoded = last[:i] + w[1:].decode("utf8")
        yield decoded
        last = decoded
366
367
368# Natural key sorting function
369
# Matches either a run of non-digits or a run of digits
_nkre = re.compile(r"\D+|\d+", re.UNICODE)


def _nkconv(i):
    """Converts one run matched by _nkre: anything int() accepts becomes an
    int, everything else is lowercased.
    """
    try:
        number = int(i)
    except ValueError:
        return i.lower()
    else:
        return number


def natural_key(s):
    """Converts string ``s`` into a tuple that will sort "naturally" (i.e.,
    ``name5`` will come before ``name10`` and ``1`` will come before ``A``).
    This function is designed to be used as the ``key`` argument to sorting
    functions.

    :param s: the str/unicode string to convert.
    :rtype: tuple
    """

    # Split the string into alternating digit and non-digit runs, then turn
    # each digit run into an int and each other run into lowercase text.
    runs = _nkre.findall(s)
    return tuple(map(_nkconv, runs))
394
395
396# Mixins and decorators
397
class ClosableMixin(object):
    """Mix-in that turns any class providing a close() method into a context
    manager: the object itself is bound by the ``with`` statement and close()
    is called when the block is left.
    """

    def __enter__(self):
        # The object is its own context value
        return self

    def __exit__(self, *exc_info):
        # Always close. Nothing is returned, so an exception raised inside
        # the ``with`` block is not suppressed.
        self.close()
408
409
def protected(func):
    """Decorator for storage-access methods that (a) refuses to run once the
    object has been closed and (b) runs the method while holding the object's
    lock. The parent object must have 'is_closed' and '_sync_lock'
    attributes.

    :raises Exception: from the wrapped method if ``self.is_closed`` is true
        at call time.
    """

    @wraps(func)
    def protected_wrapper(self, *args, **kwargs):
        # The closed check happens before the lock is taken
        if self.is_closed:
            raise Exception("%r has been closed" % self)
        guard = self._sync_lock
        with guard:
            return func(self, *args, **kwargs)

    return protected_wrapper
424
425
def synchronized(func):
    """Decorator for storage-access methods, which runs the method while
    holding the object's lock. The parent object must have a '_sync_lock'
    attribute usable as a context manager.
    """

    @wraps(func)
    def synchronized_wrapper(self, *args, **kwargs):
        guard = self._sync_lock
        with guard:
            return func(self, *args, **kwargs)

    return synchronized_wrapper
437
438
def unbound_cache(func):
    """Caching decorator with an unbounded cache size.

    Results are remembered per tuple of positional arguments for as long as
    the decorated function exists. Arguments must be hashable; keyword
    arguments are not accepted.
    """

    memo = {}

    @wraps(func)
    def caching_wrapper(*args):
        if args in memo:
            return memo[args]
        value = func(*args)
        memo[args] = value
        return value

    return caching_wrapper
455
456
def lru_cache(maxsize=100):
    """Double-barrel least-recently-used cache decorator. This is a simple
    LRU algorithm that keeps a primary and secondary dict. Keys are checked
    in the primary dict, and then the secondary. Once the primary dict fills
    up, the secondary dict is cleared and the two dicts are swapped.

    This function duplicates (more-or-less) the protocol of the
    ``functools.lru_cache`` decorator in the Python 3.2 standard library.

    Arguments to the cached function must be hashable. Only positional
    arguments are supported.

    View the cache statistics with f.cache_info(), which returns a plain
    tuple of (hits, misses, maxsize, currsize). Clear the cache and
    statistics with f.cache_clear(). Access the underlying function with
    f.__wrapped__.

    :param maxsize: the number of entries the primary dict may hold before
        the two dicts are swapped. Because the secondary dict is kept around
        until the next swap, up to roughly twice this many results can be
        cached at once.
    """

    def decorating_function(user_function):
        # Cache1, Cache2, Pointer, Hits, Misses
        stats = [{}, {}, 0, 0, 0]

        @wraps(user_function)
        def wrapper(*args):
            # The pointer (0/1 or False/True) selects the primary dict; the
            # other one is the secondary
            ptr = stats[2]
            primary = stats[ptr]
            secondary = stats[not ptr]
            key = args

            if key in primary:
                stats[3] += 1  # Hit
                return primary[key]
            elif key in secondary:
                stats[3] += 1  # Hit
                return secondary[key]
            else:
                stats[4] += 1  # Miss
                result = user_function(*args)
                primary[key] = result
                if len(primary) >= maxsize:
                    # The primary is full: it becomes the secondary, and the
                    # emptied old secondary becomes the new primary
                    stats[2] = not ptr
                    secondary.clear()
                return result

        def cache_info():
            """Report cache statistics"""
            return (stats[3], stats[4], maxsize, len(stats[0]) + len(stats[1]))

        def cache_clear():
            """Clear the cache and cache statistics"""
            stats[0].clear()
            stats[1].clear()
            stats[3] = stats[4] = 0

        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        # functools.wraps only sets __wrapped__ from Python 3.2 on; set it
        # explicitly so the documented attribute exists on every version
        wrapper.__wrapped__ = user_function

        return wrapper
    return decorating_function
514
515
def clockface_lru_cache(maxsize=100):
    """Least-recently-used cache decorator.

    This function duplicates (more-or-less) the protocol of the
    ``functools.lru_cache`` decorator in the Python 3.2 standard library, but
    uses the clock face LRU algorithm instead of an ordered dictionary.

    If *maxsize* is set to None (or 0), the LRU features are disabled and the
    cache can grow without bound.

    Arguments to the cached function must be hashable. Only positional
    arguments are supported.

    View the cache statistics with f.cache_info(), which returns a plain
    tuple of (hits, misses, maxsize, currsize). Clear the cache and
    statistics with f.cache_clear(). Access the underlying function with
    f.__wrapped__.
    """

    def decorating_function(user_function):

        stats = [0, 0, 0]  # hits, misses, hand
        data = {}

        if maxsize:
            # The keys at each point on the clock face
            clock_keys = [None] * maxsize
            # The "referenced" bits at each point on the clock face
            clock_refs = array("B", [0]) * maxsize
            lock = Lock()

            @wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    with lock:
                        pos, result = data[key]
                        # The key is in the cache. Set the key's reference bit
                        clock_refs[pos] = 1
                        # Record a cache hit
                        stats[0] += 1
                except KeyError:
                    # Compute the value (outside the lock, so a slow function
                    # does not block other callers)
                    result = user_function(*args)
                    with lock:
                        # Current position of the clock hand
                        hand = stats[2]
                        # Sweep around the clock looking for a position with
                        # the reference bit off. Every set bit passed is
                        # turned off, so the sweep ends after at most one
                        # full revolution plus one step.
                        while True:
                            hand = (hand + 1) % maxsize
                            if clock_refs[hand]:
                                # This position's "referenced" bit is set. Turn
                                # the bit off and move on.
                                clock_refs[hand] = 0
                            else:
                                # Found a position with the "reference" bit
                                # off: evict whatever key lives here and put
                                # the new key in its place
                                current_key = clock_keys[hand]
                                if current_key in data:
                                    del data[current_key]
                                clock_keys[hand] = key
                                clock_refs[hand] = 1
                                break
                        # Put the key and result in the cache
                        data[key] = (hand, result)
                        # Save the new hand position
                        stats[2] = hand
                        # Record a cache miss
                        stats[1] += 1
                return result

            def cache_clear():
                """Clear the cache and cache statistics"""
                # Hold the lock: this resets the same structures that the
                # wrapper updates under it
                with lock:
                    data.clear()
                    stats[0] = stats[1] = stats[2] = 0
                    for pos in range(len(clock_keys)):
                        clock_keys[pos] = None
                        clock_refs[pos] = 0

        else:
            @wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    result = data[key]
                    stats[0] += 1
                except KeyError:
                    result = user_function(*args)
                    data[key] = result
                    stats[1] += 1
                return result

            def cache_clear():
                """Clear the cache and cache statistics"""
                # There is no clock face in unbounded mode, so only the
                # dict and the counters need resetting
                data.clear()
                stats[0] = stats[1] = stats[2] = 0

        def cache_info():
            """Report cache statistics"""
            return (stats[0], stats[1], maxsize, len(data))

        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        # functools.wraps only sets __wrapped__ from Python 3.2 on; set it
        # explicitly so the documented attribute exists on every version
        wrapper.__wrapped__ = user_function
        return wrapper

    return decorating_function
621
622
def find_object(name, blacklist=None, whitelist=None):
    """Imports and returns an object given a fully qualified name.

    >>> find_object("whoosh.analysis.StopFilter")
    <class 'whoosh.analysis.StopFilter'>

    :param name: a dotted name of the form ``package.module.attribute``.
    :param blacklist: optional sequence of prefixes; a name starting with any
        of them is rejected.
    :param whitelist: optional sequence of prefixes; if given (and not
        empty), a name must start with at least one of them.
    :raises TypeError: if the name is rejected by the blacklist or whitelist.
    :raises AssertionError: if the name contains no dot.
    """

    # NOTE: this imports whatever module the name points at, and importing a
    # module executes its code. Callers handling untrusted names should pass
    # a whitelist.
    if blacklist:
        for pre in blacklist:
            if name.startswith(pre):
                raise TypeError("%r: can't instantiate names starting with %r"
                                % (name, pre))
    if whitelist:
        if not any(name.startswith(pre) for pre in whitelist):
            raise TypeError("Can't instantiate %r" % name)

    modname, dot, clsname = name.rpartition(".")
    if not dot:
        # Raised explicitly (rather than with an ``assert`` statement) so the
        # check still runs under ``python -O``; the exception type is kept
        # for callers that already catch AssertionError.
        raise AssertionError("Name %r must be fully qualified" % name)

    # A non-empty fromlist makes __import__ return the named submodule itself
    # rather than the top-level package
    mod = __import__(modname, fromlist=[clsname])
    return getattr(mod, clsname)
653
654
def rcompile(pattern, flags=0, verbose=False):
    """A wrapper for re.compile that checks whether "pattern" is a regex object
    or a string to be compiled, and automatically adds the re.UNICODE flag.

    :param pattern: a pattern string, or an already compiled pattern (anything
        that is not a string is returned unchanged).
    :param flags: extra ``re`` flags to combine with re.UNICODE.
    :param verbose: if true, re.VERBOSE is added to the flags.
    """

    if isinstance(pattern, string_type):
        if verbose:
            flags |= re.VERBOSE
        return re.compile(pattern, re.UNICODE | flags)
    # Not a string, so assume it is already a compiled pattern
    return pattern
666
667
668
669
670
671