PageRenderTime 38ms CodeModel.GetById 20ms app.highlight 15ms RepoModel.GetById 0ms app.codeStats 0ms

/bangkokhotel/lib/python2.5/site-packages/whoosh/support/externalsort.py

https://bitbucket.org/luisrodriguez/bangkokhotel
Python | 242 lines | 171 code | 14 blank | 57 comment | 10 complexity | 06bf5d9b10512f2318011f14025ad06e MD5 | raw file
  1# Copyright 2011 Matt Chaput. All rights reserved.
  2#
  3# Redistribution and use in source and binary forms, with or without
  4# modification, are permitted provided that the following conditions are met:
  5#
  6#    1. Redistributions of source code must retain the above copyright notice,
  7#       this list of conditions and the following disclaimer.
  8#
  9#    2. Redistributions in binary form must reproduce the above copyright
 10#       notice, this list of conditions and the following disclaimer in the
 11#       documentation and/or other materials provided with the distribution.
 12#
 13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
 14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 23#
 24# The views and conclusions contained in the software and documentation are
 25# those of the authors and should not be interpreted as representing official
 26# policies, either expressed or implied, of Matt Chaput.
 27
 28"""
 29This module implements a general external merge sort for Python objects.
 30"""
 31
 32from __future__ import with_statement
 33
 34import os, tempfile
 35from heapq import heapify, heappop, heapreplace
 36
 37from whoosh.compat import xrange, dump, load
 38
 39
 40## Python 3.2 had a bug that made marshal.load unusable
 41#if (hasattr(platform, "python_implementation")
 42#    and platform.python_implementation() == "CPython"
 43#    and platform.python_version() == "3.2.0"):
 44#    # Use pickle instead of marshal on Python 3.2
 45#    from whoosh.compat import dump as dump_pickle
 46#    from whoosh.compat import load
 47#
 48#    def dump(obj, f):
 49#        dump_pickle(obj, f, -1)
 50#else:
 51#    from marshal import dump, load
 52
 53
 54try:
 55    from heapq import merge
 56
 57    def imerge(iterables):
 58        return merge(*iterables)
 59except ImportError:
 60    def imerge(iterables):
 61        _hpop, _hreplace, _Stop = (heappop, heapreplace, StopIteration)
 62        h = []
 63        h_append = h.append
 64        for itnum, it in enumerate(map(iter, iterables)):
 65            try:
 66                nx = it.next
 67                h_append([nx(), itnum, nx])
 68            except _Stop:
 69                pass
 70        heapify(h)
 71
 72        while 1:
 73            try:
 74                while 1:
 75                    v, itnum, nx = s = h[0]
 76                    yield v
 77                    s[0] = nx()
 78                    _hreplace(h, s)
 79            except _Stop:
 80                _hpop(h)
 81            except IndexError:
 82                return
 83
 84
 85class SortingPool(object):
 86    """This object implements a general K-way external merge sort for Python
 87    objects.
 88    
 89    >>> pool = MergePool()
 90    >>> # Add an unlimited number of items in any order
 91    >>> for item in my_items:
 92    ...     pool.add(item)
 93    ...
 94    >>> # Get the items back in sorted order
 95    >>> for item in pool.items():
 96    ...     print(item)
 97    
 98    This class uses the `marshal` module to write the items to temporary files,
 99    so you can only sort marshal-able types (generally: numbers, strings,
100    tuples, lists, and dicts).
101    """
102
103    filenamechars = "abcdefghijklmnopqrstuvwxyz_1234567890"
104
105    def __init__(self, maxsize=1000000, tempdir=None, prefix="",
106                 suffix=".run"):
107        """
108        :param maxsize: the maximum number of items to keep in memory at once.
109        :param tempdir: the path of a directory to use for temporary file
110            storage. The default is to use the system's temp directory.
111        :param prefix: a prefix to add to temporary filenames.
112        :param suffix: a suffix to add to temporary filenames.
113        """
114
115        self.tempdir = tempdir
116        if maxsize < 1:
117            raise ValueError("maxsize=%s must be >= 1" % maxsize)
118        self.maxsize = maxsize
119        self.prefix = prefix
120        self.suffix = suffix
121        # Current run queue
122        self.current = []
123        # List of run filenames
124        self.runs = []
125
126    def _new_run(self):
127        fd, path = tempfile.mkstemp(prefix=self.prefix, suffix=self.suffix,
128                                    dir=self.tempdir)
129        f = os.fdopen(fd, "wb")
130        return path, f
131
132    @staticmethod
133    def _read_run(path):
134        import os.path
135        f = open(path, "rb")
136        try:
137            while True:
138                yield load(f)
139        except EOFError:
140            return
141        finally:
142            f.close()
143            os.remove(path)
144
145    @classmethod
146    def _merge_runs(cls, paths):
147        iters = [cls._read_run(path) for path in paths]
148        for item in imerge(iters):
149            yield item
150
151    def add(self, item):
152        """Adds `item` to the pool to be sorted.
153        """
154
155        if len(self.current) >= self.maxsize:
156            self.save()
157        self.current.append(item)
158
159    def _write_run(self, f, items):
160        for item in items:
161            dump(item, f, -1)
162        f.close()
163
164    def _add_run(self, filename):
165        self.runs.append(filename)
166
167    def save(self):
168        current = self.current
169        if current:
170            current.sort()
171            path, f = self._new_run()
172            self._write_run(f, current)
173            self._add_run(path)
174            self.current = []
175
176    def cleanup(self):
177        for path in self.runs:
178            try:
179                os.remove(path)
180            except OSError:
181                pass
182
183    def reduce_to(self, target, k):
184        # Reduce the number of runs to "target" by merging "k" runs at a time
185
186        if k < 2:
187            raise ValueError("k=%s must be > 2" % k)
188        if target < 1:
189            raise ValueError("target=%s must be >= 1" % target)
190        runs = self.runs
191        while len(runs) > target:
192            newpath, f = self._new_run()
193            # Take k runs off the end of the run list
194            tomerge = []
195            while runs and len(tomerge) < k:
196                tomerge.append(runs.pop())
197            # Merge them into a new run and add it at the start of the list
198            self._write_run(f, self._merge_runs(tomerge))
199            runs.insert(0, newpath)
200
201    def items(self, maxfiles=128):
202        """Returns a sorted list or iterator of the items in the pool.
203        
204        :param maxfiles: maximum number of files to open at once.
205        """
206
207        if maxfiles < 2:
208            raise ValueError("maxfiles=%s must be >= 2" % maxfiles)
209
210        if not self.runs:
211            # We never wrote a run to disk, so just sort the queue in memory
212            # and return that
213            return sorted(self.current)
214        # Write a new run with the leftover items in the queue
215        self.save()
216
217        # If we have more runs than allowed open files, merge some of the runs
218        if maxfiles < len(self.runs):
219            self.reduce_to(maxfiles, maxfiles)
220
221        # Take all the runs off the run list and merge them
222        runs = self.runs
223        self.runs = []  # Minor detail, makes this object reusable
224        return self._merge_runs(runs)
225
226
def sort(items, maxsize=100000, tempdir=None, maxfiles=128):
    """Sorts the given items using an external merge sort.

    Every item is fed into a fresh ``SortingPool`` and the result of the
    pool's ``items()`` method is returned.

    :param items: an iterable of the objects to sort.
    :param maxsize: the maximum number of items to keep in memory at once.
    :param tempdir: the path of a directory to use for temporary file
        storage. The default is to use the system's temp directory.
    :param maxfiles: maximum number of files to open at once.
    """

    pool = SortingPool(maxsize=maxsize, tempdir=tempdir)
    add_to_pool = pool.add
    for entry in items:
        add_to_pool(entry)
    sorted_items = pool.items(maxfiles=maxfiles)
    return sorted_items
240
241
242