/bangkokhotel/lib/python2.5/site-packages/whoosh/support/externalsort.py
Python | 242 lines | 171 code | 14 blank | 57 comment | 10 complexity | 06bf5d9b10512f2318011f14025ad06e MD5 | raw file
1# Copyright 2011 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6# 1. Redistributions of source code must retain the above copyright notice,
7# this list of conditions and the following disclaimer.
8#
9# 2. Redistributions in binary form must reproduce the above copyright
10# notice, this list of conditions and the following disclaimer in the
11# documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28"""
29This module implements a general external merge sort for Python objects.
30"""
31
32from __future__ import with_statement
33
34import os, tempfile
35from heapq import heapify, heappop, heapreplace
36
37from whoosh.compat import xrange, dump, load
38
39
40## Python 3.2 had a bug that made marshal.load unusable
41#if (hasattr(platform, "python_implementation")
42# and platform.python_implementation() == "CPython"
43# and platform.python_version() == "3.2.0"):
44# # Use pickle instead of marshal on Python 3.2
45# from whoosh.compat import dump as dump_pickle
46# from whoosh.compat import load
47#
48# def dump(obj, f):
49# dump_pickle(obj, f, -1)
50#else:
51# from marshal import dump, load
52
53
try:
    from heapq import merge

    def imerge(iterables):
        """Merge several already-sorted iterables into one sorted iterator.

        Thin wrapper around ``heapq.merge`` that accepts the inputs as a
        single sequence instead of separate positional arguments.

        :param iterables: a sequence of iterables, each already in sorted
            order.
        """

        return merge(*iterables)
except ImportError:
    # Fallback used only when this interpreter's heapq has no ``merge``.
    def imerge(iterables):
        """Merge several already-sorted iterables into one sorted iterator.

        Pure-Python heap merge used when ``heapq.merge`` cannot be imported.

        :param iterables: a sequence of iterables, each already in sorted
            order.
        """

        _hpop, _hreplace, _Stop = (heappop, heapreplace, StopIteration)
        # Each heap entry is a mutable list [current value, iterator index,
        # bound ``next`` method]. The iterator index breaks ties between
        # equal values, so the bound methods are never compared.
        h = []
        h_append = h.append
        for itnum, it in enumerate(map(iter, iterables)):
            try:
                nx = it.next
                h_append([nx(), itnum, nx])
            except _Stop:
                # Empty input iterable: nothing to contribute
                pass
        heapify(h)

        while 1:
            try:
                while 1:
                    # Peek at the smallest entry, emit it, then refill the
                    # same entry from its source iterator
                    v, itnum, nx = s = h[0]
                    yield v
                    s[0] = nx()
                    _hreplace(h, s)
            except _Stop:
                # The source of the smallest entry is exhausted: drop it
                _hpop(h)
            except IndexError:
                # h[0] failed because the heap is empty: all inputs are done
                return
83
84
class SortingPool(object):
    """This object implements a general K-way external merge sort for Python
    objects.

    Items are buffered in memory. Whenever the buffer already holds
    ``maxsize`` items, it is sorted and written out to a temporary "run" file
    before the next item is accepted. Reading the items back merges the
    sorted runs.

    >>> pool = SortingPool()
    >>> # Add an unlimited number of items in any order
    >>> for item in my_items:
    ...     pool.add(item)
    ...
    >>> # Get the items back in sorted order
    >>> for item in pool.items():
    ...     print(item)

    Items are written to the temporary files with the ``dump``/``load``
    functions this module imports from ``whoosh.compat``, so you can only
    sort types that serializer supports.
    NOTE(review): earlier documentation named the `marshal` module here; the
    ``dump(item, f, -1)`` call looks like the pickle signature -- confirm
    against ``whoosh.compat``.
    """

    # Not referenced by any method of this class; kept for compatibility
    filenamechars = "abcdefghijklmnopqrstuvwxyz_1234567890"

    def __init__(self, maxsize=1000000, tempdir=None, prefix="",
                 suffix=".run"):
        """
        :param maxsize: the maximum number of items to keep in memory at once.
        :param tempdir: the path of a directory to use for temporary file
            storage. The default is to use the system's temp directory.
        :param prefix: a prefix to add to temporary filenames.
        :param suffix: a suffix to add to temporary filenames.
        :raises ValueError: if ``maxsize`` is less than 1.
        """

        self.tempdir = tempdir
        if maxsize < 1:
            raise ValueError("maxsize=%s must be >= 1" % maxsize)
        self.maxsize = maxsize
        self.prefix = prefix
        self.suffix = suffix
        # Current run queue
        self.current = []
        # List of run filenames
        self.runs = []

    def _new_run(self):
        """Creates a new temporary run file and returns ``(path, file)``,
        where ``file`` is open for binary writing.
        """

        fd, path = tempfile.mkstemp(prefix=self.prefix, suffix=self.suffix,
                                    dir=self.tempdir)
        f = os.fdopen(fd, "wb")
        return path, f

    @staticmethod
    def _read_run(path):
        """Yields the items stored in the run file at ``path``. When the
        generator finishes or is closed, the file is closed and deleted.
        """

        # This binds ``os`` as a local name of the generator. Presumably that
        # keeps the ``finally`` clause working even if the generator is
        # finalized while module globals are being torn down -- TODO confirm.
        import os.path
        f = open(path, "rb")
        try:
            while True:
                yield load(f)
        except EOFError:
            # Reading past the last item signals the end of the run
            return
        finally:
            f.close()
            os.remove(path)

    @classmethod
    def _merge_runs(cls, paths):
        """Yields the items of all the run files in ``paths`` in sorted
        order. Each run file is deleted once it has been read completely.
        """

        iters = [cls._read_run(path) for path in paths]
        for item in imerge(iters):
            yield item

    def add(self, item):
        """Adds `item` to the pool to be sorted.
        """

        # Flush the in-memory queue to disk before it grows past maxsize
        if len(self.current) >= self.maxsize:
            self.save()
        self.current.append(item)

    def _write_run(self, f, items):
        """Serializes each of ``items`` to the open file ``f`` and closes it.
        The file is closed even if serializing an item fails.
        """

        try:
            for item in items:
                dump(item, f, -1)
        finally:
            f.close()

    def _add_run(self, filename):
        """Records ``filename`` as a finished run."""

        self.runs.append(filename)

    def save(self):
        """Sorts the in-memory queue, writes it to a new run file, and
        empties the queue. Does nothing if the queue is empty.
        """

        current = self.current
        if current:
            current.sort()
            path, f = self._new_run()
            self._write_run(f, current)
            self._add_run(path)
            self.current = []

    def cleanup(self):
        """Deletes any run files still on disk and forgets them. Files that
        cannot be removed are skipped.
        """

        for path in self.runs:
            try:
                os.remove(path)
            except OSError:
                # Best effort: the file may already be gone
                pass
        # The files no longer exist, so don't leave stale paths behind for a
        # later items() call to trip over
        self.runs = []

    def reduce_to(self, target, k):
        """Reduces the number of runs to at most ``target`` by merging up to
        ``k`` runs at a time.

        :param target: the maximum number of runs to leave on disk.
        :param k: the maximum number of runs to merge in one pass.
        :raises ValueError: if ``k`` is less than 2 or ``target`` is less
            than 1.
        """

        if k < 2:
            raise ValueError("k=%s must be >= 2" % k)
        if target < 1:
            raise ValueError("target=%s must be >= 1" % target)
        runs = self.runs
        while len(runs) > target:
            newpath, f = self._new_run()
            # Take k runs off the end of the run list
            tomerge = []
            while runs and len(tomerge) < k:
                tomerge.append(runs.pop())
            # Merge them into a new run and add it at the start of the list
            self._write_run(f, self._merge_runs(tomerge))
            runs.insert(0, newpath)

    def items(self, maxfiles=128):
        """Returns a sorted list or iterator of the items in the pool.

        A list is returned when no run was ever written to disk; otherwise
        the result is an iterator that merges the run files.

        :param maxfiles: maximum number of files to open at once.
        :raises ValueError: if ``maxfiles`` is less than 2.
        """

        if maxfiles < 2:
            raise ValueError("maxfiles=%s must be >= 2" % maxfiles)

        if not self.runs:
            # We never wrote a run to disk, so just sort the queue in memory
            # and return that
            return sorted(self.current)
        # Write a new run with the leftover items in the queue
        self.save()

        # If we have more runs than allowed open files, merge some of the runs
        if maxfiles < len(self.runs):
            self.reduce_to(maxfiles, maxfiles)

        # Take all the runs off the run list and merge them
        runs = self.runs
        self.runs = []  # Minor detail, makes this object reusable
        return self._merge_runs(runs)
225
226
def sort(items, maxsize=100000, tempdir=None, maxfiles=128):
    """Sorts the given items using an external merge sort.

    Every item is added to a new :class:`SortingPool`, and the pool's
    ``items()`` result is returned. That result is a sorted list if
    everything fit in memory, otherwise an iterator over the merged runs.

    :param items: an iterable of the items to sort.
    :param tempdir: the path of a directory to use for temporary file
        storage. The default is to use the system's temp directory.
    :param maxsize: the maximum number of items to keep in memory at once.
    :param maxfiles: maximum number of files to open at once.
    """

    p = SortingPool(maxsize=maxsize, tempdir=tempdir)
    for item in items:
        p.add(item)
    return p.items(maxfiles=maxfiles)
240
241
242