PageRenderTime 41ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/whoosh/filedb/fileindex.py

https://bitbucket.org/xkjq/wikidpad_svn
Python | 452 lines | 327 code | 71 blank | 54 comment | 37 complexity | be39e493b9ad7461ccfffd8cef1e9653 MD5 | raw file
Possible License(s): LGPL-2.1
  1. #===============================================================================
  2. # Copyright 2009 Matt Chaput
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #===============================================================================
  16. import cPickle, os, re
  17. from bisect import bisect_right
  18. from time import time
  19. from threading import Lock
  20. from whoosh import __version__
  21. from whoosh.fields import Schema
  22. from whoosh.index import Index
  23. from whoosh.index import EmptyIndexError, IndexVersionError
  24. from whoosh.index import _DEF_INDEX_NAME
  25. from whoosh.store import Storage, LockError
  26. from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
  27. _INDEX_VERSION = -110
  28. # TOC read/write functions
  29. def _toc_filename(indexname, gen):
  30. return "_%s_%s.toc" % (indexname, gen)
  31. def _toc_pattern(indexname):
  32. """Returns a regular expression object that matches TOC filenames.
  33. name is the name of the index.
  34. """
  35. return re.compile("^_%s_([0-9]+).toc$" % indexname)
  36. def _segment_pattern(indexname):
  37. """Returns a regular expression object that matches segment filenames.
  38. name is the name of the index.
  39. """
  40. return re.compile("(_%s_[0-9]+).(%s)" % (indexname,
  41. Segment.EXTENSIONS.values()))
  42. def _latest_generation(storage, indexname):
  43. pattern = _toc_pattern(indexname)
  44. max = -1
  45. for filename in storage:
  46. m = pattern.match(filename)
  47. if m:
  48. num = int(m.group(1))
  49. if num > max: max = num
  50. return max
  51. def _create_index(storage, schema, indexname=_DEF_INDEX_NAME):
  52. # Clear existing files
  53. prefix = "_%s_" % indexname
  54. for filename in storage:
  55. if filename.startswith(prefix):
  56. storage.delete_file(filename)
  57. # Write a TOC file with an empty list of segments
  58. _write_toc(storage, schema, indexname, 0, 0, [])
  59. def _write_toc(storage, schema, indexname, gen, segment_counter, segments):
  60. schema.clean()
  61. # Use a temporary file for atomic write.
  62. tocfilename = _toc_filename(indexname, gen)
  63. tempfilename = '%s.%s' % (tocfilename, time())
  64. stream = storage.create_file(tempfilename)
  65. stream.write_varint(_INT_SIZE)
  66. stream.write_varint(_LONG_SIZE)
  67. stream.write_varint(_FLOAT_SIZE)
  68. stream.write_int(-12345)
  69. stream.write_int(_INDEX_VERSION)
  70. for num in __version__[:3]:
  71. stream.write_varint(num)
  72. stream.write_string(cPickle.dumps(schema, -1))
  73. stream.write_int(gen)
  74. stream.write_int(segment_counter)
  75. stream.write_pickle(segments)
  76. stream.close()
  77. # Rename temporary file to the proper filename
  78. storage.rename_file(tempfilename, tocfilename, safe=True)
  79. class Toc(object):
  80. def __init__(self, **kwargs):
  81. for name, value in kwargs.iteritems():
  82. setattr(self, name, value)
  83. def _read_toc(storage, schema, indexname):
  84. gen = _latest_generation(storage, indexname)
  85. if gen < 0:
  86. raise EmptyIndexError("Index %r does not exist in %r" % (indexname, storage))
  87. # Read the content of this index from the .toc file.
  88. tocfilename = _toc_filename(indexname, gen)
  89. stream = storage.open_file(tocfilename)
  90. def check_size(name, target):
  91. sz = stream.read_varint()
  92. if sz != target:
  93. raise IndexError("Index was created on different architecture:"
  94. " saved %s = %s, this computer = %s" % (name, sz, target))
  95. check_size("int", _INT_SIZE)
  96. check_size("long", _LONG_SIZE)
  97. check_size("float", _FLOAT_SIZE)
  98. if not stream.read_int() == -12345:
  99. raise IndexError("Number misread: byte order problem")
  100. version = stream.read_int()
  101. if version != _INDEX_VERSION:
  102. raise IndexVersionError("Can't read format %s" % version, version)
  103. release = (stream.read_varint(), stream.read_varint(), stream.read_varint())
  104. # If the user supplied a schema object with the constructor, don't load
  105. # the pickled schema from the saved index.
  106. if schema:
  107. stream.skip_string()
  108. else:
  109. schema = cPickle.loads(stream.read_string())
  110. # Generation
  111. index_gen = stream.read_int()
  112. assert gen == index_gen
  113. segment_counter = stream.read_int()
  114. segments = stream.read_pickle()
  115. stream.close()
  116. return Toc(version=version, release=release, schema=schema,
  117. segment_counter=segment_counter, segments=segments,
  118. generation=gen)
  119. def _next_segment_name(self):
  120. #Returns the name of the next segment in sequence.
  121. if self.segment_num_lock is None:
  122. self.segment_num_lock = Lock()
  123. if self.segment_num_lock.acquire():
  124. try:
  125. self.segment_counter += 1
  126. return
  127. finally:
  128. self.segment_num_lock.release()
  129. else:
  130. raise LockError
  131. def _clean_files(storage, indexname, gen, segments):
  132. # Attempts to remove unused index files (called when a new generation
  133. # is created). If existing Index and/or reader objects have the files
  134. # open, they may not be deleted immediately (i.e. on Windows) but will
  135. # probably be deleted eventually by a later call to clean_files.
  136. current_segment_names = set(s.name for s in segments)
  137. tocpattern = _toc_pattern(indexname)
  138. segpattern = _segment_pattern(indexname)
  139. todelete = set()
  140. for filename in storage:
  141. tocm = tocpattern.match(filename)
  142. segm = segpattern.match(filename)
  143. if tocm:
  144. if int(tocm.group(1)) != gen:
  145. todelete.add(filename)
  146. elif segm:
  147. name = segm.group(1)
  148. if name not in current_segment_names:
  149. todelete.add(filename)
  150. for filename in todelete:
  151. try:
  152. storage.delete_file(filename)
  153. except OSError:
  154. # Another process still has this file open
  155. pass
  156. # Index placeholder object
class FileIndex(Index):
    """Index implementation backed by files held in a Storage object.

    Instances cache nothing: every operation re-reads the TOC file from
    storage, so the object always reflects the latest committed generation.
    """

    def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME):
        # :param storage: the Storage object holding this index's files.
        # :param schema: optional Schema overriding the one pickled in the
        #     TOC; None means use the stored schema.
        # :param indexname: name distinguishing this index from any others
        #     kept in the same storage.
        if not isinstance(storage, Storage):
            raise ValueError("%r is not a Storage object" % storage)
        if schema is not None and not isinstance(schema, Schema):
            raise ValueError("%r is not a Schema object" % schema)
        if not isinstance(indexname, (str, unicode)):
            raise ValueError("indexname %r is not a string" % indexname)

        self.storage = storage
        self._schema = schema
        self.indexname = indexname

        # Try reading the TOC to see if it's possible
        # (raises EmptyIndexError if no TOC exists for this name).
        _read_toc(self.storage, self._schema, self.indexname)

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__,
                               self.storage, self.indexname)

    def close(self):
        # Nothing to release here; readers and writers manage their own
        # resources.
        pass

    # add_field
    # remove_field

    def latest_generation(self):
        """Return the highest TOC generation in storage, or -1 if none."""
        return _latest_generation(self.storage, self.indexname)

    # refresh
    # up_to_date

    def last_modified(self):
        """Return the modification time of the current TOC file."""
        gen = self.latest_generation()
        filename = _toc_filename(self.indexname, gen)
        return self.storage.file_modified(filename)

    def is_empty(self):
        """Return True if the index currently contains no segments."""
        info = _read_toc(self.storage, self.schema, self.indexname)
        return len(info.segments) == 0

    def optimize(self):
        """Merge all segments into one by committing a writer with
        optimize=True.
        """
        w = self.writer()
        w.commit(optimize=True)

    # searcher

    def writer(self, **kwargs):
        """Return a SegmentWriter for adding documents to this index."""
        # Imported lazily to avoid a circular import with filewriting.
        from whoosh.filedb.filewriting import SegmentWriter
        return SegmentWriter(self, **kwargs)

    def lock(self, name):
        """Returns a lock object that you can try to call acquire() on to
        lock the index.
        """
        return self.storage.lock(self.indexname + "_" + name)

    def _read_toc(self):
        # Re-read the TOC from storage on every call; nothing is cached.
        return _read_toc(self.storage, self._schema, self.indexname)

    def _segments(self):
        return self._read_toc().segments

    def _current_schema(self):
        return self._read_toc().schema

    @property
    def schema(self):
        # The schema as currently stored in the TOC (or the override
        # supplied to the constructor).
        return self._current_schema()

    def reader(self):
        """Return a reader over the index's current segments: an EmptyReader
        for zero segments, a single SegmentReader for one, or a MultiReader
        otherwise. The read lock is held only while the TOC is read and the
        readers are constructed.
        """
        lock = self.lock("READLOCK")
        lock.acquire(True)
        try:
            info = self._read_toc()
            from whoosh.filedb.filereading import SegmentReader
            if len(info.segments) == 0:
                from whoosh.reading import EmptyReader
                return EmptyReader(info.schema)
            elif len(info.segments) == 1:
                return SegmentReader(self.storage, info.schema,
                                     info.segments[0], info.generation)
            else:
                from whoosh.reading import MultiReader
                # NOTE(review): sub-readers receive generation -2 here while
                # the single-segment case passes info.generation -- confirm
                # that sentinel is intentional.
                readers = [SegmentReader(self.storage, info.schema, segment, -2)
                           for segment in info.segments]
                return MultiReader(readers, info.generation)
        finally:
            lock.release()
  228. class Segment(object):
  229. """Do not instantiate this object directly. It is used by the Index object
  230. to hold information about a segment. A list of objects of this class are
  231. pickled as part of the TOC file.
  232. The TOC file stores a minimal amount of information -- mostly a list of
  233. Segment objects. Segments are the real reverse indexes. Having multiple
  234. segments allows quick incremental indexing: just create a new segment for
  235. the new documents, and have the index overlay the new segment over previous
  236. ones for purposes of reading/search. "Optimizing" the index combines the
  237. contents of existing segments into one (removing any deleted documents
  238. along the way).
  239. """
  240. EXTENSIONS = {"fieldlengths": "fln", "storedfields": "sto",
  241. "termsindex": "trm", "termposts": "pst",
  242. "vectorindex": "vec", "vectorposts": "vps"}
  243. def __init__(self, name, doccount, fieldlength_totals, fieldlength_maxes,
  244. deleted=None):
  245. """
  246. :param name: The name of the segment (the Index object computes this
  247. from its name and the generation).
  248. :param doccount: The maximum document number in the segment.
  249. :param term_count: Total count of all terms in all documents.
  250. :param fieldlength_totals: A dictionary mapping field numbers to the
  251. total number of terms in that field across all documents in the
  252. segment.
  253. :param deleted: A set of deleted document numbers, or None if no
  254. deleted documents exist in this segment.
  255. """
  256. assert isinstance(name, basestring)
  257. assert isinstance(doccount, (int, long))
  258. assert fieldlength_totals is None or isinstance(fieldlength_totals, dict), "fl_totals=%r" % fieldlength_totals
  259. assert fieldlength_maxes is None or isinstance(fieldlength_maxes, dict), "fl_maxes=%r" % fieldlength_maxes
  260. self.name = name
  261. self.doccount = doccount
  262. self.fieldlength_totals = fieldlength_totals
  263. self.fieldlength_maxes = fieldlength_maxes
  264. self.deleted = deleted
  265. self._filenames = set()
  266. for attr, ext in self.EXTENSIONS.iteritems():
  267. fname = "%s.%s" % (self.name, ext)
  268. setattr(self, attr + "_filename", fname)
  269. self._filenames.add(fname)
  270. def __repr__(self):
  271. return "%s(%r)" % (self.__class__.__name__, self.name)
  272. def copy(self):
  273. if self.deleted:
  274. deleted = set(self.deleted)
  275. else:
  276. deleted = None
  277. return Segment(self.name, self.doccount, self.fieldlength_totals,
  278. self.fieldlength_maxes, deleted)
  279. def filenames(self):
  280. return self._filenames
  281. def doc_count_all(self):
  282. """
  283. :returns: the total number of documents, DELETED OR UNDELETED, in this
  284. segment.
  285. """
  286. return self.doccount
  287. def doc_count(self):
  288. """
  289. :returns: the number of (undeleted) documents in this segment.
  290. """
  291. return self.doccount - self.deleted_count()
  292. def has_deletions(self):
  293. """
  294. :returns: True if any documents in this segment are deleted.
  295. """
  296. return self.deleted_count() > 0
  297. def deleted_count(self):
  298. """
  299. :returns: the total number of deleted documents in this segment.
  300. """
  301. if self.deleted is None: return 0
  302. return len(self.deleted)
  303. def field_length(self, fieldname, default=0):
  304. """Returns the total number of terms in the given field across all
  305. documents in this segment.
  306. """
  307. return self.fieldlength_totals.get(fieldname, default)
  308. def max_field_length(self, fieldname, default=0):
  309. """Returns the maximum length of the given field in any of the
  310. documents in the segment.
  311. """
  312. return self.fieldlength_maxes.get(fieldname, default)
  313. def delete_document(self, docnum, delete=True):
  314. """Deletes the given document number. The document is not actually
  315. removed from the index until it is optimized.
  316. :param docnum: The document number to delete.
  317. :param delete: If False, this undeletes a deleted document.
  318. """
  319. if delete:
  320. if self.deleted is None:
  321. self.deleted = set()
  322. elif docnum in self.deleted:
  323. raise KeyError("Document %s in segment %r is already deleted"
  324. % (docnum, self.name))
  325. self.deleted.add(docnum)
  326. else:
  327. if self.deleted is None or docnum not in self.deleted:
  328. raise KeyError("Document %s is not deleted" % docnum)
  329. self.deleted.clear(docnum)
  330. def is_deleted(self, docnum):
  331. """:returns: True if the given document number is deleted."""
  332. if self.deleted is None: return False
  333. return docnum in self.deleted