PageRenderTime 51ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/src/whoosh/index.py

https://bitbucket.org/rayleyva/whoosh
Python | 737 lines | 681 code | 13 blank | 43 comment | 3 complexity | bc2ad6ff469e3fdce8121bec25b87e78 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """Contains the main functions/classes for creating, maintaining, and using
  28. an index.
  29. """
  30. from __future__ import division
  31. import os.path, re, sys
  32. from time import time, sleep
  33. from whoosh import __version__
  34. from whoosh.compat import xrange
  35. from whoosh.filedb.structfile import ChecksumFile
  36. from whoosh.legacy import toc_loaders
  37. from whoosh.compat import dumps, loads, string_type
  38. from whoosh.fields import ensure_schema
  39. from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
  40. _DEF_INDEX_NAME = "MAIN"
  41. _CURRENT_TOC_VERSION = -111
  42. # Exceptions
  43. class LockError(Exception):
  44. pass
  45. class IndexError(Exception):
  46. """Generic index error."""
  47. class IndexVersionError(IndexError):
  48. """Raised when you try to open an index using a format that the current
  49. version of Whoosh cannot read. That is, when the index you're trying to
  50. open is either not backward or forward compatible with this version of
  51. Whoosh.
  52. """
  53. def __init__(self, msg, version, release=None):
  54. Exception.__init__(self, msg)
  55. self.version = version
  56. self.release = release
  57. class OutOfDateError(IndexError):
  58. """Raised when you try to commit changes to an index which is not the
  59. latest generation.
  60. """
  61. class EmptyIndexError(IndexError):
  62. """Raised when you try to work with an index that has no indexed terms.
  63. """
  64. # Convenience functions
  65. def create_in(dirname, schema, indexname=None):
  66. """Convenience function to create an index in a directory. Takes care of
  67. creating a FileStorage object for you.
  68. :param dirname: the path string of the directory in which to create the
  69. index.
  70. :param schema: a :class:`whoosh.fields.Schema` object describing the
  71. index's fields.
  72. :param indexname: the name of the index to create; you only need to specify
  73. this if you are creating multiple indexes within the same storage
  74. object.
  75. :returns: :class:`Index`
  76. """
  77. from whoosh.filedb.filestore import FileStorage
  78. if not indexname:
  79. indexname = _DEF_INDEX_NAME
  80. storage = FileStorage(dirname)
  81. return FileIndex.create(storage, schema, indexname)
  82. def open_dir(dirname, indexname=None, readonly=False, schema=None):
  83. """Convenience function for opening an index in a directory. Takes care of
  84. creating a FileStorage object for you. dirname is the filename of the
  85. directory in containing the index. indexname is the name of the index to
  86. create; you only need to specify this if you have multiple indexes within
  87. the same storage object.
  88. :param dirname: the path string of the directory in which to create the
  89. index.
  90. :param indexname: the name of the index to create; you only need to specify
  91. this if you have multiple indexes within the same storage object.
  92. """
  93. from whoosh.filedb.filestore import FileStorage
  94. if indexname is None:
  95. indexname = _DEF_INDEX_NAME
  96. storage = FileStorage(dirname, readonly=readonly)
  97. return FileIndex(storage, schema=schema, indexname=indexname)
  98. def exists_in(dirname, indexname=None):
  99. """Returns True if dirname contains a Whoosh index.
  100. :param dirname: the file path of a directory.
  101. :param indexname: the name of the index. If None, the default index name is
  102. used.
  103. """
  104. if os.path.exists(dirname):
  105. try:
  106. ix = open_dir(dirname, indexname=indexname)
  107. return ix.latest_generation() > -1
  108. except EmptyIndexError:
  109. pass
  110. return False
  111. def exists(storage, indexname=None):
  112. """Deprecated; use ``storage.index_exists()``.
  113. :param storage: a store.Storage object.
  114. :param indexname: the name of the index. If None, the default index name is
  115. used.
  116. """
  117. return storage.index_exists(indexname)
  118. def version_in(dirname, indexname=None):
  119. """Returns a tuple of (release_version, format_version), where
  120. release_version is the release version number of the Whoosh code that
  121. created the index -- e.g. (0, 1, 24) -- and format_version is the version
  122. number of the on-disk format used for the index -- e.g. -102.
  123. You should avoid attaching significance to the second number (the index
  124. version). This is simply a version number for the TOC file and probably
  125. should not have been exposed in a public interface. The best way to check
  126. if the current version of Whoosh can open an index is to actually try to
  127. open it and see if it raises a ``whoosh.index.IndexVersionError`` exception.
  128. Note that the release and format version are available as attributes on the
  129. Index object in Index.release and Index.version.
  130. :param dirname: the file path of a directory containing an index.
  131. :param indexname: the name of the index. If None, the default index name is
  132. used.
  133. :returns: ((major_ver, minor_ver, build_ver), format_ver)
  134. """
  135. from whoosh.filedb.filestore import FileStorage
  136. storage = FileStorage(dirname)
  137. return version(storage, indexname=indexname)
  138. def version(storage, indexname=None):
  139. """Returns a tuple of (release_version, format_version), where
  140. release_version is the release version number of the Whoosh code that
  141. created the index -- e.g. (0, 1, 24) -- and format_version is the version
  142. number of the on-disk format used for the index -- e.g. -102.
  143. You should avoid attaching significance to the second number (the index
  144. version). This is simply a version number for the TOC file and probably
  145. should not have been exposed in a public interface. The best way to check
  146. if the current version of Whoosh can open an index is to actually try to
  147. open it and see if it raises a ``whoosh.index.IndexVersionError`` exception.
  148. Note that the release and format version are available as attributes on the
  149. Index object in Index.release and Index.version.
  150. :param storage: a store.Storage object.
  151. :param indexname: the name of the index. If None, the default index name is
  152. used.
  153. :returns: ((major_ver, minor_ver, build_ver), format_ver)
  154. """
  155. try:
  156. if indexname is None:
  157. indexname = _DEF_INDEX_NAME
  158. ix = storage.open_index(indexname)
  159. return (ix.release, ix.version)
  160. except IndexVersionError:
  161. e = sys.exc_info()[1]
  162. return (None, e.version)
  163. # Index base class
  164. class Index(object):
  165. """Represents an indexed collection of documents.
  166. """
  167. def close(self):
  168. """Closes any open resources held by the Index object itself. This may
  169. not close all resources being used everywhere, for example by a
  170. Searcher object.
  171. """
  172. pass
  173. def add_field(self, fieldname, fieldspec):
  174. """Adds a field to the index's schema.
  175. :param fieldname: the name of the field to add.
  176. :param fieldspec: an instantiated :class:`whoosh.fields.FieldType`
  177. object.
  178. """
  179. w = self.writer()
  180. w.add_field(fieldname, fieldspec)
  181. w.commit()
  182. def remove_field(self, fieldname):
  183. """Removes the named field from the index's schema. Depending on the
  184. backend implementation, this may or may not actually remove existing
  185. data for the field from the index. Optimizing the index should always
  186. clear out existing data for a removed field.
  187. """
  188. w = self.writer()
  189. w.remove_field(fieldname)
  190. w.commit()
  191. def latest_generation(self):
  192. """Returns the generation number of the latest generation of this
  193. index, or -1 if the backend doesn't support versioning.
  194. """
  195. return -1
  196. def refresh(self):
  197. """Returns a new Index object representing the latest generation
  198. of this index (if this object is the latest generation, or the backend
  199. doesn't support versioning, returns self).
  200. :returns: :class:`Index`
  201. """
  202. return self
  203. def up_to_date(self):
  204. """Returns True if this object represents the latest generation of
  205. this index. Returns False if this object is not the latest generation
  206. (that is, someone else has updated the index since you opened this
  207. object).
  208. """
  209. return True
  210. def last_modified(self):
  211. """Returns the last modified time of the index, or -1 if the backend
  212. doesn't support last-modified times.
  213. """
  214. return -1
  215. def is_empty(self):
  216. """Returns True if this index is empty (that is, it has never had any
  217. documents successfully written to it.
  218. """
  219. raise NotImplementedError
  220. def optimize(self):
  221. """Optimizes this index, if necessary.
  222. """
  223. pass
  224. def doc_count_all(self):
  225. """Returns the total number of documents, DELETED OR UNDELETED,
  226. in this index.
  227. """
  228. r = self.reader()
  229. try:
  230. return r.doc_count_all()
  231. finally:
  232. r.close()
  233. def doc_count(self):
  234. """Returns the total number of UNDELETED documents in this index.
  235. """
  236. r = self.reader()
  237. try:
  238. return r.doc_count()
  239. finally:
  240. r.close()
  241. def searcher(self, **kwargs):
  242. """Returns a Searcher object for this index. Keyword arguments are
  243. passed to the Searcher object's constructor.
  244. :rtype: :class:`whoosh.searching.Searcher`
  245. """
  246. from whoosh.searching import Searcher
  247. return Searcher(self.reader(), fromindex=self, **kwargs)
  248. def field_length(self, fieldname):
  249. """Returns the total length of the field across all documents.
  250. """
  251. r = self.reader()
  252. try:
  253. return r.field_length(fieldname)
  254. finally:
  255. r.close()
  256. def max_field_length(self, fieldname):
  257. """Returns the maximum length of the field across all documents.
  258. """
  259. r = self.reader()
  260. try:
  261. return r.max_field_length(fieldname)
  262. finally:
  263. r.close()
  264. def reader(self, reuse=None):
  265. """Returns an IndexReader object for this index.
  266. :param reuse: an existing reader. Some implementations may recycle
  267. resources from this existing reader to create the new reader. Note
  268. that any resources in the "recycled" reader that are not used by
  269. the new reader will be CLOSED, so you CANNOT use it afterward.
  270. :rtype: :class:`whoosh.reading.IndexReader`
  271. """
  272. raise NotImplementedError
  273. def writer(self, **kwargs):
  274. """Returns an IndexWriter object for this index.
  275. :rtype: :class:`whoosh.writing.IndexWriter`
  276. """
  277. raise NotImplementedError
  278. def delete_by_term(self, fieldname, text, searcher=None):
  279. w = self.writer()
  280. w.delete_by_term(fieldname, text, searcher=searcher)
  281. w.commit()
  282. def delete_by_query(self, q, searcher=None):
  283. w = self.writer()
  284. w.delete_by_query(q, searcher=searcher)
  285. w.commit()
  286. # Codec-based index implementation
  287. def clean_files(storage, indexname, gen, segments):
  288. # Attempts to remove unused index files (called when a new generation
  289. # is created). If existing Index and/or reader objects have the files
  290. # open, they may not be deleted immediately (i.e. on Windows) but will
  291. # probably be deleted eventually by a later call to clean_files.
  292. current_segment_names = set(s.segment_id() for s in segments)
  293. tocpattern = TOC._pattern(indexname)
  294. segpattern = TOC._segment_pattern(indexname)
  295. todelete = set()
  296. for filename in storage:
  297. if filename.startswith("."):
  298. continue
  299. tocm = tocpattern.match(filename)
  300. segm = segpattern.match(filename)
  301. if tocm:
  302. if int(tocm.group(1)) != gen:
  303. todelete.add(filename)
  304. elif segm:
  305. name = segm.group(1)
  306. if name not in current_segment_names:
  307. todelete.add(filename)
  308. for filename in todelete:
  309. try:
  310. storage.delete_file(filename)
  311. except OSError:
  312. # Another process still has this file open, I guess
  313. pass
  314. class FileIndex(Index):
  315. def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME):
  316. from whoosh.filedb.filestore import Storage
  317. if not isinstance(storage, Storage):
  318. raise ValueError("%r is not a Storage object" % storage)
  319. if not isinstance(indexname, string_type):
  320. raise ValueError("indexname %r is not a string" % indexname)
  321. if schema:
  322. schema = ensure_schema(schema)
  323. self.storage = storage
  324. self._schema = schema
  325. self.indexname = indexname
  326. # Try reading the TOC to see if it's possible
  327. TOC.read(self.storage, self.indexname, schema=self._schema)
  328. @classmethod
  329. def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
  330. TOC.create(storage, schema, indexname)
  331. return cls(storage, schema, indexname)
  332. def __repr__(self):
  333. return "%s(%r, %r)" % (self.__class__.__name__,
  334. self.storage, self.indexname)
  335. def close(self):
  336. pass
  337. # add_field
  338. # remove_field
  339. def latest_generation(self):
  340. return TOC._latest_generation(self.storage, self.indexname)
  341. # refresh
  342. # up_to_date
  343. def last_modified(self):
  344. gen = self.latest_generation()
  345. filename = TOC._filename(self.indexname, gen)
  346. return self.storage.file_modified(filename)
  347. def is_empty(self):
  348. return len(self._read_toc().segments) == 0
  349. def optimize(self, **kwargs):
  350. w = self.writer(**kwargs)
  351. w.commit(optimize=True)
  352. # searcher
  353. def writer(self, procs=1, **kwargs):
  354. if procs > 1:
  355. from whoosh.multiproc import MpWriter
  356. return MpWriter(self, procs=procs, **kwargs)
  357. else:
  358. from whoosh.writing import SegmentWriter
  359. return SegmentWriter(self, **kwargs)
  360. def lock(self, name):
  361. """Returns a lock object that you can try to call acquire() on to
  362. lock the index.
  363. """
  364. return self.storage.lock(self.indexname + "_" + name)
  365. def _read_toc(self):
  366. return TOC.read(self.storage, self.indexname, schema=self._schema)
  367. def _segments(self):
  368. return self._read_toc().segments
  369. def _current_schema(self):
  370. return self._read_toc().schema
  371. @property
  372. def schema(self):
  373. return self._current_schema()
  374. @property
  375. def release(self):
  376. return self._read_toc().release
  377. @property
  378. def version(self):
  379. return self._read_toc().version
  380. @classmethod
  381. def _reader(cls, storage, schema, segments, generation, reuse=None):
  382. # Returns a reader for the given segments, possibly reusing already
  383. # opened readers
  384. from whoosh.reading import SegmentReader, MultiReader, EmptyReader
  385. reusable = {}
  386. try:
  387. if len(segments) == 0:
  388. # This index has no segments! Return an EmptyReader object,
  389. # which simply returns empty or zero to every method
  390. return EmptyReader(schema)
  391. if reuse:
  392. # Put all atomic readers in a dictionary keyed by their
  393. # generation, so we can re-use them if them if possible
  394. readers = [r for r, _ in reuse.leaf_readers()]
  395. reusable = dict((r.generation(), r) for r in readers)
  396. # Make a function to open readers, which reuses reusable readers.
  397. # It removes any readers it reuses from the "reusable" dictionary,
  398. # so later we can close any readers left in the dictionary.
  399. def segreader(segment):
  400. segid = segment.segment_id()
  401. if segid in reusable:
  402. r = reusable[segid]
  403. del reusable[segid]
  404. return r
  405. else:
  406. return SegmentReader(storage, schema, segment,
  407. generation=generation)
  408. if len(segments) == 1:
  409. # This index has one segment, so return a SegmentReader object
  410. # for the segment
  411. return segreader(segments[0])
  412. else:
  413. # This index has multiple segments, so create a list of
  414. # SegmentReaders for the segments, then composite them with a
  415. # MultiReader
  416. readers = [segreader(segment) for segment in segments]
  417. return MultiReader(readers, generation=generation)
  418. finally:
  419. for r in reusable.values():
  420. r.close()
  421. def reader(self, reuse=None):
  422. retries = 10
  423. while retries > 0:
  424. # Read the information from the TOC file
  425. try:
  426. info = self._read_toc()
  427. return self._reader(self.storage, info.schema, info.segments,
  428. info.generation, reuse=reuse)
  429. except IOError:
  430. # Presume that we got a "file not found error" because a writer
  431. # deleted one of the files just as we were trying to open it,
  432. # and so retry a few times before actually raising the
  433. # exception
  434. e = sys.exc_info()[1]
  435. retries -= 1
  436. if retries <= 0:
  437. raise e
  438. sleep(0.05)
  439. # TOC class
  440. class TOC(object):
  441. """Object representing the state of the index after a commit. Essentially
  442. a container for the index's schema and the list of segment objects.
  443. """
  444. def __init__(self, schema, segments, generation,
  445. version=_CURRENT_TOC_VERSION, release=__version__):
  446. self.schema = schema
  447. self.segments = segments
  448. self.generation = generation
  449. self.version = version
  450. self.release = release
  451. @classmethod
  452. def _filename(cls, indexname, gen):
  453. return "_%s_%s.toc" % (indexname, gen)
  454. @classmethod
  455. def _pattern(cls, indexname):
  456. return re.compile("^_%s_([0-9]+).toc$" % indexname)
  457. @classmethod
  458. def _segment_pattern(cls, indexname):
  459. return re.compile("(%s_[0-9a-z]+)[.][A-Za-z0-9_.]+" % indexname)
  460. @classmethod
  461. def _latest_generation(cls, storage, indexname):
  462. pattern = cls._pattern(indexname)
  463. mx = -1
  464. for filename in storage:
  465. m = pattern.match(filename)
  466. if m:
  467. mx = max(int(m.group(1)), mx)
  468. return mx
  469. @classmethod
  470. def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
  471. schema = ensure_schema(schema)
  472. # Clear existing files
  473. prefix = "_%s_" % indexname
  474. for filename in storage:
  475. if filename.startswith(prefix):
  476. storage.delete_file(filename)
  477. # Write a TOC file with an empty list of segments
  478. toc = cls(schema, [], 0)
  479. toc.write(storage, indexname)
  480. @classmethod
  481. def _read_preamble(cls, stream):
  482. # Check that the number of bytes per data type are the same on this
  483. # platform as where the index was created, otherwise it'll be crazy
  484. def check_size(name, target):
  485. sz = stream.read_varint()
  486. if sz != target:
  487. raise IndexError("Index was created on different architecture:"
  488. " saved %s = %s, this computer = %s"
  489. % (name, sz, target))
  490. check_size("int", _INT_SIZE)
  491. check_size("long", _LONG_SIZE)
  492. check_size("float", _FLOAT_SIZE)
  493. if not stream.read_int() == -12345:
  494. raise IndexError("Number misread: byte order problem")
  495. # Index format version
  496. toc_version = stream.read_int()
  497. # Whoosh release version, e.g. (2, 4, 1)
  498. release = (stream.read_varint(), stream.read_varint(),
  499. stream.read_varint())
  500. return toc_version, release
  501. @classmethod
  502. def _write_preamble(cls, stream):
  503. stream.write_varint(_INT_SIZE)
  504. stream.write_varint(_LONG_SIZE)
  505. stream.write_varint(_FLOAT_SIZE)
  506. stream.write_int(-12345)
  507. stream.write_int(_CURRENT_TOC_VERSION)
  508. for num in __version__[:3]:
  509. stream.write_varint(num)
  510. @classmethod
  511. def read(cls, storage, indexname, gen=None, schema=None):
  512. if gen is None:
  513. gen = cls._latest_generation(storage, indexname)
  514. if gen < 0:
  515. raise EmptyIndexError("Index %r does not exist in %r"
  516. % (indexname, storage))
  517. tocfilename = cls._filename(indexname, gen)
  518. stream = storage.open_file(tocfilename)
  519. stream = ChecksumFile(stream)
  520. # Do general sanity checks at the beginning and read the version
  521. # numbers
  522. toc_version, release = cls._read_preamble(stream)
  523. if toc_version != _CURRENT_TOC_VERSION:
  524. # If there's a backwards-compatible loader function for this
  525. # version, use it to load the rest of the TOC
  526. if toc_version in toc_loaders:
  527. loader = toc_loaders[toc_version]
  528. schema, segments = loader(stream, gen, schema, toc_version)
  529. else:
  530. # Otherwise, raise an error
  531. raise IndexVersionError("Can't read format %s" % toc_version,
  532. toc_version)
  533. else:
  534. loader = cls._read_info
  535. schema, segments = loader(stream, gen, schema, toc_version)
  536. file_check = stream.checksum()
  537. orig_check = stream.read_uint()
  538. if file_check != orig_check:
  539. raise Exception("TOC checksum does not match %d != %d"
  540. % (file_check, orig_check))
  541. stream.close()
  542. return cls(schema, segments, gen, version=toc_version, release=release)
  543. @classmethod
  544. def _read_info(cls, stream, gen, schema, version):
  545. # Read the schema and segments from the TOC file
  546. # Read the pickled schema bytes
  547. pick = stream.read_string()
  548. # If the user passed a schema, use it, otherwise unpickle the schema
  549. # we just read
  550. if not schema:
  551. schema = loads(pick)
  552. # Read the list of segments
  553. numsegments = stream.read_varint()
  554. segments = []
  555. for _ in xrange(numsegments):
  556. segtype = stream.read_string() # @UnusedVariable
  557. segment = loads(stream.read_string())
  558. segments.append(segment)
  559. return schema, segments
  560. def write(self, storage, indexname):
  561. schema = ensure_schema(self.schema)
  562. schema.clean()
  563. # Use a temporary file for atomic write
  564. tocfilename = self._filename(indexname, self.generation)
  565. tempfilename = '%s.%s' % (tocfilename, time())
  566. stream = storage.create_file(tempfilename)
  567. stream = ChecksumFile(stream)
  568. # Write the sanity checks and version numbers
  569. self._write_preamble(stream)
  570. # Write pickles as strings to allow them to be skipped
  571. stream.write_string(dumps(schema, -1))
  572. # Write the list of segments
  573. stream.write_varint(len(self.segments))
  574. for segment in self.segments:
  575. # Write the segment's module and class name before the pickle to
  576. # possibly allow later versions to load the segment differently
  577. # based on the class (for backwards compatibility)
  578. segtype = segment.__class__
  579. typename = "%s.%s" % (segtype.__module__, segtype.__name__)
  580. stream.write_string(typename.encode("latin1"))
  581. stream.write_string(dumps(segment, -1))
  582. stream.write_uint(stream.checksum())
  583. stream.close()
  584. storage.rename_file(tempfilename, tocfilename, safe=True)