PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/compat/haystack/backends/whoosh_backend.py

https://bitbucket.org/resplin/byteflow
Python | 536 lines | 526 code | 6 blank | 4 comment | 8 complexity | 6ae2b5e47782995f077f5127f452cd47 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import os
  2. import re
  3. import shutil
  4. import warnings
  5. from django.conf import settings
  6. from django.core.exceptions import ImproperlyConfigured
  7. from django.db.models.loading import get_model
  8. from django.utils.datetime_safe import datetime
  9. from django.utils.encoding import force_unicode
  10. from haystack.backends import BaseSearchBackend, BaseSearchQuery, log_query
  11. from haystack.fields import DateField, DateTimeField, IntegerField, FloatField, BooleanField, MultiValueField
  12. from haystack.exceptions import MissingDependency, SearchBackendError
  13. from haystack.models import SearchResult
  14. from haystack.utils import get_identifier
  15. try:
  16. set
  17. except NameError:
  18. from sets import Set as set
  19. try:
  20. import whoosh
  21. from whoosh.analysis import StemmingAnalyzer
  22. from whoosh.fields import Schema, ID, STORED, TEXT, KEYWORD
  23. from whoosh import index
  24. from whoosh.qparser import QueryParser
  25. from whoosh.filedb.filestore import FileStorage
  26. from whoosh.spelling import SpellChecker
  27. except ImportError:
  28. raise MissingDependency("The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.")
  29. # Handle minimum requirement.
  30. if not hasattr(whoosh, '__version__') or whoosh.__version__ < (0, 3, 5):
  31. raise MissingDependency("The 'whoosh' backend requires version 0.3.5 or greater.")
  32. DATETIME_REGEX = re.compile('^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
  33. BACKEND_NAME = 'whoosh'
  34. class SearchBackend(BaseSearchBackend):
  35. # Word reserved by Whoosh for special use.
  36. RESERVED_WORDS = (
  37. 'AND',
  38. 'NOT',
  39. 'OR',
  40. 'TO',
  41. )
  42. # Characters reserved by Whoosh for special use.
  43. # The '\\' must come first, so as not to overwrite the other slash replacements.
  44. RESERVED_CHARACTERS = (
  45. '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
  46. '[', ']', '^', '"', '~', '*', '?', ':', '.',
  47. )
  48. def __init__(self, site=None):
  49. super(SearchBackend, self).__init__(site)
  50. self.setup_complete = False
  51. if not hasattr(settings, 'HAYSTACK_WHOOSH_PATH'):
  52. raise ImproperlyConfigured('You must specify a HAYSTACK_WHOOSH_PATH in your settings.')
  53. def setup(self):
  54. """
  55. Defers loading until needed.
  56. """
  57. new_index = False
  58. # Make sure the index is there.
  59. if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
  60. os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
  61. new_index = True
  62. if not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
  63. raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH)
  64. self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
  65. self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
  66. self.parser = QueryParser(self.content_field_name, schema=self.schema)
  67. if new_index is True:
  68. self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
  69. else:
  70. try:
  71. self.index = self.storage.open_index(schema=self.schema)
  72. except index.EmptyIndexError:
  73. self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
  74. self.setup_complete = True
  75. def build_schema(self, fields):
  76. schema_fields = {
  77. 'id': ID(stored=True, unique=True),
  78. 'django_ct': ID(stored=True),
  79. 'django_id': ID(stored=True),
  80. }
  81. # Grab the number of keys that are hard-coded into Haystack.
  82. # We'll use this to (possibly) fail slightly more gracefully later.
  83. initial_key_count = len(schema_fields)
  84. content_field_name = ''
  85. for field_name, field_class in fields.items():
  86. if isinstance(field_class, MultiValueField):
  87. if field_class.indexed is False:
  88. schema_fields[field_name] = KEYWORD(stored=True, commas=True)
  89. else:
  90. schema_fields[field_name] = KEYWORD(stored=True, commas=True, scorable=True)
  91. elif isinstance(field_class, (DateField, DateTimeField, IntegerField, FloatField, BooleanField)):
  92. if field_class.indexed is False:
  93. schema_fields[field_name] = STORED
  94. else:
  95. schema_fields[field_name] = ID(stored=True)
  96. else:
  97. schema_fields[field_name] = TEXT(stored=True, analyzer=StemmingAnalyzer())
  98. if field_class.document is True:
  99. content_field_name = field_name
  100. # Fail more gracefully than relying on the backend to die if no fields
  101. # are found.
  102. if len(schema_fields) <= initial_key_count:
  103. raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")
  104. return (content_field_name, Schema(**schema_fields))
  105. def update(self, index, iterable, commit=True):
  106. if not self.setup_complete:
  107. self.setup()
  108. self.index = self.index.refresh()
  109. writer = self.index.writer()
  110. for obj in iterable:
  111. doc = index.prepare(obj)
  112. # Really make sure it's unicode, because Whoosh won't have it any
  113. # other way.
  114. for key in doc:
  115. doc[key] = self._from_python(doc[key])
  116. writer.update_document(**doc)
  117. if len(iterable) > 0:
  118. # For now, commit no matter what, as we run into locking issues otherwise.
  119. writer.commit()
  120. # If spelling support is desired, add to the dictionary.
  121. if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
  122. sp = SpellChecker(self.storage)
  123. sp.add_field(self.index, self.content_field_name)
  124. def remove(self, obj_or_string, commit=True):
  125. if not self.setup_complete:
  126. self.setup()
  127. self.index = self.index.refresh()
  128. whoosh_id = get_identifier(obj_or_string)
  129. self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))
  130. # For now, commit no matter what, as we run into locking issues otherwise.
  131. self.index.commit()
  132. def clear(self, models=[], commit=True):
  133. if not self.setup_complete:
  134. self.setup()
  135. self.index = self.index.refresh()
  136. if not models:
  137. self.delete_index()
  138. else:
  139. models_to_delete = []
  140. for model in models:
  141. models_to_delete.append(u"django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))
  142. self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))
  143. # For now, commit no matter what, as we run into locking issues otherwise.
  144. self.index.commit()
  145. def delete_index(self):
  146. # Per the Whoosh mailing list, if wiping out everything from the index,
  147. # it's much more efficient to simply delete the index files.
  148. if os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
  149. shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH)
  150. # Recreate everything.
  151. self.setup()
  152. def optimize(self):
  153. if not self.setup_complete:
  154. self.setup()
  155. self.index = self.index.refresh()
  156. self.index.optimize()
  157. @log_query
  158. def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
  159. fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
  160. narrow_queries=None, spelling_query=None,
  161. limit_to_registered_models=True, **kwargs):
  162. if not self.setup_complete:
  163. self.setup()
  164. # A zero length query should return no results.
  165. if len(query_string) == 0:
  166. return {
  167. 'results': [],
  168. 'hits': 0,
  169. }
  170. query_string = force_unicode(query_string)
  171. # A one-character query (non-wildcard) gets nabbed by a stopwords
  172. # filter and should yield zero results.
  173. if len(query_string) <= 1 and query_string != u'*':
  174. return {
  175. 'results': [],
  176. 'hits': 0,
  177. }
  178. reverse = False
  179. if sort_by is not None:
  180. # Determine if we need to reverse the results and if Whoosh can
  181. # handle what it's being asked to sort by. Reversing is an
  182. # all-or-nothing action, unfortunately.
  183. sort_by_list = []
  184. reverse_counter = 0
  185. for order_by in sort_by:
  186. if order_by.startswith('-'):
  187. reverse_counter += 1
  188. if len(sort_by) > 1 and reverse_counter > 1:
  189. raise SearchBackendError("Whoosh does not handle more than one field and any field being ordered in reverse.")
  190. for order_by in sort_by:
  191. if order_by.startswith('-'):
  192. sort_by_list.append(order_by[1:])
  193. if len(sort_by_list) == 1:
  194. reverse = True
  195. else:
  196. sort_by_list.append(order_by)
  197. if len(sort_by_list) == 1:
  198. reverse = False
  199. sort_by = sort_by_list[0]
  200. if facets is not None:
  201. warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)
  202. if date_facets is not None:
  203. warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)
  204. if query_facets is not None:
  205. warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)
  206. narrowed_results = None
  207. self.index = self.index.refresh()
  208. if limit_to_registered_models:
  209. # Using narrow queries, limit the results to only models registered
  210. # with the current site.
  211. if narrow_queries is None:
  212. narrow_queries = set()
  213. registered_models = self.build_registered_models_list()
  214. if len(registered_models) > 0:
  215. narrow_queries.add('django_ct:(%s)' % ' OR '.join(registered_models))
  216. if narrow_queries is not None:
  217. # Potentially expensive? I don't see another way to do it in Whoosh...
  218. narrow_searcher = self.index.searcher()
  219. for nq in narrow_queries:
  220. recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))
  221. if narrowed_results:
  222. narrowed_results.filter(recent_narrowed_results)
  223. else:
  224. narrowed_results = recent_narrowed_results
  225. self.index = self.index.refresh()
  226. if self.index.doc_count():
  227. searcher = self.index.searcher()
  228. parsed_query = self.parser.parse(query_string)
  229. # In the event of an invalid/stopworded query, recover gracefully.
  230. if parsed_query is None:
  231. return {
  232. 'results': [],
  233. 'hits': 0,
  234. }
  235. raw_results = searcher.search(parsed_query, sortedby=sort_by, reverse=reverse)
  236. # Handle the case where the results have been narrowed.
  237. if narrowed_results:
  238. raw_results.filter(narrowed_results)
  239. return self._process_results(raw_results, start_offset, end_offset, highlight=highlight, query_string=query_string, spelling_query=spelling_query)
  240. else:
  241. if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
  242. if spelling_query:
  243. spelling_suggestion = self.create_spelling_suggestion(spelling_query)
  244. else:
  245. spelling_suggestion = self.create_spelling_suggestion(query_string)
  246. else:
  247. spelling_suggestion = None
  248. return {
  249. 'results': [],
  250. 'hits': 0,
  251. 'spelling_suggestion': spelling_suggestion,
  252. }
  253. def more_like_this(self, model_instance, additional_query_string=None):
  254. warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
  255. return {
  256. 'results': [],
  257. 'hits': 0,
  258. }
  259. def _process_results(self, raw_results, start_offset, end_offset, highlight=False, query_string='', spelling_query=None):
  260. from haystack import site
  261. results = []
  262. # It's important to grab the hits first before slicing. Otherwise, this
  263. # can cause pagination failures.
  264. hits = len(raw_results)
  265. sliced_results = raw_results[start_offset:end_offset]
  266. facets = {}
  267. spelling_suggestion = None
  268. indexed_models = site.get_indexed_models()
  269. for doc_offset, raw_result in enumerate(sliced_results):
  270. score = raw_results.score(doc_offset + start_offset) or 0
  271. raw_result = dict(raw_result)
  272. app_label, model_name = raw_result['django_ct'].split('.')
  273. additional_fields = {}
  274. model = get_model(app_label, model_name)
  275. if model and model in indexed_models:
  276. for key, value in raw_result.items():
  277. index = site.get_index(model)
  278. string_key = str(key)
  279. if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
  280. # Special-cased due to the nature of KEYWORD fields.
  281. if isinstance(index.fields[string_key], MultiValueField):
  282. additional_fields[string_key] = value.split(',')
  283. else:
  284. additional_fields[string_key] = index.fields[string_key].convert(value)
  285. else:
  286. additional_fields[string_key] = self._to_python(value)
  287. del(additional_fields['django_ct'])
  288. del(additional_fields['django_id'])
  289. if highlight:
  290. from whoosh import analysis
  291. from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
  292. sa = analysis.StemmingAnalyzer()
  293. terms = [term.replace('*', '') for term in query_string.split()]
  294. additional_fields['highlighted'] = {
  295. self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
  296. }
  297. result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
  298. results.append(result)
  299. else:
  300. hits -= 1
  301. if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
  302. if spelling_query:
  303. spelling_suggestion = self.create_spelling_suggestion(spelling_query)
  304. else:
  305. spelling_suggestion = self.create_spelling_suggestion(query_string)
  306. return {
  307. 'results': results,
  308. 'hits': hits,
  309. 'facets': facets,
  310. 'spelling_suggestion': spelling_suggestion,
  311. }
  312. def create_spelling_suggestion(self, query_string):
  313. spelling_suggestion = None
  314. sp = SpellChecker(self.storage)
  315. cleaned_query = force_unicode(query_string)
  316. if not query_string:
  317. return spelling_suggestion
  318. # Clean the string.
  319. for rev_word in self.RESERVED_WORDS:
  320. cleaned_query = cleaned_query.replace(rev_word, '')
  321. for rev_char in self.RESERVED_CHARACTERS:
  322. cleaned_query = cleaned_query.replace(rev_char, '')
  323. # Break it down.
  324. query_words = cleaned_query.split()
  325. suggested_words = []
  326. for word in query_words:
  327. suggestions = sp.suggest(word, number=1)
  328. if len(suggestions) > 0:
  329. suggested_words.append(suggestions[0])
  330. spelling_suggestion = ' '.join(suggested_words)
  331. return spelling_suggestion
  332. def _from_python(self, value):
  333. """
  334. Converts Python values to a string for Whoosh.
  335. Code courtesy of pysolr.
  336. """
  337. if hasattr(value, 'strftime'):
  338. if hasattr(value, 'hour'):
  339. value = force_unicode(value.strftime('%Y-%m-%dT%H:%M:%S'))
  340. else:
  341. value = force_unicode(value.strftime('%Y-%m-%dT00:00:00'))
  342. elif isinstance(value, bool):
  343. if value:
  344. value = u'true'
  345. else:
  346. value = u'false'
  347. elif isinstance(value, (list, tuple)):
  348. value = u','.join([force_unicode(v) for v in value])
  349. else:
  350. value = force_unicode(value)
  351. return value
  352. def _to_python(self, value):
  353. """
  354. Converts values from Whoosh to native Python values.
  355. A port of the same method in pysolr, as they deal with data the same way.
  356. """
  357. if value == 'true':
  358. return True
  359. elif value == 'false':
  360. return False
  361. if value:
  362. possible_datetime = DATETIME_REGEX.search(value)
  363. if possible_datetime:
  364. date_values = possible_datetime.groupdict()
  365. for dk, dv in date_values.items():
  366. date_values[dk] = int(dv)
  367. return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])
  368. try:
  369. # This is slightly gross but it's hard to tell otherwise what the
  370. # string's original type might have been. Be careful who you trust.
  371. converted_value = eval(value)
  372. # Try to handle most built-in types.
  373. if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
  374. return converted_value
  375. except:
  376. # If it fails (SyntaxError or its ilk) or we don't trust it,
  377. # continue on.
  378. pass
  379. return value
  380. class SearchQuery(BaseSearchQuery):
  381. def __init__(self, backend=None):
  382. super(SearchQuery, self).__init__(backend=backend)
  383. self.backend = backend or SearchBackend()
  384. def build_query_fragment(self, field, filter_type, value):
  385. result = ''
  386. if filter_type != 'in':
  387. # 'in' is a bit of a special case, as we don't want to
  388. # convert a valid list/tuple to string. Defer handling it
  389. # until later...
  390. value = self.backend._from_python(value)
  391. # Check to see if it's a phrase for an exact match.
  392. if ' ' in value:
  393. value = '"%s"' % value
  394. # 'content' is a special reserved word, much like 'pk' in
  395. # Django's ORM layer. It indicates 'no special field'.
  396. if field == 'content':
  397. result = value
  398. else:
  399. filter_types = {
  400. 'exact': "%s:%s",
  401. 'gt': "%s:{%s TO}",
  402. 'gte': "%s:[%s TO]",
  403. 'lt': "%s:{TO %s}",
  404. 'lte': "%s:[TO %s]",
  405. 'startswith': "%s:%s*",
  406. }
  407. if filter_type != 'in':
  408. possible_datetime = DATETIME_REGEX.search(value)
  409. if possible_datetime:
  410. value = self.clean(value)
  411. result = filter_types[filter_type] % (field, value)
  412. else:
  413. in_options = []
  414. for possible_value in value:
  415. pv = self.backend._from_python(possible_value)
  416. possible_datetime = DATETIME_REGEX.search(pv)
  417. if possible_datetime:
  418. pv = self.clean(pv)
  419. in_options.append('%s:"%s"' % (field, pv))
  420. result = "(%s)" % " OR ".join(in_options)
  421. return result