PageRenderTime 55ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/compat/haystack/query.py

https://bitbucket.org/resplin/byteflow
Python | 622 lines | 447 code | 83 blank | 92 comment | 96 complexity | 25c3408dfbfc72bbb3beb7d87fde40cb MD5 | raw file
Possible License(s): BSD-3-Clause
import numbers
import re

from django.conf import settings
from haystack import backend
from haystack.backends import SQ
from haystack.constants import REPR_OUTPUT_SIZE, ITERATOR_LOAD_PER_QUERY, DEFAULT_OPERATOR
from haystack.exceptions import NotRegistered
  7. class SearchQuerySet(object):
  8. """
  9. Provides a way to specify search parameters and lazily load results.
  10. Supports chaining (a la QuerySet) to narrow the search.
  11. """
  12. def __init__(self, site=None, query=None):
  13. self.query = query or backend.SearchQuery()
  14. self._result_cache = []
  15. self._result_count = None
  16. self._cache_full = False
  17. self._load_all = False
  18. self._ignored_result_count = 0
  19. if site is not None:
  20. self.site = site
  21. else:
  22. from haystack import site as main_site
  23. self.site = main_site
  24. def __getstate__(self):
  25. """
  26. For pickling.
  27. """
  28. len(self)
  29. obj_dict = self.__dict__.copy()
  30. obj_dict['_iter'] = None
  31. return obj_dict
  32. def __repr__(self):
  33. data = list(self[:REPR_OUTPUT_SIZE])
  34. if len(self) > REPR_OUTPUT_SIZE:
  35. data[-1] = "...(remaining elements truncated)..."
  36. return repr(data)
  37. def __len__(self):
  38. if not self._result_count:
  39. self._result_count = self.query.get_count()
  40. # This needs to return the actual number of hits, not what's in the cache.
  41. return self._result_count - self._ignored_result_count
  42. def __iter__(self):
  43. if self._cache_is_full():
  44. # We've got a fully populated cache. Let Python do the hard work.
  45. return iter(self._result_cache)
  46. return self._manual_iter()
  47. def __and__(self, other):
  48. if isinstance(other, EmptySearchQuerySet):
  49. return other._clone()
  50. combined = self._clone()
  51. combined.query.combine(other.query, SQ.AND)
  52. return combined
  53. def __or__(self, other):
  54. combined = self._clone()
  55. if isinstance(other, EmptySearchQuerySet):
  56. return combined
  57. combined.query.combine(other.query, SQ.OR)
  58. return combined
  59. def _cache_is_full(self):
  60. if not self.query.has_run():
  61. return False
  62. if len(self) <= 0:
  63. return True
  64. try:
  65. self._result_cache.index(None)
  66. return False
  67. except ValueError:
  68. # No ``None``s found in the results. Check the length of the cache.
  69. return len(self._result_cache) > 0
  70. def _manual_iter(self):
  71. # If we're here, our cache isn't fully populated.
  72. # For efficiency, fill the cache as we go if we run out of results.
  73. # Also, this can't be part of the __iter__ method due to Python's rules
  74. # about generator functions.
  75. current_position = 0
  76. current_cache_max = 0
  77. while True:
  78. if len(self._result_cache) > 0:
  79. try:
  80. current_cache_max = self._result_cache.index(None)
  81. except ValueError:
  82. current_cache_max = len(self._result_cache)
  83. while current_position < current_cache_max:
  84. yield self._result_cache[current_position]
  85. current_position += 1
  86. if self._cache_is_full():
  87. raise StopIteration
  88. # We've run out of results and haven't hit our limit.
  89. # Fill more of the cache.
  90. if not self._fill_cache(current_position, current_position + ITERATOR_LOAD_PER_QUERY):
  91. raise StopIteration
  92. def _fill_cache(self, start, end):
  93. # Tell the query where to start from and how many we'd like.
  94. self.query._reset()
  95. self.query.set_limits(start, end)
  96. results = self.query.get_results()
  97. if len(results) == 0:
  98. return False
  99. # Setup the full cache now that we know how many results there are.
  100. # We need the ``None``s as placeholders to know what parts of the
  101. # cache we have/haven't filled.
  102. # Using ``None`` like this takes up very little memory. In testing,
  103. # an array of 100,000 ``None``s consumed less than .5 Mb, which ought
  104. # to be an acceptable loss for consistent and more efficient caching.
  105. if len(self._result_cache) == 0:
  106. self._result_cache = [None for i in xrange(self.query.get_count())]
  107. if start is None:
  108. start = 0
  109. if end is None:
  110. end = self.query.get_count()
  111. # Check if we wish to load all objects.
  112. if self._load_all:
  113. original_results = []
  114. models_pks = {}
  115. loaded_objects = {}
  116. # Remember the search position for each result so we don't have to resort later.
  117. for result in results:
  118. original_results.append(result)
  119. models_pks.setdefault(result.model, []).append(result.pk)
  120. # Load the objects for each model in turn.
  121. for model in models_pks:
  122. loaded_objects[model] = model._default_manager.in_bulk(models_pks[model])
  123. to_cache = []
  124. for result in results:
  125. if self._load_all:
  126. # We have to deal with integer keys being cast from strings; if this
  127. # fails we've got a character pk.
  128. try:
  129. result.pk = int(result.pk)
  130. except ValueError:
  131. pass
  132. try:
  133. result._object = loaded_objects[result.model][result.pk]
  134. except (KeyError, IndexError):
  135. # The object was either deleted since we indexed or should
  136. # be ignored; fail silently.
  137. self._ignored_result_count += 1
  138. continue
  139. to_cache.append(result)
  140. # Assign by slice.
  141. self._result_cache[start:start + len(to_cache)] = to_cache
  142. return True
  143. def __getitem__(self, k):
  144. """
  145. Retrieves an item or slice from the set of results.
  146. """
  147. if not isinstance(k, (slice, int, long)):
  148. raise TypeError
  149. assert ((not isinstance(k, slice) and (k >= 0))
  150. or (isinstance(k, slice) and (k.start is None or k.start >= 0)
  151. and (k.stop is None or k.stop >= 0))), \
  152. "Negative indexing is not supported."
  153. # Remember if it's a slice or not. We're going to treat everything as
  154. # a slice to simply the logic and will `.pop()` at the end as needed.
  155. if isinstance(k, slice):
  156. is_slice = True
  157. start = k.start
  158. if k.stop is not None:
  159. bound = int(k.stop)
  160. else:
  161. bound = None
  162. else:
  163. is_slice = False
  164. start = k
  165. bound = k + 1
  166. # We need check to see if we need to populate more of the cache.
  167. if len(self._result_cache) <= 0 or (None in self._result_cache[start:bound] and not self._cache_is_full()):
  168. try:
  169. self._fill_cache(start, bound)
  170. except StopIteration:
  171. # There's nothing left, even though the bound is higher.
  172. pass
  173. # Cache should be full enough for our needs.
  174. if is_slice:
  175. return self._result_cache[start:bound]
  176. else:
  177. return self._result_cache[start]
  178. # Methods that return a SearchQuerySet.
  179. def all(self):
  180. """Returns all results for the query."""
  181. return self._clone()
  182. def none(self):
  183. """Returns all results for the query."""
  184. return self._clone(klass=EmptySearchQuerySet)
  185. def filter(self, *args, **kwargs):
  186. """Narrows the search based on certain attributes and the default operator."""
  187. if getattr(settings, 'HAYSTACK_DEFAULT_OPERATOR', DEFAULT_OPERATOR) == 'OR':
  188. return self.filter_or(*args, **kwargs)
  189. else:
  190. return self.filter_and(*args, **kwargs)
  191. def exclude(self, *args, **kwargs):
  192. """Narrows the search by ensuring certain attributes are not included."""
  193. clone = self._clone()
  194. clone.query.add_filter(~SQ(*args, **kwargs))
  195. return clone
  196. def filter_and(self, *args, **kwargs):
  197. """Narrows the search by looking for (and including) certain attributes."""
  198. clone = self._clone()
  199. clone.query.add_filter(SQ(*args, **kwargs))
  200. return clone
  201. def filter_or(self, *args, **kwargs):
  202. """Narrows the search by ensuring certain attributes are not included."""
  203. clone = self._clone()
  204. clone.query.add_filter(SQ(*args, **kwargs), use_or=True)
  205. return clone
  206. def order_by(self, *args):
  207. """Alters the order in which the results should appear."""
  208. clone = self._clone()
  209. for field in args:
  210. clone.query.add_order_by(field)
  211. return clone
  212. def highlight(self):
  213. """Adds highlighting to the results."""
  214. clone = self._clone()
  215. clone.query.add_highlight()
  216. return clone
  217. def models(self, *models):
  218. """Accepts an arbitrary number of Model classes to include in the search."""
  219. clone = self._clone()
  220. for model in models:
  221. if model in self.site.get_indexed_models():
  222. clone.query.add_model(model)
  223. return clone
  224. def boost(self, term, boost):
  225. """Boosts a certain aspect of the query."""
  226. clone = self._clone()
  227. clone.query.add_boost(term, boost)
  228. return clone
  229. def facet(self, field):
  230. """Adds faceting to a query for the provided field."""
  231. clone = self._clone()
  232. clone.query.add_field_facet(field)
  233. return clone
  234. def date_facet(self, field, start_date, end_date, gap_by, gap_amount=1):
  235. """Adds faceting to a query for the provided field by date."""
  236. clone = self._clone()
  237. clone.query.add_date_facet(field, start_date, end_date, gap_by, gap_amount=gap_amount)
  238. return clone
  239. def query_facet(self, field, query):
  240. """Adds faceting to a query for the provided field with a custom query."""
  241. clone = self._clone()
  242. clone.query.add_query_facet(field, query)
  243. return clone
  244. def narrow(self, query):
  245. """Pushes existing facet choices into the search."""
  246. clone = self._clone()
  247. clone.query.add_narrow_query(query)
  248. return clone
  249. def raw_search(self, query_string, **kwargs):
  250. """Passes a raw query directly to the backend."""
  251. clone = self._clone()
  252. clone.query.raw_search(query_string, **kwargs)
  253. return clone
  254. def load_all(self):
  255. """Efficiently populates the objects in the search results."""
  256. clone = self._clone()
  257. clone._load_all = True
  258. return clone
  259. def load_all_queryset(self, model, queryset):
  260. # DRL_TODO: Remove before 1.0.
  261. from haystack.exceptions import HaystackError
  262. raise HaystackError("This method is deprecated. Please use the `RelatedSearchQuerySet` instead.")
  263. def auto_query(self, query_string):
  264. """
  265. Performs a best guess constructing the search query.
  266. This method is somewhat naive but works well enough for the simple,
  267. common cases.
  268. """
  269. clone = self._clone()
  270. # Pull out anything wrapped in quotes and do an exact match on it.
  271. quote_regex = re.compile(r'([\'"])(.*?)\1')
  272. result = quote_regex.search(query_string)
  273. while result is not None:
  274. full_match = result.group()
  275. query_string = query_string.replace(full_match, '', 1)
  276. exact_match = result.groups()[1]
  277. clone = clone.filter(content=clone.query.clean(exact_match))
  278. # Re-search the string for other exact matches.
  279. result = quote_regex.search(query_string)
  280. # Pseudo-tokenize the rest of the query.
  281. keywords = query_string.split()
  282. # Loop through keywords and add filters to the query.
  283. for keyword in keywords:
  284. exclude = False
  285. if keyword.startswith('-') and len(keyword) > 1:
  286. keyword = keyword[1:]
  287. exclude = True
  288. cleaned_keyword = clone.query.clean(keyword)
  289. if exclude:
  290. clone = clone.exclude(content=cleaned_keyword)
  291. else:
  292. clone = clone.filter(content=cleaned_keyword)
  293. return clone
  294. # Methods that do not return a SearchQuerySet.
  295. def count(self):
  296. """Returns the total number of matching results."""
  297. clone = self._clone()
  298. return len(clone)
  299. def best_match(self):
  300. """Returns the best/top search result that matches the query."""
  301. clone = self._clone()
  302. return clone[0]
  303. def latest(self, date_field):
  304. """Returns the most recent search result that matches the query."""
  305. clone = self._clone()
  306. clone.query.clear_order_by()
  307. clone.query.add_order_by("-%s" % date_field)
  308. return clone.best_match()
  309. def more_like_this(self, model_instance):
  310. """Finds similar results to the object passed in."""
  311. clone = self._clone()
  312. clone.query.more_like_this(model_instance)
  313. return clone
  314. def facet_counts(self):
  315. """
  316. Returns the facet counts found by the query.
  317. This will cause the query to execute and should generally be used when
  318. presenting the data.
  319. """
  320. clone = self._clone()
  321. return clone.query.get_facet_counts()
  322. def spelling_suggestion(self, preferred_query=None):
  323. """
  324. Returns the spelling suggestion found by the query.
  325. To work, you must set ``settings.HAYSTACK_INCLUDE_SPELLING`` to True.
  326. Otherwise, ``None`` will be returned.
  327. This will cause the query to execute and should generally be used when
  328. presenting the data.
  329. """
  330. clone = self._clone()
  331. return clone.query.get_spelling_suggestion(preferred_query)
  332. # Utility methods.
  333. def _clone(self, klass=None):
  334. if klass is None:
  335. klass = self.__class__
  336. query = self.query._clone()
  337. clone = klass(site=self.site, query=query)
  338. clone._load_all = self._load_all
  339. return clone
  340. class EmptySearchQuerySet(SearchQuerySet):
  341. """
  342. A stubbed SearchQuerySet that behaves as normal but always returns no
  343. results.
  344. """
  345. def __len__(self):
  346. return 0
  347. def _cache_is_full(self):
  348. # Pretend the cache is always full with no results.
  349. return True
  350. def _clone(self, klass=None):
  351. clone = super(EmptySearchQuerySet, self)._clone(klass=klass)
  352. clone._result_cache = []
  353. return clone
  354. def __getitem__(self, k):
  355. if isinstance(k, slice):
  356. return []
  357. else:
  358. raise IndexError("'EmptySearchQuerySet' have no results to access.")
  359. class RelatedSearchQuerySet(SearchQuerySet):
  360. """
  361. A variant of the SearchQuerySet that can handle `load_all_queryset`s.
  362. This is predominantly different in the `_fill_cache` method, as it is
  363. far less efficient but needs to fill the cache before it to maintain
  364. consistency.
  365. """
  366. _load_all_querysets = {}
  367. _result_cache = []
  368. def _cache_is_full(self):
  369. return len(self._result_cache) >= len(self)
  370. def _manual_iter(self):
  371. # If we're here, our cache isn't fully populated.
  372. # For efficiency, fill the cache as we go if we run out of results.
  373. # Also, this can't be part of the __iter__ method due to Python's rules
  374. # about generator functions.
  375. current_position = 0
  376. current_cache_max = 0
  377. while True:
  378. current_cache_max = len(self._result_cache)
  379. while current_position < current_cache_max:
  380. yield self._result_cache[current_position]
  381. current_position += 1
  382. if self._cache_is_full():
  383. raise StopIteration
  384. # We've run out of results and haven't hit our limit.
  385. # Fill more of the cache.
  386. start = current_position + self._ignored_result_count
  387. if not self._fill_cache(start, start + ITERATOR_LOAD_PER_QUERY):
  388. raise StopIteration
  389. def _fill_cache(self, start, end):
  390. # Tell the query where to start from and how many we'd like.
  391. self.query._reset()
  392. self.query.set_limits(start, end)
  393. results = self.query.get_results()
  394. if len(results) == 0:
  395. return False
  396. if start is None:
  397. start = 0
  398. if end is None:
  399. end = self.query.get_count()
  400. # Check if we wish to load all objects.
  401. if self._load_all:
  402. original_results = []
  403. models_pks = {}
  404. loaded_objects = {}
  405. # Remember the search position for each result so we don't have to resort later.
  406. for result in results:
  407. original_results.append(result)
  408. models_pks.setdefault(result.model, []).append(result.pk)
  409. # Load the objects for each model in turn.
  410. for model in models_pks:
  411. if model in self._load_all_querysets:
  412. # Use the overriding queryset.
  413. loaded_objects[model] = self._load_all_querysets[model].in_bulk(models_pks[model])
  414. else:
  415. # Check the SearchIndex for the model for an override.
  416. try:
  417. index = self.site.get_index(model)
  418. qs = index.load_all_queryset()
  419. loaded_objects[model] = qs.in_bulk(models_pks[model])
  420. except NotRegistered:
  421. # The model returned doesn't seem to be registered with
  422. # the current site. We should silently fail and populate
  423. # nothing for those objects.
  424. loaded_objects[model] = []
  425. if len(results) + len(self._result_cache) < len(self) and len(results) < ITERATOR_LOAD_PER_QUERY:
  426. self._ignored_result_count += ITERATOR_LOAD_PER_QUERY - len(results)
  427. for result in results:
  428. if self._load_all:
  429. # We have to deal with integer keys being cast from strings; if this
  430. # fails we've got a character pk.
  431. try:
  432. result.pk = int(result.pk)
  433. except ValueError:
  434. pass
  435. try:
  436. result._object = loaded_objects[result.model][result.pk]
  437. except (KeyError, IndexError):
  438. # The object was either deleted since we indexed or should
  439. # be ignored; fail silently.
  440. self._ignored_result_count += 1
  441. continue
  442. self._result_cache.append(result)
  443. return True
  444. def __getitem__(self, k):
  445. """
  446. Retrieves an item or slice from the set of results.
  447. """
  448. if not isinstance(k, (slice, int, long)):
  449. raise TypeError
  450. assert ((not isinstance(k, slice) and (k >= 0))
  451. or (isinstance(k, slice) and (k.start is None or k.start >= 0)
  452. and (k.stop is None or k.stop >= 0))), \
  453. "Negative indexing is not supported."
  454. # Remember if it's a slice or not. We're going to treat everything as
  455. # a slice to simply the logic and will `.pop()` at the end as needed.
  456. if isinstance(k, slice):
  457. is_slice = True
  458. start = k.start
  459. if k.stop is not None:
  460. bound = int(k.stop)
  461. else:
  462. bound = None
  463. else:
  464. is_slice = False
  465. start = k
  466. bound = k + 1
  467. # We need check to see if we need to populate more of the cache.
  468. if len(self._result_cache) <= 0 or not self._cache_is_full():
  469. try:
  470. while len(self._result_cache) < bound and not self._cache_is_full():
  471. current_max = len(self._result_cache) + self._ignored_result_count
  472. self._fill_cache(current_max, current_max + ITERATOR_LOAD_PER_QUERY)
  473. except StopIteration:
  474. # There's nothing left, even though the bound is higher.
  475. pass
  476. # Cache should be full enough for our needs.
  477. if is_slice:
  478. return self._result_cache[start:bound]
  479. else:
  480. return self._result_cache[start]
  481. def load_all_queryset(self, model, queryset):
  482. """
  483. Allows for specifying a custom ``QuerySet`` that changes how ``load_all``
  484. will fetch records for the provided model.
  485. This is useful for post-processing the results from the query, enabling
  486. things like adding ``select_related`` or filtering certain data.
  487. """
  488. clone = self._clone()
  489. clone._load_all_querysets[model] = queryset
  490. return clone
  491. def _clone(self, klass=None):
  492. if klass is None:
  493. klass = self.__class__
  494. query = self.query._clone()
  495. clone = klass(site=self.site, query=query)
  496. clone._load_all = self._load_all
  497. clone._load_all_querysets = self._load_all_querysets
  498. return clone