PageRenderTime 2641ms CodeModel.GetById 41ms RepoModel.GetById 1ms app.codeStats 0ms

/tv/lib/search.py

https://github.com/kazcw/miro
Python | 286 lines | 198 code | 12 blank | 76 comment | 17 complexity | fb7add9228a3af9fe384e54d7ac659f3 MD5 | raw file
  1. # Miro - an RSS based video player application
  2. # Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
  3. # Participatory Culture Foundation
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  18. #
  19. # In addition, as a special exception, the copyright holders give
  20. # permission to link the code of portions of this program with the OpenSSL
  21. # library.
  22. #
  23. # You must obey the GNU General Public License in all respects for all of
  24. # the code used other than OpenSSL. If you modify file(s) with this
  25. # exception, you may extend this exception to your version of the file(s),
  26. # but you are not obligated to do so. If you do not wish to do so, delete
  27. # this exception statement from your version. If you delete this exception
  28. # statement from all source files in the program, then also delete it here.
  29. """search.py -- Indexed searching of items.
  30. To make incremental search fast, we index the N-grams for each item.
  31. """
  32. import collections
  33. import os
  34. import re
  35. from miro import ngrams
  36. from miro.plat.utils import filename_to_unicode
  37. # XXX not correct as we don't take into account of foreign quotation marks
  38. QUOTEKILLER = re.compile(r'(?<!\\)"')
  39. SLASHKILLER = re.compile(r'\\.')
  40. # Let's hope all this stuff is in Unicode...
  41. WORDMATCHER = re.compile("\w+", re.UNICODE)
  42. NGRAM_MIN = 3
  43. NGRAM_MAX = 5
  44. SEARCHOBJECTS = {}
  45. def _get_boolean_search(search_string):
  46. if not SEARCHOBJECTS.has_key(search_string):
  47. SEARCHOBJECTS[search_string] = BooleanSearch(search_string)
  48. return SEARCHOBJECTS[search_string]
  49. class BooleanSearch:
  50. def __init__ (self, search_string):
  51. self.string = search_string
  52. self.positive_terms = []
  53. self.negative_terms = []
  54. self.parse_string()
  55. def parse_string(self):
  56. inquote = False
  57. i = 0
  58. while i < len (self.string) and self.string[i] == ' ':
  59. i += 1
  60. laststart = i
  61. while (i < len(self.string)):
  62. i = laststart
  63. while (i < len(self.string)):
  64. if self.string[i] == '"':
  65. inquote = not inquote
  66. if not inquote and self.string[i] == ' ':
  67. break
  68. if self.string[i] == '\\':
  69. i += 1
  70. i += 1
  71. if inquote:
  72. self.process(self.string[laststart:])
  73. else:
  74. self.process(self.string[laststart:i])
  75. while i < len (self.string) and self.string[i] == ' ':
  76. i += 1
  77. laststart = i
  78. def process(self, substring):
  79. if substring[0] == '-':
  80. substring = substring[1:]
  81. term_list = self.negative_terms
  82. else:
  83. term_list = self.positive_terms
  84. substring = QUOTEKILLER.sub("", substring)
  85. substring = SLASHKILLER.sub(lambda x: x.group(0)[1], substring)
  86. term_list.append(substring.lower())
  87. def as_string(self):
  88. return self.string
  89. def _calc_search_text(item_info):
  90. match_against = [item_info.name, item_info.description]
  91. if item_info.artist is not None:
  92. match_against.append(item_info.artist)
  93. if item_info.album is not None:
  94. match_against.append(item_info.album)
  95. if item_info.genre is not None:
  96. match_against.append(item_info.genre)
  97. if item_info.feed_name is not None:
  98. match_against.append(item_info.feed_name)
  99. if item_info.download_info and item_info.download_info.torrent:
  100. match_against.append(u'torrent')
  101. if item_info.video_path:
  102. filename = os.path.basename(item_info.video_path)
  103. match_against.append(filename_to_unicode(filename))
  104. return (' '.join(match_against)).lower()
  105. def calc_search_terms(item_info):
  106. """Return a list of terms that we want to index for an ItemInfo. """
  107. return WORDMATCHER.findall(_calc_search_text(item_info))
  108. def _ngrams_for_term(term):
  109. """Given a term, return a list of N-grams that we should search for.
  110. If the term is shorter than NGRAM_MAX, this is just the term itself.
  111. If it's longer, we split it up into a bunch of N-grams to search for.
  112. """
  113. if len(term) < NGRAM_MIN:
  114. # term is shorter than our smallest ngrams, return an empty list,
  115. # which causes us to match everything
  116. return []
  117. elif len(term) <= NGRAM_MAX:
  118. # normal case, search for term in using the N-grams we've calculated
  119. return [term]
  120. else:
  121. # term is longer than our longest N-grams, try the best we can using
  122. # substrings of term. We only need to use the longest N-grams, since
  123. # shorter N-grams will just be substrings of those.
  124. return ngrams.breakup_word(term, NGRAM_MAX, NGRAM_MAX)
  125. def _ngrams_for_item(item_info):
  126. """Given an ItemInfo, return a list of N-grams contained."""
  127. return ngrams.breakup_list(item_info.search_terms, NGRAM_MIN, NGRAM_MAX)
  128. def item_matches(item, search_text):
  129. """Test if a single ItemInfo matches a search
  130. :param item: Item to test
  131. :param search_text: search_text to search with
  132. :returns: True if the item matches the search string
  133. """
  134. parsed_search = _get_boolean_search(search_text)
  135. match_against = [item.title, item.description, item.entry_description]
  136. match_against.append(item.artist)
  137. match_against.append(item.album)
  138. match_against.append(item.genre)
  139. match_against.append(item.get_source_for_search())
  140. if item.filename:
  141. filename = os.path.basename(item.filename)
  142. match_against.append(filename_to_unicode(filename))
  143. match_against_text = (' '.join(term.lower() for term in match_against
  144. if term is not None))
  145. for term in parsed_search.positive_terms:
  146. if term not in match_against_text:
  147. return False
  148. for term in parsed_search.negative_terms:
  149. if term in match_against_text:
  150. return False
  151. return True
  152. def list_matches(item_infos, search_text):
  153. """
  154. Optimized version of item_matches() which filters a iterable
  155. of item_infos.
  156. Right now, the optimization is for a short search string and a lot of
  157. items (the typical case). This will probably be slow for long search
  158. strings since we'll need to iterate over all of the terms.
  159. """
  160. parsed_search = _get_boolean_search(search_text)
  161. positive_set = set()
  162. negative_set = set()
  163. for term in parsed_search.positive_terms:
  164. positive_set |= set(_ngrams_for_term(term))
  165. for term in parsed_search.negative_terms:
  166. negative_set |= set(_ngrams_for_term(term))
  167. for info in item_infos:
  168. item_ngrams_set = set(_ngrams_for_item(info))
  169. match = positive_set.issubset(item_ngrams_set)
  170. if match and negative_set:
  171. match = negative_set.isdisjoint(item_ngrams_set)
  172. if match:
  173. yield info
  174. class ItemSearcher(object):
  175. """Index Item objects so that they can be searched quickly """
  176. def __init__(self):
  177. # map N-grams -> set of item ids
  178. self._ngram_map = collections.defaultdict(set)
  179. # map item id -> list of N-grams
  180. self._item_ngrams = {}
  181. def add_item(self, item_info):
  182. """Add an item info to the index."""
  183. self._add_item(item_info)
  184. def update_item(self, item_info):
  185. """Update the index based on an item info changing.
  186. Raises a KeyError if item_info is not currently in the index
  187. """
  188. self._remove_item(item_info.id)
  189. self._add_item(item_info)
  190. def remove_item(self, item_id):
  191. """Remove an item from the index.
  192. Raises a KeyError if item_info is not currently in the index
  193. """
  194. self._remove_item(item_id)
  195. def _add_item(self, item_info):
  196. item_ngrams = _ngrams_for_item(item_info)
  197. for ngram in item_ngrams:
  198. self._ngram_map[ngram].add(item_info.id)
  199. self._item_ngrams[item_info.id] = item_ngrams
  200. def _remove_item(self, item_id):
  201. for ngram in self._item_ngrams.pop(item_id):
  202. self._ngram_map[ngram].discard(item_id)
  203. def _term_search(self, term):
  204. grams = _ngrams_for_term(term)
  205. # note that we need to copy the value from _ngram_map. We don't want
  206. # our calls to intersection_update to change it.
  207. rv = set(self._ngram_map[grams[0]])
  208. for gram in grams[1:]:
  209. rv.intersection_update(self._ngram_map[gram])
  210. return rv
  211. def search(self, search_text):
  212. """Search through the index items.
  213. :param search_text: search_text to search with
  214. :returns: set of ids that match the search
  215. """
  216. parsed_search = _get_boolean_search(search_text)
  217. # filter out terms smaller than the smallest N-gram we index.
  218. positive_terms = [t for t in parsed_search.positive_terms
  219. if len(t) >= NGRAM_MIN]
  220. negative_terms = [t for t in parsed_search.negative_terms
  221. if len(t) >= NGRAM_MIN]
  222. if positive_terms:
  223. first_term = positive_terms[0]
  224. matching_ids = self._term_search(first_term)
  225. for term in positive_terms[1:]:
  226. matching_ids.intersection_update(self._term_search(term))
  227. else:
  228. matching_ids = set(self._item_ngrams.keys())
  229. for term in negative_terms:
  230. matching_ids.difference_update(self._term_search(term))
  231. return matching_ids
  232. match_against = [item_info.name, item_info.description]
  233. if item_info.artist is not None:
  234. match_against.append(item_info.artist)
  235. if item_info.album is not None:
  236. match_against.append(item_info.album)
  237. if item_info.genre is not None:
  238. match_against.append(item_info.genre)
  239. if item_info.feed_name is not None:
  240. match_against.append(item_info.feed_name)
  241. if item_info.download_info and item_info.download_info.torrent:
  242. match_against.append(u'torrent')
  243. if item_info.video_path:
  244. filename = os.path.basename(item_info.video_path)
  245. match_against.append(filename_to_unicode(filename))
  246. return (' '.join(match_against)).lower()