/tv/lib/search.py
Python | 286 lines | 198 code | 12 blank | 76 comment | 17 complexity | fb7add9228a3af9fe384e54d7ac659f3 MD5 | raw file
- # Miro - an RSS based video player application
- # Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
- # Participatory Culture Foundation
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #
- # In addition, as a special exception, the copyright holders give
- # permission to link the code of portions of this program with the OpenSSL
- # library.
- #
- # You must obey the GNU General Public License in all respects for all of
- # the code used other than OpenSSL. If you modify file(s) with this
- # exception, you may extend this exception to your version of the file(s),
- # but you are not obligated to do so. If you do not wish to do so, delete
- # this exception statement from your version. If you delete this exception
- # statement from all source files in the program, then also delete it here.
- """search.py -- Indexed searching of items.
- To make incremental search fast, we index the N-grams for each item.
- """
- import collections
- import os
- import re
- from miro import ngrams
- from miro.plat.utils import filename_to_unicode
# Matches a double quote that is not escaped by a preceding backslash.
# XXX: only the ASCII double quote is handled; foreign quotation marks are
# not taken into account.
QUOTEKILLER = re.compile(r'(?<!\\)"')
# Matches a backslash together with the character it escapes.
SLASHKILLER = re.compile(r'\\.')
# Matches one word.  Let's hope all this stuff is in Unicode...
# (raw string: "\w" in a plain string literal is an invalid escape sequence)
WORDMATCHER = re.compile(r"\w+", re.UNICODE)
# Lengths of the smallest and largest N-grams used for indexing/searching.
NGRAM_MIN = 3
NGRAM_MAX = 5
# Cache of parsed searches: search string -> BooleanSearch
SEARCHOBJECTS = {}
def _get_boolean_search(search_string):
    """Return the parsed form of a search string, parsing it on first use.

    Parsed searches are memoized in the module-level ``SEARCHOBJECTS`` dict,
    keyed by the exact search string, so repeated calls with the same string
    return the same ``BooleanSearch`` object.

    :param search_string: raw search text
    :returns: the ``BooleanSearch`` for search_string
    """
    # NOTE(review): entries are never evicted from SEARCHOBJECTS, so the
    # cache grows with every distinct string searched for.
    # ``dict.has_key()`` is gone in Python 3; ``in`` works on every version.
    if search_string not in SEARCHOBJECTS:
        SEARCHOBJECTS[search_string] = BooleanSearch(search_string)
    return SEARCHOBJECTS[search_string]
class BooleanSearch(object):
    """A search string parsed into positive and negative terms.

    The string is split on spaces.  Double quotes group several words into a
    single term, a backslash escapes the character that follows it, and a
    leading ``-`` marks a term that must NOT match.  Terms are stored
    lower-cased, with the quoting and escaping removed, in
    ``positive_terms`` and ``negative_terms``.
    """

    def __init__(self, search_string):
        """Parse search_string right away.

        :param search_string: raw search text
        """
        self.string = search_string
        self.positive_terms = []
        self.negative_terms = []
        self.parse_string()

    def parse_string(self):
        """Split ``self.string`` into raw terms and hand each to process()."""
        text = self.string
        end = len(text)
        pos = 0
        inquote = False
        while True:
            # skip the spaces that separate terms
            while pos < end and text[pos] == ' ':
                pos += 1
            if pos >= end:
                break
            start = pos
            while pos < end:
                char = text[pos]
                if char == '"':
                    inquote = not inquote
                if char == ' ' and not inquote:
                    break
                if char == '\\':
                    # step over the escaped character so that an escaped
                    # quote doesn't toggle the quoting state
                    pos += 1
                pos += 1
            # pos may have run past the end (unterminated quote or trailing
            # backslash); slicing clamps, so the term extends to the end of
            # the string in that case.
            self.process(text[start:pos])

    def process(self, substring):
        """Normalize one raw term and file it as positive or negative.

        A term that is empty once the leading ``-``, the quotes and the
        escapes are removed (e.g. a lone ``-`` or ``""``) is dropped: the
        empty string is a substring of every text, so keeping it as a
        negative term would make substring matching reject everything.

        :param substring: one raw term as split out by parse_string()
        """
        if substring.startswith('-'):
            substring = substring[1:]
            term_list = self.negative_terms
        else:
            term_list = self.positive_terms
        # drop unescaped quotes, then replace each escape with the character
        # it escapes
        substring = QUOTEKILLER.sub("", substring)
        substring = SLASHKILLER.sub(lambda match: match.group(0)[1],
                                    substring)
        term = substring.lower()
        if term:
            term_list.append(term)

    def as_string(self):
        """Return the original, unparsed search string."""
        return self.string
- def _calc_search_text(item_info):
- match_against = [item_info.name, item_info.description]
- if item_info.artist is not None:
- match_against.append(item_info.artist)
- if item_info.album is not None:
- match_against.append(item_info.album)
- if item_info.genre is not None:
- match_against.append(item_info.genre)
- if item_info.feed_name is not None:
- match_against.append(item_info.feed_name)
- if item_info.download_info and item_info.download_info.torrent:
- match_against.append(u'torrent')
- if item_info.video_path:
- filename = os.path.basename(item_info.video_path)
- match_against.append(filename_to_unicode(filename))
- return (' '.join(match_against)).lower()
def calc_search_terms(item_info):
    """Split the searchable text of an ItemInfo into the words we index.

    :param item_info: item info to extract the terms from
    :returns: list of every WORDMATCHER match in the item's search text
    """
    searchable_text = _calc_search_text(item_info)
    return WORDMATCHER.findall(searchable_text)
def _ngrams_for_term(term):
    """Work out which N-grams to look up for a single search term.

    * A term shorter than NGRAM_MIN gives an empty list, which makes the
      term match everything.
    * A term of NGRAM_MIN to NGRAM_MAX characters is looked up as-is.
    * A longer term is split by ngrams.breakup_word() into N-grams of
      length NGRAM_MAX.

    :param term: search term
    :returns: list of N-grams to search for
    """
    length = len(term)
    if length > NGRAM_MAX:
        # Longer than anything we index: do the best we can with pieces of
        # the term.  Only the longest N-grams are requested, since shorter
        # ones would just be substrings of those.
        return ngrams.breakup_word(term, NGRAM_MAX, NGRAM_MAX)
    if length >= NGRAM_MIN:
        # normal case: the term is itself one of the N-grams we calculated
        return [term]
    # shorter than our smallest N-gram: nothing to look up
    return []
def _ngrams_for_item(item_info):
    """Compute the N-grams of an ItemInfo's search terms.

    :param item_info: item info providing a ``search_terms`` attribute
    :returns: whatever ngrams.breakup_list() produces for those terms with
        N-gram lengths from NGRAM_MIN to NGRAM_MAX
    """
    terms = item_info.search_terms
    return ngrams.breakup_list(terms, NGRAM_MIN, NGRAM_MAX)
def item_matches(item, search_text):
    """Check one item against a search by plain substring matching.

    The item matches when every positive term of the search occurs in its
    searchable text and no negative term does.

    :param item: Item to test
    :param search_text: search_text to search with
    :returns: True if the item matches the search string
    """
    query = _get_boolean_search(search_text)
    fields = [
        item.title,
        item.description,
        item.entry_description,
        item.artist,
        item.album,
        item.genre,
        item.get_source_for_search(),
    ]
    if item.filename:
        fields.append(filename_to_unicode(os.path.basename(item.filename)))
    # fields that are None simply don't take part in the match
    haystack = ' '.join(field.lower() for field in fields
                        if field is not None)
    if not all(term in haystack for term in query.positive_terms):
        return False
    return not any(term in haystack for term in query.negative_terms)
def list_matches(item_infos, search_text):
    """Optimized version of item_matches() which filters an iterable of
    item_infos.

    Right now, the optimization is for a short search string and a lot of
    items (the typical case).  This will probably be slow for long search
    strings since we'll need to iterate over all of the terms.

    An item info matches when it contains every N-gram of every positive
    term, and there is no negative term whose N-grams it contains in full.

    :param item_infos: iterable of item infos to filter
    :param search_text: search_text to search with
    :returns: generator yielding the matching item infos, in input order
    """
    parsed_search = _get_boolean_search(search_text)
    positive_set = set()
    for term in parsed_search.positive_terms:
        positive_set.update(_ngrams_for_term(term))
    # Keep the N-grams of each negative term separate.  A term that gets
    # split into several N-grams only occurs in an item if ALL of them do;
    # merging everything into one set and excluding items that contain ANY
    # of the N-grams would throw out items that merely share a fragment
    # with the term (and disagree with ItemSearcher.search()).
    negative_sets = []
    for term in parsed_search.negative_terms:
        term_ngrams = set(_ngrams_for_term(term))
        if term_ngrams:
            # terms without N-grams (too short) exclude nothing
            negative_sets.append(term_ngrams)
    for info in item_infos:
        item_ngrams_set = set(_ngrams_for_item(info))
        if not positive_set.issubset(item_ngrams_set):
            continue
        if any(term_ngrams.issubset(item_ngrams_set)
               for term_ngrams in negative_sets):
            continue
        yield info
class ItemSearcher(object):
    """Index item infos by N-gram so that they can be searched quickly.

    Two structures are kept in sync: a map from each N-gram to the ids of
    the items containing it, and a map from each item id to the N-grams
    that were indexed for it (needed to un-index the item again).
    """

    def __init__(self):
        # map N-grams -> set of item ids
        self._ngram_map = collections.defaultdict(set)
        # map item id -> N-grams indexed for that item
        self._item_ngrams = {}

    def add_item(self, item_info):
        """Add an item info to the index."""
        self._add_item(item_info)

    def update_item(self, item_info):
        """Update the index based on an item info changing.

        Raises a KeyError if item_info is not currently in the index
        """
        self._remove_item(item_info.id)
        self._add_item(item_info)

    def remove_item(self, item_id):
        """Remove an item from the index.

        Raises a KeyError if item_id is not currently in the index
        """
        self._remove_item(item_id)

    def _add_item(self, item_info):
        """Index item_info under each of its N-grams."""
        item_ngrams = _ngrams_for_item(item_info)
        for ngram in item_ngrams:
            self._ngram_map[ngram].add(item_info.id)
        self._item_ngrams[item_info.id] = item_ngrams

    def _remove_item(self, item_id):
        """Un-index item_id; raises KeyError if it was never indexed."""
        for ngram in self._item_ngrams.pop(item_id):
            ids = self._ngram_map.get(ngram)
            if ids is None:
                # already pruned, e.g. the N-gram is listed more than once
                continue
            ids.discard(item_id)
            if not ids:
                # don't let N-grams that no item uses anymore pile up
                del self._ngram_map[ngram]

    def _term_search(self, term):
        """Return the set of ids of the items containing every N-gram of term.

        term must be at least NGRAM_MIN characters long (search() filters
        out shorter terms before calling this).
        """
        grams = _ngrams_for_term(term)
        # Look N-grams up with .get() rather than []: indexing the
        # defaultdict would insert an empty set for every unknown N-gram
        # that is ever searched for.  The first set is copied since we
        # don't want intersection_update() to change the index.
        rv = set(self._ngram_map.get(grams[0], ()))
        for gram in grams[1:]:
            rv.intersection_update(self._ngram_map.get(gram, ()))
        return rv

    def search(self, search_text):
        """Search through the index items.

        :param search_text: search_text to search with
        :returns: set of ids that match the search
        """
        parsed_search = _get_boolean_search(search_text)
        # filter out terms smaller than the smallest N-gram we index.
        positive_terms = [t for t in parsed_search.positive_terms
                          if len(t) >= NGRAM_MIN]
        negative_terms = [t for t in parsed_search.negative_terms
                          if len(t) >= NGRAM_MIN]
        if positive_terms:
            matching_ids = self._term_search(positive_terms[0])
            for term in positive_terms[1:]:
                matching_ids.intersection_update(self._term_search(term))
        else:
            # no usable positive term: start from every indexed item
            matching_ids = set(self._item_ngrams)
        for term in negative_terms:
            matching_ids.difference_update(self._term_search(term))
        return matching_ids
- match_against = [item_info.name, item_info.description]
- if item_info.artist is not None:
- match_against.append(item_info.artist)
- if item_info.album is not None:
- match_against.append(item_info.album)
- if item_info.genre is not None:
- match_against.append(item_info.genre)
- if item_info.feed_name is not None:
- match_against.append(item_info.feed_name)
- if item_info.download_info and item_info.download_info.torrent:
- match_against.append(u'torrent')
- if item_info.video_path:
- filename = os.path.basename(item_info.video_path)
- match_against.append(filename_to_unicode(filename))
- return (' '.join(match_against)).lower()