/feedzilla/processors/content_filter.py
Python | 70 lines | 69 code | 0 blank | 1 comment | 1 complexity | 8f370c2113f451249cc420b5fea51853 MD5 | raw file
Possible License(s): BSD-3-Clause
- """
- This module provides functions to filter posts by keywords in a post's content and tags.
- Case-insensitive matching does not rely on the re.I flag alone, because the current
- locale does not affect it. Instead, both the searched text and the regexp are lower-cased.
- """
- # Copyright: 2011, Grigoriy Petukhov
- # Author: Grigoriy Petukhov (http://lorien.name)
- # License: BSD
- import re
- import locale
- from django.utils.html import strip_tags
- from feedzilla.models import FilterTag, FilterWord
# Module-level caches of compiled filter regexps: TAGS holds the patterns
# built from FilterTag rows, WORDS the ones built from FilterWord rows.
# Both start empty and are filled by load_filters().
TAGS, WORDS = [], []
def build_regexp(value, exact):
    """
    Build a compiled regexp for a tag/word filter.

    `value` is lower-cased and then used as the regexp source as-is; it is
    not escaped, so any regexp metacharacters it contains keep their meaning.

    If `exact` is true, the pattern is wrapped in word-boundary anchors so
    that it only matches a whole word, i.e. the matched fragment must be
    delimited by non-word characters or by the start/end of the text.

    Returns the pattern compiled with the re.U and re.I flags.
    """
    # The backslashes are doubled on purpose: in a non-raw literal a single
    # '\b' is the backspace character (0x08), not the regexp word boundary.
    # NOTE: lower-casing applies to the whole pattern, including any regexp
    # escape sequences present in `value`.
    template = u'\\b%s\\b' if exact else u'%s'
    return re.compile(template % value.lower(), re.U | re.I)
def load_filters():
    """
    (Re)build the compiled regexp objects for all filters.

    Every FilterTag row is compiled into TAGS and every FilterWord row into
    WORDS, using build_regexp(obj.value, obj.exact).

    The module-level lists are refilled in place (slice assignment), so the
    list objects keep their identity and calling this function again
    refreshes the filters instead of appending duplicates of the ones
    loaded earlier.
    """
    TAGS[:] = [build_regexp(obj.value, obj.exact)
               for obj in FilterTag.objects.all()]
    WORDS[:] = [build_regexp(obj.value, obj.exact)
                for obj in FilterWord.objects.all()]
class ContentFilterProcessor(object):
    """
    Processor that marks a post as active or inactive depending on whether
    one of the loaded tag/word filters matches the post.
    """

    def process(self, post):
        """Store the result of match_filters(post) in `post.active`."""
        post.active = self.match_filters(post)

    def match_filters(self, post):
        """
        Tell whether any filter matches the post.

        Returns True as soon as a pattern from TAGS matches the lower-cased
        name of one of the post's tags, or a pattern from WORDS matches the
        lower-cased, tag-stripped content or title. Returns False when
        nothing matches.
        """
        tag_hit = any(
            pattern.search(tag.name.lower())
            for tag in post.tags.all()
            for pattern in TAGS
        )
        if tag_hit:
            return True

        heading = strip_tags(post.title).lower()
        body = strip_tags(post.content).lower()
        # Each word filter is tried on the content first, then on the title.
        return any(
            pattern.search(body) or pattern.search(heading)
            for pattern in WORDS
        )
# Import-time side effect: fill TAGS and WORDS from the FilterTag and
# FilterWord tables as soon as this module is loaded.
load_filters()