/sickbeard/name_parser/parser.py
Python | 390 lines | 360 code | 7 blank | 23 comment | 0 complexity | e72ac4a0ebafe061caff119341aa2f98 MD5 | raw file
- # Author: Nic Wolfe <nic@wolfeden.ca>
- # URL: http://code.google.com/p/sickbeard/
- #
- # This file is part of Sick Beard.
- #
- # Sick Beard is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # Sick Beard is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with Sick Beard. If not, see <http://www.gnu.org/licenses/>.
-
- import datetime
- import os.path
- import re
-
- import regexes
-
- import sickbeard
-
- from sickbeard import logger
- from sickbeard.common import showLanguages
-
class NameParser(object):
    """Parse a release or file name into a ParseResult.

    Episode patterns are read from ``regexes.ep_regexes`` and audio-language
    patterns from ``regexes.language_regexes``; both sets are compiled once
    when the parser is constructed.
    """

    # Roman numerals accepted as episode numbers (lower-case spelling -> value).
    _ROMAN_NUMERALS = {
        'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5,
        'vi': 6, 'vii': 7, 'viii': 8, 'ix': 9, 'x': 10,
        'xi': 11, 'xii': 12, 'xiii': 13, 'xiv': 14, 'xv': 15,
        'xvi': 16, 'xvii': 17, 'xviii': 18, 'xix': 19, 'xx': 20,
        'xxi': 21, 'xxii': 22, 'xxiii': 23, 'xxiv': 24, 'xxv': 25,
        'xxvi': 26, 'xxvii': 27, 'xxviii': 28, 'xxix': 29,
    }

    def __init__(self, file_name=True):
        """Create a parser and compile all known patterns.

        file_name -- when true, parse() strips a trailing 3-4 character
                     extension from the last path component before matching.
        """
        self.file_name = file_name
        self.compiled_regexes = []
        self.compiled_language_regexes = []
        self._compile_regexes()

    def clean_series_name(self, series_name):
        """Cleans up series name by removing any . and _
        characters, along with any trailing hyphens.

        Is basically equivalent to replacing all _ and . with a
        space, but handles decimal numbers in string, for example:

            clean_series_name("an.example.1.0.test")  ->  'an example 1.0 test'
            clean_series_name("an_example_1.0_test")  ->  'an example 1.0 test'

        Stolen from dbr's tvnamer
        """
        series_name = re.sub(r"(\D)\.(?!\s)(\D)", r"\1 \2", series_name)
        # if it ends in a year then don't keep the dot
        series_name = re.sub(r"(\d)\.(\d{4})", r"\1 \2", series_name)
        series_name = re.sub(r"(\D)\.(?!\s)", r"\1 ", series_name)
        series_name = re.sub(r"\.(?!\s)(\D)", r" \1", series_name)
        series_name = series_name.replace("_", " ")
        series_name = re.sub(r"-$", "", series_name)
        return series_name.strip()

    def _compile_regexes(self):
        """Compile the episode and language patterns.

        A pattern that fails to compile is logged and skipped; the remaining
        patterns are still used.
        """
        for (cur_pattern_name, cur_pattern) in regexes.ep_regexes:
            try:
                cur_regex = re.compile(cur_pattern, re.VERBOSE | re.IGNORECASE)
            except re.error as errormsg:
                logger.log(u"WARNING: Invalid episode_pattern, %s. %s" % (errormsg, cur_pattern))
            else:
                self.compiled_regexes.append((cur_pattern_name, cur_regex))

        for (cur_pattern_name, cur_pattern) in regexes.language_regexes.items():
            try:
                cur_regex = re.compile(cur_pattern, re.VERBOSE | re.IGNORECASE)
            except re.error as errormsg:
                # Report the source pattern: no compiled regex exists for it
                # when compilation is what failed.
                logger.log(u"WARNING: Invalid language_pattern, %s. %s" % (errormsg, cur_pattern))
            else:
                self.compiled_language_regexes.append((cur_pattern_name, cur_regex))

    def _parse_string(self, name):
        """Match ``name`` against the episode patterns, in order.

        Returns a ParseResult built from the first pattern that matches and
        is not rejected by one of the sanity checks below, or None when
        ``name`` is empty or nothing matches.

        Raises InvalidNameException when a matched air date is not a valid
        calendar date.
        """
        if not name:
            return None

        for (cur_regex_name, cur_regex) in self.compiled_regexes:
            match = cur_regex.match(name)
            if not match:
                continue

            result = ParseResult(name)
            result.which_regex = [cur_regex_name]

            named_groups = match.groupdict()

            if 'series_name' in named_groups:
                result.series_name = match.group('series_name')
                if result.series_name:
                    result.series_name = self.clean_series_name(result.series_name)

            if 'season_num' in named_groups:
                tmp_season = int(match.group('season_num'))
                # the 'bare' pattern is rejected for seasons 19 and 20;
                # try the next pattern instead
                if cur_regex_name == 'bare' and tmp_season in (19, 20):
                    continue
                result.season_number = tmp_season

            if 'ep_num' in named_groups:
                ep_num = self._convert_number(match.group('ep_num'))
                if 'extra_ep_num' in named_groups and match.group('extra_ep_num'):
                    last_ep_num = self._convert_number(match.group('extra_ep_num'))
                    # list() keeps episode_numbers a real list on Python 3 too
                    result.episode_numbers = list(range(ep_num, last_ep_num + 1))
                else:
                    result.episode_numbers = [ep_num]

            if 'air_year' in named_groups and 'air_month' in named_groups and 'air_day' in named_groups:
                year = int(match.group('air_year'))
                month = int(match.group('air_month'))
                day = int(match.group('air_day'))

                # make an attempt to detect YYYY-DD-MM formats
                if month > 12:
                    month, day = day, month

                try:
                    result.air_date = datetime.date(year, month, day)
                except ValueError as e:
                    raise InvalidNameException(str(e))

            if 'extra_info' in named_groups:
                tmp_extra_info = match.group('extra_info')

                result.audio_langs = 'en'

                if tmp_extra_info:
                    # Language patterns are matched against the whole name;
                    # when several match, the last one wins.
                    for (cur_lang_regex_name, cur_lang_regex) in self.compiled_language_regexes:
                        lang_match = cur_lang_regex.match(name)
                        if not lang_match:
                            continue
                        logger.log(u"Found " + showLanguages.get(cur_lang_regex_name) + " episode", logger.DEBUG)
                        result.audio_langs = cur_lang_regex_name

                # Show.S04.Special is almost certainly not every episode in the season
                if tmp_extra_info and cur_regex_name == 'season_only' and re.match(r'([. _-]|^)(special|extra)\w*([. _-]|$)', tmp_extra_info, re.I):
                    continue
                result.extra_info = tmp_extra_info

            if 'release_group' in named_groups:
                result.release_group = match.group('release_group')

            return result

        return None

    def _combine_results(self, first, second, attr):
        """Pick the value of ``attr`` from two optional parse results.

        ``first`` is preferred. ``second`` is used when ``first`` is missing,
        when the value on ``first`` is None, or when the value on ``first`` is
        an empty list and ``second`` has something to offer. Returns None
        when both results are missing.
        """
        # if the first doesn't exist then return the second or nothing
        if not first:
            if not second:
                return None
            return getattr(second, attr)

        # if the second doesn't exist then return the first
        if not second:
            return getattr(first, attr)

        a = getattr(first, attr)
        b = getattr(second, attr)

        # an unset value is never "good"
        if a is None:
            return b
        # an empty list (e.g. no episode numbers) only wins when the other
        # side has nothing better
        if isinstance(a, list) and not a and b:
            return b
        return a

    def _unicodify(self, obj, encoding="utf-8"):
        """Decode a byte string to unicode; return anything else unchanged."""
        if isinstance(obj, basestring):
            if not isinstance(obj, unicode):
                obj = unicode(obj, encoding)
        return obj

    def _convert_number(self, number):
        """Convert an episode number to an int.

        Accepts an int, a decimal string, or a roman numeral from I to XXIX
        in any letter case. Any other string raises ValueError from int().
        """
        if isinstance(number, int):
            return number

        roman_value = self._ROMAN_NUMERALS.get(number.lower())
        if roman_value is not None:
            return roman_value

        return int(number)

    def parse(self, name):
        """Parse a path or release name and return a ParseResult.

        The last path component and its direct parent directory are parsed
        separately and merged: air date, audio language, season and episode
        numbers prefer the file name, while series name, extra info and
        release group prefer the directory name.

        Results are stored in, and served from, the module-level
        name_parser_cache.

        Raises InvalidNameException when no season, episode, air date or
        series name could be extracted, or when a matched air date is not a
        valid date.
        """
        name = self._unicodify(name)

        # NOTE(review): the cache is keyed by name only, so parsers created
        # with different file_name settings share entries - confirm intended.
        cached = name_parser_cache.get(name)
        if cached:
            return cached

        # break it into parts if there are any (dirname, file name, extension)
        dir_name, file_name = os.path.split(name)
        ext_match = re.match(r'(.*)\.\w{3,4}$', file_name)
        if ext_match and self.file_name:
            base_file_name = ext_match.group(1)
        else:
            base_file_name = file_name

        # use only the direct parent dir
        dir_name = os.path.basename(dir_name)

        # set up a result to use
        final_result = ParseResult(name)

        # Remove up to two occurrences of each year 1985-2019 from the file
        # name before matching.
        # NOTE(review): the range is hard-coded and this also removes the
        # year of an air-date style file name - confirm this is intended.
        for year in range(1985, 2020):
            base_file_name = base_file_name.replace(str(year), "", 2)

        # try parsing the file name
        file_name_result = self._parse_string(base_file_name)

        # parse the dirname for extra info if needed
        dir_name_result = self._parse_string(dir_name)

        # build the ParseResult object
        final_result.air_date = self._combine_results(file_name_result, dir_name_result, 'air_date')
        final_result.audio_langs = self._combine_results(file_name_result, dir_name_result, 'audio_langs')

        if not final_result.air_date:
            final_result.season_number = self._combine_results(file_name_result, dir_name_result, 'season_number')
            final_result.episode_numbers = self._combine_results(file_name_result, dir_name_result, 'episode_numbers')

        # if the dirname has a release group/show name I believe it over the filename
        final_result.series_name = self._combine_results(dir_name_result, file_name_result, 'series_name')
        final_result.extra_info = self._combine_results(dir_name_result, file_name_result, 'extra_info')
        final_result.release_group = self._combine_results(dir_name_result, file_name_result, 'release_group')

        # record which pattern(s) contributed to the final result
        final_result.which_regex = []
        if final_result == file_name_result:
            final_result.which_regex = file_name_result.which_regex
        elif final_result == dir_name_result:
            final_result.which_regex = dir_name_result.which_regex
        else:
            if file_name_result:
                final_result.which_regex += file_name_result.which_regex
            if dir_name_result:
                final_result.which_regex += dir_name_result.which_regex

        # if there's no useful info in it then raise an exception
        if (final_result.season_number is None
                and not final_result.episode_numbers
                and final_result.air_date is None
                and not final_result.series_name):
            raise InvalidNameException("Unable to parse "+name.encode(sickbeard.SYS_ENCODING))

        name_parser_cache.add(name, final_result)
        # return it
        return final_result
-
class ParseResult(object):
    """The pieces of information extracted from one parsed name.

    Two results compare equal when every extracted field matches;
    original_name and which_regex do not take part in the comparison.
    """

    # Fields that take part in equality, in comparison order.
    _COMPARED_FIELDS = (
        'series_name',
        'season_number',
        'episode_numbers',
        'extra_info',
        'release_group',
        'air_date',
        'audio_langs',
    )

    def __init__(self,
                 original_name,
                 series_name=None,
                 season_number=None,
                 episode_numbers=None,
                 extra_info=None,
                 release_group=None,
                 air_date=None,
                 audio_langs='en'
                 ):
        """Store the given fields; a missing episode list becomes []."""
        self.original_name = original_name

        self.series_name = series_name
        self.season_number = season_number
        # every instance gets its own list rather than sharing a default
        self.episode_numbers = episode_numbers if episode_numbers else []

        self.extra_info = extra_info
        self.release_group = release_group

        self.air_date = air_date

        # name(s) of the pattern(s) that produced this result; set by the parser
        self.which_regex = None

        self.audio_langs = audio_langs

    def __eq__(self, other):
        """Return True when ``other`` is truthy and all compared fields match."""
        if not other:
            return False

        return all(getattr(self, field) == getattr(other, field)
                   for field in self._COMPARED_FIELDS)

    def __ne__(self, other):
        """Inverse of __eq__.

        Python 2 does not derive != from ==, so without this method two
        equal-valued results would still compare as different.
        """
        return not self.__eq__(other)

    def __str__(self):
        """Return a UTF-8 encoded, human-readable summary of the result."""
        if self.series_name is not None:
            to_return = self.series_name + u' - '
        else:
            to_return = u''
        if self.season_number is not None:
            to_return += 'S'+str(self.season_number)
        if self.episode_numbers:
            for e in self.episode_numbers:
                to_return += 'E'+str(e)

        if self.air_by_date:
            to_return += str(self.air_date)

        if self.extra_info:
            to_return += ' - ' + self.extra_info
        if self.release_group:
            to_return += ' (' + self.release_group + ')'

        to_return += ' [ABD: '+str(self.air_by_date)+']'

        return to_return.encode('utf-8')

    def _is_air_by_date(self):
        """True when the result has an air date but no season or episodes."""
        if self.season_number is None and len(self.episode_numbers) == 0 and self.air_date:
            return True
        return False
    air_by_date = property(_is_air_by_date)
-
class NameParserCache(object):
    """Small first-in, first-out cache of parse results, keyed by name.

    The storage lives in class attributes, so every instance shares the
    same entries.
    """
    #TODO: check if the fifo list can be skipped and only use one dict
    _previous_parsed_list = []  # keep a fifo list of the cached items
    _previous_parsed = {}
    _cache_size = 100

    def add(self, name, parse_result):
        """Store ``parse_result`` under ``name``, evicting the oldest entries.

        Re-adding a name that is already cached replaces its value and keeps
        its original position in the eviction order.
        """
        # Queue each name only once: a duplicate entry would later be evicted
        # a second time, after its dict key is already gone.
        if name not in self._previous_parsed:
            self._previous_parsed_list.append(name)
        self._previous_parsed[name] = parse_result

        while len(self._previous_parsed_list) > self._cache_size:
            del_me = self._previous_parsed_list.pop(0)
            # tolerate a key that has already been removed
            self._previous_parsed.pop(del_me, None)

    def get(self, name):
        """Return the cached result for ``name``, or None when not cached."""
        if name in self._previous_parsed:
            logger.log("Using cached parse result for: " + name, logger.DEBUG)
            return self._previous_parsed[name]
        else:
            return None


# shared cache used by NameParser.parse()
name_parser_cache = NameParserCache()
-
class InvalidNameException(Exception):
    """Raised when the given name is not valid and cannot be parsed."""