PageRenderTime 31ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/sickbeard/name_parser/parser.py

https://gitlab.com/akila-33/Sick-Beard
Python | 390 lines | 360 code | 7 blank | 23 comment | 0 complexity | e72ac4a0ebafe061caff119341aa2f98 MD5 | raw file
# Author: Nic Wolfe <nic@wolfeden.ca>
# URL: http://code.google.com/p/sickbeard/
#
# This file is part of Sick Beard.
#
# Sick Beard is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Sick Beard is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Sick Beard. If not, see <http://www.gnu.org/licenses/>.
  18. import datetime
  19. import os.path
  20. import re
  21. import regexes
  22. import sickbeard
  23. from sickbeard import logger
  24. from sickbeard.common import showLanguages
  25. class NameParser(object):
  26. def __init__(self, file_name=True):
  27. self.file_name = file_name
  28. self.compiled_regexes = []
  29. self.compiled_language_regexes =[]
  30. self._compile_regexes()
  31. def clean_series_name(self, series_name):
  32. """Cleans up series name by removing any . and _
  33. characters, along with any trailing hyphens.
  34. Is basically equivalent to replacing all _ and . with a
  35. space, but handles decimal numbers in string, for example:
  36. >>> cleanRegexedSeriesName("an.example.1.0.test")
  37. 'an example 1.0 test'
  38. >>> cleanRegexedSeriesName("an_example_1.0_test")
  39. 'an example 1.0 test'
  40. Stolen from dbr's tvnamer
  41. """
  42. series_name = re.sub("(\D)\.(?!\s)(\D)", "\\1 \\2", series_name)
  43. series_name = re.sub("(\d)\.(\d{4})", "\\1 \\2", series_name) # if it ends in a year then don't keep the dot
  44. series_name = re.sub("(\D)\.(?!\s)", "\\1 ", series_name)
  45. series_name = re.sub("\.(?!\s)(\D)", " \\1", series_name)
  46. series_name = series_name.replace("_", " ")
  47. series_name = re.sub("-$", "", series_name)
  48. return series_name.strip()
  49. def _compile_regexes(self):
  50. for (cur_pattern_name, cur_pattern) in regexes.ep_regexes:
  51. try:
  52. cur_regex = re.compile(cur_pattern, re.VERBOSE | re.IGNORECASE)
  53. except re.error, errormsg:
  54. logger.log(u"WARNING: Invalid episode_pattern, %s. %s" % (errormsg, cur_pattern))
  55. else:
  56. self.compiled_regexes.append((cur_pattern_name, cur_regex))
  57. for (cur_pattern_name, cur_pattern) in regexes.language_regexes.iteritems():
  58. try:
  59. cur_regex = re.compile(cur_pattern, re.VERBOSE | re.IGNORECASE)
  60. except re.error, errormsg:
  61. logger.log(u"WARNING: Invalid language_pattern, %s. %s" % (errormsg, cur_regex.pattern))
  62. else:
  63. self.compiled_language_regexes.append((cur_pattern_name, cur_regex))
  64. def _parse_string(self, name):
  65. if not name:
  66. return None
  67. for (cur_regex_name, cur_regex) in self.compiled_regexes:
  68. match = cur_regex.match(name)
  69. if not match:
  70. continue
  71. result = ParseResult(name)
  72. result.which_regex = [cur_regex_name]
  73. named_groups = match.groupdict().keys()
  74. if 'series_name' in named_groups:
  75. result.series_name = match.group('series_name')
  76. if result.series_name:
  77. result.series_name = self.clean_series_name(result.series_name)
  78. if 'season_num' in named_groups:
  79. tmp_season = int(match.group('season_num'))
  80. if cur_regex_name == 'bare' and tmp_season in (19,20):
  81. continue
  82. result.season_number = tmp_season
  83. if 'ep_num' in named_groups:
  84. ep_num = self._convert_number(match.group('ep_num'))
  85. if 'extra_ep_num' in named_groups and match.group('extra_ep_num'):
  86. result.episode_numbers = range(ep_num, self._convert_number(match.group('extra_ep_num'))+1)
  87. else:
  88. result.episode_numbers = [ep_num]
  89. if 'air_year' in named_groups and 'air_month' in named_groups and 'air_day' in named_groups:
  90. year = int(match.group('air_year'))
  91. month = int(match.group('air_month'))
  92. day = int(match.group('air_day'))
  93. # make an attempt to detect YYYY-DD-MM formats
  94. if month > 12:
  95. tmp_month = month
  96. month = day
  97. day = tmp_month
  98. try:
  99. result.air_date = datetime.date(year, month, day)
  100. except ValueError, e:
  101. raise InvalidNameException(e.message)
  102. if 'extra_info' in named_groups:
  103. tmp_extra_info = match.group('extra_info')
  104. result.audio_langs = 'en'
  105. if tmp_extra_info:
  106. for (cur_lang_regex_name, cur_lang_regex) in self.compiled_language_regexes:
  107. lang_match = cur_lang_regex.match(name)
  108. if not lang_match:
  109. continue
  110. else:
  111. logger.log(u"Found " + showLanguages.get(cur_lang_regex_name) + " episode",logger.DEBUG)
  112. result.audio_langs = cur_lang_regex_name
  113. #if tmp_extra_info and re.search(r'(^|\w|[. _-])*(german)(([. _-])(dubbed))?\w*([. _-]|$)', tmp_extra_info, re.I):
  114. # logger.log(u"Found german episode")
  115. #result.series_language = 'de'
  116. # Show.S04.Special is almost certainly not every episode in the season
  117. if tmp_extra_info and cur_regex_name == 'season_only' and re.match(r'([. _-]|^)(special|extra)\w*([. _-]|$)', tmp_extra_info, re.I):
  118. continue
  119. result.extra_info = tmp_extra_info
  120. if 'release_group' in named_groups:
  121. result.release_group = match.group('release_group')
  122. return result
  123. return None
  124. def _combine_results(self, first, second, attr):
  125. # if the first doesn't exist then return the second or nothing
  126. if not first:
  127. if not second:
  128. return None
  129. else:
  130. return getattr(second, attr)
  131. # if the second doesn't exist then return the first
  132. if not second:
  133. return getattr(first, attr)
  134. a = getattr(first, attr)
  135. b = getattr(second, attr)
  136. # if a is good use it
  137. if a != None or (type(a) == list and len(a)):
  138. return a
  139. # if not use b (if b isn't set it'll just be default)
  140. else:
  141. return b
  142. def _unicodify(self, obj, encoding = "utf-8"):
  143. if isinstance(obj, basestring):
  144. if not isinstance(obj, unicode):
  145. obj = unicode(obj, encoding)
  146. return obj
  147. def _convert_number(self, number):
  148. if type(number) == int:
  149. return number
  150. # good lord I'm lazy
  151. if number.lower() == 'i': return 1
  152. if number.lower() == 'ii': return 2
  153. if number.lower() == 'iii': return 3
  154. if number.lower() == 'iv': return 4
  155. if number.lower() == 'v': return 5
  156. if number.lower() == 'vi': return 6
  157. if number.lower() == 'vii': return 7
  158. if number.lower() == 'viii': return 8
  159. if number.lower() == 'ix': return 9
  160. if number.lower() == 'x': return 10
  161. if number.lower() == 'xi': return 11
  162. if number.lower() == 'xii': return 12
  163. if number.lower() == 'xiii': return 13
  164. if number.lower() == 'xiv': return 14
  165. if number.lower() == 'xv': return 15
  166. if number.lower() == 'xvi': return 16
  167. if number.lower() == 'xvii': return 17
  168. if number.lower() == 'xviii': return 18
  169. if number.lower() == 'xix': return 19
  170. if number.lower() == 'xx': return 20
  171. if number.lower() == 'xxi': return 21
  172. if number.lower() == 'xxii': return 22
  173. if number.lower() == 'xxiii': return 23
  174. if number.lower() == 'xxiv': return 24
  175. if number.lower() == 'xxv': return 25
  176. if number.lower() == 'xxvi': return 26
  177. if number.lower() == 'xxvii': return 27
  178. if number.lower() == 'xxviii': return 28
  179. if number.lower() == 'xxix': return 29
  180. return int(number)
  181. def parse(self, name):
  182. name = self._unicodify(name)
  183. cached = name_parser_cache.get(name)
  184. if cached:
  185. return cached
  186. # break it into parts if there are any (dirname, file name, extension)
  187. dir_name, file_name = os.path.split(name)
  188. ext_match = re.match('(.*)\.\w{3,4}$', file_name)
  189. if ext_match and self.file_name:
  190. base_file_name = ext_match.group(1)
  191. else:
  192. base_file_name = file_name
  193. # use only the direct parent dir
  194. dir_name = os.path.basename(dir_name)
  195. # set up a result to use
  196. final_result = ParseResult(name)
  197. # try parsing the file name
  198. for i in range(1985,2020):
  199. base_file_name=base_file_name.replace(str(i),"",2)
  200. file_name_result = self._parse_string(base_file_name)
  201. # parse the dirname for extra info if needed
  202. dir_name_result = self._parse_string(dir_name)
  203. # build the ParseResult object
  204. final_result.air_date = self._combine_results(file_name_result, dir_name_result, 'air_date')
  205. final_result.audio_langs = self._combine_results(file_name_result, dir_name_result, 'audio_langs')
  206. if not final_result.air_date:
  207. final_result.season_number = self._combine_results(file_name_result, dir_name_result, 'season_number')
  208. final_result.episode_numbers = self._combine_results(file_name_result, dir_name_result, 'episode_numbers')
  209. # if the dirname has a release group/show name I believe it over the filename
  210. final_result.series_name = self._combine_results(dir_name_result, file_name_result, 'series_name')
  211. final_result.extra_info = self._combine_results(dir_name_result, file_name_result, 'extra_info')
  212. final_result.release_group = self._combine_results(dir_name_result, file_name_result, 'release_group')
  213. final_result.which_regex = []
  214. if final_result == file_name_result:
  215. final_result.which_regex = file_name_result.which_regex
  216. elif final_result == dir_name_result:
  217. final_result.which_regex = dir_name_result.which_regex
  218. else:
  219. if file_name_result:
  220. final_result.which_regex += file_name_result.which_regex
  221. if dir_name_result:
  222. final_result.which_regex += dir_name_result.which_regex
  223. # if there's no useful info in it then raise an exception
  224. if final_result.season_number == None and not final_result.episode_numbers and final_result.air_date == None and not final_result.series_name:
  225. raise InvalidNameException("Unable to parse "+name.encode(sickbeard.SYS_ENCODING))
  226. name_parser_cache.add(name, final_result)
  227. # return it
  228. return final_result
  229. class ParseResult(object):
  230. def __init__(self,
  231. original_name,
  232. series_name=None,
  233. season_number=None,
  234. episode_numbers=None,
  235. extra_info=None,
  236. release_group=None,
  237. air_date=None,
  238. audio_langs = 'en'
  239. ):
  240. self.original_name = original_name
  241. self.series_name = series_name
  242. self.season_number = season_number
  243. if not episode_numbers:
  244. self.episode_numbers = []
  245. else:
  246. self.episode_numbers = episode_numbers
  247. self.extra_info = extra_info
  248. self.release_group = release_group
  249. self.air_date = air_date
  250. self.which_regex = None
  251. self.audio_langs = audio_langs
  252. def __eq__(self, other):
  253. if not other:
  254. return False
  255. if self.series_name != other.series_name:
  256. return False
  257. if self.season_number != other.season_number:
  258. return False
  259. if self.episode_numbers != other.episode_numbers:
  260. return False
  261. if self.extra_info != other.extra_info:
  262. return False
  263. if self.release_group != other.release_group:
  264. return False
  265. if self.air_date != other.air_date:
  266. return False
  267. if self.audio_langs != other.audio_langs:
  268. return False
  269. return True
  270. def __str__(self):
  271. if self.series_name != None:
  272. to_return = self.series_name + u' - '
  273. else:
  274. to_return = u''
  275. if self.season_number != None:
  276. to_return += 'S'+str(self.season_number)
  277. if self.episode_numbers and len(self.episode_numbers):
  278. for e in self.episode_numbers:
  279. to_return += 'E'+str(e)
  280. if self.air_by_date:
  281. to_return += str(self.air_date)
  282. if self.extra_info:
  283. to_return += ' - ' + self.extra_info
  284. if self.release_group:
  285. to_return += ' (' + self.release_group + ')'
  286. to_return += ' [ABD: '+str(self.air_by_date)+']'
  287. return to_return.encode('utf-8')
  288. def _is_air_by_date(self):
  289. if self.season_number == None and len(self.episode_numbers) == 0 and self.air_date:
  290. return True
  291. return False
  292. air_by_date = property(_is_air_by_date)
  293. class NameParserCache(object):
  294. #TODO: check if the fifo list can beskiped and only use one dict
  295. _previous_parsed_list = [] # keep a fifo list of the cached items
  296. _previous_parsed = {}
  297. _cache_size = 100
  298. def add(self, name, parse_result):
  299. self._previous_parsed[name] = parse_result
  300. self._previous_parsed_list.append(name)
  301. while len(self._previous_parsed_list) > self._cache_size:
  302. del_me = self._previous_parsed_list.pop(0)
  303. self._previous_parsed.pop(del_me)
  304. def get(self, name):
  305. if name in self._previous_parsed:
  306. logger.log("Using cached parse result for: " + name, logger.DEBUG)
  307. return self._previous_parsed[name]
  308. else:
  309. return None
  310. name_parser_cache = NameParserCache()
  311. class InvalidNameException(Exception):
  312. "The given name is not valid"