
/Parser.py

https://gitlab.com/Fremis/IRCourse
import os
import re
from collections import OrderedDict
import json
import time
from bs4 import BeautifulSoup, NavigableString
import gc


class profiler(object):
    def __enter__(self):
        self._startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
        return self

    def __exit__(self, type, value, traceback):
        print("Elapsed time: {:.8f} sec".format(time.perf_counter() - self._startTime))


class StatsParser:
    """
    Quite a few regexes are used here:
    sub_reg - strips the "<label>:" prefix from a matched field so the stored value is cleaner
    id_reg - extracts the topic id from the link (it is the trailing query parameter)
    description_sub_reg - strips the boundary markers that description_reg matched around the text
    description_reg - captures the description without the criteria list, screenshots, the download span and so on
    keygen_reg - finds the keygen/crack field
    lang_reg - finds the language field
    quality_reg - finds the video quality field
    """
    sub_reg = re.compile(r'\D+:.?')
    id_reg = re.compile(r'\D+=')
    description_sub_reg = re.compile(
        r'(?i)(?:описание:|-описание:|краткое описание:)|'
        r'(?:[^\s]*качество:|скриншоты\n|скриншоты:|'
        r'качество:|качество видео:|download)[^\>]*', re.UNICODE)
    # rating_reg = re.compile(r'\d\.\d/\d{2}')
    description_reg = re.compile(
        r'(?i)(?:описание|-описание|краткое описание):(?:\D|\d)+(?:[^\s]*качество:|скриншоты\n|скриншоты:|'
        r'download|качество:|качество видео:)', re.UNICODE)
    keygen_reg = re.compile(r'(?i)(?:таблетка|таблэтка|кряк|keygen|лечение|•таблетка|таблетка от|• таблэтка):\s*.+')
    lang_reg = re.compile(r'(?i)(?:язык интерфейса|язык|озвучка|перевод|язык озвучки|требуемый язык игры|'
                          r'язык интерфейса игры|•язык интерфейса|•язык озвучки|•язык oзвучки|язык игр|'
                          r'язык мода|язык перевода):\s*.+')
    quality_reg = re.compile(
        r'(?i)(?:качество|-качество|исходная раздача--качество видео|качество видео|-качество видео|'
        r'исходная раздача-качество видео|--качество видео):\s*.+')
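
    # Illustrative only (the sample text below is made up): on a post body such as
    #   "Язык интерфейса: английский\nКачество: BDRip"
    # lang_reg.search() matches "Язык интерфейса: английский", and once sub_reg
    # strips the "...: " label the stored value is just "английский"; quality_reg
    # and keygen_reg are applied the same way in parse() below.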

    def parse(self, file):
        """
        Takes a file and retrieves the needed stats:
        link - url of the document
        id - id of the topic on rutracker
        title - title of the topic
        description - filtered description of a movie/game/series
        len - number of words in the description
        pages - number of pages of comments
        lang - language of interface/sub/dub
        qual - video quality (e.g. DVDRip, Blu-ray)
        keygen - whether a game torrent has a keygen or needs (doesn't need) one
        magnet - magnet link, used later when serving queries
        :param file: file to parse
        :return: a dictionary of stats that are used for sorting query results
        """
        doc = OrderedDict()
        with open(file, 'r', encoding='windows-1251') as open_file:
            try:
                soup = BeautifulSoup(open_file, 'lxml')
            except UnicodeDecodeError:
                print('failed to decode {0}, skipping'.format(file))
                return None
        body = soup.find("div", {"class": "post_body"})
        if body:
            for post in body.find_all("span", {"class": "post-b"}):
                post.insert(0, NavigableString('\n'))
            # this element always exists
            category = soup.find("td", {"class": "nav w100 pad_2 brand-bg-white"}).text.lower()
            if any(word in category for word in ("игры", "игр", "консолей", "аддоны")):
                doc['category'] = "games"
            elif any(word in category for word in ("кино", "театр", "video", "мульт", "аниме")):
                doc['category'] = "movies"
            elif "сериалы" in category:
                doc['category'] = "series"
            else:
                doc['category'] = None
            text_body = body.text
            doc['link'] = soup.find('h1', {"class": "maintitle"}).find("a", {"id": "topic-title"}).get('href')
            doc["id"] = re.sub(self.id_reg, '', doc["link"])
            doc["title"] = soup.title.string.replace(":: RuTracker.org", '')
            # layout of the description varies between topics
            description = self.description_reg.search(text_body)
            if description:
                doc['description'] = re.sub(self.description_sub_reg, '', description.group()).strip()
                doc["len"] = len(doc['description'].split(' '))
            else:
                doc['description'] = None
                doc['len'] = 0
            '''
            rating = self.rating_reg.search(text_body)
            if rating:
                doc['rating'] = rating.group()
            else:
                doc['rating'] = None'''
            number_of_pages = soup.find('table', {'id': 'pagination'})
            if number_of_pages:
                doc["pages"] = int(number_of_pages.text.strip().split('\n')[0].split(' ')[-1])
            else:
                doc["pages"] = 1
            lang = self.lang_reg.search(text_body)
            if lang:
                doc['lang'] = re.sub(self.sub_reg, '', lang.group()).strip()
            else:
                doc['lang'] = None
            qual = self.quality_reg.search(text_body)
            if qual:
                doc['qual'] = re.sub(self.sub_reg, '', qual.group())
            else:
                doc['qual'] = None
            keygen = self.keygen_reg.search(text_body)
            if keygen:
                doc['keygen'] = re.sub(self.sub_reg, '', keygen.group()).strip()
            else:
                doc['keygen'] = None
            magnet = soup.find("div", {"class": "attach_link guest"})
            if magnet:
                doc["magnet"] = magnet.find("a").get('href')
            else:
                doc['magnet'] = None
            return doc
        return None
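
    # Illustrative shape of the result of parse() (all field values here are made up):
    # OrderedDict([('category', 'movies'), ('link', 'viewtopic.php?t=4442588'),
    #              ('id', '4442588'), ('title', '...'), ('description', '...'),
    #              ('len', 120), ('pages', 3), ('lang', 'английский'),
    #              ('qual', 'BDRip'), ('keygen', None), ('magnet', 'magnet:?...')])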

    def get_stats_from_files(self, files_per_json):
        """
        Reads all files from the hard-coded corpus folders, parses the html documents
        and retrieves the stats described in the parse() method.
        :param files_per_json: number of folders (1000 files each) parsed before the
                               accumulated movie stats are dumped to JSON and memory
                               is cleared (for big-data purposes)
        :return: an array of dictionaries, each containing the stats for a single doc
                 (everything is also dumped to JSON, so the return value is a convenience)
        """
        # getting vars
        current_path = os.getcwd()
        doc_stats = []
        games_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerGames\\'))
        movies_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerMovies\\'))
        series_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerSeries\\'))
        number_of_dumps = 0
        # parsing games
        for i in range(games_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerGames\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k games have been parsed'.format(i + 1))
        # dumping games
        with open(current_path + '\\index\\games.json', 'w', encoding='utf-8') as open_file:
            json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('games have been dumped')
        # clearing RAM
        doc_stats.clear()
        gc.collect()
        # parsing movies
        for i in range(movies_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerMovies\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k movies have been parsed'.format(i + 1))
            # dumping every <files_per_json> folders of movies, and after the last folder
            if (i % files_per_json == 0) or i == (movies_folders_amount - 1):
                number_of_dumps += 1
                with open(current_path + '\\index\\films' + str(number_of_dumps) + '.json', 'w',
                          encoding='utf-8') as open_file:
                    json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
                doc_stats.clear()
                print('movies have been dumped {0} times'.format(number_of_dumps))
        # parsing series
        for i in range(series_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerSeries\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k of series have been parsed'.format(i + 1))
        # dumping series
        with open(current_path + '\\index\\series.json', 'w', encoding='utf-8') as open_file:
            json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('series have been dumped')
        return doc_stats
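
    # Portability note (a sketch, not the original behaviour): the '\\'-joined paths
    # above are Windows-specific; an equivalent portable form would be
    #   file_dir = os.path.join(current_path, 'corpus', 'RutrackerGames', str(i))
    # which picks the right separator on any OS.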

    def adding_tlen(self):
        # adds a 'tlen' field (title length in words) to every doc in the existing JSON dumps
        current_path = os.getcwd()
        file_list = os.listdir(current_path + '\\index\\')
        for file in file_list:
            if file:
                filename = current_path + '\\index\\' + file
                with open(filename, 'r', encoding='utf-8') as f:
                    docs = json.load(f)
                for doc in docs:
                    doc['tlen'] = len(doc['title'].split())
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(docs, f, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('tlen added')

    def adding_category(self):
        # adds a 'category' field to every doc, derived from the dump's file name
        current_path = os.getcwd()
        file_list = os.listdir(current_path + '\\index\\')
        for file in file_list:
            print(file)
            if "films" in file:
                category = "movies"
            elif "series" in file:
                category = "series"
            elif "games" in file:
                category = "games"
            else:
                category = None
            print(category)
            filename = current_path + '\\index\\' + file
            with open(filename, 'r', encoding='utf-8') as f:
                docs = json.load(f)
            for doc in docs:
                doc['category'] = category
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(docs, f, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('category added')


if __name__ == '__main__':
    current_path = os.getcwd()
    parser = StatsParser()
    stats = []
    with profiler() as p:
        stats = parser.get_stats_from_files(50)
        parser.adding_tlen()
        parser.adding_category()
    '''
    doc = parser.parse('4442588')
    for pls in doc:
        print(pls)
        print(doc[pls])
    stats = []
    with profiler() as p:
        with open(current_path + '\\models\\films1.json', 'r', encoding='utf-8') as open_file:
            stats = json.load(open_file)
    print(len(stats))
    for i in range(10):
        print(type(stats[i]))
        print(stats[i])'''