# Parser.py
import os
import re
from collections import OrderedDict
import json
import time
from bs4 import BeautifulSoup, NavigableString
import gc


class profiler(object):
    def __enter__(self):
        # time.clock() was removed in Python 3.8; perf_counter() replaces it
        self._startTime = time.perf_counter()
        return self  # return self so `with profiler() as p:` binds the instance

    def __exit__(self, type, value, traceback):
        print("Elapsed time: {:.8f} sec".format(time.perf_counter() - self._startTime))


class StatsParser:
    """
    Quite a lot of regexes here; to list them:
    sub_reg - strips the criterion label (replaces it with an empty string), so the dictionary values are more representative
    id_reg - extracts the id from a link (because of the URL structure!)
    description_sub_reg - strips the boundary markers of our description regex
    description_reg - parses out the description without criteria, screenshots, the download span and so on
    keygen_reg - finds a keygen
    lang_reg - finds a language
    quality_reg - finds a quality
    """
    sub_reg = re.compile(r'\D+:.?')
    id_reg = re.compile(r'\D+=')
    description_sub_reg = re.compile(
        r'(?i)(?:описание:|-описание:|краткое описание:)|'
        r'(?:[^\s]*качество:|скриншоты\n|скриншоты:|'
        r'качество:|качество видео:|download)[^\>]*', re.UNICODE)
    # rating_reg = re.compile(r'\d\.\d/\d{2}')
    description_reg = re.compile(
        r'(?i)(?:описание|-описание|краткое описание):(?:\D|\d)+(?:[^\s]*качество:|скриншоты\n|скриншоты:|'
        r'download|качество:|качество видео:)', re.UNICODE)
    keygen_reg = re.compile(r'(?i)(?:таблетка|таблэтка|кряк|keygen|лечение|•таблетка|таблетка от|• таблэтка):\s*.+')
    lang_reg = re.compile(r'(?i)(?:язык интерфейса|язык|озвучка|перевод|язык озвучки|требуемый язык игры|'
                          r'язык интерфейса игры|•язык интерфейса|•язык озвучки|•язык oзвучки|язык игр|'
                          r'язык мода|язык перевода):\s*.+')
    quality_reg = re.compile(
        r'(?i)(?:качество|-качество|исходная раздача--качество видео|качество видео|-качество видео|'
        r'исходная раздача-качество видео|--качество видео):\s*.+')
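
    # A hedged example of how these regexes behave (the sample line is made up,
    # not taken from a real topic):
    #
    #     line = 'Качество видео: BDRip 1080p'
    #     quality_reg.search(line).group()  # -> 'Качество видео: BDRip 1080p'
    #     re.sub(sub_reg, '', quality_reg.search(line).group()).strip()
    #     # -> 'BDRip 1080p'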
    def parse(self, file):
        """
        Takes a file and retrieves the needed stats.
        More on the stats:
        link - url of the document
        id - id of the topic on rutracker
        title - title of the topic
        description - filtered description of a movie/game/series
        len - number of words in the description
        pages - number of pages of comments
        lang - language of the interface/sub/dub
        qual - video quality (e.g. DVDRip, Blu-ray)
        keygen - whether a videogame torrent has a keygen or needs (doesn't need) one
        magnet - magnet link, for useful purposes in a query
        :param file: file to parse
        :return: a dictionary of stats that are used for sorting query results
        """
        doc = OrderedDict()
        with open(file, 'r', encoding='windows-1251') as open_file:
            try:
                soup = BeautifulSoup(open_file, 'lxml')
            except UnicodeDecodeError:
                print('failed to decode {0}'.format(file))
                return None
            body = soup.find("div", {"class": "post_body"})
            if body:
                for post in body.find_all("span", {"class": "post-b"}):
                    post.insert(0, NavigableString('\n'))
                # 100% exists
                category = soup.find("td", {"class": "nav w100 pad_2 brand-bg-white"}).text.lower()
                # note: `("игры" or "игр") in category` only tests the first
                # string, so each keyword has to be checked explicitly
                if any(word in category for word in ("игры", "игр", "консолей", "аддоны")):
                    doc['category'] = "games"
                elif any(word in category for word in ("кино", "театр", "video", "мульт", "аниме")):
                    doc['category'] = "movies"
                elif "сериалы" in category:
                    doc['category'] = "series"
                else:
                    doc['category'] = None
                text_body = body.text
                doc['link'] = soup.find('h1', {"class": "maintitle"}).find("a", {"id": "topic-title"}).get('href')
                doc["id"] = re.sub(self.id_reg, '', doc["link"])
                doc["title"] = soup.title.string.replace(":: RuTracker.org", '')
                # can vary from topic to topic
                description = self.description_reg.search(text_body)
                if description:
                    doc['description'] = re.sub(self.description_sub_reg, '', description.group()).strip()
                    doc["len"] = len(doc['description'].split(' '))
                else:
                    doc['description'] = None
                    doc['len'] = 0
                '''
                rating = self.rating_reg.search(text_body)
                if rating:
                    doc['rating'] = rating.group()
                else:
                    doc['rating'] = None'''
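                # The pagination table's first line is assumed to end with the
                # total page count (e.g. "Страницы: 1 2 3 ... 12" -> 12); this
                # is an assumption about rutracker's markup, not verified here.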
                number_of_pages = soup.find('table', {'id': 'pagination'})
                if number_of_pages:
                    doc["pages"] = int(number_of_pages.text.strip().split('\n')[0].split(' ')[-1])
                else:
                    doc["pages"] = 1
                lang = self.lang_reg.search(text_body)
                if lang:
                    doc['lang'] = re.sub(self.sub_reg, '', lang.group()).strip()
                else:
                    doc['lang'] = None
                qual = self.quality_reg.search(text_body)
                if qual:
                    doc['qual'] = re.sub(self.sub_reg, '', qual.group()).strip()
                else:
                    doc['qual'] = None
                keygen = self.keygen_reg.search(text_body)
                if keygen:
                    doc['keygen'] = re.sub(self.sub_reg, '', keygen.group()).strip()
                else:
                    doc['keygen'] = None
                magnet = soup.find("div", {"class": "attach_link guest"})
                if magnet:
                    doc["magnet"] = magnet.find("a").get('href')
                else:
                    doc['magnet'] = None
                return doc
        return None
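
    # A hedged usage sketch for parse(); the corpus path and file name below
    # are hypothetical:
    #
    #     parser = StatsParser()
    #     doc = parser.parse('corpus\\RutrackerGames\\0\\4442588')
    #     if doc:
    #         print(doc['title'], doc['pages'], doc['magnet'])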
    def get_stats_from_files(self, files_per_json):
        """
        Reads all files from the hard-coded folders, parses the html documents and retrieves some useful stats.
        More on the stats in the parse() method.
        :param files_per_json: limit of files written to a JSON before clearing memory (for big-data purposes), in thousands (1 = 1000)
        :return: not sure if a return is necessary, just in case~
                 Returns an array of dictionaries, each containing the stats for a single doc.
        """
        # getting vars
        current_path = os.getcwd()
        doc_stats = []
        games_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerGames\\'))
        movies_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerMovies\\'))
        series_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerSeries\\'))
        number_of_dumps = 0
        # parsing games
        for i in range(games_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerGames\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k games have been parsed'.format(i + 1))
        # dumping games
        with open(current_path + '\\index\\games.json', 'w', encoding='utf-8') as open_file:
            json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('games have been dumped')
        # clearing RAM
        doc_stats.clear()
        gc.collect()
        # parsing films
        for i in range(movies_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerMovies\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k movies have been parsed'.format(i + 1))
            # dumping every <files_per_json> folders (skipping i == 0) and after the last folder
            if (i != 0 and i % files_per_json == 0) or i == (movies_folders_amount - 1):
                number_of_dumps += 1
                with open(current_path + '\\index\\films' + str(number_of_dumps) + '.json', 'w',
                          encoding='utf-8') as open_file:
                    json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
                doc_stats.clear()
                print('movies have been dumped {0} times'.format(number_of_dumps))
        # parsing series
        for i in range(series_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerSeries\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k series have been parsed'.format(i + 1))
        # dumping series
        with open(current_path + '\\index\\series.json', 'w', encoding='utf-8') as open_file:
            json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('series have been dumped')
        return doc_stats
    def adding_tlen(self):
        """Adds a 'tlen' field (the number of words in the title) to every doc in every dumped JSON."""
        current_path = os.getcwd()  # was a global set in __main__ only
        file_list = os.listdir(current_path + '\\index\\')
        for file in file_list:
            if file:
                filename = current_path + '\\index\\' + file
                with open(filename, 'r', encoding='utf-8') as f:
                    docs = json.load(f)
                for doc in docs:
                    doc['tlen'] = len(doc['title'].split())
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(docs, f, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('tlen added')
    def adding_category(self):
        """Adds a 'category' field to every doc in every dumped JSON, inferred from the file name."""
        current_path = os.getcwd()  # was a global set in __main__ only
        file_list = os.listdir(current_path + '\\index\\')
        for file in file_list:
            print(file)
            if "films" in file:
                category = "movies"
            elif "series" in file:
                category = "series"
            elif "games" in file:
                category = "games"
            else:
                # skip files that match no known dump name instead of
                # reusing a category left over from a previous iteration
                continue
            print(category)
            filename = current_path + '\\index\\' + file
            with open(filename, 'r', encoding='utf-8') as f:
                docs = json.load(f)
            for doc in docs:
                doc['category'] = category
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(docs, f, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('category added')


if __name__ == '__main__':
    current_path = os.getcwd()
    parser = StatsParser()
    stats = []
    with profiler() as p:
        stats = parser.get_stats_from_files(50)
        parser.adding_tlen()
        parser.adding_category()
    '''
    doc = parser.parse('4442588')
    for pls in doc:
        print(pls)
        print(doc[pls])
    stats = []
    with profiler() as p:
        with open(current_path + '\\models\\films1.json', 'r', encoding='utf-8') as open_file:
            stats = json.load(open_file)
    print(len(stats))
    for i in range(10):
        print(type(stats[i]))
        print(stats[i])'''