
/Parser.py

https://gitlab.com/Fremis/IRCourse
import os
import re
from collections import OrderedDict
import json
import time
from bs4 import BeautifulSoup, NavigableString
import gc


class profiler(object):
    def __enter__(self):
        self._startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
        return self

    def __exit__(self, type, value, traceback):
        print("Elapsed time: {:.8f} sec".format(time.perf_counter() - self._startTime))


class StatsParser:
    """
    Quite a few regexes are used here:
    sub_reg - strips the "<label>:" prefix from a matched field so the stored value is cleaner
    id_reg - extracts the topic id from the link (it is the trailing query parameter)
    description_sub_reg - strips the boundary markers that description_reg matched around the text
    description_reg - captures the description without the criteria list, screenshots, the download span and so on
    keygen_reg - finds the keygen/crack field
    lang_reg - finds the language field
    quality_reg - finds the video quality field
    """
    sub_reg = re.compile(r'\D+:.?')
    id_reg = re.compile(r'\D+=')
    description_sub_reg = re.compile(
        r'(?i)(?:описание:|-описание:|краткое описание:)|'
        r'(?:[^\s]*качество:|скриншоты\n|скриншоты:|'
        r'качество:|качество видео:|download)[^\>]*', re.UNICODE)
    # rating_reg = re.compile(r'\d\.\d/\d{2}')
    description_reg = re.compile(
        r'(?i)(?:описание|-описание|краткое описание):(?:\D|\d)+(?:[^\s]*качество:|скриншоты\n|скриншоты:|'
        r'download|качество:|качество видео:)', re.UNICODE)
    keygen_reg = re.compile(r'(?i)(?:таблетка|таблэтка|кряк|keygen|лечение|•таблетка|таблетка от|• таблэтка):\s*.+')
    lang_reg = re.compile(r'(?i)(?:язык интерфейса|язык|озвучка|перевод|язык озвучки|требуемый язык игры|'
                          r'язык интерфейса игры|•язык интерфейса|•язык озвучки|•язык oзвучки|язык игр|'
                          r'язык мода|язык перевода):\s*.+')
    quality_reg = re.compile(
        r'(?i)(?:качество|-качество|исходная раздача--качество видео|качество видео|-качество видео|'
        r'исходная раздача-качество видео|--качество видео):\s*.+')
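
    # Illustrative only (the sample text below is made up): on a post body such as
    #   "Язык интерфейса: английский\nКачество: BDRip"
    # lang_reg.search() matches "Язык интерфейса: английский", and once sub_reg
    # strips the "...: " label the stored value is just "английский"; quality_reg
    # and keygen_reg are applied the same way in parse() below.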

    def parse(self, file):
        """
        Takes a file and retrieves the needed stats:
        link - url of the document
        id - id of the topic on rutracker
        title - title of the topic
        description - filtered description of a movie/game/series
        len - number of words in the description
        pages - number of pages of comments
        lang - language of interface/sub/dub
        qual - video quality (e.g. DVDRip, Blu-ray)
        keygen - whether a game torrent has a keygen or needs (doesn't need) one
        magnet - magnet link, used later when serving queries
        :param file: file to parse
        :return: a dictionary of stats that are used for sorting query results
        """
        doc = OrderedDict()
        with open(file, 'r', encoding='windows-1251') as open_file:
            try:
                soup = BeautifulSoup(open_file, 'lxml')
            except UnicodeDecodeError:
                print('failed to decode {0}, skipping'.format(file))
                return None
        body = soup.find("div", {"class": "post_body"})
        if body:
            for post in body.find_all("span", {"class": "post-b"}):
                post.insert(0, NavigableString('\n'))
            # this element always exists
            category = soup.find("td", {"class": "nav w100 pad_2 brand-bg-white"}).text.lower()
            if any(word in category for word in ("игры", "игр", "консолей", "аддоны")):
                doc['category'] = "games"
            elif any(word in category for word in ("кино", "театр", "video", "мульт", "аниме")):
                doc['category'] = "movies"
            elif "сериалы" in category:
                doc['category'] = "series"
            else:
                doc['category'] = None
            text_body = body.text
            doc['link'] = soup.find('h1', {"class": "maintitle"}).find("a", {"id": "topic-title"}).get('href')
            doc["id"] = re.sub(self.id_reg, '', doc["link"])
            doc["title"] = soup.title.string.replace(":: RuTracker.org", '')
            # layout of the description varies between topics
            description = self.description_reg.search(text_body)
            if description:
                doc['description'] = re.sub(self.description_sub_reg, '', description.group()).strip()
                doc["len"] = len(doc['description'].split(' '))
            else:
                doc['description'] = None
                doc['len'] = 0
            '''
            rating = self.rating_reg.search(text_body)
            if rating:
                doc['rating'] = rating.group()
            else:
                doc['rating'] = None'''
            number_of_pages = soup.find('table', {'id': 'pagination'})
            if number_of_pages:
                doc["pages"] = int(number_of_pages.text.strip().split('\n')[0].split(' ')[-1])
            else:
                doc["pages"] = 1
            lang = self.lang_reg.search(text_body)
            if lang:
                doc['lang'] = re.sub(self.sub_reg, '', lang.group()).strip()
            else:
                doc['lang'] = None
            qual = self.quality_reg.search(text_body)
            if qual:
                doc['qual'] = re.sub(self.sub_reg, '', qual.group())
            else:
                doc['qual'] = None
            keygen = self.keygen_reg.search(text_body)
            if keygen:
                doc['keygen'] = re.sub(self.sub_reg, '', keygen.group()).strip()
            else:
                doc['keygen'] = None
            magnet = soup.find("div", {"class": "attach_link guest"})
            if magnet:
                doc["magnet"] = magnet.find("a").get('href')
            else:
                doc['magnet'] = None
            return doc
        return None
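
    # Illustrative shape of the result of parse() (all field values here are made up):
    # OrderedDict([('category', 'movies'), ('link', 'viewtopic.php?t=4442588'),
    #              ('id', '4442588'), ('title', '...'), ('description', '...'),
    #              ('len', 120), ('pages', 3), ('lang', 'английский'),
    #              ('qual', 'BDRip'), ('keygen', None), ('magnet', 'magnet:?...')])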

    def get_stats_from_files(self, files_per_json):
        """
        Reads all files from the hard-coded corpus folders, parses the html documents
        and retrieves the stats described in the parse() method.
        :param files_per_json: number of folders (1000 files each) parsed before the
                               accumulated movie stats are dumped to JSON and memory
                               is cleared (for big-data purposes)
        :return: an array of dictionaries, each containing the stats for a single doc
                 (everything is also dumped to JSON, so the return value is a convenience)
        """
        # getting vars
        current_path = os.getcwd()
        doc_stats = []
        games_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerGames\\'))
        movies_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerMovies\\'))
        series_folders_amount = len(os.listdir(current_path + '\\corpus\\RutrackerSeries\\'))
        number_of_dumps = 0
        # parsing games
        for i in range(games_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerGames\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k games have been parsed'.format(i + 1))
        # dumping games
        with open(current_path + '\\index\\games.json', 'w', encoding='utf-8') as open_file:
            json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('games have been dumped')
        # clearing RAM
        doc_stats.clear()
        gc.collect()
        # parsing movies
        for i in range(movies_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerMovies\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k movies have been parsed'.format(i + 1))
            # dumping every <files_per_json> folders of movies, and after the last folder
            if (i % files_per_json == 0) or i == (movies_folders_amount - 1):
                number_of_dumps += 1
                with open(current_path + '\\index\\films' + str(number_of_dumps) + '.json', 'w',
                          encoding='utf-8') as open_file:
                    json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
                doc_stats.clear()
                print('movies have been dumped {0} times'.format(number_of_dumps))
        # parsing series
        for i in range(series_folders_amount):
            file_dir = current_path + '\\corpus\\RutrackerSeries\\' + str(i) + '\\'
            file_list = os.listdir(file_dir)
            for file in file_list:
                filename = file_dir + file
                doc = self.parse(filename)
                if doc:
                    doc_stats.append(doc)
            print('{0}k of series have been parsed'.format(i + 1))
        # dumping series
        with open(current_path + '\\index\\series.json', 'w', encoding='utf-8') as open_file:
            json.dump(doc_stats, open_file, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('series have been dumped')
        return doc_stats
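
    # Portability note (a sketch, not the original behaviour): the '\\'-joined paths
    # above are Windows-specific; an equivalent portable form would be
    #   file_dir = os.path.join(current_path, 'corpus', 'RutrackerGames', str(i))
    # which picks the right separator on any OS.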

    def adding_tlen(self):
        # adds a 'tlen' field (title length in words) to every doc in the existing JSON dumps
        current_path = os.getcwd()
        file_list = os.listdir(current_path + '\\index\\')
        for file in file_list:
            if file:
                filename = current_path + '\\index\\' + file
                with open(filename, 'r', encoding='utf-8') as f:
                    docs = json.load(f)
                for doc in docs:
                    doc['tlen'] = len(doc['title'].split())
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(docs, f, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('tlen added')

    def adding_category(self):
        # adds a 'category' field to every doc, derived from the dump's file name
        current_path = os.getcwd()
        file_list = os.listdir(current_path + '\\index\\')
        for file in file_list:
            print(file)
            if "films" in file:
                category = "movies"
            elif "series" in file:
                category = "series"
            elif "games" in file:
                category = "games"
            else:
                category = None
            print(category)
            filename = current_path + '\\index\\' + file
            with open(filename, 'r', encoding='utf-8') as f:
                docs = json.load(f)
            for doc in docs:
                doc['category'] = category
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(docs, f, ensure_ascii=False, indent=4, separators=(',', ': '))
        print('category added')


if __name__ == '__main__':
    current_path = os.getcwd()
    parser = StatsParser()
    stats = []
    with profiler() as p:
        stats = parser.get_stats_from_files(50)
        parser.adding_tlen()
        parser.adding_category()
    '''
    doc = parser.parse('4442588')
    for pls in doc:
        print(pls)
        print(doc[pls])
    stats = []
    with profiler() as p:
        with open(current_path + '\\models\\films1.json', 'r', encoding='utf-8') as open_file:
            stats = json.load(open_file)
    print(len(stats))
    for i in range(10):
        print(type(stats[i]))
        print(stats[i])'''