/filter.py
#!/usr/bin/python3.4
#
# filter.py
#
# Extracts text from html files. All json output files include a key "original" with the
# original hadith text. The files with exegesis also include a second key "commentary"
# with the text of the commentary.
#
# The format of the file names is as follows:
#
#   infile name:  hadith.al-islam-BOOKID-PID-CHAPTERID[-SUBCHAPTERID-SECTIONID].html
#   outfile name: hadith.al-islam-BOOKID-PID-CHAPTERID[-SUBCHAPTERID-SECTIONID].json
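#
# e.g. (hypothetical ids): hadith.al-islam-33-1-2.html -> hadith.al-islam-33-1-2.json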
#
# json format of files of books with id in range 24-32:
#
#   { "original" : text }
#
# json format of files of books with id in range 33-39:
#
#   { "original" : text,
#     "commentary" : text }
#
# usage:
#   $ python filter.py ../../data/all/reduced ../../data/all/filtered
#
#################################################################################
import sys
import os
import json
import re
from argparse import ArgumentParser
from bs4 import BeautifulSoup
import multiprocessing as mp
import functools as ft

################################################
#
# functions
#

def window(iterfiles, size):
    """ Yield a group of size elements from iterfiles.
    If iterfiles is exhausted, None is yielded for each missing element.
    This generator is used for building groups of args for multiprocessing.
    Args:
        iterfiles (iterator): Iterator of strings indicating filenames.
        size (int): Number of filenames to read and yield from iterfiles.
    Yields:
        str: Each of the up to size filenames read from iterfiles.
        None: For every element requested after iterfiles is consumed.
    """
    for i in range(size):
        try:
            yield next(iterfiles)
        except StopIteration:
            yield None
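
# usage sketch for window() (illustrative filenames, not from the pipeline):
#   it = iter(['a.html', 'b.html', 'c.html'])
#   list(window(it, 2))  # -> ['a.html', 'b.html']
#   list(window(it, 2))  # -> ['c.html', None]
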
def get_cell_from_html(t, fn):
    """ Check that the text in the html is inside the expected structure.

    Args:
        t (bs4.element.Tag): <table> element to check.
        fn (str): filename, to trace errors.
    Returns:
        bs4.element.Tag: cell containing the text.

    """
    if len(t.findAll('tr', recursive=False)) != 1:
        print('Error in file "%s": expected exactly one <tr> inside table' % fn, file=sys.stderr)
        print(t.prettify()) #DEBUG
        sys.exit(1) #DEBUG

    if len(t.tr.findAll('td', recursive=False)) != 1:
        print('Error in file "%s": expected exactly one <td> inside table.tr' % fn, file=sys.stderr)
        print(t.prettify()) #DEBUG
        sys.exit(1) #DEBUG

    if len(t.tr.td.findAll('p', recursive=False)) != 1:
        print('Error in file "%s": expected exactly one <p> inside table.tr.td' % fn, file=sys.stderr)
        print(t.prettify()) #DEBUG
        sys.exit(1) #DEBUG
    cell = t.tr.td.p

    if not cell:
        print('Error in file "%s": No text found in first table.' % fn, file=sys.stderr)
        sys.exit(1)

    return cell
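
# sketch of the markup get_cell_from_html() expects (based on the checks above):
#   <table style="... Arabic Transparent ..."><tr><td><p> TEXT </p></td></tr></table>
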
def process_file(input_dir, output_dir, fname):
    """ Extract the texts from file fname and dump them into a json file.

    Args:
        input_dir (str): path of input files.
        output_dir (str): path to save output files.
        fname (str): Input filename to parse and filter.
    Returns:
        None. The extracted texts are written to a json file in output_dir as a dict
        with key "original" and, if a commentary block is present, key "commentary".
    """
    if not fname: return
    print('\n**', fname, file=sys.stderr) #DEBUG
    with open(os.path.join(input_dir, fname)) as inf:
        soup = BeautifulSoup(inf.read(), 'lxml')
    # search texts
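    # (the texts sit in <table> elements whose style attribute references the
    # "Arabic Transparent" font, hence the match on tag.attrs['style'] below)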
    found = soup.findAll(lambda tag: tag.name=='table' and 'style' in tag.attrs and 'Arabic Transparent' in tag.attrs['style'])
    if not found:
        print('Error in file "%s": No text found' % fname, file=sys.stderr)
        sys.exit(1)
    if len(found) > 2:
        print('Error in file "%s": More than two blocks of data found' % fname, file=sys.stderr)
        sys.exit(1)
    table1, *table2 = found
    cell = get_cell_from_html(table1, fname)
    text_table1 = re.sub(r'\s+', ' ', cell.get_text())
    texts = {'original' : text_table1}
    if table2:
        table2 = table2[0]

        cell = get_cell_from_html(table2, fname)
        text_table2 = re.sub(r'\s+', ' ', cell.get_text())
        texts['commentary'] = text_table2
    outfname = fname.rsplit('.', 1)[0] + '.json'
    print('>>', outfname, file=sys.stderr) #DEBUG
    with open(os.path.join(output_dir, outfname), 'w') as outf:
        json.dump(texts, outf, ensure_ascii=False)

################################################
if __name__ == '__main__':

    parser = ArgumentParser(description='extracts texts from html files and dumps them into json files')
    parser.add_argument('input_dir', action='store', help='input directory with html files')
    parser.add_argument('output_dir', action='store', help='output directory where json files will be saved')
    args = parser.parse_args()

    # get list of all html files to process
    html_fnames = (f.name for f in os.scandir(args.input_dir) if f.is_file() and os.path.splitext(f.name)[1]=='.html')

    for fn in html_fnames:
        process_file(args.input_dir, args.output_dir, fn)
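
    # the commented-out block below is an alternative parallel version: it uses a
    # multiprocessing Pool of 4 workers and window() to feed groups of 4 filenames
    # (padding the last group with None) to process_file via functools.partial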
    #################################################
    # # create a Pool object with 4 processes
    # pool = mp.Pool(processes=4)
    #
    # # get the first 4 files
    # func_args = list(window(html_fnames, 4))
    #
    # #cnt = 0 #DEBUG
    #
    # # process groups of 4 files at a time until the iterator is exhausted
    # while func_args[0] is not None:
    #
    #     # prepare function with constant args
    #     func = ft.partial(process_file, args.input_dir, args.output_dir)
    #
    #     # do processing
    #     #data = pool.map_async(func, func_args)
    #     data = pool.map(func, func_args)
    #
    #     #if cnt > 50: break #DEBUG
    #     #cnt += 1 #DEBUG
    #
    #     # get next group of 4 files
    #     func_args = list(window(html_fnames, 4))
    #
    # pool.close()
    # pool.join()
    ####################################################