/filter.py
#!/usr/bin/python3.4
#
# filter.py
#
# Extracts text from html files. All json output files include a key "original" with the
# original hadith text. The files with exegesis also include a second key "commentary"
# with the text of the commentary.
#
# The format of the file names is as follows:
#
#   infile name:  hadith.al-islam-BOOKID-PID-CHAPTERID[-SUBCHAPTERID-SECTIONID].html
#   outfile name: hadith.al-islam-BOOKID-PID-CHAPTERID[-SUBCHAPTERID-SECTIONID].json
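#
# e.g. (hypothetical ids): hadith.al-islam-33-1-2.html -> hadith.al-islam-33-1-2.json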
#
# json format of files of books with id in range 24-32:
#
#   { "original" : text }
#
# json format of files of books with id in range 33-39:
#
#   { "original" : text,
#     "commentary" : text }
#
# usage:
#   $ python filter.py ../../data/all/reduced ../../data/all/filtered
#
#################################################################################
import sys
import os
import json
import re
from argparse import ArgumentParser
from bs4 import BeautifulSoup
import multiprocessing as mp
import functools as ft

################################################
#
# functions
#

def window(iterfiles, size):
    """ Yield a group of size elements from iterfiles.
    If iterfiles is exhausted, None is yielded for each missing element.
    This generator is used for building groups of args for multiprocessing.
    Args:
        iterfiles (iterator): Iterator of strings indicating filenames.
        size (int): Number of filenames to read and yield from iterfiles.
    Yields:
        str: Each of the up to size filenames read from iterfiles.
        None: For every element requested after iterfiles is consumed.
    """
    for i in range(size):
        try:
            yield next(iterfiles)
        except StopIteration:
            yield None
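
# usage sketch for window() (illustrative filenames, not from the pipeline):
#   it = iter(['a.html', 'b.html', 'c.html'])
#   list(window(it, 2))  # -> ['a.html', 'b.html']
#   list(window(it, 2))  # -> ['c.html', None]
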
def get_cell_from_html(t, fn):
    """ Check that the text in the html is inside the expected structure.

    Args:
        t (bs4.element.Tag): <table> element to check.
        fn (str): filename, to trace errors.
    Returns:
        bs4.element.Tag: cell containing the text.

    """
    if len(t.findAll('tr', recursive=False)) != 1:
        print('Error in file "%s": expected exactly one <tr> inside table' % fn, file=sys.stderr)
        print(t.prettify()) #DEBUG
        sys.exit(1) #DEBUG

    if len(t.tr.findAll('td', recursive=False)) != 1:
        print('Error in file "%s": expected exactly one <td> inside table.tr' % fn, file=sys.stderr)
        print(t.prettify()) #DEBUG
        sys.exit(1) #DEBUG

    if len(t.tr.td.findAll('p', recursive=False)) != 1:
        print('Error in file "%s": expected exactly one <p> inside table.tr.td' % fn, file=sys.stderr)
        print(t.prettify()) #DEBUG
        sys.exit(1) #DEBUG
    cell = t.tr.td.p

    if not cell:
        print('Error in file "%s": No text found in first table.' % fn, file=sys.stderr)
        sys.exit(1)

    return cell
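
# sketch of the markup get_cell_from_html() expects (based on the checks above):
#   <table style="... Arabic Transparent ..."><tr><td><p> TEXT </p></td></tr></table>
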
def process_file(input_dir, output_dir, fname):
    """ Extract the texts from file fname and dump them into a json file.

    Args:
        input_dir (str): path of input files.
        output_dir (str): path to save output files.
        fname (str): Input filename to parse and filter.
    Returns:
        None. The extracted texts are written to a json file in output_dir as a dict
        with key "original" and, if a commentary block is present, key "commentary".
    """
    if not fname: return
    print('\n**', fname, file=sys.stderr) #DEBUG
    with open(os.path.join(input_dir, fname)) as inf:
        soup = BeautifulSoup(inf.read(), 'lxml')
    # search texts
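    # (the texts sit in <table> elements whose style attribute references the
    # "Arabic Transparent" font, hence the match on tag.attrs['style'] below)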
    found = soup.findAll(lambda tag: tag.name=='table' and 'style' in tag.attrs and 'Arabic Transparent' in tag.attrs['style'])
    if not found:
        print('Error in file "%s": No text found' % fname, file=sys.stderr)
        sys.exit(1)
    if len(found) > 2:
        print('Error in file "%s": More than two blocks of data found' % fname, file=sys.stderr)
        sys.exit(1)
    table1, *table2 = found
    cell = get_cell_from_html(table1, fname)
    text_table1 = re.sub(r'\s+', ' ', cell.get_text())
    texts = {'original' : text_table1}
    if table2:
        table2 = table2[0]

        cell = get_cell_from_html(table2, fname)
        text_table2 = re.sub(r'\s+', ' ', cell.get_text())
        texts['commentary'] = text_table2
    outfname = fname.rsplit('.', 1)[0] + '.json'
    print('>>', outfname, file=sys.stderr) #DEBUG
    with open(os.path.join(output_dir, outfname), 'w') as outf:
        json.dump(texts, outf, ensure_ascii=False)

################################################
if __name__ == '__main__':

    parser = ArgumentParser(description='extracts texts from html files and dumps them into json files')
    parser.add_argument('input_dir', action='store', help='input directory with html files')
    parser.add_argument('output_dir', action='store', help='output directory where json files will be saved')
    args = parser.parse_args()

    # get list of all html files to process
    html_fnames = (f.name for f in os.scandir(args.input_dir) if f.is_file() and os.path.splitext(f.name)[1]=='.html')

    for fn in html_fnames:
        process_file(args.input_dir, args.output_dir, fn)
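
    # the commented-out block below is an alternative parallel version: it uses a
    # multiprocessing Pool of 4 workers and window() to feed groups of 4 filenames
    # (padding the last group with None) to process_file via functools.partial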
    #################################################
    # # create a Pool object with 4 processes
    # pool = mp.Pool(processes=4)
    #
    # # get the first 4 files
    # func_args = list(window(html_fnames, 4))
    #
    # #cnt = 0 #DEBUG
    #
    # # process groups of 4 files at a time until the iterator is exhausted
    # while func_args[0] is not None:
    #
    #     # prepare function with constant args
    #     func = ft.partial(process_file, args.input_dir, args.output_dir)
    #
    #     # do processing
    #     #data = pool.map_async(func, func_args)
    #     data = pool.map(func, func_args)
    #
    #     #if cnt > 50: break #DEBUG
    #     #cnt += 1 #DEBUG
    #
    #     # get next group of 4 files
    #     func_args = list(window(html_fnames, 4))
    #
    # pool.close()
    # pool.join()
    ####################################################