
/filter.py

https://gitlab.com/cobhuni/hadith_alislam_extractor
#!/usr/bin/python3.4
#
# filter.py
#
# Extracts text from html files. All json output files include a tag "original" with the original hadith text.
# The ones with exegesis also include a second tag "commentary" with the text of the commentary.
#
# The format of the file names is as follows:
#
# infile name:  hadith.al-islam-BOOKID-PID-CHAPTERID[-SUBCHAPTERID-SECTIONID].html
# outfile name: hadith.al-islam-BOOKID-PID-CHAPTERID[-SUBCHAPTERID-SECTIONID].json
#
# json format of files of books with id in range 24-32:
#
#   { "original" : text }
#
# json format of files of books with id in range 33-39:
#
#   { "original" : text,
#     "commentary" : text }
#
# usage:
#   $ python filter.py ../../data/all/reduced ../../data/all/filtered
#
#################################################################################
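# Example of the mapping above (hypothetical ids and filename, sketch only):
# an input file hadith.al-islam-33-5-2.html would produce
# hadith.al-islam-33-5-2.json containing both "original" and "commentary"
# keys, since book id 33 falls in the 33-39 range.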
import sys
import os
import json
import re
from argparse import ArgumentParser
from bs4 import BeautifulSoup
import multiprocessing as mp
import functools as ft

################################################
#
# functions
#

def window(iterfiles, size):
    """ Yields a group of size elements from iterfiles.
        If iterfiles is consumed, yields None instead.
        This generator is used for getting args for multiprocessing.

    Args:
        iterfiles (list_iterator): Sequence of strings indicating filenames.
        size (int): Number of filenames to read and yield from iterfiles.

    Yields:
        str: Group of as many strings read from iterfiles as indicated by size.
             Yields None in case iterfiles is consumed.
    """
    for i in range(size):
        try:
            yield next(iterfiles)
        except StopIteration:
            yield None
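
# Illustrative use of window() (hypothetical filenames, sketch only):
#
#   it = iter(['a.html', 'b.html', 'c.html'])
#   list(window(it, 2))   # -> ['a.html', 'b.html']
#   list(window(it, 2))   # -> ['c.html', None]   (iterator exhausted)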

def get_cell_from_html(t, fn):
    """ Check that the text in the html is inside the expected structure.

    Args:
        t (class 'bs4.BeautifulSoup'): html to parse.
        fn (str): filename, to trace errors.

    Returns:
        class 'bs4.BeautifulSoup': cell containing the text.
    """
    if len(t.findAll('tr', recursive=False)) != 1:
        print('Error in file "%s": more than one <tr> inside table' % fn, file=sys.stderr)
        print(t.prettify())  #DEBUG
        sys.exit(1)  #DEBUG

    if len(t.tr.findAll('td', recursive=False)) != 1:
        print('Error in file "%s": more than one <td> inside table.tr' % fn, file=sys.stderr)
        print(t.prettify())  #DEBUG
        sys.exit(1)  #DEBUG

    if len(t.tr.td.findAll('p', recursive=False)) != 1:
        print('Error in file "%s": more than one <p> inside table.tr.td' % fn, file=sys.stderr)
        print(t.prettify())  #DEBUG
        sys.exit(1)  #DEBUG

    cell = t.tr.td.p

    if not cell:
        print('Error in file "%s": No text found in first table.' % fn, file=sys.stderr)
        sys.exit(1)

    return cell
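
# Expected markup, as implied by the checks above (sketch, not copied from the
# source pages):
#
#   <table style="... Arabic Transparent ...">
#     <tr>
#       <td>
#         <p> hadith or commentary text </p>
#       </td>
#     </tr>
#   </table>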

def process_file(input_dir, output_dir, fname):
    """ Extract the texts from file fname and dump them into a json file
        in output_dir.

    Args:
        input_dir (str): path of input files.
        output_dir (str): path to save output files.
        fname (str): Input filename to parse and filter.
    """
    if not fname: return

    print('\n**', fname, file=sys.stderr)  #DEBUG

    with open(os.path.join(input_dir, fname)) as inf:
        soup = BeautifulSoup(inf.read(), 'lxml')

    # search texts
    found = soup.findAll(lambda tag: tag.name == 'table' and 'style' in tag.attrs
                         and 'Arabic Transparent' in tag.attrs['style'])

    if not found:
        print('Error in file "%s": No text found' % fname, file=sys.stderr)
        sys.exit(1)

    if len(found) > 2:
        print('Error in file "%s": More than two blocks of data found' % fname, file=sys.stderr)
        sys.exit(1)

    table1, *table2 = found

    cell = get_cell_from_html(table1, fname)
    text_table1 = re.sub(r'\s+', ' ', cell.get_text())

    texts = {'original': text_table1}

    if table2:
        table2 = table2[0]
        cell = get_cell_from_html(table2, fname)
        text_table2 = re.sub(r'\s+', ' ', cell.get_text())
        texts['commentary'] = text_table2

    outfname = fname.rsplit('.', 1)[0] + '.json'

    print('>>', outfname, file=sys.stderr)  #DEBUG

    with open(os.path.join(output_dir, outfname), 'w') as outf:
        json.dump(texts, outf, ensure_ascii=False)
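
# Minimal standalone call (hypothetical filename, sketch only; the directories
# are the ones shown in the usage line of the header):
#
#   process_file('../../data/all/reduced', '../../data/all/filtered',
#                'hadith.al-islam-33-5-2.html')
#
# which would write ../../data/all/filtered/hadith.al-islam-33-5-2.json.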

################################################

if __name__ == '__main__':

    parser = ArgumentParser(description='extracts texts from html files and dumps them into json files')
    parser.add_argument('input_dir', action='store', help='input directory with html files')
    parser.add_argument('output_dir', action='store', help='output directory where json files will be saved')
    args = parser.parse_args()

    # get list of all html files to process
    html_fnames = (f.name for f in os.scandir(args.input_dir)
                   if f.is_file() and os.path.splitext(f.name)[1] == '.html')

    for fn in html_fnames:
        process_file(args.input_dir, args.output_dir, fn)

    #################################################
    # # create a Pool object with 4 processes
    # pool = mp.Pool(processes=4)
    #
    # # get the first 4 files
    # func_args = list(window(html_fnames, 4))
    #
    # #cnt = 0  #DEBUG
    #
    # # process groups of 4 files each time until the iterator is exhausted
    # while func_args[0] != None:
    #
    #     # prepare func with constant args
    #     func = ft.partial(process_file, args.input_dir, args.output_dir)
    #
    #     # do processing
    #     #data = pool.map_async(func, func_args)
    #     data = pool.map(func, func_args)
    #
    #     #if cnt > 50: break  #DEBUG
    #     #cnt += 1  #DEBUG
    #
    #     # get next group of 4 files
    #     func_args = list(window(html_fnames, 4))
    #
    # pool.close()
    # pool.join()
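
    # A simpler variant of the commented-out multiprocessing path above, kept
    # only as a sketch (assumes the same args, process_file and html_fnames
    # from this main block; not part of the original run):
    #
    #   func = ft.partial(process_file, args.input_dir, args.output_dir)
    #   with mp.Pool(processes=4) as pool:
    #       pool.map(func, html_fnames)
    #
    # Pool.map consumes the whole iterable itself, so no window()-style
    # batching is needed.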
    ####################################################