PageRenderTime 65ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/tsvconverter2/tsvconverter.py

https://gitlab.com/cobhuni/wiki_export
Python | 368 lines | 242 code | 31 blank | 95 comment | 17 complexity | 5cb293f4fb84e7b4028d1b5c2dc5c09e MD5 | raw file
  1. #!/usr/bin/python3.4
  2. #
  3. # tsvconverter.py @DEPRECATED - Converts annotated text stored in json into tsv format 2.
  4. #
  5. # Copyright (C) 2016 Alicia González Martínez, aliciagm85+code@gmail.com
  6. #
  7. # This program is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, either version 3 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. #
  20. ######################################################################################
  21. #
  22. # Input must include the name of the document to parse and a division in sections.
  23. #
  24. # Json Input:
  25. # [ { "title" : str ,
  26. # "content" : [ { "section" : str|null, # name of section
  27. # "text" : str # body of text
  28. # }, ...
  29. # ]
  30. # }, ...
  31. # ]
  32. #
  33. # Optionally, the text may include delimiters indicating pages, in the format:
  34. #
  35. # PAGE<digit>EGAP
  36. # where digit is an Arabic or Indo-arabic numeral optionally followed
  37. # by "v" or "r". E.g.: PAGE٨٣rEGAP.
  38. #
  39. # Page information must be separated by spaces from the rest of the text.
  40. #
  41. # This information is converted into an annotation in the tsv.
  42. #
  43. # Dependencies:
  44. # ../tokenizer/tokenizer.groovy
  45. #
  46. # +--------------------------------------------+
  47. # | TSVConverter |
  48. # +............................................|
  49. # | _page_allowed <<static>>: str |
  50. # | _pagekw_out_open<<static>>: str |
  51. # | _pagekw_out_close<<static>>: str |
  52. # | _section_label<<static>>: str |
  53. # | _section_feature<<static>>: str |
  54. # | _page_label<<static>>: str |
  55. # | _page_feature<<static>>: str |
  56. # | _page_pattern<<static>>: str |
  57. # | _MAX_LEN_WORD <<static>>: int |
  58. # | title: str | #
  59. # | content: list | # ﺎﺤﻔﻇ ﺎﻟﺮﻣﺯ ﻱﺍ ﻚﺒﻴﻜﺟ
  60. # |............................................| #
  61. # | _tokenizerWrapper(self, txt): list |
  62. # | convert(): str |
  63. # +--------------------------------------------+
  64. #
  65. # Usage:
  66. # $ python tsvconverter.py <infile> <outfile>
  67. #
  68. # TODO:
  69. # * check \n !!
  70. # * generate layer files?? NO, but explain the creation of layers in webanno in the readme
  71. # * put a 0 when there is no info for a tag
  72. #
  73. ###############################################################################
  74. import os
  75. import sys
  76. import re
  77. import json
  78. import argparse
  79. import itertools as it
  80. from configparser import ConfigParser
  81. from subprocess import Popen, PIPE
  82. CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))
  83. try:
  84. import util
  85. except ImportError:
  86. # append parent directory to path
  87. sys.path.insert(0, os.path.join(CURRENT_PATH, '..'))
  88. import util
  89. config = ConfigParser(inline_comment_prefixes=('#'))
  90. config.read(os.path.join(CURRENT_PATH, '../config.ini'))
  91. # process to segment and tokenize text
  92. TOKENIZER = os.path.join(CURRENT_PATH, '../tokenizer/tokenizer.groovy')
  93. class TSVConverter:
  94. """Converts json into tsv.
  95. Class attributes:
  96. _page_allowed (str): Pattern of page info within input text.
  97. _pagekw_out_open (str): Opening keyword for indicating page info within the text.
  98. _pagekw_out_close (str): Closing keyword for indicating page info within the text.
  99. _section_label (str): Name of section custom layer in webanno.
  100. _section_feature (str): Name of feature of section custom layer in webanno.
  101. _page_label (str): Name of page custom layer in webanno.
  102. _page_feature (str): Name of feature of page custom layer in webanno.
  103. _page_pattern (_sre.SRE_Pattern): Allowed format of page info.
  104. _MAX_LEN_WORD (int): Maximum number of characters an Arabic word is expected to have.
  105. """
  106. _page_allowed = config.get('json format', 'page allowed')
  107. _pagekw_out_open = config.get('json format', 'opening page keyword output')
  108. _pagekw_out_close = config.get('json format', 'closing page keyword output')
  109. _section_label = config.get('webanno', 'section layer name').replace(' ','').capitalize()
  110. _section_feature = config.get('webanno', 'section layer feature').replace(' ','')
  111. _page_label = config.get('webanno', 'page layer name').replace(' ','').capitalize()
  112. _page_feature = config.get('webanno', 'page layer feature').replace(' ','')
  113. _page_pattern = '%s(%s)%s' % (_pagekw_out_open,
  114. _page_pattern,
  115. _pagekw_out_close)
  116. _MAX_LEN_WORD = config.getint('arabic words', 'max length')
  117. _ARABIC_VOWELS = list(util.tochar(d)[0] for d in config['arabic vocalic diacritics'].values())
  118. _VOWELS_ERROR = re.compile(r'[%s]{2,}' % ''.join(_ARABIC_VOWELS))
  119. def __init__(self, data):
  120. """ Constructor.
  121. Args:
  122. data (str): Json containing title of scan together with sections
  123. and texts to parse and convert into tsv.
  124. Instance attributes:
  125. title (): Name of the document.
  126. content (list): chunks of text from the document separated by sections.
  127. Format: [{"section" : str|null, "text" : str}, ...]
  128. Page delimiters are inserted within the text.
  129. """
  130. data = json.loads(data)
  131. self.title = data['title']
  132. self.content = data['content']
  133. def _tokenizerWrapper(self, plain_text, tokenizer_path=TOKENIZER):
  134. """ Sends plain_text to process tokenizer_path and collect the output - a json struct
  135. containing a list of sentences splitted from plain_text and a list of tokens
  136. for each sentence.
  137. Args:
  138. plain_text (str): Text to split in sentences and tokenize.
  139. tokenizer_path (str): Path of tokenizer process to call.
  140. Returns:
  141. list: Json object containing splitted and tokenized text.
  142. [{'sentence'=str, 'tokens'=[str,str,...]}, ...]
  143. Raises:
  144. OSError: If process call fails.
  145. """
  146. if not os.path.isfile(tokenizer_path):
  147. print('Fatal error: Script "%s" not found.' % tokenizer_path, file=sys.stderr)
  148. sys.exit(1)
  149. # segment and tokenize text
  150. #FIXME inefficient shit
  151. try:
  152. tokenizer_proc = Popen(['groovy', tokenizer_path], stdin=PIPE, stdout=PIPE, stderr=PIPE)
  153. out, err = tokenizer_proc.communicate(plain_text.encode('utf-8'))
  154. except OSError as err:
  155. print('Error opening tokenizer process: %s' % err, file=sys.stderr)
  156. sys.exit(1)
  157. if err.strip():
  158. print('Fatal error trying to execute %s:\n\n%s.' % (tokenizer_path, err), file=sys.stderr)
  159. sys.exit(1)
  160. return json.loads(out.decode('utf8'))
  161. def _error_checker(self, token, section):
  162. """ Check if there are possible typos in token and show warnings.
  163. Args:
  164. token (str): Word to check.
  165. section (str): Name of the section the token belongs to.
  166. """
  167. if not re.match(r'%s' % TSVConverter._page_pattern, token):
  168. # word with non arabic char in an arabic alphabetic word
  169. if any(util.isArabicalpha(c) for c in token) and \
  170. any(not util.isArabicalpha(c) for c in token):
  171. print('Warning in section "%s" of scan %s: word "%s" may contain a typo (non-Arabic chars inside word)'
  172. % (section, self.title, token), file=sys.stderr)
  173. # exceeds max length
  174. if len(token) > TSVConverter._MAX_LEN_WORD:
  175. print('Warning in section "%s" of scan %s: word "%s" may contain a typo (word too long)'
  176. % (section, self.title, token), file=sys.stderr)
  177. # if ta marbuta (U+0629) in the middle
  178. # it has to be last character or one after last, if word include vowels of case
  179. if len(token) > 4:
  180. if 'ة' in token[1:-3]:
  181. print('Warning in section "%s" of scan %s: word "%s" may contain a typo (ta marbuta in the middle)'
  182. % (section, self.title, token), file=sys.stderr)
  183. # there cannot be more than one vocalic diacritic together
  184. if TSVConverter._VOWELS_ERROR.search(token):
  185. print('Warning in section "%s" of scan %s: There are 2 or more vocalic diacritics together in token "%s"'
  186. % (section, self.title, token), file=sys.stderr)
  187. def convert(self):
  188. """ Parse json with section, page and text info and dumps all in tsv format.
  189. Returns:
  190. str: Sequence of lines corresponding to the tsv.
  191. Raise:
  192. Exception: Reraises Exceptions catched by _tokenizerWrapper.
  193. ValueError: If page info is not parsed correctly.
  194. Example:
  195. >>> input = {"title": "Nabrawi.djvu", "content": [{"section": "section 1", "text": \
  196. ... "PAGE٥٤EGAP \n الطائعين بغير الايمان"}, {"section": "section 2", "text": \
  197. ... "نااش حخن شحخسي ش حرة ودقيق PAGE٥٥EGAP ة ومتكاملة ومتنوعة ومحايدة، PAGE٤٤EGAP يستطيع الجميع المساهمة في"}]}
  198. >>> tsv = TSVConverter(json.dumps(input))
  199. >>> tsvout = tsv.convert()
  200. >>> for t in tsvout.splitlines(): print(t)
  201. ...
  202. # webanno.custom.section | sectionname # webanno.custom.page | sectionpage
  203. #id=1
  204. #text=
  205. الطائعين بغير الايمان
  206. 1-1 الطائعين B-section 1 B-٥٤
  207. 1-2 بغير I-section 1 I-٥٤
  208. 1-3 الايمان I-section 1 I-٥٤
  209. #id=2
  210. #text=نااش حخن شحخسي ش حرة ودقيق ة ومتكاملة ومتنوعة ومحايدة، يستطيع الجميع المساهمة في
  211. 2-1 نااش B-section 2 I-٥٤
  212. 2-2 حخن I-section 2 I-٥٤
  213. (...)
  214. """
  215. out = []
  216. cnt_sentence = 0
  217. pageinfo = sectioninfo = ''
  218. newpage = False
  219. for chunk in self.content:
  220. newsection = True
  221. section = chunk['section']
  222. text = chunk['text']
  223. if section:
  224. sectioninfo = 'B-%s' % section
  225. try:
  226. tokenized = self._tokenizerWrapper(text)
  227. except Exception:
  228. raise
  229. for item in tokenized:
  230. cnt_sentence+=1
  231. sentence = item['sentence'] # str
  232. tokens = item['tokens'] # list of strings
  233. cleantxt = re.sub(r'%s' % TSVConverter._page_pattern, '', sentence)
  234. if TSVConverter._pagekw_out_open in cleantxt:
  235. raise ValueError('Bad format for page info in scan "%s" '
  236. 'Call the administrator.' % self.title)
  237. out.append('\n#id=%d' % cnt_sentence)
  238. out.append('#text=%s' % cleantxt)
  239. cnt_token = 0
  240. for token in tokens:
  241. # check for typos in token
  242. self._error_checker(token, section)
  243. # new page found, start B tag
  244. if TSVConverter._pagekw_out_open in token:
  245. pagefound = re.match('^%s$' % TSVConverter._page_pattern, token)
  246. if not pagefound:
  247. raise ValueError('Page information not well formated in scan "%s".' % self.title)
  248. if len(pagefound.groups()) != 1:
  249. raise ValueError('Page information not well formated in scan "%s".' % self.title)
  250. pageinfo = 'B-%s' % pagefound.groups(0)
  251. newpage = True
  252. continue
  253. # do not count a new token if page info is found
  254. else:
  255. cnt_token += 1
  256. if sectioninfo and not newsection:
  257. sectioninfo = 'I-%s' % section
  258. if pageinfo and not newpage:
  259. pageinfo = 'I' + pageinfo[1:]
  260. newpage = False
  261. newsection = False
  262. entry = '%d-%d\t%s\t%s\t%s' % (cnt_sentence, cnt_token, token,
  263. sectioninfo, pageinfo)
  264. out.append(re.sub('\t+', '\t', entry))
  265. header = ''
  266. if sectioninfo:
  267. header+=' # webanno.custom.%s | %s' % (TSVConverter._section_label,
  268. TSVConverter._section_feature)
  269. if pageinfo:
  270. header+=' # webanno.custom.%s | %s' % (TSVConverter._page_label,
  271. TSVConverter._page_feature)
  272. out.insert(0, header)
  273. return '\n'.join(out)
  274. if __name__ == '__main__':
  275. parser = argparse.ArgumentParser(description='Convert json into tsv')
  276. parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
  277. help='input file to parse [DEFAULT stdin]', metavar='infile.json')
  278. parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
  279. help='output file to create [DEFAULT stdout]', metavar='outfile.tsv')
  280. args = parser.parse_args()
  281. tsv = TSVConverter(args.infile.read())
  282. try:
  283. tsvout = tsv.convert()
  284. except Exception as e:
  285. print('Fatal error in TSVConverter: %s' % e, file=sys.stderr)
  286. sys.exit(1)
  287. print(tsvout, file=args.outfile)