/tsvconverter2/tsvconverter.py
#!/usr/bin/python3.4
#
# tsvconverter.py @DEPRECATED - Converts annotated text stored in json into tsv format 2.
#
# Copyright (C) 2016 Alicia González Martínez, aliciagm85+code@gmail.com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
######################################################################################
#
# Input must include the name of the document to parse and a division into sections.
#
# Json Input:
#   [ { "title" : str ,
#       "content" : [ { "section" : str|null,  # name of section
#                       "text" : str           # body of text
#                     }, ...
#                   ]
#     }, ...
#   ]
#
# Optionally, the text may include delimiters indicating pages, in the format:
#
#   PAGE<digit>EGAP
#
# where digit is an Arabic or Indo-Arabic numeral optionally followed
# by "v" or "r", e.g. PAGE٨٣rEGAP.
#
# Page information must be separated by spaces from the rest of the text.
#
# This information is converted into an annotation in the tsv.
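#
# A minimal input example (hypothetical data following the schema above):
#
#   [ { "title"   : "Nabrawi.djvu",
#       "content" : [ { "section" : "section 1",
#                       "text"    : "PAGE٥٤EGAP الطائعين بغير الايمان" }
#                   ]
#     }
#   ]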
#
# Dependencies:
#   ../tokenizer/tokenizer.groovy
#
# +--------------------------------------------+
# | TSVConverter                               |
# |............................................|
# | _page_allowed <<static>>: str              |
# | _pagekw_out_open <<static>>: str           |
# | _pagekw_out_close <<static>>: str          |
# | _section_label <<static>>: str             |
# | _section_feature <<static>>: str           |
# | _page_label <<static>>: str                |
# | _page_feature <<static>>: str              |
# | _page_pattern <<static>>: str              |
# | _MAX_LEN_WORD <<static>>: int              |
# | title: str                                 |
# | content: list                              |  # preserve the code, O Kabikaj
# |............................................|
# | _tokenizerWrapper(self, txt): list         |
# | convert(): str                             |
# +--------------------------------------------+
#
# Usage:
#   $ python tsvconverter.py <infile> <outfile>
#
# TODO:
#   * check \n !!
#   * generate layer files?? NO, but explain the creation of layers in webanno in the readme
#   * put a 0 when there is no info for a tag
#
###############################################################################
import os
import sys
import re
import json
import argparse
import itertools as it
from configparser import ConfigParser
from subprocess import Popen, PIPE

CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))

try:
    import util
except ImportError:
    # append parent directory to path
    sys.path.insert(0, os.path.join(CURRENT_PATH, '..'))
    import util

config = ConfigParser(inline_comment_prefixes=('#',))
config.read(os.path.join(CURRENT_PATH, '../config.ini'))

# process to segment and tokenize text
TOKENIZER = os.path.join(CURRENT_PATH, '../tokenizer/tokenizer.groovy')

class TSVConverter:
    """Converts json into tsv.

    Class attributes:
        _page_allowed (str): Pattern of page info within input text.
        _pagekw_out_open (str): Opening keyword for indicating page info within the text.
        _pagekw_out_close (str): Closing keyword for indicating page info within the text.
        _section_label (str): Name of section custom layer in webanno.
        _section_feature (str): Name of feature of section custom layer in webanno.
        _page_label (str): Name of page custom layer in webanno.
        _page_feature (str): Name of feature of page custom layer in webanno.
        _page_pattern (str): Complete pattern of page info, composed of
            _pagekw_out_open, _page_allowed and _pagekw_out_close.
        _MAX_LEN_WORD (int): Maximum number of characters an Arabic word is expected to have.
        _ARABIC_VOWELS (list): Characters treated as Arabic vocalic diacritics.
        _VOWELS_ERROR (_sre.SRE_Pattern): Compiled pattern matching runs of two
            or more vocalic diacritics.

    """
    _page_allowed = config.get('json format', 'page allowed')
    _pagekw_out_open = config.get('json format', 'opening page keyword output')
    _pagekw_out_close = config.get('json format', 'closing page keyword output')

    _section_label = config.get('webanno', 'section layer name').replace(' ', '').capitalize()
    _section_feature = config.get('webanno', 'section layer feature').replace(' ', '')
    _page_label = config.get('webanno', 'page layer name').replace(' ', '').capitalize()
    _page_feature = config.get('webanno', 'page layer feature').replace(' ', '')

    _page_pattern = '%s(%s)%s' % (_pagekw_out_open,
                                  _page_allowed,
                                  _pagekw_out_close)

    _MAX_LEN_WORD = config.getint('arabic words', 'max length')

    _ARABIC_VOWELS = list(util.tochar(d)[0] for d in config['arabic vocalic diacritics'].values())
    _VOWELS_ERROR = re.compile(r'[%s]{2,}' % ''.join(_ARABIC_VOWELS))
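
    # For illustration: if config.ini defined the opening keyword as "PAGE", the
    # closing keyword as "EGAP" and the allowed page format as r'[0-9٠-٩]+[vr]?'
    # (hypothetical values; the real ones live in config.ini), _page_pattern
    # would come out as r'PAGE([0-9٠-٩]+[vr]?)EGAP', matching delimiters such
    # as PAGE٨٣rEGAP.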

    def __init__(self, data):
        """ Constructor.

        Args:
            data (str): Json containing title of scan together with sections
                and texts to parse and convert into tsv.

        Instance attributes:
            title (str): Name of the document.
            content (list): Chunks of text from the document separated by sections.
                Format: [{"section" : str|null, "text" : str}, ...]
                Page delimiters are inserted within the text.

        """
        data = json.loads(data)
        self.title = data['title']
        self.content = data['content']

    def _tokenizerWrapper(self, plain_text, tokenizer_path=TOKENIZER):
        """ Sends plain_text to the process tokenizer_path and collects the output - a
        json struct containing the list of sentences split from plain_text and a list
        of tokens for each sentence.

        Args:
            plain_text (str): Text to split into sentences and tokenize.
            tokenizer_path (str): Path of tokenizer process to call.

        Returns:
            list: Json object containing the split and tokenized text.
                [{'sentence': str, 'tokens': [str, str, ...]}, ...]

        Raises:
            OSError: If process call fails.

        """
        if not os.path.isfile(tokenizer_path):
            print('Fatal error: Script "%s" not found.' % tokenizer_path, file=sys.stderr)
            sys.exit(1)

        # segment and tokenize text
        # FIXME inefficient: a new groovy process is spawned on every call
        try:
            tokenizer_proc = Popen(['groovy', tokenizer_path], stdin=PIPE, stdout=PIPE, stderr=PIPE)
            out, err = tokenizer_proc.communicate(plain_text.encode('utf-8'))
        except OSError as oserr:
            print('Error opening tokenizer process: %s' % oserr, file=sys.stderr)
            sys.exit(1)

        if err.strip():
            print('Fatal error trying to execute %s:\n\n%s.' % (tokenizer_path, err.decode('utf-8')),
                  file=sys.stderr)
            sys.exit(1)

        return json.loads(out.decode('utf8'))
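
    # Illustrative round trip (made-up output; the actual segmentation depends
    # on the groovy tokenizer):
    #   self._tokenizerWrapper('الطائعين بغير الايمان')
    #   -> [{'sentence': 'الطائعين بغير الايمان',
    #        'tokens': ['الطائعين', 'بغير', 'الايمان']}]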

    def _error_checker(self, token, section):
        """ Check if there are possible typos in token and show warnings.

        Args:
            token (str): Word to check.
            section (str): Name of the section the token belongs to.

        """
        if not re.match(r'%s' % TSVConverter._page_pattern, token):

            # word mixing Arabic and non-Arabic characters
            if any(util.isArabicalpha(c) for c in token) and \
               any(not util.isArabicalpha(c) for c in token):
                print('Warning in section "%s" of scan %s: word "%s" may contain a typo (non-Arabic chars inside word)'
                      % (section, self.title, token), file=sys.stderr)

            # exceeds max length
            if len(token) > TSVConverter._MAX_LEN_WORD:
                print('Warning in section "%s" of scan %s: word "%s" may contain a typo (word too long)'
                      % (section, self.title, token), file=sys.stderr)

            # ta marbuta (U+0629) must be the last character, or the one before
            # last if the word carries a final case vowel
            if len(token) > 4:
                if 'ة' in token[1:-3]:
                    print('Warning in section "%s" of scan %s: word "%s" may contain a typo (ta marbuta in the middle)'
                          % (section, self.title, token), file=sys.stderr)

            # there cannot be two or more vocalic diacritics in a row
            if TSVConverter._VOWELS_ERROR.search(token):
                print('Warning in section "%s" of scan %s: there are 2 or more vocalic diacritics together in token "%s"'
                      % (section, self.title, token), file=sys.stderr)
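
    # A few hypothetical tokens that would trip the checks above (the exact
    # length threshold and diacritic set come from config.ini):
    #   'كتابx'   -> mixes Arabic and non-Arabic characters
    #   'مدةرسة'  -> ta marbuta stranded in the middle of the word
    #   a token with two consecutive fathas -> doubled vocalic diacritic,
    #   assuming fatha is listed under [arabic vocalic diacritics]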

    def convert(self):
        """ Parses the json with section, page and text info and dumps it all in tsv format.

        Returns:
            str: Sequence of lines corresponding to the tsv.

        Raises:
            Exception: Reraises exceptions caught by _tokenizerWrapper.
            ValueError: If page info is not parsed correctly.

        Example:
            >>> input = {"title": "Nabrawi.djvu", "content": [{"section": "section 1", "text": \
            ... "PAGE٥٤EGAP \n الطائعين بغير الايمان"}, {"section": "section 2", "text": \
            ... "نااش حخن شحخسي ش حرة ودقيق PAGE٥٥EGAP ة ومتكاملة ومتنوعة ومحايدة، PAGE٤٤EGAP يستطيع الجميع المساهمة في"}]}
            >>> tsv = TSVConverter(json.dumps(input))
            >>> tsvout = tsv.convert()
            >>> for t in tsvout.splitlines(): print(t)
            ...
             # webanno.custom.section | sectionname # webanno.custom.page | sectionpage

            #id=1
            #text=
            الطائعين بغير الايمان
            1-1 الطائعين B-section 1 B-٥٤
            1-2 بغير I-section 1 I-٥٤
            1-3 الايمان I-section 1 I-٥٤

            #id=2
            #text=نااش حخن شحخسي ش حرة ودقيق ة ومتكاملة ومتنوعة ومحايدة، يستطيع الجميع المساهمة في
            2-1 نااش B-section 2 I-٥٤
            2-2 حخن I-section 2 I-٥٤
            (...)

        """
        out = []
        cnt_sentence = 0
        pageinfo = sectioninfo = ''
        newpage = False

        for chunk in self.content:
            newsection = True

            section = chunk['section']
            text = chunk['text']

            if section:
                sectioninfo = 'B-%s' % section

            # exceptions raised by the tokenizer wrapper propagate to the caller
            tokenized = self._tokenizerWrapper(text)

            for item in tokenized:
                cnt_sentence += 1

                sentence = item['sentence']  # str
                tokens = item['tokens']      # list of strings

                cleantxt = re.sub(r'%s' % TSVConverter._page_pattern, '', sentence)
                if TSVConverter._pagekw_out_open in cleantxt:
                    raise ValueError('Bad format for page info in scan "%s". '
                                     'Call the administrator.' % self.title)

                out.append('\n#id=%d' % cnt_sentence)
                out.append('#text=%s' % cleantxt)

                cnt_token = 0
                for token in tokens:

                    # check for typos in token
                    self._error_checker(token, section)

                    # new page found, start B tag; the page marker itself
                    # is not counted as a token
                    if TSVConverter._pagekw_out_open in token:
                        pagefound = re.match('^%s$' % TSVConverter._page_pattern, token)
                        if not pagefound or len(pagefound.groups()) != 1:
                            raise ValueError('Page information not well formatted in scan "%s".' % self.title)
                        pageinfo = 'B-%s' % pagefound.group(1)
                        newpage = True
                        continue

                    cnt_token += 1

                    if sectioninfo and not newsection:
                        sectioninfo = 'I-%s' % section
                    if pageinfo and not newpage:
                        pageinfo = 'I' + pageinfo[1:]

                    newpage = False
                    newsection = False

                    entry = '%d-%d\t%s\t%s\t%s' % (cnt_sentence, cnt_token, token,
                                                   sectioninfo, pageinfo)
                    out.append(re.sub('\t+', '\t', entry))
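
                    # Illustrative trace (hypothetical tokens): for a first sentence
                    # ['PAGE٥٤EGAP', 'كتاب', 'جديد'] in a section named "s", the rows
                    # written are:
                    #   1-1  كتاب  B-s  B-٥٤
                    #   1-2  جديد  I-s  I-٥٤
                    # the page marker consumes no token number.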

        header = ''

        if sectioninfo:
            header += ' # webanno.custom.%s | %s' % (TSVConverter._section_label,
                                                     TSVConverter._section_feature)

        if pageinfo:
            header += ' # webanno.custom.%s | %s' % (TSVConverter._page_label,
                                                     TSVConverter._page_feature)

        out.insert(0, header)
        return '\n'.join(out)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Convert json into tsv')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
                        help='input file to parse [DEFAULT stdin]', metavar='infile.json')
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file to create [DEFAULT stdout]', metavar='outfile.tsv')
    args = parser.parse_args()

    tsv = TSVConverter(args.infile.read())

    try:
        tsvout = tsv.convert()
    except Exception as e:
        print('Fatal error in TSVConverter: %s' % e, file=sys.stderr)
        sys.exit(1)

    print(tsvout, file=args.outfile)
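
# Since infile and outfile default to stdin and stdout, the converter can also
# be used in a pipeline (illustrative invocation):
#   $ cat scan.json | python tsvconverter.py > scan.tsv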