
/TEES/Tools/StanfordParser.py

https://bitbucket.org/yumyai/tees
Python | 481 lines | 432 code | 9 blank | 40 comment | 7 complexity
import sys, os
import shutil
import subprocess
import tempfile
import tarfile
import codecs
import time # used below for time.strftime (otherwise only available via the star import from ProcessUtils)
from ProcessUtils import *
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),"..")))
import TEES.Utils.ElementTreeUtils as ETUtils
import TEES.Utils.Settings as Settings
import TEES.Utils.Download as Download
from TEES.Tools import Tool
#stanfordParserDir = "/home/jari/biotext/tools/stanford-parser-2010-08-20"
#stanfordParserDir = "/home/jari/temp_exec/stanford-parser-2010-08-20"
#stanfordParserDir = Settings.STANFORD_PARSER_DIR
#stanfordParserArgs = ["java", "-mx150m", "-cp",
#                      "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
#                      "-CCprocessed", "-treeFile", "-keepPunct"]
#stanfordParserArgs = ["java", "-mx500m", "-cp",
#                      "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
#                      "-CCprocessed", "-keepPunct", "-treeFile"]
escDict={"-LRB-":"(",
         "-RRB-":")",
         "-LCB-":"{",
         "-RCB-":"}",
         "-LSB-":"[",
         "-RSB-":"]",
         "``":"\"",
         "''":"\""}
def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    print >> sys.stderr, "Installing Stanford Parser"
    if downloadDir is None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
    if destDir is None:
        destDir = os.path.join(Settings.DATAPATH, "tools/")
    items = Download.downloadAndExtract(Settings.URL["STANFORD_PARSER"], destDir, downloadDir)
    stanfordPath = Download.getTopDir(destDir, items)
    Tool.finalizeInstall(["stanford-parser.jar"],
                         {"stanford-parser.jar":"java -cp stanford-parser.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure"},
                         stanfordPath, {"STANFORD_PARSER_DIR":stanfordPath}, updateLocalSettings)
#    url = URL["STANFORD_PARSER"]
#    packageName = url.split("/")[-1].split(".")[0]
#    # Download
#    if downloadDir is None:
#        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
#    downloadFile = Download.download(url, downloadDir, clear=redownload)
#    # Prepare destination
#    if destDir is None:
#        destDir = os.path.join(Settings.DATAPATH, "tools/")
#    installDir = os.path.join(destDir, packageName)
#    if os.path.exists(installDir):
#        print >> sys.stderr, "Removing existing installation at", installDir
#        shutil.rmtree(installDir)
#    # Unpack
#    print >> sys.stderr, "Extracting", downloadFile, "to", destDir
#    f = tarfile.open(downloadFile, 'r:gz')
#    f.extractall(destDir)
#    f.close()
#
#    if test(destDir):
#        Settings.setLocal("STANFORD_PARSER_DIR", destDir, updateLocalSettings)
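# Example (based on the __main__ section at the end of this file): installation can be
# triggered from the command line, which downloads and unpacks the parser archive under
# Settings.DATAPATH and records STANFORD_PARSER_DIR in the local settings:
#
#   python StanfordParser.py --install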
def runStanford(input, output, stanfordParserArgs):
    #global stanfordParserArgs
    ##args = ["java", "-mx150m", "-cp", "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure", "-CCprocessed", "-treeFile", input]
    #args = ["java", "-mx500m", "-cp", "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure", "-CCprocessed", "-treeFile", input]
    #return subprocess.Popen(args, stdout=codecs.open(output, "wt", "utf-8"))
    return subprocess.Popen(stanfordParserArgs + [input], stdout=codecs.open(output, "wt", "utf-8"))
    #return subprocess.Popen(stanfordParserArgs + [input], stdout=codecs.open(output, "wt", "latin1", "replace"))
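# runStanford launches the dependency converter on a file of Penn trees (one tree per
# line, as written by convertXML below) and redirects its stdout to the output file;
# the caller supplies stanfordParserArgs and the input file name is appended as the
# final argument.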
def getUnicode(string):
    try:
        string = string.encode('raw_unicode_escape').decode('utf-8') # fix latin1?
    except:
        pass
    return string
def addDependencies(outfile, parse, tokenByIndex=None, sentenceId=None, skipExtra=0):
    global escDict
    escSymbols = sorted(escDict.keys())
    # A list of tokens for debugging
    tokens = []
    for key in sorted(tokenByIndex):
        tokens.append(tokenByIndex[key].get("text"))
    depCount = 1
    line = outfile.readline()
    #line = line.encode('raw_unicode_escape').decode('utf-8') # fix latin1?
    line = getUnicode(line)
    deps = []
    # BioNLP'09 Shared Task GENIA uses _two_ newlines to denote a failed parse (usually it's one,
    # the same as the BLLIP parser). To survive this, skipExtra can be used to define the number
    # of lines to skip, if the first line of a dependency parse is empty (indicating a failed parse).
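    # The input read here is the Stanford typed-dependency text format: one relation per
    # line, with a blank line terminating each sentence, e.g. (illustrative lines):
    #   nsubj(binds-2, Protein-1)
    #   dobj(binds-2, DNA-3)
    # Each "word-index" pair is split on its last hyphen below to recover the token text
    # and its 1-based token index.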
    if line.strip() == "" and skipExtra > 0:
        for i in range(skipExtra):
            outfile.readline()
    while line.strip() != "":
        # Add dependencies
        depType, rest = line.strip()[:-1].split("(")
        t1, t2 = rest.split(", ")
        t1Word, t1Index = t1.rsplit("-", 1)
        for escSymbol in escSymbols:
            t1Word = t1Word.replace(escSymbol, escDict[escSymbol])
        while not t1Index[-1].isdigit(): t1Index = t1Index[:-1] # invalid literal for int() with base 10: "7'"
        t1Index = int(t1Index)
        t2Word, t2Index = t2.rsplit("-", 1)
        for escSymbol in escSymbols:
            t2Word = t2Word.replace(escSymbol, escDict[escSymbol])
        while not t2Index[-1].isdigit(): t2Index = t2Index[:-1] # invalid literal for int() with base 10: "7'"
        t2Index = int(t2Index)
        # Make element
        #if depType == "root":
        #    assert t1Word == "ROOT"
        #    if tokenByIndex is not None and t2Index-1 in tokenByIndex:
        #        tokenByIndex[t2Index-1].set("stanford-root", "True")
        if depType != "root":
            dep = ET.Element("dependency")
            dep.set("id", "sd_" + str(depCount))
            alignmentError = False
            if tokenByIndex is not None:
                if t1Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t1Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                if t2Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t2Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                if t1Word != tokenByIndex[t1Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t1Word, tokenByIndex[t1Index-1].get("text"), t1Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") is None:
                        parse.set("stanfordAlignmentError", t1Word)
                if t2Word != tokenByIndex[t2Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t2Word, tokenByIndex[t2Index-1].get("text"), t2Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") is None:
                        parse.set("stanfordAlignmentError", t2Word)
                dep.set("t1", tokenByIndex[t1Index-1].get("id"))
                dep.set("t2", tokenByIndex[t2Index-1].get("id"))
            else:
                dep.set("t1", "bt_" + str(t1Index))
                dep.set("t2", "bt_" + str(t2Index))
            dep.set("type", depType)
            parse.insert(depCount-1, dep)
            depCount += 1
            if not alignmentError:
                deps.append(dep)
        line = outfile.readline()
        try:
            line = getUnicode(line)
            #line = line.encode('raw_unicode_escape').decode('utf-8') # fix latin1?
        except:
            print "Type", type(line)
            print "Repr", repr(line)
            print line
            raise
    return deps
#def convert(input, output=None):
#    global stanfordParserDir, stanfordParserArgs
#
#    workdir = tempfile.mkdtemp()
#    if output is None:
#        output = os.path.join(workdir, "stanford-output.txt")
#
#    input = os.path.abspath(input)
#    numCorpusSentences = 0
#    inputFile = codecs.open(input, "rt", "utf-8")
#    for line in inputFile:
#        numCorpusSentences += 1
#    inputFile.close()
#    cwd = os.getcwd()
#    os.chdir(stanfordParserDir)
#    #args = ["java", "-mx150m", "-cp",
#    #        "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
#    #        "-CCprocessed", "-treeFile", "-keepPunct",
#    #        input]
#    args = stanfordParserArgs + [input]
#    #subprocess.call(args,
#    process = subprocess.Popen(args,
#                               stdout=codecs.open(output, "wt", "utf-8"))
#    waitForProcess(process, numCorpusSentences, True, output, "StanfordParser", "Stanford Conversion")
#    os.chdir(cwd)
#
#    lines = None
#    if output is None:
#        outFile = codecs.open(output, "rt", "utf-8")
#        lines = outFile.readlines()
#        outFile.close()
#
#    shutil.rmtree(workdir)
#    return lines
def convertXML(parser, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None):
    #global stanfordParserDir, stanfordParserArgs
    if stanfordParserDir is None:
        stanfordParserDir = Settings.STANFORD_PARSER_DIR
    if stanfordParserArgs is None:
        # not sure how necessary the "-mx500m" option is, and how exactly Java
        # options interact, but adding user defined options from Settings.JAVA
        # after the "-mx500m" hopefully works.
        stanfordParserArgs = Settings.JAVA.split()[0:1] + ["-mx500m"] + \
                             Settings.JAVA.split()[1:] + \
                             ["-cp", "stanford-parser.jar",
                              "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
                              "-CCprocessed", "-keepPunct", "-treeFile"]
    print >> sys.stderr, "Running Stanford conversion"
    print >> sys.stderr, "Stanford tools at:", stanfordParserDir
    print >> sys.stderr, "Stanford tools arguments:", " ".join(stanfordParserArgs)
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "Stanford time stamp:", parseTimeStamp
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "Stanford parser workdir", workdir
    stanfordInput = os.path.join(workdir, "input")
    stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")
    # Put penn tree lines in input file
    existingCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.find("sentenceanalyses") is not None: # old format
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser":parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser":parser})
        if parse is None:
            print "NONE"
            continue
        if len(parse.findall("dependency")) > 0:
            if reparse: # remove existing stanford conversion
                for dep in parse.findall("dependency"):
                    parse.remove(dep)
                del parse.attrib["stanford"]
            else: # don't reparse
                existingCount += 1
                continue
        pennTree = parse.get("pennstring")
        if pennTree is None or pennTree == "":
            continue
        stanfordInputFile.write(pennTree + "\n")
    stanfordInputFile.close()
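    # At this point the temporary input file holds one Penn bracketing per sentence,
    # taken from each parse element's "pennstring" attribute, e.g. a line like
    # "(S (NP (NN Protein)) (VP (VBZ binds) (NP (NN DNA))))" (sentence is illustrative).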
    if existingCount != 0:
        print >> sys.stderr, "Skipping", existingCount, "already converted sentences."
    # Run Stanford parser
    stanfordOutput = runSentenceProcess(runStanford, stanfordParserDir, stanfordInput,
                                        workdir, True, "StanfordParser",
                                        "Stanford Conversion", timeout=600,
                                        outputArgs={"encoding":"latin1", "errors":"replace"},
                                        processArgs={"stanfordParserArgs":stanfordParserArgs})
    #stanfordOutputFile = codecs.open(stanfordOutput, "rt", "utf-8")
    stanfordOutputFile = codecs.open(stanfordOutput, "rt", "latin1", "replace")
    # Get output and insert dependencies
    noDepCount = 0
    failCount = 0
    sentenceCount = 0
    for document in corpusRoot.findall("document"):
        for sentence in document.findall("sentence"):
            # Get parse
            if sentence.find("sentenceanalyses") is not None: # old format
                sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
                parses = setDefaultElement(sentenceAnalyses, "parses")
                parse = getElementByAttrib(parses, "parse", {"parser":parser})
            else:
                analyses = setDefaultElement(sentence, "analyses")
                parse = getElementByAttrib(analyses, "parse", {"parser":parser})
            if parse is None:
                parse = ET.SubElement(analyses, "parse")
                parse.set("parser", "None")
            if reparse:
                assert len(parse.findall("dependency")) == 0
            elif len(parse.findall("dependency")) > 0: # don't reparse
                continue
            pennTree = parse.get("pennstring")
            if pennTree is None or pennTree == "":
                parse.set("stanford", "no_penn")
                continue
            parse.set("stanfordSource", "TEES") # parser was run through this wrapper
            parse.set("stanfordDate", parseTimeStamp) # links the parse to the log file
            # Get tokens
            if sentence.find("analyses") is not None:
                tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            else:
                tokenization = getElementByAttrib(sentence.find("sentenceanalyses").find("tokenizations"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            assert tokenization is not None
            count = 0
            tokenByIndex = {}
            for token in tokenization.findall("token"):
                tokenByIndex[count] = token
                count += 1
            # Insert dependencies
            origId = document.get("pmid")
            if origId is None:
                origId = document.get("origId")
            origId = str(origId)
            deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), origId))
            if len(deps) == 0:
                parse.set("stanford", "no_dependencies")
                noDepCount += 1
                if parse.get("stanfordAlignmentError") is not None:
                    failCount += 1
            else:
                parse.set("stanford", "ok")
                if parse.get("stanfordAlignmentError") is not None:
                    failCount += 1
                    parse.set("stanford", "partial")
            sentenceCount += 1
    stanfordOutputFile.close()
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    print >> sys.stderr, "Stanford conversion was done for", sentenceCount, "sentences,", noDepCount, "had no dependencies,", failCount, "failed"
    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def insertParse(sentence, stanfordOutputFile, parser, extraAttributes={}, skipExtra=0):
    # Get parse
    analyses = setDefaultElement(sentence, "analyses")
    #parses = setDefaultElement(sentenceAnalyses, "parses")
    parse = getElementByAttrib(analyses, "parse", {"parser":parser})
    if parse is None:
        parse = ET.SubElement(analyses, "parse")
        parse.set("parser", "None")
    # Remove existing dependencies
    if len(parse.findall("dependency")) > 0:
        for dependency in parse.findall("dependency"):
            parse.remove(dependency)
    # If no penn tree exists, the stanford parsing can't have happened either
    pennTree = parse.get("pennstring")
    if pennTree is None or pennTree == "":
        parse.set("stanford", "no_penn")
    # Must not exit early, so that reading of the stanfordOutputFile stays in sync with the sentences
    #if len(parse.findall("dependency")) > 0: # don't reparse
    #    return True
    #pennTree = parse.get("pennstring")
    #if pennTree is None or pennTree == "":
    #    parse.set("stanford", "no_penn")
    #    return False
    for attr in sorted(extraAttributes.keys()):
        parse.set(attr, extraAttributes[attr])
    # Get tokens
    tokenByIndex = {}
    tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
    if tokenization is not None:
        count = 0
        for token in tokenization.findall("token"):
            tokenByIndex[count] = token
            count += 1
    # Insert dependencies
    deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), sentence.get("origId")), skipExtra=skipExtra)
    if len(deps) == 0:
        parse.set("stanford", "no_dependencies")
    else:
        parse.set("stanford", "ok")
    return True
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}, skipExtra=0):
    """
    Insert pre-existing Stanford dependency parses from the files under parsePath
    into the corresponding sentence elements of the corpus.
    """
    import tarfile
    from SentenceSplitter import openFile
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    print >> sys.stderr, "Inserting parses from", parsePath
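    # parsePath may point inside a tar.gz archive, e.g. a (hypothetical) path
    # "/data/parses.tar.gz/mccc/sd": everything up to and including ".tar.gz" is opened
    # as the archive and the remainder is used as a path prefix within it.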
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None
    docCount = 0
    failCount = 0
    sentenceCount = 0
    docsWithStanford = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        origId = document.get("pmid")
        if origId is None:
            origId = document.get("origId")
        origId = str(origId)
        if docId is None:
            docId = "CORPUS.d" + str(docCount)
        f = openFile(os.path.join(parsePath, origId + ".sd"), tarFile)
        if f is None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, origId + ".dep"), tarFile)
        if f is not None:
            sentences = document.findall("sentence")
            # TODO: Following for-loop is the same as when used with a real parser, and should
            # be moved to its own function.
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents ("+sentence.get("id")+"/" + origId + "): ")
                if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes, skipExtra=skipExtra): # forward the caller's extraAttributes
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents ("+document.get("id")+"/" + origId + "): ")
    if tarFile is not None:
        tarFile.close()
    #print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    #print >> sys.stderr, docsWithSentences, "/", docCount, "documents have stanford parses"
    print >> sys.stderr, "Stanford conversion was inserted to", sentenceCount, "sentences" #, failCount, "failed"
    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
if __name__=="__main__":
    import sys
    from optparse import OptionParser, OptionGroup
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    optparser = OptionParser(description="Stanford Parser dependency converter wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Name of parse element.")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--reparse", default=False, action="store_true", dest="reparse", help="")
    group = OptionGroup(optparser, "Install Options", "")
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install the Stanford Parser")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()
    if options.install:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
    else:
        convertXML(input=options.input, output=options.output, parser=options.parse, debug=options.debug, reparse=options.reparse)