
/TEES/Tools/StanfordParser.py

https://bitbucket.org/yumyai/tees
Python | 481 lines | 432 code | 9 blank | 40 comment | 7 complexity
import sys, os
import shutil
import subprocess
import tempfile
import tarfile
import codecs
import time # used below for time.strftime (otherwise only available via the star import from ProcessUtils)
from ProcessUtils import *
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),"..")))
import TEES.Utils.ElementTreeUtils as ETUtils
import TEES.Utils.Settings as Settings
import TEES.Utils.Download as Download
from TEES.Tools import Tool
#stanfordParserDir = "/home/jari/biotext/tools/stanford-parser-2010-08-20"
#stanfordParserDir = "/home/jari/temp_exec/stanford-parser-2010-08-20"
#stanfordParserDir = Settings.STANFORD_PARSER_DIR
#stanfordParserArgs = ["java", "-mx150m", "-cp",
#                      "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
#                      "-CCprocessed", "-treeFile", "-keepPunct"]
#stanfordParserArgs = ["java", "-mx500m", "-cp",
#                      "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
#                      "-CCprocessed", "-keepPunct", "-treeFile"]
escDict={"-LRB-":"(",
         "-RRB-":")",
         "-LCB-":"{",
         "-RCB-":"}",
         "-LSB-":"[",
         "-RSB-":"]",
         "``":"\"",
         "''":"\""}
def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    print >> sys.stderr, "Installing Stanford Parser"
    if downloadDir is None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
    if destDir is None:
        destDir = os.path.join(Settings.DATAPATH, "tools/")
    items = Download.downloadAndExtract(Settings.URL["STANFORD_PARSER"], destDir, downloadDir)
    stanfordPath = Download.getTopDir(destDir, items)
    Tool.finalizeInstall(["stanford-parser.jar"],
                         {"stanford-parser.jar":"java -cp stanford-parser.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure"},
                         stanfordPath, {"STANFORD_PARSER_DIR":stanfordPath}, updateLocalSettings)
#    url = URL["STANFORD_PARSER"]
#    packageName = url.split("/")[-1].split(".")[0]
#    # Download
#    if downloadDir is None:
#        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
#    downloadFile = Download.download(url, downloadDir, clear=redownload)
#    # Prepare destination
#    if destDir is None:
#        destDir = os.path.join(Settings.DATAPATH, "tools/")
#    installDir = os.path.join(destDir, packageName)
#    if os.path.exists(installDir):
#        print >> sys.stderr, "Removing existing installation at", installDir
#        shutil.rmtree(installDir)
#    # Unpack
#    print >> sys.stderr, "Extracting", downloadFile, "to", destDir
#    f = tarfile.open(downloadFile, 'r:gz')
#    f.extractall(destDir)
#    f.close()
#
#    if test(destDir):
#        Settings.setLocal("STANFORD_PARSER_DIR", destDir, updateLocalSettings)
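# Example (based on the __main__ section at the end of this file): installation can be
# triggered from the command line, which downloads and unpacks the parser archive under
# Settings.DATAPATH and records STANFORD_PARSER_DIR in the local settings:
#
#   python StanfordParser.py --install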
def runStanford(input, output, stanfordParserArgs):
    #global stanfordParserArgs
    ##args = ["java", "-mx150m", "-cp", "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure", "-CCprocessed", "-treeFile", input]
    #args = ["java", "-mx500m", "-cp", "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure", "-CCprocessed", "-treeFile", input]
    #return subprocess.Popen(args, stdout=codecs.open(output, "wt", "utf-8"))
    return subprocess.Popen(stanfordParserArgs + [input], stdout=codecs.open(output, "wt", "utf-8"))
    #return subprocess.Popen(stanfordParserArgs + [input], stdout=codecs.open(output, "wt", "latin1", "replace"))
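# runStanford launches the dependency converter on a file of Penn trees (one tree per
# line, as written by convertXML below) and redirects its stdout to the output file;
# the caller supplies stanfordParserArgs and the input file name is appended as the
# final argument.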
def getUnicode(string):
    try:
        string = string.encode('raw_unicode_escape').decode('utf-8') # fix latin1?
    except:
        pass
    return string
def addDependencies(outfile, parse, tokenByIndex=None, sentenceId=None, skipExtra=0):
    global escDict
    escSymbols = sorted(escDict.keys())
    # A list of tokens for debugging
    tokens = []
    for key in sorted(tokenByIndex):
        tokens.append(tokenByIndex[key].get("text"))
    depCount = 1
    line = outfile.readline()
    #line = line.encode('raw_unicode_escape').decode('utf-8') # fix latin1?
    line = getUnicode(line)
    deps = []
    # BioNLP'09 Shared Task GENIA uses _two_ newlines to denote a failed parse (usually it's one,
    # the same as the BLLIP parser). To survive this, skipExtra can be used to define the number
    # of lines to skip, if the first line of a dependency parse is empty (indicating a failed parse).
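    # The input read here is the Stanford typed-dependency text format: one relation per
    # line, with a blank line terminating each sentence, e.g. (illustrative lines):
    #   nsubj(binds-2, Protein-1)
    #   dobj(binds-2, DNA-3)
    # Each "word-index" pair is split on its last hyphen below to recover the token text
    # and its 1-based token index.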
    if line.strip() == "" and skipExtra > 0:
        for i in range(skipExtra):
            outfile.readline()
    while line.strip() != "":
        # Add dependencies
        depType, rest = line.strip()[:-1].split("(")
        t1, t2 = rest.split(", ")
        t1Word, t1Index = t1.rsplit("-", 1)
        for escSymbol in escSymbols:
            t1Word = t1Word.replace(escSymbol, escDict[escSymbol])
        while not t1Index[-1].isdigit(): t1Index = t1Index[:-1] # invalid literal for int() with base 10: "7'"
        t1Index = int(t1Index)
        t2Word, t2Index = t2.rsplit("-", 1)
        for escSymbol in escSymbols:
            t2Word = t2Word.replace(escSymbol, escDict[escSymbol])
        while not t2Index[-1].isdigit(): t2Index = t2Index[:-1] # invalid literal for int() with base 10: "7'"
        t2Index = int(t2Index)
        # Make element
        #if depType == "root":
        #    assert t1Word == "ROOT"
        #    if tokenByIndex is not None and t2Index-1 in tokenByIndex:
        #        tokenByIndex[t2Index-1].set("stanford-root", "True")
        if depType != "root":
            dep = ET.Element("dependency")
            dep.set("id", "sd_" + str(depCount))
            alignmentError = False
            if tokenByIndex is not None:
                if t1Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t1Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                if t2Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t2Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                if t1Word != tokenByIndex[t1Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t1Word, tokenByIndex[t1Index-1].get("text"), t1Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") is None:
                        parse.set("stanfordAlignmentError", t1Word)
                if t2Word != tokenByIndex[t2Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t2Word, tokenByIndex[t2Index-1].get("text"), t2Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") is None:
                        parse.set("stanfordAlignmentError", t2Word)
                dep.set("t1", tokenByIndex[t1Index-1].get("id"))
                dep.set("t2", tokenByIndex[t2Index-1].get("id"))
            else:
                dep.set("t1", "bt_" + str(t1Index))
                dep.set("t2", "bt_" + str(t2Index))
            dep.set("type", depType)
            parse.insert(depCount-1, dep)
            depCount += 1
            if not alignmentError:
                deps.append(dep)
        line = outfile.readline()
        try:
            line = getUnicode(line)
            #line = line.encode('raw_unicode_escape').decode('utf-8') # fix latin1?
        except:
            print "Type", type(line)
            print "Repr", repr(line)
            print line
            raise
    return deps
#def convert(input, output=None):
#    global stanfordParserDir, stanfordParserArgs
#
#    workdir = tempfile.mkdtemp()
#    if output is None:
#        output = os.path.join(workdir, "stanford-output.txt")
#
#    input = os.path.abspath(input)
#    numCorpusSentences = 0
#    inputFile = codecs.open(input, "rt", "utf-8")
#    for line in inputFile:
#        numCorpusSentences += 1
#    inputFile.close()
#    cwd = os.getcwd()
#    os.chdir(stanfordParserDir)
#    #args = ["java", "-mx150m", "-cp",
#    #        "stanford-parser.jar", "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
#    #        "-CCprocessed", "-treeFile", "-keepPunct",
#    #        input]
#    args = stanfordParserArgs + [input]
#    #subprocess.call(args,
#    process = subprocess.Popen(args,
#                               stdout=codecs.open(output, "wt", "utf-8"))
#    waitForProcess(process, numCorpusSentences, True, output, "StanfordParser", "Stanford Conversion")
#    os.chdir(cwd)
#
#    lines = None
#    if output is None:
#        outFile = codecs.open(output, "rt", "utf-8")
#        lines = outFile.readlines()
#        outFile.close()
#
#    shutil.rmtree(workdir)
#    return lines
def convertXML(parser, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None):
    #global stanfordParserDir, stanfordParserArgs
    if stanfordParserDir is None:
        stanfordParserDir = Settings.STANFORD_PARSER_DIR
    if stanfordParserArgs is None:
        # not sure how necessary the "-mx500m" option is, and how exactly Java
        # options interact, but adding user defined options from Settings.JAVA
        # after the "-mx500m" hopefully works.
        stanfordParserArgs = Settings.JAVA.split()[0:1] + ["-mx500m"] + \
                             Settings.JAVA.split()[1:] + \
                             ["-cp", "stanford-parser.jar",
                              "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
                              "-CCprocessed", "-keepPunct", "-treeFile"]
    print >> sys.stderr, "Running Stanford conversion"
    print >> sys.stderr, "Stanford tools at:", stanfordParserDir
    print >> sys.stderr, "Stanford tools arguments:", " ".join(stanfordParserArgs)
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "Stanford time stamp:", parseTimeStamp
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "Stanford parser workdir", workdir
    stanfordInput = os.path.join(workdir, "input")
    stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")
    # Put penn tree lines in input file
    existingCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.find("sentenceanalyses") is not None: # old format
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser":parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser":parser})
        if parse is None:
            print "NONE"
            continue
        if len(parse.findall("dependency")) > 0:
            if reparse: # remove existing stanford conversion
                for dep in parse.findall("dependency"):
                    parse.remove(dep)
                del parse.attrib["stanford"]
            else: # don't reparse
                existingCount += 1
                continue
        pennTree = parse.get("pennstring")
        if pennTree is None or pennTree == "":
            continue
        stanfordInputFile.write(pennTree + "\n")
    stanfordInputFile.close()
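    # At this point the temporary input file holds one Penn bracketing per sentence,
    # taken from each parse element's "pennstring" attribute, e.g. a line like
    # "(S (NP (NN Protein)) (VP (VBZ binds) (NP (NN DNA))))" (sentence is illustrative).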
    if existingCount != 0:
        print >> sys.stderr, "Skipping", existingCount, "already converted sentences."
    # Run Stanford parser
    stanfordOutput = runSentenceProcess(runStanford, stanfordParserDir, stanfordInput,
                                        workdir, True, "StanfordParser",
                                        "Stanford Conversion", timeout=600,
                                        outputArgs={"encoding":"latin1", "errors":"replace"},
                                        processArgs={"stanfordParserArgs":stanfordParserArgs})
    #stanfordOutputFile = codecs.open(stanfordOutput, "rt", "utf-8")
    stanfordOutputFile = codecs.open(stanfordOutput, "rt", "latin1", "replace")
    # Get output and insert dependencies
    noDepCount = 0
    failCount = 0
    sentenceCount = 0
    for document in corpusRoot.findall("document"):
        for sentence in document.findall("sentence"):
            # Get parse
            if sentence.find("sentenceanalyses") is not None: # old format
                sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
                parses = setDefaultElement(sentenceAnalyses, "parses")
                parse = getElementByAttrib(parses, "parse", {"parser":parser})
            else:
                analyses = setDefaultElement(sentence, "analyses")
                parse = getElementByAttrib(analyses, "parse", {"parser":parser})
            if parse is None:
                parse = ET.SubElement(analyses, "parse")
                parse.set("parser", "None")
            if reparse:
                assert len(parse.findall("dependency")) == 0
            elif len(parse.findall("dependency")) > 0: # don't reparse
                continue
            pennTree = parse.get("pennstring")
            if pennTree is None or pennTree == "":
                parse.set("stanford", "no_penn")
                continue
            parse.set("stanfordSource", "TEES") # parser was run through this wrapper
            parse.set("stanfordDate", parseTimeStamp) # links the parse to the log file
            # Get tokens
            if sentence.find("analyses") is not None:
                tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            else:
                tokenization = getElementByAttrib(sentence.find("sentenceanalyses").find("tokenizations"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            assert tokenization is not None
            count = 0
            tokenByIndex = {}
            for token in tokenization.findall("token"):
                tokenByIndex[count] = token
                count += 1
            # Insert dependencies
            origId = document.get("pmid")
            if origId is None:
                origId = document.get("origId")
            origId = str(origId)
            deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), origId))
            if len(deps) == 0:
                parse.set("stanford", "no_dependencies")
                noDepCount += 1
                if parse.get("stanfordAlignmentError") is not None:
                    failCount += 1
            else:
                parse.set("stanford", "ok")
                if parse.get("stanfordAlignmentError") is not None:
                    failCount += 1
                    parse.set("stanford", "partial")
            sentenceCount += 1
    stanfordOutputFile.close()
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    print >> sys.stderr, "Stanford conversion was done for", sentenceCount, "sentences,", noDepCount, "had no dependencies,", failCount, "failed"
    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def insertParse(sentence, stanfordOutputFile, parser, extraAttributes={}, skipExtra=0):
    # Get parse
    analyses = setDefaultElement(sentence, "analyses")
    #parses = setDefaultElement(sentenceAnalyses, "parses")
    parse = getElementByAttrib(analyses, "parse", {"parser":parser})
    if parse is None:
        parse = ET.SubElement(analyses, "parse")
        parse.set("parser", "None")
    # Remove existing dependencies
    if len(parse.findall("dependency")) > 0:
        for dependency in parse.findall("dependency"):
            parse.remove(dependency)
    # If no penn tree exists, the stanford parsing can't have happened either
    pennTree = parse.get("pennstring")
    if pennTree is None or pennTree == "":
        parse.set("stanford", "no_penn")
    # Must not exit early, so that reading of the stanfordOutputFile stays in sync with the sentences
    #if len(parse.findall("dependency")) > 0: # don't reparse
    #    return True
    #pennTree = parse.get("pennstring")
    #if pennTree is None or pennTree == "":
    #    parse.set("stanford", "no_penn")
    #    return False
    for attr in sorted(extraAttributes.keys()):
        parse.set(attr, extraAttributes[attr])
    # Get tokens
    tokenByIndex = {}
    tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
    if tokenization is not None:
        count = 0
        for token in tokenization.findall("token"):
            tokenByIndex[count] = token
            count += 1
    # Insert dependencies
    deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), sentence.get("origId")), skipExtra=skipExtra)
    if len(deps) == 0:
        parse.set("stanford", "no_dependencies")
    else:
        parse.set("stanford", "ok")
    return True
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}, skipExtra=0):
    """
    Insert pre-existing Stanford dependency parses from the files under parsePath
    into the corresponding sentence elements of the corpus.
    """
    import tarfile
    from SentenceSplitter import openFile
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    print >> sys.stderr, "Inserting parses from", parsePath
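    # parsePath may point inside a tar.gz archive, e.g. a (hypothetical) path
    # "/data/parses.tar.gz/mccc/sd": everything up to and including ".tar.gz" is opened
    # as the archive and the remainder is used as a path prefix within it.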
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None
    docCount = 0
    failCount = 0
    sentenceCount = 0
    docsWithStanford = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        origId = document.get("pmid")
        if origId is None:
            origId = document.get("origId")
        origId = str(origId)
        if docId is None:
            docId = "CORPUS.d" + str(docCount)
        f = openFile(os.path.join(parsePath, origId + ".sd"), tarFile)
        if f is None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, origId + ".dep"), tarFile)
        if f is not None:
            sentences = document.findall("sentence")
            # TODO: Following for-loop is the same as when used with a real parser, and should
            # be moved to its own function.
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents ("+sentence.get("id")+"/" + origId + "): ")
                if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes, skipExtra=skipExtra): # forward the caller's extraAttributes
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents ("+document.get("id")+"/" + origId + "): ")
    if tarFile is not None:
        tarFile.close()
    #print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    #print >> sys.stderr, docsWithSentences, "/", docCount, "documents have stanford parses"
    print >> sys.stderr, "Stanford conversion was inserted to", sentenceCount, "sentences" #, failCount, "failed"
    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
if __name__=="__main__":
    import sys
    from optparse import OptionParser, OptionGroup
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    optparser = OptionParser(description="Stanford Parser dependency converter wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Name of parse element.")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--reparse", default=False, action="store_true", dest="reparse", help="")
    group = OptionGroup(optparser, "Install Options", "")
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install the Stanford Parser")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()
    if options.install:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
    else:
        convertXML(input=options.input, output=options.output, parser=options.parse, debug=options.debug, reparse=options.reparse)