PageRenderTime 51ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/scala/opennlp/scalabha/tree/Tok2Trees.scala

https://bitbucket.org/dmateescu/scalabha
Scala | 168 lines | 129 code | 16 blank | 23 comment | 18 complexity | 17e2fae1f49c5078a182f408db5608b3 MD5 | raw file
Possible License(s): Apache-2.0
  1. package opennlp.scalabha.tree
  2. import scala.xml._
  3. import org.clapper.argot.ArgotParser._
  4. import opennlp.scalabha.log.SimpleLogger
  5. import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
  6. import java.io._
  7. import org.xml.sax.SAXParseException
  8. import ArgotConverters._
  9. import com.sun.org.apache.xpath.internal.operations.Mult
  10. import opennlp.scalabha.model.{TreeNode, Value, Node}
  11. object Tok2Trees {
  12. val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))
  13. val help = parser.flag[Boolean](List("h", "help"), "print help")
  14. val inputOpt = parser.option[String](List("i", "inputTokens"), "FILE_OR_DIR", "Input inputFile or directory to tokenize")
  15. val outputOpt = parser.option[String](List("o", "outputTrees"), "DIR", "Output location for the tree files. " +
  16. "Each tree gets its own file, and they are named from the input file.")
  17. val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridicuous quantities of output.")
  18. var log: SimpleLogger = new SimpleLogger(
  19. this.getClass.getName,
  20. SimpleLogger.WARN,
  21. new BufferedWriter(new OutputStreamWriter(System.err)))
  22. val tagDictionary = Map(
  23. ("." -> "."),
  24. ("," -> ","),
  25. ("..." -> "..."),
  26. ("?" -> "?"),
  27. ("!" -> "!")
  28. ).withDefaultValue("x")
  29. def getTree(tokLine: String): Node =
  30. Node("TOP",
  31. tokLine
  32. .replaceAll("\\(", "-LRB-")
  33. .replaceAll("\\)", "-RRB-")
  34. .split("<EOS>")
  35. .map(s => s.trim)
  36. .filter(s => s.length > 0)
  37. .map(sentence => Node("S", sentence.split("\\s+").map(word => Node(tagDictionary(word), List[Value](Value(word)))).toList))
  38. .toList
  39. )
  40. def getFormattedString(tokLine: String): String = getTree(tokLine).getCanonicalString.replaceAll("\\s*\\(S", "\n (S")
  41. /**
  42. * Build a rudimentary syntax tree from a tokenized line.
  43. * @param tokLine A space-separated list of tokens
  44. * @return a string representation of a syntax tree.
  45. */
  46. def apply(tokLine: String): String = getFormattedString(tokLine)
  47. // These conspire to form a list of only tags that are not autogenerated
  48. val autoGenTags = List("TOP", "S", "x") //FIXME dry
  49. val autoGenOk: (TreeNode) => Boolean =
  50. (node) => {
  51. node.getTagStrings.filter((str) => (!autoGenTags.contains(str))).length == 0
  52. }
  53. /**
  54. * A file is ok to overwrite if it does not exist, or it is an autogenerated file, which we
  55. * can tell from the structure.
  56. */
  57. def okToWrite(file: File): Boolean = {
  58. val okNotExist = !file.exists()
  59. val okBoilerplate =
  60. (file.canWrite && MultiLineTreeParser(file.getPath).filter {
  61. (treeNode) => !autoGenOk(treeNode)
  62. }.length == 0)
  63. okNotExist || okBoilerplate
  64. }
  65. /**
  66. * Transform a token file into a directory of rudimentary tree file.
  67. * @param inputfile A file consisting of lines of tokenized text, with sentences delimited by <EOS> tags
  68. * @param treeDir The directory to write trees to. Each tree (corresponding to a line in the token file)
  69. * gets its own file.
  70. * @return Nothing. The output is written to treeDir.
  71. */
  72. def apply(inputFile: File, treeDir: File) {
  73. log.debug("Started file transform in:%s out:%s\n".format(inputFile.getPath, treeDir.getPath))
  74. assert(inputFile.isFile, "input file is not a file.")
  75. assert(inputFile.getName.endsWith(".tok"))
  76. val baseName = inputFile.getName.substring(0, inputFile.getName.length() - 4)
  77. log.debug("Making parent directories and text file\n")
  78. treeDir.mkdirs()
  79. log.info("%s -> %s/%s.{tree#...}.tree\n".format(inputFile.getAbsolutePath, treeDir.getAbsolutePath, baseName))
  80. // I'm reading the whole input file on purpose, since we're dong a lot of small write jobs,
  81. // I don't want to waste time reading in sub-file chunks.
  82. val lines = scala.io.Source.fromFile(inputFile, "UTF-8").getLines().toList
  83. val width = math.log10(lines.length).toInt + 1
  84. for ((line, i) <- lines.zipWithIndex) {
  85. val index = i + 1
  86. val outputFile = new File(treeDir, ("%s.%0" + width + "d.tree").format(baseName, index))
  87. if (okToWrite(outputFile)) {
  88. log.trace("Writing %s.\n".format(outputFile.getPath))
  89. val writer = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")
  90. val treeString = apply(line)
  91. writer.write(treeString + "\n")
  92. writer.close()
  93. } else {
  94. log.warn(("File %s: This file looks like it's been modified." +
  95. " Delete it and re-run this program if you want to overwrite it. Skipping...\n").format(outputFile.getPath))
  96. }
  97. }
  98. }
  99. /**
  100. * Descend a directory structure looking for token files, and recreate the same directory structure
  101. * with tree files, re-rooted at treeDir
  102. */
  103. def applyDir(inputDir: File, treeDir: File) {
  104. assert(inputDir.isDirectory)
  105. for (child <- inputDir.listFiles().sorted) {
  106. if (child.isDirectory) {
  107. val pathDescentStep = child.getName
  108. applyDir(child, new File(treeDir, pathDescentStep))
  109. } else if (child.isFile && child.getName.endsWith(".tok")) {
  110. apply(child, new File(treeDir, child.getName.substring(0, child.getName.length() - 4)))
  111. }
  112. }
  113. }
  114. def main(args: Array[String]) {
  115. var warnings = 0
  116. var errors = 0
  117. try {
  118. parser.parse(args)
  119. if (help.value.isDefined) {
  120. parser.usage()
  121. }
  122. if (debug.value.isDefined) {
  123. log.logLevel = SimpleLogger.DEBUG
  124. }
  125. MultiLineTreeParser.log.logLevel = log.logLevel
  126. val inputFile = inputOpt.value match {
  127. case Some(filename) => new File(filename).getAbsoluteFile
  128. case None => parser.usage("You must specify an input file")
  129. }
  130. val textFile = outputOpt.value match {
  131. case Some(filename) => new File(filename)
  132. case None => parser.usage("You must specify a text file")
  133. }
  134. if (inputFile.isFile) {
  135. apply(inputFile, textFile)
  136. } else if (inputFile.isDirectory) {
  137. applyDir(inputFile, textFile)
  138. } else {
  139. parser.usage("input file must be a regular file")
  140. }
  141. val (transformWarnings, transformErrors) = log.getStats()
  142. warnings = transformWarnings
  143. errors = transformErrors
  144. log.summary("Warnings,Errors: %s\n".format((warnings, errors)))
  145. }
  146. catch {
  147. case e: ArgotUsageException =>
  148. println(e.message)
  149. }
  150. System.exit(errors)
  151. }
  152. }