scalabha /src/main/scala/opennlp/scalabha/tree/Tok2Trees.scala

Language Scala Lines 169
MD5 Hash 17e2fae1f49c5078a182f408db5608b3
Repository https://bitbucket.org/dmateescu/scalabha View Raw File View Project SPDX
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
package opennlp.scalabha.tree

import scala.xml._
import org.clapper.argot.ArgotParser._
import opennlp.scalabha.log.SimpleLogger
import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
import java.io._
import org.xml.sax.SAXParseException
import ArgotConverters._
import com.sun.org.apache.xpath.internal.operations.Mult
import opennlp.scalabha.model.{TreeNode, Value, Node}

object Tok2Trees {
  // Command-line interface. The flag and option objects below are created from `parser`
  // and are read back in `main` after the arguments have been parsed.
  val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))
  val help = parser.flag[Boolean](List("h", "help"), "print help")
  // The input is expected to be tokenized already (".tok" files); this tool only builds trees.
  val inputOpt = parser.option[String](List("i", "inputTokens"), "FILE_OR_DIR",
    "Input token file (.tok), or a directory to search for token files")
  val outputOpt = parser.option[String](List("o", "outputTrees"), "DIR", "Output location for the tree files. " +
    "Each tree gets its own file, and they are named from the input file.")

  val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridiculous quantities of output.")

  // Logger shared by this object. It is constructed with SimpleLogger.WARN and writes to
  // standard error; `main` raises the level to DEBUG when the debug flag is set.
  var log: SimpleLogger = new SimpleLogger(
    this.getClass.getName,
    SimpleLogger.WARN,
    new BufferedWriter(new OutputStreamWriter(System.err)))

  // Tokens that are tagged with themselves; any token not listed here gets the tag "x".
  val tagDictionary = List(".", ",", "...", "?", "!")
    .map(mark => mark -> mark)
    .toMap
    .withDefaultValue("x")

  /**
   * Build a flat tree from one tokenized line.
   *
   * The line is cut into sentences at every "<EOS>" marker; blank sentences are dropped.
   * Each sentence becomes an "S" node whose children are one node per whitespace-separated
   * token, tagged via tagDictionary. All "S" nodes hang off a single "TOP" node.
   *
   * @param tokLine a line of tokenized text
   * @return the "TOP" node of the resulting tree
   */
  def getTree(tokLine: String): Node = {
    // Parentheses are rewritten to -LRB-/-RRB- before splitting, presumably so that literal
    // brackets in the text cannot be confused with the brackets of the tree notation.
    val escaped = tokLine.replaceAll("\\(", "-LRB-").replaceAll("\\)", "-RRB-")

    val sentenceNodes =
      for {
        chunk <- escaped.split("<EOS>").toList
        sentence = chunk.trim
        if sentence.length > 0
      } yield {
        val wordNodes =
          for (token <- sentence.split("\\s+").toList)
            yield Node(tagDictionary(token), List[Value](Value(token)))
        Node("S", wordNodes)
      }

    Node("TOP", sentenceNodes)
  }

  /**
   * Render the tree for a tokenized line as a string, with every "(S" moved onto a new line
   * and indented by four spaces (any whitespace directly before it is consumed).
   */
  def getFormattedString(tokLine: String): String = {
    val canonical = getTree(tokLine).getCanonicalString
    canonical.replaceAll("\\s*\\(S", "\n    (S")
  }

  /**
   * Convenience entry point: turn one tokenized line into its formatted tree string.
   * Delegates to getFormattedString.
   * @param tokLine a line of whitespace-separated tokens
   * @return the string representation of the rudimentary syntax tree
   */
  def apply(tokLine: String): String = {
    val rendered = getFormattedString(tokLine)
    rendered
  }

  // Tags that getTree produces by itself: the structural tags "TOP" and "S", the default word
  // tag "x", and every tag that tagDictionary can assign. The dictionary tags have to be part
  // of this list; without them a generated tree that contains punctuation would be judged
  // hand-edited and okToWrite would refuse to regenerate it.
  // FIXME dry: "TOP", "S" and "x" still duplicate literals used by getTree and tagDictionary.
  val autoGenTags: List[String] = (List("TOP", "S", "x") ++ tagDictionary.values).distinct
  // True when every tag reported by the node is one of the autogenerated tags.
  val autoGenOk: (TreeNode) => Boolean =
    (node) => node.getTagStrings.forall((str) => autoGenTags.contains(str))

  /**
   * Decide whether a tree file may be written.
   *
   * Writing is allowed when nothing exists at the path yet, or when the existing file is
   * writable and none of the trees parsed from it fails autoGenOk (i.e. it still looks
   * autogenerated).
   */
  def okToWrite(file: File): Boolean =
    if (!file.exists()) {
      true
    } else if (!file.canWrite) {
      // Not writable: refuse without parsing the file.
      false
    } else {
      val modifiedTrees = MultiLineTreeParser(file.getPath).count(tree => !autoGenOk(tree))
      modifiedTrees == 0
    }

  /**
   * Transform a token file into a directory of rudimentary tree files.
   *
   * Line k of the input (counting from 1) is written to treeDir/baseName.k.tree, where
   * baseName is the input file name without its ".tok" suffix and k is zero-padded to the
   * number of digits of the line count. An existing output file that fails okToWrite is left
   * untouched and a warning is logged for it.
   *
   * @param inputFile A file consisting of lines of tokenized text, with sentences delimited by <EOS> tags
   * @param treeDir The directory to write trees to. Each tree (corresponding to a line in the token file)
   * gets its own file.
   * @return Nothing. The output is written to treeDir.
   */
  def apply(inputFile: File, treeDir: File): Unit = {
    log.debug("Started file transform in:%s out:%s\n".format(inputFile.getPath, treeDir.getPath))
    assert(inputFile.isFile, "input file is not a file.")
    assert(inputFile.getName.endsWith(".tok"))
    // Strip the four characters of ".tok".
    val baseName = inputFile.getName.substring(0, inputFile.getName.length() - 4)
    log.debug("Making parent directories and text file\n")
    treeDir.mkdirs()
    log.info("%s -> %s/%s.{tree#...}.tree\n".format(inputFile.getAbsolutePath, treeDir.getAbsolutePath, baseName))

    // The whole input file is read up front on purpose: we are doing a lot of small write
    // jobs and do not want to interleave them with reading sub-file chunks.
    val source = scala.io.Source.fromFile(inputFile, "UTF-8")
    val lines =
      try {
        source.getLines().toList
      } finally {
        // Release the file handle even if reading or decoding fails.
        source.close()
      }

    // Number of digits needed for the largest index, so file names sort in line order.
    val width = lines.length.toString.length
    for ((line, i) <- lines.zipWithIndex) {
      val index = i + 1
      val outputFile = new File(treeDir, ("%s.%0" + width + "d.tree").format(baseName, index))
      if (okToWrite(outputFile)) {
        log.trace("Writing %s.\n".format(outputFile.getPath))
        // Build the tree before opening the file, so a failure here cannot leave a truncated file.
        val treeString = apply(line)
        val writer = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")
        try {
          writer.write(treeString + "\n")
        } finally {
          writer.close()
        }
      } else {
        log.warn(("File %s: This file looks like it's been modified." +
          " Delete it and re-run this program if you want to overwrite it. Skipping...\n").format(outputFile.getPath))
      }
    }
  }

  /**
   * Descend a directory structure looking for token files, and recreate the same directory structure
   * with tree files, re-rooted at treeDir.
   *
   * Sub-directories are processed recursively under a directory of the same name inside
   * treeDir. Each "NAME.tok" file is converted into the directory treeDir/NAME. Other files
   * are ignored. Children are visited in sorted order.
   *
   * @param inputDir the directory to search; must be a directory
   * @param treeDir the directory under which the output is created
   */
  def applyDir(inputDir: File, treeDir: File): Unit = {
    assert(inputDir.isDirectory)
    // File.listFiles returns null (not an empty array) when the directory cannot be listed.
    Option(inputDir.listFiles()) match {
      case None =>
        log.warn("Directory %s: Could not list the contents of this directory. Skipping...\n".format(inputDir.getPath))
      case Some(children) =>
        for (child <- children.sorted) {
          if (child.isDirectory) {
            val pathDescentStep = child.getName
            applyDir(child, new File(treeDir, pathDescentStep))
          } else if (child.isFile && child.getName.endsWith(".tok")) {
            // Output directory is named after the token file, minus its ".tok" suffix.
            apply(child, new File(treeDir, child.getName.substring(0, child.getName.length() - 4)))
          }
        }
    }
  }

  /**
   * Command-line entry point.
   *
   * Parses the arguments, converts the given token file (or every token file under the given
   * directory) into tree files, logs a summary, and exits with the number of errors reported
   * by the logger as the process exit code. Usage problems (and the help flag) print the
   * parser's message and exit with code 0.
   *
   * @param args the command-line arguments
   */
  def main(args: Array[String]): Unit = {
    var errors = 0
    try {
      parser.parse(args)

      if (help.value.isDefined) {
        parser.usage()
      }
      if (debug.value.isDefined) {
        log.logLevel = SimpleLogger.DEBUG
      }
      // Keep the tree parser's logger at the same level as ours.
      MultiLineTreeParser.log.logLevel = log.logLevel
      val inputFile = inputOpt.value match {
        case Some(filename) => new File(filename).getAbsoluteFile
        case None => parser.usage("You must specify an input file or directory")
      }
      val treeDir = outputOpt.value match {
        case Some(filename) => new File(filename)
        case None => parser.usage("You must specify an output directory")
      }
      if (inputFile.isFile) {
        apply(inputFile, treeDir)
      } else if (inputFile.isDirectory) {
        applyDir(inputFile, treeDir)
      } else {
        parser.usage("input must be an existing file or directory")
      }
      val (transformWarnings, transformErrors) = log.getStats()
      errors = transformErrors
      log.summary("Warnings,Errors: %s\n".format((transformWarnings, transformErrors)))
    }
    catch {
      case e: ArgotUsageException =>
        println(e.message)
    }
    System.exit(errors)
  }

}
Back to Top