PageRenderTime 49ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/src/main/scala/org/fiasana/X2TXT.scala

https://bitbucket.org/dmateescu/scalabha
Scala | 187 lines | 147 code | 14 blank | 26 comment | 19 complexity | 0dce6e20cb6a8b500b03415e39737322 MD5 | raw file
Possible License(s): Apache-2.0
  1. package org.fiasana
  2. import scala.xml._
  3. import org.clapper.argot.ArgotParser._
  4. import opennlp.scalabha.log.SimpleLogger
  5. import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
  6. import java.io._
  7. import scala.sys.process._
  8. import org.xml.sax.SAXParseException
  9. import opennlp.scalabha.util.FileUtils
  10. import java.util.regex.Pattern
  11. import util.matching.Regex
  12. import ArgotConverters._
  /**
   * Command-line tool and library entry points for turning aligned,
   * multi-language XML files into one plain-text file per language.
   *
   * Each `align` node in the XML contributes exactly one line to every
   * language's output; sentences inside a `text` node are terminated with
   * the literal marker `<EOS>`.
   */
  object X2TXT {
    /** Argot command-line parser shared by all option definitions below. */
    val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))
    val help = parser.flag[Boolean](List("h", "help"), "print help")
    val input = parser.option[String](List("x", "xml-input"), "FILE_OR_DIR", "Input inputFile or directory to tokenize")
    // NOTE(review): the help text below promises a default output location, but
    // main() currently rejects a missing --text-output. Confirm which is intended.
    val textOutput = parser.option[String](List("t", "text-output"), "FILE_OR_DIR", "Output location for intermediate text files. " +
      "If none is specified, the input inputFile's directory will be used.")
    val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridicuous quantities of output.")

    /** Logger writing to stderr; main() raises its level to DEBUG when --debug is given. */
    var log: SimpleLogger = new SimpleLogger(
      this.getClass.getName,
      SimpleLogger.WARN,
      new BufferedWriter(new OutputStreamWriter(System.err)))

    /**
     * Extract the per-language text from a parsed XML tree.
     *
     * The set of expected languages is read from the comma-separated
     * `languages` attribute of the `file` node. For every `align` node, the
     * sentences (`s` nodes) of each `text` child are joined into one string,
     * each sentence followed by `<EOS>`. A language with no `text` node in an
     * align node contributes the placeholder `<EOS>` so that all languages
     * keep the same number of lines. Structural problems (missing, duplicated
     * or undeclared languages) are reported through `log.err`.
     *
     * @param xmlTree This is a parsed XML tree to be transformed to text
     * @param fileName This is the name of the file the XML came from. It's used for logging errors
     * @return A map from language name to list of text strings. Each string in the
     * list represents all the text for that language in an align node.
     * The text strings are in the same order they appeared in the XML.
     */
    def apply(xmlTree: Elem, fileName: String): Map[String, List[String]] = {
      val languages = (xmlTree \ "file" \ "@languages").text.split(",").toList.map(_.trim)
      val emptyResult: Map[String, List[String]] =
        languages.map(lang => (lang, List[String]())).toMap
      log.debug("Parsing XML\n")

      // Lines are prepended while folding (cheap on List) and reversed once at the end.
      val collected = (xmlTree \\ "align").foldLeft(emptyResult) {
        (resultSoFar, align) =>
          val langToText = (align \ "text").map(textNode => (
            (textNode \ "@langid").text,
            (textNode \ "s").map(sentenceNode =>
              "%s <EOS>".format(sentenceNode.text.replaceAll("\\n", " "))).mkString(" ")
          ))
          // Languages absent from this align node fall back to a bare end-of-sentence marker.
          val langToTextMap = langToText.toMap.withDefaultValue("<EOS>")
          reportAlignProblems(fileName, align, resultSoFar.keySet, langToText.length, langToTextMap.keySet)
          resultSoFar.map {
            case (lang, texts) => (lang, langToTextMap(lang) :: texts)
          }
      }

      collected.map {
        case (lang, texts) => (lang, texts.reverse)
      }
    }

    /**
     * Log an error for each structural problem found in a single align node:
     * declared languages that are missing, languages that occur more than
     * once, and languages that were never declared on the file node.
     */
    private def reportAlignProblems(fileName: String,
                                    align: Node,
                                    declaredLangs: Set[String],
                                    textNodeCount: Int,
                                    presentLangs: Set[String]): Unit = {
      val missingLangs = declaredLangs.diff(presentLangs)
      if (missingLangs.size > 0) {
        log.err(("In file %s, missing language%s \"%s\" " +
          "in the following align node. All align nodes must" +
          " contain a single text node for each language:\n%s\n\n\n")
          .format(fileName, if (missingLangs.size > 1) "s" else "",
          missingLangs.toList.sorted.mkString(","), align.toString()))
      }
      // Duplicate langids collapse when converted to a map, so a size mismatch means a repeat.
      if (textNodeCount != presentLangs.size) {
        log.err(("In file %s, there is more than one text node " +
          "for a language. All align nodes must contain a single " +
          "text node for each language:\n%s\n\n\n")
          .format(fileName, align.toString()))
      }
      val unknownLanguages = presentLangs.diff(declaredLangs)
      if (unknownLanguages.size > 0) {
        log.err("In file %s, found unknown language%s \"%s\" in align node:\n%s\n\n\n".format(
          fileName,
          if (unknownLanguages.size > 1) "s" else "",
          unknownLanguages.toList.sorted.mkString(","),
          align
        ))
      }
    }

    /**
     * Transform one XML file into one UTF-8 text file per language.
     *
     * Malformed XML and any other exception raised while reading or writing
     * are reported through `log.err` and do not propagate to the caller.
     *
     * @param inputFile This is the XML file to transform to text
     * @param textFile This is the prefix file to use for generating output files.
     * The way it works is that textFile's path gets appended with ".lang.txt", where
     * ".lang" is substituted for each of the languages specified in the XML file.
     *
     * @return Nothing. The output is written to the files generated from textFile.
     */
    def apply(inputFile: File, textFile: File): Unit = {
      log.debug("Started file transform\n")
      assert(inputFile.isFile, "input file is not a file.")
      // the output files should be ready to have ".LANG.txt" or ".LANG.tok" appended
      // ensure the appropriate parent dirs exist
      log.debug("Making parent directories and text file\n")
      // getParent is null for a bare relative name such as "out", so resolve the
      // absolute path first and tolerate a missing parent (e.g. a filesystem root).
      Option(textFile.getAbsoluteFile.getParentFile).foreach(_.mkdirs())
      log.debug("%s -> %s.{langs...}.txt\n".format(inputFile.getPath, textFile.getPath))
      try {
        log.debug("Extracting text from XML\n")
        val reader = new InputStreamReader(new FileInputStream(inputFile), "UTF-8")
        val textLines =
          try {
            apply(XML.load(reader), inputFile.getName)
          } finally {
            // Release the file handle even when parsing fails.
            reader.close()
          }
        log.debug("Opening output streams\n")
        textLines.foreach {
          case (lang, lines) =>
            val writer = new OutputStreamWriter(new FileOutputStream(
              new File("%s.%s.txt".format(textFile.getPath, lang))), "UTF-8")
            try {
              lines.foreach(line => writer.write(line + "\n"))
            } finally {
              // Close the writer even if a write fails part-way through.
              writer.close()
            }
        }
        // Only reached when the whole transform succeeded.
        log.debug("Exiting file transform\n")
      } catch {
        case e: SAXParseException =>
          log.err("Malformed XML in input file: %s, column: %s, line: %s, message: %s\n".format(inputFile.getAbsolutePath,
            e.getColumnNumber, e.getLineNumber, e.getMessage))
        case e: Exception =>
          log.err("Caught an error: %s".format(e.getMessage))
      }
    }

    /**
     * Recursively descend a directory structure, transforming XML to text files.
     * Children are visited in sorted order; only regular files whose name ends
     * in ".xml" are transformed.
     *
     * @param inputDir This is the root to start descending from
     * @param textDir This is the root to start creating text files at.
     * The directory structure in inputDir will be recreated in textDir, so
     * <em>in/A.xml</em> is transformed to <em>out/A.lang1.txt</em> and
     * <em>in/another/path/B.xml</em> is
     * transformed to <em>out/another/path/B.lang1.txt</em>.
     */
    def applyDir(inputDir: File, textDir: File): Unit = {
      assert(inputDir.isDirectory)
      // listFiles returns null (not an empty array) when the directory cannot be read.
      Option(inputDir.listFiles()) match {
        case None =>
          log.err("Could not list directory: %s\n".format(inputDir.getAbsolutePath))
        case Some(children) =>
          for (child <- children.sorted) {
            val childName = child.getName
            if (child.isDirectory) {
              applyDir(child, new File(textDir, childName))
            } else if (child.isFile && childName.endsWith(".xml")) {
              // Drop the ".xml" extension to form the output prefix.
              apply(child, new File(textDir, childName.substring(0, childName.length() - 4)))
            }
          }
      }
    }

    /**
     * Command-line entry point. Parses the arguments, transforms the given
     * file or directory tree, prints a warning/error summary and exits with
     * the number of logged errors as the process status (0 after a usage
     * message).
     *
     * @param args the raw command-line arguments
     */
    def main(args: Array[String]): Unit = {
      val exitCode: Int =
        try {
          parser.parse(args)
          if (help.value.isDefined) {
            parser.usage()
          }
          if (debug.value.isDefined) {
            log.logLevel = SimpleLogger.DEBUG
          }
          // parser.usage(...) throws ArgotUsageException, so both matches yield a File.
          val inputFile = input.value match {
            case Some(filename) => new File(filename).getAbsoluteFile
            case None => parser.usage("You must specify an input file")
          }
          val textFile = textOutput.value match {
            case Some(filename) => new File(filename)
            case None => parser.usage("You must specify a text file")
          }
          if (inputFile.isFile) {
            apply(inputFile, textFile)
          } else if (inputFile.isDirectory) {
            applyDir(inputFile, textFile)
          } else {
            parser.usage("input file must be a regular file")
          }
          val (transformWarnings, transformErrors) = log.getStats()
          log.summary("Warnings,Errors: %s\n".format((transformWarnings, transformErrors)))
          transformErrors
        } catch {
          case e: ArgotUsageException =>
            println(e.message)
            0
        }
      System.exit(exitCode)
    }
  }