PageRenderTime 44ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/scala/org/fiasana/XmlToInfo.scala

https://bitbucket.org/dmateescu/scalabha
Scala | 183 lines | 149 code | 23 blank | 11 comment | 26 complexity | da8e45737e77bfd6a58d166659cadbd6 MD5 | raw file
Possible License(s): Apache-2.0
  1. package org.fiasana
  2. import scala.xml._
  3. import org.clapper.argot.ArgotParser._
  4. import opennlp.scalabha.log.SimpleLogger
  5. import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
  6. import java.io._
  7. import scala.sys.process._
  8. import org.xml.sax.SAXParseException
  9. import opennlp.scalabha.util.FileUtils
  10. import java.util.regex.Pattern
  /**
   * Command-line tool that reads dataset XML files and writes one `.trace`
   * file per input. Each output line starts with `::source "<input file name>"`
   * followed by `::data-*`, `::file-*`, `::meta-*`, `::unit-*` and `::note-*`
   * key/value pairs collected from the XML attributes.
   *
   * The input may be a single file, a directory, or (with `-R`) a directory tree.
   */
  object XmlToInfo {

    import ArgotConverters._

    /**
     * Logger used by `main`. It is created at `SimpleLogger.TRACE` level writing
     * to stderr; `main` swaps it for a `SimpleLogger.DEBUG` logger when the
     * debug flag is given. Kept as a `var` so that reassignment keeps working.
     */
    var log: SimpleLogger = new SimpleLogger(
      this.getClass.toString,
      SimpleLogger.TRACE,
      new BufferedWriter(new OutputStreamWriter(System.err)))

    /** Removes a trailing literal ".xml" from a file name (the dot is escaped on purpose). */
    private def stripXmlExtension(fileName: String): String =
      fileName.replaceFirst("\\.xml$", "")

    /**
     * Parses `inputFile` as UTF-8 XML. The reader is closed explicitly once
     * parsing finishes or fails, rather than relying on the parser to do so.
     */
    private def loadXml(inputFile: File): Elem = {
      val reader = new InputStreamReader(new FileInputStream(inputFile), "UTF-8")
      try {
        XML.load(reader)
      } finally {
        reader.close()
      }
    }

    /**
     * Renders every attribute with a non-empty value as `::<prefix>-<key> "<value>"`
     * and joins the results with single spaces. Values are written verbatim.
     */
    private def renderTags(prefix: String, attrs: Map[String, String]): String =
      attrs.iterator.collect {
        case (k, v) if v.length > 0 => "::%s-%s \"%s\"".format(prefix, k, v)
      }.mkString(" ")

    /**
     * Converts one XML file into `infoFileNameStripped + ".trace"`.
     *
     * One line is written for every `align` element found under any `unit`
     * element below the root's `file` children. If nothing is written the empty
     * output file is deleted. Malformed XML and other exceptions are reported
     * through `log.err` and end the conversion of this file without rethrowing.
     *
     * @param inputFile            the XML file to read; must be a regular file
     * @param infoFileNameStripped output path without extension; must not end in ".xml"
     * @param log                  logger that receives progress and error messages
     */
    def transformFile(inputFile: File, infoFileNameStripped: String, log: SimpleLogger): Unit = {
      log.debug("Started file transform\n")
      assert(inputFile.isFile, "input file is not a file.")
      // the output name should be ready to have ".trace" appended
      assert(!infoFileNameStripped.endsWith(".xml"))

      // ensure the appropriate parent dirs exist
      log.debug("Making parent directories\n")
      new File(FileUtils.getPathParent(infoFileNameStripped)).mkdirs()

      log.trace("%s -> %s.trace\n".format(inputFile.getPath, infoFileNameStripped))

      try {
        log.debug("Loading XML\n")
        val root = loadXml(inputFile)

        val datasetAttrs = root.attributes.asAttrMap
        log.debug(datasetAttrs.toString)

        val xmlTree = root \ "file"
        val fileName = inputFile.getName

        // Attributes of all <file> elements merged into one map (later entries win).
        val fileAttrs = xmlTree.iterator.flatMap(_.attributes.asAttrMap.iterator).toMap
        log.debug(fileAttrs.toString)

        // Attributes of all <metadata> children of the <file> elements, merged likewise.
        val metadataAttrs = (xmlTree \ "metadata").iterator.flatMap(_.attributes.asAttrMap.iterator).toMap
        log.debug(metadataAttrs.toString)

        // These three segments do not depend on the unit/align being written.
        val datasetTags = renderTags("data", datasetAttrs)
        val fileTags = renderTags("file", fileAttrs)
        val metadataTags = renderTags("meta", metadataAttrs)

        log.debug("Opening output streams\n")
        val infoFile = new File(infoFileNameStripped + ".trace")
        val infoFileWriter = new OutputStreamWriter(new FileOutputStream(infoFile), "UTF-8")

        try {
          log.debug("Parsing XML\n")
          (xmlTree \\ "unit").foreach { unit =>
            val unitTags = renderTags("unit", unit.attributes.asAttrMap)

            (unit \ "align").foreach { align =>
              // Each <note> contributes its attributes plus its text, keyed by the
              // note's position; all kinds of double quote in the text become "'".
              val noteAttrs = (align \ "note").zipWithIndex.iterator.flatMap {
                case (note, i) =>
                  val indexedAttrs = note.attributes.asAttrMap.toList.map {
                    case (k, v) => ("%d-%s".format(i, k), v)
                  }
                  val noteText = ("%d-text".format(i), note.text.replaceAll("\"|“|”", "'"))
                  (indexedAttrs ::: List(noteText)).iterator
              }.toMap

              val tags = List(datasetTags, fileTags, metadataTags, unitTags,
                renderTags("note", noteAttrs)).mkString(" ")
              infoFileWriter.write("::source \"%s\" %s\n".format(fileName, tags))
            }
          }
        } finally {
          // Close even when writing fails so the file handle is not leaked.
          log.debug("Closing streams\n")
          infoFileWriter.close()
        }

        if (infoFile.length() == 0) {
          infoFile.delete()
        }

        // Only reached on success, matching the early exit of the error paths.
        log.debug("Exiting file transform\n")
      } catch {
        case e: SAXParseException =>
          log.err("Malformed XML in input file: %s, column: %s, line: %s, message: %s\n".format(inputFile.getAbsolutePath,
            e.getColumnNumber, e.getLineNumber, e.getMessage))
        case e: Exception =>
          log.err("Caught an error: %s\n".format(e.getMessage))
      }
    }

    /**
     * Converts every regular file directly inside `inputDirectory` whose name
     * ends in ".xml". Sub-directories are not entered.
     *
     * @param inputDirectory     directory to scan
     * @param newSubdirectories  sub-path passed to `FileUtils.getStrippedOutputFileName`
     *                           (presumably appended below the output root -- confirm in FileUtils)
     * @param infoFileNameOption output root; when empty, each input file's own directory is used
     * @param log                logger that receives progress and error messages
     */
    def transformDirectory(inputDirectory: File, newSubdirectories: String,
                           infoFileNameOption: Option[String], log: SimpleLogger): Unit = {
      // File.listFiles returns null when the path is not a directory or cannot be read.
      Option(inputDirectory.listFiles()) match {
        case None =>
          log.err("Unable to list directory: %s\n".format(inputDirectory.getAbsolutePath))
        case Some(children) =>
          for (inputFile <- children if inputFile.isFile && inputFile.getName.endsWith(".xml")) {
            val infoFileNameStripped = FileUtils.getStrippedOutputFileName(
              infoFileNameOption.getOrElse(inputFile.getParent),
              newSubdirectories, stripXmlExtension(inputFile.getName))
            transformFile(inputFile, infoFileNameStripped, log)
          }
      }
    }

    /**
     * Converts the XML files in `inputDirectory` and then descends into each
     * sub-directory, extending `newSubdirectories` with the sub-directory's name
     * so the output mirrors the input tree.
     *
     * @param inputDirectory     directory at which to start
     * @param newSubdirectories  sub-path accumulated so far ("" at the top level)
     * @param infoFileNameOption output root; when empty, each input file's own directory is used
     * @param log                logger that receives progress and error messages
     */
    def transformDirectoryRecursive(inputDirectory: File, newSubdirectories: String,
                                    infoFileNameOption: Option[String], log: SimpleLogger): Unit = {
      // first, transform all the xml files at the current level
      transformDirectory(inputDirectory, newSubdirectories, infoFileNameOption, log)

      // then do the same for all the child directories; an unlistable directory
      // has already been reported by transformDirectory, so it is just skipped here
      val children = Option(inputDirectory.listFiles()).getOrElse(Array.empty[File])
      for (inputSubDirectory <- children if inputSubDirectory.isDirectory) {
        transformDirectoryRecursive(inputSubDirectory,
          newSubdirectories + FileUtils.FILE_SEPARATOR + inputSubDirectory.getName,
          infoFileNameOption, log)
      }
    }

    /**
     * Command-line entry point. Parses the options, picks the single-file,
     * directory or recursive-directory mode from the input path and the `-R`
     * flag, and finally prints the logger's statistics. Exits with status 1
     * when the input path does not exist.
     *
     * @param args command-line arguments
     */
    def main(args: Array[String]): Unit = {
      val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))
      val help = parser.flag[Boolean](List("h", "help"), "print help")
      val input = parser.option[String](List("i", "input"), "FILE_OR_DIR", "Input inputFile or directory to tokenize")
      val infoFileNameOption = parser.option[String](List("o", "output"), "FILE_OR_DIR", "Output location for trace files. If none is" +
        " specified, the input inputFile's directory will be used.")
      val recursive = parser.flag[Boolean](List("R", "recursive"), "If the input parameter is a directory, recursively tokenize" +
        " all xml files in or below that directory.")
      val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridicuous quantities of output.")
      // NOTE(review): --skip is accepted so existing invocations keep working, but
      // nothing in this object applies the regex yet.
      val skipRegex = parser.option[String](List("skip"), "REGEX", "Skip files whose absolute path matches this regex.")

      try {
        parser.parse(args)

        if (help.value.isDefined) {
          parser.usage()
        }

        if (debug.value.isDefined) {
          log = new SimpleLogger(
            this.getClass.toString,
            SimpleLogger.DEBUG,
            new BufferedWriter(new OutputStreamWriter(System.err)))
        }

        for (inputPath <- input.value) {
          val inputFile = new File(inputPath).getAbsoluteFile
          if (!inputFile.exists()) {
            log.err("input file does not exist.")
            System.exit(1)
          }

          if (inputFile.isDirectory && recursive.value.isDefined) {
            log.debug("Main: doing recursive option\n")
            // recursively descend and transform all files,
            // rebuilding the input tree below the output location
            transformDirectoryRecursive(inputFile, "", infoFileNameOption.value, log)
          } else if (inputFile.isDirectory) {
            log.debug("Main: doing directory option\n")
            // transform only the files directly inside inputFile
            transformDirectory(inputFile, "", infoFileNameOption.value, log)
          } else {
            log.debug("Main: doing single file option\n")
            // transform just inputFile
            val infoFileNameStripped = FileUtils.getStrippedOutputFileName(
              infoFileNameOption.value.getOrElse(inputFile.getParent), "",
              stripXmlExtension(inputFile.getName))
            transformFile(inputFile, infoFileNameStripped, log)
          }
        }

        log.summary("Warnings,Errors: %s\n".format(log.getStats()))
      } catch {
        case e: ArgotUsageException =>
          println(e.message)
      }
    }
  }