PageRenderTime 50ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/scala/updown/preproc/GenericPreprocessor.scala

https://bitbucket.org/MrChrisJohnson/polify
Scala | 211 lines | 182 code | 20 blank | 9 comment | 20 complexity | 9c87688aa3905aca679e625ec69b899d MD5 | raw file
  1. package updown.preproc
  2. import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
  3. import ArgotConverters._
  4. import updown.data.SentimentLabel
  5. import updown.util.TokenizationPipes
  6. import com.weiglewilczek.slf4s.Logging
  7. import collection.immutable.List._
  8. import java.io.{File, FileOutputStream, OutputStreamWriter}
  /**
   * Skeleton of a command-line preprocessor.
   *
   * It parses the command line, obtains labelled instances either from the files
   * named with `--input` (through the subclass-provided [[getInstanceIterator]])
   * or from stdin, runs every instance text through a configurable chain of
   * tokenization stages, and writes the result as `id|reviewer|features|labels`
   * lines (plus `id|targets` lines for instances that carry per-target labels).
   *
   * Subclasses only have to implement [[getInstanceIterator]].
   */
  abstract class GenericPreprocessor extends Logging {
    // this is here to make ArgotConverters appear used to IDEA.
    convertString _

    /**
     * Pipe stages that can be named in `--textPipeline`, keyed by stage name.
     * `main` adds a `removeStopwords` stage once the stop list is known, which is
     * why this is a `var` and why that stage is absent from the option help text.
     */
    var pipeStages: Map[String, (List[String]) => List[String]] =
      Map[String, (List[String]) => List[String]](
        ("lowerCase" -> TokenizationPipes.toLowercase),
        ("addBigrams" -> TokenizationPipes.addNGrams(2)),
        ("basicTokenize" -> TokenizationPipes.basicTokenize),
        ("twokenize" -> TokenizationPipes.twokenize),
        ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
        ("filterAlpha" -> TokenizationPipes.filterOnRegex("\\p{Alpha}+")),
        ("filterAlphaQuote" -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+")),
        ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
      )

    /** Pipeline used when `--textPipeline` is not given. */
    val defaultPipeline = "twokenize|removeStopwords"

    val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
    val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
    val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
    val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
    // The example in the help text must use a stage name exactly as it is keyed in pipeStages.
    val textPipeline = parser.option[String](List("textPipeline"), "PIPELINE",
      ("specify the desired pipe stages separated by |: \"addBigrams|twokenize\". " +
        "Available options are in %s.").format(pipeStages.keySet))
    val targetFile = parser.option[String](List("t", "target"), "target", "target file")
    val featureFile = parser.option[String](List("f", "feature"), "feature", "feature file")

    /**
     * Produces the instances contained in one input file.
     *
     * @param fileName path of the file, as given on the command line
     * @param polarity polarity given for that file on the command line, or the
     *                 English name of `SentimentLabel.Unknown` when none was given
     * @return tuples of (id, reviewer, label(s), text); the third element is a
     *         `Left` with a single label or a `Right` with a target -> label map
     */
    def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]

    /**
     * Builds the iterator over all input instances.
     *
     * @param inputOption `Some` comma-separated list of `file` or `file->polarity`
     *                    entries, each handed to [[getInstanceIterator]]; `None`
     *                    to read `id|reviewer|polarity|text` lines from stdin
     * @return the instances of all inputs, concatenated. A malformed stdin line
     *         is logged and yields an empty instance labelled
     *         `SentimentLabel.Abstained`.
     */
    def getInputIterator(inputOption: Option[String]): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
      logger.debug("entering getInputIterator")
      inputOption match {
        case Some(fileNameList) =>
          // Going through a Map means a file listed twice is only read once.
          val polarityByFile = fileNameList.split("\\s*,\\s*").map((pair) => {
            val plist = pair.split("\\s*->\\s*")
            if (plist.length > 1) {
              (plist(0) -> plist(1))
            } else {
              logger.debug("the polarity for %s is not included on the command line, expecting it in the text.".format(plist(0)))
              (plist(0) -> SentimentLabel.toEnglishName(SentimentLabel.Unknown))
            }
          }).toMap
          (for ((name, polarity) <- polarityByFile) yield {
            getInstanceIterator(name, polarity)
          }).iterator.flatten
        case None =>
          (for (line <- scala.io.Source.stdin.getLines()) yield {
            // String.split takes a regex, so the pipe has to be escaped: an unescaped
            // "|" matches the empty string and splits the line into single characters.
            line.split("\\|") match {
              case Array(id, reviewer, polarityString, text) =>
                (id, reviewer, Left(SentimentLabel.figureItOut(polarityString)), text)
              case _ =>
                logger.error("Input must be of the form id|reviewer|polarity|text.")
                ("", "", Left(SentimentLabel.Abstained), "")
            }
          })
      }
    }

    /**
     * Applies the pipe stages to `text`, in order.
     *
     * @param text     raw instance text; the first stage receives `List(text)`
     * @param pipeLine stages to apply, each consuming the previous stage's output
     * @return the output of the last stage, or `List(text)` for an empty pipeline
     */
    def runThroughPipeLine(text: String, pipeLine: List[(List[String]) => List[String]]): List[String] =
      pipeLine.foldLeft(List(text))((tokens, pipeStage) => pipeStage(tokens))

    /** Writes one `id|reviewer|text|polarity` line to `writer`. */
    def writeInstance(id: String, reviewer: String, text: String, polarity: String, writer: OutputStreamWriter): Unit = {
      writer.write("%s|%s|%s|%s\n".format(id, reviewer, text, polarity))
    }

    /** Writes one `id|target` line to `writer`. */
    def writeTarget(id: String, target: String, writer: OutputStreamWriter): Unit = {
      writer.write("%s|%s\n".format(id, target))
    }

    /** Opens a UTF-8 writer on the named file, or on stdout when no file is named. */
    private def openWriter(fileName: Option[String]): OutputStreamWriter =
      new OutputStreamWriter(
        fileName match {
          case Some(name) => new FileOutputStream(new File(name))
          case None => System.out
        }, "UTF-8")

    /** Reads the stop list (one entry per line), or falls back to a tiny built-in list. */
    private def loadStopSet(fileName: Option[String]): Set[String] =
      fileName match {
        case Some(name) =>
          val source = scala.io.Source.fromFile(name)
          // toSet is strict, so the file can be closed as soon as it returns.
          try source.getLines().toSet finally source.close()
        case None => Set("a", "the", ".")
      }

    /**
     * Resolves a `|`-separated list of stage names against `pipeStages`.
     * An unknown name is reported through `parser.usage`, whose
     * ArgotUsageException is handled in `main`.
     */
    private def buildPipeline(spec: String): List[(List[String]) => List[String]] =
      spec.split("\\|").toList.map { pipeStage =>
        pipeStages.get(pipeStage) match {
          case Some(stage) => stage
          case None => parser.usage("invalid pipeStage: %s".format(pipeStage))
        }
      }

    /**
     * Entry point: parses `args`, preprocesses every input instance and logs
     * summary statistics. Prints the usage message and exits with status 1 when
     * the command line is invalid.
     *
     * @param args command-line arguments understood by `parser`
     */
    def main(args: Array[String]): Unit = {
      logger.debug(args.toList.toString)
      try {
        parser.parse(args)

        // SET UP INPUT AND PIPELINE
        logger.debug("Inputfile: %s".format(inputFile.value))
        val inputLines = getInputIterator(inputFile.value)

        val stopSet: Set[String] = loadStopSet(stopListFile.value)
        val tokpipe: (String, List[String] => List[String]) = ("removeStopwords", TokenizationPipes.filterOnStopset(stopSet))
        pipeStages = pipeStages + tokpipe

        logger.debug("Pipeline option: %s".format(textPipeline.value))
        val pipeline: List[(List[String]) => List[String]] =
          buildPipeline(textPipeline.value.getOrElse(defaultPipeline))
        logger.debug("Pipeline: %s".format(pipeline))

        // SET UP OUTPUT
        // Opened only after everything above has been validated, so a bad
        // pipeline or stop list neither truncates the output files nor (when the
        // writers wrap stdout) closes stdout before the usage message is printed.
        // Note: if you want to squelch output entirely, you can initialize the writer with
        // new java.io.OutputStream() { public void write ( int b ) { } }
        val targetWriter = openWriter(targetFile.value)
        val featureWriter = openWriter(featureFile.value)

        try {
          // STATS
          val idNumStart = startId.value.getOrElse(0)
          var numLines = 0
          val numClasses = scala.collection.mutable.Map[SentimentLabel.Type, Int]().withDefaultValue(0)
          var numLabels = 0

          // RUN
          for ((id, reviewer, polarityChoice, text) <- inputLines) {
            val outputID = if (id == "") (idNumStart + numLines).toString else id
            // "," and "|" are field separators in the output, so escape them inside tokens.
            val outputText = runThroughPipeLine(text, pipeline).map((s) => s.replaceAll(",", "-COMMA-").replaceAll("\\|", "-PIPE-")).mkString(",")
            polarityChoice match {
              case Left(polarity) =>
                // no targets
                if (polarity != SentimentLabel.Abstained) {
                  writeInstance(outputID, reviewer, outputText, polarity.toString, featureWriter)
                  numLines += 1
                  numClasses(polarity) += 1
                  numLabels += 1
                } else {
                  numClasses(SentimentLabel.Abstained) += 1
                }
              case Right(polarityMap) =>
                // map of target -> polarity; keys and values iterate in the same order
                val labelList = polarityMap.values.toList
                val targetList = polarityMap.keys.toList
                if (labelList.exists((label) => label != SentimentLabel.Abstained)) {
                  writeInstance(outputID, reviewer, outputText, labelList.mkString(","), featureWriter)
                  writeTarget(outputID, targetList.mkString(","), targetWriter)
                  numLines += 1
                  numLabels += polarityMap.size
                  for ((_, label) <- polarityMap) {
                    numClasses(label) += 1
                  }
                } else {
                  numClasses(SentimentLabel.Abstained) += 1
                }
            }
          }

          featureWriter.flush()
          targetWriter.flush()

          logger.info("Stats:\n" +
            "Preprocessed " + numLines + " tweets. " +
            "Assigned %d labels.\n".format(numLabels) +
            (for ((label, count) <- numClasses if label != SentimentLabel.Abstained) yield
              "%20s: %10d instances (%2.2f%%)"
                .format(
                  SentimentLabel.toEnglishName(label),
                  count,
                  count.toFloat / numLabels * 100)).mkString("\n") +
            "\n\n%20s: %10d instances"
              .format(
                "skipped",
                numClasses(SentimentLabel.Abstained))
          )
        } finally {
          // These may close stdout, so make sure they are last! Both writers are
          // flushed before either is closed because they may share stdout.
          try {
            featureWriter.flush()
            targetWriter.flush()
          } finally {
            try featureWriter.close() finally targetWriter.close()
          }
        }
      }
      catch {
        case e: ArgotUsageException =>
          println(e.message)
          System.exit(1)
      }
    }
  }