PageRenderTime 26ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/scala/updown/data/io/MDSDFormat.scala

https://bitbucket.org/speriosu/updown
Scala | 50 lines | 44 code | 6 blank | 0 comment | 1 complexity | d10606171cf9fb09d268e549d263a35c MD5 | raw file
  1. package updown.data.io
  2. import io.Source
  3. import updown.data.{SentimentLabel, GoldLabeledTweet, Tweet}
  4. import java.io.{FileWriter, OutputStreamWriter, File}
  /**
   * Reader/writer for a line-oriented bag-of-words format: every line is a
   * whitespace-separated list of `feature:count` entries followed by one final
   * `#label#:<label>` entry (this is exactly what `write` emits).
   */
  object MDSDFormat extends Format {
    // Charset name used for both reading and writing so the two stay symmetric.
    private val STRING_ENC = "UTF8"

    /**
     * Lazily parses `inputFile`, producing one instance per line.
     *
     * Each instance id is the file name followed by the zero-based line index;
     * the user id is always "?". Every `feature:count` entry is expanded into
     * `count` copies of `feature`. The label text after the ':' of the last
     * field is handed to `SentimentLabel.figureItOut`.
     *
     * The returned iterator reads from the file on demand, so the underlying
     * Source is not closed by this method.
     *
     * @param inputFile file to parse, decoded with STRING_ENC
     * @return an iterator over the parsed instances
     */
    def read(inputFile: File): Iterator[GoldLabeledTweet] = {
      // Read the requested file (not a file named after the encoding) and pair
      // every line with its index so that ids are unique per line.
      Source.fromFile(inputFile, STRING_ENC).getLines().zipWithIndex.map {
        case (line, lineNumber) =>
          // The label is the last field; reversing lets us peel it off the head.
          val (labelHash :: lineSplit) = line.split("\\s+").toList.reverse
          val words = lineSplit.flatMap { tokenNCount =>
            val (token, occurrences) = splitFeature(tokenNCount)
            List.fill(occurrences)(token)
          }
          val Array(_, label) = labelHash.split(":")
          GoldLabeledTweet(inputFile.getName + lineNumber, "?", words, SentimentLabel.figureItOut(label))
      }
    }

    /**
     * Splits one `feature:count` entry on its LAST ':' so that features which
     * themselves contain ':' (which `write` can emit) are parsed correctly.
     *
     * @throws IllegalArgumentException if there is no ':' or the count is not an integer
     */
    private def splitFeature(tokenNCount: String): (String, Int) = {
      val separator = tokenNCount.lastIndexOf(':')
      require(separator >= 0, "expected feature:count but found '%s'".format(tokenNCount))
      (tokenNCount.substring(0, separator), tokenNCount.substring(separator + 1).toInt)
    }

    /**
     * Counts how many times each word occurs in `wordList`.
     * Implemented as a fold so arbitrarily long lists cannot overflow the stack.
     */
    private def count(wordList: List[String]): Map[String, Int] =
      wordList.foldLeft(Map.empty[String, Int]) {
        (counts, word) => counts + ((word, counts.getOrElse(word, 0) + 1))
      }

    /** Renders the counts as space-separated `feature:count` entries. */
    private def stringify_counts(counts: Map[String, Int]) = {
      counts.map {
        case (s, i) => "%s:%d".format(s, i)
      }.mkString(" ")
    }

    /**
     * Writes one line per instance to `outputFile`: the feature counts followed
     * by ` #label#:` and the result of `SentimentLabel.toEnglishName` for the
     * instance's label. Instance ids and user ids are not written.
     *
     * @param outputFile destination file, encoded with STRING_ENC
     * @param instances  instances to serialise; consumed by this call
     */
    def write(outputFile: File, instances: Iterator[GoldLabeledTweet]): Unit = {
      // Explicit charset keeps the output readable by `read` regardless of the
      // platform default encoding.
      val out = new OutputStreamWriter(new java.io.FileOutputStream(outputFile), STRING_ENC)
      try {
        for (GoldLabeledTweet(_, _, features, label) <- instances) {
          out.write(
            stringify_counts(count(features))
              + " #label#:%s".format(SentimentLabel.toEnglishName(label))
              + "\n"
          )
        }
      } finally {
        // Always release the file handle, even if serialising an instance fails.
        out.close()
      }
    }
  }