MDSDFormat.scala | searchcode

/src/main/scala/updown/data/io/MDSDFormat.scala

https://bitbucket.org/speriosu/updown
Scala | 50 lines | 44 code | 6 blank | 0 comment | 1 complexity | d10606171cf9fb09d268e549d263a35c MD5 | raw file


package updown.data.io

import io.Source
import updown.data.{SentimentLabel, GoldLabeledTweet, Tweet}
import java.io.{FileWriter, OutputStreamWriter, File}

/**
 * Reader/writer for a line-oriented bag-of-words format.
 *
 * Each line holds whitespace-separated `token:count` fields followed by a
 * final `key:label` field; `write` emits that last field as
 * `#label#:<english name>`.
 *
 * NOTE(review): "MDSD" presumably refers to the Multi-Domain Sentiment
 * Dataset feature format -- confirm against the project documentation.
 */
object MDSDFormat extends Format {
  /** Charset name used for both reading and writing, so files round-trip. */
  private val STRING_ENC = "UTF8"

  /**
   * Lazily parses `inputFile`, yielding one tweet per non-blank line.
   *
   * The tweet id is the file name followed by the zero-based index of the
   * physical line (blank lines are skipped but still counted). The user id is
   * always "?". The underlying source is closed once the returned iterator
   * reports that it is exhausted; an iterator that is abandoned early, or
   * that fails while parsing, leaves the file open.
   *
   * @param inputFile file to read, decoded as UTF-8
   * @return an iterator over the parsed tweets, in file order
   * @throws IllegalArgumentException when a field has no ':' separator or a
   *                                  count is not an integer (raised while iterating)
   */
  def read(inputFile: File): Iterator[GoldLabeledTweet] = {
    val source = Source.fromFile(inputFile, STRING_ENC)
    val tweets = source.getLines().zipWithIndex
      .filter { case (line, _) => line.trim.nonEmpty }
      .map { case (line, lineNumber) => parseLine(inputFile.getName + lineNumber, line) }

    new Iterator[GoldLabeledTweet] {
      def hasNext: Boolean = {
        val more = tweets.hasNext
        // Release the file handle as soon as the last line has been consumed.
        if (!more) source.close()
        more
      }

      def next(): GoldLabeledTweet = tweets.next()
    }
  }

  /**
   * Parses a single non-blank line into a tweet with the given id.
   *
   * The last field carries the label (everything after its last ':'); every
   * other field is expanded into `count` copies of its token. Fields are
   * walked last-to-first, so the resulting word list is in reverse field order.
   */
  private def parseLine(id: String, line: String): GoldLabeledTweet = {
    val labelField :: countFields = line.trim.split("\\s+").toList.reverse
    val (_, label) = splitAtLastColon(labelField)
    val words = countFields.flatMap { field =>
      val (token, occurrences) = splitAtLastColon(field)
      List.fill(occurrences.toInt)(token)
    }
    GoldLabeledTweet(id, "?", words, SentimentLabel.figureItOut(label))
  }

  /**
   * Splits `field` around its last ':' so that keys which themselves contain
   * colons (emoticons, URLs) survive a write/read round trip.
   */
  private def splitAtLastColon(field: String): (String, String) = {
    val separator = field.lastIndexOf(':')
    require(separator >= 0, "malformed field (expected <key>:<value>): " + field)
    (field.substring(0, separator), field.substring(separator + 1))
  }

  /** Adds the number of occurrences of each word in the list to the given counts. */
  private val count: ((List[String], Map[String, Int]) => Map[String, Int]) =
    (wordList, map) =>
      wordList.foldLeft(map) { (counts, word) =>
        counts + ((word, counts.getOrElse(word, 0) + 1))
      }

  /** Renders the counts as space-separated `word:count` fields. */
  private def stringify_counts(counts: Map[String, Int]) = {
    counts.map {
      case (s, i) => "%s:%d".format(s, i)
    }.mkString(" ")
  }

  /**
   * Writes one line per instance to `outputFile`: the word counts of its
   * features followed by ` #label#:<english name of the label>`.
   *
   * The file is encoded as UTF-8 (matching `read`) and the writer is closed
   * even when writing fails part-way through.
   *
   * @param outputFile file to create or overwrite
   * @param instances  tweets to serialise; the iterator is consumed
   */
  def write(outputFile: File, instances: Iterator[GoldLabeledTweet]): Unit = {
    val out = new OutputStreamWriter(new java.io.FileOutputStream(outputFile), STRING_ENC)
    try {
      for (GoldLabeledTweet(_, _, features, label) <- instances) {
        out.write(
          stringify_counts(count(features, Map()))
            + " #label#:%s".format(SentimentLabel.toEnglishName(label))
            + "\n"
        )
      }
    } finally {
      out.close()
    }
  }
}