updown /src/main/scala/updown/data/io/MDSDFormat.scala

Language Scala Lines 51
MD5 Hash d10606171cf9fb09d268e549d263a35c
Repository https://bitbucket.org/speriosu/updown View Raw File View Project SPDX
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
package updown.data.io

import io.Source
import updown.data.{SentimentLabel, GoldLabeledTweet, Tweet}
import java.io.{FileWriter, OutputStreamWriter, File}

/**
 * Reader/writer for the MDSD bag-of-words line format.
 *
 * Every line consists of whitespace-separated `token:count` fields followed by
 * one final `key:label` field (written as `#label#:<name>`), for example:
 * {{{
 * good:2 movie:1 #label#:positive
 * }}}
 * Files are read and written using the same character encoding (UTF-8).
 */
object MDSDFormat extends Format {
  private val STRING_ENC = "UTF8"

  /**
   * Lazily reads gold-labeled tweets from `inputFile`, one per non-blank line.
   *
   * Tweet ids are the file name followed by the zero-based line index. The
   * underlying file is closed once the returned iterator has been exhausted.
   *
   * @param inputFile the MDSD file to read
   * @return an iterator over the parsed tweets
   * @throws IllegalArgumentException if a field has no `:` separator
   * @throws NumberFormatException if a count is not an integer
   */
  def read(inputFile: File): Iterator[GoldLabeledTweet] = {
    val source = Source.fromFile(inputFile, STRING_ENC)
    val tweets = source.getLines().zipWithIndex
      // Index first, filter second, so ids keep referring to physical lines.
      .filter { case (line, _) => line.trim.length > 0 }
      .map { case (line, lineNumber) => parseLine(inputFile.getName + lineNumber, line) }

    new Iterator[GoldLabeledTweet] {
      // Guards against touching the reader again after it has been closed.
      private var open = true

      def hasNext: Boolean = {
        if (open && !tweets.hasNext) {
          source.close()
          open = false
        }
        open
      }

      def next(): GoldLabeledTweet =
        if (hasNext) tweets.next()
        else throw new NoSuchElementException("next on exhausted MDSD reader")
    }
  }

  /** Parses one non-blank line into a tweet carrying the given id. */
  private def parseLine(id: String, line: String): GoldLabeledTweet =
    // The label is the last field on the line; reversing puts it at the head.
    line.trim.split("\\s+").toList.reverse match {
      case labelField :: countFields =>
        val words = countFields.flatMap { field =>
          val (token, count) = splitField(field, line)
          // Expand "token:n" back into n copies of the token.
          List.fill(count.toInt)(token)
        }
        val (_, label) = splitField(labelField, line)
        GoldLabeledTweet(id, "?", words, SentimentLabel.figureItOut(label))
      case Nil =>
        throw new IllegalArgumentException("Empty MDSD line: %s".format(line))
    }

  /**
   * Splits a `key:value` field on its LAST colon, so that keys which contain
   * colons themselves (URLs, emoticons) survive a write/read round trip.
   */
  private def splitField(field: String, line: String): (String, String) = {
    val sep = field.lastIndexOf(':')
    if (sep < 0)
      throw new IllegalArgumentException(
        "Malformed MDSD field '%s' in line: %s".format(field, line))
    (field.substring(0, sep), field.substring(sep + 1))
  }

  /** Counts the occurrences of each word in `words`. */
  private def countWords(words: List[String]): Map[String, Int] =
    words.foldLeft(Map.empty[String, Int]) { (counts, word) =>
      counts + ((word, counts.getOrElse(word, 0) + 1))
    }

  /** Renders word counts as space-separated `word:count` fields. */
  private def stringifyCounts(counts: Map[String, Int]): String =
    counts.map {
      case (word, n) => "%s:%d".format(word, n)
    }.mkString(" ")

  /**
   * Writes every instance as one MDSD line to `outputFile`, replacing any
   * existing content. The writer is closed even if an instance fails.
   *
   * @param outputFile the file to (over)write
   * @param instances  the tweets to serialize; consumed by this call
   */
  def write(outputFile: File, instances: Iterator[GoldLabeledTweet]): Unit = {
    // Encode explicitly so the output matches what `read` expects, instead of
    // depending on the platform default charset.
    val out = new OutputStreamWriter(new java.io.FileOutputStream(outputFile), STRING_ENC)
    try {
      for (GoldLabeledTweet(_, _, features, label) <- instances) {
        out.write(
          stringifyCounts(countWords(features))
            + " #label#:%s".format(SentimentLabel.toEnglishName(label))
            + "\n"
        )
      }
    } finally {
      out.close()
    }
  }
}
Back to Top