/src/main/scala/updown/data/io/TweetFeatureReader.scala
Scala | 38 lines | 16 code | 8 blank | 14 comment | 1 complexity | 2fcd76a6d4326031bf02972e79ceea23 MD5 | raw file
- package updown.data.io
- import updown.data._
/**
 * Reads gold-labeled tweets from a pipe-delimited feature file.
 *
 * Each input line is expected to look like
 * `tweetid|userid|feature1,feature2,...|label`.
 */
object TweetFeatureReader {
  // Three pipe-free fields followed by the label. The label group is `(.*)`,
  // so it captures the remainder of the line, including any further '|'.
  val featureRowRE = """^([^|]*)\|([^|]*)\|([^|]*)\|(.*)$""".r

  /**
   * Parses every line of `inputFile` (decoded as UTF-8) with [[parseLine]].
   *
   * @param inputFile path of the feature file to read
   * @return one tweet per line, in file order
   * @throws scala.MatchError if any line does not match [[featureRowRE]]
   */
  def apply(inputFile: String): List[GoldLabeledTweet] = {
    val source = scala.io.Source.fromFile(inputFile, "utf-8")
    // getLines() is lazy, so materialize the lines before the file is closed;
    // the finally block releases the file handle even if reading fails.
    val lines =
      try source.getLines().toList
      finally source.close()
    lines.map(parseLine)
  }

  /**
   * Parses a single feature row into a tweet.
   *
   * The feature field is split on commas; each feature is trimmed, and
   * features that are empty or all whitespace are dropped. The raw label
   * text is handed to `SentimentLabel.figureItOut` for interpretation.
   *
   * @param line one `tweetid|userid|features|label` row
   * @return the parsed tweet
   * @throws scala.MatchError if `line` does not match [[featureRowRE]]
   */
  def parseLine(line: String): GoldLabeledTweet = {
    // Pattern-binding on the regex throws MatchError for malformed rows.
    val featureRowRE(tweetid, userid, featureString, label) = line
    val features = featureString.split(",").toList.map(_.trim).filter(_.length > 0)
    GoldLabeledTweet(tweetid, userid, features, SentimentLabel.figureItOut(label))
  }
}
- /*object RawTweetFeatureReader {
- val featureRowRE = """^([^|]*)\|([^|]*)\|([^|]*)\|(.*)$""".r
- def apply(inputFile: String): List[GoldLabeledTweet] = {
- val lines = scala.io.Source.fromFile(inputFile, "utf-8").getLines.toList
- for (line <- lines) yield {
- parseLine(line: String): GoldLabeledTweet = {
-
- }
- }
- }
- */