// MaxLengthsProcessor.scala
//
// /src/main/scala/edu/umass/cs/iesl/entizer/MaxLengthsProcessor.scala
//
// https://github.com/kedarbellare/entizer · Scala · 63 lines · 47 code · 13 blank · 3 comment · 6 complexity · 10d7dbc73aadc8da9882677ba199b1fb MD5 · raw file

package edu.umass.cs.iesl.entizer

import com.mongodb.casbah.Imports._
import collection.mutable.{HashSet, HashMap}

/**
 * Collects, for every segment label, the largest segment span observed in the processed documents.
 *
 * The "bioLabels" list of each processed document is turned into segments by
 * `TextSegmentationHelper.getTextSegmentationFromBIO`; the span of a segment is `end - begin`.
 * The output is a mutable map from label to the largest span recorded for that label. The label
 * "O" is pre-seeded with `Short.MaxValue`, and no recorded value is ever smaller than 1.
 *
 * @param inputColl input collection handed to the processor (presumably the collection that the
 *                  job returned by `inputJob` is run against -- confirm in the parent trait)
 * @param useOracle when true the job carries no "isRecord" filter and every document is
 *                  processed; when false only documents whose "isRecord" field is true are used
 * @author kedar
 */
class MaxLengthsProcessor(val inputColl: MongoCollection,
                          val useOracle: Boolean = false) extends ParallelCollectionProcessor {
  /** Processor name; embeds the oracle flag so the two modes can be told apart. */
  def name = "maxLengthFinder[oracle=%s]".format(useOracle)

  /**
   * Job description for the scan: only "isRecord" and "bioLabels" are selected, and outside
   * oracle mode the query is restricted to documents with "isRecord" equal to true.
   */
  def inputJob = {
    val projection = MongoDBObject("isRecord" -> 1, "bioLabels" -> 1)
    if (useOracle) JobCenter.Job(select = projection)
    else JobCenter.Job(query = MongoDBObject("isRecord" -> true), select = projection)
  }

  /** Creates a fresh label -> max-span map in which "O" already holds `Short.MaxValue`. */
  override def newOutputParams(isMaster: Boolean = false) =
    HashMap[String, Int]("O" -> Short.MaxValue.toInt)

  /**
   * Folds a partial result into the accumulated one, keeping the larger span per label.
   *
   * @param outputParams        accumulated map (mutated in place); must be a `HashMap[String, Int]`
   * @param partialOutputParams partial map to fold in; must be a `HashMap[String, Int]`
   */
  override def merge(outputParams: Any, partialOutputParams: Any): Unit = {
    val accumulated = outputParams.asInstanceOf[HashMap[String, Int]]
    val partial = partialOutputParams.asInstanceOf[HashMap[String, Int]]
    partial.foreach { case (label, span) =>
      // Labels missing from the accumulated map start from a floor of 1.
      accumulated(label) = math.max(span, accumulated.getOrElse(label, 1))
    }
  }

  /**
   * Updates the partial label -> max-span map with the segments of a single document.
   *
   * @param dbo                 document to inspect; reads "bioLabels" (and "isRecord" outside
   *                            oracle mode)
   * @param inputParams         not used by this processor
   * @param partialOutputParams partial map (mutated in place); must be a `HashMap[String, Int]`
   */
  def process(dbo: DBObject, inputParams: Any, partialOutputParams: Any): Unit = {
    // In oracle mode the "isRecord" field is never read.
    val shouldProcess = useOracle || dbo.as[Boolean]("isRecord")
    if (shouldProcess) {
      val spansByLabel = partialOutputParams.asInstanceOf[HashMap[String, Int]]
      val bioLabels = MongoHelper.getListAttr[String](dbo, "bioLabels").toArray
      TextSegmentationHelper.getTextSegmentationFromBIO(bioLabels).foreach { seg =>
        val span = seg.end - seg.begin
        spansByLabel(seg.label) = math.max(span, spansByLabel.getOrElse(seg.label, 1))
      }
    }
  }
}


/**
 * Gathers the distinct values of the "cluster" field across the processed documents.
 *
 * The output is a mutable set of strings; documents for which `getAs[String]("cluster")`
 * yields nothing contribute no entry.
 *
 * @param inputColl input collection handed to the processor (presumably the collection that the
 *                  job returned by `inputJob` is run against -- confirm in the parent trait)
 */
class UniqueClusterProcessor(val inputColl: MongoCollection) extends ParallelCollectionProcessor {
  /** Processor name. */
  def name = "uniqueClusters"

  /** Job description for the scan: only the "cluster" field is selected. */
  def inputJob = JobCenter.Job(select = MongoDBObject("cluster" -> 1))

  /** Creates a fresh, empty set of cluster identifiers. */
  override def newOutputParams(isMaster: Boolean = false) = HashSet.empty[String]

  /**
   * Adds every identifier of the partial set to the accumulated set.
   *
   * @param outputParams        accumulated set (mutated in place); must be a `HashSet[String]`
   * @param partialOutputParams partial set to add; must be a `HashSet[String]`
   */
  override def merge(outputParams: Any, partialOutputParams: Any): Unit = {
    val accumulated = outputParams.asInstanceOf[HashSet[String]]
    val partial = partialOutputParams.asInstanceOf[HashSet[String]]
    accumulated ++= partial
  }

  /**
   * Records the "cluster" value of a single document, if it has one.
   *
   * @param dbo                 document to inspect; reads "cluster"
   * @param inputParams         not used by this processor
   * @param partialOutputParams partial set (mutated in place); must be a `HashSet[String]`
   */
  def process(dbo: DBObject, inputParams: Any, partialOutputParams: Any): Unit = {
    // The cast is deliberately kept inside the callback so it only runs when a value is present.
    dbo.getAs[String]("cluster").foreach { clusterId =>
      partialOutputParams.asInstanceOf[HashSet[String]] += clusterId
    }
  }
}