/src/main/scala/edu/umass/cs/iesl/entizer/MaxLengthsProcessor.scala

https://github.com/kedarbellare/entizer · Scala · 63 lines · 47 code · 13 blank · 3 comment · 6 complexity · 10d7dbc73aadc8da9882677ba199b1fb MD5 · raw file

  1. package edu.umass.cs.iesl.entizer
  2. import com.mongodb.casbah.Imports._
  3. import collection.mutable.{HashSet, HashMap}
  /**
   * Finds, for every segment label, the largest segment span (`end - begin`) seen in the
   * input collection.
   *
   * Each processed document contributes the segments decoded from its "bioLabels" attribute.
   * Per-worker results are combined in `merge` by keeping the larger value for each label.
   *
   * @param inputColl collection whose documents are scanned
   * @param useOracle when true the job has no query filter and every document is processed;
   *                  when false only documents with "isRecord" == true are considered
   * @author kedar
   */
  class MaxLengthsProcessor(val inputColl: MongoCollection,
                            val useOracle: Boolean = false) extends ParallelCollectionProcessor {
    /** Processor name; embeds the oracle flag. */
    def name: String = "maxLengthFinder[oracle=" + useOracle + "]"

    /**
     * Describes which documents and fields to read. Both modes project "isRecord" and
     * "bioLabels"; only the non-oracle mode restricts the query to "isRecord" == true.
     */
    def inputJob = {
      val fields = MongoDBObject("isRecord" -> 1, "bioLabels" -> 1)
      if (useOracle) JobCenter.Job(select = fields)
      else JobCenter.Job(query = MongoDBObject("isRecord" -> true), select = fields)
    }

    /**
     * Creates a fresh label -> max length map.
     *
     * The "O" label is pre-seeded with `Short.MaxValue`, so an observed "O" segment only
     * changes the entry if its span exceeds that value.
     *
     * @param isMaster not used by this implementation
     */
    override def newOutputParams(isMaster: Boolean = false): HashMap[String, Int] = {
      val lblToMaxLength = new HashMap[String, Int]
      lblToMaxLength("O") = Short.MaxValue.toInt
      lblToMaxLength
    }

    /**
     * Folds a partial label -> max length map into the output map, keeping the larger value
     * per label. Labels missing from the output map start from a floor of 1.
     *
     * @param outputParams        destination map; must be a `HashMap[String, Int]`
     * @param partialOutputParams map to merge in; must be a `HashMap[String, Int]`
     */
    override def merge(outputParams: Any, partialOutputParams: Any): Unit = {
      val outputMaxLengths = outputParams.asInstanceOf[HashMap[String, Int]]
      for ((lbl, maxlen) <- partialOutputParams.asInstanceOf[HashMap[String, Int]]) {
        outputMaxLengths(lbl) = math.max(maxlen, outputMaxLengths.getOrElse(lbl, 1))
      }
    }

    /**
     * Updates the partial map with the segment spans of one document.
     *
     * In non-oracle mode the document is skipped unless its "isRecord" field is true.
     *
     * @param dbo                 document to inspect
     * @param inputParams         not used by this implementation
     * @param partialOutputParams partial result to update; must be a `HashMap[String, Int]`
     */
    def process(dbo: DBObject, inputParams: Any, partialOutputParams: Any): Unit = {
      // The oracle flag is checked first, so "isRecord" is only read in non-oracle mode.
      if (useOracle || dbo.as[Boolean]("isRecord")) {
        val partialMaxLengths = partialOutputParams.asInstanceOf[HashMap[String, Int]]
        val labels = MongoHelper.getListAttr[String](dbo, "bioLabels").toArray
        for (segment <- TextSegmentationHelper.getTextSegmentationFromBIO(labels)) {
          val span = segment.end - segment.begin
          partialMaxLengths(segment.label) = math.max(span, partialMaxLengths.getOrElse(segment.label, 1))
        }
      }
    }
  }
  /**
   * Collects the distinct values of the "cluster" attribute found in the input collection.
   *
   * @param inputColl collection whose documents are scanned
   */
  class UniqueClusterProcessor(val inputColl: MongoCollection) extends ParallelCollectionProcessor {
    /** Processor name. */
    def name: String = "uniqueClusters"

    /** Reads every document, projecting only the "cluster" field. */
    def inputJob = JobCenter.Job(select = MongoDBObject("cluster" -> 1))

    /**
     * Creates an empty set of cluster identifiers.
     *
     * @param isMaster not used by this implementation
     */
    override def newOutputParams(isMaster: Boolean = false): HashSet[String] = new HashSet[String]

    /**
     * Adds every cluster identifier of the partial set to the output set.
     *
     * @param outputParams        destination set; must be a `HashSet[String]`
     * @param partialOutputParams set to merge in; must be a `HashSet[String]`
     */
    override def merge(outputParams: Any, partialOutputParams: Any): Unit = {
      outputParams.asInstanceOf[HashSet[String]] ++= partialOutputParams.asInstanceOf[HashSet[String]]
    }

    /**
     * Records the document's "cluster" value, if `getAs` yields one, in the partial set.
     *
     * @param dbo                 document to inspect
     * @param inputParams         not used by this implementation
     * @param partialOutputParams partial result to update; must be a `HashSet[String]`
     */
    def process(dbo: DBObject, inputParams: Any, partialOutputParams: Any): Unit = {
      // The cast stays inside the closure so it only happens when a cluster value exists.
      dbo.getAs[String]("cluster").foreach { cluster =>
        partialOutputParams.asInstanceOf[HashSet[String]] += cluster
      }
    }
  }
  50. }