PageRenderTime 55ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/indexer/src/main/scala/output/OutputHFile.scala

https://gitlab.com/18runt88/twofishes
Scala | 65 lines | 54 code | 8 blank | 3 comment | 2 complexity | 3f8991d3d10b1283942ba5bb11916fcd MD5 | raw file
  1. package com.foursquare.twofishes.output
  2. import com.foursquare.twofishes.{SlugEntry, SlugEntryMap}
  3. import com.foursquare.twofishes.mongo.MongoGeocodeDAO
  4. import com.foursquare.twofishes.util.DurationUtils
  5. import com.mongodb.Bytes
  6. import com.mongodb.casbah.Imports._
  7. import com.novus.salat._
  8. import com.novus.salat.annotations._
  9. import com.novus.salat.dao._
  10. import com.novus.salat.global._
  11. import java.io._
  12. import java.util.concurrent.CountDownLatch
  13. import org.apache.hadoop.hbase.util.Bytes._
  14. import scala.collection.mutable.HashMap
  15. import scalaj.collection.Implicits._
  16. import com.twitter.util.{Future, FuturePool}
  17. import java.util.concurrent.Executors
  /**
   * Drives the construction of the output indexes written under `basepath`.
   *
   * @param basepath          location passed to every indexer constructed here
   * @param outputPrefixIndex forwarded to the `NameIndexer`
   * @param slugEntryMap      forwarded to the `IdIndexer`; defaults to an empty map
   * @param outputRevgeo      when true, a `RevGeoIndexer` is added to the parallel batch
   * @param outputS2Covering  when true, an `S2CoveringIndexer` is added to the parallel batch
   */
  class OutputIndexes(
    basepath: String,
    outputPrefixIndex: Boolean = true,
    slugEntryMap: SlugEntryMap.SlugEntryMap = HashMap.empty,
    outputRevgeo: Boolean = true,
    outputS2Covering: Boolean = true
  ) extends DurationUtils {
    /**
     * Writes all indexes and blocks until every one of them has finished.
     *
     * Order of work:
     *  1. preload the fid map;
     *  2. write the name index by itself (it is memory hungry);
     *  3. wait on `s2CoveringLatch`, if one was supplied;
     *  4. preload the polygon map from Mongo (records with `hasPoly == true`);
     *  5. run the remaining indexers on a 4-thread pool and wait for all of them.
     *
     * @param s2CoveringLatch optional latch that is awaited before the polygon
     *                        map is loaded and the parallel indexers start
     */
    def buildIndexes(s2CoveringLatch: Option[CountDownLatch]): Unit = {
      val fidMap = logPhase("preload fid map") { new FidMap(preload = true) }

      // This one wastes a lot of ram, so do it on its own
      (new NameIndexer(basepath, fidMap, outputPrefixIndex)).writeIndex()

      // this should really really be done by now
      s2CoveringLatch.foreach(_.await())

      val hasPolyCursor =
        MongoGeocodeDAO.find(MongoDBObject("hasPoly" -> true))
      // The cursor is drained in one long pass below, so disable the server-side idle timeout.
      hasPolyCursor.option = Bytes.QUERYOPTION_NOTIMEOUT

      // polyId -> list of (_id, woeType) for every record sharing that polygon.
      // A strict `map` is used instead of `mapValues(...).toMap`: on an immutable
      // Map, `mapValues` yields a lazy view and `toMap` returns that same view, so
      // the value function would be re-run on every lookup made by the indexers.
      val polygonMap = logPhase("preloading polygon map") {
        hasPolyCursor.map(r => (r.polyId, (r._id, r.woeType))).toList
          .groupBy(_._1)
          .map({ case (polyId, rows) => (polyId, rows.map(_._2)) })
      }

      val parallelizedIndexers = List(
        new IdIndexer(basepath, fidMap, slugEntryMap),
        new FeatureIndexer(basepath, fidMap, polygonMap),
        new PolygonIndexer(basepath, fidMap)
      ) ++ (if (outputRevgeo) {
        List(new RevGeoIndexer(basepath, fidMap, polygonMap))
      } else { Nil }) ++ (if (outputS2Covering) {
        List(new S2CoveringIndexer(basepath, fidMap))
      } else { Nil })

      val diskIoExecutor = Executors.newFixedThreadPool(4)
      try {
        val diskIoFuturePool = FuturePool(diskIoExecutor)
        val indexFutures = parallelizedIndexers.map(indexer =>
          diskIoFuturePool(indexer.writeIndex())
        )
        // wait forever to finish
        Future.collect(indexFutures).apply()
      } finally {
        // Release the pool's non-daemon worker threads; without this they stay
        // alive after the method returns. shutdown() (not shutdownNow()) lets
        // any task that was already submitted run to completion.
        diskIoExecutor.shutdown()
      }

      logger.info("all done with output")
    }
  }