PageRenderTime 55ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/indexer/src/main/scala/output/NameIndexer.scala

https://gitlab.com/18runt88/twofishes
Scala | 76 lines | 64 code | 12 blank | 0 comment | 10 complexity | 3dc4774cbd60ed36d632edb9cce25c95 MD5 | raw file
  1. package com.foursquare.twofishes.output
  2. import com.foursquare.twofishes.Indexes
  3. import com.foursquare.twofishes.mongo.NameIndexDAO
  4. import com.foursquare.twofishes.util.StoredFeatureId
  5. import com.mongodb.Bytes
  6. import com.mongodb.casbah.Imports._
  7. import com.novus.salat._
  8. import com.novus.salat.annotations._
  9. import com.novus.salat.dao._
  10. import com.novus.salat.global._
  11. import java.io._
  12. import org.apache.hadoop.hbase.util.Bytes._
  13. import scala.collection.mutable.HashSet
  14. import scalaj.collection.Implicits._
  /**
   * Writes the name index from the records in `NameIndexDAO`.
   *
   * Records are read sorted by `name`; every run of records sharing the same
   * non-empty name becomes one index entry that maps the name to the canonical
   * fids of the features carrying it. Records with an empty name are skipped.
   *
   * When `outputPrefixIndex` is set, every prefix of each written name (up to
   * `PrefixIndexer.MaxPrefixLength` characters) is collected and handed to a
   * `PrefixIndexer`, which is run after the name index has been closed.
   *
   * @param basepath          passed through to `Indexer` and to the `PrefixIndexer`
   * @param fidMap            passed through to `Indexer` and to the `PrefixIndexer`
   * @param outputPrefixIndex whether the prefix index is produced as well
   */
  class NameIndexer(
    override val basepath: String,
    override val fidMap: FidMap,
    outputPrefixIndex: Boolean
  ) extends Indexer {
    val index = Indexes.NameIndex

    // The prefix index is only declared as an output when it will actually be written.
    override val outputs = Seq(index) ++
      (if (outputPrefixIndex) { Seq(PrefixIndexer.index) } else { Seq.empty })

    /**
     * Streams all name records in ascending name order and appends one entry per
     * distinct non-empty name to the name index, then optionally builds the
     * prefix index from the collected prefixes.
     */
    def writeIndexImpl(): Unit = {
      var nameCount = 0
      // Total record count, used only for progress logging. It includes records
      // with an empty name, which the loop below filters out.
      val nameSize = NameIndexDAO.collection.count()
      val nameCursor = NameIndexDAO.find(MongoDBObject())
        .sort(orderBy = MongoDBObject("name" -> 1)) // ascending by name, so equal names are adjacent
      // The scan can be long-running; ask the server not to time the cursor out.
      nameCursor.option = Bytes.QUERYOPTION_NOTIMEOUT

      val prefixSet = new HashSet[String]

      // State of the group currently being accumulated.
      var lastName = ""
      val nameFids = new HashSet[StoredFeatureId]

      val writer = buildHFileV1Writer(index)

      // Emits the accumulated group and, if requested, records its prefixes.
      def writeFidsForLastName(): Unit = {
        writer.append(lastName, fidsToCanonicalFids(nameFids.toList))
        if (outputPrefixIndex) {
          for {
            length <- 1 to math.min(PrefixIndexer.MaxPrefixLength, lastName.size)
          } {
            prefixSet.add(lastName.substring(0, length))
          }
        }
      }

      nameCursor.filterNot(_.name.isEmpty).foreach(n => {
        if (lastName != n.name) {
          // A new name starts: flush the previous group (there is none before the first record).
          if (lastName != "") {
            writeFidsForLastName()
          }
          nameFids.clear()
          lastName = n.name
        }

        nameFids.add(n.fidAsFeatureId)

        nameCount += 1
        if (nameCount % 100000 == 0) {
          logger.info("processed %d of %d names".format(nameCount, nameSize))
        }
      })

      // Flush the final group. Skipped when no non-empty name was seen at all,
      // so that an entry with an empty key and no fids is never written.
      if (lastName != "") {
        writeFidsForLastName()
      }
      writer.close()

      if (outputPrefixIndex) {
        val prefixIndexer = new PrefixIndexer(basepath, fidMap, prefixSet)
        prefixIndexer.writeIndex()
      }
    }
  }