PageRenderTime 62ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/indexer/src/main/scala/output/PrefixIndexer.scala

https://gitlab.com/18runt88/twofishes
Scala | 148 lines | 118 code | 22 blank | 8 comment | 11 complexity | 93044f9812308658cc1b97213b385979 MD5 | raw file
  1. package com.foursquare.twofishes.output
  2. import com.foursquare.twofishes.{FeatureNameFlags, Indexes, YahooWoeType}
  3. import com.foursquare.twofishes.mongo.{NameIndex, NameIndexDAO}
  4. import com.foursquare.twofishes.util.StoredFeatureId
  5. import com.mongodb.Bytes
  6. import com.mongodb.casbah.Imports._
  7. import com.novus.salat._
  8. import com.novus.salat.annotations._
  9. import com.novus.salat.dao._
  10. import com.novus.salat.global._
  11. import java.io._
  12. import org.apache.hadoop.hbase.util.Bytes._
  13. import scala.collection.mutable.HashSet
  14. import scalaj.collection.Implicits._
  /** Tunable limits and the output index identifier shared by the prefix index build. */
  object PrefixIndexer {
    /** Written by the indexer as the "MAX_PREFIX_LENGTH" entry of the index writer's metadata. */
    val MaxPrefixLength: Int = 5

    /** Limit applied to each Mongo name query (and to the merged result) for one prefix. */
    val MaxNameRecordsToFetchFromMongo: Int = 1000

    /** Upper bound on the number of feature ids collected for a single prefix. */
    val MaxFidsToStorePerPrefix: Int = 50

    /**
     * Records outside the preferred-name bucket are only considered for a prefix
     * when fewer than this many feature ids came from the preferred-name bucket.
     */
    val MaxFidsWithPreferredNamesBeforeConsideringNonPreferred: Int = 3

    /** The index this indexer produces. */
    val index = Indexes.PrefixIndex
  }
  /**
   * Writes the prefix index: for every prefix in `prefixSet` it fetches matching
   * name records from Mongo, ranks them, and appends at most
   * `PrefixIndexer.MaxFidsToStorePerPrefix` feature ids for that prefix to the
   * writer obtained from `buildMapFileWriter`.
   *
   * @param basepath  overrides the `Indexer` member of the same name
   * @param fidMap    overrides the `Indexer` member of the same name
   * @param prefixSet the prefixes to generate index entries for
   */
  class PrefixIndexer(
    override val basepath: String,
    override val fidMap: FidMap,
    prefixSet: HashSet[String]
  ) extends Indexer {
    val index = PrefixIndexer.index
    override val outputs = Seq(index)

    /** True when the bit(s) of `flag` are set in `record.flags`. */
    def hasFlag(record: NameIndex, flag: FeatureNameFlags): Boolean =
      (record.flags & flag.getValue) > 0

    /**
     * Sorts each input list by descending `pop` and concatenates the results in
     * argument order. Lists are sorted independently, so every record of an
     * earlier list precedes every record of a later one.
     */
    def joinLists(lists: List[NameIndex]*): List[NameIndex] = {
      lists.toList.flatMap(l => l.sortBy(_.pop * -1))
    }

    /**
     * Interleaves records across country codes so that no single country
     * dominates the head of the list.
     *
     * Records are grouped by `cc`; the result takes the first record of every
     * country, then the second of every country, and so on. Countries are
     * visited in the order in which they first appear in `records`, and records
     * of one country keep their relative input order.
     *
     * input:  a (US), b (US), c (CN), d (US), e (AU), f (AU), g (CN)
     * output: a (US), c (CN), e (AU), b (US), g (CN), f (AU), d (US)
     */
    private def roundRobinByCountryCode(records: List[NameIndex]): List[NameIndex] = {
      // groupBy returns a Map whose iteration order is unspecified, so the
      // country order is taken from the input instead of from the Map.
      val countryOrder = records.map(_.cc).distinct
      val byCountry = records.groupBy(_.cc) // (US -> a, b, d), (CN -> c, g), (AU -> e, f)

      countryOrder
        .flatMap(cc => byCountry(cc).zipWithIndex) // (a, 0), (b, 1), (d, 2), (c, 0), (g, 1), (e, 0), (f, 1)
        .groupBy(_._2).toList                      // (0 -> a, c, e), (1 -> b, g, f), (2 -> d)
        .sortBy(_._1)                              // rounds in ascending order
        .flatMap(_._2.map(_._1))                   // a, c, e, b, g, f, d
    }

    /**
     * Splits `records` into two ranked lists.
     *
     * @return a pair whose first element holds the records flagged PREFERRED or
     *         ALT_NAME whose language is "en" or that are flagged LOCAL_LANG,
     *         sorted by descending `pop`; the second element holds all remaining
     *         records, with "en"/LOCAL_LANG records (descending `pop`) ahead of
     *         the rest (descending `pop`)
     */
    def sortRecordsByNames(records: List[NameIndex]): (List[NameIndex], List[NameIndex]) = {
      // Disabled filter kept from the original source:
      // val (pureNames, unpureNames) = records.partition(r => {
      //   !hasFlag(r, FeatureNameFlags.ALIAS)
      //   !hasFlag(r, FeatureNameFlags.DEACCENT)
      // })

      def isEnglishOrLocal(r: NameIndex): Boolean =
        r.lang == "en" || hasFlag(r, FeatureNameFlags.LOCAL_LANG)

      val (prefPureNames, nonPrefPureNames) =
        records.partition(r =>
          (hasFlag(r, FeatureNameFlags.PREFERRED) || hasFlag(r, FeatureNameFlags.ALT_NAME)) &&
            isEnglishOrLocal(r)
        )

      val (secondBestNames, worstNames) = nonPrefPureNames.partition(isEnglishOrLocal)

      (joinLists(prefPureNames), joinLists(secondBestNames, worstNames))
    }

    /**
     * Fetches name records for `prefix` from Mongo.
     *
     * Two queries are issued, both restricted to documents with
     * `excludeFromPrefixIndex == false`, sorted by descending `pop` and limited
     * to `limit`: one for names equal to `prefix`, one for names starting with
     * `prefix`. Exact matches come first, duplicates are dropped, and the merged
     * result is truncated to `limit`.
     *
     * @param prefix the literal prefix to look up
     * @param limit  maximum number of records per query and in the result
     */
    def getRecordsByPrefix(prefix: String, limit: Int) = {
      val nameCursor = NameIndexDAO.find(
        MongoDBObject(
          "name" -> prefix,
          "excludeFromPrefixIndex" -> false)
      ).sort(orderBy = MongoDBObject("pop" -> -1)).limit(limit)
      nameCursor.option = Bytes.QUERYOPTION_NOTIMEOUT

      // Quote the prefix so that regex metacharacters in a name ('.', '(', '*', ...)
      // are matched literally instead of being interpreted as pattern syntax.
      val prefixPattern = "^" + java.util.regex.Pattern.quote(prefix)
      val prefixCursor = NameIndexDAO.find(
        MongoDBObject(
          "name" -> MongoDBObject("$regex" -> prefixPattern),
          "excludeFromPrefixIndex" -> false)
      ).sort(orderBy = MongoDBObject("pop" -> -1)).limit(limit)
      prefixCursor.option = Bytes.QUERYOPTION_NOTIMEOUT

      (nameCursor ++ prefixCursor).toSeq.distinct.take(limit)
    }

    /** Adds the feature id of each record to `fids`, in order, until the per-prefix cap is reached. */
    private def addFidsUpToLimit(fids: HashSet[StoredFeatureId], records: List[NameIndex]): Unit = {
      records.foreach(r => {
        if (fids.size < PrefixIndexer.MaxFidsToStorePerPrefix) {
          fids.add(r.fidAsFeatureId)
        }
      })
    }

    /**
     * Builds and writes the whole prefix index.
     *
     * For each prefix (in `lexicalSort` order) only records whose woe type is in
     * a fixed list of preferred types are considered. Feature ids are taken
     * round-robin by country from the preferred-name bucket first; the remaining
     * records are used only when that bucket yielded fewer than
     * `PrefixIndexer.MaxFidsWithPreferredNamesBeforeConsideringNonPreferred` ids.
     * The writer is closed even if processing a prefix fails.
     */
    def writeIndexImpl(): Unit = {
      logger.info("sorting prefix set")
      val sortedPrefixes = prefixSet.toList.sortWith(lexicalSort)
      logger.info("done sorting")

      val bestWoeTypes = List(
        YahooWoeType.POSTAL_CODE,
        YahooWoeType.TOWN,
        YahooWoeType.SUBURB,
        YahooWoeType.ADMIN3,
        YahooWoeType.AIRPORT,
        YahooWoeType.COUNTRY
      ).map(_.getValue)

      val prefixWriter = buildMapFileWriter(index,
        Map(
          ("MAX_PREFIX_LENGTH", PrefixIndexer.MaxPrefixLength.toString)
        )
      )

      val numPrefixes = sortedPrefixes.size
      try {
        // The counter is named prefixNum so it does not shadow the `index` member.
        for ((prefix, prefixNum) <- sortedPrefixes.zipWithIndex) {
          if (prefixNum % 1000 == 0) {
            logger.info("done with %d of %d prefixes".format(prefixNum, numPrefixes))
          }

          val records = getRecordsByPrefix(prefix, PrefixIndexer.MaxNameRecordsToFetchFromMongo)
          // Records of any other woe type are not used for the prefix index.
          val woeMatches = records.filter(r => bestWoeTypes.contains(r.woeType))

          val (prefSortedRecords, unprefSortedRecords) =
            sortRecordsByNames(woeMatches.toList)

          val fids = new HashSet[StoredFeatureId]
          addFidsUpToLimit(fids, roundRobinByCountryCode(prefSortedRecords))

          if (fids.size < PrefixIndexer.MaxFidsWithPreferredNamesBeforeConsideringNonPreferred) {
            addFidsUpToLimit(fids, roundRobinByCountryCode(unprefSortedRecords))
          }

          // NOTE(review): fids is a hash set, so fids.toList does not preserve the
          // ranking order above; confirm fidsToCanonicalFids does not rely on order.
          prefixWriter.append(prefix, fidsToCanonicalFids(fids.toList))
        }
      } finally {
        prefixWriter.close()
      }
      logger.info("done")
    }
  }