PageRenderTime 81ms CodeModel.GetById 46ms RepoModel.GetById 0ms app.codeStats 0ms

/indexer/src/main/scala/importers/geonames/SlugIndexer.scala

https://gitlab.com/18runt88/twofishes
Scala | 184 lines | 150 code | 20 blank | 14 comment | 21 complexity | 4c8549bcea92174e5cddce975fd6eabf MD5 | raw file
  1. // Copyright 2012 Foursquare Labs Inc. All Rights Reserved.
  2. package com.foursquare.twofishes.importers.geonames
  3. import com.foursquare.twofishes._
  4. import com.foursquare.twofishes.mongo.{GeocodeStorageWriteService, MongoGeocodeDAO}
  5. import com.foursquare.twofishes.util.{Helpers, NameUtils, SlugBuilder, StoredFeatureId}
  6. import java.io.File
  7. import scala.collection.mutable.{HashMap, HashSet}
  8. import scalaj.collection.Implicits._
  9. // TODO
  10. // stop using string representations of "a:b" featureids everywhere, PLEASE
  /**
   * Builds and maintains the mapping between URL-style slugs and geocode features.
   *
   * On construction the existing slug files are loaded into memory (see `readSlugs`).
   * Callers then register features lacking a slug with `addMissingId`, generate slugs
   * for them with `buildMissingSlugs`, and push the results to the store with
   * `writeMissingSlugs`.
   *
   * Feature ids are handled in their human-readable string form throughout.
   */
  class SlugIndexer {
    /** Current (non-deprecated) slug for each feature, keyed by human-readable feature id. */
    val idToSlugMap = new HashMap[String, String]

    /** Every known slug mapped to its entry (owner id, score, deprecated/permanent flags). */
    val slugEntryMap = new SlugEntryMap.SlugEntryMap

    /** Human-readable ids of the features registered through `addMissingId`. */
    var missingSlugList = new HashSet[String]

    /** Returns the slug currently recorded for `id`, if any. */
    def getBestSlug(id: StoredFeatureId): Option[String] = {
      idToSlugMap.get(id.humanReadableString)
    }

    /** Registers `id` as a feature that still needs a slug. */
    def addMissingId(id: StoredFeatureId): Unit = {
      missingSlugList.add(id.humanReadableString)
    }

    // Load the existing slug files as part of construction. This must stay below the
    // declarations of idToSlugMap / slugEntryMap, which readSlugs() populates.
    Helpers.duration("readSlugs") { readSlugs() }

    /**
     * Loads previously assigned slugs from the computed and private slug files.
     *
     * Each non-comment line is tab-separated: slug, feature id, score, deprecated flag.
     * Every entry read here is marked permanent; only non-deprecated entries are
     * recorded in `idToSlugMap`. Files that do not exist are skipped.
     *
     * @throws IllegalArgumentException if a line has fewer than four fields
     * @throws NumberFormatException if the score is not an integer
     */
    def readSlugs(): Unit = {
      // step 1 -- load existing slugs into memory
      val files = List(
        new File("data/computed/slugs.txt"),
        new File("data/private/slugs.txt")
      )

      files.filter(_.exists).foreach { file =>
        val fileSource = scala.io.Source.fromFile(file)
        try {
          fileSource.getLines()
            // Skip comments and blank lines (e.g. a trailing empty line).
            .filterNot(line => line.startsWith("#") || line.trim.isEmpty)
            .foreach { line =>
              line.split("\t") match {
                case Array(slug, id, rawScore, rawDeprecated, _*) => {
                  val score = rawScore.toInt
                  val deprecated = rawDeprecated.toBoolean
                  slugEntryMap(slug) = SlugEntry(id, score, deprecated = deprecated, permanent = true)
                  if (!deprecated) {
                    idToSlugMap(id) = slug
                  }
                }
                case _ =>
                  throw new IllegalArgumentException(
                    "malformed slug line in %s: %s".format(file, line))
              }
            }
        } finally {
          // Always release the file handle, even when a line fails to parse.
          fileSource.close()
        }
      }

      println("read %d slugs".format(slugEntryMap.size))
    }

    // TODO: not in love with this talking directly to mongo, please fix
    import com.mongodb.casbah.Imports._
    import com.novus.salat._
    import com.novus.salat.annotations._
    import com.novus.salat.dao._
    import com.novus.salat.global._

    /** Cache of parent lookups, including misses, so each parent is fetched at most once. */
    val parentMap = new HashMap[StoredFeatureId, Option[GeocodeFeature]]

    /**
     * Looks up the serving feature for `fid` in mongo by its long id.
     * Prints a diagnostic and returns None when no record is found.
     */
    def findFeature(fid: StoredFeatureId): Option[GeocodeServingFeature] = {
      val ret = MongoGeocodeDAO.findOne(MongoDBObject("_id" -> fid.longId)).map(_.toGeocodeServingFeature)
      if (ret.isEmpty) {
        println("couldn't find %s".format(fid))
      }
      ret
    }

    /** Returns the feature for a parent id, consulting (and filling) `parentMap` first. */
    def findParent(fid: StoredFeatureId): Option[GeocodeFeature] = {
      parentMap.getOrElseUpdate(fid, findFeature(fid).map(_.feature))
    }

    /** Score used to decide which feature wins a contested slug: boost plus population. */
    def calculateSlugScore(f: GeocodeServingFeature): Int = {
      f.scoringFeatures.boost + f.scoringFeatures.population
    }

    /**
     * Claims the first slug in `possibleSlugs` that is either unused or held by a
     * non-permanent entry with a strictly lower score.
     *
     * When an existing holder is evicted, its id is rebuilt recursively via `buildSlug`
     * so it can pick up another slug.
     *
     * @param id             human-readable id of the feature wanting a slug
     * @param servingFeature the feature itself, used for scoring
     * @param possibleSlugs  candidates in order of preference
     * @return the slug that was claimed, or None if every candidate is taken
     */
    def matchSlugs(id: String, servingFeature: GeocodeServingFeature, possibleSlugs: List[String]): Option[String] = {
      val score = calculateSlugScore(servingFeature)

      // A slug can be taken when nobody holds it, or the holder is evictable and weaker.
      def isClaimable(slug: String): Boolean = slugEntryMap.get(slug) match {
        case Some(existing) => !existing.permanent && score > existing.score
        case None => true
      }

      val chosen = possibleSlugs.find(isClaimable)
      chosen.foreach { slug =>
        val evictedId = slugEntryMap.get(slug).map(_.id).filter(_ != id)

        // Drop the evicted feature's stale mapping *before* recursing, otherwise its
        // rebuild would treat the new owner's entry as its own old entry and deprecate it.
        evictedId.foreach { evicted =>
          if (idToSlugMap.get(evicted) == Some(slug)) {
            idToSlugMap.remove(evicted)
          }
        }

        slugEntryMap(slug) = SlugEntry(id, score, deprecated = false, permanent = false)
        idToSlugMap(id) = slug

        // Give the evicted feature a chance to find a different slug.
        evictedId.foreach(evicted => buildSlug(evicted))
      }
      chosen
    }

    /**
     * Decides whether a feature is significant enough to deserve a slug: it has
     * population or boost, carries geometry, is an admin woe type, or is flagged as
     * an adm1 or adm0 capital.
     */
    private def isSlugWorthy(servingFeature: GeocodeServingFeature): Boolean = {
      servingFeature.scoringFeatures.population > 0 ||
      servingFeature.scoringFeatures.boost > 0 ||
      servingFeature.feature.geometry.wkbGeometryOption.nonEmpty ||
      servingFeature.feature.woeTypeOption.exists(YahooWoeTypes.isAdminWoeType) ||
      servingFeature.feature.attributesOption.exists(_.adm1capOption.exists(a => a)) ||
      servingFeature.feature.attributesOption.exists(_.adm0capOption.exists(a => a))
    }

    /**
     * Generates (or regenerates) the slug for the feature with the given id.
     *
     * Nothing is assigned when the id cannot be parsed, the feature cannot be found,
     * or the feature is not slug-worthy. If the resulting slug differs from the one the
     * feature had before, the previous entry is marked deprecated.
     *
     * @param id human-readable feature id
     */
    def buildSlug(id: String): Unit = {
      val oldSlug = idToSlugMap.get(id)
      val oldEntry = oldSlug.flatMap(slug => slugEntryMap.get(slug))
      var newSlug: Option[String] = None

      for {
        fid <- StoredFeatureId.fromHumanReadableString(id)
        servingFeature <- findFeature(fid)
        if isSlugWorthy(servingFeature)
      } {
        val parents = servingFeature.scoringFeatures.parentIds
          .flatMap(StoredFeatureId.fromLong _)
          .flatMap(findParent _).toList
        val baseSlugs = SlugBuilder.makePossibleSlugs(servingFeature.feature, parents)

        // if a city is bigger than 2 million people, we'll attempt to use the bare city name as the slug
        // unless it's the US, where I'd rather have consistency of always doing city-state
        val possibleSlugs =
          if (servingFeature.scoringFeatures.population > 2000000 && servingFeature.feature.cc != "US") {
            NameUtils.bestName(servingFeature.feature, Some("en"), false).toList.map(n => SlugBuilder.normalize(n.name)) ++ baseSlugs
          } else {
            baseSlugs
          }

        newSlug = matchSlugs(id, servingFeature, possibleSlugs)

        if (newSlug.isEmpty && possibleSlugs.nonEmpty) {
          // Every plain candidate is taken: retry with "-1", "-2", ... appended until one
          // is free. This terminates because slugEntryMap is finite.
          var extraDigit = 1
          while (newSlug.isEmpty) {
            newSlug = matchSlugs(id, servingFeature, possibleSlugs.map(s => "%s-%d".format(s, extraDigit)))
            extraDigit += 1
          }
        }
      }

      if (newSlug != oldSlug) {
        println("deprecating old slug for %s %s -> %s".format(id, oldSlug, newSlug.getOrElse("newslug")))
        oldEntry.foreach(entry => entry.deprecated = true)
      }
    }

    /**
     * Builds a slug for every id in `missingSlugList`, then rewrites the computed slug
     * file with the full contents of `slugEntryMap`, sorted by slug.
     */
    def buildMissingSlugs(): Unit = {
      val total = missingSlugList.size
      println("building missing slugs for %d features".format(total))

      // step 2 -- compute slugs for records without
      missingSlugList.toList.zipWithIndex.foreach { case (id, index) =>
        if (index % 10000 == 0) {
          println("built %d of %d slugs".format(index, total))
        }
        buildSlug(id)
      }

      // step 3 -- write new slug file
      println("writing new slug map for %d features".format(slugEntryMap.size))
      val writer = new java.io.PrintWriter(new File("data/computed/slugs.txt"))
      try {
        slugEntryMap.keys.toList.sorted.foreach(slug =>
          writer.println("%s\t%s".format(slug, slugEntryMap(slug)))
        )
      } finally {
        // Close (and flush) the file even if formatting an entry throws.
        writer.close()
      }
    }

    /**
     * Pushes the slug of every feature in `missingSlugList` to `store`.
     * Features without a slug, or whose id cannot be parsed, are skipped.
     *
     * @param store write service that receives each (feature id, slug) pair
     */
    def writeMissingSlugs(store: GeocodeStorageWriteService): Unit = {
      val total = missingSlugList.size
      missingSlugList.toList.zipWithIndex.foreach { case (id, index) =>
        // Report progress on every 10000th feature, whether or not it has a slug.
        if (index % 10000 == 0) {
          println("flushed %d of %d slug to mongo".format(index, total))
        }
        for {
          slug <- idToSlugMap.get(id)
          fid <- StoredFeatureId.fromHumanReadableString(id)
        } {
          store.addSlugToRecord(fid, slug)
        }
      }
    }
  }