
/geomesa-compute/src/main/scala/org/locationtech/geomesa/compute/spark/sql/GeoMesaSparkSql.scala

https://gitlab.com/zachcoyle/geomesa
/***********************************************************************
 * Copyright (c) 2013-2016 Commonwealth Computer Research, Inc.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Apache License, Version 2.0
 * which accompanies this distribution and is available at
 * http://www.opensource.org/licenses/apache2.0.php.
 *************************************************************************/
package org.locationtech.geomesa.compute.spark.sql

import java.sql.Timestamp
import java.util.concurrent.atomic.AtomicInteger
import java.util.{Date, List => jList, Map => jMap, UUID}

import com.typesafe.scalalogging.LazyLogging
import com.vividsolutions.jts.geom.Geometry
import org.apache.hadoop.conf.Configuration
import org.apache.metamodel.query.FilterClause
import org.apache.metamodel.{DataContext, query}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, _}
import org.apache.spark.{SparkConf, SparkContext}
import org.geotools.data.{DataStoreFinder, DataUtilities, Query}
import org.geotools.factory.CommonFactoryFinder
import org.geotools.filter.text.ecql.ECQL
import org.geotools.filter.visitor.DuplicatingFilterVisitor
import org.locationtech.geomesa.compute.spark.GeoMesaSpark
import org.locationtech.geomesa.utils.geotools.SimpleFeatureTypes._
import org.locationtech.geomesa.utils.text.WKTUtils
import org.opengis.feature.`type`.AttributeDescriptor
import org.opengis.feature.simple.SimpleFeatureType
import org.opengis.filter.Filter
import org.opengis.filter.expression.PropertyName

import scala.collection.JavaConversions._
/**
 * Class to manage running SQL queries against GeoMesa using Spark.
 *
 * There can only be a single Spark context running in a given JVM, so it has to be managed using the
 * start/stop/register methods.
 */
object GeoMesaSparkSql extends LazyLogging {
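  // A minimal usage sketch - the data store parameters below are hypothetical; substitute the
  // connection parameters for your own GeoMesa data store:
  //
  //   GeoMesaSparkSql.registerDataStore(Map("instanceId" -> "mycloud", "zookeepers" -> "zoo1:2181",
  //     "user" -> "me", "password" -> "secret", "tableName" -> "geomesa.catalog"))
  //   GeoMesaSparkSql.start()
  //   val (schema, rows) = GeoMesaSparkSql.execute("SELECT * FROM mysft WHERE bbox(geom, -80, 30, -70, 40)")
  //   GeoMesaSparkSql.stop()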
  private val ff = CommonFactoryFinder.getFilterFactory2

  // state to keep track of our sfts and data store connection parameters
  private val dsParams = scala.collection.mutable.Set.empty[Map[String, String]]
  private val sfts = scala.collection.mutable.Set.empty[SimpleFeatureType]

  // singleton spark context
  private var sc: SparkContext = null
  private var sparkSql: GeoMesaSparkSql = null
  private var running = false
  private val executing = new AtomicInteger(0)

  /**
   * Register a data store. This makes all schemas in the data store available for querying.
   * Synchronized to ensure it's only called when the spark context is not running.
   */
  def registerDataStore(params: Map[String, String]): Unit = synchronized {
    require(!running, "Can't register a data store in a running instance")
    val ds = DataStoreFinder.getDataStore(params)
    require(ds != null, "No data store found using provided parameters")
    dsParams += params
    sfts ++= ds.getTypeNames.map(ds.getSchema)
  }
  /**
   * Starts the spark context, if not already running.
   */
  def start(configs: Map[String, String] = Map.empty,
            distributedJars: Seq[String] = Seq.empty): Boolean = synchronized {
    if (running) {
      logger.debug("Trying to start an already started instance")
      false
    } else {
      val conf = GeoMesaSpark.init(new SparkConf(), sfts.toSeq)
      conf.setAppName("GeoMesaSql")
      conf.setMaster("yarn-client")
      conf.setJars(distributedJars)
      configs.foreach { case (k, v) => conf.set(k, v) }
      sc = new SparkContext(conf)
      sparkSql = new GeoMesaSparkSql(sc, dsParams.toSeq)
      running = true
      true
    }
  }
  /**
   * Stops the spark context, if running. Blocks until all current processes have finished executing.
   * Note that the synchronization on this method will prevent new tasks from executing.
   *
   * @param wait
   *   if < 0, will block indefinitely
   *   if >= 0, will return after that many millis
   * @return true if successfully stopped, else false
   */
  def stop(wait: Long = -1): Boolean = synchronized {
    if (running) {
      val start = System.currentTimeMillis()
      // wait for current queries to stop
      while (executing.get() > 0 && (wait < 0 || System.currentTimeMillis() - start < wait)) {
        Thread.sleep(1000)
      }
      if (executing.get() > 0) {
        return false
      }
      sc.stop()
      sc = null
      sparkSql = null
      running = false
    } else {
      logger.debug("Trying to stop an already stopped instance")
    }
    true
  }
  /**
   * Execute a SQL query against GeoMesa. The where clause is interpreted as CQL.
   */
  def execute(sql: String, splits: Option[Int] = None): (StructType, Array[Row]) = {
    val canStart = synchronized {
      // we need to compare and modify the state inside the synchronized block
      if (running) {
        executing.incrementAndGet()
      }
      running
    }
    require(canStart, "Can only execute in a running instance")
    try {
      val results = sparkSql.query(sql, splits)
      // return the result schema and rows
      (results.schema, results.collect())
    } finally {
      executing.decrementAndGet()
    }
  }
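  // For example (with a hypothetical schema 'mysft' having attributes 'name', 'dtg' and 'geom'):
  //   execute("SELECT name, max(dtg) FROM mysft WHERE bbox(geom, -80, 30, -70, 40) GROUP BY name", splits = Some(8))
  // The bbox predicate is CQL and is pushed down to GeoMesa, while grouping and aggregation run in
  // Spark SQL; 'splits' hints the number of input splits for the underlying RDD.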
  /**
   * Extracts CQL from the SQL query.
   */
  private def extractCql(where: FilterClause,
                         context: DataContext,
                         sftNames: Seq[String]): Map[String, Filter] = {
    val sqlVisitor = new SqlVisitor(context, sftNames)
    val result = scala.collection.mutable.Map.empty[String, Filter]
    // items should have an expression if they can't be parsed as SQL
    // we interpret that to mean that they are CQL instead
    where.getItems.flatMap(i => Option(i.getExpression)).map(ECQL.toFilter).foreach { filter =>
      sqlVisitor.referencedSfts.clear()
      val updated = filter.accept(sqlVisitor, null).asInstanceOf[Filter]
      require(sqlVisitor.referencedSfts.size == 1, "CQL filters across multiple tables are not supported")
      val typeName = sqlVisitor.referencedSfts.head
      result.put(typeName, result.get(typeName).map(c => ff.and(updated, c)).getOrElse(updated))
    }
    result.toMap
  }
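  // E.g. for "SELECT * FROM mysft WHERE name = 'bob' AND bbox(geom, 0, 0, 10, 10)" (hypothetical
  // schema), "name = 'bob'" parses as SQL and is left alone, while the bbox predicate fails SQL
  // parsing, keeps its raw expression, and comes back here as Map("mysft" -> <bbox filter>).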
  /**
   * Get the attribute names referenced in the query - used to select a subset of attributes from GeoMesa.
   */
  def extractAttributeNames(sql: query.Query, cql: Map[String, Filter]): Map[String, Set[String]] = {
    val namesFromCql = cql.mapValues(DataUtilities.attributeNames(_).toSet)
    val namesFromSql = scala.collection.mutable.Map.empty[String, Set[String]]
    // we ignore the 'having' clause as it should always reference something from the select
    val selects = sql.getSelectClause.getItems ++
        sql.getWhereClause.getEvaluatedSelectItems ++
        sql.getGroupByClause.getEvaluatedSelectItems ++
        sql.getOrderByClause.getEvaluatedSelectItems
    selects.flatMap(s => Option(s.getColumn)).foreach { c =>
      val table = c.getTable.getName
      namesFromSql.put(table, namesFromSql.get(table).map(_ ++ Set(c.getName)).getOrElse(Set(c.getName)))
    }
    // combine the two maps
    namesFromSql.toMap ++ namesFromCql.map { case (k, v) =>
      k -> namesFromSql.get(k).map(_ ++ v.toSet).getOrElse(v.toSet)
    }
  }
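  // E.g. for "SELECT name FROM mysft WHERE bbox(geom, 0, 0, 10, 10)" (hypothetical schema), this
  // returns Map("mysft" -> Set("name", "geom")): 'name' from the select clause plus 'geom' from
  // the CQL filter.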
  /**
   * Converts a simple feature attribute into a SQL data type
   */
  private def types(d: AttributeDescriptor): DataType = {
    val clas = d.getType.getBinding
    if (classOf[jList[_]].isAssignableFrom(clas)) {
      val listClass = d.getUserData.get(USER_DATA_LIST_TYPE).asInstanceOf[Class[_]]
      DataTypes.createArrayType(types(listClass))
    } else if (classOf[jMap[_, _]].isAssignableFrom(clas)) {
      val keyClass = d.getUserData.get(USER_DATA_MAP_KEY_TYPE).asInstanceOf[Class[_]]
      val valueClass = d.getUserData.get(USER_DATA_MAP_VALUE_TYPE).asInstanceOf[Class[_]]
      DataTypes.createMapType(types(keyClass), types(valueClass))
    } else {
      types(clas)
    }
  }
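  // E.g. a List[Integer] attribute maps to ArrayType(IntegerType), and a Map[String, Double]
  // attribute maps to MapType(StringType, DoubleType), based on the list/map type hints stored
  // in the attribute's user data.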
  /**
   * Converts a simple class type into a SQL data type
   */
  private def types(clas: Class[_]): DataType = {
    if (classOf[java.lang.String].isAssignableFrom(clas)) {
      StringType
    } else if (classOf[java.lang.Integer].isAssignableFrom(clas)) {
      IntegerType
    } else if (classOf[java.lang.Long].isAssignableFrom(clas)) {
      LongType
    } else if (classOf[java.lang.Float].isAssignableFrom(clas)) {
      FloatType
    } else if (classOf[java.lang.Double].isAssignableFrom(clas)) {
      DoubleType
    } else if (classOf[java.lang.Boolean].isAssignableFrom(clas)) {
      BooleanType
    } else if (classOf[java.util.Date].isAssignableFrom(clas)) {
      TimestampType
    } else if (classOf[UUID].isAssignableFrom(clas)) {
      StringType
    } else if (classOf[Geometry].isAssignableFrom(clas)) {
      StringType
    } else {
      throw new NotImplementedError(s"Binding $clas is not supported")
    }
  }
}
class GeoMesaSparkSql(sc: SparkContext, dsParams: Seq[Map[String, String]]) {

  // load up our sfts
  val sftsByName = dsParams.flatMap { params =>
    val ds = DataStoreFinder.getDataStore(params)
    require(ds != null, "No data store found using provided parameters")
    ds.getTypeNames.map { name =>
      val schema = ds.getSchema(name)
      name -> (schema, params)
    }
  }.foldLeft(Map.empty[String, (SimpleFeatureType, Map[String, String])])(_ + _)

  private val dataContext = new GeoMesaDataContext(sftsByName.mapValues(_._1))
  /**
   * Execute a SQL query against GeoMesa. The where clause is interpreted as CQL.
   */
  def query(sql: String, splits: Option[Int]): DataFrame = {
    val parsedSql = dataContext.parseQuery(sql)
    // extract the feature types from the from clause
    val typeNames = parsedSql.getFromClause.getItems.map(_.getTable.getName)
    val sftsWithParams = typeNames.map(sftsByName.apply)
    // extract the cql from the where clause
    val where = parsedSql.getWhereClause
    val cql = GeoMesaSparkSql.extractCql(where, dataContext, typeNames)
    // clear out the cql from the where clause so spark doesn't try to parse it -
    // if it's a sql expression, the expression field will be null,
    // otherwise it has the raw expression, which we assume is cql
    where.getItems.filter(_.getExpression != null).foreach(where.removeItem)
    val sqlWithoutCql = parsedSql.toSql
    // restrict the attributes coming back to speed up the query
    val attributesByType = GeoMesaSparkSql.extractAttributeNames(parsedSql, cql)
    val sqlContext = new SQLContext(sc)
    // for each input sft, set up the sql table with the results from querying geomesa with the cql filter
    sftsWithParams.foreach { case (sft, params) =>
      val typeName = sft.getTypeName
      val allAttributes = sft.getAttributeDescriptors.map(_.getLocalName)
      val attributes = {
        val extracted = attributesByType(typeName).toList
        if (extracted.sorted == allAttributes.sorted) {
          None // if we've got all attributes, we don't need a transform
        } else {
          Some(extracted.toArray)
        }
      }
      val filter = cql.getOrElse(typeName, Filter.INCLUDE)
      val query = new Query(typeName, filter)
      attributes.foreach(query.setPropertyNames)
      // generate the sql schema based on the sft/query attributes
      val fields = attributes.getOrElse(allAttributes.toArray).map { field =>
        StructField(field, GeoMesaSparkSql.types(sft.getDescriptor(field)), nullable = true)
      }
      val schema = StructType(fields)
      // create an rdd from the query
      val features = GeoMesaSpark.rdd(new Configuration(), sc, params, query, splits)
      // convert records to rows - convert the values to sql-compatible ones
      val rowRdd = features.map { f =>
        val sqlAttributes = f.getAttributes.map {
          case g: Geometry => WKTUtils.write(g) // text
          case d: Date => new Timestamp(d.getTime) // sql timestamp
          case u: UUID => u.toString // text
          case a => a // others should map natively without explicit conversion
        }
        Row(sqlAttributes: _*)
      }
      // apply the schema to the rdd
      val featuresDataFrame = sqlContext.createDataFrame(rowRdd, schema)
      // register the data frame as a table, so that it's available to the sql engine
      featuresDataFrame.registerTempTable(typeName)
    }
    // run the sql statement against our registered tables
    sqlContext.sql(sqlWithoutCql)
  }
}
/**
 * Extracts property names from a filter. Names are expected to either be qualified with the
 * feature type name (e.g. mysft.myattr), or be unambiguous among the feature types being queried.
 */
class SqlVisitor(context: DataContext, sftNames: Seq[String]) extends DuplicatingFilterVisitor {
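  // E.g. "mysft.myattr > 5" resolves directly against the table 'mysft' (and is rewritten to drop
  // the qualifier), while an unqualified "myattr > 5" is matched against the columns of every table
  // in the from clause and must match exactly one. (Names here are illustrative.)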
  val referencedSfts = scala.collection.mutable.Set.empty[String]

  override def visit(expression: PropertyName, extraData: AnyRef): AnyRef = {
    val name = expression.getPropertyName
    require(name != null && !name.isEmpty, "Property name must not be null or empty")
    val parts = name.split("\\.|/") // ECQL converts '.' into '/' in properties, so we have to match both
    require(parts.length < 3, s"Ambiguous property name in filter: '$name'")
    if (parts.length == 2) {
      // qualified by sft name
      val matching = sftNames.filter(_ == parts.head)
      require(matching.nonEmpty, s"Property name does not match a table in the from clause: '$name'")
      referencedSfts.add(matching.head)
      getFactory(extraData).property(parts(1), expression.getNamespaceContext)
    } else {
      // not qualified - see if it unambiguously matches any of the tables
      val matching = sftNames.map(context.getTableByQualifiedLabel).flatMap(_.getColumns.find(_.getName == name))
      require(matching.nonEmpty, s"Property name does not match a table in the from clause: '$name'")
      require(matching.length == 1, s"Property name is ambiguous: '$name'")
      referencedSfts.add(matching.head.getTable.getName)
      expression
    }
  }
}