
/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala

https://gitlab.com/KiaraGrouwstra/spark
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hive

import java.sql.Timestamp

import org.apache.hadoop.hive.conf.HiveConf

import org.apache.spark.sql.execution.datasources.parquet.ParquetCompatibilityTest
import org.apache.spark.sql.{Row, SQLConf}
import org.apache.spark.sql.hive.test.TestHiveSingleton
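
/**
 * Checks that Parquet files written by Hive can be read back correctly through Spark SQL's
 * native Parquet support, across a range of Hive column types.
 */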
class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHiveSingleton {
  /**
   * The staging directory (and hence the path prefix of Parquet files to ignore when reading
   * schemas), as configured by [[HiveConf.ConfVars.STAGINGDIR]].
   */
  private val stagingDir = new HiveConf().getVar(HiveConf.ConfVars.STAGINGDIR)

  override protected def logParquetSchema(path: String): Unit = {
    val schema = readParquetSchema(path, { path =>
      !path.getName.startsWith("_") && !path.getName.startsWith(stagingDir)
    })

    logInfo(
      s"""Schema of the Parquet file written by parquet-hive:
         |$schema
       """.stripMargin)
  }
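
  /**
   * Writes `row` into a Hive-managed Parquet table whose columns are declared with the given
   * `hiveTypes`, then reads the resulting file back through Spark SQL's native Parquet reader
   * and checks that the same values come back out.
   */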
  private def testParquetHiveCompatibility(row: Row, hiveTypes: String*): Unit = {
    withTable("parquet_compat") {
      withTempPath { dir =>
        val path = dir.getCanonicalPath

        // Hive columns are always nullable, so here we append an all-null row.
        val rows = row :: Row(Seq.fill(row.length)(null): _*) :: Nil

        // Don't convert Hive metastore Parquet tables, so that Hive itself (rather than Spark's
        // native Parquet support) writes these Parquet files.
        withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") {
          withTempTable("data") {
            val fields = hiveTypes.zipWithIndex.map { case (typ, index) => s" col_$index $typ" }

            val ddl =
              s"""CREATE TABLE parquet_compat(
                 |${fields.mkString(",\n")}
                 |)
                 |STORED AS PARQUET
                 |LOCATION '$path'
               """.stripMargin

            logInfo(
              s"""Creating test Parquet table with the following DDL:
                 |$ddl
               """.stripMargin)

            sqlContext.sql(ddl)

            val schema = sqlContext.table("parquet_compat").schema
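            // `coalesce(1)` gives a single partition, so the INSERT below should produce a
            // single Parquet file.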
            val rowRDD = sqlContext.sparkContext.parallelize(rows).coalesce(1)
            sqlContext.createDataFrame(rowRDD, schema).registerTempTable("data")
            sqlContext.sql("INSERT INTO TABLE parquet_compat SELECT * FROM data")
          }
        }

        logParquetSchema(path)

        // Unfortunately parquet-hive doesn't add the `UTF8` annotation to BINARY when writing
        // strings, so we have to assume that all BINARY values are strings here.
        withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true") {
          checkAnswer(sqlContext.read.parquet(path), rows)
        }
      }
    }
  }
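
  // Each test below round-trips one family of Hive column types through a Hive-written
  // Parquet file.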
  test("simple primitives") {
    testParquetHiveCompatibility(
      Row(true, 1.toByte, 2.toShort, 3, 4.toLong, 5.1f, 6.1d, "foo"),
      "BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE", "STRING")
  }
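
  // SPARK-10177: Hive stores Parquet timestamps using the INT96 physical type, so this checks
  // that such values survive the round trip.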
  test("SPARK-10177 timestamp") {
    testParquetHiveCompatibility(Row(Timestamp.valueOf("2015-08-24 00:31:00")), "TIMESTAMP")
  }

  test("array") {
    testParquetHiveCompatibility(
      Row(
        Seq[Integer](1: Integer, null, 2: Integer, null),
        Seq[String]("foo", null, "bar", null),
        Seq[Seq[Integer]](
          Seq[Integer](1: Integer, null),
          Seq[Integer](2: Integer, null))),
      "ARRAY<INT>",
      "ARRAY<STRING>",
      "ARRAY<ARRAY<INT>>")
  }

  test("map") {
    testParquetHiveCompatibility(
      Row(
        Map[Integer, String](
          (1: Integer) -> "foo",
          (2: Integer) -> null)),
      "MAP<INT, STRING>")
  }

  // HIVE-11625: Parquet map entries with null keys are dropped by Hive, so this test stays
  // ignored until that upstream issue is fixed.
  ignore("map entries with null keys") {
    testParquetHiveCompatibility(
      Row(
        Map[Integer, String](
          null.asInstanceOf[Integer] -> "bar",
          null.asInstanceOf[Integer] -> null)),
      "MAP<INT, STRING>")
  }

  test("struct") {
    testParquetHiveCompatibility(
      Row(Row(1, Seq("foo", "bar", null))),
      "STRUCT<f0: INT, f1: ARRAY<STRING>>")
  }
}
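
To run just this suite with Spark's sbt build (the exact invocation is an assumption based on the standard Spark build of this era, not something stated on this page):

  build/sbt "hive/test-only org.apache.spark.sql.hive.ParquetHiveCompatibilitySuite"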