
/hcatalog/core/src/test/java/org/apache/hive/hcatalog/mapreduce/TestHCatMultiOutputFormat.java

http://github.com/apache/hive
Java | 431 lines | 340 code | 41 blank | 50 comment | 17 complexity
Possible License(s): Apache-2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hive.hcatalog.mapreduce;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreTestUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hive.hcatalog.cli.SemanticAnalysis.HCatSemanticAnalyzer;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
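
/**
 * Tests writing to multiple HCatalog tables from a single map-only job via
 * {@link MultiOutputFormat}, then verifies both the data written to each
 * table and the permissions of the partition directories and files created.
 */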
public class TestHCatMultiOutputFormat {

  private static final Logger LOG = LoggerFactory.getLogger(TestHCatMultiOutputFormat.class);

  private static final String DATABASE = "default";
  private static final String[] tableNames = {"test1", "test2", "test3"};
  private static final String[] tablePerms = {"755", "750", "700"};
  private static Path warehousedir = null;
  private static HashMap<String, HCatSchema> schemaMap = new HashMap<String, HCatSchema>();
  private static HiveMetaStoreClient hmsc;
  private static MiniMRCluster mrCluster;
  private static Configuration mrConf;
  private static HiveConf hiveConf;
  private static File workDir;

  static {
    schemaMap.put(tableNames[0], new HCatSchema(ColumnHolder.hCattest1Cols));
    schemaMap.put(tableNames[1], new HCatSchema(ColumnHolder.hCattest2Cols));
    schemaMap.put(tableNames[2], new HCatSchema(ColumnHolder.hCattest3Cols));
  }

  /**
   * Private class which holds all the data for the test cases
   */
  private static class ColumnHolder {

    private static ArrayList<HCatFieldSchema> hCattest1Cols = new ArrayList<HCatFieldSchema>();
    private static ArrayList<HCatFieldSchema> hCattest2Cols = new ArrayList<HCatFieldSchema>();
    private static ArrayList<HCatFieldSchema> hCattest3Cols = new ArrayList<HCatFieldSchema>();

    private static ArrayList<FieldSchema> partitionCols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test1Cols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test2Cols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test3Cols = new ArrayList<FieldSchema>();

    private static HashMap<String, List<FieldSchema>> colMapping = new HashMap<String, List<FieldSchema>>();

    static {
      try {
        FieldSchema keyCol = new FieldSchema("key", serdeConstants.STRING_TYPE_NAME, "");
        test1Cols.add(keyCol);
        test2Cols.add(keyCol);
        test3Cols.add(keyCol);
        hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        hCattest2Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        FieldSchema valueCol = new FieldSchema("value", serdeConstants.STRING_TYPE_NAME, "");
        test1Cols.add(valueCol);
        test3Cols.add(valueCol);
        hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol));
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol));
        FieldSchema extraCol = new FieldSchema("extra", serdeConstants.STRING_TYPE_NAME, "");
        test3Cols.add(extraCol);
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(extraCol));
        colMapping.put("test1", test1Cols);
        colMapping.put("test2", test2Cols);
        colMapping.put("test3", test3Cols);
      } catch (HCatException e) {
        LOG.error("Error in setting up schema fields for the table", e);
        throw new RuntimeException(e);
      }
    }

    static {
      partitionCols.add(new FieldSchema("ds", serdeConstants.STRING_TYPE_NAME, ""));
      partitionCols.add(new FieldSchema("cluster", serdeConstants.STRING_TYPE_NAME, ""));
    }
  }
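
  /**
   * Starts a standalone metastore and a single-node MiniMRCluster, then
   * creates the test tables with different directory permissions.
   */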
  @BeforeClass
  public static void setup() throws Exception {
    System.clearProperty("mapred.job.tracker");
    String testDir = System.getProperty("test.tmp.dir", "./");
    testDir = testDir + "/test_multitable_" + Math.abs(new Random().nextLong()) + "/";
    workDir = new File(new File(testDir).getCanonicalPath());
    FileUtil.fullyDelete(workDir);
    workDir.mkdirs();

    warehousedir = new Path(System.getProperty("test.warehouse.dir"));

    HiveConf metastoreConf = new HiveConf();
    metastoreConf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, warehousedir.toString());

    // Run hive metastore server
    MetaStoreTestUtils.startMetaStoreWithRetry(metastoreConf);
    // Read the warehouse dir, which can be changed so multiple MetaStore tests could be run on
    // the same server
    warehousedir = new Path(MetastoreConf.getVar(metastoreConf, MetastoreConf.ConfVars.WAREHOUSE));

    // LocalJobRunner does not work with mapreduce OutputCommitter. So need
    // to use MiniMRCluster. MAPREDUCE-2350
    Configuration conf = new Configuration(true);
    conf.set("yarn.scheduler.capacity.root.queues", "default");
    conf.set("yarn.scheduler.capacity.root.default.capacity", "100");

    FileSystem fs = FileSystem.get(conf);
    System.setProperty("hadoop.log.dir", new File(workDir, "/logs").getAbsolutePath());

    mrCluster = new MiniMRCluster(1, fs.getUri().toString(), 1, null, null,
        new JobConf(conf));
    mrConf = mrCluster.createJobConf();

    initializeSetup(metastoreConf);

    warehousedir.getFileSystem(conf).mkdirs(warehousedir);
  }
  private static void initializeSetup(HiveConf metastoreConf) throws Exception {
    hiveConf = new HiveConf(metastoreConf, TestHCatMultiOutputFormat.class);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTCONNECTIONRETRIES, 3);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTFAILURERETRIES, 3);
    hiveConf.set(HiveConf.ConfVars.SEMANTIC_ANALYZER_HOOK.varname,
        HCatSemanticAnalyzer.class.getName());
    hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY.varname, "false");
    System.setProperty(HiveConf.ConfVars.PREEXECHOOKS.varname, " ");
    System.setProperty(HiveConf.ConfVars.POSTEXECHOOKS.varname, " ");
    System.setProperty(HiveConf.ConfVars.METASTOREWAREHOUSE.varname,
        MetastoreConf.getVar(hiveConf, MetastoreConf.ConfVars.WAREHOUSE));
    System.setProperty(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname,
        MetastoreConf.getVar(hiveConf, MetastoreConf.ConfVars.CONNECT_URL_KEY));
    System.setProperty(HiveConf.ConfVars.METASTOREURIS.varname,
        MetastoreConf.getVar(hiveConf, MetastoreConf.ConfVars.THRIFT_URIS));
    hiveConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousedir.toString());
    try {
      hmsc = new HiveMetaStoreClient(hiveConf);
      initalizeTables();
    } catch (Throwable e) {
      LOG.error("Exception encountered while setting up testcase", e);
      throw new Exception(e);
    } finally {
      hmsc.close();
    }
  }
  private static void initalizeTables() throws Exception {
    for (String table : tableNames) {
      try {
        if (hmsc.getTable(DATABASE, table) != null) {
          hmsc.dropTable(DATABASE, table);
        }
      } catch (NoSuchObjectException ignored) {
      }
    }
    for (int i = 0; i < tableNames.length; i++) {
      createTable(tableNames[i], tablePerms[i]);
    }
  }
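
  /**
   * Creates an RCFile-backed table partitioned by (ds, cluster) and sets the
   * given permission on its warehouse directory.
   */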
  private static void createTable(String tableName, String tablePerm) throws Exception {
    Table tbl = new Table();
    tbl.setDbName(DATABASE);
    tbl.setTableName(tableName);
    StorageDescriptor sd = new StorageDescriptor();
    sd.setCols(ColumnHolder.colMapping.get(tableName));
    tbl.setSd(sd);
    sd.setParameters(new HashMap<String, String>());
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setName(tbl.getTableName());
    sd.getSerdeInfo().setParameters(new HashMap<String, String>());
    sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName());
    sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName());
    sd.getSerdeInfo().getParameters().put(serdeConstants.SERIALIZATION_FORMAT, "1");
    sd.getSerdeInfo().setSerializationLib(
        org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName());
    tbl.setPartitionKeys(ColumnHolder.partitionCols);

    hmsc.createTable(tbl);
    Path path = new Path(warehousedir, tableName);
    FileSystem fs = path.getFileSystem(hiveConf);
    fs.setPermission(path, new FsPermission(tablePerm));
  }
  @AfterClass
  public static void tearDown() throws IOException {
    FileUtil.fullyDelete(workDir);
    FileSystem fs = warehousedir.getFileSystem(hiveConf);
    if (fs.exists(warehousedir)) {
      fs.delete(warehousedir, true);
    }
    if (mrCluster != null) {
      mrCluster.shutdown();
    }
  }
  /**
   * Simple test case.
   * <ol>
   * <li>Submits a mapred job which writes out one fixed line to each of the tables</li>
   * <li>Uses a Hive fetch task to read the data back and checks that it matches what was written</li>
   * </ol>
   *
   * @throws Exception if any error occurs
   */
  @Test
  public void testOutputFormat() throws Throwable {
    HashMap<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("ds", "1");
    partitionValues.put("cluster", "ag");
    ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
    infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));

    Job job = new Job(hiveConf, "SampleJob");

    job.setMapperClass(MyMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);

    for (int i = 0; i < tableNames.length; i++) {
      configurer.addOutputFormat(tableNames[i], HCatOutputFormat.class, BytesWritable.class,
          HCatRecord.class);
      HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
      HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]),
          schemaMap.get(tableNames[i]));
    }
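    // configure() folds the per-table output configurations set up above back
    // into the parent job, so MultiOutputFormat can route each record to the
    // right HCatOutputFormat at write time.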
    configurer.configure();

    Path filePath = createInputFile();
    FileInputFormat.addInputPath(job, filePath);
    Assert.assertTrue(job.waitForCompletion(true));

    ArrayList<String> outputs = new ArrayList<String>();
    for (String tbl : tableNames) {
      outputs.add(getTableData(tbl, "default").get(0));
    }
    Assert.assertEquals("Comparing output of table " +
        tableNames[0] + " is not correct", outputs.get(0), "a,a,1,ag");
    Assert.assertEquals("Comparing output of table " +
        tableNames[1] + " is not correct", outputs.get(1),
        "a,1,ag");
    Assert.assertEquals("Comparing output of table " +
        tableNames[2] + " is not correct", outputs.get(2), "a,a,extra,1,ag");

    // Check permissions on partition dirs and files created
    for (int i = 0; i < tableNames.length; i++) {
      Path partitionFile = new Path(warehousedir + "/" + tableNames[i]
          + "/ds=1/cluster=ag/part-m-00000");
      FileSystem fs = partitionFile.getFileSystem(mrConf);
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile).getPermission(),
          new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile.getParent()).getPermission(),
          new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile.getParent().getParent()).getPermission(),
          new FsPermission(tablePerms[i]));
    }
    LOG.info("File permissions verified");
  }
  /**
   * Creates an input file for the map tasks.
   *
   * @return absolute path of the file.
   * @throws IOException if any error encountered
   */
  private Path createInputFile() throws IOException {
    Path f = new Path(workDir + "/MultiTableInput.txt");
    FileSystem fs = FileSystem.get(mrConf);
    if (fs.exists(f)) {
      fs.delete(f, true);
    }
    OutputStream out = fs.create(f);
    for (int i = 0; i < 3; i++) {
      out.write("a,a\n".getBytes());
    }
    out.close();
    return f;
  }
  /**
   * Method to fetch table data.
   *
   * @param table table name
   * @param database database name
   * @return rows of the table, with columns joined by commas
   * @throws Exception if any error occurs
   */
  private List<String> getTableData(String table, String database) throws Exception {
    QueryState queryState = new QueryState.Builder().build();
    HiveConf conf = queryState.getConf();
    conf.addResource("hive-site.xml");
    ArrayList<String> results = new ArrayList<String>();
    ArrayList<String> temp = new ArrayList<String>();
    Hive hive = Hive.get(conf);
    org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
    FetchWork work;
    if (!tbl.getPartCols().isEmpty()) {
      List<Partition> partitions = hive.getPartitions(tbl);
      List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
      List<Path> partLocs = new ArrayList<Path>();
      TableDesc tableDesc = Utilities.getTableDesc(tbl);
      for (Partition part : partitions) {
        partLocs.add(part.getDataLocation());
        partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
      }
      work = new FetchWork(partLocs, partDesc, tableDesc);
      work.setLimit(100);
    } else {
      work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
    }
    FetchTask task = new FetchTask();
    task.setWork(work);
    conf.set("_hive.hdfs.session.path", "path");
    conf.set("_hive.local.session.path", "path");
    task.initialize(queryState, null, null, new org.apache.hadoop.hive.ql.Context(conf));
    task.fetch(temp);
    for (String str : temp) {
      results.add(str.replace("\t", ","));
    }
    return results;
  }
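
  /**
   * Map-only mapper that writes each input line to a different table: the
   * first record goes to test1 (key, value), the second to test2 (key only),
   * and the third to test3 (key, value, extra), using
   * {@link MultiOutputFormat#write} to select the destination by table alias.
   */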
  private static class MyMapper extends
      Mapper<LongWritable, Text, BytesWritable, HCatRecord> {

    private int i = 0;

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      HCatRecord record = null;
      String[] splits = value.toString().split(",");
      switch (i) {
      case 0:
        record = new DefaultHCatRecord(2);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        break;
      case 1:
        record = new DefaultHCatRecord(1);
        record.set(0, splits[0]);
        break;
      case 2:
        record = new DefaultHCatRecord(3);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        record.set(2, "extra");
        break;
      default:
        Assert.fail("This should not happen!!!!!");
      }
      MultiOutputFormat.write(tableNames[i], null, record, context);
      i++;
    }
  }
}