
/hcatalog/core/src/test/java/org/apache/hcatalog/mapreduce/TestHCatMultiOutputFormat.java

http://github.com/apache/hive
Possible License(s): Apache-2.0
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hcatalog.mapreduce;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStore;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hcatalog.cli.SemanticAnalysis.HCatSemanticAnalyzer;
import org.apache.hcatalog.common.HCatException;
import org.apache.hcatalog.data.DefaultHCatRecord;
import org.apache.hcatalog.data.HCatRecord;
import org.apache.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hcatalog.data.schema.HCatSchema;
import org.apache.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * @deprecated Use/modify {@link org.apache.hive.hcatalog.mapreduce.TestHCatMultiOutputFormat} instead
 */
public class TestHCatMultiOutputFormat {

  private static final Logger LOG = LoggerFactory.getLogger(TestHCatMultiOutputFormat.class);

  private static final String DATABASE = "default";
  private static final String[] tableNames = {"test1", "test2", "test3"};
  private static final String[] tablePerms = {"755", "750", "700"};
  private static Path warehousedir = null;
  private static HashMap<String, HCatSchema> schemaMap = new HashMap<String, HCatSchema>();
  private static HiveMetaStoreClient hmsc;
  private static MiniMRCluster mrCluster;
  private static Configuration mrConf;
  private static HiveConf hiveConf;
  private static File workDir;
  private static final String msPort = "20199";
  private static Thread t;

  static {
    schemaMap.put(tableNames[0], new HCatSchema(ColumnHolder.hCattest1Cols));
    schemaMap.put(tableNames[1], new HCatSchema(ColumnHolder.hCattest2Cols));
    schemaMap.put(tableNames[2], new HCatSchema(ColumnHolder.hCattest3Cols));
  }
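
  // Starts an in-process Hive metastore Thrift server on msPort, with its
  // warehouse rooted at warehousedir; run on a background thread from setup().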
  private static class RunMS implements Runnable {

    @Override
    public void run() {
      try {
        String warehouseConf = HiveConf.ConfVars.METASTOREWAREHOUSE.varname + "="
          + warehousedir.toString();
        HiveMetaStore.main(new String[]{"-v", "-p", msPort, "--hiveconf", warehouseConf});
      } catch (Throwable t) {
        System.err.println("Exiting. Got exception from metastore: " + t.getMessage());
      }
    }
  }
  /**
   * Private class which holds all the data for the test cases
   */
  private static class ColumnHolder {

    private static ArrayList<HCatFieldSchema> hCattest1Cols = new ArrayList<HCatFieldSchema>();
    private static ArrayList<HCatFieldSchema> hCattest2Cols = new ArrayList<HCatFieldSchema>();
    private static ArrayList<HCatFieldSchema> hCattest3Cols = new ArrayList<HCatFieldSchema>();

    private static ArrayList<FieldSchema> partitionCols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test1Cols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test2Cols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test3Cols = new ArrayList<FieldSchema>();

    private static HashMap<String, List<FieldSchema>> colMapping = new HashMap<String, List<FieldSchema>>();

    static {
      try {
        FieldSchema keyCol = new FieldSchema("key", serdeConstants.STRING_TYPE_NAME, "");
        test1Cols.add(keyCol);
        test2Cols.add(keyCol);
        test3Cols.add(keyCol);
        hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        hCattest2Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        FieldSchema valueCol = new FieldSchema("value", serdeConstants.STRING_TYPE_NAME, "");
        test1Cols.add(valueCol);
        test3Cols.add(valueCol);
        hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol));
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol));
        FieldSchema extraCol = new FieldSchema("extra", serdeConstants.STRING_TYPE_NAME, "");
        test3Cols.add(extraCol);
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(extraCol));
        colMapping.put("test1", test1Cols);
        colMapping.put("test2", test2Cols);
        colMapping.put("test3", test3Cols);
      } catch (HCatException e) {
        LOG.error("Error in setting up schema fields for the table", e);
        throw new RuntimeException(e);
      }
    }

    static {
      partitionCols.add(new FieldSchema("ds", serdeConstants.STRING_TYPE_NAME, ""));
      partitionCols.add(new FieldSchema("cluster", serdeConstants.STRING_TYPE_NAME, ""));
    }
  }
  @BeforeClass
  public static void setup() throws Exception {
    String testDir = System.getProperty("test.data.dir", "./");
    testDir = testDir + "/test_multitable_" + Math.abs(new Random().nextLong()) + "/";
    workDir = new File(new File(testDir).getCanonicalPath());
    FileUtil.fullyDelete(workDir);
    workDir.mkdirs();
    warehousedir = new Path(workDir + "/warehouse");

    // Run hive metastore server
    t = new Thread(new RunMS());
    t.start();

    // LocalJobRunner does not work with mapreduce OutputCommitter. So need
    // to use MiniMRCluster. MAPREDUCE-2350
    Configuration conf = new Configuration(true);
    conf.set("yarn.scheduler.capacity.root.queues", "default");
    conf.set("yarn.scheduler.capacity.root.default.capacity", "100");

    FileSystem fs = FileSystem.get(conf);
    System.setProperty("hadoop.log.dir", new File(workDir, "/logs").getAbsolutePath());
    mrCluster = new MiniMRCluster(1, fs.getUri().toString(), 1, null, null,
      new JobConf(conf));
    mrConf = mrCluster.createJobConf();
    fs.mkdirs(warehousedir);

    initializeSetup();
  }
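
  // Points a HiveConf at the metastore started above and creates the test tables.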
  private static void initializeSetup() throws Exception {
    hiveConf = new HiveConf(mrConf, TestHCatMultiOutputFormat.class);
    hiveConf.set("hive.metastore.local", "false");
    hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, "thrift://localhost:" + msPort);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTCONNECTIONRETRIES, 3);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTFAILURERETRIES, 3);
    hiveConf.set(HiveConf.ConfVars.SEMANTIC_ANALYZER_HOOK.varname,
      HCatSemanticAnalyzer.class.getName());
    hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY.varname, "false");
    System.setProperty(HiveConf.ConfVars.PREEXECHOOKS.varname, " ");
    System.setProperty(HiveConf.ConfVars.POSTEXECHOOKS.varname, " ");
    hiveConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousedir.toString());

    try {
      hmsc = new HiveMetaStoreClient(hiveConf, null);
      initalizeTables();
    } catch (Throwable e) {
      LOG.error("Exception encountered while setting up testcase", e);
      throw new Exception(e);
    } finally {
      hmsc.close();
    }
  }
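
  // Drops any pre-existing test tables, then recreates each one with its own
  // directory permission from tablePerms.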
  private static void initalizeTables() throws Exception {
    for (String table : tableNames) {
      try {
        if (hmsc.getTable(DATABASE, table) != null) {
          hmsc.dropTable(DATABASE, table);
        }
      } catch (NoSuchObjectException ignored) {
      }
    }
    for (int i = 0; i < tableNames.length; i++) {
      createTable(tableNames[i], tablePerms[i]);
    }
  }
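
  // Creates a partitioned, RCFile-backed table via the metastore client and
  // applies the requested permission to its warehouse directory.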
  private static void createTable(String tableName, String tablePerm) throws Exception {
    Table tbl = new Table();
    tbl.setDbName(DATABASE);
    tbl.setTableName(tableName);
    StorageDescriptor sd = new StorageDescriptor();
    sd.setCols(ColumnHolder.colMapping.get(tableName));
    tbl.setSd(sd);
    sd.setParameters(new HashMap<String, String>());
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setName(tbl.getTableName());
    sd.getSerdeInfo().setParameters(new HashMap<String, String>());
    sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName());
    sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName());
    sd.getSerdeInfo().getParameters().put(serdeConstants.SERIALIZATION_FORMAT, "1");
    sd.getSerdeInfo().setSerializationLib(
      org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName());
    tbl.setPartitionKeys(ColumnHolder.partitionCols);

    hmsc.createTable(tbl);
    FileSystem fs = FileSystem.get(mrConf);
    fs.setPermission(new Path(warehousedir, tableName), new FsPermission(tablePerm));
  }
  @AfterClass
  public static void tearDown() throws IOException {
    FileUtil.fullyDelete(workDir);
    FileSystem fs = FileSystem.get(mrConf);
    if (fs.exists(warehousedir)) {
      fs.delete(warehousedir, true);
    }
    if (mrCluster != null) {
      mrCluster.shutdown();
    }
  }
  /**
   * Simple test case.
   * <ol>
   * <li>Submits a mapred job which writes out one fixed line to each of the tables</li>
   * <li>Uses a Hive fetch task to read the data and verify it matches what was written</li>
   * </ol>
   *
   * @throws Exception if any error occurs
   */
  @Test
  public void testOutputFormat() throws Throwable {
    HashMap<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("ds", "1");
    partitionValues.put("cluster", "ag");
    ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
    infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));

    Job job = new Job(hiveConf, "SampleJob");

    job.setMapperClass(MyMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);

    for (int i = 0; i < tableNames.length; i++) {
      configurer.addOutputFormat(tableNames[i], HCatOutputFormat.class, BytesWritable.class,
        HCatRecord.class);
      HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
      HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]),
        schemaMap.get(tableNames[i]));
    }
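    // Merge the per-alias output configurations back into the main job before submission.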
    configurer.configure();

    Path filePath = createInputFile();
    FileInputFormat.addInputPath(job, filePath);
    Assert.assertTrue(job.waitForCompletion(true));

    ArrayList<String> outputs = new ArrayList<String>();
    for (String tbl : tableNames) {
      outputs.add(getTableData(tbl, "default").get(0));
    }
    Assert.assertEquals("Comparing output of table " +
      tableNames[0] + " is not correct", outputs.get(0), "a,a,1,ag");
    Assert.assertEquals("Comparing output of table " +
      tableNames[1] + " is not correct", outputs.get(1), "a,1,ag");
    Assert.assertEquals("Comparing output of table " +
      tableNames[2] + " is not correct", outputs.get(2), "a,a,extra,1,ag");

    // Check permission on partition dirs and files created
    for (int i = 0; i < tableNames.length; i++) {
      Path partitionFile = new Path(warehousedir + "/" + tableNames[i]
        + "/ds=1/cluster=ag/part-m-00000");
      FileSystem fs = partitionFile.getFileSystem(mrConf);
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
        fs.getFileStatus(partitionFile).getPermission(),
        new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
        fs.getFileStatus(partitionFile.getParent()).getPermission(),
        new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
        fs.getFileStatus(partitionFile.getParent().getParent()).getPermission(),
        new FsPermission(tablePerms[i]));
    }
    LOG.info("File permissions verified");
  }
  /**
   * Create an input file for the map task
   *
   * @return absolute path of the file.
   * @throws IOException if any error encountered
   */
  private Path createInputFile() throws IOException {
    Path f = new Path(workDir + "/MultiTableInput.txt");
    FileSystem fs = FileSystem.get(mrConf);
    if (fs.exists(f)) {
      fs.delete(f, true);
    }
    OutputStream out = fs.create(f);
    for (int i = 0; i < 3; i++) {
      out.write("a,a\n".getBytes());
    }
    out.close();
    return f;
  }
  /**
   * Method to fetch table data
   *
   * @param table table name
   * @param database database
   * @return list of rows, with columns separated by commas
   * @throws Exception if any error occurs
   */
  private List<String> getTableData(String table, String database) throws Exception {
    HiveConf conf = new HiveConf();
    conf.addResource("hive-site.xml");
    ArrayList<String> results = new ArrayList<String>();
    ArrayList<String> temp = new ArrayList<String>();
    Hive hive = Hive.get(conf);
    org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
    FetchWork work;
    if (!tbl.getPartCols().isEmpty()) {
      List<Partition> partitions = hive.getPartitions(tbl);
      List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
      List<String> partLocs = new ArrayList<String>();
      for (Partition part : partitions) {
        partLocs.add(part.getLocation());
        partDesc.add(Utilities.getPartitionDesc(part));
      }
      work = new FetchWork(partLocs, partDesc, Utilities.getTableDesc(tbl));
      work.setLimit(100);
    } else {
      work = new FetchWork(tbl.getDataLocation().toString(), Utilities.getTableDesc(tbl));
    }
    FetchTask task = new FetchTask();
    task.setWork(work);
    task.initialize(conf, null, null);
    task.fetch(temp);
    for (String str : temp) {
      results.add(str.replace("\t", ","));
    }
    return results;
  }
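
  // Map-only job mapper: each call writes one record, cycling through the three
  // target tables; the record arity matches the schema of the table it targets.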
  private static class MyMapper extends
    Mapper<LongWritable, Text, BytesWritable, HCatRecord> {

    private int i = 0;

    @Override
    protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
      HCatRecord record = null;
      String[] splits = value.toString().split(",");
      switch (i) {
      case 0:
        record = new DefaultHCatRecord(2);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        break;
      case 1:
        record = new DefaultHCatRecord(1);
        record.set(0, splits[0]);
        break;
      case 2:
        record = new DefaultHCatRecord(3);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        record.set(2, "extra");
        break;
      default:
        Assert.fail("This should not happen!!!!!");
      }
      MultiOutputFormat.write(tableNames[i], null, record, context);
      i++;
    }
  }
}
  389. }