
/hcatalog/core/src/test/java/org/apache/hive/hcatalog/mapreduce/TestHCatMultiOutputFormat.java

http://github.com/apache/hive
Java | 431 lines | 340 code | 41 blank | 50 comment | 17 complexity
Possible License(s): Apache-2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hive.hcatalog.mapreduce;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreTestUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hive.hcatalog.cli.SemanticAnalysis.HCatSemanticAnalyzer;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
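
/**
 * Tests writing to multiple HCatalog tables from a single map-only job via
 * {@link MultiOutputFormat}, then verifies both the data written to each
 * table and the permissions of the partition directories and files created.
 */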
public class TestHCatMultiOutputFormat {

  private static final Logger LOG = LoggerFactory.getLogger(TestHCatMultiOutputFormat.class);

  private static final String DATABASE = "default";
  private static final String[] tableNames = {"test1", "test2", "test3"};
  private static final String[] tablePerms = {"755", "750", "700"};
  private static Path warehousedir = null;
  private static HashMap<String, HCatSchema> schemaMap = new HashMap<String, HCatSchema>();
  private static HiveMetaStoreClient hmsc;
  private static MiniMRCluster mrCluster;
  private static Configuration mrConf;
  private static HiveConf hiveConf;
  private static File workDir;

  static {
    schemaMap.put(tableNames[0], new HCatSchema(ColumnHolder.hCattest1Cols));
    schemaMap.put(tableNames[1], new HCatSchema(ColumnHolder.hCattest2Cols));
    schemaMap.put(tableNames[2], new HCatSchema(ColumnHolder.hCattest3Cols));
  }

  /**
   * Private class which holds all the data for the test cases
   */
  private static class ColumnHolder {

    private static ArrayList<HCatFieldSchema> hCattest1Cols = new ArrayList<HCatFieldSchema>();
    private static ArrayList<HCatFieldSchema> hCattest2Cols = new ArrayList<HCatFieldSchema>();
    private static ArrayList<HCatFieldSchema> hCattest3Cols = new ArrayList<HCatFieldSchema>();

    private static ArrayList<FieldSchema> partitionCols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test1Cols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test2Cols = new ArrayList<FieldSchema>();
    private static ArrayList<FieldSchema> test3Cols = new ArrayList<FieldSchema>();

    private static HashMap<String, List<FieldSchema>> colMapping = new HashMap<String, List<FieldSchema>>();

    static {
      try {
        FieldSchema keyCol = new FieldSchema("key", serdeConstants.STRING_TYPE_NAME, "");
        test1Cols.add(keyCol);
        test2Cols.add(keyCol);
        test3Cols.add(keyCol);
        hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        hCattest2Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol));
        FieldSchema valueCol = new FieldSchema("value", serdeConstants.STRING_TYPE_NAME, "");
        test1Cols.add(valueCol);
        test3Cols.add(valueCol);
        hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol));
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol));
        FieldSchema extraCol = new FieldSchema("extra", serdeConstants.STRING_TYPE_NAME, "");
        test3Cols.add(extraCol);
        hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(extraCol));
        colMapping.put("test1", test1Cols);
        colMapping.put("test2", test2Cols);
        colMapping.put("test3", test3Cols);
      } catch (HCatException e) {
        LOG.error("Error in setting up schema fields for the table", e);
        throw new RuntimeException(e);
      }
    }

    static {
      partitionCols.add(new FieldSchema("ds", serdeConstants.STRING_TYPE_NAME, ""));
      partitionCols.add(new FieldSchema("cluster", serdeConstants.STRING_TYPE_NAME, ""));
    }
  }
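
  /**
   * Starts a standalone metastore and a single-node MiniMRCluster, then
   * creates the test tables with different directory permissions.
   */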
  @BeforeClass
  public static void setup() throws Exception {
    System.clearProperty("mapred.job.tracker");
    String testDir = System.getProperty("test.tmp.dir", "./");
    testDir = testDir + "/test_multitable_" + Math.abs(new Random().nextLong()) + "/";
    workDir = new File(new File(testDir).getCanonicalPath());
    FileUtil.fullyDelete(workDir);
    workDir.mkdirs();

    warehousedir = new Path(System.getProperty("test.warehouse.dir"));

    HiveConf metastoreConf = new HiveConf();
    metastoreConf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, warehousedir.toString());

    // Run hive metastore server
    MetaStoreTestUtils.startMetaStoreWithRetry(metastoreConf);
    // Read the warehouse dir, which can be changed so multiple MetaStore tests could be run on
    // the same server
    warehousedir = new Path(MetastoreConf.getVar(metastoreConf, MetastoreConf.ConfVars.WAREHOUSE));

    // LocalJobRunner does not work with mapreduce OutputCommitter. So need
    // to use MiniMRCluster. MAPREDUCE-2350
    Configuration conf = new Configuration(true);
    conf.set("yarn.scheduler.capacity.root.queues", "default");
    conf.set("yarn.scheduler.capacity.root.default.capacity", "100");

    FileSystem fs = FileSystem.get(conf);
    System.setProperty("hadoop.log.dir", new File(workDir, "/logs").getAbsolutePath());

    mrCluster = new MiniMRCluster(1, fs.getUri().toString(), 1, null, null,
        new JobConf(conf));
    mrConf = mrCluster.createJobConf();

    initializeSetup(metastoreConf);

    warehousedir.getFileSystem(conf).mkdirs(warehousedir);
  }
  private static void initializeSetup(HiveConf metastoreConf) throws Exception {
    hiveConf = new HiveConf(metastoreConf, TestHCatMultiOutputFormat.class);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTCONNECTIONRETRIES, 3);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTFAILURERETRIES, 3);
    hiveConf.set(HiveConf.ConfVars.SEMANTIC_ANALYZER_HOOK.varname,
        HCatSemanticAnalyzer.class.getName());
    hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY.varname, "false");
    System.setProperty(HiveConf.ConfVars.PREEXECHOOKS.varname, " ");
    System.setProperty(HiveConf.ConfVars.POSTEXECHOOKS.varname, " ");
    System.setProperty(HiveConf.ConfVars.METASTOREWAREHOUSE.varname,
        MetastoreConf.getVar(hiveConf, MetastoreConf.ConfVars.WAREHOUSE));
    System.setProperty(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname,
        MetastoreConf.getVar(hiveConf, MetastoreConf.ConfVars.CONNECT_URL_KEY));
    System.setProperty(HiveConf.ConfVars.METASTOREURIS.varname,
        MetastoreConf.getVar(hiveConf, MetastoreConf.ConfVars.THRIFT_URIS));
    hiveConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousedir.toString());
    try {
      hmsc = new HiveMetaStoreClient(hiveConf);
      initalizeTables();
    } catch (Throwable e) {
      LOG.error("Exception encountered while setting up testcase", e);
      throw new Exception(e);
    } finally {
      hmsc.close();
    }
  }
  private static void initalizeTables() throws Exception {
    for (String table : tableNames) {
      try {
        if (hmsc.getTable(DATABASE, table) != null) {
          hmsc.dropTable(DATABASE, table);
        }
      } catch (NoSuchObjectException ignored) {
      }
    }
    for (int i = 0; i < tableNames.length; i++) {
      createTable(tableNames[i], tablePerms[i]);
    }
  }
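
  /**
   * Creates an RCFile-backed table partitioned by (ds, cluster) and sets the
   * given permission on its warehouse directory.
   */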
  private static void createTable(String tableName, String tablePerm) throws Exception {
    Table tbl = new Table();
    tbl.setDbName(DATABASE);
    tbl.setTableName(tableName);
    StorageDescriptor sd = new StorageDescriptor();
    sd.setCols(ColumnHolder.colMapping.get(tableName));
    tbl.setSd(sd);
    sd.setParameters(new HashMap<String, String>());
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setName(tbl.getTableName());
    sd.getSerdeInfo().setParameters(new HashMap<String, String>());
    sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName());
    sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName());
    sd.getSerdeInfo().getParameters().put(serdeConstants.SERIALIZATION_FORMAT, "1");
    sd.getSerdeInfo().setSerializationLib(
        org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName());
    tbl.setPartitionKeys(ColumnHolder.partitionCols);

    hmsc.createTable(tbl);
    Path path = new Path(warehousedir, tableName);
    FileSystem fs = path.getFileSystem(hiveConf);
    fs.setPermission(path, new FsPermission(tablePerm));
  }
  @AfterClass
  public static void tearDown() throws IOException {
    FileUtil.fullyDelete(workDir);
    FileSystem fs = warehousedir.getFileSystem(hiveConf);
    if (fs.exists(warehousedir)) {
      fs.delete(warehousedir, true);
    }
    if (mrCluster != null) {
      mrCluster.shutdown();
    }
  }
  /**
   * Simple test case.
   * <ol>
   * <li>Submits a mapred job which writes out one fixed line to each of the tables</li>
   * <li>Uses a Hive fetch task to read the data back and checks that it matches what was written</li>
   * </ol>
   *
   * @throws Exception if any error occurs
   */
  @Test
  public void testOutputFormat() throws Throwable {
    HashMap<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("ds", "1");
    partitionValues.put("cluster", "ag");
    ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
    infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));

    Job job = new Job(hiveConf, "SampleJob");

    job.setMapperClass(MyMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);

    for (int i = 0; i < tableNames.length; i++) {
      configurer.addOutputFormat(tableNames[i], HCatOutputFormat.class, BytesWritable.class,
          HCatRecord.class);
      HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
      HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]),
          schemaMap.get(tableNames[i]));
    }
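    // configure() folds the per-table output configurations set up above back
    // into the parent job, so MultiOutputFormat can route each record to the
    // right HCatOutputFormat at write time.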
    configurer.configure();

    Path filePath = createInputFile();
    FileInputFormat.addInputPath(job, filePath);
    Assert.assertTrue(job.waitForCompletion(true));

    ArrayList<String> outputs = new ArrayList<String>();
    for (String tbl : tableNames) {
      outputs.add(getTableData(tbl, "default").get(0));
    }
    Assert.assertEquals("Comparing output of table " +
        tableNames[0] + " is not correct", outputs.get(0), "a,a,1,ag");
    Assert.assertEquals("Comparing output of table " +
        tableNames[1] + " is not correct", outputs.get(1),
        "a,1,ag");
    Assert.assertEquals("Comparing output of table " +
        tableNames[2] + " is not correct", outputs.get(2), "a,a,extra,1,ag");

    // Check permissions on partition dirs and files created
    for (int i = 0; i < tableNames.length; i++) {
      Path partitionFile = new Path(warehousedir + "/" + tableNames[i]
          + "/ds=1/cluster=ag/part-m-00000");
      FileSystem fs = partitionFile.getFileSystem(mrConf);
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile).getPermission(),
          new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile.getParent()).getPermission(),
          new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile.getParent().getParent()).getPermission(),
          new FsPermission(tablePerms[i]));
    }
    LOG.info("File permissions verified");
  }
  /**
   * Creates an input file for the map tasks.
   *
   * @return absolute path of the file.
   * @throws IOException if any error encountered
   */
  private Path createInputFile() throws IOException {
    Path f = new Path(workDir + "/MultiTableInput.txt");
    FileSystem fs = FileSystem.get(mrConf);
    if (fs.exists(f)) {
      fs.delete(f, true);
    }
    OutputStream out = fs.create(f);
    for (int i = 0; i < 3; i++) {
      out.write("a,a\n".getBytes());
    }
    out.close();
    return f;
  }
  /**
   * Method to fetch table data.
   *
   * @param table table name
   * @param database database name
   * @return rows of the table, with columns joined by commas
   * @throws Exception if any error occurs
   */
  private List<String> getTableData(String table, String database) throws Exception {
    QueryState queryState = new QueryState.Builder().build();
    HiveConf conf = queryState.getConf();
    conf.addResource("hive-site.xml");
    ArrayList<String> results = new ArrayList<String>();
    ArrayList<String> temp = new ArrayList<String>();
    Hive hive = Hive.get(conf);
    org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
    FetchWork work;
    if (!tbl.getPartCols().isEmpty()) {
      List<Partition> partitions = hive.getPartitions(tbl);
      List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
      List<Path> partLocs = new ArrayList<Path>();
      TableDesc tableDesc = Utilities.getTableDesc(tbl);
      for (Partition part : partitions) {
        partLocs.add(part.getDataLocation());
        partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
      }
      work = new FetchWork(partLocs, partDesc, tableDesc);
      work.setLimit(100);
    } else {
      work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
    }
    FetchTask task = new FetchTask();
    task.setWork(work);
    conf.set("_hive.hdfs.session.path", "path");
    conf.set("_hive.local.session.path", "path");
    task.initialize(queryState, null, null, new org.apache.hadoop.hive.ql.Context(conf));
    task.fetch(temp);
    for (String str : temp) {
      results.add(str.replace("\t", ","));
    }
    return results;
  }
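
  /**
   * Map-only mapper that writes each input line to a different table: the
   * first record goes to test1 (key, value), the second to test2 (key only),
   * and the third to test3 (key, value, extra), using
   * {@link MultiOutputFormat#write} to select the destination by table alias.
   */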
  private static class MyMapper extends
      Mapper<LongWritable, Text, BytesWritable, HCatRecord> {

    private int i = 0;

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      HCatRecord record = null;
      String[] splits = value.toString().split(",");
      switch (i) {
      case 0:
        record = new DefaultHCatRecord(2);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        break;
      case 1:
        record = new DefaultHCatRecord(1);
        record.set(0, splits[0]);
        break;
      case 2:
        record = new DefaultHCatRecord(3);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        record.set(2, "extra");
        break;
      default:
        Assert.fail("This should not happen!!!!!");
      }
      MultiOutputFormat.write(tableNames[i], null, record, context);
      i++;
    }
  }
}