/tags/release-0.0.0-rc0/src/java/org/apache/hcatalog/mapreduce/HCatOutputCommitter.java


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hcatalog.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.JobStatus.State;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hcatalog.common.ErrorType;
import org.apache.hcatalog.common.HCatConstants;
import org.apache.hcatalog.common.HCatException;
import org.apache.hcatalog.common.HCatUtil;
import org.apache.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hcatalog.data.schema.HCatSchema;
import org.apache.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.thrift.TException;
/**
 * OutputCommitter that wraps an underlying OutputCommitter and, when the job
 * completes, publishes the output to the Hive metastore as a new partition
 * (for partitioned tables) or moves it into the table directory (for
 * non-partitioned tables).
 */
public class HCatOutputCommitter extends OutputCommitter {

  /** The underlying output committer */
  private final OutputCommitter baseCommitter;

  public HCatOutputCommitter(OutputCommitter baseCommitter) {
    this.baseCommitter = baseCommitter;
  }

  @Override
  public void abortTask(TaskAttemptContext context) throws IOException {
    baseCommitter.abortTask(context);
  }

  @Override
  public void commitTask(TaskAttemptContext context) throws IOException {
    baseCommitter.commitTask(context);
  }

  @Override
  public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
    return baseCommitter.needsTaskCommit(context);
  }

  @Override
  public void setupJob(JobContext context) throws IOException {
    if( baseCommitter != null ) {
      baseCommitter.setupJob(context);
    }
  }

  @Override
  public void setupTask(TaskAttemptContext context) throws IOException {
    baseCommitter.setupTask(context);
  }
  @Override
  public void abortJob(JobContext jobContext, State state) throws IOException {
    if( baseCommitter != null ) {
      baseCommitter.abortJob(jobContext, state);
    }

    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext);

    try {
      HiveMetaStoreClient client = HCatOutputFormat.createHiveClient(
          jobInfo.getTableInfo().getServerUri(), jobContext.getConfiguration());

      // Cancel the delegation tokens that were acquired for this job now that
      // we are done. We should cancel only if the tokens were acquired by
      // HCatOutputFormat, not if they were supplied by Oozie; in the latter
      // case the HCAT_KEY_TOKEN_SIGNATURE property will not be set in the conf.
      String tokenStrForm = client.getTokenStrForm();
      if( tokenStrForm != null && jobContext.getConfiguration().get(
          HCatConstants.HCAT_KEY_TOKEN_SIGNATURE) != null ) {
        client.cancelDelegationToken(tokenStrForm);
      }
    } catch(Exception e) {
      if( e instanceof HCatException ) {
        throw (HCatException) e;
      } else {
        throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
      }
    }

    // Remove the temporary job output directory.
    Path src = new Path(jobInfo.getLocation());
    FileSystem fs = src.getFileSystem(jobContext.getConfiguration());
    fs.delete(src, true);
  }
  public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";

  static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER =
      "mapreduce.fileoutputcommitter.marksuccessfuljobs";

  private static boolean getOutputDirMarking(Configuration conf) {
    return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, false);
  }

  @Override
  public void commitJob(JobContext jobContext) throws IOException {
    if( baseCommitter != null ) {
      baseCommitter.commitJob(jobContext);
    }

    // Create the _SUCCESS marker file if so requested.
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext);
    if( getOutputDirMarking(jobContext.getConfiguration()) ) {
      Path outputPath = new Path(jobInfo.getLocation());
      if( outputPath != null ) {
        FileSystem fileSys = outputPath.getFileSystem(jobContext.getConfiguration());
        // Create a file in the output folder to mark it as successfully completed.
        if( fileSys.exists(outputPath) ) {
          Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
          if( !fileSys.exists(filePath) ) { // may have been created by baseCommitter.commitJob()
            fileSys.create(filePath).close();
          }
        }
      }
    }

    cleanupJob(jobContext);
  }
  @Override
  public void cleanupJob(JobContext context) throws IOException {
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context);
    Configuration conf = context.getConfiguration();
    Table table = jobInfo.getTable();
    StorageDescriptor tblSD = table.getSd();
    Path tblPath = new Path(tblSD.getLocation());
    FileSystem fs = tblPath.getFileSystem(conf);

    if( table.getPartitionKeys().size() == 0 ) {
      // Non-partitioned table
      if( baseCommitter != null ) {
        baseCommitter.cleanupJob(context);
      }

      // Move data from the temp directory to the actual table directory;
      // no metastore operation is required.
      Path src = new Path(jobInfo.getLocation());
      moveTaskOutputs(fs, src, src, tblPath);
      fs.delete(src, true);
      return;
    }

    HiveMetaStoreClient client = null;
    List<String> values = null;
    boolean partitionAdded = false;
    HCatTableInfo tableInfo = jobInfo.getTableInfo();

    try {
      client = HCatOutputFormat.createHiveClient(tableInfo.getServerUri(), conf);

      StorerInfo storer = InitializeInput.extractStorerInfo(table.getSd(), table.getParameters());

      Partition partition = new Partition();
      partition.setDbName(tableInfo.getDatabaseName());
      partition.setTableName(tableInfo.getTableName());
      partition.setSd(new StorageDescriptor(tblSD));
      partition.getSd().setLocation(jobInfo.getLocation());

      updateTableSchema(client, table, jobInfo.getOutputSchema());

      List<FieldSchema> fields = new ArrayList<FieldSchema>();
      for(HCatFieldSchema fieldSchema : jobInfo.getOutputSchema().getFields()) {
        fields.add(HCatSchemaUtils.getFieldSchema(fieldSchema));
      }
      partition.getSd().setCols(fields);

      Map<String,String> partKVs = tableInfo.getPartitionValues();

      // Get the partition value list; keep a reference so the partition can be
      // dropped again if a later step fails.
      values = getPartitionValueList(table, partKVs);
      partition.setValues(values);

      Map<String, String> params = new HashMap<String, String>();
      params.put(HCatConstants.HCAT_ISD_CLASS, storer.getInputSDClass());
      params.put(HCatConstants.HCAT_OSD_CLASS, storer.getOutputSDClass());

      // Copy table level hcat.* keys to the partition
      for(Map.Entry<Object, Object> entry : storer.getProperties().entrySet()) {
        params.put(entry.getKey().toString(), entry.getValue().toString());
      }

      partition.setParameters(params);

      // Set permissions and group name on the partition dirs to match the table directory.
      FileStatus tblStat = fs.getFileStatus(tblPath);
      String grpName = tblStat.getGroup();
      FsPermission perms = tblStat.getPermission();
      Path partPath = tblPath;
      for(FieldSchema partKey : table.getPartitionKeys()) {
        partPath = constructPartialPartPath(partPath, partKey.getName().toLowerCase(), partKVs);
        fs.setPermission(partPath, perms);
        try {
          fs.setOwner(partPath, null, grpName);
        } catch(AccessControlException ace) {
          // Log the message before ignoring. Currently, logging is not built into HCatalog.
        }
      }

      // Publish the new partition to the metastore.
      client.add_partition(partition);
      partitionAdded = true; // publish to metastore done

      if( baseCommitter != null ) {
        baseCommitter.cleanupJob(context);
      }

      // Cancel the delegation tokens that were acquired for this job now that
      // we are done. We should cancel only if the tokens were acquired by
      // HCatOutputFormat, not if they were supplied by Oozie; in the latter
      // case the HCAT_KEY_TOKEN_SIGNATURE property will not be set in the conf.
      String tokenStrForm = client.getTokenStrForm();
      if( tokenStrForm != null && context.getConfiguration().get(
          HCatConstants.HCAT_KEY_TOKEN_SIGNATURE) != null ) {
        client.cancelDelegationToken(tokenStrForm);
      }
    } catch (Exception e) {
      if( partitionAdded ) {
        try {
          // A step after add_partition (e.g. baseCommitter.cleanupJob) failed;
          // try to roll back the metastore change.
          client.dropPartition(tableInfo.getDatabaseName(),
              tableInfo.getTableName(), values);
        } catch(Exception te) {
          // Keep the original exception as the cause
          throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
        }
      }

      if( e instanceof HCatException ) {
        throw (HCatException) e;
      } else {
        throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
      }
    } finally {
      if( client != null ) {
        client.close();
      }
    }
  }
  private Path constructPartialPartPath(Path partialPath, String partKey, Map<String,String> partKVs) {
    // Append one "key=value" path component, escaping characters that are not
    // safe in partition directory names.
    StringBuilder sb = new StringBuilder(FileUtils.escapePathName(partKey));
    sb.append("=");
    sb.append(FileUtils.escapePathName(partKVs.get(partKey)));
    return new Path(partialPath, sb.toString());
  }

  /**
   * Update the table schema, adding any new columns introduced by the partition.
   * @param client the metastore client
   * @param table the table
   * @param partitionSchema the schema of the partition
   * @throws IOException Signals that an I/O exception has occurred.
   * @throws InvalidOperationException the invalid operation exception
   * @throws MetaException the meta exception
   * @throws TException the thrift exception
   */
  private void updateTableSchema(HiveMetaStoreClient client, Table table,
      HCatSchema partitionSchema) throws IOException, InvalidOperationException, MetaException, TException {

    List<FieldSchema> newColumns = HCatUtil.validatePartitionSchema(table, partitionSchema);

    if( newColumns.size() != 0 ) {
      List<FieldSchema> tableColumns = new ArrayList<FieldSchema>(table.getSd().getCols());
      tableColumns.addAll(newColumns);

      // Update the table schema to include the newly added columns.
      table.getSd().setCols(tableColumns);
      client.alter_table(table.getDbName(), table.getTableName(), table);
    }
  }
  /**
   * Convert the partition value map to a value list in the partition key order.
   * @param table the table being written to
   * @param valueMap the partition value map
   * @return the partition value list
   * @throws IOException
   */
  static List<String> getPartitionValueList(Table table, Map<String, String> valueMap) throws IOException {
    if( valueMap.size() != table.getPartitionKeys().size() ) {
      throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,
          "Table " + table.getTableName() + " has " +
          table.getPartitionKeys().size() + " partition keys, got " +
          valueMap.size());
    }

    List<String> values = new ArrayList<String>();

    for(FieldSchema schema : table.getPartitionKeys()) {
      String value = valueMap.get(schema.getName().toLowerCase());

      if( value == null ) {
        throw new HCatException(ErrorType.ERROR_MISSING_PARTITION_KEY,
            "Key " + schema.getName() + " of table " + table.getTableName());
      }

      values.add(value);
    }

    return values;
  }
  /**
   * Move all of the files from the temp directory to the final location.
   * @param fs the output file system
   * @param file the file to move
   * @param src the source directory
   * @param dest the target directory
   * @throws IOException
   */
  private void moveTaskOutputs(FileSystem fs,
                               Path file,
                               Path src,
                               Path dest) throws IOException {
    if( fs.isFile(file) ) {
      Path finalOutputPath = getFinalPath(file, src, dest);

      if( !fs.rename(file, finalOutputPath) ) {
        // The destination already exists; remove it and retry the rename once.
        if( !fs.delete(finalOutputPath, true) ) {
          throw new HCatException(ErrorType.ERROR_MOVE_FAILED,
              "Failed to delete existing path " + finalOutputPath);
        }
        if( !fs.rename(file, finalOutputPath) ) {
          throw new HCatException(ErrorType.ERROR_MOVE_FAILED,
              "Failed to move output to " + dest);
        }
      }
    } else if( fs.getFileStatus(file).isDir() ) {
      // Recurse into directories, recreating the directory structure under dest.
      FileStatus[] paths = fs.listStatus(file);
      Path finalOutputPath = getFinalPath(file, src, dest);
      fs.mkdirs(finalOutputPath);

      if( paths != null ) {
        for(FileStatus path : paths) {
          moveTaskOutputs(fs, path.getPath(), src, dest);
        }
      }
    }
  }

  /**
   * Find the final name of a given output file, given the output directory
   * and the work directory.
   * @param file the file to move
   * @param src the source directory
   * @param dest the target directory
   * @return the final path for the specific output file
   * @throws IOException
   */
  private Path getFinalPath(Path file, Path src,
                            Path dest) throws IOException {
    URI taskOutputUri = file.toUri();
    URI relativePath = src.toUri().relativize(taskOutputUri);
    if( taskOutputUri == relativePath ) {
      throw new HCatException(ErrorType.ERROR_MOVE_FAILED,
          "Can not get the relative path: base = " + src + " child = " + file);
    }
    if( relativePath.getPath().length() > 0 ) {
      return new Path(dest, relativePath.getPath());
    } else {
      return dest;
    }
  }
}
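
A minimal usage sketch, separate from the file above. In practice the wrapping committer is handed out by HCatOutputFormat, but the class can be driven directly by passing it the underlying format's committer; the scratch path, and the jobContext/taskContext variables, are assumptions for illustration only.

    // Sketch only: wrap a base committer and run the normal commit life cycle.
    OutputCommitter base = new org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter(
        new Path("/tmp/hcat-scratch"), taskContext);      // hypothetical scratch location
    HCatOutputCommitter committer = new HCatOutputCommitter(base);

    committer.setupJob(jobContext);                       // delegates to the base committer
    committer.setupTask(taskContext);
    // ... the task writes its output here ...
    if (committer.needsTaskCommit(taskContext)) {
      committer.commitTask(taskContext);
    }
    committer.commitJob(jobContext);                      // also publishes the partition via cleanupJob()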