
/tags/release-0.0.0-rc0/src/java/org/apache/hcatalog/mapreduce/HCatOutputCommitter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hcatalog.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.JobStatus.State;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hcatalog.common.ErrorType;
import org.apache.hcatalog.common.HCatConstants;
import org.apache.hcatalog.common.HCatException;
import org.apache.hcatalog.common.HCatUtil;
import org.apache.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hcatalog.data.schema.HCatSchema;
import org.apache.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.thrift.TException;

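/**
 * OutputCommitter used by {@link HCatOutputFormat}. Task-level operations are delegated to the
 * committer of the underlying output format. During job cleanup the written data is published:
 * for partitioned tables a new partition is registered in the Hive metastore, for non-partitioned
 * tables the output is moved from its temporary location into the table directory.
 */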
public class HCatOutputCommitter extends OutputCommitter {

    /** The underlying output committer */
    private final OutputCommitter baseCommitter;

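    /**
     * @param baseCommitter the committer of the underlying output format; the job-level
     *                      methods below tolerate a null committer
     */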
    public HCatOutputCommitter(OutputCommitter baseCommitter) {
        this.baseCommitter = baseCommitter;
    }

    @Override
    public void abortTask(TaskAttemptContext context) throws IOException {
        baseCommitter.abortTask(context);
    }

    @Override
    public void commitTask(TaskAttemptContext context) throws IOException {
        baseCommitter.commitTask(context);
    }

    @Override
    public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
        return baseCommitter.needsTaskCommit(context);
    }

    @Override
    public void setupJob(JobContext context) throws IOException {
      if( baseCommitter != null ) {
        baseCommitter.setupJob(context);
      }
    }

    @Override
    public void setupTask(TaskAttemptContext context) throws IOException {
        baseCommitter.setupTask(context);
    }

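    /**
     * Aborts the job: delegates to the base committer (if any), cancels the metastore delegation
     * token when it was acquired by HCatOutputFormat itself (indicated by the
     * HCAT_KEY_TOKEN_SIGNATURE property), and deletes the job's temporary output location.
     */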
    @Override
    public void abortJob(JobContext jobContext, State state) throws IOException {
      if(baseCommitter != null) {
        baseCommitter.abortJob(jobContext, state);
      }
      OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext);

      HiveMetaStoreClient client = null;
      try {
        client = HCatOutputFormat.createHiveClient(
            jobInfo.getTableInfo().getServerUri(), jobContext.getConfiguration());
        // cancel the delegation tokens that were acquired for this job now that
        // we are done - we should cancel if the tokens were acquired by
        // HCatOutputFormat and not if they were supplied by Oozie. In the latter
        // case the HCAT_KEY_TOKEN_SIGNATURE property in the conf will not be set
        String tokenStrForm = client.getTokenStrForm();
        if(tokenStrForm != null && jobContext.getConfiguration().get
            (HCatConstants.HCAT_KEY_TOKEN_SIGNATURE) != null) {
          client.cancelDelegationToken(tokenStrForm);
        }
      } catch(Exception e) {
        if( e instanceof HCatException ) {
          throw (HCatException) e;
        } else {
          throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
        }
      } finally {
        if( client != null ) {
          client.close();
        }
      }

      Path src = new Path(jobInfo.getLocation());
      FileSystem fs = src.getFileSystem(jobContext.getConfiguration());
      fs.delete(src, true);
    }

    public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";
    static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER =
      "mapreduce.fileoutputcommitter.marksuccessfuljobs";

    private static boolean getOutputDirMarking(Configuration conf) {
      return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
                             false);
    }

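    /**
     * Commits the job: delegates to the base committer (if any), creates the _SUCCESS marker in
     * the output location when mapreduce.fileoutputcommitter.marksuccessfuljobs is enabled, and
     * then runs {@link #cleanupJob(JobContext)} to publish the output.
     */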
    @Override
    public void commitJob(JobContext jobContext) throws IOException {
      if(baseCommitter != null) {
        baseCommitter.commitJob(jobContext);
      }
      // create _SUCCESS FILE if so requested.
      OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext);
      if(getOutputDirMarking(jobContext.getConfiguration())) {
        Path outputPath = new Path(jobInfo.getLocation());
        if (outputPath != null) {
          FileSystem fileSys = outputPath.getFileSystem(jobContext.getConfiguration());
          // create a file in the folder to mark it
          if (fileSys.exists(outputPath)) {
            Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
            if(!fileSys.exists(filePath)) { // may have been created by baseCommitter.commitJob()
              fileSys.create(filePath).close();
            }
          }
        }
      }
      cleanupJob(jobContext);
    }

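    /**
     * Publishes the job output. For non-partitioned tables the data is moved from the temporary
     * location into the table directory. For partitioned tables a new partition is registered in
     * the Hive metastore, the table schema is extended with any new columns, permissions and
     * group ownership are propagated to the partition directories, and the delegation token is
     * cancelled if it was acquired by HCatOutputFormat.
     */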
    @Override
    public void cleanupJob(JobContext context) throws IOException {

      OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context);
      Configuration conf = context.getConfiguration();
      Table table = jobInfo.getTable();
      StorageDescriptor tblSD = table.getSd();
      Path tblPath = new Path(tblSD.getLocation());
      FileSystem fs = tblPath.getFileSystem(conf);

      if( table.getPartitionKeys().size() == 0 ) {
        //non partitioned table

        if( baseCommitter != null ) {
          baseCommitter.cleanupJob(context);
        }

        //Move data from temp directory to the actual table directory
        //No metastore operation required.
        Path src = new Path(jobInfo.getLocation());
        moveTaskOutputs(fs, src, src, tblPath);
        fs.delete(src, true);
        return;
      }

      HiveMetaStoreClient client = null;
      List<String> values = null;
      boolean partitionAdded = false;
      HCatTableInfo tableInfo = jobInfo.getTableInfo();

      try {
        client = HCatOutputFormat.createHiveClient(tableInfo.getServerUri(), conf);

        StorerInfo storer = InitializeInput.extractStorerInfo(table.getSd(),table.getParameters());

        Partition partition = new Partition();
        partition.setDbName(tableInfo.getDatabaseName());
        partition.setTableName(tableInfo.getTableName());
        partition.setSd(new StorageDescriptor(tblSD));
        partition.getSd().setLocation(jobInfo.getLocation());

        updateTableSchema(client, table, jobInfo.getOutputSchema());

        List<FieldSchema> fields = new ArrayList<FieldSchema>();
        for(HCatFieldSchema fieldSchema : jobInfo.getOutputSchema().getFields()) {
          fields.add(HCatSchemaUtils.getFieldSchema(fieldSchema));
        }

        partition.getSd().setCols(fields);

        Map<String,String> partKVs = tableInfo.getPartitionValues();
        //Get partition value list; keep it around so the partition can be dropped on failure
        values = getPartitionValueList(table, partKVs);
        partition.setValues(values);

        Map<String, String> params = new HashMap<String, String>();
        params.put(HCatConstants.HCAT_ISD_CLASS, storer.getInputSDClass());
        params.put(HCatConstants.HCAT_OSD_CLASS, storer.getOutputSDClass());

        //Copy table level hcat.* keys to the partition
        for(Map.Entry<Object, Object> entry : storer.getProperties().entrySet()) {
          params.put(entry.getKey().toString(), entry.getValue().toString());
        }

        partition.setParameters(params);

        // Sets permissions and group name on partition dirs.
        FileStatus tblStat = fs.getFileStatus(tblPath);
        String grpName = tblStat.getGroup();
        FsPermission perms = tblStat.getPermission();
        Path partPath = tblPath;
        for(FieldSchema partKey : table.getPartitionKeys()){
          partPath = constructPartialPartPath(partPath, partKey.getName().toLowerCase(), partKVs);
          fs.setPermission(partPath, perms);
          try{
            fs.setOwner(partPath, null, grpName);
          } catch(AccessControlException ace){
            // ignore; ideally this would be logged, but logging is not yet built into HCatalog.
          }
        }

        //Publish the new partition
        client.add_partition(partition);
        partitionAdded = true; //publish to metastore done

        if( baseCommitter != null ) {
          baseCommitter.cleanupJob(context);
        }
        // cancel the delegation tokens that were acquired for this job now that
        // we are done - we should cancel if the tokens were acquired by
        // HCatOutputFormat and not if they were supplied by Oozie. In the latter
        // case the HCAT_KEY_TOKEN_SIGNATURE property in the conf will not be set
        String tokenStrForm = client.getTokenStrForm();
        if(tokenStrForm != null && context.getConfiguration().get
            (HCatConstants.HCAT_KEY_TOKEN_SIGNATURE) != null) {
          client.cancelDelegationToken(tokenStrForm);
        }
      } catch (Exception e) {

        if( partitionAdded ) {
          try {
            //Something failed after the partition was published, try to clean up the metastore
            client.dropPartition(tableInfo.getDatabaseName(),
                    tableInfo.getTableName(), values);
          } catch(Exception te) {
            //Keep cause as the original exception
            throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
          }
        }

        if( e instanceof HCatException ) {
          throw (HCatException) e;
        } else {
          throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
        }
      } finally {
        if( client != null ) {
          client.close();
        }
      }
    }

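    /**
     * Appends one escaped "key=value" component to the partition path being built.
     * @param partialPath the path built so far
     * @param partKey the partition key to append
     * @param partKVs the map of partition key to value
     * @return the extended partition path
     */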
    private Path constructPartialPartPath(Path partialPath, String partKey, Map<String,String> partKVs){

      StringBuilder sb = new StringBuilder(FileUtils.escapePathName(partKey));
      sb.append("=");
      sb.append(FileUtils.escapePathName(partKVs.get(partKey)));
      return new Path(partialPath, sb.toString());
    }

    /**
     * Update the table schema, adding any new columns introduced by the partition being published.
     * @param client the metastore client
     * @param table the table
     * @param partitionSchema the schema of the partition
     * @throws IOException Signals that an I/O exception has occurred.
     * @throws InvalidOperationException the invalid operation exception
     * @throws MetaException the meta exception
     * @throws TException the thrift exception
     */
    private void updateTableSchema(HiveMetaStoreClient client, Table table,
        HCatSchema partitionSchema) throws IOException, InvalidOperationException, MetaException, TException {

      List<FieldSchema> newColumns = HCatUtil.validatePartitionSchema(table, partitionSchema);

      if( newColumns.size() != 0 ) {
        List<FieldSchema> tableColumns = new ArrayList<FieldSchema>(table.getSd().getCols());
        tableColumns.addAll(newColumns);

        //Update table schema to add the newly added columns
        table.getSd().setCols(tableColumns);
        client.alter_table(table.getDbName(), table.getTableName(), table);
      }
    }

    /**
     * Convert the partition value map to a value list in the partition key order.
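     * For example (hypothetical values): for a table partitioned by (ds, region), a value map of
     * {region=us, ds=20110101} produces the list ["20110101", "us"].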
     * @param table the table being written to
     * @param valueMap the partition value map
     * @return the partition value list
     * @throws IOException
     */
    static List<String> getPartitionValueList(Table table, Map<String, String> valueMap) throws IOException {

      if( valueMap.size() != table.getPartitionKeys().size() ) {
          throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,
              "Table "
              + table.getTableName() + " has " +
              table.getPartitionKeys().size() + " partition keys, got "+
              valueMap.size());
      }

      List<String> values = new ArrayList<String>();

      for(FieldSchema schema : table.getPartitionKeys()) {
        String value = valueMap.get(schema.getName().toLowerCase());

        if( value == null ) {
          throw new HCatException(ErrorType.ERROR_MISSING_PARTITION_KEY,
              "Key " + schema.getName() + " of table " + table.getTableName());
        }

        values.add(value);
      }

      return values;
    }

    /**
     * Move all of the files from the temp directory to the final location
     * @param fs the output file system
     * @param file the file to move
     * @param src the source directory
     * @param dest the target directory
     * @throws IOException
     */
    private void moveTaskOutputs(FileSystem fs,
                                 Path file,
                                 Path src,
                                 Path dest) throws IOException {
      if (fs.isFile(file)) {
        Path finalOutputPath = getFinalPath(file, src, dest);

        if (!fs.rename(file, finalOutputPath)) {
          if (!fs.delete(finalOutputPath, true)) {
            throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Failed to delete existing path " + finalOutputPath);
          }
          if (!fs.rename(file, finalOutputPath)) {
            throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Failed to move output to " + dest);
          }
        }
      } else if(fs.getFileStatus(file).isDir()) {
        FileStatus[] paths = fs.listStatus(file);
        Path finalOutputPath = getFinalPath(file, src, dest);
        fs.mkdirs(finalOutputPath);

        if (paths != null) {
          for (FileStatus path : paths) {
            moveTaskOutputs(fs, path.getPath(), src, dest);
          }
        }
      }
    }

    /**
     * Find the final name of a given output file, given the output directory
     * and the work directory.
     * @param file the file to move
     * @param src the source directory
     * @param dest the target directory
     * @return the final path for the specific output file
     * @throws IOException
     */
    private Path getFinalPath(Path file, Path src,
                              Path dest) throws IOException {
      URI taskOutputUri = file.toUri();
      URI relativePath = src.toUri().relativize(taskOutputUri);
      if (taskOutputUri == relativePath) {
        throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Can not get the relative path: base = " +
            src + " child = " + file);
      }
      if (relativePath.getPath().length() > 0) {
        return new Path(dest, relativePath.getPath());
      } else {
        return dest;
      }
    }

}