/tags/release-0.0.0-rc0/src/java/org/apache/hcatalog/mapreduce/HCatOutputCommitter.java


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hcatalog.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.JobStatus.State;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hcatalog.common.ErrorType;
import org.apache.hcatalog.common.HCatConstants;
import org.apache.hcatalog.common.HCatException;
import org.apache.hcatalog.common.HCatUtil;
import org.apache.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hcatalog.data.schema.HCatSchema;
import org.apache.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.thrift.TException;
/**
 * OutputCommitter that wraps an underlying OutputCommitter and, when the job
 * completes, publishes the output to the Hive metastore as a new partition
 * (for partitioned tables) or moves it into the table directory (for
 * non-partitioned tables).
 */
public class HCatOutputCommitter extends OutputCommitter {

  /** The underlying output committer */
  private final OutputCommitter baseCommitter;

  public HCatOutputCommitter(OutputCommitter baseCommitter) {
    this.baseCommitter = baseCommitter;
  }

  @Override
  public void abortTask(TaskAttemptContext context) throws IOException {
    baseCommitter.abortTask(context);
  }

  @Override
  public void commitTask(TaskAttemptContext context) throws IOException {
    baseCommitter.commitTask(context);
  }

  @Override
  public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
    return baseCommitter.needsTaskCommit(context);
  }

  @Override
  public void setupJob(JobContext context) throws IOException {
    if( baseCommitter != null ) {
      baseCommitter.setupJob(context);
    }
  }

  @Override
  public void setupTask(TaskAttemptContext context) throws IOException {
    baseCommitter.setupTask(context);
  }
  @Override
  public void abortJob(JobContext jobContext, State state) throws IOException {
    if( baseCommitter != null ) {
      baseCommitter.abortJob(jobContext, state);
    }

    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext);

    try {
      HiveMetaStoreClient client = HCatOutputFormat.createHiveClient(
          jobInfo.getTableInfo().getServerUri(), jobContext.getConfiguration());

      // Cancel the delegation tokens that were acquired for this job now that
      // we are done. We should cancel only if the tokens were acquired by
      // HCatOutputFormat, not if they were supplied by Oozie; in the latter
      // case the HCAT_KEY_TOKEN_SIGNATURE property will not be set in the conf.
      String tokenStrForm = client.getTokenStrForm();
      if( tokenStrForm != null && jobContext.getConfiguration().get(
          HCatConstants.HCAT_KEY_TOKEN_SIGNATURE) != null ) {
        client.cancelDelegationToken(tokenStrForm);
      }
    } catch(Exception e) {
      if( e instanceof HCatException ) {
        throw (HCatException) e;
      } else {
        throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
      }
    }

    // Remove the temporary job output directory.
    Path src = new Path(jobInfo.getLocation());
    FileSystem fs = src.getFileSystem(jobContext.getConfiguration());
    fs.delete(src, true);
  }
  public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";

  static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER =
      "mapreduce.fileoutputcommitter.marksuccessfuljobs";

  private static boolean getOutputDirMarking(Configuration conf) {
    return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, false);
  }

  @Override
  public void commitJob(JobContext jobContext) throws IOException {
    if( baseCommitter != null ) {
      baseCommitter.commitJob(jobContext);
    }

    // Create the _SUCCESS marker file if so requested.
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext);
    if( getOutputDirMarking(jobContext.getConfiguration()) ) {
      Path outputPath = new Path(jobInfo.getLocation());
      if( outputPath != null ) {
        FileSystem fileSys = outputPath.getFileSystem(jobContext.getConfiguration());
        // Create a file in the output folder to mark it as successfully completed.
        if( fileSys.exists(outputPath) ) {
          Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
          if( !fileSys.exists(filePath) ) { // may have been created by baseCommitter.commitJob()
            fileSys.create(filePath).close();
          }
        }
      }
    }

    cleanupJob(jobContext);
  }
  @Override
  public void cleanupJob(JobContext context) throws IOException {
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context);
    Configuration conf = context.getConfiguration();
    Table table = jobInfo.getTable();
    StorageDescriptor tblSD = table.getSd();
    Path tblPath = new Path(tblSD.getLocation());
    FileSystem fs = tblPath.getFileSystem(conf);

    if( table.getPartitionKeys().size() == 0 ) {
      // Non-partitioned table
      if( baseCommitter != null ) {
        baseCommitter.cleanupJob(context);
      }

      // Move data from the temp directory to the actual table directory;
      // no metastore operation is required.
      Path src = new Path(jobInfo.getLocation());
      moveTaskOutputs(fs, src, src, tblPath);
      fs.delete(src, true);
      return;
    }

    HiveMetaStoreClient client = null;
    List<String> values = null;
    boolean partitionAdded = false;
    HCatTableInfo tableInfo = jobInfo.getTableInfo();

    try {
      client = HCatOutputFormat.createHiveClient(tableInfo.getServerUri(), conf);

      StorerInfo storer = InitializeInput.extractStorerInfo(table.getSd(), table.getParameters());

      Partition partition = new Partition();
      partition.setDbName(tableInfo.getDatabaseName());
      partition.setTableName(tableInfo.getTableName());
      partition.setSd(new StorageDescriptor(tblSD));
      partition.getSd().setLocation(jobInfo.getLocation());

      updateTableSchema(client, table, jobInfo.getOutputSchema());

      List<FieldSchema> fields = new ArrayList<FieldSchema>();
      for(HCatFieldSchema fieldSchema : jobInfo.getOutputSchema().getFields()) {
        fields.add(HCatSchemaUtils.getFieldSchema(fieldSchema));
      }
      partition.getSd().setCols(fields);

      Map<String,String> partKVs = tableInfo.getPartitionValues();

      // Get the partition value list; keep a reference so the partition can be
      // dropped again if a later step fails.
      values = getPartitionValueList(table, partKVs);
      partition.setValues(values);

      Map<String, String> params = new HashMap<String, String>();
      params.put(HCatConstants.HCAT_ISD_CLASS, storer.getInputSDClass());
      params.put(HCatConstants.HCAT_OSD_CLASS, storer.getOutputSDClass());

      // Copy table level hcat.* keys to the partition
      for(Map.Entry<Object, Object> entry : storer.getProperties().entrySet()) {
        params.put(entry.getKey().toString(), entry.getValue().toString());
      }

      partition.setParameters(params);

      // Set permissions and group name on the partition dirs to match the table directory.
      FileStatus tblStat = fs.getFileStatus(tblPath);
      String grpName = tblStat.getGroup();
      FsPermission perms = tblStat.getPermission();
      Path partPath = tblPath;
      for(FieldSchema partKey : table.getPartitionKeys()) {
        partPath = constructPartialPartPath(partPath, partKey.getName().toLowerCase(), partKVs);
        fs.setPermission(partPath, perms);
        try {
          fs.setOwner(partPath, null, grpName);
        } catch(AccessControlException ace) {
          // Log the message before ignoring. Currently, logging is not built into HCatalog.
        }
      }

      // Publish the new partition to the metastore.
      client.add_partition(partition);
      partitionAdded = true; // publish to metastore done

      if( baseCommitter != null ) {
        baseCommitter.cleanupJob(context);
      }

      // Cancel the delegation tokens that were acquired for this job now that
      // we are done. We should cancel only if the tokens were acquired by
      // HCatOutputFormat, not if they were supplied by Oozie; in the latter
      // case the HCAT_KEY_TOKEN_SIGNATURE property will not be set in the conf.
      String tokenStrForm = client.getTokenStrForm();
      if( tokenStrForm != null && context.getConfiguration().get(
          HCatConstants.HCAT_KEY_TOKEN_SIGNATURE) != null ) {
        client.cancelDelegationToken(tokenStrForm);
      }
    } catch (Exception e) {
      if( partitionAdded ) {
        try {
          // A step after add_partition (e.g. baseCommitter.cleanupJob) failed;
          // try to roll back the metastore change.
          client.dropPartition(tableInfo.getDatabaseName(),
              tableInfo.getTableName(), values);
        } catch(Exception te) {
          // Keep the original exception as the cause
          throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
        }
      }

      if( e instanceof HCatException ) {
        throw (HCatException) e;
      } else {
        throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
      }
    } finally {
      if( client != null ) {
        client.close();
      }
    }
  }
  private Path constructPartialPartPath(Path partialPath, String partKey, Map<String,String> partKVs) {
    // Append one "key=value" path component, escaping characters that are not
    // safe in partition directory names.
    StringBuilder sb = new StringBuilder(FileUtils.escapePathName(partKey));
    sb.append("=");
    sb.append(FileUtils.escapePathName(partKVs.get(partKey)));
    return new Path(partialPath, sb.toString());
  }

  /**
   * Update the table schema, adding any new columns introduced by the partition.
   * @param client the metastore client
   * @param table the table
   * @param partitionSchema the schema of the partition
   * @throws IOException Signals that an I/O exception has occurred.
   * @throws InvalidOperationException the invalid operation exception
   * @throws MetaException the meta exception
   * @throws TException the thrift exception
   */
  private void updateTableSchema(HiveMetaStoreClient client, Table table,
      HCatSchema partitionSchema) throws IOException, InvalidOperationException, MetaException, TException {

    List<FieldSchema> newColumns = HCatUtil.validatePartitionSchema(table, partitionSchema);

    if( newColumns.size() != 0 ) {
      List<FieldSchema> tableColumns = new ArrayList<FieldSchema>(table.getSd().getCols());
      tableColumns.addAll(newColumns);

      // Update the table schema to include the newly added columns.
      table.getSd().setCols(tableColumns);
      client.alter_table(table.getDbName(), table.getTableName(), table);
    }
  }
  /**
   * Convert the partition value map to a value list in the partition key order.
   * @param table the table being written to
   * @param valueMap the partition value map
   * @return the partition value list
   * @throws IOException
   */
  static List<String> getPartitionValueList(Table table, Map<String, String> valueMap) throws IOException {
    if( valueMap.size() != table.getPartitionKeys().size() ) {
      throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,
          "Table " + table.getTableName() + " has " +
          table.getPartitionKeys().size() + " partition keys, got " +
          valueMap.size());
    }

    List<String> values = new ArrayList<String>();

    for(FieldSchema schema : table.getPartitionKeys()) {
      String value = valueMap.get(schema.getName().toLowerCase());

      if( value == null ) {
        throw new HCatException(ErrorType.ERROR_MISSING_PARTITION_KEY,
            "Key " + schema.getName() + " of table " + table.getTableName());
      }

      values.add(value);
    }

    return values;
  }
  /**
   * Move all of the files from the temp directory to the final location.
   * @param fs the output file system
   * @param file the file to move
   * @param src the source directory
   * @param dest the target directory
   * @throws IOException
   */
  private void moveTaskOutputs(FileSystem fs,
                               Path file,
                               Path src,
                               Path dest) throws IOException {
    if( fs.isFile(file) ) {
      Path finalOutputPath = getFinalPath(file, src, dest);

      if( !fs.rename(file, finalOutputPath) ) {
        // The destination already exists; remove it and retry the rename once.
        if( !fs.delete(finalOutputPath, true) ) {
          throw new HCatException(ErrorType.ERROR_MOVE_FAILED,
              "Failed to delete existing path " + finalOutputPath);
        }
        if( !fs.rename(file, finalOutputPath) ) {
          throw new HCatException(ErrorType.ERROR_MOVE_FAILED,
              "Failed to move output to " + dest);
        }
      }
    } else if( fs.getFileStatus(file).isDir() ) {
      // Recurse into directories, recreating the directory structure under dest.
      FileStatus[] paths = fs.listStatus(file);
      Path finalOutputPath = getFinalPath(file, src, dest);
      fs.mkdirs(finalOutputPath);

      if( paths != null ) {
        for(FileStatus path : paths) {
          moveTaskOutputs(fs, path.getPath(), src, dest);
        }
      }
    }
  }

  /**
   * Find the final name of a given output file, given the output directory
   * and the work directory.
   * @param file the file to move
   * @param src the source directory
   * @param dest the target directory
   * @return the final path for the specific output file
   * @throws IOException
   */
  private Path getFinalPath(Path file, Path src,
                            Path dest) throws IOException {
    URI taskOutputUri = file.toUri();
    URI relativePath = src.toUri().relativize(taskOutputUri);
    if( taskOutputUri == relativePath ) {
      throw new HCatException(ErrorType.ERROR_MOVE_FAILED,
          "Can not get the relative path: base = " + src + " child = " + file);
    }
    if( relativePath.getPath().length() > 0 ) {
      return new Path(dest, relativePath.getPath());
    } else {
      return dest;
    }
  }
}
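
A minimal usage sketch, separate from the file above. In practice the wrapping committer is handed out by HCatOutputFormat, but the class can be driven directly by passing it the underlying format's committer; the scratch path, and the jobContext/taskContext variables, are assumptions for illustration only.

    // Sketch only: wrap a base committer and run the normal commit life cycle.
    OutputCommitter base = new org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter(
        new Path("/tmp/hcat-scratch"), taskContext);      // hypothetical scratch location
    HCatOutputCommitter committer = new HCatOutputCommitter(base);

    committer.setupJob(jobContext);                       // delegates to the base committer
    committer.setupTask(taskContext);
    // ... the task writes its output here ...
    if (committer.needsTaskCommit(taskContext)) {
      committer.commitTask(taskContext);
    }
    committer.commitJob(jobContext);                      // also publishes the partition via cleanupJob()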