
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/index/compact/HiveCompactIndexInputFormat.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.index.compact;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public class HiveCompactIndexInputFormat extends HiveInputFormat {

  public static final Log l4j = LogFactory.getLog("HiveIndexInputFormat");

  public HiveCompactIndexInputFormat() {
    super();
  }
  public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException {
    super.init(job);
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
      PartitionDesc part = HiveFileFormatUtils
          .getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
              IOPrepareCache.get().allocatePartitionDescMap(), true);
      // create a new InputFormat instance if this is the first time to see
      // this class
      Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);
      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
      for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
    return result.toArray(new HiveInputSplit[result.size()]);
  }
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    String indexFileStr = job.get("hive.index.compact.file");
    l4j.info("index_file is " + indexFileStr);

    HiveCompactIndexResult hiveIndexResult = null;
    if (indexFileStr != null) {
      try {
        hiveIndexResult = new HiveCompactIndexResult(indexFileStr, job);
      } catch (HiveException e) {
        l4j.error("Unable to read index file", e);
        throw new IOException(e);
      }

      // Narrow the job's input paths down to just the buckets named by
      // the index result.
      Set<String> inputFiles = hiveIndexResult.buckets.keySet();
      Iterator<String> iter = inputFiles.iterator();
      boolean first = true;
      StringBuilder newInputPaths = new StringBuilder();
      while (iter.hasNext()) {
        String path = iter.next();
        if (path.trim().isEmpty()) {
          continue;
        }
        if (!first) {
          newInputPaths.append(",");
        } else {
          first = false;
        }
        newInputPaths.append(path);
      }
      FileInputFormat.setInputPaths(job, newInputPaths.toString());
    } else {
      // No index file configured; fall back to plain HiveInputFormat splits.
      return super.getSplits(job, numSplits);
    }

    HiveInputSplit[] splits = (HiveInputSplit[]) this.doGetSplits(job, numSplits);
    ArrayList<HiveInputSplit> newSplits = new ArrayList<HiveInputSplit>(numSplits);
    for (HiveInputSplit split : splits) {
      l4j.info("split start : " + split.getStart());
      l4j.info("split end : " + (split.getStart() + split.getLength()));
      try {
        if (hiveIndexResult.contains(split)) {
          // we may miss a sync here: the split could begin mid-record, so
          // widen it backwards by one sync interval for RCFile/SequenceFile.
          HiveInputSplit newSplit = split;
          if (split.inputFormatClassName().contains("RCFile")
              || split.inputFormatClassName().contains("SequenceFile")) {
            if (split.getStart() > SequenceFile.SYNC_INTERVAL) {
              newSplit = new HiveInputSplit(new FileSplit(split.getPath(),
                  split.getStart() - SequenceFile.SYNC_INTERVAL,
                  split.getLength() + SequenceFile.SYNC_INTERVAL,
                  split.getLocations()), split.inputFormatClassName());
            }
          }
          newSplits.add(newSplit);
        }
      } catch (HiveException e) {
        throw new RuntimeException(
            "Unable to get metadata for input table split " + split.getPath(), e);
      }
    }
    InputSplit[] retA = newSplits.toArray(new HiveInputSplit[newSplits.size()]);
    l4j.info("Number of input splits: " + splits.length + " new input splits: "
        + retA.length);
    return retA;
  }
}
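
For context, below is a minimal sketch of how this input format might be wired into an old-style mapred job. Only the "hive.index.compact.file" property is taken from the class above; the paths, class name, and job setup are hypothetical placeholders. In practice Hive sets this property itself when the compact-index optimizer rewrites a query, and super.init(job) additionally expects Hive's serialized plan in the configuration, so this sketch is illustrative of the configuration knobs only, not a standalone runnable job.

// Illustrative sketch only -- paths and job wiring are hypothetical
// assumptions; Hive normally configures these when it rewrites a query
// to use the compact index.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class CompactIndexJobSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(CompactIndexJobSketch.class);

    // Table data directory (made-up path).
    FileInputFormat.setInputPaths(job, new Path("/warehouse/mytable"));

    // Location of the compact-index result file; if this property is
    // unset, getSplits() above falls back to plain HiveInputFormat.
    job.set("hive.index.compact.file", "/tmp/index_result");

    // Route split computation through the index-aware input format.
    job.setInputFormat(
        org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat.class);

    // ... set mapper/reducer and submit with JobClient.runJob(job) ...
  }
}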