
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/io/BucketizedHiveInputFormat.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
/**
 * BucketizedHiveInputFormat serves a similar function to HiveInputFormat, but
 * its getSplits() always groups the splits from one input file into a single
 * wrapper split. It is useful for applications that require each input file
 * to be handled by exactly one mapper.
 */
public class BucketizedHiveInputFormat<K extends WritableComparable, V extends Writable>
    extends HiveInputFormat<K, V> {

  public static final Log LOG = LogFactory
      .getLog("org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat");
  @Override
  public RecordReader getRecordReader(InputSplit split, JobConf job,
      Reporter reporter) throws IOException {

    BucketizedHiveInputSplit hsplit = (BucketizedHiveInputSplit) split;

    // Resolve the InputFormat class recorded in the wrapper split.
    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
      inputFormatClassName = hsplit.inputFormatClassName();
      inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
      throw new IOException("cannot find class " + inputFormatClassName, e);
    }

    // clone a jobConf for setting needed columns for reading
    JobConf cloneJobConf = new JobConf(job);
    pushProjectionsAndFilters(cloneJobConf, inputFormatClass, hsplit.getPath()
        .toString(), hsplit.getPath().toUri().getPath());

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass,
        cloneJobConf);
    BucketizedHiveRecordReader<K, V> rr = new BucketizedHiveRecordReader(
        inputFormat, hsplit, cloneJobConf, reporter);
    rr.initIOContext(hsplit, cloneJobConf, inputFormatClass);
    return rr;
  }
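  /**
   * Lists every file under {@code path}, recursing into subdirectories via
   * FileUtils.listStatusRecursively; fails with InvalidInputException if the
   * path does not exist or the pattern matches no files.
   */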
  protected FileStatus[] listStatus(JobConf job, Path path) throws IOException {
    ArrayList<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    FileSystem fs = path.getFileSystem(job);
    FileStatus[] matches = fs.globStatus(path);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + path));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + path + " matches 0 files"));
    } else {
      for (FileStatus globStat : matches) {
        FileUtils.listStatusRecursively(fs, globStat, result);
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
  }
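  /**
   * Builds one BucketizedHiveInputSplit per input file: for every file under
   * each input directory, the file's own InputFormat computes its splits, and
   * all of those splits are wrapped together so the whole file goes to a
   * single mapper.
   */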
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);

    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);

    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    int numOrigSplits = 0;
    // For each input directory, get all files under it, run getSplits on
    // each individual file, and then wrap the resulting splits in one
    // BucketizedHiveInputSplit per file.
    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time this
      // class is seen
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      newjob.setInputFormat(inputFormat.getClass());

      FileStatus[] listStatus = listStatus(newjob, dir);

      for (FileStatus status : listStatus) {
        LOG.info("block size: " + status.getBlockSize());
        LOG.info("file length: " + status.getLen());
        FileInputFormat.setInputPaths(newjob, status.getPath());
        // numSplits is only a hint; passing 0 lets the underlying format
        // decide how many splits this one file yields.
        InputSplit[] iss = inputFormat.getSplits(newjob, 0);
        if (iss != null && iss.length > 0) {
          numOrigSplits += iss.length;
          result.add(new BucketizedHiveInputSplit(iss, inputFormatClass
              .getName()));
        }
      }
    }
    LOG.info(result.size() + " bucketized splits generated from "
        + numOrigSplits + " original splits.");
    return result.toArray(new BucketizedHiveInputSplit[result.size()]);
  }
}
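
For readers wondering how this class gets selected: Hive picks its InputFormat wrapper through the hive.input.format configuration key, so the usual way to get the one-file-per-mapper behavior described in the class Javadoc is to point that key at this class rather than instantiating it directly (it depends on Hive's query plan via init(job)). Below is a minimal sketch; the example class name is made up for illustration, while the property key and the input format class are the real ones.

import org.apache.hadoop.mapred.JobConf;

public class BucketizedInputFormatExample {
  public static void main(String[] args) {
    JobConf job = new JobConf();

    // Hive reads this key when it submits a MapReduce job; pointing it at
    // BucketizedHiveInputFormat makes each input file land in one mapper.
    job.set("hive.input.format",
        "org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat");

    System.out.println("hive.input.format = " + job.get("hive.input.format"));
  }
}

From the Hive CLI the equivalent is: set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;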