/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/io/BucketizedHiveInputFormat.java
Java | 147 lines | 99 code | 19 blank | 29 comment | 15 complexity | db0993db076a424f08e32fb99ab8ebb1 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
- package org.apache.hadoop.hive.ql.io;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.fs.FileStatus;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.hive.common.FileUtils;
- import org.apache.hadoop.hive.ql.plan.PartitionDesc;
- import org.apache.hadoop.io.Writable;
- import org.apache.hadoop.io.WritableComparable;
- import org.apache.hadoop.mapred.FileInputFormat;
- import org.apache.hadoop.mapred.InputFormat;
- import org.apache.hadoop.mapred.InputSplit;
- import org.apache.hadoop.mapred.InvalidInputException;
- import org.apache.hadoop.mapred.JobConf;
- import org.apache.hadoop.mapred.RecordReader;
- import org.apache.hadoop.mapred.Reporter;
- /**
- * BucketizedHiveInputFormat serves the similar function as hiveInputFormat but
- * its getSplits() always group splits from one input file into one wrapper
- * split. It is useful for the applications that requires input files to fit in
- * one mapper.
- */
- public class BucketizedHiveInputFormat<K extends WritableComparable, V extends Writable>
- extends HiveInputFormat<K, V> {
- public static final Log LOG = LogFactory
- .getLog("org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat");
- @Override
- public RecordReader getRecordReader(InputSplit split, JobConf job,
- Reporter reporter) throws IOException {
- BucketizedHiveInputSplit hsplit = (BucketizedHiveInputSplit) split;
- String inputFormatClassName = null;
- Class inputFormatClass = null;
- try {
- inputFormatClassName = hsplit.inputFormatClassName();
- inputFormatClass = job.getClassByName(inputFormatClassName);
- } catch (Exception e) {
- throw new IOException("cannot find class " + inputFormatClassName);
- }
- // clone a jobConf for setting needed columns for reading
- JobConf cloneJobConf = new JobConf(job);
- pushProjectionsAndFilters(cloneJobConf, inputFormatClass, hsplit.getPath()
- .toString(), hsplit.getPath().toUri().getPath());
- InputFormat inputFormat = getInputFormatFromCache(inputFormatClass,
- cloneJobConf);
- BucketizedHiveRecordReader<K, V> rr= new BucketizedHiveRecordReader(inputFormat, hsplit, cloneJobConf,
- reporter);
- rr.initIOContext(hsplit, cloneJobConf, inputFormatClass);
- return rr;
- }
- protected FileStatus[] listStatus(JobConf job, Path path) throws IOException {
- ArrayList<FileStatus> result = new ArrayList<FileStatus>();
- List<IOException> errors = new ArrayList<IOException>();
- FileSystem fs = path.getFileSystem(job);
- FileStatus[] matches = fs.globStatus(path);
- if (matches == null) {
- errors.add(new IOException("Input path does not exist: " + path));
- } else if (matches.length == 0) {
- errors.add(new IOException("Input Pattern " + path + " matches 0 files"));
- } else {
- for (FileStatus globStat : matches) {
- FileUtils.listStatusRecursively(fs, globStat, result);
- }
- }
- if (!errors.isEmpty()) {
- throw new InvalidInputException(errors);
- }
- LOG.info("Total input paths to process : " + result.size());
- return result.toArray(new FileStatus[result.size()]);
- }
- @Override
- public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
- init(job);
- Path[] dirs = FileInputFormat.getInputPaths(job);
- if (dirs.length == 0) {
- throw new IOException("No input paths specified in job");
- }
- JobConf newjob = new JobConf(job);
- ArrayList<InputSplit> result = new ArrayList<InputSplit>();
- int numOrigSplits = 0;
- // for each dir, get all files under the dir, do getSplits to each
- // individual file,
- // and then create a BucketizedHiveInputSplit on it
- for (Path dir : dirs) {
- PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
- // create a new InputFormat instance if this is the first time to see this
- // class
- Class inputFormatClass = part.getInputFileFormatClass();
- InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
- newjob.setInputFormat(inputFormat.getClass());
- FileStatus[] listStatus = listStatus(newjob, dir);
- for (FileStatus status : listStatus) {
- LOG.info("block size: " + status.getBlockSize());
- LOG.info("file length: " + status.getLen());
- FileInputFormat.setInputPaths(newjob, status.getPath());
- InputSplit[] iss = inputFormat.getSplits(newjob, 0);
- if (iss != null && iss.length > 0) {
- numOrigSplits += iss.length;
- result.add(new BucketizedHiveInputSplit(iss, inputFormatClass
- .getName()));
- }
- }
- }
- LOG.info(result.size() + " bucketized splits generated from "
- + numOrigSplits + " original splits.");
- return result.toArray(new BucketizedHiveInputSplit[result.size()]);
- }
- }