/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/index/compact/HiveCompactIndexInputFormat.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.index.compact;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
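
/**
 * Input format used when a query can be answered through a compact index.
 * It reads the index result file named by the job property
 * "hive.index.compact.file", rewrites the job's input paths to cover only
 * the files the index identifies, and then filters the resulting splits
 * down to the ones the index reports as containing matching rows.
 */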
public class HiveCompactIndexInputFormat extends HiveInputFormat {

  public static final Log l4j = LogFactory.getLog("HiveIndexInputFormat");

  public HiveCompactIndexInputFormat() {
    super();
  }
  public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException {
    super.init(job);

    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // For each input directory, look up its partition, get the partition's
    // InputFormat, and delegate split computation to it.
    for (Path dir : dirs) {
      PartitionDesc part = HiveFileFormatUtils
          .getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
              IOPrepareCache.get().allocatePartitionDescMap(), true);
      // create a new InputFormat instance if this is the first time to see
      // this class
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      // Spread the requested number of splits evenly across the directories.
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
      for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
    return result.toArray(new HiveInputSplit[result.size()]);
  }

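  /**
   * If a compact index result file is configured, restricts the job's input
   * to the files named in the index and returns only the splits the index
   * reports as containing matching rows; otherwise falls back to the plain
   * {@link HiveInputFormat} behavior.
   */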
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    String indexFileStr = job.get("hive.index.compact.file");
    l4j.info("index_file is " + indexFileStr);

    HiveCompactIndexResult hiveIndexResult = null;
    if (indexFileStr != null) {
      try {
        hiveIndexResult = new HiveCompactIndexResult(indexFileStr, job);
      } catch (HiveException e) {
        l4j.error("Unable to read index: " + indexFileStr, e);
        throw new IOException(e);
      }

      // Rebuild the job's input path list from the files the index points
      // at, skipping any blank entries.
      Set<String> inputFiles = hiveIndexResult.buckets.keySet();
      Iterator<String> iter = inputFiles.iterator();
      boolean first = true;
      StringBuilder newInputPaths = new StringBuilder();
      while (iter.hasNext()) {
        String path = iter.next();
        if (path.trim().isEmpty()) {
          continue;
        }
        if (!first) {
          newInputPaths.append(",");
        } else {
          first = false;
        }
        newInputPaths.append(path);
      }
      FileInputFormat.setInputPaths(job, newInputPaths.toString());
    } else {
      // No index file configured; compute splits the normal way.
      return super.getSplits(job, numSplits);
    }

    HiveInputSplit[] splits = (HiveInputSplit[]) this.doGetSplits(job, numSplits);
    ArrayList<HiveInputSplit> newSplits = new ArrayList<HiveInputSplit>(
        numSplits);
    for (HiveInputSplit split : splits) {
      l4j.info("split start : " + split.getStart());
      l4j.info("split end : " + (split.getStart() + split.getLength()));
      try {
        if (hiveIndexResult.contains(split)) {
          // A split that begins mid-file may have lost the sync marker that
          // precedes its first record, so for RCFile and SequenceFile inputs
          // widen the split backwards by one sync interval.
          HiveInputSplit newSplit = split;
          if (split.inputFormatClassName().contains("RCFile")
              || split.inputFormatClassName().contains("SequenceFile")) {
            if (split.getStart() > SequenceFile.SYNC_INTERVAL) {
              newSplit = new HiveInputSplit(new FileSplit(split.getPath(),
                  split.getStart() - SequenceFile.SYNC_INTERVAL,
                  split.getLength() + SequenceFile.SYNC_INTERVAL,
                  split.getLocations()), split.inputFormatClassName());
            }
          }
          newSplits.add(newSplit);
        }
      } catch (HiveException e) {
        throw new RuntimeException(
            "Unable to get metadata for input table split " + split.getPath(),
            e);
      }
    }
    InputSplit[] retA = newSplits.toArray(new FileSplit[newSplits.size()]);
    l4j.info("Number of input splits: " + splits.length + " new input splits: "
        + retA.length);
    return retA;
  }
}
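
/*
 * Illustrative usage sketch, not part of the original file. In practice the
 * Hive query planner configures this format; the property name below comes
 * from getSplits() above, while the paths are made-up placeholders. It also
 * assumes a compact index result file has already been produced, and that a
 * serialized map-reduce plan is available, since super.init(job) reads it.
 *
 *   JobConf job = new JobConf();
 *   job.setInputFormat(HiveCompactIndexInputFormat.class);
 *   // Splits will be restricted to files listed in this index result
 *   // (hypothetical path).
 *   job.set("hive.index.compact.file", "/tmp/hive/index_result");
 *   // An initial input path is still required even though it will be
 *   // rewritten from the index (hypothetical path).
 *   FileInputFormat.setInputPaths(job, new Path("/warehouse/src_table"));
 *   InputSplit[] splits = job.getInputFormat().getSplits(job, 4);
 */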