PageRenderTime 25ms CodeModel.GetById 13ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 0ms

/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java

#
Java | 229 lines | 149 code | 24 blank | 56 comment | 11 complexity | 02c55bff4dbf35e035ea094a2ec8f0d6 MD5 | raw file
  1/**
  2 * Licensed to the Apache Software Foundation (ASF) under one
  3 * or more contributor license agreements.  See the NOTICE file
  4 * distributed with this work for additional information
  5 * regarding copyright ownership.  The ASF licenses this file
  6 * to you under the Apache License, Version 2.0 (the
  7 * "License"); you may not use this file except in compliance
  8 * with the License.  You may obtain a copy of the License at
  9 *
 10 *     http://www.apache.org/licenses/LICENSE-2.0
 11 *
 12 * Unless required by applicable law or agreed to in writing, software
 13 * distributed under the License is distributed on an "AS IS" BASIS,
 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 * See the License for the specific language governing permissions and
 16 * limitations under the License.
 17 */
 18package org.apache.hadoop.hive.ql.io;
 19
 20import java.io.BufferedReader;
 21import java.io.DataInput;
 22import java.io.DataOutput;
 23import java.io.IOException;
 24import java.io.InputStreamReader;
 25import java.util.ArrayList;
 26import java.util.List;
 27
 28import org.apache.hadoop.conf.Configuration;
 29import org.apache.hadoop.fs.ContentSummary;
 30import org.apache.hadoop.fs.FileStatus;
 31import org.apache.hadoop.fs.FileSystem;
 32import org.apache.hadoop.fs.Path;
 33import org.apache.hadoop.io.LongWritable;
 34import org.apache.hadoop.io.Text;
 35import org.apache.hadoop.mapred.FileInputFormat;
 36import org.apache.hadoop.mapred.FileSplit;
 37import org.apache.hadoop.mapred.InputFormat;
 38import org.apache.hadoop.mapred.InputSplit;
 39import org.apache.hadoop.mapred.JobConf;
 40import org.apache.hadoop.mapred.JobConfigurable;
 41import org.apache.hadoop.mapred.RecordReader;
 42import org.apache.hadoop.mapred.Reporter;
 43import org.apache.hadoop.mapred.TextInputFormat;
 44
 45/**
 46 * Symlink file is a text file which contains a list of filename / dirname.
 47 * This input method reads symlink files from specified job input paths and
 48 * takes the files / directories specified in those symlink files as
 49 * actual map-reduce input. The target input data should be in TextInputFormat.
 50 */
 51@SuppressWarnings("deprecation")
 52public class SymlinkTextInputFormat
 53    implements InputFormat<LongWritable, Text>, JobConfigurable, ContentSummaryInputFormat {
 54  /**
 55   * This input split wraps the FileSplit generated from
 56   * TextInputFormat.getSplits(), while setting the original link file path
 57   * as job input path. This is needed because MapOperator relies on the
 58   * job input path to lookup correct child operators. The target data file
 59   * is encapsulated in the wrapped FileSplit.
 60   */
 61  public static class SymlinkTextInputSplit extends FileSplit {
 62    private final FileSplit split;
 63
 64    public SymlinkTextInputSplit() {
 65      super((Path)null, 0, 0, (String[])null);
 66      split = new FileSplit((Path)null, 0, 0, (String[])null);
 67    }
 68
 69    public SymlinkTextInputSplit(Path symlinkPath, FileSplit split) throws IOException {
 70      super(symlinkPath, 0, 0, split.getLocations());
 71      this.split = split;
 72    }
 73
 74    /**
 75     * Gets the target split, i.e. the split of target data.
 76     */
 77    public FileSplit getTargetSplit() {
 78      return split;
 79    }
 80
 81    @Override
 82    public void write(DataOutput out) throws IOException {
 83      super.write(out);
 84      split.write(out);
 85    }
 86
 87    @Override
 88    public void readFields(DataInput in) throws IOException {
 89      super.readFields(in);
 90      split.readFields(in);
 91    }
 92  }
 93
 94  @Override
 95  public RecordReader<LongWritable, Text> getRecordReader(
 96      InputSplit split, JobConf job, Reporter reporter) throws IOException {
 97    InputSplit targetSplit = ((SymlinkTextInputSplit)split).getTargetSplit();
 98
 99    // The target data is in TextInputFormat.
100    TextInputFormat inputFormat = new TextInputFormat();
101    inputFormat.configure(job);
102    return inputFormat.getRecordReader(targetSplit, job, reporter);
103  }
104
105  /**
106   * Parses all target paths from job input directory which contains symlink
107   * files, and splits the target data using TextInputFormat.
108   */
109  @Override
110  public InputSplit[] getSplits(JobConf job, int numSplits)
111      throws IOException {
112    Path[] symlinksDirs = FileInputFormat.getInputPaths(job);
113    if (symlinksDirs.length == 0) {
114      throw new IOException("No input paths specified in job.");
115    }
116
117    // Get all target paths first, because the number of total target paths
118    // is used to determine number of splits of each target path.
119    List<Path> targetPaths = new ArrayList<Path>();
120    List<Path> symlinkPaths = new ArrayList<Path>();
121    try {
122      getTargetPathsFromSymlinksDirs(
123          job,
124          symlinksDirs,
125          targetPaths,
126          symlinkPaths);
127    } catch (Exception e) {
128      throw new IOException(
129          "Error parsing symlinks from specified job input path.", e);
130    }
131    if (targetPaths.size() == 0) {
132      return new InputSplit[0];
133    }
134
135    // The input should be in TextInputFormat.
136    TextInputFormat inputFormat = new TextInputFormat();
137    JobConf newjob = new JobConf(job);
138    newjob.setInputFormat(TextInputFormat.class);
139    inputFormat.configure(newjob);
140
141    List<InputSplit> result = new ArrayList<InputSplit>();
142
143    // ceil(numSplits / numPaths), so we can get at least numSplits splits.
144    int numPaths = targetPaths.size();
145    int numSubSplits = (numSplits + numPaths - 1) / numPaths;
146
147    // For each path, do getSplits().
148    for (int i = 0; i < numPaths; ++i) {
149      Path targetPath = targetPaths.get(i);
150      Path symlinkPath = symlinkPaths.get(i);
151
152      FileInputFormat.setInputPaths(newjob, targetPath);
153
154      InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
155      for (InputSplit is : iss) {
156        result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit)is));
157      }
158    }
159    return result.toArray(new InputSplit[result.size()]);
160  }
161
162  @Override
163  public void configure(JobConf job) {
164    // empty
165  }
166
167  /**
168   * Given list of directories containing symlink files, read all target
169   * paths from symlink files and return as targetPaths list. And for each
170   * targetPaths[i], symlinkPaths[i] will be the path to the symlink file
171   * containing the target path.
172   */
173  private static void getTargetPathsFromSymlinksDirs(
174      Configuration conf, Path[] symlinksDirs,
175      List<Path> targetPaths, List<Path> symlinkPaths) throws IOException {
176    for (Path symlinkDir : symlinksDirs) {
177      FileSystem fileSystem = symlinkDir.getFileSystem(conf);
178      FileStatus[] symlinks = fileSystem.listStatus(symlinkDir);
179
180      // Read paths from each symlink file.
181      for (FileStatus symlink : symlinks) {
182        BufferedReader reader =
183            new BufferedReader(
184                new InputStreamReader(
185                    fileSystem.open(symlink.getPath())));
186
187        String line;
188        while ((line = reader.readLine()) != null) {
189          targetPaths.add(new Path(line));
190          symlinkPaths.add(symlink.getPath());
191        }
192      }
193    }
194  }
195
196  /**
197   * For backward compatibility with hadoop 0.17.
198   */
199  public void validateInput(JobConf job) throws IOException {
200    // do nothing
201  }
202
203  @Override
204  public ContentSummary getContentSummary(Path p, JobConf job)
205      throws IOException {
206    //length, file count, directory count
207    long[] summary = {0, 0, 0};
208    List<Path> targetPaths = new ArrayList<Path>();
209    List<Path> symlinkPaths = new ArrayList<Path>();
210    try {
211      getTargetPathsFromSymlinksDirs(
212          job,
213          new Path[]{p},
214          targetPaths,
215          symlinkPaths);
216    } catch (Exception e) {
217      throw new IOException(
218          "Error parsing symlinks from specified job input path.", e);
219    }
220    for(Path path : targetPaths) {
221      FileSystem fs = path.getFileSystem(job);
222      ContentSummary cs = fs.getContentSummary(path);
223      summary[0] += cs.getLength();
224      summary[1] += cs.getFileCount();
225      summary[2] += cs.getDirectoryCount();
226    }
227    return new ContentSummary(summary[0], summary[1], summary[2]);
228  }
229}