PageRenderTime 38ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java

#
Java | 229 lines | 149 code | 24 blank | 56 comment | 11 complexity | 02c55bff4dbf35e035ea094a2ec8f0d6 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.io;
  19. import java.io.BufferedReader;
  20. import java.io.DataInput;
  21. import java.io.DataOutput;
  22. import java.io.IOException;
  23. import java.io.InputStreamReader;
  24. import java.util.ArrayList;
  25. import java.util.List;
  26. import org.apache.hadoop.conf.Configuration;
  27. import org.apache.hadoop.fs.ContentSummary;
  28. import org.apache.hadoop.fs.FileStatus;
  29. import org.apache.hadoop.fs.FileSystem;
  30. import org.apache.hadoop.fs.Path;
  31. import org.apache.hadoop.io.LongWritable;
  32. import org.apache.hadoop.io.Text;
  33. import org.apache.hadoop.mapred.FileInputFormat;
  34. import org.apache.hadoop.mapred.FileSplit;
  35. import org.apache.hadoop.mapred.InputFormat;
  36. import org.apache.hadoop.mapred.InputSplit;
  37. import org.apache.hadoop.mapred.JobConf;
  38. import org.apache.hadoop.mapred.JobConfigurable;
  39. import org.apache.hadoop.mapred.RecordReader;
  40. import org.apache.hadoop.mapred.Reporter;
  41. import org.apache.hadoop.mapred.TextInputFormat;
  42. /**
  43. * Symlink file is a text file which contains a list of filename / dirname.
  44. * This input method reads symlink files from specified job input paths and
  45. * takes the files / directories specified in those symlink files as
  46. * actual map-reduce input. The target input data should be in TextInputFormat.
  47. */
  48. @SuppressWarnings("deprecation")
  49. public class SymlinkTextInputFormat
  50. implements InputFormat<LongWritable, Text>, JobConfigurable, ContentSummaryInputFormat {
  51. /**
  52. * This input split wraps the FileSplit generated from
  53. * TextInputFormat.getSplits(), while setting the original link file path
  54. * as job input path. This is needed because MapOperator relies on the
  55. * job input path to lookup correct child operators. The target data file
  56. * is encapsulated in the wrapped FileSplit.
  57. */
  58. public static class SymlinkTextInputSplit extends FileSplit {
  59. private final FileSplit split;
  60. public SymlinkTextInputSplit() {
  61. super((Path)null, 0, 0, (String[])null);
  62. split = new FileSplit((Path)null, 0, 0, (String[])null);
  63. }
  64. public SymlinkTextInputSplit(Path symlinkPath, FileSplit split) throws IOException {
  65. super(symlinkPath, 0, 0, split.getLocations());
  66. this.split = split;
  67. }
  68. /**
  69. * Gets the target split, i.e. the split of target data.
  70. */
  71. public FileSplit getTargetSplit() {
  72. return split;
  73. }
  74. @Override
  75. public void write(DataOutput out) throws IOException {
  76. super.write(out);
  77. split.write(out);
  78. }
  79. @Override
  80. public void readFields(DataInput in) throws IOException {
  81. super.readFields(in);
  82. split.readFields(in);
  83. }
  84. }
  85. @Override
  86. public RecordReader<LongWritable, Text> getRecordReader(
  87. InputSplit split, JobConf job, Reporter reporter) throws IOException {
  88. InputSplit targetSplit = ((SymlinkTextInputSplit)split).getTargetSplit();
  89. // The target data is in TextInputFormat.
  90. TextInputFormat inputFormat = new TextInputFormat();
  91. inputFormat.configure(job);
  92. return inputFormat.getRecordReader(targetSplit, job, reporter);
  93. }
  94. /**
  95. * Parses all target paths from job input directory which contains symlink
  96. * files, and splits the target data using TextInputFormat.
  97. */
  98. @Override
  99. public InputSplit[] getSplits(JobConf job, int numSplits)
  100. throws IOException {
  101. Path[] symlinksDirs = FileInputFormat.getInputPaths(job);
  102. if (symlinksDirs.length == 0) {
  103. throw new IOException("No input paths specified in job.");
  104. }
  105. // Get all target paths first, because the number of total target paths
  106. // is used to determine number of splits of each target path.
  107. List<Path> targetPaths = new ArrayList<Path>();
  108. List<Path> symlinkPaths = new ArrayList<Path>();
  109. try {
  110. getTargetPathsFromSymlinksDirs(
  111. job,
  112. symlinksDirs,
  113. targetPaths,
  114. symlinkPaths);
  115. } catch (Exception e) {
  116. throw new IOException(
  117. "Error parsing symlinks from specified job input path.", e);
  118. }
  119. if (targetPaths.size() == 0) {
  120. return new InputSplit[0];
  121. }
  122. // The input should be in TextInputFormat.
  123. TextInputFormat inputFormat = new TextInputFormat();
  124. JobConf newjob = new JobConf(job);
  125. newjob.setInputFormat(TextInputFormat.class);
  126. inputFormat.configure(newjob);
  127. List<InputSplit> result = new ArrayList<InputSplit>();
  128. // ceil(numSplits / numPaths), so we can get at least numSplits splits.
  129. int numPaths = targetPaths.size();
  130. int numSubSplits = (numSplits + numPaths - 1) / numPaths;
  131. // For each path, do getSplits().
  132. for (int i = 0; i < numPaths; ++i) {
  133. Path targetPath = targetPaths.get(i);
  134. Path symlinkPath = symlinkPaths.get(i);
  135. FileInputFormat.setInputPaths(newjob, targetPath);
  136. InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
  137. for (InputSplit is : iss) {
  138. result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit)is));
  139. }
  140. }
  141. return result.toArray(new InputSplit[result.size()]);
  142. }
  143. @Override
  144. public void configure(JobConf job) {
  145. // empty
  146. }
  147. /**
  148. * Given list of directories containing symlink files, read all target
  149. * paths from symlink files and return as targetPaths list. And for each
  150. * targetPaths[i], symlinkPaths[i] will be the path to the symlink file
  151. * containing the target path.
  152. */
  153. private static void getTargetPathsFromSymlinksDirs(
  154. Configuration conf, Path[] symlinksDirs,
  155. List<Path> targetPaths, List<Path> symlinkPaths) throws IOException {
  156. for (Path symlinkDir : symlinksDirs) {
  157. FileSystem fileSystem = symlinkDir.getFileSystem(conf);
  158. FileStatus[] symlinks = fileSystem.listStatus(symlinkDir);
  159. // Read paths from each symlink file.
  160. for (FileStatus symlink : symlinks) {
  161. BufferedReader reader =
  162. new BufferedReader(
  163. new InputStreamReader(
  164. fileSystem.open(symlink.getPath())));
  165. String line;
  166. while ((line = reader.readLine()) != null) {
  167. targetPaths.add(new Path(line));
  168. symlinkPaths.add(symlink.getPath());
  169. }
  170. }
  171. }
  172. }
  173. /**
  174. * For backward compatibility with hadoop 0.17.
  175. */
  176. public void validateInput(JobConf job) throws IOException {
  177. // do nothing
  178. }
  179. @Override
  180. public ContentSummary getContentSummary(Path p, JobConf job)
  181. throws IOException {
  182. //length, file count, directory count
  183. long[] summary = {0, 0, 0};
  184. List<Path> targetPaths = new ArrayList<Path>();
  185. List<Path> symlinkPaths = new ArrayList<Path>();
  186. try {
  187. getTargetPathsFromSymlinksDirs(
  188. job,
  189. new Path[]{p},
  190. targetPaths,
  191. symlinkPaths);
  192. } catch (Exception e) {
  193. throw new IOException(
  194. "Error parsing symlinks from specified job input path.", e);
  195. }
  196. for(Path path : targetPaths) {
  197. FileSystem fs = path.getFileSystem(job);
  198. ContentSummary cs = fs.getContentSummary(path);
  199. summary[0] += cs.getLength();
  200. summary[1] += cs.getFileCount();
  201. summary[2] += cs.getDirectoryCount();
  202. }
  203. return new ContentSummary(summary[0], summary[1], summary[2]);
  204. }
  205. }