PageRenderTime 48ms CodeModel.GetById 16ms app.highlight 26ms RepoModel.GetById 1ms app.codeStats 0ms

/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java

#
Java | 404 lines | 295 code | 36 blank | 73 comment | 57 complexity | 72dd32706688837da0708cc873fee77c MD5 | raw file
  1/**
  2 * Licensed to the Apache Software Foundation (ASF) under one
  3 * or more contributor license agreements.  See the NOTICE file
  4 * distributed with this work for additional information
  5 * regarding copyright ownership.  The ASF licenses this file
  6 * to you under the Apache License, Version 2.0 (the
  7 * "License"); you may not use this file except in compliance
  8 * with the License.  You may obtain a copy of the License at
  9 *
 10 *     http://www.apache.org/licenses/LICENSE-2.0
 11 *
 12 * Unless required by applicable law or agreed to in writing, software
 13 * distributed under the License is distributed on an "AS IS" BASIS,
 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 * See the License for the specific language governing permissions and
 16 * limitations under the License.
 17 */
 18
 19package org.apache.hadoop.hive.ql.io;
 20
 21import java.io.File;
 22import java.io.IOException;
 23import java.io.Serializable;
 24import java.util.ArrayList;
 25import java.util.List;
 26import java.util.HashMap;
 27import java.util.Map;
 28import java.util.Properties;
 29import java.util.Set;
 30
 31import org.apache.hadoop.fs.FileStatus;
 32import org.apache.hadoop.fs.FileSystem;
 33import org.apache.hadoop.fs.Path;
 34import org.apache.hadoop.hive.conf.HiveConf;
 35import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
 36import org.apache.hadoop.hive.ql.exec.Operator;
 37import org.apache.hadoop.hive.ql.exec.Utilities;
 38import org.apache.hadoop.hive.ql.metadata.HiveException;
 39import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 40import org.apache.hadoop.hive.ql.plan.PartitionDesc;
 41import org.apache.hadoop.hive.ql.plan.TableDesc;
 42import org.apache.hadoop.io.SequenceFile.CompressionType;
 43import org.apache.hadoop.io.Writable;
 44import org.apache.hadoop.io.compress.CompressionCodec;
 45import org.apache.hadoop.mapred.FileOutputFormat;
 46import org.apache.hadoop.mapred.InputFormat;
 47import org.apache.hadoop.mapred.JobConf;
 48import org.apache.hadoop.mapred.OutputFormat;
 49import org.apache.hadoop.mapred.SequenceFileInputFormat;
 50import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 51import org.apache.hadoop.mapred.TextInputFormat;
 52
/**
 * A utility class for various Hive file format tasks.
 * registerOutputFormatSubstitute(Class, Class) and
 * getOutputFormatSubstitute(Class) were added for backward compatibility:
 * they return the newly added HiveOutputFormat for the older output formats.
 *
 */
 60public final class HiveFileFormatUtils {
 61
 62  static {
 63    outputFormatSubstituteMap =
 64        new HashMap<Class<? extends OutputFormat>, Class<? extends HiveOutputFormat>>();
 65    HiveFileFormatUtils.registerOutputFormatSubstitute(
 66        IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class);
 67    HiveFileFormatUtils.registerOutputFormatSubstitute(
 68        SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class);
 69  }
 70
 71  @SuppressWarnings("unchecked")
 72  private static Map<Class<? extends OutputFormat>, Class<? extends HiveOutputFormat>>
 73  outputFormatSubstituteMap;
 74
 75  /**
 76   * register a substitute.
 77   *
 78   * @param origin
 79   *          the class that need to be substituted
 80   * @param substitute
 81   */
 82  @SuppressWarnings("unchecked")
 83  public static synchronized void registerOutputFormatSubstitute(
 84      Class<? extends OutputFormat> origin,
 85      Class<? extends HiveOutputFormat> substitute) {
 86    outputFormatSubstituteMap.put(origin, substitute);
 87  }
 88
 89  /**
 90   * get a OutputFormat's substitute HiveOutputFormat.
 91   */
 92  @SuppressWarnings("unchecked")
 93  public static synchronized Class<? extends HiveOutputFormat> getOutputFormatSubstitute(
 94      Class<?> origin) {
 95    if (HiveOutputFormat.class.isAssignableFrom(origin)) {
 96      return (Class<? extends HiveOutputFormat>) origin;
 97    }
 98    Class<? extends HiveOutputFormat> result = outputFormatSubstituteMap
 99        .get(origin);
100    return result;
101  }
102
103  /**
104   * get the final output path of a given FileOutputFormat.
105   *
106   * @param parent
107   *          parent dir of the expected final output path
108   * @param jc
109   *          job configuration
110   */
111  public static Path getOutputFormatFinalPath(Path parent, String taskId, JobConf jc,
112      HiveOutputFormat<?, ?> hiveOutputFormat, boolean isCompressed,
113      Path defaultFinalPath) throws IOException {
114    if (hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) {
115      return new Path(parent, taskId
116          + Utilities.getFileExtension(jc, isCompressed));
117    }
118    return defaultFinalPath;
119  }
120
121  static {
122    inputFormatCheckerMap =
123        new HashMap<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>();
124    HiveFileFormatUtils.registerInputFormatChecker(
125        SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class);
126    HiveFileFormatUtils.registerInputFormatChecker(RCFileInputFormat.class,
127        RCFileInputFormat.class);
128    inputFormatCheckerInstanceCache =
129        new HashMap<Class<? extends InputFormatChecker>, InputFormatChecker>();
130  }
131
132  @SuppressWarnings("unchecked")
133  private static Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>> inputFormatCheckerMap;
134
135  private static Map<Class<? extends InputFormatChecker>, InputFormatChecker> inputFormatCheckerInstanceCache;
136
137  /**
138   * register an InputFormatChecker for a given InputFormat.
139   *
140   * @param format
141   *          the class that need to be substituted
142   * @param checker
143   */
144  @SuppressWarnings("unchecked")
145  public static synchronized void registerInputFormatChecker(
146      Class<? extends InputFormat> format,
147      Class<? extends InputFormatChecker> checker) {
148    inputFormatCheckerMap.put(format, checker);
149  }
150
151  /**
152   * get an InputFormatChecker for a file format.
153   */
154  public static synchronized Class<? extends InputFormatChecker> getInputFormatChecker(
155      Class<?> inputFormat) {
156    Class<? extends InputFormatChecker> result = inputFormatCheckerMap
157        .get(inputFormat);
158    return result;
159  }
160
161  /**
162   * checks if files are in same format as the given input format.
163   */
164  @SuppressWarnings("unchecked")
165  public static boolean checkInputFormat(FileSystem fs, HiveConf conf,
166      Class<? extends InputFormat> inputFormatCls, ArrayList<FileStatus> files)
167      throws HiveException {
168    if (files.size() > 0) {
169      Class<? extends InputFormatChecker> checkerCls = getInputFormatChecker(inputFormatCls);
170      if (checkerCls == null
171          && inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
172        // we get a text input format here, we can not determine a file is text
173        // according to its content, so we can do is to test if other file
174        // format can accept it. If one other file format can accept this file,
175        // we treat this file as text file, although it maybe not.
176        return checkTextInputFormat(fs, conf, files);
177      }
178
179      if (checkerCls != null) {
180        InputFormatChecker checkerInstance = inputFormatCheckerInstanceCache
181            .get(checkerCls);
182        try {
183          if (checkerInstance == null) {
184            checkerInstance = checkerCls.newInstance();
185            inputFormatCheckerInstanceCache.put(checkerCls, checkerInstance);
186          }
187          return checkerInstance.validateInput(fs, conf, files);
188        } catch (Exception e) {
189          throw new HiveException(e);
190        }
191      }
192      return true;
193    }
194    return false;
195  }
196
197  @SuppressWarnings("unchecked")
198  private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf,
199      ArrayList<FileStatus> files) throws HiveException {
200    Set<Class<? extends InputFormat>> inputFormatter = inputFormatCheckerMap
201        .keySet();
202    for (Class<? extends InputFormat> reg : inputFormatter) {
203      boolean result = checkInputFormat(fs, conf, reg, files);
204      if (result) {
205        return false;
206      }
207    }
208    return true;
209  }
210
211  public static RecordWriter getHiveRecordWriter(JobConf jc,
212      TableDesc tableInfo, Class<? extends Writable> outputClass,
213      FileSinkDesc conf, Path outPath) throws HiveException {
214    try {
215      HiveOutputFormat<?, ?> hiveOutputFormat = tableInfo
216          .getOutputFileFormatClass().newInstance();
217      boolean isCompressed = conf.getCompressed();
218      JobConf jc_output = jc;
219      if (isCompressed) {
220        jc_output = new JobConf(jc);
221        String codecStr = conf.getCompressCodec();
222        if (codecStr != null && !codecStr.trim().equals("")) {
223          Class<? extends CompressionCodec> codec = (Class<? extends CompressionCodec>) Class
224              .forName(codecStr);
225          FileOutputFormat.setOutputCompressorClass(jc_output, codec);
226        }
227        String type = conf.getCompressType();
228        if (type != null && !type.trim().equals("")) {
229          CompressionType style = CompressionType.valueOf(type);
230          SequenceFileOutputFormat.setOutputCompressionType(jc, style);
231        }
232      }
233      return getRecordWriter(jc_output, hiveOutputFormat, outputClass,
234          isCompressed, tableInfo.getProperties(), outPath);
235    } catch (Exception e) {
236      throw new HiveException(e);
237    }
238  }
239
240  public static RecordWriter getRecordWriter(JobConf jc,
241      HiveOutputFormat<?, ?> hiveOutputFormat,
242      final Class<? extends Writable> valueClass, boolean isCompressed,
243      Properties tableProp, Path outPath) throws IOException, HiveException {
244    if (hiveOutputFormat != null) {
245      return hiveOutputFormat.getHiveRecordWriter(jc, outPath, valueClass,
246          isCompressed, tableProp, null);
247    }
248    return null;
249  }
250
251  public static PartitionDesc getPartitionDescFromPathRecursively(
252      Map<String, PartitionDesc> pathToPartitionInfo, Path dir,
253      Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap)
254      throws IOException {
255    return getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
256        cacheMap, false);
257  }
258
259  public static PartitionDesc getPartitionDescFromPathRecursively(
260      Map<String, PartitionDesc> pathToPartitionInfo, Path dir,
261      Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap,
262      boolean ignoreSchema) throws IOException {
263
264    PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir);
265    if (part == null
266        && (ignoreSchema || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim()
267            .equals("")))) {
268
269      Map<String, PartitionDesc> newPathToPartitionInfo = null;
270      if (cacheMap != null) {
271        newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
272      }
273
274      if (newPathToPartitionInfo == null) { // still null
275        newPathToPartitionInfo = new HashMap<String, PartitionDesc>();
276        populateNewPartitionDesc(pathToPartitionInfo, newPathToPartitionInfo);
277
278        if (cacheMap != null) {
279          cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
280        }
281      }
282      part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
283    }
284
285    if (part != null) {
286      return part;
287    } else {
288      throw new IOException("cannot find dir = " + dir.toString()
289                          + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet());
290    }
291  }
292
293  private static void populateNewPartitionDesc(
294      Map<String, PartitionDesc> pathToPartitionInfo,
295      Map<String, PartitionDesc> newPathToPartitionInfo) {
296    for (Map.Entry<String, PartitionDesc> entry: pathToPartitionInfo.entrySet()) {
297      String entryKey = entry.getKey();
298      PartitionDesc partDesc = entry.getValue();
299      Path newP = new Path(entryKey);
300      String pathOnly = newP.toUri().getPath();
301      newPathToPartitionInfo.put(pathOnly, partDesc);
302    }
303  }
304
305  private static PartitionDesc doGetPartitionDescFromPath(
306      Map<String, PartitionDesc> pathToPartitionInfo, Path dir) {
307    // We first do exact match, and then do prefix matching. The latter is due to input dir
308    // could be /dir/ds='2001-02-21'/part-03 where part-03 is not part of partition
309    String dirPath = dir.toUri().getPath();
310    PartitionDesc part = pathToPartitionInfo.get(dir.toString());
311    if (part == null) {
312      //      LOG.warn("exact match not found, try ripping input path's theme and authority");
313      part = pathToPartitionInfo.get(dirPath);
314    }
315
316    if (part == null) {
317      String dirStr = dir.toString();
318      int dirPathIndex = dirPath.lastIndexOf(File.separator);
319      int dirStrIndex = dirStr.lastIndexOf(File.separator);
320      while (dirPathIndex >= 0 && dirStrIndex >= 0) {
321        dirStr = dirStr.substring(0, dirStrIndex);
322        dirPath = dirPath.substring(0, dirPathIndex);
323        //first try full match
324        part = pathToPartitionInfo.get(dirStr);
325        if (part == null) {
326          // LOG.warn("exact match not found, try ripping input path's theme and authority");
327          part = pathToPartitionInfo.get(dirPath);
328        }
329        if (part != null) {
330          break;
331        }
332        dirPathIndex = dirPath.lastIndexOf(File.separator);
333        dirStrIndex = dirStr.lastIndexOf(File.separator);
334      }
335    }
336    return part;
337  }
338
339  private static boolean foundAlias(Map<String, ArrayList<String>> pathToAliases,
340                                    String path) {
341    List<String> aliases = pathToAliases.get(path);
342    if ((aliases == null) || (aliases.isEmpty())) {
343      return false;
344    }
345    return true;
346  }
347
348  private static String getMatchingPath(Map<String, ArrayList<String>> pathToAliases,
349                                        Path dir) {
350    // First find the path to be searched
351    String path = dir.toString();
352    if (foundAlias(pathToAliases, path)) {
353      return path;
354    }
355
356    String dirPath = dir.toUri().getPath();
357    if (foundAlias(pathToAliases, dirPath)) {
358      return dirPath;
359    }
360    path = dirPath;
361
362    String dirStr = dir.toString();
363    int dirPathIndex = dirPath.lastIndexOf(File.separator);
364    int dirStrIndex = dirStr.lastIndexOf(File.separator);
365    while (dirPathIndex >= 0 && dirStrIndex >= 0) {
366      dirStr = dirStr.substring(0, dirStrIndex);
367      dirPath = dirPath.substring(0, dirPathIndex);
368      //first try full match
369      if (foundAlias(pathToAliases, dirStr)) {
370        return dirStr;
371      }
372      if (foundAlias(pathToAliases, dirPath)) {
373        return dirPath;
374      }
375      dirPathIndex = dirPath.lastIndexOf(File.separator);
376      dirStrIndex = dirStr.lastIndexOf(File.separator);
377    }
378    return null;
379  }
380
381  /**
382   * Get the list of operatators from the opeerator tree that are needed for the path
383   * @param pathToAliases  mapping from path to aliases
384   * @param aliasToWork    The operator tree to be invoked for a given alias
385   * @param dir            The path to look for
386   **/
387  public static List<Operator<? extends Serializable>> doGetAliasesFromPath(
388    Map<String, ArrayList<String>> pathToAliases,
389    Map<String, Operator<? extends Serializable>> aliasToWork, Path dir) {
390
391    String path = getMatchingPath(pathToAliases, dir);
392    List<Operator<? extends Serializable>> opList =
393      new ArrayList<Operator<? extends Serializable>>();
394    List<String> aliases = pathToAliases.get(path);
395    for (String alias : aliases) {
396      opList.add(aliasToWork.get(alias));
397    }
398    return opList;
399  }
400
401  private HiveFileFormatUtils() {
402    // prevent instantiation
403  }
404}