
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

/**
 * A utility class for various Hive file format tasks.
 * registerOutputFormatSubstitute(Class, Class) and
 * getOutputFormatSubstitute(Class) are provided for backward compatibility:
 * they map the older OutputFormat classes to their newer HiveOutputFormat
 * replacements.
 */
public final class HiveFileFormatUtils {

  static {
    outputFormatSubstituteMap =
        new HashMap<Class<? extends OutputFormat>, Class<? extends HiveOutputFormat>>();
    HiveFileFormatUtils.registerOutputFormatSubstitute(
        IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class);
    HiveFileFormatUtils.registerOutputFormatSubstitute(
        SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class);
  }

  @SuppressWarnings("unchecked")
  private static Map<Class<? extends OutputFormat>, Class<? extends HiveOutputFormat>>
      outputFormatSubstituteMap;

  /**
   * Register a substitute.
   *
   * @param origin
   *          the class that needs to be substituted
   * @param substitute
   *          the HiveOutputFormat to use in its place
   */
  @SuppressWarnings("unchecked")
  public static synchronized void registerOutputFormatSubstitute(
      Class<? extends OutputFormat> origin,
      Class<? extends HiveOutputFormat> substitute) {
    outputFormatSubstituteMap.put(origin, substitute);
  }

  /**
   * Get an OutputFormat's substitute HiveOutputFormat.
   */
  @SuppressWarnings("unchecked")
  public static synchronized Class<? extends HiveOutputFormat> getOutputFormatSubstitute(
      Class<?> origin) {
    if (HiveOutputFormat.class.isAssignableFrom(origin)) {
      return (Class<? extends HiveOutputFormat>) origin;
    }
    return outputFormatSubstituteMap.get(origin);
  }
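
  // Illustrative usage (a sketch, not part of the original file): resolving the
  // pre-HiveOutputFormat SequenceFileOutputFormat to the Hive substitute
  // registered in the static block above.
  //
  //   Class<? extends HiveOutputFormat> sub =
  //       HiveFileFormatUtils.getOutputFormatSubstitute(SequenceFileOutputFormat.class);
  //   // sub is HiveSequenceFileOutputFormat.class; a class that already
  //   // implements HiveOutputFormat is returned unchanged.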

  /**
   * Get the final output path of a given FileOutputFormat.
   *
   * @param parent
   *          parent dir of the expected final output path
   * @param taskId
   *          id of the current task
   * @param jc
   *          job configuration
   * @param hiveOutputFormat
   *          the output format whose final path is requested
   * @param isCompressed
   *          whether the output is compressed
   * @param defaultFinalPath
   *          the path returned for all other output formats
   */
  public static Path getOutputFormatFinalPath(Path parent, String taskId, JobConf jc,
      HiveOutputFormat<?, ?> hiveOutputFormat, boolean isCompressed,
      Path defaultFinalPath) throws IOException {
    if (hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) {
      return new Path(parent, taskId
          + Utilities.getFileExtension(jc, isCompressed));
    }
    return defaultFinalPath;
  }

  static {
    inputFormatCheckerMap =
        new HashMap<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>();
    HiveFileFormatUtils.registerInputFormatChecker(
        SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class);
    HiveFileFormatUtils.registerInputFormatChecker(RCFileInputFormat.class,
        RCFileInputFormat.class);
    inputFormatCheckerInstanceCache =
        new HashMap<Class<? extends InputFormatChecker>, InputFormatChecker>();
  }

  @SuppressWarnings("unchecked")
  private static Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>
      inputFormatCheckerMap;

  private static Map<Class<? extends InputFormatChecker>, InputFormatChecker>
      inputFormatCheckerInstanceCache;

  /**
   * Register an InputFormatChecker for a given InputFormat.
   *
   * @param format
   *          the InputFormat class to be checked
   * @param checker
   *          the InputFormatChecker that validates files for that format
   */
  @SuppressWarnings("unchecked")
  public static synchronized void registerInputFormatChecker(
      Class<? extends InputFormat> format,
      Class<? extends InputFormatChecker> checker) {
    inputFormatCheckerMap.put(format, checker);
  }

  /**
   * Get an InputFormatChecker for a file format.
   */
  public static synchronized Class<? extends InputFormatChecker> getInputFormatChecker(
      Class<?> inputFormat) {
    return inputFormatCheckerMap.get(inputFormat);
  }

  /**
   * Checks if files are in the same format as the given input format.
   */
  @SuppressWarnings("unchecked")
  public static boolean checkInputFormat(FileSystem fs, HiveConf conf,
      Class<? extends InputFormat> inputFormatCls, ArrayList<FileStatus> files)
      throws HiveException {
    if (files.size() > 0) {
      Class<? extends InputFormatChecker> checkerCls = getInputFormatChecker(inputFormatCls);
      if (checkerCls == null
          && inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
        // We have a text input format here. We cannot determine whether a file
        // is text from its content alone, so all we can do is test whether any
        // other registered format accepts it. Only if no other format accepts
        // the file do we treat it as a text file, although it may not be.
        return checkTextInputFormat(fs, conf, files);
      }
      if (checkerCls != null) {
        InputFormatChecker checkerInstance = inputFormatCheckerInstanceCache
            .get(checkerCls);
        try {
          if (checkerInstance == null) {
            checkerInstance = checkerCls.newInstance();
            inputFormatCheckerInstanceCache.put(checkerCls, checkerInstance);
          }
          return checkerInstance.validateInput(fs, conf, files);
        } catch (Exception e) {
          throw new HiveException(e);
        }
      }
      return true;
    }
    return false;
  }
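
  // Illustrative usage (a sketch; fs, conf, and inputDir are assumed to come
  // from the caller): validating that every file under a directory really is
  // an RCFile before it is read with RCFileInputFormat.
  //
  //   ArrayList<FileStatus> files = new ArrayList<FileStatus>(
  //       java.util.Arrays.asList(fs.listStatus(inputDir)));
  //   boolean ok = HiveFileFormatUtils.checkInputFormat(
  //       fs, conf, RCFileInputFormat.class, files);
  //   // true only if the checker registered for RCFileInputFormat accepts all files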

  @SuppressWarnings("unchecked")
  private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf,
      ArrayList<FileStatus> files) throws HiveException {
    Set<Class<? extends InputFormat>> registeredFormats = inputFormatCheckerMap.keySet();
    for (Class<? extends InputFormat> reg : registeredFormats) {
      boolean result = checkInputFormat(fs, conf, reg, files);
      if (result) {
        return false;
      }
    }
    return true;
  }

  public static RecordWriter getHiveRecordWriter(JobConf jc,
      TableDesc tableInfo, Class<? extends Writable> outputClass,
      FileSinkDesc conf, Path outPath) throws HiveException {
    try {
      HiveOutputFormat<?, ?> hiveOutputFormat = tableInfo
          .getOutputFileFormatClass().newInstance();
      boolean isCompressed = conf.getCompressed();
      JobConf jc_output = jc;
      if (isCompressed) {
        jc_output = new JobConf(jc);
        String codecStr = conf.getCompressCodec();
        if (codecStr != null && !codecStr.trim().equals("")) {
          Class<? extends CompressionCodec> codec =
              (Class<? extends CompressionCodec>) Class.forName(codecStr);
          FileOutputFormat.setOutputCompressorClass(jc_output, codec);
        }
        String type = conf.getCompressType();
        if (type != null && !type.trim().equals("")) {
          CompressionType style = CompressionType.valueOf(type);
          // Apply the compression type to the copied conf that is actually
          // handed to the writer; setting it on the original jc would have no
          // effect on jc_output, which was cloned above.
          SequenceFileOutputFormat.setOutputCompressionType(jc_output, style);
        }
      }
      return getRecordWriter(jc_output, hiveOutputFormat, outputClass,
          isCompressed, tableInfo.getProperties(), outPath);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  public static RecordWriter getRecordWriter(JobConf jc,
      HiveOutputFormat<?, ?> hiveOutputFormat,
      final Class<? extends Writable> valueClass, boolean isCompressed,
      Properties tableProp, Path outPath) throws IOException, HiveException {
    if (hiveOutputFormat != null) {
      return hiveOutputFormat.getHiveRecordWriter(jc, outPath, valueClass,
          isCompressed, tableProp, null);
    }
    return null;
  }

  public static PartitionDesc getPartitionDescFromPathRecursively(
      Map<String, PartitionDesc> pathToPartitionInfo, Path dir,
      Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap)
      throws IOException {
    return getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
        cacheMap, false);
  }

  public static PartitionDesc getPartitionDescFromPathRecursively(
      Map<String, PartitionDesc> pathToPartitionInfo, Path dir,
      Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap,
      boolean ignoreSchema) throws IOException {
    PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir);
    if (part == null
        && (ignoreSchema || (dir.toUri().getScheme() == null || dir.toUri()
            .getScheme().trim().equals("")))) {
      Map<String, PartitionDesc> newPathToPartitionInfo = null;
      if (cacheMap != null) {
        newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
      }
      if (newPathToPartitionInfo == null) { // still null
        newPathToPartitionInfo = new HashMap<String, PartitionDesc>();
        populateNewPartitionDesc(pathToPartitionInfo, newPathToPartitionInfo);
        if (cacheMap != null) {
          cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
        }
      }
      part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
    }
    if (part != null) {
      return part;
    } else {
      throw new IOException("cannot find dir = " + dir.toString()
          + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet());
    }
  }
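
  // Illustrative lookup (a sketch with hypothetical paths): a map keyed by
  // "hdfs://host:8020/warehouse/t/ds=1" can still be matched against the
  // scheme-less dir "/warehouse/t/ds=1", because the keys are re-indexed by
  // URI path alone via populateNewPartitionDesc below.
  //
  //   PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
  //       pathToPartitionInfo, new Path("/warehouse/t/ds=1"), cacheMap, true);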

  private static void populateNewPartitionDesc(
      Map<String, PartitionDesc> pathToPartitionInfo,
      Map<String, PartitionDesc> newPathToPartitionInfo) {
    for (Map.Entry<String, PartitionDesc> entry : pathToPartitionInfo.entrySet()) {
      String entryKey = entry.getKey();
      PartitionDesc partDesc = entry.getValue();
      Path newP = new Path(entryKey);
      String pathOnly = newP.toUri().getPath();
      newPathToPartitionInfo.put(pathOnly, partDesc);
    }
  }

  private static PartitionDesc doGetPartitionDescFromPath(
      Map<String, PartitionDesc> pathToPartitionInfo, Path dir) {
    // We first do an exact match, and then prefix matching. The latter is
    // needed because the input dir could be /dir/ds='2001-02-21'/part-03,
    // where part-03 is not part of the partition spec.
    String dirPath = dir.toUri().getPath();
    PartitionDesc part = pathToPartitionInfo.get(dir.toString());
    if (part == null) {
      // Exact match not found; retry with the input path's scheme and
      // authority stripped.
      part = pathToPartitionInfo.get(dirPath);
    }
    if (part == null) {
      String dirStr = dir.toString();
      int dirPathIndex = dirPath.lastIndexOf(File.separator);
      int dirStrIndex = dirStr.lastIndexOf(File.separator);
      while (dirPathIndex >= 0 && dirStrIndex >= 0) {
        dirStr = dirStr.substring(0, dirStrIndex);
        dirPath = dirPath.substring(0, dirPathIndex);
        // First try a full match...
        part = pathToPartitionInfo.get(dirStr);
        if (part == null) {
          // ...then again with scheme and authority stripped.
          part = pathToPartitionInfo.get(dirPath);
        }
        if (part != null) {
          break;
        }
        dirPathIndex = dirPath.lastIndexOf(File.separator);
        dirStrIndex = dirStr.lastIndexOf(File.separator);
      }
    }
    return part;
  }
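
  // Worked example of the prefix matching above (hypothetical paths): for
  // dir = "/dir/ds='2001-02-21'/part-03" and a map keyed by
  // "/dir/ds='2001-02-21'", both exact lookups miss, the trailing "part-03"
  // component is stripped, and the next lookup hits the partition key.
  //
  //   Map<String, PartitionDesc> info = new HashMap<String, PartitionDesc>();
  //   info.put("/dir/ds='2001-02-21'", partDesc);  // partDesc assumed
  //   // doGetPartitionDescFromPath(info, new Path("/dir/ds='2001-02-21'/part-03"))
  //   // returns partDesc after one round of stripping.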

  private static boolean foundAlias(Map<String, ArrayList<String>> pathToAliases,
      String path) {
    List<String> aliases = pathToAliases.get(path);
    return (aliases != null) && !aliases.isEmpty();
  }

  private static String getMatchingPath(Map<String, ArrayList<String>> pathToAliases,
      Path dir) {
    // First find the path to be searched
    String path = dir.toString();
    if (foundAlias(pathToAliases, path)) {
      return path;
    }

    String dirPath = dir.toUri().getPath();
    if (foundAlias(pathToAliases, dirPath)) {
      return dirPath;
    }

    String dirStr = dir.toString();
    int dirPathIndex = dirPath.lastIndexOf(File.separator);
    int dirStrIndex = dirStr.lastIndexOf(File.separator);
    while (dirPathIndex >= 0 && dirStrIndex >= 0) {
      dirStr = dirStr.substring(0, dirStrIndex);
      dirPath = dirPath.substring(0, dirPathIndex);
      // First try a full match, then a match with scheme and authority stripped.
      if (foundAlias(pathToAliases, dirStr)) {
        return dirStr;
      }
      if (foundAlias(pathToAliases, dirPath)) {
        return dirPath;
      }
      dirPathIndex = dirPath.lastIndexOf(File.separator);
      dirStrIndex = dirStr.lastIndexOf(File.separator);
    }
    return null;
  }

  /**
   * Get the list of operators from the operator tree that are needed for the path.
   *
   * @param pathToAliases mapping from path to aliases
   * @param aliasToWork the operator tree to be invoked for a given alias
   * @param dir the path to look for
   */
  public static List<Operator<? extends Serializable>> doGetAliasesFromPath(
      Map<String, ArrayList<String>> pathToAliases,
      Map<String, Operator<? extends Serializable>> aliasToWork, Path dir) {
    String path = getMatchingPath(pathToAliases, dir);
    List<Operator<? extends Serializable>> opList =
        new ArrayList<Operator<? extends Serializable>>();
    List<String> aliases = pathToAliases.get(path);
    for (String alias : aliases) {
      opList.add(aliasToWork.get(alias));
    }
    return opList;
  }
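
  // Illustrative usage (a sketch; pathToAliases and aliasToWork are assumed to
  // come from the map-reduce plan): fetching the operator pipelines for the
  // aliases that read a given input directory.
  //
  //   List<Operator<? extends Serializable>> ops =
  //       HiveFileFormatUtils.doGetAliasesFromPath(
  //           pathToAliases, aliasToWork, new Path("/warehouse/t/ds=1"));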

  private HiveFileFormatUtils() {
    // prevent instantiation
  }
}