
/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java

http://github.com/apache/hive
Java | 5009 lines | 3560 code | 483 blank | 966 comment
Possible License(s): Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.exec;
  19. import com.fasterxml.jackson.databind.ObjectMapper;
  20. import java.beans.DefaultPersistenceDelegate;
  21. import java.beans.Encoder;
  22. import java.beans.Expression;
  23. import java.beans.Statement;
  24. import java.io.ByteArrayInputStream;
  25. import java.io.ByteArrayOutputStream;
  26. import java.io.DataInput;
  27. import java.io.EOFException;
  28. import java.io.File;
  29. import java.io.FileNotFoundException;
  30. import java.io.IOException;
  31. import java.io.InputStream;
  32. import java.io.OutputStream;
  33. import java.net.URI;
  34. import java.net.URISyntaxException;
  35. import java.net.URL;
  36. import java.net.URLClassLoader;
  37. import java.net.URLDecoder;
  38. import java.security.AccessController;
  39. import java.sql.Connection;
  40. import java.sql.DriverManager;
  41. import java.sql.PreparedStatement;
  42. import java.sql.SQLException;
  43. import java.sql.SQLFeatureNotSupportedException;
  44. import java.sql.SQLTransientException;
  45. import java.text.SimpleDateFormat;
  46. import java.util.ArrayList;
  47. import java.util.Arrays;
  48. import java.util.Base64;
  49. import java.util.Calendar;
  50. import java.util.Collection;
  51. import java.util.Collections;
  52. import java.util.Enumeration;
  53. import java.util.HashMap;
  54. import java.util.HashSet;
  55. import java.util.Iterator;
  56. import java.util.LinkedHashMap;
  57. import java.util.LinkedList;
  58. import java.util.List;
  59. import java.util.Map;
  60. import java.util.Optional;
  61. import java.util.Properties;
  62. import java.util.Random;
  63. import java.util.Set;
  64. import java.util.UUID;
  65. import java.util.concurrent.Callable;
  66. import java.util.concurrent.ConcurrentHashMap;
  67. import java.util.concurrent.ExecutionException;
  68. import java.util.concurrent.ExecutorService;
  69. import java.util.concurrent.Executors;
  70. import java.util.concurrent.Future;
  71. import java.util.concurrent.ThreadLocalRandom;
  72. import java.util.concurrent.atomic.AtomicLong;
  73. import java.util.regex.Matcher;
  74. import java.util.regex.Pattern;
  75. import java.util.zip.Deflater;
  76. import java.util.zip.DeflaterOutputStream;
  77. import java.util.zip.InflaterInputStream;
  78. import org.apache.commons.collections.MapUtils;
  79. import org.apache.commons.lang3.StringUtils;
  80. import org.apache.commons.lang3.StringEscapeUtils;
  81. import org.apache.hadoop.conf.Configuration;
  82. import org.apache.hadoop.filecache.DistributedCache;
  83. import org.apache.hadoop.fs.ContentSummary;
  84. import org.apache.hadoop.fs.FSDataInputStream;
  85. import org.apache.hadoop.fs.FSDataOutputStream;
  86. import org.apache.hadoop.fs.FileStatus;
  87. import org.apache.hadoop.fs.FileSystem;
  88. import org.apache.hadoop.fs.LocatedFileStatus;
  89. import org.apache.hadoop.fs.Path;
  90. import org.apache.hadoop.fs.PathFilter;
  91. import org.apache.hadoop.fs.RemoteIterator;
  92. import org.apache.hadoop.fs.permission.FsPermission;
  93. import org.apache.hadoop.hive.common.BlobStorageUtils;
  94. import org.apache.hadoop.hive.common.FileUtils;
  95. import org.apache.hadoop.hive.common.HiveInterruptCallback;
  96. import org.apache.hadoop.hive.common.HiveInterruptUtils;
  97. import org.apache.hadoop.hive.common.HiveStatsUtils;
  98. import org.apache.hadoop.hive.common.JavaUtils;
  99. import org.apache.hadoop.hive.common.StatsSetupConst;
  100. import org.apache.hadoop.hive.common.StringInternUtils;
  101. import org.apache.hadoop.hive.common.TableName;
  102. import org.apache.hadoop.hive.common.ValidWriteIdList;
  103. import org.apache.hadoop.hive.conf.HiveConf;
  104. import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
  105. import org.apache.hadoop.hive.metastore.Warehouse;
  106. import org.apache.hadoop.hive.metastore.api.FieldSchema;
  107. import org.apache.hadoop.hive.metastore.api.Order;
  108. import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
  109. import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
  110. import org.apache.hadoop.hive.ql.Context;
  111. import org.apache.hadoop.hive.ql.ErrorMsg;
  112. import org.apache.hadoop.hive.ql.DriverState;
  113. import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
  114. import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
  115. import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
  116. import org.apache.hadoop.hive.ql.exec.mr.ExecReducer;
  117. import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
  118. import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
  119. import org.apache.hadoop.hive.ql.exec.tez.DagUtils;
  120. import org.apache.hadoop.hive.ql.exec.tez.TezTask;
  121. import org.apache.hadoop.hive.ql.exec.util.DAGTraversal;
  122. import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
  123. import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
  124. import org.apache.hadoop.hive.ql.io.AcidUtils;
  125. import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
  126. import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
  127. import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
  128. import org.apache.hadoop.hive.ql.io.HiveInputFormat;
  129. import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
  130. import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
  131. import org.apache.hadoop.hive.ql.io.IOConstants;
  132. import org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat;
  133. import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
  134. import org.apache.hadoop.hive.ql.io.RCFile;
  135. import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
  136. import org.apache.hadoop.hive.ql.io.SelfDescribingInputFormatInterface;
  137. import org.apache.hadoop.hive.ql.io.merge.MergeFileMapper;
  138. import org.apache.hadoop.hive.ql.io.merge.MergeFileWork;
  139. import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateMapper;
  140. import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateWork;
  141. import org.apache.hadoop.hive.ql.log.PerfLogger;
  142. import org.apache.hadoop.hive.ql.metadata.HiveException;
  143. import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
  144. import org.apache.hadoop.hive.ql.metadata.HiveUtils;
  145. import org.apache.hadoop.hive.ql.metadata.InputEstimator;
  146. import org.apache.hadoop.hive.ql.metadata.Partition;
  147. import org.apache.hadoop.hive.ql.metadata.Table;
  148. import org.apache.hadoop.hive.ql.parse.SemanticException;
  149. import org.apache.hadoop.hive.ql.plan.BaseWork;
  150. import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
  151. import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
  152. import org.apache.hadoop.hive.ql.plan.IStatsGatherDesc;
  153. import org.apache.hadoop.hive.ql.plan.MapWork;
  154. import org.apache.hadoop.hive.ql.plan.MapredWork;
  155. import org.apache.hadoop.hive.ql.plan.MergeJoinWork;
  156. import org.apache.hadoop.hive.ql.plan.OperatorDesc;
  157. import org.apache.hadoop.hive.ql.plan.PartitionDesc;
  158. import org.apache.hadoop.hive.ql.plan.PlanUtils;
  159. import org.apache.hadoop.hive.ql.plan.ReduceWork;
  160. import org.apache.hadoop.hive.ql.plan.TableDesc;
  161. import org.apache.hadoop.hive.ql.secrets.URISecretSource;
  162. import org.apache.hadoop.hive.ql.session.SessionState;
  163. import org.apache.hadoop.hive.ql.stats.StatsFactory;
  164. import org.apache.hadoop.hive.ql.stats.StatsPublisher;
  165. import org.apache.hadoop.hive.serde.serdeConstants;
  166. import org.apache.hadoop.hive.serde2.AbstractSerDe;
  167. import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe;
  168. import org.apache.hadoop.hive.serde2.SerDeException;
  169. import org.apache.hadoop.hive.serde2.SerDeUtils;
  170. import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
  171. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
  172. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
  173. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
  174. import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
  175. import org.apache.hadoop.hive.serde2.objectinspector.StructField;
  176. import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
  177. import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
  178. import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
  179. import org.apache.hadoop.hive.shims.ShimLoader;
  180. import org.apache.hadoop.io.IOUtils;
  181. import org.apache.hadoop.io.SequenceFile;
  182. import org.apache.hadoop.io.SequenceFile.CompressionType;
  183. import org.apache.hadoop.io.Text;
  184. import org.apache.hadoop.io.Writable;
  185. import org.apache.hadoop.io.compress.CompressionCodec;
  186. import org.apache.hadoop.io.compress.DefaultCodec;
  187. import org.apache.hadoop.mapred.FileInputFormat;
  188. import org.apache.hadoop.mapred.FileOutputFormat;
  189. import org.apache.hadoop.mapred.FileSplit;
  190. import org.apache.hadoop.mapred.InputFormat;
  191. import org.apache.hadoop.mapred.InputSplit;
  192. import org.apache.hadoop.mapred.JobConf;
  193. import org.apache.hadoop.mapred.RecordReader;
  194. import org.apache.hadoop.mapred.Reporter;
  195. import org.apache.hadoop.mapred.SequenceFileInputFormat;
  196. import org.apache.hadoop.mapred.SequenceFileOutputFormat;
  197. import org.apache.hadoop.mapred.TextInputFormat;
  198. import org.apache.hadoop.security.Credentials;
  199. import org.apache.hadoop.security.UserGroupInformation;
  200. import org.apache.hadoop.security.alias.CredentialProviderFactory;
  201. import org.apache.hadoop.util.Progressable;
  202. import org.apache.hive.common.util.ACLConfigurationParser;
  203. import org.apache.hive.common.util.ReflectionUtil;
  204. import org.slf4j.Logger;
  205. import org.slf4j.LoggerFactory;
  206. import com.esotericsoftware.kryo.Kryo;
  207. import com.google.common.annotations.VisibleForTesting;
  208. import com.google.common.base.Preconditions;
  209. import com.google.common.collect.Maps;
  210. import com.google.common.util.concurrent.MoreExecutors;
  211. import com.google.common.util.concurrent.ThreadFactoryBuilder;
  212. /**
  213. * Utilities.
  214. *
  215. */
  216. @SuppressWarnings({ "nls", "deprecation" })
  217. public final class Utilities {
  218. /**
  219. * Mapper used to serialize/deserialize JSON objects.
  220. */
  221. public static final ObjectMapper JSON_MAPPER = new ObjectMapper();
  222. /**
  223. * A logger mostly used to trace-log the details of Hive table file operations. Filtering the
  224. * logs for FileOperations (with trace logs present) allows one to debug what Hive has done with
  225. * various files and directories while committing writes, as well as reading.
  226. */
  227. public static final Logger FILE_OP_LOGGER = LoggerFactory.getLogger("FileOperations");
  228. public static final Logger LOGGER = LoggerFactory.getLogger(Utilities.class);
  229. /**
  230. * The objects in the reducer are composed of these top-level fields.
  231. */
  232. public static final String HADOOP_LOCAL_FS = "file:///";
  233. public static final String HADOOP_LOCAL_FS_SCHEME = "file";
  234. public static final String MAP_PLAN_NAME = "map.xml";
  235. public static final String REDUCE_PLAN_NAME = "reduce.xml";
  236. public static final String MERGE_PLAN_NAME = "merge.xml";
  237. public static final String INPUT_NAME = "iocontext.input.name";
  238. public static final String HAS_MAP_WORK = "has.map.work";
  239. public static final String HAS_REDUCE_WORK = "has.reduce.work";
  240. public static final String MAPRED_MAPPER_CLASS = "mapred.mapper.class";
  241. public static final String MAPRED_REDUCER_CLASS = "mapred.reducer.class";
  242. public static final String HIVE_ADDED_JARS = "hive.added.jars";
  243. public static final String VECTOR_MODE = "VECTOR_MODE";
  244. public static final String USE_VECTORIZED_INPUT_FILE_FORMAT = "USE_VECTORIZED_INPUT_FILE_FORMAT";
  245. public static final String MAPNAME = "Map ";
  246. public static final String REDUCENAME = "Reducer ";
  247. public static final String ENSURE_OPERATORS_EXECUTED = "ENSURE_OPERATORS_EXECUTED";
  248. @Deprecated
  249. protected static final String DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX = "mapred.dfsclient.parallelism.max";
  250. // all common whitespaces as defined in Character.isWhitespace(char)
  251. // Used primarily as a workaround until TEXT-175 is released
  252. public static final char[] COMMON_WHITESPACE_CHARS =
  253. { '\t', '\n', '\u000B', '\f', '\r', '\u001C', '\u001D', '\u001E', '\u001F', ' ' };
  254. private static final Object INPUT_SUMMARY_LOCK = new Object();
  255. private static final Object ROOT_HDFS_DIR_LOCK = new Object();
  256. @FunctionalInterface
  257. public interface SupplierWithCheckedException<T, X extends Exception> {
  258. T get() throws X;
  259. }
  260. /**
  261. * ReduceField:
  262. * KEY: record key
  263. * VALUE: record value
  264. */
  265. public static enum ReduceField {
  266. KEY(0), VALUE(1);
  267. int position;
  268. ReduceField(int position) {
  269. this.position = position;
  270. };
  271. };
  272. public static List<String> reduceFieldNameList;
  273. static {
  274. reduceFieldNameList = new ArrayList<String>();
  275. for (ReduceField r : ReduceField.values()) {
  276. reduceFieldNameList.add(r.toString());
  277. }
  278. }
  279. public static String removeValueTag(String column) {
  280. if (column.startsWith(ReduceField.VALUE + ".")) {
  281. return column.substring(6);
  282. }
  283. return column;
  284. }
  285. private Utilities() {
  286. // prevent instantiation
  287. }
  288. private static GlobalWorkMapFactory gWorkMap = new GlobalWorkMapFactory();
  289. private static final String CLASS_NAME = Utilities.class.getName();
  290. private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME);
  291. public static void clearWork(Configuration conf) {
  292. Path mapPath = getPlanPath(conf, MAP_PLAN_NAME);
  293. Path reducePath = getPlanPath(conf, REDUCE_PLAN_NAME);
  294. // if the plan path hasn't been initialized just return, nothing to clean.
  295. if (mapPath == null && reducePath == null) {
  296. return;
  297. }
  298. try {
  299. if (!HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  300. FileSystem fs = mapPath.getFileSystem(conf);
  301. try {
  302. fs.delete(mapPath, true);
  303. } catch (FileNotFoundException e) {
  304. // delete if exists, don't panic if it doesn't
  305. }
  306. try {
  307. fs.delete(reducePath, true);
  308. } catch (FileNotFoundException e) {
  309. // delete if exists, don't panic if it doesn't
  310. }
  311. }
  312. } catch (Exception e) {
  313. LOG.warn("Failed to clean-up tmp directories.", e);
  314. } finally {
  315. // in cases where a single process works with multiple plans, we must clear
  316. // the cache before working with the next plan.
  317. clearWorkMapForConf(conf);
  318. }
  319. }
  320. public static MapredWork getMapRedWork(Configuration conf) {
  321. MapredWork w = new MapredWork();
  322. w.setMapWork(getMapWork(conf));
  323. w.setReduceWork(getReduceWork(conf));
  324. return w;
  325. }
  326. public static void cacheMapWork(Configuration conf, MapWork work, Path hiveScratchDir) {
  327. cacheBaseWork(conf, MAP_PLAN_NAME, work, hiveScratchDir);
  328. }
  329. public static void setMapWork(Configuration conf, MapWork work) {
  330. setBaseWork(conf, MAP_PLAN_NAME, work);
  331. }
  332. public static MapWork getMapWork(Configuration conf) {
  333. if (!conf.getBoolean(HAS_MAP_WORK, false)) {
  334. return null;
  335. }
  336. return (MapWork) getBaseWork(conf, MAP_PLAN_NAME);
  337. }
  338. public static void setReduceWork(Configuration conf, ReduceWork work) {
  339. setBaseWork(conf, REDUCE_PLAN_NAME, work);
  340. }
  341. public static ReduceWork getReduceWork(Configuration conf) {
  342. if (!conf.getBoolean(HAS_REDUCE_WORK, false)) {
  343. return null;
  344. }
  345. return (ReduceWork) getBaseWork(conf, REDUCE_PLAN_NAME);
  346. }
  347. public static Path setMergeWork(JobConf conf, MergeJoinWork mergeJoinWork, Path mrScratchDir,
  348. boolean useCache) {
  349. for (BaseWork baseWork : mergeJoinWork.getBaseWorkList()) {
  350. setBaseWork(conf, baseWork, mrScratchDir, baseWork.getName() + MERGE_PLAN_NAME, useCache);
  351. String prefixes = conf.get(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES);
  352. if (prefixes == null) {
  353. prefixes = baseWork.getName();
  354. } else {
  355. prefixes = prefixes + "," + baseWork.getName();
  356. }
  357. conf.set(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES, prefixes);
  358. }
  359. // nothing to return
  360. return null;
  361. }
  362. public static BaseWork getMergeWork(Configuration jconf) {
  363. String currentMergePrefix = jconf.get(DagUtils.TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX);
  364. if (StringUtils.isEmpty(currentMergePrefix)) {
  365. return null;
  366. }
  367. return getMergeWork(jconf, jconf.get(DagUtils.TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX));
  368. }
  369. public static BaseWork getMergeWork(Configuration jconf, String prefix) {
  370. if (StringUtils.isEmpty(prefix)) {
  371. return null;
  372. }
  373. return getBaseWork(jconf, prefix + MERGE_PLAN_NAME);
  374. }
  375. public static void cacheBaseWork(Configuration conf, String name, BaseWork work,
  376. Path hiveScratchDir) {
  377. try {
  378. setPlanPath(conf, hiveScratchDir);
  379. setBaseWork(conf, name, work);
  380. } catch (IOException e) {
  381. LOG.error("Failed to cache plan", e);
  382. throw new RuntimeException(e);
  383. }
  384. }
  385. /**
  386. * Pushes work into the global work map
  387. */
  388. private static void setBaseWork(Configuration conf, String name, BaseWork work) {
  389. Path path = getPlanPath(conf, name);
  390. setHasWork(conf, name);
  391. gWorkMap.get(conf).put(path, work);
  392. }
  393. /**
  394. * Returns the Map or Reduce plan
  395. * Side effect: the BaseWork returned is also placed in the gWorkMap
  396. * @param conf
  397. * @param name
  398. * @return the BaseWork for the supplied name; returns null if name is null
  399. * @throws RuntimeException if the configuration files are not proper or if the plan cannot be loaded
  400. */
  401. private static BaseWork getBaseWork(Configuration conf, String name) {
  402. Path path = getPlanPath(conf, name);
  403. LOG.debug("PLAN PATH = {}", path);
  404. if (path == null) { // Map/reduce plan may not be generated
  405. return null;
  406. }
  407. BaseWork gWork = gWorkMap.get(conf).get(path);
  408. if (gWork != null) {
  409. LOG.debug("Found plan in cache for name: {}", name);
  410. return gWork;
  411. }
  412. InputStream in = null;
  413. Kryo kryo = SerializationUtilities.borrowKryo();
  414. try {
  415. String engine = HiveConf.getVar(conf, ConfVars.HIVE_EXECUTION_ENGINE);
  416. if (engine.equals("spark")) {
  417. // TODO Add jar into current thread context classloader as it may be invoked by Spark driver inside
  418. // threads, should be unnecessary while SPARK-5377 is resolved.
  419. String addedJars = conf.get(HIVE_ADDED_JARS);
  420. if (StringUtils.isNotEmpty(addedJars)) {
  421. AddToClassPathAction addAction = new AddToClassPathAction(
  422. Thread.currentThread().getContextClassLoader(), Arrays.asList(addedJars.split(";"))
  423. );
  424. ClassLoader newLoader = AccessController.doPrivileged(addAction);
  425. Thread.currentThread().setContextClassLoader(newLoader);
  426. kryo.setClassLoader(newLoader);
  427. }
  428. }
  429. Path localPath = path;
  430. LOG.debug("local path = {}", localPath);
  431. final long serializedSize;
  432. final String planMode;
  433. if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  434. String planStringPath = path.toUri().getPath();
  435. LOG.debug("Loading plan from string: {}", planStringPath);
  436. String planString = conf.getRaw(planStringPath);
  437. if (planString == null) {
  438. LOG.info("Could not find plan string in conf");
  439. return null;
  440. }
  441. serializedSize = planString.length();
  442. planMode = "RPC";
  443. byte[] planBytes = Base64.getDecoder().decode(planString);
  444. in = new ByteArrayInputStream(planBytes);
  445. in = new InflaterInputStream(in);
  446. } else {
  447. LOG.debug("Open file to read in plan: {}", localPath);
  448. FileSystem fs = localPath.getFileSystem(conf);
  449. in = fs.open(localPath);
  450. serializedSize = fs.getFileStatus(localPath).getLen();
  451. planMode = "FILE";
  452. }
  453. if(MAP_PLAN_NAME.equals(name)){
  454. if (ExecMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))){
  455. gWork = SerializationUtilities.deserializePlan(kryo, in, MapWork.class);
  456. } else if(MergeFileMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  457. gWork = SerializationUtilities.deserializePlan(kryo, in, MergeFileWork.class);
  458. } else if(ColumnTruncateMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  459. gWork = SerializationUtilities.deserializePlan(kryo, in, ColumnTruncateWork.class);
  460. } else {
  461. throw new RuntimeException("Unable to determine work from configuration. "
  462. + MAPRED_MAPPER_CLASS + " was " + conf.get(MAPRED_MAPPER_CLASS));
  463. }
  464. } else if (REDUCE_PLAN_NAME.equals(name)) {
  465. if(ExecReducer.class.getName().equals(conf.get(MAPRED_REDUCER_CLASS))) {
  466. gWork = SerializationUtilities.deserializePlan(kryo, in, ReduceWork.class);
  467. } else {
  468. throw new RuntimeException("Unable to determine work from configuration. "
  469. + MAPRED_REDUCER_CLASS + " was " + conf.get(MAPRED_REDUCER_CLASS));
  470. }
  471. } else if (name.contains(MERGE_PLAN_NAME)) {
  472. if (name.startsWith(MAPNAME)) {
  473. gWork = SerializationUtilities.deserializePlan(kryo, in, MapWork.class);
  474. } else if (name.startsWith(REDUCENAME)) {
  475. gWork = SerializationUtilities.deserializePlan(kryo, in, ReduceWork.class);
  476. } else {
  477. throw new RuntimeException("Unknown work type: " + name);
  478. }
  479. }
  480. LOG.info("Deserialized plan (via {}) - name: {} size: {}", planMode,
  481. gWork.getName(), humanReadableByteCount(serializedSize));
  482. gWorkMap.get(conf).put(path, gWork);
  483. return gWork;
  484. } catch (FileNotFoundException fnf) {
  485. // happens. e.g.: no reduce work.
  486. LOG.debug("No plan file found: {}", path, fnf);
  487. return null;
  488. } catch (Exception e) {
  489. String msg = "Failed to load plan: " + path;
  490. LOG.error(msg, e);
  491. throw new RuntimeException(msg, e);
  492. } finally {
  493. SerializationUtilities.releaseKryo(kryo);
  494. IOUtils.closeStream(in);
  495. }
  496. }
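// Illustrative usage (a minimal sketch; jobConf is an assumed JobConf that was populated by
// setMapRedWork()/setBaseWork() above). It shows how task-side code typically retrieves its
// cached plan through the public wrappers rather than calling getBaseWork directly:
//
//   MapWork mapWork = Utilities.getMapWork(jobConf);          // null unless HAS_MAP_WORK was set
//   ReduceWork reduceWork = Utilities.getReduceWork(jobConf); // null unless HAS_REDUCE_WORK was set
//   // both calls also place the deserialized plan in gWorkMap, so repeated lookups hit the
//   // in-process cache instead of re-reading the plan file or conf string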
  497. private static void setHasWork(Configuration conf, String name) {
  498. if (MAP_PLAN_NAME.equals(name)) {
  499. conf.setBoolean(HAS_MAP_WORK, true);
  500. } else if (REDUCE_PLAN_NAME.equals(name)) {
  501. conf.setBoolean(HAS_REDUCE_WORK, true);
  502. }
  503. }
  504. public static List<String> getFieldSchemaString(List<FieldSchema> fl) {
  505. if (fl == null) {
  506. return null;
  507. }
  508. ArrayList<String> ret = new ArrayList<String>();
  509. for (FieldSchema f : fl) {
  510. ret.add(f.getName() + " " + f.getType()
  511. + (f.getComment() != null ? (" " + f.getComment()) : ""));
  512. }
  513. return ret;
  514. }
  515. public static void setMapRedWork(Configuration conf, MapredWork w, Path hiveScratchDir) {
  516. String useName = conf.get(INPUT_NAME);
  517. if (useName == null) {
  518. useName = "mapreduce:" + hiveScratchDir;
  519. }
  520. conf.set(INPUT_NAME, useName);
  521. setMapWork(conf, w.getMapWork(), hiveScratchDir, true);
  522. if (w.getReduceWork() != null) {
  523. conf.set(INPUT_NAME, useName);
  524. setReduceWork(conf, w.getReduceWork(), hiveScratchDir, true);
  525. }
  526. }
  527. public static Path setMapWork(Configuration conf, MapWork w, Path hiveScratchDir, boolean useCache) {
  528. return setBaseWork(conf, w, hiveScratchDir, MAP_PLAN_NAME, useCache);
  529. }
  530. public static Path setReduceWork(Configuration conf, ReduceWork w, Path hiveScratchDir, boolean useCache) {
  531. return setBaseWork(conf, w, hiveScratchDir, REDUCE_PLAN_NAME, useCache);
  532. }
  533. private static Path setBaseWork(Configuration conf, BaseWork w, Path hiveScratchDir, String name, boolean useCache) {
  534. Kryo kryo = SerializationUtilities.borrowKryo(conf);
  535. try {
  536. setPlanPath(conf, hiveScratchDir);
  537. Path planPath = getPlanPath(conf, name);
  538. setHasWork(conf, name);
  539. OutputStream out = null;
  540. final long serializedSize;
  541. final String planMode;
  542. if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  543. // add it to the conf
  544. ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
  545. try {
  546. out = new DeflaterOutputStream(byteOut, new Deflater(Deflater.BEST_SPEED));
  547. SerializationUtilities.serializePlan(kryo, w, out);
  548. out.close();
  549. out = null;
  550. } finally {
  551. IOUtils.closeStream(out);
  552. }
  553. final String serializedPlan = Base64.getEncoder().encodeToString(byteOut.toByteArray());
  554. serializedSize = serializedPlan.length();
  555. planMode = "RPC";
  556. conf.set(planPath.toUri().getPath(), serializedPlan);
  557. } else {
  558. // use the default file system of the conf
  559. FileSystem fs = planPath.getFileSystem(conf);
  560. try {
  561. out = fs.create(planPath);
  562. SerializationUtilities.serializePlan(kryo, w, out);
  563. out.close();
  564. out = null;
  565. long fileLen = fs.getFileStatus(planPath).getLen();
  566. serializedSize = fileLen;
  567. planMode = "FILE";
  568. } finally {
  569. IOUtils.closeStream(out);
  570. }
  571. // Serialize the plan to the default hdfs instance
  572. // Except for hadoop local mode execution where we should be
  573. // able to get the plan directly from the cache
  574. if (useCache && !ShimLoader.getHadoopShims().isLocalMode(conf)) {
  575. // Set up distributed cache
  576. if (!DistributedCache.getSymlink(conf)) {
  577. DistributedCache.createSymlink(conf);
  578. }
  579. String uriWithLink = planPath.toUri().toString() + "#" + name;
  580. DistributedCache.addCacheFile(new URI(uriWithLink), conf);
  581. // set replication of the plan file to a high number. we use the same
  582. // replication factor as used by the hadoop jobclient for job.xml etc.
  583. short replication = (short) conf.getInt("mapred.submit.replication", 10);
  584. fs.setReplication(planPath, replication);
  585. }
  586. }
  587. LOG.info("Serialized plan (via {}) - name: {} size: {}", planMode, w.getName(),
  588. humanReadableByteCount(serializedSize));
  589. // Cache the plan in this process
  590. gWorkMap.get(conf).put(planPath, w);
  591. return planPath;
  592. } catch (Exception e) {
  593. String msg = "Error caching " + name;
  594. LOG.error(msg, e);
  595. throw new RuntimeException(msg, e);
  596. } finally {
  597. SerializationUtilities.releaseKryo(kryo);
  598. }
  599. }
  600. private static Path getPlanPath(Configuration conf, String name) {
  601. Path planPath = getPlanPath(conf);
  602. if (planPath == null) {
  603. return null;
  604. }
  605. return new Path(planPath, name);
  606. }
  607. private static void setPlanPath(Configuration conf, Path hiveScratchDir) throws IOException {
  608. if (getPlanPath(conf) == null) {
  609. // this is the unique conf ID, which is kept in JobConf as part of the plan file name
  610. String jobID = UUID.randomUUID().toString();
  611. Path planPath = new Path(hiveScratchDir, jobID);
  612. if (!HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  613. FileSystem fs = planPath.getFileSystem(conf);
  614. // when the plan is shipped via RPC, creating a directory is unnecessary; only create it for file-based plans
  615. fs.mkdirs(planPath);
  616. }
  617. HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, planPath.toUri().toString());
  618. }
  619. }
  620. public static Path getPlanPath(Configuration conf) {
  621. String plan = HiveConf.getVar(conf, HiveConf.ConfVars.PLAN);
  622. if (plan != null && !plan.isEmpty()) {
  623. return new Path(plan);
  624. }
  625. return null;
  626. }
  627. public static class CollectionPersistenceDelegate extends DefaultPersistenceDelegate {
  628. @Override
  629. protected Expression instantiate(Object oldInstance, Encoder out) {
  630. return new Expression(oldInstance, oldInstance.getClass(), "new", null);
  631. }
  632. @Override
  633. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  634. Iterator<?> ite = ((Collection<?>) oldInstance).iterator();
  635. while (ite.hasNext()) {
  636. out.writeStatement(new Statement(oldInstance, "add", new Object[] {ite.next()}));
  637. }
  638. }
  639. }
  640. @VisibleForTesting
  641. public static TableDesc defaultTd;
  642. static {
  643. // by default we expect ^A separated strings
  644. // This tableDesc does not provide column names. We should always use
  645. // PlanUtils.getDefaultTableDesc(String separatorCode, String columns)
  646. // or getBinarySortableTableDesc(List<FieldSchema> fieldSchemas) when
  647. // we know the column names.
  648. /**
  649. * Generate the table descriptor of MetadataTypedColumnsetSerDe with the
  650. * separatorCode. MetaDataTypedColumnsetSerDe is used because LazySimpleSerDe
  651. * does not support a table with a single column "col" with type
  652. * "array<string>".
  653. */
  654. defaultTd = new TableDesc(TextInputFormat.class, IgnoreKeyTextOutputFormat.class,
  655. Utilities.makeProperties(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT,
  656. "" + Utilities.ctrlaCode, serdeConstants.SERIALIZATION_LIB,
  657. MetadataTypedColumnsetSerDe.class.getName()));
  658. }
  659. public static final int carriageReturnCode = 13;
  660. public static final int newLineCode = 10;
  661. public static final int tabCode = 9;
  662. public static final int ctrlaCode = 1;
  663. public static final String INDENT = " ";
  664. // Note: When DDL supports specifying what string to represent null,
  665. // we should specify "NULL" to represent null in the temp table, and then
  666. // we can make the following translation deprecated.
  667. public static final String nullStringStorage = "\\N";
  668. public static final String nullStringOutput = "NULL";
  669. /**
  670. * Gets the task id if we are running as a Hadoop job. Gets a random number otherwise.
  671. */
  672. public static String getTaskId(Configuration hconf) {
  673. String taskid = (hconf == null) ? null : hconf.get("mapred.task.id");
  674. if (StringUtils.isEmpty(taskid)) {
  675. return (Integer
  676. .toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE)));
  677. } else {
  678. /*
  679. * extract the task and attempt id from the hadoop taskid. in version 17 the leading component
  680. * was 'task_'. thereafter the leading component is 'attempt_'. in 17 - hadoop also seems to
  681. * have used _map_ and _reduce_ to denote map/reduce task types
  682. */
  683. String ret = taskid.replaceAll(".*_[mr]_", "").replaceAll(".*_(map|reduce)_", "");
  684. return (ret);
  685. }
  686. }
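// Illustrative behaviour (a minimal sketch; the attempt id below is a made-up example in the
// standard Hadoop "attempt_..." format, showing what the replaceAll calls above strip off):
//
//   Utilities.getTaskId(conf)  with mapred.task.id = "attempt_200707121733_0003_m_000005_0"
//     -> "000005_0"
//   Utilities.getTaskId(null)  -> a random non-negative integer rendered as a String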
  687. public static Properties makeProperties(String... olist) {
  688. Properties ret = new Properties();
  689. for (int i = 0; i < olist.length; i += 2) {
  690. ret.setProperty(olist[i], olist[i + 1]);
  691. }
  692. return (ret);
  693. }
  694. public static ArrayList makeList(Object... olist) {
  695. ArrayList ret = new ArrayList();
  696. for (Object element : olist) {
  697. ret.add(element);
  698. }
  699. return (ret);
  700. }
  701. public static TableDesc getTableDesc(Table tbl) {
  702. Properties props = tbl.getMetadata();
  703. props.put(serdeConstants.SERIALIZATION_LIB, tbl.getDeserializer().getClass().getName());
  704. if (tbl.getMetaTable() != null) {
  705. props.put("metaTable", tbl.getMetaTable());
  706. }
  707. return (new TableDesc(tbl.getInputFormatClass(), tbl
  708. .getOutputFormatClass(), props));
  709. }
  710. // column names and column types are all delimited by comma
  711. public static TableDesc getTableDesc(String cols, String colTypes) {
  712. Properties properties = new Properties();
  713. properties.put(serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode);
  714. properties.put(serdeConstants.LIST_COLUMNS, cols);
  715. properties.put(serdeConstants.LIST_COLUMN_TYPES, colTypes);
  716. properties.put(serdeConstants.SERIALIZATION_LIB, LazySimpleSerDe.class.getName());
  717. properties.put(hive_metastoreConstants.TABLE_BUCKETING_VERSION, "-1");
  718. return (new TableDesc(SequenceFileInputFormat.class,
  719. HiveSequenceFileOutputFormat.class, properties));
  720. }
  721. public static PartitionDesc getPartitionDesc(Partition part, TableDesc tableDesc) throws
  722. HiveException {
  723. return new PartitionDesc(part, tableDesc);
  724. }
  725. public static PartitionDesc getPartitionDescFromTableDesc(TableDesc tblDesc, Partition part,
  726. boolean usePartSchemaProperties) throws HiveException {
  727. return new PartitionDesc(part, tblDesc, usePartSchemaProperties);
  728. }
  729. private static boolean isWhitespace(int c) {
  730. if (c == -1) {
  731. return false;
  732. }
  733. return Character.isWhitespace((char) c);
  734. }
  735. public static boolean contentsEqual(InputStream is1, InputStream is2, boolean ignoreWhitespace)
  736. throws IOException {
  737. try {
  738. if ((is1 == is2) || (is1 == null && is2 == null)) {
  739. return true;
  740. }
  741. if (is1 == null || is2 == null) {
  742. return false;
  743. }
  744. while (true) {
  745. int c1 = is1.read();
  746. while (ignoreWhitespace && isWhitespace(c1)) {
  747. c1 = is1.read();
  748. }
  749. int c2 = is2.read();
  750. while (ignoreWhitespace && isWhitespace(c2)) {
  751. c2 = is2.read();
  752. }
  753. if (c1 == -1 && c2 == -1) {
  754. return true;
  755. }
  756. if (c1 != c2) {
  757. break;
  758. }
  759. }
  760. } catch (FileNotFoundException e) {
  761. LOG.warn("Could not compare files. One or both cannot be found", e);
  762. }
  763. return false;
  764. }
  765. /**
  766. * convert "From src insert blah blah" to "From src insert ... blah"
  767. */
  768. public static String abbreviate(String str, int max) {
  769. str = str.trim();
  770. int len = str.length();
  771. int suffixlength = 20;
  772. if (len <= max) {
  773. return str;
  774. }
  775. suffixlength = Math.min(suffixlength, (max - 3) / 2);
  776. String rev = StringUtils.reverse(str);
  777. // get the last few words
  778. String suffix = StringUtils.abbreviate(rev, suffixlength);
  779. suffix = StringUtils.reverse(suffix);
  780. // first few ..
  781. String prefix = StringUtils.abbreviate(str, max - suffix.length());
  782. return prefix + suffix;
  783. }
  784. public static final String NSTR = "";
  785. /**
  786. * StreamStatus.
  787. *
  788. */
  789. public static enum StreamStatus {
  790. EOF, TERMINATED
  791. }
  792. public static StreamStatus readColumn(DataInput in, OutputStream out) throws IOException {
  793. while (true) {
  794. int b;
  795. try {
  796. b = in.readByte();
  797. } catch (EOFException e) {
  798. return StreamStatus.EOF;
  799. }
  800. if (b == Utilities.newLineCode) {
  801. return StreamStatus.TERMINATED;
  802. }
  803. out.write(b);
  804. }
  805. // Unreachable
  806. }
  807. /**
  808. * Convert an output stream to a compressed output stream based on the codecs in the Job
  809. * Configuration. The caller specifies directly whether the file is compressed or not.
  810. *
  811. * @param jc
  812. * Job Configuration
  813. * @param out
  814. * Output Stream to be converted into compressed output stream
  815. * @param isCompressed
  816. * whether the output stream needs to be compressed or not
  817. * @return compressed output stream
  818. */
  819. public static OutputStream createCompressedStream(JobConf jc, OutputStream out,
  820. boolean isCompressed) throws IOException {
  821. if (isCompressed) {
  822. Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
  823. DefaultCodec.class);
  824. CompressionCodec codec = ReflectionUtil.newInstance(codecClass, jc);
  825. return codec.createOutputStream(out);
  826. } else {
  827. return (out);
  828. }
  829. }
  830. /**
  831. * Based on compression option, output format, and configured output codec -
  832. * get extension for output file. Text files require an extension, whereas
  833. * others, like sequence files, do not.
  834. * <p>
  835. * The property <code>hive.output.file.extension</code> is used to determine
  836. * the extension - if set, it will override other logic for choosing an
  837. * extension.
  838. *
  839. * @param jc
  840. * Job Configuration
  841. * @param isCompressed
  842. * Whether the output file is compressed or not
  843. * @param hiveOutputFormat
  844. * The output format, used to detect if the format is text
  845. * @return the required file extension (example: .gz)
  846. */
  847. public static String getFileExtension(JobConf jc, boolean isCompressed,
  848. HiveOutputFormat<?, ?> hiveOutputFormat) {
  849. String extension = HiveConf.getVar(jc, HiveConf.ConfVars.OUTPUT_FILE_EXTENSION);
  850. if (!StringUtils.isEmpty(extension)) {
  851. return extension;
  852. }
  853. if ((hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) && isCompressed) {
  854. Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
  855. DefaultCodec.class);
  856. CompressionCodec codec = ReflectionUtil.newInstance(codecClass, jc);
  857. return codec.getDefaultExtension();
  858. }
  859. return StringUtils.EMPTY;
  860. }
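// Illustrative usage (a minimal sketch; jc is an assumed JobConf with no explicit output codec,
// so Hadoop's DefaultCodec applies, and outFormat is a text format such as
// HiveIgnoreKeyTextOutputFormat):
//
//   String ext = Utilities.getFileExtension(jc, true, outFormat);  // ".deflate" for DefaultCodec
//   // if hive.output.file.extension is set in jc, that value is returned instead, overriding
//   // the codec-derived extension; non-text formats and uncompressed output return ""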
  861. /**
  862. * Create a sequencefile output stream based on the job configuration. Uses the user-supplied
  863. * compression flag (rather than obtaining it from the Job Configuration).
  864. *
  865. * @param jc
  866. * Job configuration
  867. * @param fs
  868. * File System to create file in
  869. * @param file
  870. * Path to be created
  871. * @param keyClass
  872. * Java Class for key
  873. * @param valClass
  874. * Java Class for value
  875. * @return output stream over the created sequencefile
  876. */
  877. public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
  878. Class<?> keyClass, Class<?> valClass, boolean isCompressed, Progressable progressable)
  879. throws IOException {
  880. CompressionCodec codec = null;
  881. CompressionType compressionType = CompressionType.NONE;
  882. Class<? extends CompressionCodec> codecClass = null;
  883. if (isCompressed) {
  884. compressionType = SequenceFileOutputFormat.getOutputCompressionType(jc);
  885. codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
  886. codec = ReflectionUtil.newInstance(codecClass, jc);
  887. }
  888. return SequenceFile.createWriter(fs, jc, file, keyClass, valClass, compressionType, codec,
  889. progressable);
  890. }
  891. /**
  892. * Create an RCFile output stream based on the job configuration. Uses the user-supplied compression
  893. * flag (rather than obtaining it from the Job Configuration).
  894. *
  895. * @param jc
  896. * Job configuration
  897. * @param fs
  898. * File System to create file in
  899. * @param file
  900. * Path to be created
  901. * @return output stream over the created rcfile
  902. */
  903. public static RCFile.Writer createRCFileWriter(JobConf jc, FileSystem fs, Path file,
  904. boolean isCompressed, Progressable progressable) throws IOException {
  905. CompressionCodec codec = null;
  906. if (isCompressed) {
  907. Class<?> codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
  908. codec = (CompressionCodec) ReflectionUtil.newInstance(codecClass, jc);
  909. }
  910. return new RCFile.Writer(fs, jc, file, progressable, codec);
  911. }
  912. /**
  913. * Shamelessly cloned from GenericOptionsParser.
  914. */
  915. public static String realFile(String newFile, Configuration conf) throws IOException {
  916. Path path = new Path(newFile);
  917. URI pathURI = path.toUri();
  918. FileSystem fs;
  919. if (pathURI.getScheme() == null) {
  920. fs = FileSystem.getLocal(conf);
  921. } else {
  922. fs = path.getFileSystem(conf);
  923. }
  924. if (!fs.exists(path)) {
  925. return null;
  926. }
  927. String file = path.makeQualified(fs).toString();
  928. return file;
  929. }
  930. public static List<String> mergeUniqElems(List<String> src, List<String> dest) {
  931. if (dest == null) {
  932. return src;
  933. }
  934. if (src == null) {
  935. return dest;
  936. }
  937. int pos = 0;
  938. while (pos < dest.size()) {
  939. if (!src.contains(dest.get(pos))) {
  940. src.add(dest.get(pos));
  941. }
  942. pos++;
  943. }
  944. return src;
  945. }
  946. private static final String tmpPrefix = "_tmp.";
  947. private static final String taskTmpPrefix = "_task_tmp.";
  948. public static Path toTaskTempPath(Path orig) {
  949. if (orig.getName().indexOf(taskTmpPrefix) == 0) {
  950. return orig;
  951. }
  952. return new Path(orig.getParent(), taskTmpPrefix + orig.getName());
  953. }
  954. public static Path toTempPath(Path orig) {
  955. if (orig.getName().indexOf(tmpPrefix) == 0) {
  956. return orig;
  957. }
  958. return new Path(orig.getParent(), tmpPrefix + orig.getName());
  959. }
  960. /**
  961. * Given a path, convert to a temporary path.
  962. */
  963. public static Path toTempPath(String orig) {
  964. return toTempPath(new Path(orig));
  965. }
  966. /**
  967. * Detect if the supplied file is a temporary path.
  968. */
  969. private static boolean isTempPath(FileStatus file) {
  970. String name = file.getPath().getName();
  971. // in addition to detecting hive temporary files, we also check hadoop
  972. // temporary folders that used to show up in older releases
  973. return (name.startsWith("_task") || name.startsWith(tmpPrefix));
  974. }
  975. /**
  976. * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
  977. * existing file with the same name, the new file's name will be appended with "_1", "_2", etc.
  978. *
  979. * @param fs
  980. * the FileSystem where src and dst are on.
  981. * @param src
  982. * the src directory
  983. * @param dst
  984. * the target directory
  985. * @throws IOException
  986. */
  987. public static void rename(FileSystem fs, Path src, Path dst) throws IOException, HiveException {
  988. if (!fs.rename(src, dst)) {
  989. throw new HiveException("Unable to move: " + src + " to: " + dst);
  990. }
  991. }
  992. private static void moveFileOrDir(FileSystem fs, FileStatus file, Path dst) throws IOException,
  993. HiveException {
  994. Path srcFilePath = file.getPath();
  995. String fileName = srcFilePath.getName();
  996. Path dstFilePath = new Path(dst, fileName);
  997. if (file.isDir()) {
  998. renameOrMoveFiles(fs, srcFilePath, dstFilePath);
  999. } else {
  1000. moveFile(fs, srcFilePath, dst, fileName);
  1001. }
  1002. }
  1003. /**
  1004. * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
  1005. * existing file with the same name, the new file's name will be generated based on the file name.
  1006. * If the file name conforms to the Hive managed file pattern NNNNNN_Y(_copy_YY), then it will create NNNNNN_Y_copy_XX;
  1007. * otherwise it will append _1, _2, ....
  1008. * @param fs
  1009. * the FileSystem where src and dst are on.
  1010. * @param srcFile
  1011. * the src file
  1012. * @param destDir
  1013. * the target directory
  1014. * @param destFileName
  1015. * the target filename
  1016. * @return The final path the file was moved to.
  1017. * @throws IOException, HiveException
  1018. */
  1019. public static Path moveFile(FileSystem fs, Path srcFile, Path destDir, String destFileName)
  1020. throws IOException, HiveException {
  1021. Path dstFilePath = new Path(destDir, destFileName);
  1022. if (fs.exists(dstFilePath)) {
  1023. ParsedOutputFileName parsedFileName = ParsedOutputFileName.parse(destFileName);
  1024. int suffix = 0;
  1025. do {
  1026. suffix++;
  1027. if (parsedFileName.matches()) {
  1028. dstFilePath = new Path(destDir, parsedFileName.makeFilenameWithCopyIndex(suffix));
  1029. } else {
  1030. dstFilePath = new Path(destDir, destFileName + "_" + suffix);
  1031. }
  1032. } while (fs.exists(dstFilePath));
  1033. }
  1034. if (!fs.rename(srcFile, dstFilePath)) {
  1035. throw new HiveException("Unable to move: " + srcFile + " to: " + dstFilePath);
  1036. }
  1037. return dstFilePath;
  1038. }
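// Illustrative behaviour (a minimal sketch based on the javadoc's NNNNNN_Y(_copy_YY) convention;
// exact copy-suffix formatting is delegated to ParsedOutputFileName):
//
//   // destDir already contains "000000_0"
//   Path p = Utilities.moveFile(fs, srcFile, destDir, "000000_0");
//   // p ends in a copy-indexed name such as "000000_0_copy_1"; a name that does not match the
//   // managed-file pattern would instead get "_1", "_2", ... appended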
  1039. /**
  1040. * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
  1041. * existing file with the same name, the new file's name will be generated based on the file name.
  1042. * If the file name conforms to the Hive managed file pattern NNNNNN_Y(_copy_YY), then it will create NNNNNN_Y_copy_XX;
  1043. * otherwise it will append _1, _2, ....
  1044. *
  1045. * @param fs
  1046. * the FileSystem where src and dst are on.
  1047. * @param src
  1048. * the src directory
  1049. * @param dst
  1050. * the target directory
  1051. * @throws IOException
  1052. */
  1053. public static void renameOrMoveFiles(FileSystem fs, Path src, Path dst) throws IOException,
  1054. HiveException {
  1055. if (!fs.exists(dst)) {
  1056. if (!fs.rename(src, dst)) {
  1057. throw new HiveException("Unable to move: " + src + " to: " + dst);
  1058. }
  1059. } else {
  1060. // move file by file
  1061. FileStatus[] files = fs.listStatus(src);
  1062. for (FileStatus file : files) {
  1063. Utilities.moveFileOrDir(fs, file, dst);
  1064. }
  1065. }
  1066. }
  1067. /**
  1068. * Rename src to dst, or in the case dst already exists, move files in src
  1069. * to dst. If there is an existing file with the same name, the new file's
  1070. * name will be appended with "_1", "_2", etc. Happens in parallel mode.
  1071. *
  1072. * @param conf
  1073. *
  1074. * @param fs
  1075. * the FileSystem where src and dst are on.
  1076. * @param src
  1077. * the src directory
  1078. * @param dst
  1079. * the target directory
  1080. * @throws IOException
  1081. */
  1082. public static void renameOrMoveFilesInParallel(Configuration conf,
  1083. FileSystem fs, Path src, Path dst) throws IOException, HiveException {
  1084. if (!fs.exists(dst)) {
  1085. if (!fs.rename(src, dst)) {
  1086. throw new HiveException("Unable to move: " + src + " to: " + dst);
  1087. }
  1088. } else {
  1089. // move files in parallel
  1090. LOG.info("Moving files from {} to {}", src, dst);
  1091. final ExecutorService pool = createMoveThreadPool(conf);
  1092. List<Future<Void>> futures = new LinkedList<>();
  1093. final FileStatus[] files = fs.listStatus(src);
  1094. for (FileStatus file : files) {
  1095. futures.add(pool.submit(new Callable<Void>() {
  1096. @Override
  1097. public Void call() throws HiveException {
  1098. try {
  1099. Utilities.moveFileOrDir(fs, file, dst);
  1100. } catch (Exception e) {
  1101. throw new HiveException(e);
  1102. }
  1103. return null;
  1104. }
  1105. }));
  1106. }
  1107. shutdownAndCleanup(pool, futures);
  1108. LOG.info("Rename files from {} to {} is complete", src, dst);
  1109. }
  1110. }
  1111. public static final String COPY_KEYWORD = "_copy_"; // copy keyword
  1112. /**
  1113. * This breaks a prefixed bucket number into the prefix and the taskID
  1114. */
  1115. private static final Pattern PREFIXED_TASK_ID_REGEX =
  1116. Pattern.compile("^(.*?\\(.*\\))?([0-9]+)$");
  1117. /**
  1118. * This breaks a prefixed bucket number out into a single integer
  1119. */
  1120. private static final Pattern PREFIXED_BUCKET_ID_REGEX =
  1121. Pattern.compile("^(0*([0-9]+))_([0-9]+).*");
  1122. /**
  1123. * Get the task id from the filename. It is assumed that the filename is derived from the output
  1124. * of getTaskId
  1125. *
  1126. * @param filename
  1127. * filename to extract taskid from
  1128. */
  1129. public static String getTaskIdFromFilename(String filename) {
  1130. return getIdFromFilename(filename, false, false);
  1131. }
  1132. /**
  1133. * Get the part-spec + task id from the filename. It is assumed that the filename is derived
  1134. * from the output of getTaskId
  1135. *
  1136. * @param filename
  1137. * filename to extract taskid from
  1138. */
  1139. private static String getPrefixedTaskIdFromFilename(String filename) {
  1140. return getIdFromFilename(filename, true, false);
  1141. }
  1142. private static int getAttemptIdFromFilename(String filename) {
  1143. return Integer.parseInt(getIdFromFilename(filename, true, true));
  1144. }
  1145. private static String getIdFromFilename(String filepath, boolean isPrefixed, boolean isTaskAttempt) {
  1146. String filename = filepath;
  1147. int dirEnd = filepath.lastIndexOf(Path.SEPARATOR);
  1148. if (dirEnd != -1) {
  1149. filename = filepath.substring(dirEnd + 1);
  1150. }
  1151. ParsedOutputFileName parsedOutputFileName = ParsedOutputFileName.parse(filename);
  1152. String taskId;
  1153. if (parsedOutputFileName.matches()) {
  1154. if (isTaskAttempt) {
  1155. taskId = parsedOutputFileName.getAttemptId();
  1156. } else {
  1157. taskId = isPrefixed ? parsedOutputFileName.getPrefixedTaskId() : parsedOutputFileName.getTaskId();
  1158. }
  1159. } else {
  1160. taskId = filename;
  1161. LOG.warn("Unable to get task id from file name: {}. Using last component {}"
  1162. + " as task id.", filepath, taskId);
  1163. }
  1164. if (isTaskAttempt) {
  1165. LOG.debug("TaskAttemptId for {} = {}", filepath, taskId);
  1166. } else {
  1167. LOG.debug("TaskId for {} = {}", filepath, taskId);
  1168. }
  1169. return taskId;
  1170. }
  1171. /**
  1172. * Replace the task id from the filename. It is assumed that the filename is derived from the
  1173. * output of getTaskId
  1174. *
  1175. * @param filename
  1176. * filename to replace taskid "0_0" or "0_0.gz" by 33 to "33_0" or "33_0.gz"
  1177. */
  1178. public static String replaceTaskIdFromFilename(String filename, int bucketNum) {
  1179. return replaceTaskIdFromFilename(filename, String.valueOf(bucketNum));
  1180. }
  1181. public static String replaceTaskIdFromFilename(String filename, String fileId) {
  1182. String taskId = getTaskIdFromFilename(filename);
  1183. String newTaskId = replaceTaskId(taskId, fileId);
  1184. String ret = replaceTaskIdFromFilename(filename, taskId, newTaskId);
  1185. return (ret);
  1186. }
  1187. /**
  1188. * Replace taskId with input bucketNum. For example, if taskId is 000000 and bucketNum is 1,
  1189. * return should be 000001; if taskId is (ds%3D1)000000 and bucketNum is 1, return should be
  1190. * (ds%3D1)000001. This method is different from the replaceTaskId(String, String) method.
  1191. * In this method, the pattern is in taskId.
  1192. * @param taskId
  1193. * @param bucketNum
  1194. * @return
  1195. */
  1196. public static String replaceTaskId(String taskId, int bucketNum) {
  1197. String bucketNumStr = String.valueOf(bucketNum);
  1198. Matcher m = PREFIXED_TASK_ID_REGEX.matcher(taskId);
  1199. if (!m.matches()) {
  1200. LOG.warn("Unable to determine bucket number from task id: {}. Using " +
  1201. "task ID as bucket number.", taskId);
  1202. return adjustBucketNumLen(bucketNumStr, taskId);
  1203. } else {
  1204. String adjustedBucketNum = adjustBucketNumLen(bucketNumStr, m.group(2));
  1205. return (m.group(1) == null ? StringUtils.EMPTY : m.group(1)) + adjustedBucketNum;
  1206. }
  1207. }
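// Illustrative behaviour (a minimal sketch restating the javadoc examples above):
//
//   Utilities.replaceTaskId("000000", 1)          -> "000001"
//   Utilities.replaceTaskId("(ds%3D1)000000", 1)  -> "(ds%3D1)000001"
//   // the bucket number is zero-padded to the length of the numeric task-id portion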
  1208. /**
  1209. * Returns strBucketNum with enough 0's prefixing the task ID portion of the String to make it
  1210. * equal in length to taskId
  1211. *
  1212. * @param taskId - the taskId used as a template for length
  1213. * @param strBucketNum - the bucket number of the output, may or may not be prefixed
  1214. * @return
  1215. */
  1216. private static String replaceTaskId(String taskId, String strBucketNum) {
  1217. Matcher m = PREFIXED_TASK_ID_REGEX.matcher(strBucketNum);
  1218. if (!m.matches()) {
  1219. LOG.warn("Unable to determine bucket number from file ID: {}. Using " +
  1220. "file ID as bucket number.", strBucketNum);
  1221. return adjustBucketNumLen(strBucketNum, taskId);
  1222. } else {
  1223. String adjustedBucketNum = adjustBucketNumLen(m.group(2), taskId);
  1224. return (m.group(1) == null ? StringUtils.EMPTY : m.group(1)) + adjustedBucketNum;
  1225. }
  1226. }
  1227. /**
  1228. * Adds 0's to the beginning of bucketNum until bucketNum and taskId are the same length.
  1229. *
  1230. * @param bucketNum - the bucket number, should not be prefixed
  1231. * @param taskId - the taskId used as a template for length
  1232. * @return
  1233. */
  1234. private static String adjustBucketNumLen(String bucketNum, String taskId) {
  1235. int bucketNumLen = bucketNum.length();
  1236. int taskIdLen = taskId.length();
  1237. StringBuilder s = new StringBuilder();
  1238. for (int i = 0; i < taskIdLen - bucketNumLen; i++) {
  1239. s.append('0');
  1240. }
  1241. s.append(bucketNum);
  1242. return s.toString();
  1243. }
  1244. /**
  1245. * Replace the oldTaskId appearing in the filename by the newTaskId. The string oldTaskId could
  1246. * appear multiple times; we should only replace the last one.
  1247. *
  1248. * @param filename
  1249. * @param oldTaskId
  1250. * @param newTaskId
  1251. * @return
  1252. */
  1253. private static String replaceTaskIdFromFilename(String filename, String oldTaskId,
  1254. String newTaskId) {
  1255. String[] spl = filename.split(oldTaskId);
  1256. if ((spl.length == 0) || (spl.length == 1)) {
  1257. return filename.replaceAll(oldTaskId, newTaskId);
  1258. }
  1259. StringBuilder snew = new StringBuilder();
  1260. for (int idx = 0; idx < spl.length - 1; idx++) {
  1261. if (idx > 0) {
  1262. snew.append(oldTaskId);
  1263. }
  1264. snew.append(spl[idx]);
  1265. }
  1266. snew.append(newTaskId);
  1267. snew.append(spl[spl.length - 1]);
  1268. return snew.toString();
  1269. }
  1270. private static boolean shouldAvoidRename(FileSinkDesc conf, Configuration hConf) {
  1271. // we are avoiding rename/move only if following conditions are met
  1272. // * execution engine is tez
  1273. // * if it is select query
  1274. if (conf != null && conf.getIsQuery() && conf.getFilesToFetch() != null
  1275. && HiveConf.getVar(hConf, ConfVars.HIVE_EXECUTION_ENGINE).equalsIgnoreCase("tez")){
  1276. return true;
  1277. }
  1278. return false;
  1279. }
  1280. /**
  1281. * Returns null if the path does not exist.
  1282. */
  1283. public static FileStatus[] listStatusIfExists(Path path, FileSystem fs) throws IOException {
  1284. try {
  1285. return fs.listStatus(path, FileUtils.HIDDEN_FILES_PATH_FILTER);
  1286. } catch (FileNotFoundException e) {
  1287. // FS in hadoop 2.0 throws FNF instead of returning null
  1288. return null;
  1289. }
  1290. }
  1291. public static void mvFileToFinalPath(Path specPath, Configuration hconf,
  1292. boolean success, Logger log, DynamicPartitionCtx dpCtx, FileSinkDesc conf,
  1293. Reporter reporter) throws IOException,
  1294. HiveException {
  1295. // There are two paths this method can take, based on the value of shouldAvoidRename.
  1296. // shouldAvoidRename indicates whether tmpPath should be renamed/moved or not.
  1297. // if true:
  1298. // Skip renaming/moving the tmpPath
  1299. // Deduplicate and keep a list of files
  1300. // Pass on the list of files to conf (to be used later by the fetch operator)
  1301. // if false:
  1302. // 1) Rename tmpPath to a new directory name to prevent additional files
  1303. // from being added by runaway processes.
  1304. // 2) Remove duplicates from the temp directory
  1305. // 3) Rename/move the temp directory to specPath
  1306. FileSystem fs = specPath.getFileSystem(hconf);
  1307. Path tmpPath = Utilities.toTempPath(specPath);
  1308. Path taskTmpPath = Utilities.toTaskTempPath(specPath);
  1309. PerfLogger perfLogger = SessionState.getPerfLogger();
  1310. boolean isBlobStorage = BlobStorageUtils.isBlobStorageFileSystem(hconf, fs);
  1311. boolean avoidRename = false;
  1312. boolean shouldAvoidRename = shouldAvoidRename(conf, hconf);
1313. if (isBlobStorage && (shouldAvoidRename || ((conf != null) && conf.isCTASorCM()))
1314. || (!isBlobStorage && shouldAvoidRename)) {
  1315. avoidRename = true;
  1316. }
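// Summary of the condition above (comment added for clarity): avoidRename ends up true for
// Tez SELECT queries (shouldAvoidRename) on any file system, and additionally for CTAS/Create-MV
// output on blob storage, where renames are presumed to be expensive.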
  1317. if (success) {
  1318. if (!avoidRename && fs.exists(tmpPath)) {
  1319. // 1) Rename tmpPath to a new directory name to prevent additional files
  1320. // from being added by runaway processes.
  1321. // this is only done for all statements except SELECT, CTAS and Create MV
  1322. Path tmpPathOriginal = tmpPath;
  1323. tmpPath = new Path(tmpPath.getParent(), tmpPath.getName() + ".moved");
  1324. LOG.debug("shouldAvoidRename is false therefore moving/renaming " + tmpPathOriginal + " to " + tmpPath);
  1325. perfLogger.perfLogBegin("FileSinkOperator", "rename");
  1326. Utilities.rename(fs, tmpPathOriginal, tmpPath);
  1327. perfLogger.perfLogEnd("FileSinkOperator", "rename");
  1328. }
  1329. // Remove duplicates from tmpPath
  1330. List<FileStatus> statusList = HiveStatsUtils.getFileStatusRecurse(
  1331. tmpPath, ((dpCtx == null) ? 1 : dpCtx.getNumDPCols()), fs);
  1332. FileStatus[] statuses = statusList.toArray(new FileStatus[statusList.size()]);
  1333. if(statuses != null && statuses.length > 0) {
  1334. Set<FileStatus> filesKept = new HashSet<>();
  1335. perfLogger.perfLogBegin("FileSinkOperator", "RemoveTempOrDuplicateFiles");
  1336. // remove any tmp file or double-committed output files
  1337. List<Path> emptyBuckets = Utilities.removeTempOrDuplicateFiles(
  1338. fs, statuses, dpCtx, conf, hconf, filesKept, false);
  1339. perfLogger.perfLogEnd("FileSinkOperator", "RemoveTempOrDuplicateFiles");
  1340. // create empty buckets if necessary
  1341. if (!emptyBuckets.isEmpty()) {
  1342. perfLogger.perfLogBegin("FileSinkOperator", "CreateEmptyBuckets");
  1343. createEmptyBuckets(
  1344. hconf, emptyBuckets, conf.getCompressed(), conf.getTableInfo(), reporter);
  1345. for(Path p:emptyBuckets) {
  1346. FileStatus[] items = fs.listStatus(p);
  1347. filesKept.addAll(Arrays.asList(items));
  1348. }
  1349. perfLogger.perfLogEnd("FileSinkOperator", "CreateEmptyBuckets");
  1350. }
  1351. // move to the file destination
  1352. Utilities.FILE_OP_LOGGER.trace("Moving tmp dir: {} to: {}", tmpPath, specPath);
  1353. if(shouldAvoidRename(conf, hconf)){
  1354. // for SELECT statements
  1355. LOG.debug("Skipping rename/move files. Files to be kept are: " + filesKept.toString());
  1356. conf.getFilesToFetch().addAll(filesKept);
1357. } else if (conf != null && conf.isCTASorCM() && isBlobStorage) {
  1358. // for CTAS or Create MV statements
  1359. perfLogger.perfLogBegin("FileSinkOperator", "moveSpecifiedFileStatus");
  1360. LOG.debug("CTAS/Create MV: Files being renamed: " + filesKept.toString());
  1361. moveSpecifiedFilesInParallel(hconf, fs, tmpPath, specPath, filesKept);
  1362. perfLogger.perfLogEnd("FileSinkOperator", "moveSpecifiedFileStatus");
  1363. } else {
  1364. // for rest of the statement e.g. INSERT, LOAD etc
  1365. perfLogger.perfLogBegin("FileSinkOperator", "RenameOrMoveFiles");
  1366. LOG.debug("Final renaming/moving. Source: " + tmpPath + " .Destination: " + specPath);
  1367. renameOrMoveFilesInParallel(hconf, fs, tmpPath, specPath);
  1368. perfLogger.perfLogEnd("FileSinkOperator", "RenameOrMoveFiles");
  1369. }
  1370. }
  1371. } else {
  1372. Utilities.FILE_OP_LOGGER.trace("deleting tmpPath {}", tmpPath);
  1373. fs.delete(tmpPath, true);
  1374. }
  1375. Utilities.FILE_OP_LOGGER.trace("deleting taskTmpPath {}", taskTmpPath);
  1376. fs.delete(taskTmpPath, true);
  1377. }
  1378. /**
1379. * Moves the specified files to the destination in parallel.
1380. * Spins up multiple threads, schedules the transfers and shuts down the pool.
  1381. *
  1382. * @param conf
  1383. * @param fs
  1384. * @param srcPath
  1385. * @param destPath
  1386. * @param filesToMove
  1387. * @throws HiveException
  1388. * @throws IOException
  1389. */
  1390. private static void moveSpecifiedFilesInParallel(Configuration conf, FileSystem fs,
  1391. Path srcPath, Path destPath, Set<FileStatus> filesToMove)
  1392. throws HiveException, IOException {
  1393. LOG.info("rename {} files from {} to dest {}",
  1394. filesToMove.size(), srcPath, destPath);
  1395. PerfLogger perfLogger = SessionState.getPerfLogger();
  1396. perfLogger.perfLogBegin("FileSinkOperator", "moveSpecifiedFileStatus");
  1397. final ExecutorService pool = createMoveThreadPool(conf);
  1398. List<Future<Void>> futures = new LinkedList<>();
  1399. moveSpecifiedFilesInParallel(fs, srcPath, destPath, filesToMove, futures, pool);
  1400. shutdownAndCleanup(pool, futures);
  1401. LOG.info("Completed rename from {} to {}", srcPath, destPath);
  1402. perfLogger.perfLogEnd("FileSinkOperator", "moveSpecifiedFileStatus");
  1403. }
  1404. /**
1405. * Moves files from src to dst if they are within the specified set of paths
  1406. * @param fs
  1407. * @param src
  1408. * @param dst
  1409. * @param filesToMove
  1410. * @param futures List of futures
  1411. * @param pool thread pool
  1412. * @throws IOException
  1413. */
  1414. private static void moveSpecifiedFilesInParallel(FileSystem fs,
  1415. Path src, Path dst, Set<FileStatus> filesToMove, List<Future<Void>> futures,
  1416. ExecutorService pool) throws IOException {
  1417. if (!fs.exists(dst)) {
  1418. LOG.info("Creating {}", dst);
  1419. fs.mkdirs(dst);
  1420. }
  1421. FileStatus[] files = fs.listStatus(src);
  1422. for (FileStatus fileStatus : files) {
  1423. if (filesToMove.contains(fileStatus)) {
  1424. futures.add(pool.submit(new Callable<Void>() {
  1425. @Override
  1426. public Void call() throws HiveException {
  1427. try {
  1428. LOG.debug("Moving from {} to {} ", fileStatus.getPath(), dst);
  1429. Utilities.moveFileOrDir(fs, fileStatus, dst);
  1430. } catch (Exception e) {
  1431. throw new HiveException(e);
  1432. }
  1433. return null;
  1434. }
  1435. }));
  1436. } else if (fileStatus.isDir()) {
  1437. // Traverse directory contents.
  1438. // Directory nesting for dst needs to match src.
  1439. Path nestedDstPath = new Path(dst, fileStatus.getPath().getName());
  1440. moveSpecifiedFilesInParallel(fs, fileStatus.getPath(), nestedDstPath,
  1441. filesToMove, futures, pool);
  1442. }
  1443. }
  1444. }
  1445. private static ExecutorService createMoveThreadPool(Configuration conf) {
  1446. int threads = Math.max(conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 15), 1);
  1447. return Executors.newFixedThreadPool(threads,
  1448. new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Move-Thread-%d").build());
  1449. }
  1450. private static void shutdownAndCleanup(ExecutorService pool,
  1451. List<Future<Void>> futures) throws HiveException {
  1452. if (pool == null) {
  1453. return;
  1454. }
  1455. pool.shutdown();
  1456. futures = (futures != null) ? futures : Collections.emptyList();
  1457. for (Future<Void> future : futures) {
  1458. try {
  1459. future.get();
  1460. } catch (InterruptedException | ExecutionException e) {
  1461. LOG.error("Error in moving files to destination", e);
  1462. cancelTasks(futures);
  1463. throw new HiveException(e);
  1464. }
  1465. }
  1466. }
  1467. /**
  1468. * cancel all futures.
  1469. *
  1470. * @param futureList
  1471. */
  1472. private static void cancelTasks(List<Future<Void>> futureList) {
  1473. for (Future future : futureList) {
  1474. future.cancel(true);
  1475. }
  1476. }
  1477. /**
  1478. * Check the existence of buckets according to bucket specification. Create empty buckets if
  1479. * needed.
  1480. *
1481. * @param hconf The job configuration
  1482. * @param paths A list of empty buckets to create
  1483. * @param reporter The mapreduce reporter object
  1484. * @throws HiveException
  1485. * @throws IOException
  1486. */
  1487. static void createEmptyBuckets(Configuration hconf, List<Path> paths,
  1488. boolean isCompressed, TableDesc tableInfo, Reporter reporter)
  1489. throws HiveException, IOException {
  1490. JobConf jc;
  1491. if (hconf instanceof JobConf) {
  1492. jc = new JobConf(hconf);
  1493. } else {
  1494. // test code path
  1495. jc = new JobConf(hconf);
  1496. }
  1497. HiveOutputFormat<?, ?> hiveOutputFormat = null;
  1498. Class<? extends Writable> outputClass = null;
  1499. try {
  1500. AbstractSerDe serde = tableInfo.getSerDeClass().newInstance();
  1501. serde.initialize(hconf, tableInfo.getProperties(), null);
  1502. outputClass = serde.getSerializedClass();
  1503. hiveOutputFormat = HiveFileFormatUtils.getHiveOutputFormat(hconf, tableInfo);
  1504. } catch (SerDeException e) {
  1505. throw new HiveException(e);
  1506. } catch (InstantiationException e) {
  1507. throw new HiveException(e);
  1508. } catch (IllegalAccessException e) {
  1509. throw new HiveException(e);
  1510. }
  1511. for (Path path : paths) {
  1512. Utilities.FILE_OP_LOGGER.trace("creating empty bucket for {}", path);
  1513. RecordWriter writer = hiveOutputFormat.getHiveRecordWriter(jc, path, outputClass, isCompressed,
  1514. tableInfo.getProperties(), reporter);
  1515. writer.close(false);
  1516. LOG.info("created empty bucket for enforcing bucketing at {}", path);
  1517. }
  1518. }
  1519. private static void addFilesToPathSet(Collection<FileStatus> files, Set<FileStatus> fileSet) {
  1520. for (FileStatus file : files) {
  1521. fileSet.add(file);
  1522. }
  1523. }
  1524. /**
  1525. * Remove all temporary files and duplicate (double-committed) files from a given directory.
  1526. */
  1527. public static void removeTempOrDuplicateFiles(FileSystem fs, Path path, Configuration hconf, boolean isBaseDir)
  1528. throws IOException {
  1529. removeTempOrDuplicateFiles(fs, path, null, null, hconf, isBaseDir);
  1530. }
  1531. public static List<Path> removeTempOrDuplicateFiles(FileSystem fs, Path path,
  1532. DynamicPartitionCtx dpCtx, FileSinkDesc conf, Configuration hconf, boolean isBaseDir) throws IOException {
  1533. if (path == null) {
  1534. return null;
  1535. }
  1536. List<FileStatus> statusList = HiveStatsUtils.getFileStatusRecurse(path,
  1537. ((dpCtx == null) ? 1 : dpCtx.getNumDPCols()), fs);
  1538. FileStatus[] stats = statusList.toArray(new FileStatus[statusList.size()]);
  1539. return removeTempOrDuplicateFiles(fs, stats, dpCtx, conf, hconf, isBaseDir);
  1540. }
  1541. private static List<Path> removeTempOrDuplicateFiles(FileSystem fs, FileStatus[] fileStats,
  1542. DynamicPartitionCtx dpCtx, FileSinkDesc conf, Configuration hconf, boolean isBaseDir) throws IOException {
  1543. return removeTempOrDuplicateFiles(fs, fileStats, dpCtx, conf, hconf, null, isBaseDir);
  1544. }
  1545. /**
  1546. * Remove all temporary files and duplicate (double-committed) files from a given directory.
  1547. *
  1548. * @return a list of path names corresponding to should-be-created empty buckets.
  1549. */
  1550. private static List<Path> removeTempOrDuplicateFiles(FileSystem fs, FileStatus[] fileStats,
  1551. DynamicPartitionCtx dpCtx, FileSinkDesc conf, Configuration hconf, Set<FileStatus> filesKept, boolean isBaseDir)
  1552. throws IOException {
  1553. int dpLevels = dpCtx == null ? 0 : dpCtx.getNumDPCols(),
  1554. numBuckets = (conf != null && conf.getTable() != null) ? conf.getTable().getNumBuckets() : 0;
  1555. return removeTempOrDuplicateFiles(
  1556. fs, fileStats, null, dpLevels, numBuckets, hconf, null, 0, false, filesKept, isBaseDir);
  1557. }
  1558. private static FileStatus[] removeEmptyDpDirectory(FileSystem fs, Path path) throws IOException {
  1559. // listStatus is not required to be called to check if we need to delete the directory or not,
  1560. // delete does that internally. We are getting the file list as it is used by the caller.
  1561. FileStatus[] items = fs.listStatus(path);
  1562. // Remove empty directory since DP insert should not generate empty partitions.
  1563. // Empty directories could be generated by crashed Task/ScriptOperator.
  1564. if (items.length == 0) {
  1565. // delete() returns false in only two conditions
  1566. // 1. Tried to delete root
  1567. // 2. The file wasn't actually there (or deleted by some other thread)
  1568. // So return value is not checked for delete.
  1569. fs.delete(path, true);
  1570. }
  1571. return items;
  1572. }
1573. // Returns the list of non-empty sub-directories and deletes the empty ones.
  1574. private static Map<Path, FileStatus[]> getNonEmptySubDirs(FileSystem fs, Configuration hConf, FileStatus[] parts)
  1575. throws IOException {
  1576. int threadCount = hConf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 15);
  1577. final ExecutorService pool = (threadCount <= 0 ? null :
  1578. Executors.newFixedThreadPool(threadCount, new ThreadFactoryBuilder().setDaemon(true).setNameFormat(
  1579. "Remove-Temp-%d").build()));
  1580. Map<Path, FileStatus[]> partStatusMap = new ConcurrentHashMap<>();
  1581. List<Future<Void>> futures = new LinkedList<>();
  1582. for (FileStatus part : parts) {
  1583. Path path = part.getPath();
  1584. if (pool != null) {
  1585. futures.add(pool.submit(() -> {
  1586. FileStatus[] items = removeEmptyDpDirectory(fs, path);
  1587. partStatusMap.put(path, items);
  1588. return null;
  1589. }));
  1590. } else {
  1591. partStatusMap.put(path, removeEmptyDpDirectory(fs, path));
  1592. }
  1593. }
  1594. if (null != pool) {
  1595. pool.shutdown();
  1596. try {
  1597. for (Future<Void> future : futures) {
  1598. future.get();
  1599. }
  1600. } catch (InterruptedException | ExecutionException e) {
  1601. LOG.error("Exception in getting dir status", e);
  1602. for (Future<Void> future : futures) {
  1603. future.cancel(true);
  1604. }
  1605. throw new IOException(e);
  1606. }
  1607. }
  1608. // Dump of its metrics
  1609. LOG.debug("FS {}", fs);
  1610. return partStatusMap;
  1611. }
  1612. public static List<Path> removeTempOrDuplicateFiles(FileSystem fs, FileStatus[] fileStats,
  1613. String unionSuffix, int dpLevels, int numBuckets, Configuration hconf, Long writeId,
  1614. int stmtId, boolean isMmTable, Set<FileStatus> filesKept, boolean isBaseDir) throws IOException {
  1615. if (fileStats == null) {
  1616. return null;
  1617. }
  1618. List<Path> result = new ArrayList<Path>();
  1619. HashMap<String, FileStatus> taskIDToFile = null;
  1620. if (dpLevels > 0) {
  1621. Map<Path, FileStatus[]> partStatusMap = getNonEmptySubDirs(fs, hconf, fileStats);
  1622. for (int i = 0; i < fileStats.length; ++i) {
  1623. Path path = fileStats[i].getPath();
  1624. assert fileStats[i].isDirectory() : "dynamic partition " + path + " is not a directory";
  1625. FileStatus[] items = partStatusMap.get(path);
  1626. if (items.length == 0) {
  1627. fileStats[i] = null;
  1628. continue;
  1629. }
  1630. if (isMmTable) {
  1631. if (!path.getName().equals(AcidUtils.baseOrDeltaSubdir(isBaseDir, writeId, writeId, stmtId))) {
  1632. throw new IOException("Unexpected non-MM directory name " + path);
  1633. }
  1634. Utilities.FILE_OP_LOGGER.trace("removeTempOrDuplicateFiles processing files in MM directory {}", path);
  1635. if (!StringUtils.isEmpty(unionSuffix)) {
  1636. try {
  1637. items = fs.listStatus(new Path(path, unionSuffix));
  1638. } catch (FileNotFoundException e) {
  1639. continue;
  1640. }
  1641. }
  1642. }
  1643. taskIDToFile = removeTempOrDuplicateFilesNonMm(items, fs, hconf);
  1644. if (filesKept != null && taskIDToFile != null) {
  1645. addFilesToPathSet(taskIDToFile.values(), filesKept);
  1646. }
  1647. addBucketFileToResults(taskIDToFile, numBuckets, hconf, result);
  1648. }
  1649. } else if (isMmTable && !StringUtils.isEmpty(unionSuffix)) {
  1650. if (fileStats.length == 0) {
  1651. return result;
  1652. }
  1653. Path mmDir = extractNonDpMmDir(writeId, stmtId, fileStats, isBaseDir);
  1654. taskIDToFile = removeTempOrDuplicateFilesNonMm(
  1655. fs.listStatus(new Path(mmDir, unionSuffix)), fs, hconf);
  1656. if (filesKept != null && taskIDToFile != null) {
  1657. addFilesToPathSet(taskIDToFile.values(), filesKept);
  1658. }
  1659. addBucketFileToResults2(taskIDToFile, numBuckets, hconf, result);
  1660. } else {
  1661. if (fileStats.length == 0) {
  1662. return result;
  1663. }
  1664. if (!isMmTable) {
  1665. taskIDToFile = removeTempOrDuplicateFilesNonMm(fileStats, fs, hconf);
  1666. if (filesKept != null && taskIDToFile != null) {
  1667. addFilesToPathSet(taskIDToFile.values(), filesKept);
  1668. }
  1669. } else {
  1670. Path mmDir = extractNonDpMmDir(writeId, stmtId, fileStats, isBaseDir);
  1671. taskIDToFile = removeTempOrDuplicateFilesNonMm(fs.listStatus(mmDir), fs, hconf);
  1672. if (filesKept != null && taskIDToFile != null) {
  1673. addFilesToPathSet(taskIDToFile.values(), filesKept);
  1674. }
  1675. }
  1676. addBucketFileToResults2(taskIDToFile, numBuckets, hconf, result);
  1677. }
  1678. return result;
  1679. }
  1680. private static Path extractNonDpMmDir(Long writeId, int stmtId, FileStatus[] items, boolean isBaseDir) throws IOException {
  1681. if (items.length > 1) {
  1682. throw new IOException("Unexpected directories for non-DP MM: " + Arrays.toString(items));
  1683. }
  1684. Path mmDir = items[0].getPath();
  1685. if (!mmDir.getName().equals(AcidUtils.baseOrDeltaSubdir(isBaseDir, writeId, writeId, stmtId))) {
  1686. throw new IOException("Unexpected non-MM directory " + mmDir);
  1687. }
  1688. Utilities.FILE_OP_LOGGER.trace("removeTempOrDuplicateFiles processing files in MM directory {}", mmDir);
  1689. return mmDir;
  1690. }
  1691. // TODO: not clear why two if conditions are different. Preserve the existing logic for now.
  1692. private static void addBucketFileToResults2(HashMap<String, FileStatus> taskIDToFile,
  1693. int numBuckets, Configuration hconf, List<Path> result) {
  1694. if (MapUtils.isNotEmpty(taskIDToFile) && (numBuckets > taskIDToFile.size())
  1695. && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) {
  1696. addBucketsToResultsCommon(taskIDToFile, numBuckets, result);
  1697. }
  1698. }
  1699. // TODO: not clear why two if conditions are different. Preserve the existing logic for now.
  1700. private static void addBucketFileToResults(HashMap<String, FileStatus> taskIDToFile,
  1701. int numBuckets, Configuration hconf, List<Path> result) {
  1702. // if the table is bucketed and enforce bucketing, we should check and generate all buckets
  1703. if (numBuckets > 0 && taskIDToFile != null
  1704. && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) {
  1705. addBucketsToResultsCommon(taskIDToFile, numBuckets, result);
  1706. }
  1707. }
  1708. private static void addBucketsToResultsCommon(
  1709. HashMap<String, FileStatus> taskIDToFile, int numBuckets, List<Path> result) {
  1710. String taskID1 = taskIDToFile.keySet().iterator().next();
  1711. Path bucketPath = taskIDToFile.values().iterator().next().getPath();
  1712. for (int j = 0; j < numBuckets; ++j) {
  1713. addBucketFileIfMissing(result, taskIDToFile, taskID1, bucketPath, j);
  1714. }
  1715. }
  1716. private static void addBucketFileIfMissing(List<Path> result,
  1717. HashMap<String, FileStatus> taskIDToFile, String taskID1, Path bucketPath, int j) {
  1718. String taskID2 = replaceTaskId(taskID1, j);
  1719. if (!taskIDToFile.containsKey(taskID2)) {
  1720. // create empty bucket, file name should be derived from taskID2
  1721. URI bucketUri = bucketPath.toUri();
  1722. String path2 = replaceTaskIdFromFilename(bucketUri.getPath().toString(), j);
  1723. Utilities.FILE_OP_LOGGER.trace("Creating an empty bucket file {}", path2);
  1724. result.add(new Path(bucketUri.getScheme(), bucketUri.getAuthority(), path2));
  1725. }
  1726. }
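// Worked example (added for clarity; assumes the replaceTaskId/replaceTaskIdFromFilename overloads
// defined elsewhere in this class pad the bucket number to the width of the existing task id):
// if the only file present is ".../000000_0" and numBuckets is 4, the missing buckets are added
// to the result as ".../000001_0", ".../000002_0" and ".../000003_0".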
  1727. private static HashMap<String, FileStatus> removeTempOrDuplicateFilesNonMm(
  1728. FileStatus[] files, FileSystem fs, Configuration conf) throws IOException {
  1729. if (files == null || fs == null) {
  1730. return null;
  1731. }
  1732. HashMap<String, FileStatus> taskIdToFile = new HashMap<String, FileStatus>();
  1733. // This method currently does not support speculative execution due to
  1734. // compareTempOrDuplicateFiles not being able to de-duplicate speculative
  1735. // execution created files
  1736. if (isSpeculativeExecution(conf)) {
  1737. String engine = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
  1738. throw new IOException("Speculative execution is not supported for engine " + engine);
  1739. }
  1740. for (FileStatus one : files) {
  1741. if (isTempPath(one)) {
  1742. Path onePath = one.getPath();
  1743. Utilities.FILE_OP_LOGGER.trace("removeTempOrDuplicateFiles deleting {}", onePath);
  1744. if (!fs.delete(onePath, true)) {
  1745. // If file is already deleted by some other task, just ignore the failure with a warning.
  1746. LOG.warn("Unable to delete tmp file: " + onePath);
  1747. }
  1748. } else {
  1749. // This would be a single file. See if we need to remove it.
  1750. ponderRemovingTempOrDuplicateFile(fs, one, taskIdToFile, conf);
  1751. }
  1752. }
  1753. return taskIdToFile;
  1754. }
  1755. private static void ponderRemovingTempOrDuplicateFile(FileSystem fs,
  1756. FileStatus file, HashMap<String, FileStatus> taskIdToFile, Configuration conf)
  1757. throws IOException {
  1758. Path filePath = file.getPath();
  1759. String taskId = getPrefixedTaskIdFromFilename(filePath.getName());
  1760. Utilities.FILE_OP_LOGGER.trace("removeTempOrDuplicateFiles looking at {}"
  1761. + ", taskId {}", filePath, taskId);
  1762. FileStatus otherFile = taskIdToFile.get(taskId);
  1763. taskIdToFile.put(taskId, (otherFile == null) ? file :
  1764. compareTempOrDuplicateFiles(fs, file, otherFile, conf));
  1765. }
  1766. private static boolean warnIfSet(Configuration conf, String value) {
  1767. if (conf.getBoolean(value, false)) {
  1768. LOG.warn(value + " support is currently deprecated");
  1769. return true;
  1770. }
  1771. return false;
  1772. }
  1773. private static boolean isSpeculativeExecution(Configuration conf) {
  1774. String engine = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
  1775. boolean isSpeculative = false;
  1776. if ("mr".equalsIgnoreCase(engine)) {
  1777. isSpeculative = warnIfSet(conf, "mapreduce.map.speculative") ||
  1778. warnIfSet(conf, "mapreduce.reduce.speculative") ||
  1779. warnIfSet(conf, "mapred.map.tasks.speculative.execution") ||
  1780. warnIfSet(conf, "mapred.reduce.tasks.speculative.execution");
  1781. } else if ("tez".equalsIgnoreCase(engine)) {
  1782. isSpeculative = warnIfSet(conf, "tez.am.speculation.enabled");
  1783. } // all other engines do not support speculative execution
  1784. return isSpeculative;
  1785. }
  1786. private static FileStatus compareTempOrDuplicateFiles(FileSystem fs,
  1787. FileStatus file, FileStatus existingFile, Configuration conf) throws IOException {
  1788. // Pick the one with newest attempt ID. Previously, this function threw an
  1789. // exception when the file size of the newer attempt was less than the
  1790. // older attempt. This was an incorrect assumption due to various
  1791. // techniques like file compression and no guarantee that the new task will
  1792. // write values in the same order.
  1793. FileStatus toDelete = null, toRetain = null;
  1794. // This method currently does not support speculative execution
  1795. if (isSpeculativeExecution(conf)) {
  1796. String engine = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
  1797. throw new IOException("Speculative execution is not supported for engine " + engine);
  1798. }
  1799. // "LOAD .. INTO" and "INSERT INTO" commands will generate files with
  1800. // "_copy_x" suffix. These files are usually read by map tasks and the
  1801. // task output gets written to some tmp path. The output file names will
  1802. // be of format taskId_attemptId. The usual path for all these tasks is
  1803. // srcPath -> taskTmpPath -> tmpPath -> finalPath.
  1804. // But, MergeFileTask can move files directly from src path to final path
  1805. // without copying it to tmp path. In such cases, different files with
  1806. // "_copy_x" suffix will be identified as duplicates (change in value
  1807. // of x is wrongly identified as attempt id) and will be deleted.
  1808. // To avoid that we will ignore files with "_copy_x" suffix from duplicate
  1809. // elimination.
  1810. Path filePath = file.getPath();
  1811. if (isCopyFile(filePath.getName())) {
  1812. LOG.info("{} file identified as duplicate. This file is"
  1813. + " not deleted as it has copySuffix.", filePath);
  1814. return existingFile;
  1815. }
  1816. int existingFileAttemptId = getAttemptIdFromFilename(existingFile.getPath().getName());
  1817. int fileAttemptId = getAttemptIdFromFilename(file.getPath().getName());
  1818. // Files may come in any order irrespective of their attempt IDs
  1819. if (existingFileAttemptId > fileAttemptId) {
  1820. // keep existing
  1821. toRetain = existingFile;
  1822. toDelete = file;
  1823. } else if (existingFileAttemptId < fileAttemptId) {
  1824. // keep file
  1825. toRetain = file;
  1826. toDelete = existingFile;
  1827. } else {
  1828. throw new IOException(filePath + " has same attempt ID " + fileAttemptId + " as "
  1829. + existingFile.getPath());
  1830. }
  1831. if (!fs.delete(toDelete.getPath(), true)) {
  1832. throw new IOException("Unable to delete duplicate file: " + toDelete.getPath()
  1833. + ". Existing file: " + toRetain.getPath());
  1834. }
  1835. LOG.warn("Duplicate taskid file removed: " + toDelete.getPath() + " with length "
  1836. + toDelete.getLen() + ". Existing file: " + toRetain.getPath() + " with length "
  1837. + toRetain.getLen());
  1838. return toRetain;
  1839. }
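// Worked example (added for clarity, not part of the original source): for two attempts of the
// same task, e.g. "000007_0" (attempt 0) and "000007_1" (attempt 1), the attempt-1 file is
// retained and the attempt-0 file is deleted, regardless of which of the two is larger.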
  1840. public static boolean isCopyFile(String filepath) {
  1841. String filename = filepath;
  1842. int dirEnd = filepath.lastIndexOf(Path.SEPARATOR);
  1843. if (dirEnd != -1) {
  1844. filename = filepath.substring(dirEnd + 1);
  1845. }
  1846. ParsedOutputFileName parsedFileName = ParsedOutputFileName.parse(filename);
  1847. if (!parsedFileName.matches()) {
  1848. LOG.warn("Unable to verify if file name {} has _copy_ suffix.", filepath);
  1849. }
  1850. return parsedFileName.isCopyFile();
  1851. }
  1852. public static String getBucketFileNameFromPathSubString(String bucketName) {
  1853. try {
  1854. return bucketName.split(COPY_KEYWORD)[0];
  1855. } catch (Exception e) {
  1856. LOG.warn("Invalid bucket file name", e);
  1857. return bucketName;
  1858. }
  1859. }
1860. /* compute bucket id from Split */
  1861. public static int parseSplitBucket(InputSplit split) {
  1862. if (split instanceof FileSplit) {
  1863. return getBucketIdFromFile(((FileSplit) split).getPath().getName());
  1864. }
  1865. // cannot get this for combined splits
  1866. return -1;
  1867. }
  1868. public static int getBucketIdFromFile(String bucketName) {
  1869. Matcher m = PREFIXED_BUCKET_ID_REGEX.matcher(bucketName);
  1870. if (m.matches()) {
  1871. if (m.group(2).isEmpty()) {
  1872. // all zeros
  1873. return m.group(1).isEmpty() ? -1 : 0;
  1874. }
  1875. return Integer.parseInt(m.group(2));
  1876. }
  1877. // Check to see if the bucketName matches the pattern "bucket_([0-9]+).*"
  1878. // This can happen in ACID cases when we have splits on delta files, where the filenames
  1879. // are of the form delta_x_y/bucket_a.
  1880. if (bucketName.startsWith(AcidUtils.BUCKET_PREFIX)) {
  1881. m = AcidUtils.BUCKET_PATTERN.matcher(bucketName);
  1882. if (m.find()) {
  1883. return Integer.parseInt(m.group(1));
  1884. }
1885. // Note that legacy bucket digit patterns are ignored here.
  1886. }
  1887. return -1;
  1888. }
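// Illustrative behaviour (added for clarity; the exact patterns are defined elsewhere in this class):
//   getBucketIdFromFile("000123_0");     // -> 123 (regular task output)
//   getBucketIdFromFile("bucket_00005"); // -> 5   (ACID delta bucket file)
//   getBucketIdFromFile("datafile");     // -> -1  (no recognizable bucket pattern)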
  1889. public static String getNameMessage(Throwable e) {
  1890. return e.getClass().getName() + "(" + e.getMessage() + ")";
  1891. }
  1892. public static String getResourceFiles(Configuration conf, SessionState.ResourceType t) {
  1893. // fill in local files (includes copy of HDFS files) to be added to the task environment
  1894. SessionState ss = SessionState.get();
  1895. Set<String> files = (ss == null) ? null : ss.list_resource(t, null);
  1896. return validateFiles(conf, files);
  1897. }
  1898. public static String getHdfsResourceFiles(Configuration conf, SessionState.ResourceType type) {
  1899. // fill in HDFS files to be added to the task environment
  1900. SessionState ss = SessionState.get();
  1901. Set<String> files = (ss == null) ? null : ss.list_hdfs_resource(type);
  1902. return validateFiles(conf, files);
  1903. }
  1904. public static String getLocalResourceFiles(Configuration conf, SessionState.ResourceType type) {
  1905. // fill in local only files (excludes copy of HDFS files) to be added to the task environment
  1906. SessionState ss = SessionState.get();
  1907. Set<String> files = (ss == null) ? null : ss.list_local_resource(type);
  1908. return validateFiles(conf, files);
  1909. }
  1910. private static String validateFiles(Configuration conf, Set<String> files){
  1911. if (files != null) {
  1912. List<String> realFiles = new ArrayList<String>(files.size());
  1913. for (String one : files) {
  1914. try {
  1915. String onefile = realFile(one, conf);
  1916. if (onefile != null) {
1917. realFiles.add(onefile);
  1918. } else {
  1919. LOG.warn("The file {} does not exist.", one);
  1920. }
  1921. } catch (IOException e) {
1922. throw new RuntimeException("Cannot validate file " + one + " due to exception: "
  1923. + e.getMessage(), e);
  1924. }
  1925. }
  1926. return StringUtils.join(realFiles, ",");
  1927. } else {
  1928. return StringUtils.EMPTY;
  1929. }
  1930. }
  1931. /**
1932. * Gets the session-specified class loader, falling back to the thread-based class loader
1933. * if the session or its configuration is unavailable.
1934. * @return the session-specified class loader, or the thread-based class loader as a fallback
  1935. */
  1936. public static ClassLoader getSessionSpecifiedClassLoader() {
  1937. SessionState state = SessionState.get();
  1938. if (state == null || state.getConf() == null) {
  1939. LOG.debug("Hive Conf not found or Session not initiated, use thread based class loader instead");
  1940. return JavaUtils.getClassLoader();
  1941. }
  1942. ClassLoader sessionCL = state.getConf().getClassLoader();
  1943. if (sessionCL != null) {
  1944. LOG.trace("Use session specified class loader"); //it's normal case
  1945. return sessionCL;
  1946. }
  1947. LOG.debug("Session specified class loader not found, use thread based class loader");
  1948. return JavaUtils.getClassLoader();
  1949. }
  1950. public static void restoreSessionSpecifiedClassLoader(ClassLoader prev) {
  1951. SessionState state = SessionState.get();
  1952. if (state != null && state.getConf() != null) {
  1953. ClassLoader current = state.getConf().getClassLoader();
  1954. if (current != prev && JavaUtils.closeClassLoadersTo(current, prev)) {
  1955. Thread.currentThread().setContextClassLoader(prev);
  1956. state.getConf().setClassLoader(prev);
  1957. }
  1958. }
  1959. }
  1960. /**
  1961. * Create a URL from a string representing a path to a local file.
  1962. * The path string can be just a path, or can start with file:/, file:///
  1963. * @param onestr path string
1964. * @return the URL, or null if the path string cannot be converted to a URL
  1965. */
  1966. static URL urlFromPathString(String onestr) {
  1967. URL oneurl = null;
  1968. try {
  1969. if (StringUtils.indexOf(onestr, "file:/") == 0) {
  1970. oneurl = new URL(onestr);
  1971. } else {
  1972. oneurl = new File(onestr).toURL();
  1973. }
  1974. } catch (Exception err) {
  1975. LOG.error("Bad URL {}, ignoring path", onestr);
  1976. }
  1977. return oneurl;
  1978. }
  1979. /**
1980. * Remove elements from the classpath, if possible. This will only work if the current thread context class loader is
1981. * a UDFClassLoader (i.e. if we have created it).
  1982. *
  1983. * @param pathsToRemove
  1984. * Array of classpath elements
  1985. */
  1986. public static void removeFromClassPath(String[] pathsToRemove) throws IOException {
  1987. Thread curThread = Thread.currentThread();
  1988. ClassLoader currentLoader = curThread.getContextClassLoader();
  1989. // If current class loader is NOT UDFClassLoader, then it is a system class loader, we should not mess with it.
  1990. if (!(currentLoader instanceof UDFClassLoader)) {
  1991. LOG.warn("Ignoring attempt to manipulate {}; probably means we have closed more UDF loaders than opened.",
  1992. currentLoader == null ? "null" : currentLoader.getClass().getSimpleName());
  1993. return;
  1994. }
  1995. // Otherwise -- for UDFClassLoaders -- we close the current one and create a new one, with more limited class path.
  1996. UDFClassLoader loader = (UDFClassLoader) currentLoader;
  1997. Set<URL> newPath = new HashSet<URL>(Arrays.asList(loader.getURLs()));
  1998. for (String onestr : pathsToRemove) {
  1999. URL oneurl = urlFromPathString(onestr);
  2000. if (oneurl != null) {
  2001. newPath.remove(oneurl);
  2002. }
  2003. }
  2004. JavaUtils.closeClassLoader(loader);
  2005. // This loader is closed, remove it from cached registry loaders to avoid removing it again.
  2006. Registry reg = SessionState.getRegistry();
  2007. if (reg != null) {
  2008. reg.removeFromUDFLoaders(loader);
  2009. }
  2010. loader = new UDFClassLoader(newPath.toArray(new URL[0]));
  2011. curThread.setContextClassLoader(loader);
  2012. SessionState.get().getConf().setClassLoader(loader);
  2013. }
  2014. public static String formatBinaryString(byte[] array, int start, int length) {
  2015. StringBuilder sb = new StringBuilder();
  2016. for (int i = start; i < start + length; i++) {
  2017. sb.append('x');
  2018. sb.append(array[i] < 0 ? array[i] + 256 : array[i] + 0);
  2019. }
  2020. return sb.toString();
  2021. }
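// Worked example (added for clarity, not part of the original source): each byte is rendered as
// its unsigned decimal value prefixed with 'x':
//   formatBinaryString(new byte[] {1, -1}, 0, 2); // -> "x1x255"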
  2022. public static List<String> getColumnNamesFromSortCols(List<Order> sortCols) {
  2023. if(sortCols == null) {
  2024. return Collections.emptyList();
  2025. }
  2026. List<String> names = new ArrayList<String>();
  2027. for (Order o : sortCols) {
  2028. names.add(o.getCol());
  2029. }
  2030. return names;
  2031. }
  2032. public static List<String> getColumnNamesFromFieldSchema(List<FieldSchema> partCols) {
  2033. List<String> names = new ArrayList<String>();
  2034. for (FieldSchema o : partCols) {
  2035. names.add(o.getName());
  2036. }
  2037. return names;
  2038. }
  2039. public static List<String> getInternalColumnNamesFromSignature(List<ColumnInfo> colInfos) {
  2040. List<String> names = new ArrayList<String>();
  2041. for (ColumnInfo ci : colInfos) {
  2042. names.add(ci.getInternalName());
  2043. }
  2044. return names;
  2045. }
  2046. /**
  2047. * Note: This will not return the correct number of columns in the case of
  2048. * Avro serde using an external schema URL, unless these properties have been
  2049. * used to initialize the Avro SerDe (which updates these properties).
  2050. * @param props TableDesc properties
  2051. * @return list of column names based on the table properties
  2052. */
  2053. public static List<String> getColumnNames(Properties props) {
  2054. List<String> names = new ArrayList<String>();
  2055. String colNames = props.getProperty(serdeConstants.LIST_COLUMNS);
  2056. return splitColNames(names, colNames);
  2057. }
  2058. public static List<String> getColumnNames(Configuration conf) {
  2059. List<String> names = new ArrayList<String>();
  2060. String colNames = conf.get(serdeConstants.LIST_COLUMNS);
  2061. return splitColNames(names, colNames);
  2062. }
  2063. private static List<String> splitColNames(List<String> names, String colNames) {
  2064. String[] cols = colNames.trim().split(",");
  2065. for(String col : cols) {
  2066. if(StringUtils.isNotBlank(col)) {
  2067. names.add(col);
  2068. }
  2069. }
  2070. return names;
  2071. }
  2072. public static List<String> getColumnTypes(Properties props) {
  2073. List<String> names = new ArrayList<String>();
  2074. String colNames = props.getProperty(serdeConstants.LIST_COLUMN_TYPES);
  2075. ArrayList<TypeInfo> cols = TypeInfoUtils.getTypeInfosFromTypeString(colNames);
  2076. for (TypeInfo col : cols) {
  2077. names.add(col.getTypeName());
  2078. }
  2079. return names;
  2080. }
  2081. /**
  2082. * Extract db and table name from dbtable string, where db and table are separated by "."
2083. * If there is no db name part, the current session's default db is used
  2084. * @param dbtable
  2085. * @return String array with two elements, first is db name, second is table name
  2086. * @throws SemanticException
  2087. * @deprecated use {@link TableName} or {@link org.apache.hadoop.hive.ql.parse.HiveTableName} instead
  2088. */
  2089. @Deprecated
  2090. public static String[] getDbTableName(String dbtable) throws SemanticException {
  2091. return getDbTableName(SessionState.get().getCurrentDatabase(), dbtable);
  2092. }
  2093. /**
  2094. * Extract db and table name from dbtable string.
  2095. * @param defaultDb
  2096. * @param dbtable
  2097. * @return String array with two elements, first is db name, second is table name
  2098. * @throws SemanticException
  2099. * @deprecated use {@link TableName} or {@link org.apache.hadoop.hive.ql.parse.HiveTableName} instead
  2100. */
  2101. @Deprecated
  2102. public static String[] getDbTableName(String defaultDb, String dbtable) throws SemanticException {
  2103. if (dbtable == null) {
  2104. return new String[2];
  2105. }
  2106. String[] names = dbtable.split("\\.");
  2107. switch (names.length) {
  2108. case 3:
  2109. case 2:
  2110. return names;
  2111. case 1:
  2112. return new String [] {defaultDb, dbtable};
  2113. default:
  2114. throw new SemanticException(ErrorMsg.INVALID_TABLE_NAME, dbtable);
  2115. }
  2116. }
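// Illustrative behaviour (added for clarity, not part of the original source):
//   getDbTableName("default", "db1.tab1");     // -> ["db1", "tab1"]
//   getDbTableName("default", "tab1");         // -> ["default", "tab1"]
//   getDbTableName("default", "cat.db1.tab1"); // -> ["cat", "db1", "tab1"] (returned as-is)
//   getDbTableName("default", "a.b.c.d");      // -> SemanticException (INVALID_TABLE_NAME)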
  2117. public static void validateColumnNames(List<String> colNames, List<String> checkCols)
  2118. throws SemanticException {
  2119. Iterator<String> checkColsIter = checkCols.iterator();
  2120. while (checkColsIter.hasNext()) {
  2121. String toCheck = checkColsIter.next();
  2122. boolean found = false;
  2123. Iterator<String> colNamesIter = colNames.iterator();
  2124. while (colNamesIter.hasNext()) {
  2125. String colName = colNamesIter.next();
  2126. if (toCheck.equalsIgnoreCase(colName)) {
  2127. found = true;
  2128. break;
  2129. }
  2130. }
  2131. if (!found) {
  2132. throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg());
  2133. }
  2134. }
  2135. }
  2136. /**
2137. * Accepts a qualified name of the form table, dbname.tablename or catalog.dbname.tablename and returns a
  2138. * {@link TableName}. All parts can be null.
  2139. *
  2140. * @param dbTableName
  2141. * @return a {@link TableName}
  2142. * @throws SemanticException
  2143. * @deprecated handle null values and use {@link TableName#fromString(String, String, String)}
  2144. */
  2145. @Deprecated
  2146. public static TableName getNullableTableName(String dbTableName) throws SemanticException {
  2147. return getNullableTableName(dbTableName, SessionState.get().getCurrentDatabase());
  2148. }
  2149. /**
2150. * Accepts a qualified name of the form table, dbname.tablename or catalog.dbname.tablename and returns a
  2151. * {@link TableName}. All parts can be null.
  2152. *
  2153. * @param dbTableName
  2154. * @param defaultDb
  2155. * @return a {@link TableName}
  2156. * @throws SemanticException
  2157. * @deprecated handle null values and use {@link TableName#fromString(String, String, String)}
  2158. */
  2159. @Deprecated
  2160. public static TableName getNullableTableName(String dbTableName, String defaultDb) throws SemanticException {
  2161. if (dbTableName == null) {
  2162. return new TableName(null, null, null);
  2163. } else {
  2164. try {
  2165. return TableName
  2166. .fromString(dbTableName, SessionState.get().getCurrentCatalog(), defaultDb);
  2167. } catch (IllegalArgumentException e) {
  2168. throw new SemanticException(e.getCause());
  2169. }
  2170. }
  2171. }
  2172. /**
  2173. * Gets the default notification interval to send progress updates to the tracker. Useful for
  2174. * operators that may not output data for a while.
  2175. *
  2176. * @param hconf
  2177. * @return the interval in milliseconds
  2178. */
  2179. public static int getDefaultNotificationInterval(Configuration hconf) {
  2180. int notificationInterval;
2181. Integer expInterval = Optional.ofNullable(hconf.get("mapred.tasktracker.expiry.interval")).map(Integer::decode).orElse(null); // guard against a missing property
  2182. if (expInterval != null) {
  2183. notificationInterval = expInterval.intValue() / 2;
  2184. } else {
  2185. // 5 minutes
  2186. notificationInterval = 5 * 60 * 1000;
  2187. }
  2188. return notificationInterval;
  2189. }
  2190. /**
  2191. * Copies the storage handler properties configured for a table descriptor to a runtime job
  2192. * configuration.
  2193. *
  2194. * @param tbl
  2195. * table descriptor from which to read
  2196. *
  2197. * @param job
  2198. * configuration which receives configured properties
  2199. */
  2200. public static void copyTableJobPropertiesToConf(TableDesc tbl, JobConf job) throws HiveException {
  2201. Properties tblProperties = tbl.getProperties();
  2202. for(String name: tblProperties.stringPropertyNames()) {
  2203. if (job.get(name) == null) {
  2204. String val = (String) tblProperties.get(name);
  2205. if (val != null) {
  2206. job.set(name, StringEscapeUtils.escapeJava(val));
  2207. }
  2208. }
  2209. }
  2210. Map<String, String> jobProperties = tbl.getJobProperties();
  2211. if (jobProperties != null) {
  2212. for (Map.Entry<String, String> entry : jobProperties.entrySet()) {
  2213. job.set(entry.getKey(), entry.getValue());
  2214. }
  2215. }
  2216. }
  2217. /**
  2218. * Copies the storage handler properties configured for a table descriptor to a runtime job
2219. * configuration. This differs from {@link #copyTableJobPropertiesToConf(org.apache.hadoop.hive.ql.plan.TableDesc, org.apache.hadoop.mapred.JobConf)}
  2220. * in that it does not allow parameters already set in the job to override the values from the
  2221. * table. This is important for setting the config up for reading,
  2222. * as the job may already have values in it from another table.
  2223. * @param tbl
  2224. * @param job
  2225. */
  2226. public static void copyTablePropertiesToConf(TableDesc tbl, JobConf job) throws HiveException {
  2227. Properties tblProperties = tbl.getProperties();
  2228. for(String name: tblProperties.stringPropertyNames()) {
  2229. String val = (String) tblProperties.get(name);
  2230. if (val != null) {
  2231. job.set(name, StringEscapeUtils.escapeJava(val));
  2232. }
  2233. }
  2234. Map<String, String> jobProperties = tbl.getJobProperties();
  2235. if (jobProperties != null) {
  2236. for (Map.Entry<String, String> entry : jobProperties.entrySet()) {
  2237. job.set(entry.getKey(), entry.getValue());
  2238. }
  2239. }
  2240. }
  2241. /**
  2242. * Copy job credentials to table properties
  2243. * @param tbl
  2244. */
  2245. public static void copyJobSecretToTableProperties(TableDesc tbl) throws IOException {
  2246. Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
  2247. for (Text key : credentials.getAllSecretKeys()) {
  2248. String keyString = key.toString();
  2249. if (keyString.startsWith(TableDesc.SECRET_PREFIX + TableDesc.SECRET_DELIMIT)) {
  2250. String[] comps = keyString.split(TableDesc.SECRET_DELIMIT);
  2251. String tblName = comps[1];
  2252. String keyName = comps[2];
  2253. if (tbl.getTableName().equalsIgnoreCase(tblName)) {
  2254. tbl.getProperties().put(keyName, new String(credentials.getSecretKey(key)));
  2255. }
  2256. }
  2257. }
  2258. }
  2259. /**
  2260. * Returns the maximum number of executors required to get file information from several input locations.
  2261. * It checks whether HIVE_EXEC_INPUT_LISTING_MAX_THREADS or DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX are > 1
  2262. *
  2263. * @param conf Configuration object to get the maximum number of threads.
  2264. * @param inputLocationListSize Number of input locations required to process.
  2265. * @return The maximum number of executors to use.
  2266. */
  2267. @VisibleForTesting
  2268. static int getMaxExecutorsForInputListing(final Configuration conf, int inputLocationListSize) {
  2269. if (inputLocationListSize < 1) {
  2270. return 0;
  2271. }
  2272. int maxExecutors = 1;
  2273. if (inputLocationListSize > 1) {
  2274. int listingMaxThreads = HiveConf.getIntVar(conf, ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS);
2275. // DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX must be removed in the next Hive version (probably in 3.0).
2276. // If HIVE_EXEC_INPUT_LISTING_MAX_THREADS is not set, then we check the deprecated configuration.
  2277. if (listingMaxThreads <= 0) {
  2278. listingMaxThreads = conf.getInt(DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 0);
  2279. if (listingMaxThreads > 0) {
  2280. LOG.warn("Deprecated configuration is used: {}. Please use {}",
  2281. DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX,
  2282. ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname);
  2283. }
  2284. }
  2285. if (listingMaxThreads > 1) {
  2286. maxExecutors = Math.min(inputLocationListSize, listingMaxThreads);
  2287. }
  2288. }
  2289. return maxExecutors;
  2290. }
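// Worked example (added for clarity, not part of the original source): with
// HIVE_EXEC_INPUT_LISTING_MAX_THREADS = 10 and 4 input locations the method returns min(4, 10) = 4;
// a single input location, or a thread setting of 1 or less with no deprecated override, yields 1.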
  2291. /**
  2292. * Calculate the total size of input files.
  2293. *
  2294. * @param ctx
  2295. * the hadoop job context
  2296. * @param work
  2297. * map reduce job plan
  2298. * @param filter
  2299. * filter to apply to the input paths before calculating size
  2300. * @return the summary of all the input paths.
  2301. * @throws IOException
  2302. */
  2303. public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter)
  2304. throws IOException {
  2305. PerfLogger perfLogger = SessionState.getPerfLogger();
  2306. perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
  2307. final long[] summary = {0L, 0L, 0L};
  2308. final Set<Path> pathNeedProcess = new HashSet<>();
  2309. // For each input path, calculate the total size.
  2310. for (final Path path : work.getPathToAliases().keySet()) {
  2311. if (path == null) {
  2312. continue;
  2313. }
  2314. if (filter != null && !filter.accept(path)) {
  2315. continue;
  2316. }
  2317. ContentSummary cs = ctx.getCS(path);
  2318. if (cs != null) {
  2319. summary[0] += cs.getLength();
  2320. summary[1] += cs.getFileCount();
  2321. summary[2] += cs.getDirectoryCount();
  2322. } else {
  2323. pathNeedProcess.add(path);
  2324. }
  2325. }
  2326. // Process the case when name node call is needed
  2327. final ExecutorService executor;
  2328. int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
  2329. if (numExecutors > 1) {
  2330. // Since multiple threads could call this method concurrently, locking
2331. // this method keeps the number of threads from growing out of control.
  2332. synchronized (INPUT_SUMMARY_LOCK) {
  2333. LOG.info("Using {} threads for getContentSummary", numExecutors);
  2334. executor = Executors.newFixedThreadPool(numExecutors,
  2335. new ThreadFactoryBuilder().setDaemon(true)
  2336. .setNameFormat("Get-Input-Summary-%d").build());
  2337. getInputSummaryWithPool(ctx, Collections.unmodifiableSet(pathNeedProcess),
  2338. work, summary, executor);
  2339. }
  2340. } else {
  2341. LOG.info("Not using thread pool for getContentSummary");
  2342. executor = MoreExecutors.newDirectExecutorService();
  2343. getInputSummaryWithPool(ctx, Collections.unmodifiableSet(pathNeedProcess),
  2344. work, summary, executor);
  2345. }
  2346. perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
  2347. return new ContentSummary.Builder().length(summary[0])
  2348. .fileCount(summary[1]).directoryCount(summary[2]).build();
  2349. }
  2350. /**
  2351. * Performs a ContentSummary lookup over a set of paths using 1 or more
  2352. * threads. The 'summary' argument is directly modified.
  2353. *
  2354. * @param ctx
  2355. * @param pathNeedProcess
  2356. * @param work
  2357. * @param summary
  2358. * @param executor
  2359. * @throws IOException
  2360. */
  2361. @VisibleForTesting
  2362. static void getInputSummaryWithPool(final Context ctx,
  2363. final Set<Path> pathNeedProcess, final MapWork work, final long[] summary,
  2364. final ExecutorService executor) throws IOException {
  2365. Preconditions.checkNotNull(ctx);
  2366. Preconditions.checkNotNull(pathNeedProcess);
  2367. Preconditions.checkNotNull(executor);
  2368. List<Future<?>> futures = new ArrayList<Future<?>>(pathNeedProcess.size());
  2369. final AtomicLong totalLength = new AtomicLong(0L);
  2370. final AtomicLong totalFileCount = new AtomicLong(0L);
  2371. final AtomicLong totalDirectoryCount = new AtomicLong(0L);
  2372. HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
  2373. @Override
  2374. public void interrupt() {
  2375. for (Path path : pathNeedProcess) {
  2376. try {
  2377. path.getFileSystem(ctx.getConf()).close();
  2378. } catch (IOException ignore) {
  2379. LOG.debug("Failed to close filesystem", ignore);
  2380. }
  2381. }
  2382. executor.shutdownNow();
  2383. }
  2384. });
  2385. try {
  2386. Configuration conf = ctx.getConf();
  2387. JobConf jobConf = new JobConf(conf);
  2388. for (final Path path : pathNeedProcess) {
  2389. // All threads share the same Configuration and JobConf based on the
  2390. // assumption that they are thread safe if only read operations are
2391. // executed. While this is not stated in Hadoop's javadoc, the source code
2392. // clearly shows that efforts were made to keep them thread safe, and we
2393. // believe the assumption holds. Will revisit this piece of code if we find
2394. // the assumption is not correct.
  2395. final Configuration myConf = conf;
  2396. final JobConf myJobConf = jobConf;
  2397. final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
  2398. final Map<Path, List<String>> pathToAlias = work.getPathToAliases();
  2399. final PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
  2400. Runnable r = new Runnable() {
  2401. @Override
  2402. public void run() {
  2403. try {
  2404. Class<? extends InputFormat> inputFormatCls = partDesc
  2405. .getInputFileFormatClass();
  2406. InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
  2407. inputFormatCls, myJobConf);
  2408. if (inputFormatObj instanceof ContentSummaryInputFormat) {
  2409. ContentSummaryInputFormat csif = (ContentSummaryInputFormat) inputFormatObj;
  2410. final ContentSummary cs = csif.getContentSummary(path, myJobConf);
  2411. recordSummary(path, cs);
  2412. return;
  2413. }
  2414. String metaTableStorage = null;
  2415. if (partDesc.getTableDesc() != null &&
  2416. partDesc.getTableDesc().getProperties() != null) {
  2417. metaTableStorage = partDesc.getTableDesc().getProperties()
  2418. .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
  2419. }
  2420. if (partDesc.getProperties() != null) {
  2421. metaTableStorage = partDesc.getProperties()
  2422. .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
  2423. }
  2424. HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
  2425. if (handler instanceof InputEstimator) {
  2426. long total = 0;
  2427. TableDesc tableDesc = partDesc.getTableDesc();
  2428. InputEstimator estimator = (InputEstimator) handler;
  2429. for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, path)) {
  2430. JobConf jobConf = new JobConf(myJobConf);
  2431. TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
  2432. Utilities.setColumnNameList(jobConf, scanOp, true);
  2433. Utilities.setColumnTypeList(jobConf, scanOp, true);
  2434. PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
  2435. Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
  2436. total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
  2437. }
  2438. recordSummary(path, new ContentSummary(total, -1, -1));
  2439. } else if (handler == null) {
2440. // For native tables (no storage handler) take the summary from the file system; non-native
2441. // tables without an InputEstimator get no summary, so they are not selected as a mapjoin target.
  2442. FileSystem fs = path.getFileSystem(myConf);
  2443. recordSummary(path, fs.getContentSummary(path));
  2444. }
  2445. } catch (Exception e) {
  2446. // We safely ignore this exception for summary data.
  2447. // We don't update the cache to protect it from polluting other
  2448. // usages. The worst case is that IOException will always be
  2449. // retried for another getInputSummary(), which is fine as
  2450. // IOException is not considered as a common case.
  2451. LOG.info("Cannot get size of {}. Safely ignored.", path);
  2452. LOG.debug("Cannot get size of {}. Safely ignored.", path, e);
  2453. }
  2454. }
  2455. private void recordSummary(final Path p, final ContentSummary cs) {
  2456. final long csLength = cs.getLength();
  2457. final long csFileCount = cs.getFileCount();
  2458. final long csDirectoryCount = cs.getDirectoryCount();
  2459. totalLength.addAndGet(csLength);
  2460. totalFileCount.addAndGet(csFileCount);
  2461. totalDirectoryCount.addAndGet(csDirectoryCount);
  2462. ctx.addCS(p.toString(), cs);
  2463. LOG.debug(
  2464. "Cache Content Summary for {} length: {} file count: {} "
  2465. + "directory count: {}",
  2466. path, csLength, csFileCount, csDirectoryCount);
  2467. }
  2468. };
  2469. futures.add(executor.submit(r));
  2470. }
  2471. for (Future<?> future : futures) {
  2472. try {
  2473. future.get();
  2474. } catch (InterruptedException e) {
  2475. LOG.info("Interrupted when waiting threads", e);
  2476. Thread.currentThread().interrupt();
  2477. break;
  2478. } catch (ExecutionException e) {
  2479. throw new IOException(e);
  2480. }
  2481. }
  2482. executor.shutdown();
  2483. HiveInterruptUtils.checkInterrupted();
  2484. summary[0] += totalLength.get();
  2485. summary[1] += totalFileCount.get();
  2486. summary[2] += totalDirectoryCount.get();
  2487. } finally {
  2488. executor.shutdownNow();
  2489. HiveInterruptUtils.remove(interrup);
  2490. }
  2491. }
  2492. public static long sumOf(Map<String, Long> aliasToSize, Set<String> aliases) {
  2493. return sumOfExcept(aliasToSize, aliases, null);
  2494. }
2495. // Returns the sum of sizes excluding the given aliases; returns -1 if the size of any other alias is unknown.
  2496. public static long sumOfExcept(Map<String, Long> aliasToSize,
  2497. Set<String> aliases, Set<String> excepts) {
  2498. long total = 0;
  2499. for (String alias : aliases) {
  2500. if (excepts != null && excepts.contains(alias)) {
  2501. continue;
  2502. }
  2503. Long size = aliasToSize.get(alias);
  2504. if (size == null) {
  2505. return -1;
  2506. }
  2507. total += size;
  2508. }
  2509. return total;
  2510. }
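// Worked example (added for clarity, not part of the original source): with sizes
// {a=10, b=20, c=<missing>}, sumOfExcept(sizes, {a, b, c}, {c}) returns 30 because c is excluded,
// while sumOf(sizes, {a, b, c}) returns -1 because c's size is unknown.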
  2511. public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx)
  2512. throws Exception {
  2513. if (ctx != null) {
  2514. ContentSummary cs = ctx.getCS(dirPath);
  2515. if (cs != null) {
  2516. if (LOG.isDebugEnabled()) {
  2517. LOG.debug("Content Summary cached for {} length: {} num files: {} " +
  2518. "num directories: {}", dirPath, cs.getLength(), cs.getFileCount(),
  2519. cs.getDirectoryCount());
  2520. }
  2521. return (cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1);
  2522. } else {
  2523. LOG.debug("Content Summary not cached for {}", dirPath);
  2524. }
  2525. }
  2526. return isEmptyPath(job, dirPath);
  2527. }
  2528. public static boolean isEmptyPath(Configuration job, Path dirPath) throws IOException {
  2529. FileStatus[] fStats = listNonHiddenFileStatus(job, dirPath);
  2530. if (fStats.length > 0) {
  2531. return false;
  2532. }
  2533. return true;
  2534. }
  2535. public static FileStatus[] listNonHiddenFileStatus(Configuration job, Path dirPath)
  2536. throws IOException {
  2537. FileSystem inpFs = dirPath.getFileSystem(job);
  2538. try {
  2539. return inpFs.listStatus(dirPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
  2540. } catch (FileNotFoundException e) {
  2541. return new FileStatus[] {};
  2542. }
  2543. }
  2544. public static List<TezTask> getTezTasks(List<Task<?>> tasks) {
  2545. return getTasks(tasks, new TaskFilterFunction<>(TezTask.class));
  2546. }
  2547. public static List<SparkTask> getSparkTasks(List<Task<?>> tasks) {
  2548. return getTasks(tasks, new TaskFilterFunction<>(SparkTask.class));
  2549. }
  2550. public static List<ExecDriver> getMRTasks(List<Task<?>> tasks) {
  2551. return getTasks(tasks, new TaskFilterFunction<>(ExecDriver.class));
  2552. }
  2553. public static int getNumClusterJobs(List<Task<?>> tasks) {
  2554. return getMRTasks(tasks).size() + getTezTasks(tasks).size() + getSparkTasks(tasks).size();
  2555. }
  2556. static class TaskFilterFunction<T> implements DAGTraversal.Function {
  2557. private Set<Task<?>> visited = new HashSet<>();
  2558. private Class<T> requiredType;
  2559. private List<T> typeSpecificTasks = new ArrayList<>();
  2560. TaskFilterFunction(Class<T> requiredType) {
  2561. this.requiredType = requiredType;
  2562. }
  2563. @Override
  2564. public void process(Task<?> task) {
  2565. if (requiredType.isInstance(task) && !typeSpecificTasks.contains(task)) {
  2566. typeSpecificTasks.add((T) task);
  2567. }
  2568. visited.add(task);
  2569. }
  2570. List<T> getTasks() {
  2571. return typeSpecificTasks;
  2572. }
  2573. @Override
  2574. public boolean skipProcessing(Task<?> task) {
  2575. return visited.contains(task);
  2576. }
  2577. }
  2578. private static <T> List<T> getTasks(List<Task<?>> tasks,
  2579. TaskFilterFunction<T> function) {
  2580. DAGTraversal.traverse(tasks, function);
  2581. return function.getTasks();
  2582. }
  2583. public static final class PartitionDetails {
  2584. public Map<String, String> fullSpec;
  2585. public Partition partition;
  2586. public List<FileStatus> newFiles;
  2587. public boolean hasOldPartition = false;
  2588. public AcidUtils.TableSnapshot tableSnapshot;
  2589. }
  2590. /**
2591. * Construct the full partition specs from the dynamic partition context and the directory names
  2592. * corresponding to these dynamic partitions.
  2593. */
  2594. public static Map<Path, PartitionDetails> getFullDPSpecs(Configuration conf, DynamicPartitionCtx dpCtx,
  2595. Map<String, List<Path>> dynamicPartitionSpecs) throws HiveException {
  2596. try {
  2597. Path loadPath = dpCtx.getRootPath();
  2598. FileSystem fs = loadPath.getFileSystem(conf);
  2599. int numDPCols = dpCtx.getNumDPCols();
  2600. Map<Path, Optional<List<Path>>> allPartition = new HashMap<>();
  2601. if (dynamicPartitionSpecs != null) {
  2602. for (Map.Entry<String, List<Path>> partSpec : dynamicPartitionSpecs.entrySet()) {
  2603. allPartition.put(new Path(loadPath, partSpec.getKey()), Optional.of(partSpec.getValue()));
  2604. }
  2605. } else {
  2606. List<FileStatus> status = HiveStatsUtils.getFileStatusRecurse(loadPath, numDPCols, fs);
  2607. for (FileStatus fileStatus : status) {
  2608. allPartition.put(fileStatus.getPath(), Optional.empty());
  2609. }
  2610. }
  2611. if (allPartition.isEmpty()) {
  2612. LOG.warn("No partition is generated by dynamic partitioning");
  2613. return Collections.synchronizedMap(new LinkedHashMap<>());
  2614. }
  2615. validateDynPartitionCount(conf, allPartition.keySet());
  2616. // partial partition specification
  2617. Map<String, String> partSpec = dpCtx.getPartSpec();
  2618. // list of full partition specification
  2619. Map<Path, PartitionDetails> partitionDetailsMap =
  2620. Collections.synchronizedMap(new LinkedHashMap<>());
  2621. // calculate full path spec for each valid partition path
  2622. for (Map.Entry<Path, Optional<List<Path>>> partEntry : allPartition.entrySet()) {
  2623. Path partPath = partEntry.getKey();
  2624. Map<String, String> fullPartSpec = Maps.newLinkedHashMap(partSpec);
  2625. String staticParts = Warehouse.makeDynamicPartName(partSpec);
  2626. Path computedPath = partPath;
2627. if (!staticParts.isEmpty()) {
  2628. computedPath = new Path(new Path(partPath.getParent(), staticParts), partPath.getName());
  2629. }
  2630. if (!Warehouse.makeSpecFromName(fullPartSpec, computedPath, new HashSet<>(partSpec.keySet()))) {
  2631. Utilities.FILE_OP_LOGGER.warn("Ignoring invalid DP directory " + partPath);
  2632. } else {
  2633. PartitionDetails details = new PartitionDetails();
  2634. details.fullSpec = fullPartSpec;
  2635. if (partEntry.getValue().isPresent()) {
  2636. details.newFiles = new ArrayList<>();
  2637. for (Path filePath : partEntry.getValue().get()) {
  2638. details.newFiles.add(fs.getFileStatus(filePath));
  2639. }
  2640. }
  2641. partitionDetailsMap.put(partPath, details);
  2642. }
  2643. }
  2644. return partitionDetailsMap;
  2645. } catch (IOException e) {
  2646. throw new HiveException(e);
  2647. }
  2648. }
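// Illustrative sketch (not part of the original class): with two dynamic partition
// columns (the names hr and min are hypothetical), a directory such as
//   <rootPath>/hr=11/min=30
// resolves to the full spec {hr=11, min=30}, merged with any static partition spec
// carried by the DynamicPartitionCtx.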
  2649. private static void validateDynPartitionCount(Configuration conf, Collection<Path> partitions) throws HiveException {
  2650. int partsToLoad = partitions.size();
  2651. int maxPartition = HiveConf.getIntVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS);
  2652. if (partsToLoad > maxPartition) {
  2653. throw new HiveException("Number of dynamic partitions created is " + partsToLoad
  2654. + ", which is more than "
  2655. + maxPartition
  2656. +". To solve this try to set " + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
  2657. + " to at least " + partsToLoad + '.');
  2658. }
  2659. }
  2660. public static StatsPublisher getStatsPublisher(JobConf jc) {
  2661. StatsFactory factory = StatsFactory.newFactory(jc);
  2662. return factory == null ? null : factory.getStatsPublisher();
  2663. }
  2664. public static String join(String... elements) {
  2665. StringBuilder builder = new StringBuilder();
  2666. for (String element : elements) {
  2667. if (element == null || element.isEmpty()) {
  2668. continue;
  2669. }
  2670. builder.append(element);
  2671. if (!element.endsWith(Path.SEPARATOR)) {
  2672. builder.append(Path.SEPARATOR);
  2673. }
  2674. }
  2675. return builder.toString();
  2676. }
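// Illustrative sketch (not part of the original class):
//   Utilities.join("hdfs://nn:8020/tmp", "hive", "scratch") -> "hdfs://nn:8020/tmp/hive/scratch/"
// Null or empty elements are skipped, and a trailing separator is kept.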
  2677. public static void setColumnNameList(JobConf jobConf, RowSchema rowSchema) {
  2678. setColumnNameList(jobConf, rowSchema, false);
  2679. }
  2680. public static void setColumnNameList(JobConf jobConf, RowSchema rowSchema, boolean excludeVCs) {
  2681. if (rowSchema == null) {
  2682. return;
  2683. }
  2684. StringBuilder columnNames = new StringBuilder();
  2685. for (ColumnInfo colInfo : rowSchema.getSignature()) {
  2686. if (excludeVCs && colInfo.getIsVirtualCol()) {
  2687. continue;
  2688. }
  2689. if (columnNames.length() > 0) {
  2690. columnNames.append(',');
  2691. }
  2692. columnNames.append(colInfo.getInternalName());
  2693. }
  2694. String columnNamesString = columnNames.toString();
  2695. jobConf.set(serdeConstants.LIST_COLUMNS, columnNamesString);
  2696. }
  2697. public static void setColumnNameList(JobConf jobConf, Operator op) {
  2698. setColumnNameList(jobConf, op, false);
  2699. }
  2700. public static void setColumnNameList(JobConf jobConf, Operator op, boolean excludeVCs) {
  2701. RowSchema rowSchema = op.getSchema();
  2702. setColumnNameList(jobConf, rowSchema, excludeVCs);
  2703. }
  2704. public static void setColumnTypeList(JobConf jobConf, RowSchema rowSchema) {
  2705. setColumnTypeList(jobConf, rowSchema, false);
  2706. }
  2707. public static void setColumnTypeList(JobConf jobConf, RowSchema rowSchema, boolean excludeVCs) {
  2708. if (rowSchema == null) {
  2709. return;
  2710. }
  2711. StringBuilder columnTypes = new StringBuilder();
  2712. for (ColumnInfo colInfo : rowSchema.getSignature()) {
  2713. if (excludeVCs && colInfo.getIsVirtualCol()) {
  2714. continue;
  2715. }
  2716. if (columnTypes.length() > 0) {
  2717. columnTypes.append(',');
  2718. }
  2719. columnTypes.append(colInfo.getTypeName());
  2720. }
  2721. String columnTypesString = columnTypes.toString();
  2722. jobConf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypesString);
  2723. }
  2724. public static void setColumnTypeList(JobConf jobConf, Operator op) {
  2725. setColumnTypeList(jobConf, op, false);
  2726. }
  2727. public static void setColumnTypeList(JobConf jobConf, Operator op, boolean excludeVCs) {
  2728. RowSchema rowSchema = op.getSchema();
  2729. setColumnTypeList(jobConf, rowSchema, excludeVCs);
  2730. }
  2731. public static final String suffix = ".hashtable";
  2732. public static Path generatePath(Path basePath, String dumpFilePrefix,
  2733. Byte tag, String bigBucketFileName) {
  2734. return new Path(basePath, "MapJoin-" + dumpFilePrefix + tag +
  2735. "-" + bigBucketFileName + suffix);
  2736. }
  2737. public static String generateFileName(Byte tag, String bigBucketFileName) {
  2738. return "MapJoin-" + tag + "-" + bigBucketFileName + suffix;
  2739. }
  2740. public static Path generateTmpPath(Path basePath, String id) {
  2741. return new Path(basePath, "HashTable-" + id);
  2742. }
  2743. public static Path generateTarPath(Path basePath, String filename) {
  2744. return new Path(basePath, filename + ".tar.gz");
  2745. }
  2746. public static String generateTarFileName(String name) {
  2747. return name + ".tar.gz";
  2748. }
  2749. public static String generatePath(Path baseURI, String filename) {
  2750. return baseURI + Path.SEPARATOR + filename;
  2751. }
  2752. public static String now() {
  2753. Calendar cal = Calendar.getInstance();
  2754. SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  2755. return sdf.format(cal.getTime());
  2756. }
  2757. public static double showTime(long time) {
  2758. double result = (double) time / (double) 1000;
  2759. return result;
  2760. }
  2761. /**
2762. * The check here is not clean. It first uses a for loop to go through
2763. * all input formats and collects the ones that implement ReworkMapredInputFormat
2764. * into a set, and finally goes through that set and calls
2765. * rework on each one.
2766. *
2767. * Technically all of this could be avoided if all of Hive's input formats
2768. * shared a common interface. In today's Hive and Hadoop that is not possible,
2769. * because many of the input formats Hive uses live in Hadoop's code, and
2770. * most of Hadoop's input formats only implement the InputFormat interface.
  2771. *
  2772. * @param task
  2773. * @param reworkMapredWork
  2774. * @param conf
  2775. * @throws SemanticException
  2776. */
  2777. public static void reworkMapRedWork(Task<?> task,
  2778. boolean reworkMapredWork, HiveConf conf) throws SemanticException {
  2779. if (reworkMapredWork && (task instanceof MapRedTask)) {
  2780. try {
  2781. MapredWork mapredWork = ((MapRedTask) task).getWork();
  2782. Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
  2783. for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
  2784. Class<? extends InputFormat> inputFormatCls = part
  2785. .getInputFileFormatClass();
  2786. if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
  2787. reworkInputFormats.add(inputFormatCls);
  2788. }
  2789. }
  2790. if (reworkInputFormats.size() > 0) {
  2791. for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
  2792. ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil
  2793. .newInstance(inputFormatCls, null);
  2794. inst.rework(conf, mapredWork);
  2795. }
  2796. }
  2797. } catch (IOException e) {
  2798. throw new SemanticException(e);
  2799. }
  2800. }
  2801. }
  2802. public static class SQLCommand<T> {
  2803. public T run(PreparedStatement stmt) throws SQLException {
  2804. return null;
  2805. }
  2806. }
  2807. /**
  2808. * Retry SQL execution with random backoff (same as the one implemented in HDFS-767).
  2809. * This function only retries when the SQL query throws a SQLTransientException (which
  2810. * might be able to succeed with a simple retry). It doesn't retry when the exception
  2811. * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
  2812. * the caller needs to reconnect to the database and restart the whole transaction.
  2813. *
  2814. * @param cmd the SQL command
2815. * @param stmt the prepared SQL statement.
  2816. * @param baseWindow The base time window (in milliseconds) before the next retry.
  2817. * see {@link #getRandomWaitTime} for details.
  2818. * @param maxRetries the maximum # of retries when getting a SQLTransientException.
2819. * @throws SQLException SQLRecoverableException or SQLNonTransientException the first
2820. * time it is caught, or SQLTransientException once maxRetries has been reached.
  2821. */
  2822. public static <T> T executeWithRetry(SQLCommand<T> cmd, PreparedStatement stmt,
  2823. long baseWindow, int maxRetries) throws SQLException {
  2824. T result = null;
  2825. // retry with # of maxRetries before throwing exception
  2826. for (int failures = 0; ; failures++) {
  2827. try {
  2828. result = cmd.run(stmt);
  2829. return result;
  2830. } catch (SQLTransientException e) {
  2831. LOG.warn("Failure and retry # {}", failures, e);
  2832. if (failures >= maxRetries) {
  2833. throw e;
  2834. }
  2835. long waitTime = getRandomWaitTime(baseWindow, failures,
  2836. ThreadLocalRandom.current());
  2837. try {
  2838. Thread.sleep(waitTime);
2839. } catch (InterruptedException iex) {
Thread.currentThread().interrupt(); // restore the interrupt status rather than discarding it
2840. }
  2841. } catch (SQLException e) {
  2842. // throw other types of SQLExceptions (SQLNonTransientException / SQLRecoverableException)
  2843. throw e;
  2844. }
  2845. }
  2846. }
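// Illustrative usage sketch (not part of the original class); the SQL text and
// parameter values below are hypothetical:
//
//   PreparedStatement stmt = conn.prepareStatement("UPDATE t SET c = ? WHERE id = ?");
//   Integer updated = Utilities.executeWithRetry(new Utilities.SQLCommand<Integer>() {
//     @Override
//     public Integer run(PreparedStatement s) throws SQLException {
//       s.setInt(1, 42);
//       s.setInt(2, 7);
//       return s.executeUpdate();
//     }
//   }, stmt, 1000, 3); // up to 3 retries, ~1 second base backoff window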
  2847. /**
  2848. * Retry connecting to a database with random backoff (same as the one implemented in HDFS-767).
2849. * This function only retries when the connection attempt throws a SQLTransientException (which
  2850. * might be able to succeed with a simple retry). It doesn't retry when the exception
  2851. * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
  2852. * the caller needs to reconnect to the database and restart the whole transaction.
  2853. *
  2854. * @param connectionString the JDBC connection string.
  2855. * @param waitWindow The base time window (in milliseconds) before the next retry.
  2856. * see {@link #getRandomWaitTime} for details.
  2857. * @param maxRetries the maximum # of retries when getting a SQLTransientException.
2858. * @throws SQLException SQLRecoverableException or SQLNonTransientException the first
2859. * time it is caught, or SQLTransientException once maxRetries has been reached.
  2860. */
  2861. public static Connection connectWithRetry(String connectionString,
  2862. long waitWindow, int maxRetries) throws SQLException {
  2863. // retry with # of maxRetries before throwing exception
  2864. for (int failures = 0; ; failures++) {
  2865. try {
  2866. Connection conn = DriverManager.getConnection(connectionString);
  2867. return conn;
  2868. } catch (SQLTransientException e) {
  2869. if (failures >= maxRetries) {
  2870. LOG.error("Error during JDBC connection.", e);
  2871. throw e;
  2872. }
  2873. long waitTime = Utilities.getRandomWaitTime(waitWindow, failures,
  2874. ThreadLocalRandom.current());
  2875. try {
  2876. Thread.sleep(waitTime);
2877. } catch (InterruptedException e1) {
Thread.currentThread().interrupt(); // restore the interrupt status rather than discarding it
2878. }
  2879. } catch (SQLException e) {
  2880. // just throw other types (SQLNonTransientException / SQLRecoverableException)
  2881. throw e;
  2882. }
  2883. }
  2884. }
  2885. /**
  2886. * Retry preparing a SQL statement with random backoff (same as the one implemented in HDFS-767).
2887. * This function only retries when preparing the statement throws a SQLTransientException (which
  2888. * might be able to succeed with a simple retry). It doesn't retry when the exception
  2889. * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
  2890. * the caller needs to reconnect to the database and restart the whole transaction.
  2891. *
  2892. * @param conn a JDBC connection.
  2893. * @param stmt the SQL statement to be prepared.
  2894. * @param waitWindow The base time window (in milliseconds) before the next retry.
  2895. * see {@link #getRandomWaitTime} for details.
  2896. * @param maxRetries the maximum # of retries when getting a SQLTransientException.
2897. * @throws SQLException SQLRecoverableException or SQLNonTransientException the first
2898. * time it is caught, or SQLTransientException once maxRetries has been reached.
  2899. */
  2900. public static PreparedStatement prepareWithRetry(Connection conn, String stmt,
  2901. long waitWindow, int maxRetries) throws SQLException {
  2902. // retry with # of maxRetries before throwing exception
  2903. for (int failures = 0; ; failures++) {
  2904. try {
  2905. return conn.prepareStatement(stmt);
  2906. } catch (SQLTransientException e) {
  2907. if (failures >= maxRetries) {
  2908. LOG.error("Error preparing JDBC Statement {}", stmt, e);
  2909. throw e;
  2910. }
  2911. long waitTime = Utilities.getRandomWaitTime(waitWindow, failures,
  2912. ThreadLocalRandom.current());
  2913. try {
  2914. Thread.sleep(waitTime);
2915. } catch (InterruptedException e1) {
Thread.currentThread().interrupt(); // restore the interrupt status rather than discarding it
2916. }
  2917. } catch (SQLException e) {
  2918. // just throw other types (SQLNonTransientException / SQLRecoverableException)
  2919. throw e;
  2920. }
  2921. }
  2922. }
  2923. public static void setQueryTimeout(java.sql.Statement stmt, int timeout) throws SQLException {
  2924. if (timeout < 0) {
  2925. LOG.info("Invalid query timeout {}", timeout);
  2926. return;
  2927. }
  2928. try {
  2929. stmt.setQueryTimeout(timeout);
  2930. } catch (SQLException e) {
  2931. String message = e.getMessage() == null ? null : e.getMessage().toLowerCase();
  2932. if (e instanceof SQLFeatureNotSupportedException ||
  2933. (message != null && (message.contains("implemented") || message.contains("supported")))) {
  2934. LOG.info("setQueryTimeout is not supported");
  2935. return;
  2936. }
  2937. throw e;
  2938. }
  2939. }
  2940. /**
2941. * Introduces a random factor into the wait time before another retry.
2942. * The wait time depends on the number of failures so far and a random factor.
2943. * The first time an exception is caught, the wait time
2944. * is a random number between 0 and baseWindow msec. If the first retry
2945. * still fails, we wait a baseWindow msec grace period before the 2nd retry.
2946. * At the second retry the waiting window is also expanded to 2*baseWindow msec,
2947. * easing the request rate on the server. Similarly, the 3rd retry
2948. * waits a 2*baseWindow msec grace period before retrying, and the waiting window is
2949. * expanded to 3*baseWindow msec, and so on.
  2950. * @param baseWindow the base waiting window.
  2951. * @param failures number of failures so far.
  2952. * @param r a random generator.
  2953. * @return number of milliseconds for the next wait time.
  2954. */
  2955. public static long getRandomWaitTime(long baseWindow, int failures, Random r) {
  2956. return (long) (
  2957. baseWindow * failures + // grace period for the last round of attempt
  2958. baseWindow * (failures + 1) * r.nextDouble()); // expanding time window for each failure
  2959. }
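// Worked example (illustrative, not part of the original class), assuming baseWindow = 300 ms:
//   failures = 0 -> wait is uniform in [0, 300) ms
//   failures = 1 -> 300 ms grace period + uniform in [0, 600) ms
//   failures = 2 -> 600 ms grace period + uniform in [0, 900) ms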
  2960. public static final char sqlEscapeChar = '\\';
  2961. /**
  2962. * Escape the '_', '%', as well as the escape characters inside the string key.
  2963. * @param key the string that will be used for the SQL LIKE operator.
  2964. * @return a string with escaped '_' and '%'.
  2965. */
  2966. public static String escapeSqlLike(String key) {
  2967. StringBuilder sb = new StringBuilder(key.length());
  2968. for (char c: key.toCharArray()) {
  2969. switch(c) {
  2970. case '_':
  2971. case '%':
  2972. case sqlEscapeChar:
  2973. sb.append(sqlEscapeChar);
  2974. // fall through
  2975. default:
  2976. sb.append(c);
  2977. break;
  2978. }
  2979. }
  2980. return sb.toString();
  2981. }
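// Illustrative usage sketch (not part of the original class); the table and column
// names are hypothetical. escapeSqlLike("50%_off") yields "50\%\_off", so the LIKE
// pattern matches the literal prefix "50%_off" when paired with ESCAPE '\':
//
//   PreparedStatement ps =
//       conn.prepareStatement("SELECT name FROM items WHERE name LIKE ? ESCAPE '\\'");
//   ps.setString(1, Utilities.escapeSqlLike("50%_off") + "%");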
  2982. /**
  2983. * Format number of milliseconds to strings
  2984. *
  2985. * @param msec milliseconds
  2986. * @return a formatted string like "x days y hours z minutes a seconds b msec"
  2987. */
  2988. public static String formatMsecToStr(long msec) {
  2989. long day = -1, hour = -1, minute = -1, second = -1;
  2990. long ms = msec % 1000;
  2991. long timeLeft = msec / 1000;
  2992. if (timeLeft > 0) {
  2993. second = timeLeft % 60;
  2994. timeLeft /= 60;
  2995. if (timeLeft > 0) {
  2996. minute = timeLeft % 60;
  2997. timeLeft /= 60;
  2998. if (timeLeft > 0) {
  2999. hour = timeLeft % 24;
  3000. day = timeLeft / 24;
  3001. }
  3002. }
  3003. }
  3004. StringBuilder sb = new StringBuilder();
3005. if (day != -1) {
3006. sb.append(day).append(" days ");
3007. }
3008. if (hour != -1) {
3009. sb.append(hour).append(" hours ");
3010. }
3011. if (minute != -1) {
3012. sb.append(minute).append(" minutes ");
3013. }
3014. if (second != -1) {
3015. sb.append(second).append(" seconds ");
3016. }
3017. sb.append(ms).append(" msec");
3018. return sb.toString();
  3019. }
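// Worked examples (illustrative, not part of the original class):
//   formatMsecToStr(90061001L) -> "1 days 1 hours 1 minutes 1 seconds 1 msec"
//   formatMsecToStr(61500L)    -> "1 minutes 1 seconds 500 msec"
//   formatMsecToStr(250L)      -> "250 msec"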
  3020. /**
  3021. * Estimate the number of reducers needed for this job, based on job input,
  3022. * and configuration parameters.
  3023. *
  3024. * The output of this method should only be used if the output of this
  3025. * MapRedTask is not being used to populate a bucketed table and the user
  3026. * has not specified the number of reducers to use.
  3027. *
  3028. * @return the number of reducers.
  3029. */
  3030. public static int estimateNumberOfReducers(HiveConf conf, ContentSummary inputSummary,
  3031. MapWork work, boolean finalMapRed) throws IOException {
  3032. long bytesPerReducer = conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
  3033. int maxReducers = conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
  3034. double samplePercentage = getHighestSamplePercentage(work);
  3035. long totalInputFileSize = getTotalInputFileSize(inputSummary, work, samplePercentage);
  3036. // if all inputs are sampled, we should shrink the size of reducers accordingly.
  3037. if (totalInputFileSize != inputSummary.getLength()) {
  3038. LOG.info("BytesPerReducer={} maxReducers={} estimated totalInputFileSize={}", bytesPerReducer,
  3039. maxReducers, totalInputFileSize);
  3040. } else {
  3041. LOG.info("BytesPerReducer={} maxReducers={} totalInputFileSize={}", bytesPerReducer,
  3042. maxReducers, totalInputFileSize);
  3043. }
  3044. // If this map reduce job writes final data to a table and bucketing is being inferred,
  3045. // and the user has configured Hive to do this, make sure the number of reducers is a
  3046. // power of two
  3047. boolean powersOfTwo = conf.getBoolVar(HiveConf.ConfVars.HIVE_INFER_BUCKET_SORT_NUM_BUCKETS_POWER_TWO) &&
  3048. finalMapRed && !work.getBucketedColsByDirectory().isEmpty();
  3049. return estimateReducers(totalInputFileSize, bytesPerReducer, maxReducers, powersOfTwo);
  3050. }
  3051. public static int estimateReducers(long totalInputFileSize, long bytesPerReducer,
  3052. int maxReducers, boolean powersOfTwo) {
  3053. double bytes = Math.max(totalInputFileSize, bytesPerReducer);
  3054. int reducers = (int) Math.ceil(bytes / bytesPerReducer);
  3055. reducers = Math.max(1, reducers);
  3056. reducers = Math.min(maxReducers, reducers);
  3057. int reducersLog = (int)(Math.log(reducers) / Math.log(2)) + 1;
  3058. int reducersPowerTwo = (int)Math.pow(2, reducersLog);
  3059. if (powersOfTwo) {
  3060. // If the original number of reducers was a power of two, use that
  3061. if (reducersPowerTwo / 2 == reducers) {
  3062. // nothing to do
  3063. } else if (reducersPowerTwo > maxReducers) {
  3064. // If the next power of two greater than the original number of reducers is greater
  3065. // than the max number of reducers, use the preceding power of two, which is strictly
  3066. // less than the original number of reducers and hence the max
  3067. reducers = reducersPowerTwo / 2;
  3068. } else {
  3069. // Otherwise use the smallest power of two greater than the original number of reducers
  3070. reducers = reducersPowerTwo;
  3071. }
  3072. }
  3073. return reducers;
  3074. }
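// Worked example (illustrative, not part of the original class), assuming
// bytesPerReducer = 256 MB (268435456) and maxReducers = 1009:
//   totalInputFileSize = 10240 MB -> ceil(10240 / 256) = 40 reducers
//   with powersOfTwo = true       -> rounded up to the next power of two: 64
//   totalInputFileSize = 1 MB     -> max(1 MB, 256 MB) / 256 MB = 1 reducer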
  3075. /**
  3076. * Computes the total input file size. If block sampling was used it will scale this
  3077. * value by the highest sample percentage (as an estimate for input).
  3078. *
  3079. * @param inputSummary
  3080. * @param work
  3081. * @param highestSamplePercentage
  3082. * @return estimated total input size for job
  3083. */
  3084. public static long getTotalInputFileSize (ContentSummary inputSummary, MapWork work,
  3085. double highestSamplePercentage) {
  3086. long totalInputFileSize = inputSummary.getLength();
  3087. if (MapUtils.isEmpty(work.getNameToSplitSample())) {
  3088. // If percentage block sampling wasn't used, we don't need to do any estimation
  3089. return totalInputFileSize;
  3090. }
  3091. if (highestSamplePercentage >= 0) {
  3092. totalInputFileSize = Math.min((long) (totalInputFileSize * (highestSamplePercentage / 100D))
  3093. , totalInputFileSize);
  3094. }
  3095. return totalInputFileSize;
  3096. }
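// Worked example (illustrative, not part of the original class): if the input summary
// reports 1,000,000 bytes and the highest block-sample percentage is 10, the estimated
// size is min(1,000,000 * 0.10, 1,000,000) = 100,000 bytes; with no split sampling the
// summary length is returned unchanged.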
  3097. /**
  3098. * Computes the total number of input files. If block sampling was used it will scale this
  3099. * value by the highest sample percentage (as an estimate for # input files).
  3100. *
  3101. * @param inputSummary
  3102. * @param work
  3103. * @param highestSamplePercentage
3104. * @return estimated total number of input files for the job
  3105. */
  3106. public static long getTotalInputNumFiles (ContentSummary inputSummary, MapWork work,
  3107. double highestSamplePercentage) {
  3108. long totalInputNumFiles = inputSummary.getFileCount();
  3109. if (MapUtils.isEmpty(work.getNameToSplitSample())) {
  3110. // If percentage block sampling wasn't used, we don't need to do any estimation
  3111. return totalInputNumFiles;
  3112. }
  3113. if (highestSamplePercentage >= 0) {
  3114. totalInputNumFiles = Math.min((long) (totalInputNumFiles * (highestSamplePercentage / 100D))
  3115. , totalInputNumFiles);
  3116. }
  3117. return totalInputNumFiles;
  3118. }
  3119. /**
3120. * Returns the highest sample percentage of any alias in the given MapWork, or -1 if any alias has no split sample.
  3121. */
  3122. public static double getHighestSamplePercentage (MapWork work) {
  3123. double highestSamplePercentage = 0;
  3124. for (String alias : work.getAliasToWork().keySet()) {
  3125. if (work.getNameToSplitSample().containsKey(alias)) {
  3126. Double rate = work.getNameToSplitSample().get(alias).getPercent();
  3127. if (rate != null && rate > highestSamplePercentage) {
  3128. highestSamplePercentage = rate;
  3129. }
  3130. } else {
  3131. highestSamplePercentage = -1;
  3132. break;
  3133. }
  3134. }
  3135. return highestSamplePercentage;
  3136. }
  3137. /**
  3138. * On Tez we're not creating dummy files when getting/setting input paths.
  3139. * We let Tez handle the situation. We're also setting the paths in the AM
  3140. * so we don't want to depend on scratch dir and context.
  3141. */
  3142. public static List<Path> getInputPathsTez(JobConf job, MapWork work) throws Exception {
  3143. String scratchDir = job.get(DagUtils.TEZ_TMP_DIR_KEY);
  3144. List<Path> paths = getInputPaths(job, work, new Path(scratchDir), null, true);
  3145. return paths;
  3146. }
  3147. /**
  3148. * Appends vertex name to specified counter name.
  3149. *
  3150. * @param counter counter to be appended with
  3151. * @param vertexName vertex name
  3152. * @return counter name with vertex name appended
  3153. */
  3154. public static String getVertexCounterName(String counter, String vertexName) {
3155. if (vertexName == null || vertexName.isEmpty()) {
3156. return counter;
3157. }
3158. return counter + "_" + vertexName.replace(" ", "_");
3159. }
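// Illustrative sketch (not part of the original class):
//   getVertexCounterName("RECORDS_OUT", "Map 1") -> "RECORDS_OUT_Map_1"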
  3160. /**
  3161. * Computes a list of all input paths needed to compute the given MapWork. All aliases
  3162. * are considered and a merged list of input paths is returned. If any input path points
  3163. * to an empty table or partition a dummy file in the scratch dir is instead created and
  3164. * added to the list. This is needed to avoid special casing the operator pipeline for
  3165. * these cases.
  3166. *
  3167. * @param job JobConf used to run the job
  3168. * @param work MapWork encapsulating the info about the task
  3169. * @param hiveScratchDir The tmp dir used to create dummy files if needed
  3170. * @param ctx Context object
  3171. * @return List of paths to process for the given MapWork
  3172. * @throws Exception
  3173. */
  3174. public static List<Path> getInputPaths(JobConf job, MapWork work, Path hiveScratchDir,
  3175. Context ctx, boolean skipDummy) throws Exception {
  3176. PerfLogger perfLogger = SessionState.getPerfLogger();
  3177. perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.INPUT_PATHS);
  3178. Set<Path> pathsProcessed = new HashSet<Path>();
  3179. List<Path> pathsToAdd = new LinkedList<Path>();
  3180. DriverState driverState = DriverState.getDriverState();
  3181. // AliasToWork contains all the aliases
  3182. Collection<String> aliasToWork = work.getAliasToWork().keySet();
  3183. if (!skipDummy) {
  3184. // ConcurrentModification otherwise if adding dummy.
  3185. aliasToWork = new ArrayList<>(aliasToWork);
  3186. }
  3187. for (String alias : aliasToWork) {
  3188. LOG.info("Processing alias {}", alias);
  3189. // The alias may not have any path
  3190. Collection<Map.Entry<Path, List<String>>> pathToAliases = work.getPathToAliases().entrySet();
  3191. if (!skipDummy) {
  3192. // ConcurrentModification otherwise if adding dummy.
  3193. pathToAliases = new ArrayList<>(pathToAliases);
  3194. }
  3195. boolean isEmptyTable = true;
  3196. boolean hasLogged = false;
  3197. for (Map.Entry<Path, List<String>> e : pathToAliases) {
  3198. if (driverState != null && driverState.isAborted()) {
  3199. throw new IOException("Operation is Canceled.");
  3200. }
  3201. Path file = e.getKey();
  3202. List<String> aliases = e.getValue();
  3203. if (aliases.contains(alias)) {
  3204. if (file != null) {
  3205. isEmptyTable = false;
  3206. } else {
  3207. LOG.warn("Found a null path for alias {}", alias);
  3208. continue;
  3209. }
  3210. // Multiple aliases can point to the same path - it should be
  3211. // processed only once
  3212. if (pathsProcessed.contains(file)) {
  3213. continue;
  3214. }
  3215. StringInternUtils.internUriStringsInPath(file);
  3216. pathsProcessed.add(file);
  3217. LOG.debug("Adding input file {}", file);
  3218. if (!hasLogged) {
  3219. hasLogged = true;
  3220. LOG.info("Adding {} inputs; the first input is {}",
  3221. work.getPathToAliases().size(), file);
  3222. }
  3223. pathsToAdd.add(file);
  3224. }
  3225. }
3226. // If the query references non-existent partitions,
3227. // we need to add an empty file; it is not acceptable to change the
3228. // operator tree.
  3229. // Consider the query:
  3230. // select * from (select count(1) from T union all select count(1) from
  3231. // T2) x;
  3232. // If T is empty and T2 contains 100 rows, the user expects: 0, 100 (2
  3233. // rows)
  3234. if (isEmptyTable && !skipDummy) {
  3235. pathsToAdd.add(createDummyFileForEmptyTable(job, work, hiveScratchDir, alias));
  3236. }
  3237. }
  3238. List<Path> finalPathsToAdd = new LinkedList<>();
  3239. int numExecutors = getMaxExecutorsForInputListing(job, pathsToAdd.size());
  3240. if (numExecutors > 1) {
  3241. ExecutorService pool = Executors.newFixedThreadPool(numExecutors,
  3242. new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Paths-%d").build());
  3243. finalPathsToAdd.addAll(getInputPathsWithPool(job, work, hiveScratchDir, ctx, skipDummy, pathsToAdd, pool));
  3244. } else {
  3245. for (final Path path : pathsToAdd) {
  3246. if (driverState != null && driverState.isAborted()) {
  3247. throw new IOException("Operation is Canceled.");
  3248. }
  3249. Path newPath = new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy).call();
  3250. updatePathForMapWork(newPath, work, path);
  3251. finalPathsToAdd.add(newPath);
  3252. }
  3253. }
  3254. perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.INPUT_PATHS);
  3255. return finalPathsToAdd;
  3256. }
  3257. @VisibleForTesting
  3258. static List<Path> getInputPathsWithPool(JobConf job, MapWork work, Path hiveScratchDir,
  3259. Context ctx, boolean skipDummy, List<Path> pathsToAdd,
  3260. ExecutorService pool) throws IOException, ExecutionException, InterruptedException {
  3261. DriverState driverState = DriverState.getDriverState();
  3262. List<Path> finalPathsToAdd = new ArrayList<>();
  3263. try {
  3264. Map<GetInputPathsCallable, Future<Path>> getPathsCallableToFuture = new LinkedHashMap<>();
  3265. for (final Path path : pathsToAdd) {
  3266. if (driverState != null && driverState.isAborted()) {
  3267. throw new IOException("Operation is Canceled.");
  3268. }
  3269. GetInputPathsCallable callable = new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy);
  3270. getPathsCallableToFuture.put(callable, pool.submit(callable));
  3271. }
  3272. pool.shutdown();
  3273. for (Map.Entry<GetInputPathsCallable, Future<Path>> future : getPathsCallableToFuture.entrySet()) {
  3274. if (driverState != null && driverState.isAborted()) {
  3275. throw new IOException("Operation is Canceled.");
  3276. }
  3277. Path newPath = future.getValue().get();
  3278. updatePathForMapWork(newPath, work, future.getKey().path);
  3279. finalPathsToAdd.add(newPath);
  3280. }
  3281. } finally {
  3282. pool.shutdownNow();
  3283. }
  3284. return finalPathsToAdd;
  3285. }
  3286. private static class GetInputPathsCallable implements Callable<Path> {
  3287. private final Path path;
  3288. private final JobConf job;
  3289. private final MapWork work;
  3290. private final Path hiveScratchDir;
  3291. private final Context ctx;
  3292. private final boolean skipDummy;
  3293. private GetInputPathsCallable(Path path, JobConf job, MapWork work, Path hiveScratchDir,
  3294. Context ctx, boolean skipDummy) {
  3295. this.path = path;
  3296. this.job = job;
  3297. this.work = work;
  3298. this.hiveScratchDir = hiveScratchDir;
  3299. this.ctx = ctx;
  3300. this.skipDummy = skipDummy;
  3301. }
  3302. @Override
  3303. public Path call() throws Exception {
  3304. if (!this.skipDummy && isEmptyPath(this.job, this.path, this.ctx)) {
  3305. return createDummyFileForEmptyPartition(this.path, this.job, this.work.getPathToPartitionInfo().get(this.path),
  3306. this.hiveScratchDir);
  3307. }
  3308. return this.path;
  3309. }
  3310. }
  3311. @SuppressWarnings({"rawtypes", "unchecked"})
  3312. private static Path createEmptyFile(Path hiveScratchDir,
  3313. HiveOutputFormat outFileFormat, JobConf job,
  3314. Properties props, boolean dummyRow)
  3315. throws IOException, InstantiationException, IllegalAccessException {
  3316. // create a dummy empty file in a new directory
  3317. String newDir = hiveScratchDir + Path.SEPARATOR + UUID.randomUUID().toString();
  3318. Path newPath = new Path(newDir);
  3319. FileSystem fs = newPath.getFileSystem(job);
  3320. fs.mkdirs(newPath);
3321. // Qualify the path against the file system. The user-configured path might contain the default port, which is
3322. // dropped in the file status. This makes sure that all paths that go into PathToPartitionInfo always match the
3323. // listed file status paths.
  3324. newPath = fs.makeQualified(newPath);
  3325. String newFile = newDir + Path.SEPARATOR + "emptyFile";
  3326. Path newFilePath = new Path(newFile);
  3327. RecordWriter recWriter = outFileFormat.getHiveRecordWriter(job, newFilePath,
  3328. Text.class, false, props, null);
  3329. if (dummyRow) {
3330. // Empty files are omitted by CombineHiveInputFormat.
3331. // For a metadata-only query, that effectively makes the partition columns disappear.
3332. // This could be fixed in other ways, but this seemed to be the easiest (HIVE-2955).
  3333. recWriter.write(new Text("empty")); // written via HiveIgnoreKeyTextOutputFormat
  3334. }
  3335. recWriter.close(false);
  3336. return StringInternUtils.internUriStringsInPath(newPath);
  3337. }
  3338. @SuppressWarnings("rawtypes")
  3339. private static Path createDummyFileForEmptyPartition(Path path, JobConf job, PartitionDesc partDesc,
  3340. Path hiveScratchDir) throws Exception {
  3341. String strPath = path.toString();
3342. // The input file does not exist; replace it with an empty file.
  3343. if (partDesc.getTableDesc().isNonNative()) {
  3344. // if this isn't a hive table we can't create an empty file for it.
  3345. return path;
  3346. }
  3347. Properties props = SerDeUtils.createOverlayedProperties(
  3348. partDesc.getTableDesc().getProperties(), partDesc.getProperties());
  3349. HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, partDesc);
  3350. boolean oneRow = partDesc.getInputFileFormatClass() == OneNullRowInputFormat.class;
  3351. Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, oneRow);
  3352. LOG.info("Changed input file {} to empty file {} ({})", strPath, newPath, oneRow);
  3353. return newPath;
  3354. }
  3355. private static void updatePathForMapWork(Path newPath, MapWork work, Path path) {
  3356. // update the work
  3357. if (!newPath.equals(path)) {
  3358. PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
  3359. work.addPathToAlias(newPath, work.getPathToAliases().get(path));
  3360. work.removePathToAlias(path);
  3361. work.removePathToPartitionInfo(path);
  3362. work.addPathToPartitionInfo(newPath, partDesc);
  3363. }
  3364. }
  3365. @SuppressWarnings("rawtypes")
  3366. private static Path createDummyFileForEmptyTable(JobConf job, MapWork work,
  3367. Path hiveScratchDir, String alias)
  3368. throws Exception {
  3369. TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
  3370. if (tableDesc.isNonNative()) {
3371. // if this isn't a native Hive table, we can't create an empty file for it.
  3372. return null;
  3373. }
  3374. Properties props = tableDesc.getProperties();
  3375. HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, tableDesc);
  3376. Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false);
  3377. LOG.info("Changed input file for alias {} to newPath", alias, newPath);
  3378. // update the work
  3379. Map<Path, List<String>> pathToAliases = work.getPathToAliases();
  3380. List<String> newList = new ArrayList<String>(1);
  3381. newList.add(alias);
  3382. pathToAliases.put(newPath, newList);
  3383. work.setPathToAliases(pathToAliases);
  3384. PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
  3385. work.addPathToPartitionInfo(newPath, pDesc);
  3386. return newPath;
  3387. }
  3388. private static final Path[] EMPTY_PATH = new Path[0];
  3389. /**
3390. * setInputPaths adds all the paths in the provided list to the JobConf object
  3391. * as input paths for the job.
  3392. *
  3393. * @param job
  3394. * @param pathsToAdd
  3395. */
  3396. public static void setInputPaths(JobConf job, List<Path> pathsToAdd) {
  3397. Path[] addedPaths = FileInputFormat.getInputPaths(job);
  3398. if (addedPaths == null) {
  3399. addedPaths = EMPTY_PATH;
  3400. }
  3401. Path[] combined = new Path[addedPaths.length + pathsToAdd.size()];
  3402. System.arraycopy(addedPaths, 0, combined, 0, addedPaths.length);
  3403. int i = 0;
  3404. for (Path p: pathsToAdd) {
  3405. combined[addedPaths.length + (i++)] = p;
  3406. }
  3407. FileInputFormat.setInputPaths(job, combined);
  3408. }
  3409. /**
  3410. * Set hive input format, and input format file if necessary.
  3411. */
  3412. public static void setInputAttributes(Configuration conf, MapWork mWork) {
  3413. HiveConf.ConfVars var = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") ?
  3414. HiveConf.ConfVars.HIVETEZINPUTFORMAT : HiveConf.ConfVars.HIVEINPUTFORMAT;
  3415. if (mWork.getInputformat() != null) {
  3416. HiveConf.setVar(conf, var, mWork.getInputformat());
  3417. }
  3418. // Intentionally overwrites anything the user may have put here
  3419. conf.setBoolean("hive.input.format.sorted", mWork.isInputFormatSorted());
  3420. }
  3421. /**
  3422. * Hive uses tmp directories to capture the output of each FileSinkOperator.
3423. * This method creates all necessary tmp directories for FileSinks in the MapWork.
  3424. *
  3425. * @param conf Used to get the right FileSystem
  3426. * @param mWork Used to find FileSinkOperators
  3427. * @throws IOException
  3428. */
  3429. public static void createTmpDirs(Configuration conf, MapWork mWork)
  3430. throws IOException {
  3431. Map<Path, List<String>> pa = mWork.getPathToAliases();
  3432. if (MapUtils.isNotEmpty(pa)) {
  3433. // common case: 1 table scan per map-work
  3434. // rare case: smb joins
  3435. HashSet<String> aliases = new HashSet<String>(1);
  3436. List<Operator<? extends OperatorDesc>> ops =
  3437. new ArrayList<Operator<? extends OperatorDesc>>();
  3438. for (List<String> ls : pa.values()) {
  3439. for (String a : ls) {
  3440. aliases.add(a);
  3441. }
  3442. }
  3443. for (String a : aliases) {
  3444. ops.add(mWork.getAliasToWork().get(a));
  3445. }
  3446. createTmpDirs(conf, ops);
  3447. }
  3448. }
  3449. /**
  3450. * Hive uses tmp directories to capture the output of each FileSinkOperator.
  3451. * This method creates all necessary tmp directories for FileSinks in the ReduceWork.
  3452. *
  3453. * @param conf Used to get the right FileSystem
  3454. * @param rWork Used to find FileSinkOperators
  3455. * @throws IOException
  3456. */
  3457. public static void createTmpDirs(Configuration conf, ReduceWork rWork)
  3458. throws IOException {
  3459. if (rWork == null) {
  3460. return;
  3461. }
  3462. List<Operator<? extends OperatorDesc>> ops
  3463. = new LinkedList<Operator<? extends OperatorDesc>>();
  3464. ops.add(rWork.getReducer());
  3465. createTmpDirs(conf, ops);
  3466. }
  3467. private static void createTmpDirs(Configuration conf,
  3468. List<Operator<? extends OperatorDesc>> ops) throws IOException {
  3469. while (!ops.isEmpty()) {
  3470. Operator<? extends OperatorDesc> op = ops.remove(0);
  3471. if (op instanceof FileSinkOperator) {
  3472. FileSinkDesc fdesc = ((FileSinkOperator) op).getConf();
  3473. if (fdesc.isMmTable() || fdesc.isDirectInsert()) {
  3474. // No need to create for MM tables, or ACID insert
  3475. continue;
  3476. }
  3477. Path tempDir = fdesc.getDirName();
  3478. if (tempDir != null) {
  3479. Path tempPath = Utilities.toTempPath(tempDir);
  3480. FileSystem fs = tempPath.getFileSystem(conf);
  3481. fs.mkdirs(tempPath);
  3482. }
  3483. }
  3484. if (op.getChildOperators() != null) {
  3485. ops.addAll(op.getChildOperators());
  3486. }
  3487. }
  3488. }
  3489. public static boolean createDirsWithPermission(Configuration conf, Path mkdirPath,
  3490. FsPermission fsPermission, boolean recursive) throws IOException {
  3491. String origUmask = null;
  3492. LOG.debug("Create dirs {} with permission {} recursive {}",
  3493. mkdirPath, fsPermission, recursive);
  3494. if (recursive) {
  3495. origUmask = conf.get(FsPermission.UMASK_LABEL);
  3496. // this umask is required because by default the hdfs mask is 022 resulting in
  3497. // all parents getting the fsPermission & !(022) permission instead of fsPermission
  3498. conf.set(FsPermission.UMASK_LABEL, "000");
  3499. }
  3500. FileSystem fs = ShimLoader.getHadoopShims().getNonCachedFileSystem(mkdirPath.toUri(), conf);
  3501. boolean retval = false;
  3502. try {
  3503. retval = fs.mkdirs(mkdirPath, fsPermission);
  3504. resetUmaskInConf(conf, recursive, origUmask);
  3505. } catch (IOException ioe) {
  3506. resetUmaskInConf(conf, recursive, origUmask);
  3507. throw ioe;
  3508. } finally {
  3509. IOUtils.closeStream(fs);
  3510. }
  3511. return retval;
  3512. }
  3513. private static void resetUmaskInConf(Configuration conf, boolean unsetUmask, String origUmask) {
  3514. if (unsetUmask) {
  3515. if (origUmask != null) {
  3516. conf.set(FsPermission.UMASK_LABEL, origUmask);
  3517. } else {
  3518. conf.unset(FsPermission.UMASK_LABEL);
  3519. }
  3520. }
  3521. }
  3522. /**
  3523. * Returns true if a plan is both configured for vectorized execution
  3524. * and the node is vectorized.
  3525. *
  3526. * The plan may be configured for vectorization
3527. * but vectorization may be disallowed, e.g. for FetchOperator execution.
  3528. */
  3529. public static boolean getIsVectorized(Configuration conf) {
  3530. if (conf.get(VECTOR_MODE) != null) {
  3531. // this code path is necessary, because with HS2 and client
  3532. // side split generation we end up not finding the map work.
  3533. // This is because of thread local madness (tez split
  3534. // generation is multi-threaded - HS2 plan cache uses thread
  3535. // locals).
  3536. return
  3537. conf.getBoolean(VECTOR_MODE, false);
  3538. } else {
  3539. if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) &&
  3540. Utilities.getPlanPath(conf) != null) {
  3541. MapWork mapWork = Utilities.getMapWork(conf);
  3542. return mapWork.getVectorMode();
  3543. } else {
  3544. return false;
  3545. }
  3546. }
  3547. }
  3548. public static boolean getIsVectorized(Configuration conf, MapWork mapWork) {
  3549. return HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) &&
  3550. mapWork.getVectorMode();
  3551. }
  3552. /**
  3553. * @param conf
  3554. * @return the configured VectorizedRowBatchCtx for a MapWork task.
  3555. */
  3556. public static VectorizedRowBatchCtx getVectorizedRowBatchCtx(Configuration conf) {
  3557. VectorizedRowBatchCtx result = null;
  3558. if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) &&
  3559. Utilities.getPlanPath(conf) != null) {
  3560. MapWork mapWork = Utilities.getMapWork(conf);
  3561. if (mapWork != null && mapWork.getVectorMode()) {
  3562. result = mapWork.getVectorizedRowBatchCtx();
  3563. }
  3564. }
  3565. return result;
  3566. }
  3567. public static void clearWorkMapForConf(Configuration conf) {
  3568. // Remove cached query plans for the current query only
  3569. Path mapPath = getPlanPath(conf, MAP_PLAN_NAME);
  3570. Path reducePath = getPlanPath(conf, REDUCE_PLAN_NAME);
  3571. if (mapPath != null) {
  3572. gWorkMap.get(conf).remove(mapPath);
  3573. }
  3574. if (reducePath != null) {
  3575. gWorkMap.get(conf).remove(reducePath);
  3576. }
  3577. // TODO: should this also clean merge work?
  3578. }
  3579. public static void clearWorkMap(Configuration conf) {
  3580. gWorkMap.get(conf).clear();
  3581. }
  3582. /**
  3583. * Skip header lines in the table file when reading the record.
  3584. *
  3585. * @param currRecReader
  3586. * Record reader.
  3587. *
  3588. * @param headerCount
  3589. * Header line number of the table files.
  3590. *
  3591. * @param key
  3592. * Key of current reading record.
  3593. *
  3594. * @param value
  3595. * Value of current reading record.
  3596. *
3597. * @return true if the header lines were skipped and the file may still have records left;
3598. * false if the end of the file was reached while skipping headers.
  3599. */
  3600. public static <K, V> boolean skipHeader(RecordReader<K, V> currRecReader, int headerCount, K key, V value)
  3601. throws IOException {
  3602. while (headerCount > 0) {
  3603. if (!currRecReader.next(key, value)) {
  3604. return false;
  3605. }
  3606. headerCount--;
  3607. }
  3608. return true;
  3609. }
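// Illustrative usage sketch (not part of the original class): skip the configured header
// lines once before the normal read loop. The reader variable is hypothetical.
//
//   org.apache.hadoop.io.LongWritable key = reader.createKey();
//   org.apache.hadoop.io.Text value = reader.createValue();
//   if (Utilities.skipHeader(reader, headerCount, key, value)) {
//     while (reader.next(key, value)) {
//       // process the current record
//     }
//   }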
  3610. /**
  3611. * Get header line count for a table.
  3612. *
  3613. * @param table
  3614. * Table description for target table.
  3615. *
  3616. */
  3617. public static int getHeaderCount(TableDesc table) throws IOException {
  3618. int headerCount;
  3619. try {
  3620. headerCount =
  3621. getHeaderOrFooterCount(table, serdeConstants.HEADER_COUNT);
  3622. } catch (NumberFormatException nfe) {
  3623. throw new IOException(nfe);
  3624. }
  3625. return headerCount;
  3626. }
  3627. /**
  3628. * Get footer line count for a table.
  3629. *
  3630. * @param table
  3631. * Table description for target table.
  3632. *
  3633. * @param job
  3634. * Job configuration for current job.
  3635. */
  3636. public static int getFooterCount(TableDesc table, JobConf job) throws IOException {
  3637. int footerCount;
  3638. try {
  3639. footerCount =
  3640. getHeaderOrFooterCount(table, serdeConstants.FOOTER_COUNT);
  3641. if (footerCount > HiveConf.getIntVar(job, HiveConf.ConfVars.HIVE_FILE_MAX_FOOTER)) {
  3642. throw new IOException("footer number exceeds the limit defined in hive.file.max.footer");
  3643. }
  3644. } catch (NumberFormatException nfe) {
  3645. // Footer line number must be set as an integer.
  3646. throw new IOException(nfe);
  3647. }
  3648. return footerCount;
  3649. }
  3650. private static int getHeaderOrFooterCount(TableDesc table,
  3651. String propertyName) {
  3652. int count =
  3653. Integer.parseInt(table.getProperties().getProperty(propertyName, "0"));
  3654. if (count > 0 && table.getInputFileFormatClass() != null
  3655. && !TextInputFormat.class
  3656. .isAssignableFrom(table.getInputFileFormatClass())) {
  3657. LOG.warn(propertyName
  3658. + " is only valid for TextInputFormat, ignoring the value.");
  3659. count = 0;
  3660. }
  3661. return count;
  3662. }
  3663. /**
  3664. * Convert path to qualified path.
  3665. *
  3666. * @param conf
  3667. * Hive configuration.
  3668. * @param path
  3669. * Path to convert.
  3670. * @return Qualified path
  3671. */
  3672. public static String getQualifiedPath(HiveConf conf, Path path) throws HiveException {
  3673. FileSystem fs;
  3674. if (path == null) {
  3675. return null;
  3676. }
  3677. try {
  3678. fs = path.getFileSystem(conf);
  3679. return fs.makeQualified(path).toString();
  3680. }
  3681. catch (IOException e) {
  3682. throw new HiveException(e);
  3683. }
  3684. }
  3685. /**
  3686. * Checks if the current HiveServer2 logging operation level is &gt;= PERFORMANCE.
  3687. * @param conf Hive configuration.
  3688. * @return true if current HiveServer2 logging operation level is &gt;= PERFORMANCE.
  3689. * Else, false.
  3690. */
  3691. public static boolean isPerfOrAboveLogging(HiveConf conf) {
  3692. String loggingLevel = conf.getVar(HiveConf.ConfVars.HIVE_SERVER2_LOGGING_OPERATION_LEVEL);
  3693. return conf.getBoolVar(HiveConf.ConfVars.HIVE_SERVER2_LOGGING_OPERATION_ENABLED) &&
  3694. (loggingLevel.equalsIgnoreCase("PERFORMANCE") || loggingLevel.equalsIgnoreCase("VERBOSE"));
  3695. }
  3696. /**
3697. * Returns the full path to the Jar containing the class. It always returns a JAR.
  3698. *
  3699. * @param klass
  3700. * class.
  3701. *
  3702. * @return path to the Jar containing the class.
  3703. */
  3704. @SuppressWarnings("rawtypes")
  3705. public static String jarFinderGetJar(Class klass) {
  3706. Preconditions.checkNotNull(klass, "klass");
  3707. ClassLoader loader = klass.getClassLoader();
  3708. if (loader != null) {
  3709. String class_file = klass.getName().replaceAll("\\.", "/") + ".class";
  3710. try {
  3711. for (Enumeration itr = loader.getResources(class_file); itr.hasMoreElements();) {
  3712. URL url = (URL) itr.nextElement();
  3713. String path = url.getPath();
  3714. if (path.startsWith("file:")) {
  3715. path = path.substring("file:".length());
  3716. }
  3717. path = URLDecoder.decode(path, "UTF-8");
  3718. if ("jar".equals(url.getProtocol())) {
  3719. path = URLDecoder.decode(path, "UTF-8");
  3720. return path.replaceAll("!.*$", "");
  3721. }
  3722. }
  3723. } catch (IOException e) {
  3724. throw new RuntimeException(e);
  3725. }
  3726. }
  3727. return null;
  3728. }
  3729. /**
3730. * Sets up the job so that all jars containing the given classes are shipped to the cluster.
  3731. * @param conf jobConf instance to setup
  3732. * @param classes the classes to look in jars for
  3733. * @throws IOException
  3734. */
  3735. public static void addDependencyJars(Configuration conf, Class<?>... classes)
  3736. throws IOException {
  3737. FileSystem localFs = FileSystem.getLocal(conf);
  3738. Set<String> jars = new HashSet<>(conf.getStringCollection("tmpjars"));
  3739. for (Class<?> clazz : classes) {
  3740. if (clazz == null) {
  3741. continue;
  3742. }
  3743. final String path = Utilities.jarFinderGetJar(clazz);
  3744. if (path == null) {
  3745. throw new RuntimeException("Could not find jar for class " + clazz +
  3746. " in order to ship it to the cluster.");
  3747. }
  3748. if (!localFs.exists(new Path(path))) {
  3749. throw new RuntimeException("Could not validate jar file " + path + " for class " + clazz);
  3750. }
  3751. jars.add(path);
  3752. }
  3753. if (jars.isEmpty()) {
  3754. return;
  3755. }
  3756. //noinspection ToArrayCallWithZeroLengthArrayArgument
  3757. conf.set("tmpjars", org.apache.hadoop.util.StringUtils.arrayToString(jars.toArray(new String[jars.size()])));
  3758. }
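// Illustrative usage sketch (not part of the original class); the classes listed here
// are arbitrary examples of dependencies to ship with the job:
//
//   JobConf jobConf = new JobConf(conf);
//   Utilities.addDependencyJars(jobConf, com.google.common.base.Preconditions.class,
//       org.apache.commons.collections.MapUtils.class);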
  3759. public static int getDPColOffset(FileSinkDesc conf) {
  3760. if (conf.getWriteType() == AcidUtils.Operation.DELETE) {
  3761. // For deletes, there is only ROW__ID in non-partitioning, non-bucketing columns.
  3762. //See : UpdateDeleteSemanticAnalyzer::reparseAndSuperAnalyze() for details.
  3763. return 1;
  3764. } else if (conf.getWriteType() == AcidUtils.Operation.UPDATE) {
  3765. // For updates, ROW__ID is an extra column at index 0.
  3766. //See : UpdateDeleteSemanticAnalyzer::reparseAndSuperAnalyze() for details.
  3767. return getColumnNames(conf.getTableInfo().getProperties()).size() + 1;
  3768. } else {
  3769. return getColumnNames(conf.getTableInfo().getProperties()).size();
  3770. }
  3771. }
  3772. public static List<String> getStatsTmpDirs(BaseWork work, Configuration conf) {
  3773. List<String> statsTmpDirs = new ArrayList<>();
  3774. if (!StatsSetupConst.StatDB.fs.name().equalsIgnoreCase(HiveConf.getVar(conf, ConfVars.HIVESTATSDBCLASS))) {
  3775. // no-op for non-fs stats collection
  3776. return statsTmpDirs;
  3777. }
3778. // if it's an auto-stats gather for inserts or CTAS, the stats dir will be in the FileSink
  3779. Set<Operator<? extends OperatorDesc>> ops = work.getAllLeafOperators();
  3780. if (work instanceof MapWork) {
3781. // if it's an analyze statement, the stats dir will be in the TableScan
  3782. ops.addAll(work.getAllRootOperators());
  3783. }
  3784. for (Operator<? extends OperatorDesc> op : ops) {
  3785. OperatorDesc desc = op.getConf();
  3786. String statsTmpDir = null;
  3787. if (desc instanceof IStatsGatherDesc) {
  3788. statsTmpDir = ((IStatsGatherDesc) desc).getTmpStatsDir();
  3789. }
  3790. if (statsTmpDir != null && !statsTmpDir.isEmpty()) {
  3791. statsTmpDirs.add(statsTmpDir);
  3792. }
  3793. }
  3794. return statsTmpDirs;
  3795. }
  3796. public static boolean isSchemaEvolutionEnabled(Configuration conf, boolean isAcid) {
  3797. return isAcid || HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION);
  3798. }
  3799. public static boolean isInputFileFormatSelfDescribing(PartitionDesc pd) {
  3800. Class<?> inputFormatClass = pd.getInputFileFormatClass();
  3801. return SelfDescribingInputFormatInterface.class.isAssignableFrom(inputFormatClass);
  3802. }
  3803. public static boolean isInputFileFormatVectorized(PartitionDesc pd) {
  3804. Class<?> inputFormatClass = pd.getInputFileFormatClass();
  3805. return VectorizedInputFormatInterface.class.isAssignableFrom(inputFormatClass);
  3806. }
  3807. public static Collection<Class<?>> getClassNamesFromConfig(HiveConf hiveConf, ConfVars confVar) {
  3808. String[] classNames = org.apache.hadoop.util.StringUtils.getStrings(HiveConf.getVar(hiveConf,
  3809. confVar));
  3810. if (classNames == null) {
  3811. return Collections.emptyList();
  3812. }
  3813. Collection<Class<?>> classList = new ArrayList<Class<?>>(classNames.length);
  3814. for (String className : classNames) {
  3815. if (StringUtils.isEmpty(className)) {
  3816. continue;
  3817. }
  3818. try {
  3819. classList.add(Class.forName(className));
  3820. } catch (Exception ex) {
  3821. LOG.warn("Cannot create class {} for {} checks", className, confVar.varname);
  3822. }
  3823. }
  3824. return classList;
  3825. }
  3826. public static void addSchemaEvolutionToTableScanOperator(Table table,
  3827. TableScanOperator tableScanOp) {
  3828. String colNames = MetaStoreUtils.getColumnNamesFromFieldSchema(table.getSd().getCols());
  3829. String colTypes = MetaStoreUtils.getColumnTypesFromFieldSchema(table.getSd().getCols());
  3830. tableScanOp.setSchemaEvolution(colNames, colTypes);
  3831. }
  3832. public static void addSchemaEvolutionToTableScanOperator(StructObjectInspector structOI,
  3833. TableScanOperator tableScanOp) {
  3834. String colNames = ObjectInspectorUtils.getFieldNames(structOI);
  3835. String colTypes = ObjectInspectorUtils.getFieldTypes(structOI);
  3836. tableScanOp.setSchemaEvolution(colNames, colTypes);
  3837. }
  3838. public static void unsetSchemaEvolution(Configuration conf) {
  3839. conf.unset(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
  3840. conf.unset(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);
  3841. }
  3842. public static void addTableSchemaToConf(Configuration conf,
  3843. TableScanOperator tableScanOp) {
  3844. String schemaEvolutionColumns = tableScanOp.getSchemaEvolutionColumns();
  3845. if (schemaEvolutionColumns != null) {
  3846. conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, tableScanOp.getSchemaEvolutionColumns());
  3847. conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, tableScanOp.getSchemaEvolutionColumnsTypes());
  3848. } else {
  3849. LOG.info("schema.evolution.columns and schema.evolution.columns.types not available");
  3850. }
  3851. }
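// Illustrative effect (editor's sketch; the key names are taken from the log message above,
// the column values are hypothetical): for a table scan whose schema-evolution columns are
// (id, name), the call above populates the job configuration keys
//
//   schema.evolution.columns        -> the column name list of the scan
//   schema.evolution.columns.types  -> the corresponding column type list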
  3852. /**
  3853. * Create row key and value object inspectors for reduce vectorization.
  3854. * The row object inspector used by ReduceWork needs to be a **standard**
  3855. * struct object inspector, not just any struct object inspector.
  3856. * @param keyInspector
  3857. * @param valueInspector
3858. * @return a standard struct object inspector over the combined key and value fields
  3859. * @throws HiveException
  3860. */
  3861. public static StandardStructObjectInspector constructVectorizedReduceRowOI(
  3862. StructObjectInspector keyInspector, StructObjectInspector valueInspector)
  3863. throws HiveException {
  3864. ArrayList<String> colNames = new ArrayList<String>();
  3865. ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
  3866. List<? extends StructField> fields = keyInspector.getAllStructFieldRefs();
  3867. for (StructField field: fields) {
  3868. colNames.add(Utilities.ReduceField.KEY.toString() + '.' + field.getFieldName());
  3869. ois.add(field.getFieldObjectInspector());
  3870. }
  3871. fields = valueInspector.getAllStructFieldRefs();
  3872. for (StructField field: fields) {
  3873. colNames.add(Utilities.ReduceField.VALUE.toString() + '.' + field.getFieldName());
  3874. ois.add(field.getFieldObjectInspector());
  3875. }
  3876. StandardStructObjectInspector rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(colNames, ois);
  3877. return rowObjectInspector;
  3878. }
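// Illustrative example (editor's sketch; the field names are hypothetical): if the key
// inspector has a single field "reducesinkkey0" and the value inspector has fields "_col0"
// and "_col1", the standard struct OI returned above has the columns
//
//   KEY.reducesinkkey0, VALUE._col0, VALUE._col1
//
// i.e. every field name is prefixed with the ReduceField ("KEY" or "VALUE") it came from.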
  3879. public static String humanReadableByteCount(long bytes) {
  3880. int unit = 1000; // use binary units instead?
  3881. if (bytes < unit) {
  3882. return bytes + "B";
  3883. }
  3884. int exp = (int) (Math.log(bytes) / Math.log(unit));
  3885. String suffix = "KMGTPE".charAt(exp-1) + "";
  3886. return String.format("%.2f%sB", bytes / Math.pow(unit, exp), suffix);
  3887. }
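// Illustrative usage (editor's sketch): with the decimal (SI) scaling above and an English
// default locale,
//
//   humanReadableByteCount(123L);       // "123B"
//   humanReadableByteCount(1_500L);     // "1.50KB"
//   humanReadableByteCount(1_000_000L); // "1.00MB"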
  3888. private static final String MANIFEST_EXTENSION = ".manifest";
  3889. private static void tryDelete(FileSystem fs, Path path) {
  3890. try {
  3891. fs.delete(path, true);
  3892. } catch (IOException ex) {
  3893. LOG.error("Failed to delete {}", path, ex);
  3894. }
  3895. }
  3896. public static Path[] getDirectInsertDirectoryCandidates(FileSystem fs, Path path, int dpLevels,
  3897. PathFilter filter, long writeId, int stmtId, Configuration conf,
  3898. Boolean isBaseDir, AcidUtils.Operation acidOperation) throws IOException {
  3899. int skipLevels = dpLevels;
  3900. if (filter == null) {
  3901. filter = new AcidUtils.IdPathFilter(writeId, stmtId);
  3902. }
  3903. if (skipLevels == 0) {
  3904. return statusToPath(fs.listStatus(path, filter));
  3905. }
  3906. // TODO: for some reason, globStatus doesn't work for masks like "...blah/*/delta_0000007_0000007*"
  3907. // the last star throws it off. So, for now, if stmtId is missing use recursion.
  3908. // For the same reason, we cannot use it if we don't know isBaseDir. Currently, we don't
  3909. // /want/ to know isBaseDir because that is error prone; so, it ends up never being used.
  3910. if (stmtId < 0 || isBaseDir == null
  3911. || (HiveConf.getBoolVar(conf, ConfVars.HIVE_MM_AVOID_GLOBSTATUS_ON_S3) && isS3(fs))) {
  3912. return getDirectInsertDirectoryCandidatesRecursive(fs, path, skipLevels, filter);
  3913. }
  3914. return getDirectInsertDirectoryCandidatesGlobStatus(fs, path, skipLevels, filter, writeId, stmtId, isBaseDir,
  3915. acidOperation);
  3916. }
  3917. private static boolean isS3(FileSystem fs) {
  3918. try {
  3919. return "s3a".equalsIgnoreCase(fs.getScheme());
  3920. } catch (UnsupportedOperationException ex) {
  3921. // Some FS-es do not implement getScheme, e.g. ProxyLocalFileSystem.
  3922. return false;
  3923. }
  3924. }
  3925. private static Path[] statusToPath(FileStatus[] statuses) {
  3926. if (statuses == null) {
  3927. return null;
  3928. }
  3929. Path[] paths = new Path[statuses.length];
  3930. for (int i = 0; i < statuses.length; ++i) {
  3931. paths[i] = statuses[i].getPath();
  3932. }
  3933. return paths;
  3934. }
  3935. private static Path[] getDirectInsertDirectoryCandidatesRecursive(FileSystem fs,
  3936. Path path, int skipLevels, PathFilter filter) throws IOException {
  3937. String lastRelDir = null;
  3938. HashSet<Path> results = new HashSet<Path>();
  3939. String relRoot = Path.getPathWithoutSchemeAndAuthority(path).toString();
  3940. if (!relRoot.endsWith(Path.SEPARATOR)) {
  3941. relRoot += Path.SEPARATOR;
  3942. }
  3943. RemoteIterator<LocatedFileStatus> allFiles = fs.listFiles(path, true);
  3944. while (allFiles.hasNext()) {
  3945. LocatedFileStatus lfs = allFiles.next();
  3946. Path lfsPath = lfs.getPath();
  3947. Path dirPath = Path.getPathWithoutSchemeAndAuthority(lfsPath);
  3948. String dir = dirPath.toString();
  3949. if (!dir.startsWith(relRoot)) {
  3950. throw new IOException("Path " + lfsPath + " is not under " + relRoot
  3951. + " (when shortened to " + dir + ")");
  3952. }
  3953. String subDir = dir.substring(relRoot.length());
  3954. Utilities.FILE_OP_LOGGER.trace("Looking at {} from {}", subDir, lfsPath);
3955. // If the listing is sorted, this skips the remaining files under a directory we already processed.
  3956. if (lastRelDir != null && subDir.startsWith(lastRelDir)) {
  3957. continue;
  3958. }
  3959. int startIx = skipLevels > 0 ? -1 : 0;
  3960. for (int i = 0; i < skipLevels; ++i) {
  3961. startIx = subDir.indexOf(Path.SEPARATOR_CHAR, startIx + 1);
  3962. if (startIx == -1) {
  3963. Utilities.FILE_OP_LOGGER.info("Expected level of nesting ({}) is not "
  3964. + " present in {} (from {})", skipLevels, subDir, lfsPath);
  3965. break;
  3966. }
  3967. }
  3968. if (startIx == -1) {
  3969. continue;
  3970. }
  3971. int endIx = subDir.indexOf(Path.SEPARATOR_CHAR, startIx + 1);
  3972. if (endIx == -1) {
  3973. Utilities.FILE_OP_LOGGER.info("Expected level of nesting ({}) is not present in"
  3974. + " {} (from {})", (skipLevels + 1), subDir, lfsPath);
  3975. continue;
  3976. }
  3977. lastRelDir = subDir = subDir.substring(0, endIx);
  3978. Path candidate = new Path(relRoot, subDir);
  3979. if (!filter.accept(candidate)) {
  3980. continue;
  3981. }
  3982. results.add(fs.makeQualified(candidate));
  3983. }
  3984. return results.toArray(new Path[results.size()]);
  3985. }
  3986. private static Path[] getDirectInsertDirectoryCandidatesGlobStatus(FileSystem fs, Path path, int skipLevels,
  3987. PathFilter filter, long writeId, int stmtId, boolean isBaseDir, AcidUtils.Operation acidOperation) throws IOException {
  3988. StringBuilder sb = new StringBuilder(path.toUri().getPath());
  3989. for (int i = 0; i < skipLevels; i++) {
  3990. sb.append(Path.SEPARATOR).append('*');
  3991. }
  3992. if (stmtId < 0) {
  3993. // Note: this does not work.
  3994. // sb.append(Path.SEPARATOR).append(AcidUtils.deltaSubdir(writeId, writeId)).append("_*");
  3995. throw new AssertionError("GlobStatus should not be called without a statement ID");
  3996. } else {
  3997. String deltaSubDir = AcidUtils.baseOrDeltaSubdir(isBaseDir, writeId, writeId, stmtId);
  3998. if (AcidUtils.Operation.DELETE.equals(acidOperation)) {
  3999. deltaSubDir = AcidUtils.deleteDeltaSubdir(writeId, writeId, stmtId);
  4000. }
  4001. if (AcidUtils.Operation.UPDATE.equals(acidOperation)) {
  4002. String deltaPostFix = deltaSubDir.replace("delta", "");
  4003. deltaSubDir = "{delete_delta,delta}" + deltaPostFix;
  4004. }
  4005. sb.append(Path.SEPARATOR).append(deltaSubDir);
  4006. }
  4007. Path pathPattern = new Path(path, sb.toString());
  4008. return statusToPath(fs.globStatus(pathPattern, filter));
  4009. }
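// Illustrative example (editor's sketch, assuming the usual zero-padded naming produced by
// AcidUtils, e.g. delta_0000007_0000007_0000): for skipLevels = 2, writeId = 7, stmtId = 0,
// isBaseDir = false and an INSERT operation, the glob pattern built above looks roughly like
//
//   <path>/*/*/delta_0000007_0000007_0000
//
// and for an UPDATE the last component becomes {delete_delta,delta}_0000007_0000007_0000, so
// both the delta and the matching delete-delta directories are picked up in one globStatus call.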
  4010. private static void tryDeleteAllDirectInsertFiles(FileSystem fs, Path specPath, Path manifestDir,
  4011. int dpLevels, int lbLevels, AcidUtils.IdPathFilter filter, long writeId, int stmtId,
  4012. Configuration conf, AcidUtils.Operation acidOperation) throws IOException {
  4013. Path[] files = getDirectInsertDirectoryCandidates(
  4014. fs, specPath, dpLevels, filter, writeId, stmtId, conf, null, acidOperation);
  4015. if (files != null) {
  4016. for (Path path : files) {
  4017. Utilities.FILE_OP_LOGGER.info("Deleting {} on failure", path);
  4018. tryDelete(fs, path);
  4019. }
  4020. }
  4021. Utilities.FILE_OP_LOGGER.info("Deleting {} on failure", manifestDir);
  4022. fs.delete(manifestDir, true);
  4023. }
  4024. public static void writeCommitManifest(List<Path> commitPaths, Path specPath, FileSystem fs,
  4025. String taskId, Long writeId, int stmtId, String unionSuffix, boolean isInsertOverwrite,
  4026. boolean hasDynamicPartitions, Set<String> dynamicPartitionSpecs, String staticSpec, boolean isDelete) throws HiveException {
4027. // When doing a multi-statement insert overwrite with dynamic partitioning,
4028. // the partition information is written to the manifest file.
4029. // This is needed because in this use case each FileSinkOperator should clean up
4030. // only the partition directories it wrote itself and must not
4031. // clean up the partition directories written by the other FileSinkOperators.
4032. // If a statement of the insert overwrite query doesn't produce any data,
4033. // a manifest file is still written; otherwise the missing manifest file
4034. // would result in a table-level clean-up which could delete the data written by
4035. // the other FileSinkOperators. (For further details see HIVE-23114.)
  4036. boolean writeDynamicPartitionsToManifest = hasDynamicPartitions;
  4037. if (commitPaths.isEmpty() && !writeDynamicPartitionsToManifest) {
  4038. return;
  4039. }
  4040. // We assume one FSOP per task (per specPath), so we create it in specPath.
  4041. Path manifestPath = getManifestDir(specPath, writeId, stmtId, unionSuffix, isInsertOverwrite, staticSpec, isDelete);
  4042. manifestPath = new Path(manifestPath, taskId + MANIFEST_EXTENSION);
  4043. Utilities.FILE_OP_LOGGER.info("Writing manifest to {} with {}", manifestPath, commitPaths);
  4044. try {
  4045. // Don't overwrite the manifest... should fail if we have collisions.
  4046. try (FSDataOutputStream out = fs.create(manifestPath, false)) {
  4047. if (out == null) {
  4048. throw new HiveException("Failed to create manifest at " + manifestPath);
  4049. }
  4050. if (writeDynamicPartitionsToManifest) {
  4051. out.writeInt(dynamicPartitionSpecs.size());
  4052. for (String dynamicPartitionSpec : dynamicPartitionSpecs) {
  4053. out.writeUTF(dynamicPartitionSpec.toString());
  4054. }
  4055. }
  4056. out.writeInt(commitPaths.size());
  4057. for (Path path : commitPaths) {
  4058. out.writeUTF(path.toString());
  4059. }
  4060. }
  4061. } catch (IOException e) {
  4062. throw new HiveException(e);
  4063. }
  4064. }
  4065. private static Path getManifestDir(Path specPath, long writeId, int stmtId, String unionSuffix,
  4066. boolean isInsertOverwrite, String staticSpec, boolean isDelete) {
  4067. Path manifestRoot = specPath;
  4068. if (staticSpec != null) {
  4069. String tableRoot = specPath.toString();
  4070. tableRoot = tableRoot.substring(0, tableRoot.length() - staticSpec.length());
  4071. manifestRoot = new Path(tableRoot);
  4072. }
  4073. String deltaDir = AcidUtils.baseOrDeltaSubdir(isInsertOverwrite, writeId, writeId, stmtId);
  4074. if (isDelete) {
  4075. deltaDir = AcidUtils.deleteDeltaSubdir(writeId, writeId, stmtId);
  4076. }
  4077. Path manifestPath = new Path(manifestRoot, "_tmp." + deltaDir);
  4078. if (isInsertOverwrite) {
  4079. // When doing a multi-statement insert overwrite query with dynamic partitioning, the
  4080. // generated manifest directory is the same for each FileSinkOperator.
4081. // To resolve this name collision, the manifest path is extended with the statement ID.
  4082. manifestPath = new Path(manifestPath + "_" + stmtId);
  4083. }
  4084. return (unionSuffix == null) ? manifestPath : new Path(manifestPath, unionSuffix);
  4085. }
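// Illustrative example (editor's sketch, assuming AcidUtils' zero-padded delta naming): for
// writeId = 5, stmtId = 0, no union suffix, no static partition spec and isInsertOverwrite = false,
// the manifest directory resolves to something like
//
//   <specPath>/_tmp.delta_0000005_0000005_0000
//
// For an insert overwrite, "_" + stmtId is appended so that the FileSinkOperators of a
// multi-statement insert overwrite query do not collide on the same manifest directory.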
  4086. public static final class MissingBucketsContext {
  4087. public final TableDesc tableInfo;
  4088. public final int numBuckets;
  4089. public final boolean isCompressed;
  4090. public MissingBucketsContext(TableDesc tableInfo, int numBuckets, boolean isCompressed) {
  4091. this.tableInfo = tableInfo;
  4092. this.numBuckets = numBuckets;
  4093. this.isCompressed = isCompressed;
  4094. }
  4095. }
  4096. public static void handleDirectInsertTableFinalPath(Path specPath, String unionSuffix, Configuration hconf,
  4097. boolean success, int dpLevels, int lbLevels, MissingBucketsContext mbc, long writeId, int stmtId,
  4098. Reporter reporter, boolean isMmTable, boolean isMmCtas, boolean isInsertOverwrite, boolean isDirectInsert,
  4099. String staticSpec, AcidUtils.Operation acidOperation, FileSinkDesc conf) throws IOException, HiveException {
  4100. FileSystem fs = specPath.getFileSystem(hconf);
  4101. boolean isDelete = AcidUtils.Operation.DELETE.equals(acidOperation);
  4102. Path manifestDir = getManifestDir(specPath, writeId, stmtId, unionSuffix, isInsertOverwrite, staticSpec, isDelete);
  4103. if (!success) {
  4104. AcidUtils.IdPathFilter filter = new AcidUtils.IdPathFilter(writeId, stmtId);
  4105. tryDeleteAllDirectInsertFiles(fs, specPath, manifestDir, dpLevels, lbLevels,
  4106. filter, writeId, stmtId, hconf, acidOperation);
  4107. return;
  4108. }
  4109. Utilities.FILE_OP_LOGGER.debug("Looking for manifests in: {} ({})", manifestDir, writeId);
  4110. List<Path> manifests = new ArrayList<>();
  4111. try {
  4112. FileStatus[] manifestFiles = fs.listStatus(manifestDir);
  4113. manifests = selectManifestFiles(manifestFiles);
  4114. } catch (FileNotFoundException ex) {
  4115. Utilities.FILE_OP_LOGGER.info("No manifests found in directory {} - query produced no output", manifestDir);
  4116. manifestDir = null;
  4117. if (!fs.exists(specPath)) {
  4118. // Empty insert to new partition
  4119. fs.mkdirs(specPath);
  4120. }
  4121. }
  4122. Map<String, List<Path>> dynamicPartitionSpecs = new HashMap<>();
  4123. Set<Path> committed = Collections.newSetFromMap(new ConcurrentHashMap<>());
  4124. Set<Path> directInsertDirectories = new HashSet<>();
  4125. for (Path mfp : manifests) {
  4126. Utilities.FILE_OP_LOGGER.info("Looking at manifest file: {}", mfp);
  4127. try (FSDataInputStream mdis = fs.open(mfp)) {
  4128. if (dpLevels > 0) {
  4129. int partitionCount = mdis.readInt();
  4130. for (int i = 0; i < partitionCount; ++i) {
  4131. String nextPart = mdis.readUTF();
  4132. Utilities.FILE_OP_LOGGER.debug("Looking at dynamic partition {}", nextPart);
  4133. if (!dynamicPartitionSpecs.containsKey(nextPart)) {
  4134. dynamicPartitionSpecs.put(nextPart, new ArrayList<>());
  4135. }
  4136. }
  4137. }
  4138. int fileCount = mdis.readInt();
  4139. for (int i = 0; i < fileCount; ++i) {
  4140. String nextFile = mdis.readUTF();
  4141. Utilities.FILE_OP_LOGGER.debug("Looking at committed file {}", nextFile);
  4142. Path path = fs.makeQualified(new Path(nextFile));
  4143. if (!committed.add(path)) {
  4144. throw new HiveException(nextFile + " was specified in multiple manifests");
  4145. }
  4146. dynamicPartitionSpecs.entrySet()
  4147. .stream()
  4148. .filter(dynpath -> path.toString().contains(dynpath.getKey()))
  4149. .findAny()
  4150. .ifPresent(dynPath -> dynPath.getValue().add(path));
  4151. Path parentDirPath = path.getParent();
  4152. while (AcidUtils.isChildOfDelta(parentDirPath, specPath)) {
4153. // In some cases there are other directory layers between the delta and the data files
  4154. // (export-import mm table, insert with union all to mm table, skewed tables).
  4155. parentDirPath = parentDirPath.getParent();
  4156. }
  4157. directInsertDirectories.add(parentDirPath);
  4158. }
  4159. }
  4160. }
  4161. if (manifestDir != null) {
  4162. Utilities.FILE_OP_LOGGER.info("Deleting manifest directory {}", manifestDir);
  4163. tryDelete(fs, manifestDir);
  4164. if (unionSuffix != null) {
  4165. // Also delete the parent directory if we are the last union FSOP to execute.
  4166. manifestDir = manifestDir.getParent();
  4167. FileStatus[] remainingFiles = fs.listStatus(manifestDir);
  4168. if (remainingFiles == null || remainingFiles.length == 0) {
  4169. Utilities.FILE_OP_LOGGER.info("Deleting manifest directory {}", manifestDir);
  4170. tryDelete(fs, manifestDir);
  4171. }
  4172. }
  4173. }
  4174. if (!directInsertDirectories.isEmpty()) {
  4175. cleanDirectInsertDirectoriesConcurrently(directInsertDirectories, committed, fs, hconf, unionSuffix, lbLevels);
  4176. }
  4177. conf.setDynPartitionValues(dynamicPartitionSpecs);
  4178. if (!committed.isEmpty()) {
  4179. throw new HiveException("The following files were committed but not found: " + committed);
  4180. }
  4181. if (directInsertDirectories.isEmpty()) {
  4182. return;
  4183. }
  4184. // TODO: see HIVE-14886 - removeTempOrDuplicateFiles is broken for list bucketing,
  4185. // so maintain parity here by not calling it at all.
  4186. if (lbLevels != 0) {
  4187. return;
  4188. }
  4189. if (!isDirectInsert) {
  4190. // Create fake file statuses to avoid querying the file system. removeTempOrDuplicateFiles
  4191. // doesn't need to check anything except path and directory status for MM directories.
  4192. FileStatus[] finalResults = directInsertDirectories.stream()
  4193. .map(PathOnlyFileStatus::new)
  4194. .toArray(FileStatus[]::new);
  4195. List<Path> emptyBuckets = Utilities.removeTempOrDuplicateFiles(fs, finalResults,
  4196. unionSuffix, dpLevels, mbc == null ? 0 : mbc.numBuckets, hconf, writeId, stmtId,
  4197. isMmTable, null, isInsertOverwrite);
  4198. // create empty buckets if necessary
  4199. if (!emptyBuckets.isEmpty()) {
  4200. assert mbc != null;
  4201. Utilities.createEmptyBuckets(hconf, emptyBuckets, mbc.isCompressed, mbc.tableInfo, reporter);
  4202. }
  4203. }
  4204. }
  4205. /**
  4206. * The name of a manifest file consists of the task ID and a .manifest extension, where
  4207. * the task ID includes the attempt ID as well. It can happen that a task attempt already
4208. * wrote out the manifest file and then failed, so Tez restarted it. If the next attempt
4209. * finishes successfully, the query won't fail, but there can be multiple manifest files with
4210. * the same task ID but different attempt IDs. In this case only the non-empty manifest file
4211. * with the highest attempt ID has to be considered.
4212. * Empty manifest files and files with the same task ID but a lower attempt ID have to be ignored.
4213. * @param manifestFiles All the files listed in the manifest directory
4214. * @return For each task ID, the non-empty manifest file with the highest attempt ID
  4215. */
  4216. @VisibleForTesting
  4217. static List<Path> selectManifestFiles(FileStatus[] manifestFiles) {
  4218. List<Path> manifests = new ArrayList<>();
  4219. if (manifestFiles != null) {
4220. Map<String, Integer> fileNameToAttemptId = new HashMap<>();
  4221. Map<String, Path> fileNameToPath = new HashMap<>();
  4222. for (FileStatus manifestFile : manifestFiles) {
  4223. Path path = manifestFile.getPath();
  4224. if (manifestFile.getLen() == 0L) {
  4225. Utilities.FILE_OP_LOGGER.info("Found manifest file {}, but it is empty.", path);
  4226. continue;
  4227. }
  4228. String fileName = path.getName();
  4229. if (fileName.endsWith(MANIFEST_EXTENSION)) {
4230. Pattern pattern = Pattern.compile("([0-9]+)_([0-9]+)\\.manifest");
  4231. Matcher matcher = pattern.matcher(fileName);
  4232. if (matcher.matches()) {
  4233. String taskId = matcher.group(1);
  4234. int attemptId = Integer.parseInt(matcher.group(2));
4235. Integer maxAttemptId = fileNameToAttemptId.get(taskId);
4236. if (maxAttemptId == null) {
4237. fileNameToAttemptId.put(taskId, attemptId);
  4238. fileNameToPath.put(taskId, path);
  4239. Utilities.FILE_OP_LOGGER.info("Found manifest file {} with attemptId {}.", path, attemptId);
  4240. } else if (attemptId > maxAttemptId) {
4241. fileNameToAttemptId.put(taskId, attemptId);
  4242. fileNameToPath.put(taskId, path);
  4243. Utilities.FILE_OP_LOGGER.info(
  4244. "Found manifest file {} which has higher attemptId than {}. Ignore the manifest files with attemptId below {}.",
  4245. path, maxAttemptId, attemptId);
  4246. } else {
  4247. Utilities.FILE_OP_LOGGER.info(
  4248. "Found manifest file {} with attemptId {}, but already have a manifest file with attemptId {}. Ignore this manifest file.",
  4249. path, attemptId, maxAttemptId);
  4250. }
  4251. } else {
  4252. Utilities.FILE_OP_LOGGER.info("Found manifest file {}", path);
  4253. manifests.add(path);
  4254. }
  4255. }
  4256. }
  4257. if (!fileNameToPath.isEmpty()) {
  4258. manifests.addAll(fileNameToPath.values());
  4259. }
  4260. }
  4261. return manifests;
  4262. }
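// Illustrative example (editor's sketch, file names hypothetical): given the manifest files
//
//   000000_0.manifest  (empty)
//   000000_1.manifest
//   000001_0.manifest
//
// the empty attempt-0 file of task 000000 is skipped, and the method returns
// 000000_1.manifest and 000001_0.manifest (in no particular order) - one non-empty manifest
// per task ID, carrying the highest attempt ID seen for that task.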
  4263. private static void cleanDirectInsertDirectoriesConcurrently(
  4264. Set<Path> directInsertDirectories, Set<Path> committed, FileSystem fs, Configuration hconf, String unionSuffix, int lbLevels)
  4265. throws IOException, HiveException {
  4266. ExecutorService executor = createCleanTaskExecutor(hconf, directInsertDirectories.size());
  4267. List<Future<Void>> cleanTaskFutures = submitCleanTasksForExecution(executor, directInsertDirectories, committed, fs, unionSuffix, lbLevels);
  4268. waitForCleanTasksToComplete(executor, cleanTaskFutures);
  4269. }
  4270. private static ExecutorService createCleanTaskExecutor(Configuration hconf, int numOfDirectories) {
  4271. int threadCount = Math.min(numOfDirectories, HiveConf.getIntVar(hconf, ConfVars.HIVE_MOVE_FILES_THREAD_COUNT));
  4272. threadCount = threadCount <= 0 ? 1 : threadCount;
  4273. return Executors.newFixedThreadPool(threadCount,
  4274. new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Clean-Direct-Insert-Dirs-Thread-%d").build());
  4275. }
  4276. private static List<Future<Void>> submitCleanTasksForExecution(ExecutorService executor, Set<Path> directInsertDirectories,
  4277. Set<Path> committed, FileSystem fs, String unionSuffix, int lbLevels) {
  4278. List<Future<Void>> cleanTaskFutures = new ArrayList<>(directInsertDirectories.size());
  4279. for (Path directory : directInsertDirectories) {
  4280. Future<Void> cleanTaskFuture = executor.submit(() -> {
  4281. cleanDirectInsertDirectory(directory, fs, unionSuffix, lbLevels, committed);
  4282. return null;
  4283. });
  4284. cleanTaskFutures.add(cleanTaskFuture);
  4285. }
  4286. return cleanTaskFutures;
  4287. }
  4288. private static void waitForCleanTasksToComplete(ExecutorService executor, List<Future<Void>> cleanTaskFutures)
  4289. throws IOException, HiveException {
  4290. executor.shutdown();
  4291. for (Future<Void> cleanFuture : cleanTaskFutures) {
  4292. try {
  4293. cleanFuture.get();
  4294. } catch (InterruptedException | ExecutionException e) {
  4295. executor.shutdownNow();
  4296. if (e.getCause() instanceof IOException) {
  4297. throw (IOException) e.getCause();
  4298. }
  4299. if (e.getCause() instanceof HiveException) {
  4300. throw (HiveException) e.getCause();
  4301. }
  4302. }
  4303. }
  4304. }
  4305. private static final class PathOnlyFileStatus extends FileStatus {
  4306. public PathOnlyFileStatus(Path path) {
  4307. super(0, true, 0, 0, 0, path);
  4308. }
  4309. }
  4310. private static void cleanDirectInsertDirectory(Path dir, FileSystem fs, String unionSuffix, int lbLevels, Set<Path> committed)
  4311. throws IOException, HiveException {
  4312. for (FileStatus child : fs.listStatus(dir)) {
  4313. Path childPath = child.getPath();
  4314. if (lbLevels > 0) {
  4315. // We need to recurse into some LB directories. We don't check the directories themselves
4316. // for matches; if they are empty they don't matter, and we will delete bad files.
  4317. // This recursion is not the most efficient way to do this but LB is rarely used.
  4318. if (child.isDirectory()) {
  4319. Utilities.FILE_OP_LOGGER.trace(
  4320. "Recursion into LB directory {}; levels remaining ", childPath, lbLevels - 1);
  4321. cleanDirectInsertDirectory(childPath, fs, unionSuffix, lbLevels - 1, committed);
  4322. } else {
  4323. if (committed.contains(childPath)) {
  4324. throw new HiveException("LB FSOP has commited "
  4325. + childPath + " outside of LB directory levels " + lbLevels);
  4326. }
  4327. deleteUncommitedFile(childPath, fs);
  4328. }
  4329. continue;
  4330. }
  4331. // No more LB directories expected.
  4332. if (unionSuffix == null) {
  4333. if (committed.remove(childPath)) {
  4334. continue; // A good file.
  4335. }
  4336. if (!childPath.getName().equals(AcidUtils.OrcAcidVersion.ACID_FORMAT)) {
  4337. deleteUncommitedFile(childPath, fs);
  4338. }
  4339. } else if (!child.isDirectory()) {
  4340. if (committed.contains(childPath)) {
  4341. throw new HiveException("Union FSOP has commited "
  4342. + childPath + " outside of union directory " + unionSuffix);
  4343. }
  4344. deleteUncommitedFile(childPath, fs);
  4345. } else if (childPath.getName().equals(unionSuffix)) {
  4346. // Found the right union directory; treat it as "our" directory.
  4347. cleanDirectInsertDirectory(childPath, fs, null, 0, committed);
  4348. } else {
  4349. String childName = childPath.getName();
  4350. if (!childName.startsWith(AbstractFileMergeOperator.UNION_SUDBIR_PREFIX)
  4351. && !childName.startsWith(".") && !childName.startsWith("_")) {
  4352. throw new HiveException("Union FSOP has an unknown directory "
  4353. + childPath + " outside of union directory " + unionSuffix);
  4354. }
  4355. Utilities.FILE_OP_LOGGER.trace(
  4356. "FSOP for {} is ignoring the other side of the union {}", unionSuffix, childPath);
  4357. }
  4358. }
  4359. }
  4360. private static void deleteUncommitedFile(Path childPath, FileSystem fs)
  4361. throws IOException, HiveException {
  4362. Utilities.FILE_OP_LOGGER.info("Deleting {} that was not committed", childPath);
  4363. // We should actually succeed here - if we fail, don't commit the query.
  4364. if (!fs.delete(childPath, true)) {
  4365. throw new HiveException("Failed to delete an uncommitted path " + childPath);
  4366. }
  4367. }
  4368. /**
  4369. * @return the complete list of valid MM directories under a table/partition path; null
  4370. * if the entire directory is valid (has no uncommitted/temporary files).
  4371. */
  4372. public static List<Path> getValidMmDirectoriesFromTableOrPart(Path path, Configuration conf,
  4373. ValidWriteIdList validWriteIdList) throws IOException {
  4374. Utilities.FILE_OP_LOGGER.trace("Looking for valid MM paths under {}", path);
  4375. // NULL means this directory is entirely valid.
  4376. List<Path> result = null;
  4377. FileSystem fs = path.getFileSystem(conf);
  4378. FileStatus[] children = fs.listStatus(path);
  4379. for (int i = 0; i < children.length; ++i) {
  4380. FileStatus file = children[i];
  4381. Path childPath = file.getPath();
  4382. Long writeId = AcidUtils.extractWriteId(childPath);
  4383. if (!file.isDirectory() || writeId == null || !validWriteIdList.isWriteIdValid(writeId)) {
  4384. Utilities.FILE_OP_LOGGER.debug("Skipping path {}", childPath);
  4385. if (result == null) {
  4386. result = new ArrayList<>(children.length - 1);
  4387. for (int j = 0; j < i; ++j) {
  4388. result.add(children[j].getPath());
  4389. }
  4390. }
  4391. } else if (result != null) {
  4392. result.add(childPath);
  4393. }
  4394. }
  4395. return result;
  4396. }
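// Illustrative example (editor's sketch, directory names hypothetical): if a partition contains
// delta_0000001_0000001, delta_0000002_0000002 and delta_0000003_0000003 and only writeId 2 is
// invalid (e.g. aborted) according to the ValidWriteIdList, the method returns
// [delta_0000001_0000001, delta_0000003_0000003]. If every child directory carries a valid
// writeId, it returns null to signal that the whole directory is valid.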
  4397. public static String getAclStringWithHiveModification(Configuration tezConf,
  4398. String propertyName,
  4399. boolean addHs2User,
  4400. String user,
  4401. String hs2User) throws
  4402. IOException {
  4403. // Start with initial ACLs
  4404. ACLConfigurationParser aclConf =
  4405. new ACLConfigurationParser(tezConf, propertyName);
  4406. // Always give access to the user
  4407. aclConf.addAllowedUser(user);
  4408. // Give access to the process user if the config is set.
  4409. if (addHs2User && hs2User != null) {
  4410. aclConf.addAllowedUser(hs2User);
  4411. }
  4412. return aclConf.toAclString();
  4413. }
  4414. public static boolean isHiveManagedFile(Path path) {
  4415. return AcidUtils.ORIGINAL_PATTERN.matcher(path.getName()).matches() ||
  4416. AcidUtils.ORIGINAL_PATTERN_COPY.matcher(path.getName()).matches();
  4417. }
  4418. /**
  4419. * Checks if path passed in exists and has writable permissions.
  4420. * The path will be created if it does not exist.
  4421. * @param rootHDFSDirPath
  4422. * @param conf
  4423. */
  4424. public static void ensurePathIsWritable(Path rootHDFSDirPath, HiveConf conf) throws IOException {
  4425. FsPermission writableHDFSDirPermission = new FsPermission((short)00733);
  4426. FileSystem fs = rootHDFSDirPath.getFileSystem(conf);
  4427. if (!fs.exists(rootHDFSDirPath)) {
  4428. synchronized (ROOT_HDFS_DIR_LOCK) {
  4429. if (!fs.exists(rootHDFSDirPath)) {
  4430. Utilities.createDirsWithPermission(conf, rootHDFSDirPath, writableHDFSDirPermission, true);
  4431. }
  4432. }
  4433. }
  4434. FsPermission currentHDFSDirPermission = fs.getFileStatus(rootHDFSDirPath).getPermission();
  4435. if (rootHDFSDirPath.toUri() != null) {
4436. String scheme = rootHDFSDirPath.toUri().getScheme();
4437. LOG.debug("HDFS dir: " + rootHDFSDirPath + " with scheme " + scheme + ", permission: " +
  4438. currentHDFSDirPermission);
  4439. } else {
  4440. LOG.debug(
  4441. "HDFS dir: " + rootHDFSDirPath + ", permission: " + currentHDFSDirPermission);
  4442. }
  4443. // If the root HDFS scratch dir already exists, make sure it is writeable.
  4444. if (!((currentHDFSDirPermission.toShort() & writableHDFSDirPermission
  4445. .toShort()) == writableHDFSDirPermission.toShort())) {
  4446. throw new RuntimeException("The dir: " + rootHDFSDirPath
  4447. + " on HDFS should be writable. Current permissions are: " + currentHDFSDirPermission);
  4448. }
  4449. }
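// Illustrative example (editor's sketch): the permission check above requires every bit of the
// 0733 mask (rwx for the owner, wx for group and other) to be present on the existing directory:
// a scratch dir with mode 777 or 1777 passes, while 755 fails because 0755 & 0733 = 0711, which
// is not equal to 0733, and the RuntimeException above is thrown.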
4450. // Parse the bucketing version from its string form; defaults to 1 if the value is missing or not a number.
  4451. public static int getBucketingVersion(final String versionStr) {
  4452. int bucketingVersion = 1;
  4453. if (versionStr != null) {
  4454. try {
  4455. bucketingVersion = Integer.parseInt(versionStr);
  4456. } catch (NumberFormatException e) {
  4457. // Do nothing
  4458. }
  4459. }
  4460. return bucketingVersion;
  4461. }
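// Illustrative usage (editor's sketch):
//
//   getBucketingVersion("2");    // 2
//   getBucketingVersion("oops"); // 1 (not a number, falls back to the default)
//   getBucketingVersion(null);   // 1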
  4462. public static String getPasswdFromKeystore(String keystore, String key) throws IOException {
  4463. String passwd = null;
  4464. if (keystore != null && key != null) {
  4465. Configuration conf = new Configuration();
  4466. conf.set(CredentialProviderFactory.CREDENTIAL_PROVIDER_PATH, keystore);
  4467. char[] pwdCharArray = conf.getPassword(key);
  4468. if (pwdCharArray != null) {
  4469. passwd = new String(pwdCharArray);
  4470. }
  4471. }
  4472. return passwd;
  4473. }
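// Illustrative usage (editor's sketch; the keystore URI and alias are hypothetical):
//
//   String passwd = getPasswdFromKeystore("jceks://file/tmp/hive-creds.jceks", "db.password.alias");
//
// This points hadoop.security.credential.provider.path at the given keystore and resolves the
// alias through Configuration.getPassword, returning null when either argument is null or no
// password can be resolved.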
  4474. /**
  4475. * Load password from the given uri.
  4476. * @param uriString The URI which is used to load the password.
  4477. * @return null if the uri is empty or null, else the password represented by the URI.
  4478. * @throws IOException
  4479. * @throws URISyntaxException
  4480. * @throws HiveException
  4481. */
  4482. public static String getPasswdFromUri(String uriString) throws IOException, URISyntaxException, HiveException {
  4483. if (uriString == null || uriString.isEmpty()) {
  4484. return null;
  4485. }
  4486. return URISecretSource.getInstance().getPasswordFromUri(new URI(uriString));
  4487. }
  4488. public static String encodeColumnNames(List<String> colNames) throws SemanticException {
  4489. try {
  4490. return JSON_MAPPER.writeValueAsString(colNames);
  4491. } catch (IOException e) {
  4492. throw new SemanticException(e);
  4493. }
  4494. }
  4495. public static List<String> decodeColumnNames(String colNamesStr) throws SemanticException {
  4496. try {
  4497. return JSON_MAPPER.readValue(colNamesStr, List.class);
  4498. } catch (IOException e) {
  4499. throw new SemanticException(e);
  4500. }
  4501. }
  4502. /**
  4503. * Logs the class paths of the job class loader and the thread context class loader to the passed logger.
  4504. * Checks both loaders if getURLs method is available; if not, prints a message about this (instead of the class path)
  4505. *
  4506. * Note: all messages will always be logged with DEBUG log level.
  4507. */
  4508. public static void tryLoggingClassPaths(JobConf job, Logger logger) {
  4509. if (logger != null && logger.isDebugEnabled()) {
  4510. tryToLogClassPath("conf", job.getClassLoader(), logger);
  4511. tryToLogClassPath("thread", Thread.currentThread().getContextClassLoader(), logger);
  4512. }
  4513. }
  4514. private static void tryToLogClassPath(String prefix, ClassLoader loader, Logger logger) {
4515. if (loader instanceof URLClassLoader) {
  4516. logger.debug("{} class path = {}", prefix, Arrays.asList(((URLClassLoader) loader).getURLs()).toString());
  4517. } else {
  4518. logger.debug("{} class path = unavailable for {}", prefix,
  4519. loader == null ? "null" : loader.getClass().getSimpleName());
  4520. }
  4521. }
  4522. public static boolean arePathsEqualOrWithin(Path p1, Path p2) {
4523. return p1.toString().toLowerCase().contains(p2.toString().toLowerCase()) ||
4524. p2.toString().toLowerCase().contains(p1.toString().toLowerCase());
  4525. }
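// Illustrative usage (editor's sketch): the check is a case-insensitive substring test on the
// string form of the paths, in either direction:
//
//   arePathsEqualOrWithin(new Path("/warehouse/t1"), new Path("/warehouse/t1/part=1")); // true
//   arePathsEqualOrWithin(new Path("/warehouse/t1"), new Path("/warehouse/t2"));        // false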
  4526. }