
/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java

https://github.com/mkgobaco/hive
Java | 3525 lines | 2393 code | 368 blank | 764 comment
Possible License(s): Apache-2.0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.exec;
  19. import java.beans.DefaultPersistenceDelegate;
  20. import java.beans.Encoder;
  21. import java.beans.ExceptionListener;
  22. import java.beans.Expression;
  23. import java.beans.PersistenceDelegate;
  24. import java.beans.Statement;
  25. import java.beans.XMLDecoder;
  26. import java.beans.XMLEncoder;
  27. import java.io.BufferedReader;
  28. import java.io.ByteArrayInputStream;
  29. import java.io.ByteArrayOutputStream;
  30. import java.io.DataInput;
  31. import java.io.EOFException;
  32. import java.io.File;
  33. import java.io.FileInputStream;
  34. import java.io.FileNotFoundException;
  35. import java.io.IOException;
  36. import java.io.InputStream;
  37. import java.io.InputStreamReader;
  38. import java.io.OutputStream;
  39. import java.io.PrintStream;
  40. import java.io.Serializable;
  41. import java.io.UnsupportedEncodingException;
  42. import java.net.URI;
  43. import java.net.URL;
  44. import java.net.URLClassLoader;
  45. import java.security.MessageDigest;
  46. import java.security.NoSuchAlgorithmException;
  47. import java.sql.Connection;
  48. import java.sql.DriverManager;
  49. import java.sql.PreparedStatement;
  50. import java.sql.SQLException;
  51. import java.sql.SQLTransientException;
  52. import java.sql.Timestamp;
  53. import java.text.SimpleDateFormat;
  54. import java.util.ArrayList;
  55. import java.util.Arrays;
  56. import java.util.Calendar;
  57. import java.util.Collection;
  58. import java.util.Collections;
  59. import java.util.Date;
  60. import java.util.HashMap;
  61. import java.util.HashSet;
  62. import java.util.Iterator;
  63. import java.util.LinkedHashMap;
  64. import java.util.LinkedList;
  65. import java.util.List;
  66. import java.util.Map;
  67. import java.util.Properties;
  68. import java.util.Random;
  69. import java.util.Set;
  70. import java.util.UUID;
  71. import java.util.concurrent.ConcurrentHashMap;
  72. import java.util.concurrent.ExecutionException;
  73. import java.util.concurrent.Future;
  74. import java.util.concurrent.LinkedBlockingQueue;
  75. import java.util.concurrent.ThreadPoolExecutor;
  76. import java.util.concurrent.TimeUnit;
  77. import java.util.regex.Matcher;
  78. import java.util.regex.Pattern;
  79. import java.util.zip.Deflater;
  80. import java.util.zip.DeflaterOutputStream;
  81. import java.util.zip.InflaterInputStream;
  82. import org.antlr.runtime.CommonToken;
  83. import org.apache.commons.codec.binary.Base64;
  84. import org.apache.commons.lang.StringUtils;
  85. import org.apache.commons.lang.WordUtils;
  86. import org.apache.commons.logging.Log;
  87. import org.apache.commons.logging.LogFactory;
  88. import org.apache.hadoop.conf.Configuration;
  89. import org.apache.hadoop.filecache.DistributedCache;
  90. import org.apache.hadoop.fs.ContentSummary;
  91. import org.apache.hadoop.fs.FileStatus;
  92. import org.apache.hadoop.fs.FileSystem;
  93. import org.apache.hadoop.fs.Path;
  94. import org.apache.hadoop.fs.PathFilter;
  95. import org.apache.hadoop.fs.permission.FsPermission;
  96. import org.apache.hadoop.hive.common.HiveInterruptCallback;
  97. import org.apache.hadoop.hive.common.HiveInterruptUtils;
  98. import org.apache.hadoop.hive.common.HiveStatsUtils;
  99. import org.apache.hadoop.hive.conf.HiveConf;
  100. import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
  101. import org.apache.hadoop.hive.metastore.Warehouse;
  102. import org.apache.hadoop.hive.metastore.api.FieldSchema;
  103. import org.apache.hadoop.hive.metastore.api.Order;
  104. import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
  105. import org.apache.hadoop.hive.ql.Context;
  106. import org.apache.hadoop.hive.ql.ErrorMsg;
  107. import org.apache.hadoop.hive.ql.QueryPlan;
  108. import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
  109. import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
  110. import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
  111. import org.apache.hadoop.hive.ql.exec.mr.ExecReducer;
  112. import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
  113. import org.apache.hadoop.hive.ql.exec.tez.TezTask;
  114. import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
  115. import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
  116. import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
  117. import org.apache.hadoop.hive.ql.io.HiveInputFormat;
  118. import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
  119. import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
  120. import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
  121. import org.apache.hadoop.hive.ql.io.RCFile;
  122. import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
  123. import org.apache.hadoop.hive.ql.io.rcfile.merge.MergeWork;
  124. import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileMergeMapper;
  125. import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanMapper;
  126. import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork;
  127. import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateMapper;
  128. import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateWork;
  129. import org.apache.hadoop.hive.ql.log.PerfLogger;
  130. import org.apache.hadoop.hive.ql.metadata.HiveException;
  131. import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
  132. import org.apache.hadoop.hive.ql.metadata.HiveUtils;
  133. import org.apache.hadoop.hive.ql.metadata.InputEstimator;
  134. import org.apache.hadoop.hive.ql.metadata.Partition;
  135. import org.apache.hadoop.hive.ql.metadata.Table;
  136. import org.apache.hadoop.hive.ql.parse.SemanticException;
  137. import org.apache.hadoop.hive.ql.plan.BaseWork;
  138. import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
  139. import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
  140. import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
  141. import org.apache.hadoop.hive.ql.plan.GroupByDesc;
  142. import org.apache.hadoop.hive.ql.plan.MapWork;
  143. import org.apache.hadoop.hive.ql.plan.MapredWork;
  144. import org.apache.hadoop.hive.ql.plan.OperatorDesc;
  145. import org.apache.hadoop.hive.ql.plan.PartitionDesc;
  146. import org.apache.hadoop.hive.ql.plan.PlanUtils;
  147. import org.apache.hadoop.hive.ql.plan.PlanUtils.ExpressionTypes;
  148. import org.apache.hadoop.hive.ql.plan.ReduceWork;
  149. import org.apache.hadoop.hive.ql.plan.TableDesc;
  150. import org.apache.hadoop.hive.ql.plan.api.Adjacency;
  151. import org.apache.hadoop.hive.ql.plan.api.Graph;
  152. import org.apache.hadoop.hive.ql.session.SessionState;
  153. import org.apache.hadoop.hive.ql.stats.StatsFactory;
  154. import org.apache.hadoop.hive.ql.stats.StatsPublisher;
  155. import org.apache.hadoop.hive.serde.serdeConstants;
  156. import org.apache.hadoop.hive.serde2.SerDeException;
  157. import org.apache.hadoop.hive.serde2.SerDeUtils;
  158. import org.apache.hadoop.hive.serde2.Serializer;
  159. import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
  160. import org.apache.hadoop.hive.shims.ShimLoader;
  161. import org.apache.hadoop.io.IOUtils;
  162. import org.apache.hadoop.io.SequenceFile;
  163. import org.apache.hadoop.io.SequenceFile.CompressionType;
  164. import org.apache.hadoop.io.Text;
  165. import org.apache.hadoop.io.Writable;
  166. import org.apache.hadoop.io.WritableComparable;
  167. import org.apache.hadoop.io.compress.CompressionCodec;
  168. import org.apache.hadoop.io.compress.DefaultCodec;
  169. import org.apache.hadoop.mapred.FileInputFormat;
  170. import org.apache.hadoop.mapred.FileOutputFormat;
  171. import org.apache.hadoop.mapred.InputFormat;
  172. import org.apache.hadoop.mapred.JobConf;
  173. import org.apache.hadoop.mapred.RecordReader;
  174. import org.apache.hadoop.mapred.Reporter;
  175. import org.apache.hadoop.mapred.SequenceFileInputFormat;
  176. import org.apache.hadoop.mapred.SequenceFileOutputFormat;
  177. import org.apache.hadoop.util.Progressable;
  178. import org.apache.hadoop.util.ReflectionUtils;
  179. import org.apache.hadoop.util.Shell;
  180. import com.esotericsoftware.kryo.Kryo;
  181. import com.esotericsoftware.kryo.io.Input;
  182. import com.esotericsoftware.kryo.io.Output;
  183. import com.esotericsoftware.kryo.serializers.FieldSerializer;
  184. import com.esotericsoftware.shaded.org.objenesis.strategy.StdInstantiatorStrategy;
  185. /**
  186. * Utilities.
  187. *
  188. */
  189. @SuppressWarnings("nls")
  190. public final class Utilities {
  191. /**
  192. * The objects in the reducer are composed of these top-level fields.
  193. */
  194. public static String HADOOP_LOCAL_FS = "file:///";
  195. public static String MAP_PLAN_NAME = "map.xml";
  196. public static String REDUCE_PLAN_NAME = "reduce.xml";
  197. public static final String MAPRED_MAPPER_CLASS = "mapred.mapper.class";
  198. public static final String MAPRED_REDUCER_CLASS = "mapred.reducer.class";
  199. /**
  200. * ReduceField:
  201. * KEY: record key
  202. * VALUE: record value
  203. */
  204. public static enum ReduceField {
  205. KEY, VALUE
  206. };
  207. public static List<String> reduceFieldNameList;
  208. static {
  209. reduceFieldNameList = new ArrayList<String>();
  210. for (ReduceField r : ReduceField.values()) {
  211. reduceFieldNameList.add(r.toString());
  212. }
  213. }
  214. public static String removeValueTag(String column) {
  215. if (column.startsWith(ReduceField.VALUE + ".")) {
  216. return column.substring(6);
  217. }
  218. return column;
  219. }
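  // Illustrative note: the hard-coded 6 equals (ReduceField.VALUE + ".").length(), so the
  // method strips the "VALUE." tag, e.g.
  //   Utilities.removeValueTag("VALUE._col0") -> "_col0"
  //   Utilities.removeValueTag("KEY._col0")   -> "KEY._col0"  (unchanged)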
  220. private Utilities() {
  221. // prevent instantiation
  222. }
  223. private static Map<Path, BaseWork> gWorkMap = Collections
  224. .synchronizedMap(new HashMap<Path, BaseWork>());
  225. private static final String CLASS_NAME = Utilities.class.getName();
  226. private static final Log LOG = LogFactory.getLog(CLASS_NAME);
  227. public static void clearWork(Configuration conf) {
  228. Path mapPath = getPlanPath(conf, MAP_PLAN_NAME);
  229. Path reducePath = getPlanPath(conf, REDUCE_PLAN_NAME);
  230. // if the plan path hasn't been initialized just return, nothing to clean.
  231. if (mapPath == null && reducePath == null) {
  232. return;
  233. }
  234. try {
  235. FileSystem fs = mapPath.getFileSystem(conf);
  236. if (fs.exists(mapPath)) {
  237. fs.delete(mapPath, true);
  238. }
  239. if (fs.exists(reducePath)) {
  240. fs.delete(reducePath, true);
  241. }
  242. } catch (Exception e) {
  243. LOG.warn("Failed to clean-up tmp directories.", e);
  244. } finally {
  245. // In cases where a single process works with multiple plans, we must clear
  246. // the cache before working with the next plan.
  247. clearWorkMapForConf(conf);
  248. }
  249. }
  250. public static MapredWork getMapRedWork(Configuration conf) {
  251. MapredWork w = new MapredWork();
  252. w.setMapWork(getMapWork(conf));
  253. w.setReduceWork(getReduceWork(conf));
  254. return w;
  255. }
  256. public static void setMapWork(Configuration conf, MapWork work) {
  257. setBaseWork(conf, MAP_PLAN_NAME, work);
  258. }
  259. public static MapWork getMapWork(Configuration conf) {
  260. return (MapWork) getBaseWork(conf, MAP_PLAN_NAME);
  261. }
  262. public static void setReduceWork(Configuration conf, ReduceWork work) {
  263. setBaseWork(conf, REDUCE_PLAN_NAME, work);
  264. }
  265. public static ReduceWork getReduceWork(Configuration conf) {
  266. return (ReduceWork) getBaseWork(conf, REDUCE_PLAN_NAME);
  267. }
  268. /**
  269. * Pushes work into the global work map
  270. */
  271. public static void setBaseWork(Configuration conf, String name, BaseWork work) {
  272. Path path = getPlanPath(conf, name);
  273. gWorkMap.put(path, work);
  274. }
  275. /**
  276. * Returns the Map or Reduce plan
  277. * Side effect: the BaseWork returned is also placed in the gWorkMap
  278. * @param conf
  279. * @param name
  280. * @return BaseWork based on the name supplied; returns null if the name is null
  281. * @throws RuntimeException if the configuration files are not proper or if the plan cannot be loaded
  282. */
  283. private static BaseWork getBaseWork(Configuration conf, String name) {
  284. BaseWork gWork = null;
  285. Path path = null;
  286. InputStream in = null;
  287. try {
  288. path = getPlanPath(conf, name);
  289. assert path != null;
  290. if (!gWorkMap.containsKey(path)) {
  291. Path localPath;
  292. if (ShimLoader.getHadoopShims().isLocalMode(conf)) {
  293. localPath = path;
  294. } else {
  295. localPath = new Path(name);
  296. }
  297. if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  298. LOG.debug("Loading plan from string: "+path.toUri().getPath());
  299. String planString = conf.get(path.toUri().getPath());
  300. if (planString == null) {
  301. LOG.info("Could not find plan string in conf");
  302. return null;
  303. }
  304. byte[] planBytes = Base64.decodeBase64(planString);
  305. in = new ByteArrayInputStream(planBytes);
  306. in = new InflaterInputStream(in);
  307. } else {
  308. in = new FileInputStream(localPath.toUri().getPath());
  309. }
  310. if(MAP_PLAN_NAME.equals(name)){
  311. if (ExecMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))){
  312. gWork = deserializePlan(in, MapWork.class, conf);
  313. } else if(RCFileMergeMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  314. gWork = deserializePlan(in, MergeWork.class, conf);
  315. } else if(ColumnTruncateMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  316. gWork = deserializePlan(in, ColumnTruncateWork.class, conf);
  317. } else if(PartialScanMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  318. gWork = deserializePlan(in, PartialScanWork.class,conf);
  319. } else {
  320. throw new RuntimeException("unable to determine work from configuration. "
  321. + MAPRED_MAPPER_CLASS + " was " + conf.get(MAPRED_MAPPER_CLASS));
  322. }
  323. } else if (REDUCE_PLAN_NAME.equals(name)) {
  324. if(ExecReducer.class.getName().equals(conf.get(MAPRED_REDUCER_CLASS))) {
  325. gWork = deserializePlan(in, ReduceWork.class, conf);
  326. } else {
  327. throw new RuntimeException("unable to determine work from configuration. "
  328. + MAPRED_REDUCER_CLASS + " was " + conf.get(MAPRED_REDUCER_CLASS));
  329. }
  330. }
  331. gWorkMap.put(path, gWork);
  332. } else {
  333. LOG.debug("Found plan in cache.");
  334. gWork = gWorkMap.get(path);
  335. }
  336. return gWork;
  337. } catch (FileNotFoundException fnf) {
  338. // This happens, e.g., when there is no reduce work.
  339. LOG.info("No plan file found: "+path);
  340. return null;
  341. } catch (Exception e) {
  342. LOG.error("Failed to load plan: "+path, e);
  343. throw new RuntimeException(e);
  344. } finally {
  345. if (in != null) {
  346. try {
  347. in.close();
  348. } catch (IOException cantBlameMeForTrying) { }
  349. }
  350. }
  351. }
  352. public static void setWorkflowAdjacencies(Configuration conf, QueryPlan plan) {
  353. try {
  354. Graph stageGraph = plan.getQueryPlan().getStageGraph();
  355. if (stageGraph == null) {
  356. return;
  357. }
  358. List<Adjacency> adjList = stageGraph.getAdjacencyList();
  359. if (adjList == null) {
  360. return;
  361. }
  362. for (Adjacency adj : adjList) {
  363. List<String> children = adj.getChildren();
  364. if (children == null || children.isEmpty()) {
  365. return;
  366. }
  367. conf.setStrings("mapreduce.workflow.adjacency."+adj.getNode(),
  368. children.toArray(new String[children.size()]));
  369. }
  370. } catch (IOException e) {
  // Best-effort: failures while setting workflow adjacency properties are ignored.
  371. }
  372. }
  373. public static List<String> getFieldSchemaString(List<FieldSchema> fl) {
  374. if (fl == null) {
  375. return null;
  376. }
  377. ArrayList<String> ret = new ArrayList<String>();
  378. for (FieldSchema f : fl) {
  379. ret.add(f.getName() + " " + f.getType()
  380. + (f.getComment() != null ? (" " + f.getComment()) : ""));
  381. }
  382. return ret;
  383. }
  384. /**
  385. * Java 1.5 workaround. From http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=5015403
  386. */
  387. public static class EnumDelegate extends DefaultPersistenceDelegate {
  388. @Override
  389. protected Expression instantiate(Object oldInstance, Encoder out) {
  390. return new Expression(Enum.class, "valueOf", new Object[] {oldInstance.getClass(),
  391. ((Enum<?>) oldInstance).name()});
  392. }
  393. @Override
  394. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  395. return oldInstance == newInstance;
  396. }
  397. }
  398. public static class MapDelegate extends DefaultPersistenceDelegate {
  399. @Override
  400. protected Expression instantiate(Object oldInstance, Encoder out) {
  401. Map oldMap = (Map) oldInstance;
  402. HashMap newMap = new HashMap(oldMap);
  403. return new Expression(newMap, HashMap.class, "new", new Object[] {});
  404. }
  405. @Override
  406. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  407. return false;
  408. }
  409. @Override
  410. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  411. java.util.Collection oldO = (java.util.Collection) oldInstance;
  412. java.util.Collection newO = (java.util.Collection) newInstance;
  413. if (newO.size() != 0) {
  414. out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
  415. }
  416. for (Iterator i = oldO.iterator(); i.hasNext();) {
  417. out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
  418. }
  419. }
  420. }
  421. public static class SetDelegate extends DefaultPersistenceDelegate {
  422. @Override
  423. protected Expression instantiate(Object oldInstance, Encoder out) {
  424. Set oldSet = (Set) oldInstance;
  425. HashSet newSet = new HashSet(oldSet);
  426. return new Expression(newSet, HashSet.class, "new", new Object[] {});
  427. }
  428. @Override
  429. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  430. return false;
  431. }
  432. @Override
  433. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  434. java.util.Collection oldO = (java.util.Collection) oldInstance;
  435. java.util.Collection newO = (java.util.Collection) newInstance;
  436. if (newO.size() != 0) {
  437. out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
  438. }
  439. for (Iterator i = oldO.iterator(); i.hasNext();) {
  440. out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
  441. }
  442. }
  443. }
  444. public static class ListDelegate extends DefaultPersistenceDelegate {
  445. @Override
  446. protected Expression instantiate(Object oldInstance, Encoder out) {
  447. List oldList = (List) oldInstance;
  448. ArrayList newList = new ArrayList(oldList);
  449. return new Expression(newList, ArrayList.class, "new", new Object[] {});
  450. }
  451. @Override
  452. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  453. return false;
  454. }
  455. @Override
  456. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  457. java.util.Collection oldO = (java.util.Collection) oldInstance;
  458. java.util.Collection newO = (java.util.Collection) newInstance;
  459. if (newO.size() != 0) {
  460. out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
  461. }
  462. for (Iterator i = oldO.iterator(); i.hasNext();) {
  463. out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
  464. }
  465. }
  466. }
  467. /**
  468. * DatePersistenceDelegate. Needed to serialize java.util.Date
  469. * since it is not serialization friendly.
  470. * Also works for java.sql.Date since it derives from java.util.Date.
  471. */
  472. public static class DatePersistenceDelegate extends PersistenceDelegate {
  473. @Override
  474. protected Expression instantiate(Object oldInstance, Encoder out) {
  475. Date dateVal = (Date)oldInstance;
  476. Object[] args = { dateVal.getTime() };
  477. return new Expression(dateVal, dateVal.getClass(), "new", args);
  478. }
  479. @Override
  480. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  481. if (oldInstance == null || newInstance == null) {
  482. return false;
  483. }
  484. return oldInstance.getClass() == newInstance.getClass();
  485. }
  486. }
  487. /**
  488. * TimestampPersistenceDelegate. Needed to serialize java.sql.Timestamp since
  489. * it is not serialization friendly.
  490. */
  491. public static class TimestampPersistenceDelegate extends DatePersistenceDelegate {
  492. @Override
  493. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  494. Timestamp ts = (Timestamp)oldInstance;
  495. Object[] args = { ts.getNanos() };
  496. Statement stmt = new Statement(oldInstance, "setNanos", args);
  497. out.writeStatement(stmt);
  498. }
  499. }
  500. /**
  501. * Need to serialize org.antlr.runtime.CommonToken
  502. */
  503. public static class CommonTokenDelegate extends PersistenceDelegate {
  504. @Override
  505. protected Expression instantiate(Object oldInstance, Encoder out) {
  506. CommonToken ct = (CommonToken)oldInstance;
  507. Object[] args = {ct.getType(), ct.getText()};
  508. return new Expression(ct, ct.getClass(), "new", args);
  509. }
  510. }
  511. public static class PathDelegate extends PersistenceDelegate {
  512. @Override
  513. protected Expression instantiate(Object oldInstance, Encoder out) {
  514. Path p = (Path)oldInstance;
  515. Object[] args = {p.toString()};
  516. return new Expression(p, p.getClass(), "new", args);
  517. }
  518. }
  519. public static void setMapRedWork(Configuration conf, MapredWork w, Path hiveScratchDir) {
  520. setMapWork(conf, w.getMapWork(), hiveScratchDir, true);
  521. if (w.getReduceWork() != null) {
  522. setReduceWork(conf, w.getReduceWork(), hiveScratchDir, true);
  523. }
  524. }
  525. public static Path setMapWork(Configuration conf, MapWork w, Path hiveScratchDir, boolean useCache) {
  526. return setBaseWork(conf, w, hiveScratchDir, MAP_PLAN_NAME, useCache);
  527. }
  528. public static Path setReduceWork(Configuration conf, ReduceWork w, Path hiveScratchDir, boolean useCache) {
  529. return setBaseWork(conf, w, hiveScratchDir, REDUCE_PLAN_NAME, useCache);
  530. }
  531. private static Path setBaseWork(Configuration conf, BaseWork w, Path hiveScratchDir, String name, boolean useCache) {
  532. try {
  533. setPlanPath(conf, hiveScratchDir);
  534. Path planPath = getPlanPath(conf, name);
  535. OutputStream out;
  536. if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  537. // add it to the conf
  538. ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
  539. out = new DeflaterOutputStream(byteOut, new Deflater(Deflater.BEST_SPEED));
  540. serializePlan(w, out, conf);
  541. LOG.info("Setting plan: "+planPath.toUri().getPath());
  542. conf.set(planPath.toUri().getPath(),
  543. Base64.encodeBase64String(byteOut.toByteArray()));
  544. } else {
  545. // use the default file system of the conf
  546. FileSystem fs = planPath.getFileSystem(conf);
  547. out = fs.create(planPath);
  548. serializePlan(w, out, conf);
  549. // Serialize the plan to the default hdfs instance
  550. // Except for hadoop local mode execution where we should be
  551. // able to get the plan directly from the cache
  552. if (useCache && !ShimLoader.getHadoopShims().isLocalMode(conf)) {
  553. // Set up distributed cache
  554. if (!DistributedCache.getSymlink(conf)) {
  555. DistributedCache.createSymlink(conf);
  556. }
  557. String uriWithLink = planPath.toUri().toString() + "#" + name;
  558. DistributedCache.addCacheFile(new URI(uriWithLink), conf);
  559. // set replication of the plan file to a high number. we use the same
  560. // replication factor as used by the hadoop jobclient for job.xml etc.
  561. short replication = (short) conf.getInt("mapred.submit.replication", 10);
  562. fs.setReplication(planPath, replication);
  563. }
  564. }
  565. // Cache the plan in this process
  566. gWorkMap.put(planPath, w);
  567. return planPath;
  568. } catch (Exception e) {
  569. e.printStackTrace();
  570. throw new RuntimeException(e);
  571. }
  572. }
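  // Usage sketch (illustrative; the scratch dir below is an assumed example, and the MapWork
  // would normally be built by the query compiler): the client writes the plan under a unique
  // plan path and the task side reads it back via getMapWork(), with gWorkMap acting as a cache.
  //
  //   JobConf job = new JobConf(new HiveConf());
  //   Path scratchDir = new Path("/tmp/hive-scratch");      // assumed location
  //   MapWork mapWork = new MapWork();                      // placeholder plan
  //   Path planPath = Utilities.setMapWork(job, mapWork, scratchDir, true);
  //   // ... later, in the task JVM, with the same configuration:
  //   MapWork loaded = Utilities.getMapWork(job);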
  573. private static Path getPlanPath(Configuration conf, String name) {
  574. Path planPath = getPlanPath(conf);
  575. if (planPath == null) {
  576. return null;
  577. }
  578. return new Path(planPath, name);
  579. }
  580. private static void setPlanPath(Configuration conf, Path hiveScratchDir) throws IOException {
  581. if (getPlanPath(conf) == null) {
  582. // this is the unique conf ID, which is kept in JobConf as part of the plan file name
  583. String jobID = UUID.randomUUID().toString();
  584. Path planPath = new Path(hiveScratchDir, jobID);
  585. FileSystem fs = planPath.getFileSystem(conf);
  586. fs.mkdirs(planPath);
  587. HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, planPath.toUri().toString());
  588. }
  589. }
  590. public static Path getPlanPath(Configuration conf) {
  591. String plan = HiveConf.getVar(conf, HiveConf.ConfVars.PLAN);
  592. if (plan != null && !plan.isEmpty()) {
  593. return new Path(plan);
  594. }
  595. return null;
  596. }
  597. /**
  598. * Serializes expression via Kryo.
  599. * @param expr Expression.
  600. * @return Bytes.
  601. */
  602. public static byte[] serializeExpressionToKryo(ExprNodeGenericFuncDesc expr) {
  603. return serializeObjectToKryo(expr);
  604. }
  605. /**
  606. * Deserializes expression from Kryo.
  607. * @param bytes Bytes containing the expression.
  608. * @return Expression, or null if deserialization succeeded but the result is not of the expected type.
  609. */
  610. public static ExprNodeGenericFuncDesc deserializeExpressionFromKryo(byte[] bytes) {
  611. return deserializeObjectFromKryo(bytes, ExprNodeGenericFuncDesc.class);
  612. }
  613. public static String serializeExpression(ExprNodeGenericFuncDesc expr) {
  614. try {
  615. return new String(Base64.encodeBase64(serializeExpressionToKryo(expr)), "UTF-8");
  616. } catch (UnsupportedEncodingException ex) {
  617. throw new RuntimeException("UTF-8 support required", ex);
  618. }
  619. }
  620. public static ExprNodeGenericFuncDesc deserializeExpression(String s) {
  621. byte[] bytes;
  622. try {
  623. bytes = Base64.decodeBase64(s.getBytes("UTF-8"));
  624. } catch (UnsupportedEncodingException ex) {
  625. throw new RuntimeException("UTF-8 support required", ex);
  626. }
  627. return deserializeExpressionFromKryo(bytes);
  628. }
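  // Usage sketch (illustrative; the conf key used here is only an example): the pair
  // serializeExpression/deserializeExpression round-trips a predicate through a plain
  // String, which is how filter expressions can be handed around via the configuration.
  //
  //   ExprNodeGenericFuncDesc filterExpr = ...;   // some predicate built elsewhere
  //   String encoded = Utilities.serializeExpression(filterExpr);
  //   conf.set("example.filter.expr", encoded);   // key name is an assumption
  //   ExprNodeGenericFuncDesc restored =
  //       Utilities.deserializeExpression(conf.get("example.filter.expr"));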
  629. private static byte[] serializeObjectToKryo(Serializable object) {
  630. ByteArrayOutputStream baos = new ByteArrayOutputStream();
  631. Output output = new Output(baos);
  632. runtimeSerializationKryo.get().writeObject(output, object);
  633. output.close();
  634. return baos.toByteArray();
  635. }
  636. private static <T extends Serializable> T deserializeObjectFromKryo(byte[] bytes, Class<T> clazz) {
  637. Input inp = new Input(new ByteArrayInputStream(bytes));
  638. T func = runtimeSerializationKryo.get().readObject(inp, clazz);
  639. inp.close();
  640. return func;
  641. }
  642. public static String serializeObject(Serializable expr) {
  643. try {
  644. return new String(Base64.encodeBase64(serializeObjectToKryo(expr)), "UTF-8");
  645. } catch (UnsupportedEncodingException ex) {
  646. throw new RuntimeException("UTF-8 support required", ex);
  647. }
  648. }
  649. public static <T extends Serializable> T deserializeObject(String s, Class<T> clazz) {
  650. try {
  651. return deserializeObjectFromKryo(Base64.decodeBase64(s.getBytes("UTF-8")), clazz);
  652. } catch (UnsupportedEncodingException ex) {
  653. throw new RuntimeException("UTF-8 support required", ex);
  654. }
  655. }
  656. public static class CollectionPersistenceDelegate extends DefaultPersistenceDelegate {
  657. @Override
  658. protected Expression instantiate(Object oldInstance, Encoder out) {
  659. return new Expression(oldInstance, oldInstance.getClass(), "new", null);
  660. }
  661. @Override
  662. protected void initialize(Class type, Object oldInstance, Object newInstance, Encoder out) {
  663. Iterator ite = ((Collection) oldInstance).iterator();
  664. while (ite.hasNext()) {
  665. out.writeStatement(new Statement(oldInstance, "add", new Object[] {ite.next()}));
  666. }
  667. }
  668. }
  669. /**
  670. * Kryo serializer for timestamp.
  671. */
  672. private static class TimestampSerializer extends
  673. com.esotericsoftware.kryo.Serializer<Timestamp> {
  674. @Override
  675. public Timestamp read(Kryo kryo, Input input, Class<Timestamp> clazz) {
  676. Timestamp ts = new Timestamp(input.readLong());
  677. ts.setNanos(input.readInt());
  678. return ts;
  679. }
  680. @Override
  681. public void write(Kryo kryo, Output output, Timestamp ts) {
  682. output.writeLong(ts.getTime());
  683. output.writeInt(ts.getNanos());
  684. }
  685. }
  686. /** Custom Kryo serializer for java.sql.Date; otherwise Kryo gets confused between
  687. * java.sql.Date and java.util.Date while deserializing
  688. */
  689. private static class SqlDateSerializer extends
  690. com.esotericsoftware.kryo.Serializer<java.sql.Date> {
  691. @Override
  692. public java.sql.Date read(Kryo kryo, Input input, Class<java.sql.Date> clazz) {
  693. return new java.sql.Date(input.readLong());
  694. }
  695. @Override
  696. public void write(Kryo kryo, Output output, java.sql.Date sqlDate) {
  697. output.writeLong(sqlDate.getTime());
  698. }
  699. }
  700. private static class CommonTokenSerializer extends com.esotericsoftware.kryo.Serializer<CommonToken> {
  701. @Override
  702. public CommonToken read(Kryo kryo, Input input, Class<CommonToken> clazz) {
  703. return new CommonToken(input.readInt(), input.readString());
  704. }
  705. @Override
  706. public void write(Kryo kryo, Output output, CommonToken token) {
  707. output.writeInt(token.getType());
  708. output.writeString(token.getText());
  709. }
  710. }
  711. private static class PathSerializer extends com.esotericsoftware.kryo.Serializer<Path> {
  712. @Override
  713. public void write(Kryo kryo, Output output, Path path) {
  714. output.writeString(path.toUri().toString());
  715. }
  716. @Override
  717. public Path read(Kryo kryo, Input input, Class<Path> type) {
  718. return new Path(URI.create(input.readString()));
  719. }
  720. }
  721. public static Set<Operator<?>> cloneOperatorTree(Configuration conf, Set<Operator<?>> roots) {
  722. ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
  723. serializePlan(roots, baos, conf, true);
  724. Set<Operator<?>> result = deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
  725. roots.getClass(), conf, true);
  726. return result;
  727. }
  728. private static void serializePlan(Object plan, OutputStream out, Configuration conf, boolean cloningPlan) {
  729. PerfLogger perfLogger = PerfLogger.getPerfLogger();
  730. perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
  731. String serializationType = conf.get(HiveConf.ConfVars.PLAN_SERIALIZATION.varname, "kryo");
  732. LOG.info("Serializing " + plan.getClass().getSimpleName() + " via " + serializationType);
  733. if("javaXML".equalsIgnoreCase(serializationType)) {
  734. serializeObjectByJavaXML(plan, out);
  735. } else {
  736. if(cloningPlan) {
  737. serializeObjectByKryo(cloningQueryPlanKryo.get(), plan, out);
  738. } else {
  739. serializeObjectByKryo(runtimeSerializationKryo.get(), plan, out);
  740. }
  741. }
  742. perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
  743. }
  744. /**
  745. * Serializes the plan.
  746. * @param plan The plan, such as QueryPlan, MapredWork, etc.
  747. * @param out The stream to write to.
  748. * @param conf to pick which serialization format is desired.
  749. */
  750. public static void serializePlan(Object plan, OutputStream out, Configuration conf) {
  751. serializePlan(plan, out, conf, false);
  752. }
  753. private static <T> T deserializePlan(InputStream in, Class<T> planClass, Configuration conf, boolean cloningPlan) {
  754. PerfLogger perfLogger = PerfLogger.getPerfLogger();
  755. perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  756. T plan;
  757. String serializationType = conf.get(HiveConf.ConfVars.PLAN_SERIALIZATION.varname, "kryo");
  758. LOG.info("Deserializing " + planClass.getSimpleName() + " via " + serializationType);
  759. if("javaXML".equalsIgnoreCase(serializationType)) {
  760. plan = deserializeObjectByJavaXML(in);
  761. } else {
  762. if(cloningPlan) {
  763. plan = deserializeObjectByKryo(cloningQueryPlanKryo.get(), in, planClass);
  764. } else {
  765. plan = deserializeObjectByKryo(runtimeSerializationKryo.get(), in, planClass);
  766. }
  767. }
  768. perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  769. return plan;
  770. }
  771. /**
  772. * Deserializes the plan.
  773. * @param in The stream to read from.
  774. * @param planClass class of plan
  775. * @param conf configuration
  776. * @return The plan, such as QueryPlan, MapredWork, etc.
  777. */
  778. public static <T> T deserializePlan(InputStream in, Class<T> planClass, Configuration conf) {
  779. return deserializePlan(in, planClass, conf, false);
  780. }
  781. /**
  782. * Clones the plan by serializing and deserializing it with the configured plan serialization (javaXML or Kryo). Do not use unless necessary.
  783. * @param plan The plan.
  784. * @return The clone.
  785. */
  786. public static MapredWork clonePlan(MapredWork plan) {
  787. // TODO: need proper clone. Meanwhile, let's at least keep this horror in one place
  788. PerfLogger perfLogger = PerfLogger.getPerfLogger();
  789. perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
  790. ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
  791. Configuration conf = new HiveConf();
  792. serializePlan(plan, baos, conf, true);
  793. MapredWork newPlan = deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
  794. MapredWork.class, conf, true);
  795. perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
  796. return newPlan;
  797. }
  798. /**
  799. * Serialize the object. This helper function mainly makes sure that enums,
  800. * counters, etc are handled properly.
  801. */
  802. private static void serializeObjectByJavaXML(Object plan, OutputStream out) {
  803. XMLEncoder e = new XMLEncoder(out);
  804. e.setExceptionListener(new ExceptionListener() {
  805. @Override
  806. public void exceptionThrown(Exception e) {
  807. LOG.warn(org.apache.hadoop.util.StringUtils.stringifyException(e));
  808. throw new RuntimeException("Cannot serialize object", e);
  809. }
  810. });
  811. // workaround for java 1.5
  812. e.setPersistenceDelegate(ExpressionTypes.class, new EnumDelegate());
  813. e.setPersistenceDelegate(GroupByDesc.Mode.class, new EnumDelegate());
  814. e.setPersistenceDelegate(java.sql.Date.class, new DatePersistenceDelegate());
  815. e.setPersistenceDelegate(Timestamp.class, new TimestampPersistenceDelegate());
  816. e.setPersistenceDelegate(org.datanucleus.store.types.backed.Map.class, new MapDelegate());
  817. e.setPersistenceDelegate(org.datanucleus.store.types.backed.List.class, new ListDelegate());
  818. e.setPersistenceDelegate(CommonToken.class, new CommonTokenDelegate());
  819. e.setPersistenceDelegate(Path.class, new PathDelegate());
  820. e.writeObject(plan);
  821. e.close();
  822. }
  823. /**
  824. * @param plan Usually of type MapredWork, MapredLocalWork etc.
  825. * @param out stream in which serialized plan is written into
  826. */
  827. private static void serializeObjectByKryo(Kryo kryo, Object plan, OutputStream out) {
  828. Output output = new Output(out);
  829. kryo.writeObject(output, plan);
  830. output.close();
  831. }
  832. /**
  833. * De-serialize an object. This helper function mainly makes sure that enums,
  834. * counters, etc are handled properly.
  835. */
  836. @SuppressWarnings("unchecked")
  837. private static <T> T deserializeObjectByJavaXML(InputStream in) {
  838. XMLDecoder d = null;
  839. try {
  840. d = new XMLDecoder(in, null, null);
  841. return (T) d.readObject();
  842. } finally {
  843. if (null != d) {
  844. d.close();
  845. }
  846. }
  847. }
  848. private static <T> T deserializeObjectByKryo(Kryo kryo, InputStream in, Class<T> clazz ) {
  849. Input inp = new Input(in);
  850. T t = kryo.readObject(inp,clazz);
  851. inp.close();
  852. return t;
  853. }
  854. // Kryo is not thread-safe, so each thread gets its own instance via this ThreadLocal.
  855. // Also, new Kryo() is expensive, so we want to create it just once per thread.
  856. public static ThreadLocal<Kryo> runtimeSerializationKryo = new ThreadLocal<Kryo>() {
  857. @Override
  858. protected synchronized Kryo initialValue() {
  859. Kryo kryo = new Kryo();
  860. kryo.setClassLoader(Thread.currentThread().getContextClassLoader());
  861. kryo.register(java.sql.Date.class, new SqlDateSerializer());
  862. kryo.register(java.sql.Timestamp.class, new TimestampSerializer());
  863. kryo.register(Path.class, new PathSerializer());
  864. kryo.setInstantiatorStrategy(new StdInstantiatorStrategy());
  865. removeField(kryo, Operator.class, "colExprMap");
  866. removeField(kryo, ColumnInfo.class, "objectInspector");
  867. removeField(kryo, MapWork.class, "opParseCtxMap");
  868. removeField(kryo, MapWork.class, "joinTree");
  869. return kryo;
  870. };
  871. };
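  // Illustrative sketch of using this thread-local directly (mirrors serializeObjectToKryo
  // and deserializeObjectFromKryo above); all names below are example values:
  //
  //   Kryo kryo = Utilities.runtimeSerializationKryo.get();   // per-thread instance
  //   ByteArrayOutputStream bos = new ByteArrayOutputStream();
  //   Output out = new Output(bos);
  //   kryo.writeObject(out, new MapWork());
  //   out.close();
  //   Input in = new Input(new ByteArrayInputStream(bos.toByteArray()));
  //   MapWork copy = kryo.readObject(in, MapWork.class);
  //   in.close();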
  872. @SuppressWarnings("rawtypes")
  873. protected static void removeField(Kryo kryo, Class type, String fieldName) {
  874. FieldSerializer fld = new FieldSerializer(kryo, type);
  875. fld.removeField(fieldName);
  876. kryo.register(type, fld);
  877. }
  878. private static ThreadLocal<Kryo> cloningQueryPlanKryo = new ThreadLocal<Kryo>() {
  879. @Override
  880. protected synchronized Kryo initialValue() {
  881. Kryo kryo = new Kryo();
  882. kryo.setClassLoader(Thread.currentThread().getContextClassLoader());
  883. kryo.register(CommonToken.class, new CommonTokenSerializer());
  884. kryo.register(java.sql.Date.class, new SqlDateSerializer());
  885. kryo.register(java.sql.Timestamp.class, new TimestampSerializer());
  886. kryo.register(Path.class, new PathSerializer());
  887. kryo.setInstantiatorStrategy(new StdInstantiatorStrategy());
  888. return kryo;
  889. };
  890. };
  891. public static TableDesc defaultTd;
  892. static {
  893. // by default we expect ^A separated strings
  894. // This tableDesc does not provide column names. We should always use
  895. // PlanUtils.getDefaultTableDesc(String separatorCode, String columns)
  896. // or getBinarySortableTableDesc(List<FieldSchema> fieldSchemas) when
  897. // we know the column names.
  898. defaultTd = PlanUtils.getDefaultTableDesc("" + Utilities.ctrlaCode);
  899. }
  900. public static final int carriageReturnCode = 13;
  901. public static final int newLineCode = 10;
  902. public static final int tabCode = 9;
  903. public static final int ctrlaCode = 1;
  904. public static final String INDENT = " ";
  905. // Note: When DDL supports specifying what string to represent null,
  906. // we should specify "NULL" to represent null in the temp table, and then
  907. // we can make the following translation deprecated.
  908. public static String nullStringStorage = "\\N";
  909. public static String nullStringOutput = "NULL";
  910. public static Random randGen = new Random();
  911. /**
  912. * Gets the task id if we are running as a Hadoop job. Gets a random number otherwise.
  913. */
  914. public static String getTaskId(Configuration hconf) {
  915. String taskid = (hconf == null) ? null : hconf.get("mapred.task.id");
  916. if ((taskid == null) || taskid.equals("")) {
  917. return ("" + Math.abs(randGen.nextInt()));
  918. } else {
  919. /*
  920. * extract the task and attempt id from the hadoop taskid. in version 17 the leading component
  921. * was 'task_'. thereafter the leading component is 'attempt_'. in 17 - hadoop also seems to
  922. * have used _map_ and _reduce_ to denote map/reduce task types
  923. */
  924. String ret = taskid.replaceAll(".*_[mr]_", "").replaceAll(".*_(map|reduce)_", "");
  925. return (ret);
  926. }
  927. }
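  // Illustrative examples (attempt ids follow the standard Hadoop format):
  //   conf "mapred.task.id" = "attempt_201309151039_0001_m_000003_0"
  //       -> getTaskId(conf) returns "000003_0"   (the ".*_[mr]_" prefix is stripped)
  //   conf "mapred.task.id" unset -> a random non-negative integer is returned instead.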
  928. public static HashMap makeMap(Object... olist) {
  929. HashMap ret = new HashMap();
  930. for (int i = 0; i < olist.length; i += 2) {
  931. ret.put(olist[i], olist[i + 1]);
  932. }
  933. return (ret);
  934. }
  935. public static Properties makeProperties(String... olist) {
  936. Properties ret = new Properties();
  937. for (int i = 0; i < olist.length; i += 2) {
  938. ret.setProperty(olist[i], olist[i + 1]);
  939. }
  940. return (ret);
  941. }
  942. public static ArrayList makeList(Object... olist) {
  943. ArrayList ret = new ArrayList();
  944. for (Object element : olist) {
  945. ret.add(element);
  946. }
  947. return (ret);
  948. }
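  // Illustrative usage (keys and values are arbitrary examples): these varargs helpers build
  // small key/value collections inline, as getTableDesc(cols, colTypes) does further below.
  //
  //   Properties props = Utilities.makeProperties(
  //       serdeConstants.LIST_COLUMNS, "key,value",
  //       serdeConstants.LIST_COLUMN_TYPES, "string,string");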
  949. /**
  950. * StreamPrinter.
  951. *
  952. */
  953. public static class StreamPrinter extends Thread {
  954. InputStream is;
  955. String type;
  956. PrintStream os;
  957. public StreamPrinter(InputStream is, String type, PrintStream os) {
  958. this.is = is;
  959. this.type = type;
  960. this.os = os;
  961. }
  962. @Override
  963. public void run() {
  964. BufferedReader br = null;
  965. try {
  966. InputStreamReader isr = new InputStreamReader(is);
  967. br = new BufferedReader(isr);
  968. String line = null;
  969. if (type != null) {
  970. while ((line = br.readLine()) != null) {
  971. os.println(type + ">" + line);
  972. }
  973. } else {
  974. while ((line = br.readLine()) != null) {
  975. os.println(line);
  976. }
  977. }
  978. br.close();
  979. br=null;
  980. } catch (IOException ioe) {
  981. ioe.printStackTrace();
  982. }finally{
  983. IOUtils.closeStream(br);
  984. }
  985. }
  986. }
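  // Usage sketch (illustrative; the command is an arbitrary example): StreamPrinter threads
  // are typically attached to a child process so stdout/stderr are drained concurrently.
  //
  //   Process proc = Runtime.getRuntime().exec(new String[] {"ls", "-l"});
  //   StreamPrinter outPrinter = new StreamPrinter(proc.getInputStream(), null, System.out);
  //   StreamPrinter errPrinter = new StreamPrinter(proc.getErrorStream(), "ERR", System.err);
  //   outPrinter.start();
  //   errPrinter.start();
  //   int exitCode = proc.waitFor();   // may throw InterruptedException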
  987. public static TableDesc getTableDesc(Table tbl) {
  988. Properties props = tbl.getMetadata();
  989. props.put(serdeConstants.SERIALIZATION_LIB, tbl.getDeserializer().getClass().getName());
  990. return (new TableDesc(tbl.getInputFormatClass(), tbl
  991. .getOutputFormatClass(), props));
  992. }
  993. // column names and column types are all delimited by comma
  994. public static TableDesc getTableDesc(String cols, String colTypes) {
  995. return (new TableDesc(SequenceFileInputFormat.class,
  996. HiveSequenceFileOutputFormat.class, Utilities.makeProperties(
  997. serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode,
  998. serdeConstants.LIST_COLUMNS, cols,
  999. serdeConstants.LIST_COLUMN_TYPES, colTypes,
  1000. serdeConstants.SERIALIZATION_LIB,LazySimpleSerDe.class.getName())));
  1001. }
  1002. public static PartitionDesc getPartitionDesc(Partition part) throws HiveException {
  1003. return (new PartitionDesc(part));
  1004. }
  1005. public static PartitionDesc getPartitionDescFromTableDesc(TableDesc tblDesc, Partition part)
  1006. throws HiveException {
  1007. return new PartitionDesc(part, tblDesc);
  1008. }
  1009. private static String getOpTreeSkel_helper(Operator<?> op, String indent) {
  1010. if (op == null) {
  1011. return "";
  1012. }
  1013. StringBuilder sb = new StringBuilder();
  1014. sb.append(indent);
  1015. sb.append(op.toString());
  1016. sb.append("\n");
  1017. if (op.getChildOperators() != null) {
  1018. for (Object child : op.getChildOperators()) {
  1019. sb.append(getOpTreeSkel_helper((Operator<?>) child, indent + " "));
  1020. }
  1021. }
  1022. return sb.toString();
  1023. }
  1024. public static String getOpTreeSkel(Operator<?> op) {
  1025. return getOpTreeSkel_helper(op, "");
  1026. }
  1027. private static boolean isWhitespace(int c) {
  1028. if (c == -1) {
  1029. return false;
  1030. }
  1031. return Character.isWhitespace((char) c);
  1032. }
  1033. public static boolean contentsEqual(InputStream is1, InputStream is2, boolean ignoreWhitespace)
  1034. throws IOException {
  1035. try {
  1036. if ((is1 == is2) || (is1 == null && is2 == null)) {
  1037. return true;
  1038. }
  1039. if (is1 == null || is2 == null) {
  1040. return false;
  1041. }
  1042. while (true) {
  1043. int c1 = is1.read();
  1044. while (ignoreWhitespace && isWhitespace(c1)) {
  1045. c1 = is1.read();
  1046. }
  1047. int c2 = is2.read();
  1048. while (ignoreWhitespace && isWhitespace(c2)) {
  1049. c2 = is2.read();
  1050. }
  1051. if (c1 == -1 && c2 == -1) {
  1052. return true;
  1053. }
  1054. if (c1 != c2) {
  1055. break;
  1056. }
  1057. }
  1058. } catch (FileNotFoundException e) {
  1059. e.printStackTrace();
  1060. }
  1061. return false;
  1062. }
  1063. /**
  1064. * convert "From src insert blah blah" to "From src insert ... blah"
  1065. */
  1066. public static String abbreviate(String str, int max) {
  1067. str = str.trim();
  1068. int len = str.length();
  1069. int suffixlength = 20;
  1070. if (len <= max) {
  1071. return str;
  1072. }
  1073. suffixlength = Math.min(suffixlength, (max - 3) / 2);
  1074. String rev = StringUtils.reverse(str);
  1075. // get the last few words
  1076. String suffix = WordUtils.abbreviate(rev, 0, suffixlength, "");
  1077. suffix = StringUtils.reverse(suffix);
  1078. // first few ..
  1079. String prefix = StringUtils.abbreviate(str, max - suffix.length());
  1080. return prefix + suffix;
  1081. }
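  // Illustrative behaviour (output shape only, not an exact transcript): abbreviate() shortens
  // a long query string to about `max` characters by keeping a StringUtils-abbreviated prefix
  // (ending in "...") plus a short suffix of at most min(20, (max - 3) / 2) trailing characters
  // cut at a word boundary, so a call like
  //   Utilities.abbreviate("FROM src INSERT OVERWRITE TABLE dst SELECT key, value", 40)
  // keeps the start of the statement, an ellipsis, and the last word or two of the statement.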
  1082. public static final String NSTR = "";
  1083. /**
  1084. * StreamStatus.
  1085. *
  1086. */
  1087. public static enum StreamStatus {
  1088. EOF, TERMINATED
  1089. }
  1090. public static StreamStatus readColumn(DataInput in, OutputStream out) throws IOException {
  1091. boolean foundCrChar = false;
  1092. while (true) {
  1093. int b;
  1094. try {
  1095. b = in.readByte();
  1096. } catch (EOFException e) {
  1097. return StreamStatus.EOF;
  1098. }
  1099. // Default new line characters on windows are "CRLF" so detect if there are any windows
  1100. // native newline characters and handle them.
  1101. if (Shell.WINDOWS) {
  1102. // if the CR is not followed by the LF on windows then add it back to the stream and
  1103. // proceed with next characters in the input stream.
  1104. if (foundCrChar && b != Utilities.newLineCode) {
  1105. out.write(Utilities.carriageReturnCode);
  1106. foundCrChar = false;
  1107. }
  1108. if (b == Utilities.carriageReturnCode) {
  1109. foundCrChar = true;
  1110. continue;
  1111. }
  1112. }
  1113. if (b == Utilities.newLineCode) {
  1114. return StreamStatus.TERMINATED;
  1115. }
  1116. out.write(b);
  1117. }
  1118. // Unreachable
  1119. }
  1120. /**
  1121. * Convert an output stream to a compressed output stream based on codecs and compression options
  1122. * specified in the Job Configuration.
  1123. *
  1124. * @param jc
  1125. * Job Configuration
  1126. * @param out
  1127. * Output Stream to be converted into compressed output stream
  1128. * @return compressed output stream
  1129. */
  1130. public static OutputStream createCompressedStream(JobConf jc, OutputStream out)
  1131. throws IOException {
  1132. boolean isCompressed = FileOutputFormat.getCompressOutput(jc);
  1133. return createCompressedStream(jc, out, isCompressed);
  1134. }
  1135. /**
  1136. * Convert an output stream to a compressed output stream based on the codecs in the Job
  1137. * Configuration. The caller specifies directly whether the file is compressed or not.
  1138. *
  1139. * @param jc
  1140. * Job Configuration
  1141. * @param out
  1142. * Output Stream to be converted into compressed output stream
  1143. * @param isCompressed
  1144. * whether the output stream needs to be compressed or not
  1145. * @return compressed output stream
  1146. */
  1147. public static OutputStream createCompressedStream(JobConf jc, OutputStream out,
  1148. boolean isCompressed) throws IOException {
  1149. if (isCompressed) {
  1150. Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
  1151. DefaultCodec.class);
  1152. CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jc);
  1153. return codec.createOutputStream(out);
  1154. } else {
  1155. return (out);
  1156. }
  1157. }
  1158. /**
  1159. * Based on compression option and configured output codec - get extension for output file. This
  1160. * is only required for text files - not sequencefiles
  1161. *
  1162. * @param jc
  1163. * Job Configuration
  1164. * @param isCompressed
  1165. * Whether the output file is compressed or not
  1166. * @return the required file extension (example: .gz)
  1167. * @deprecated Use {@link #getFileExtension(JobConf, boolean, HiveOutputFormat)}
  1168. */
  1169. @Deprecated
  1170. public static String getFileExtension(JobConf jc, boolean isCompressed) {
  1171. return getFileExtension(jc, isCompressed, new HiveIgnoreKeyTextOutputFormat());
  1172. }
  1173. /**
  1174. * Based on compression option, output format, and configured output codec -
  1175. * get extension for output file. Text files require an extension, whereas
  1176. * others, like sequence files, do not.
  1177. * <p>
  1178. * The property <code>hive.output.file.extension</code> is used to determine
  1179. * the extension - if set, it will override other logic for choosing an
  1180. * extension.
  1181. *
  1182. * @param jc
  1183. * Job Configuration
  1184. * @param isCompressed
  1185. * Whether the output file is compressed or not
  1186. * @param hiveOutputFormat
  1187. * The output format, used to detect if the format is text
  1188. * @return the required file extension (example: .gz)
  1189. */
  1190. public static String getFileExtension(JobConf jc, boolean isCompressed,
  1191. HiveOutputFormat<?, ?> hiveOutputFormat) {
  1192. String extension = HiveConf.getVar(jc, HiveConf.ConfVars.OUTPUT_FILE_EXTENSION);
  1193. if (!StringUtils.isEmpty(extension)) {
  1194. return extension;
  1195. }
  1196. if ((hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) && isCompressed) {
  1197. Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
  1198. DefaultCodec.class);
  1199. CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jc);
  1200. return codec.getDefaultExtension();
  1201. }
  1202. return "";
  1203. }
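  // Usage sketch (illustrative; GzipCodec is just an example codec): for compressed text output
  // the extension comes from the configured codec, while non-text formats get "".
  //
  //   JobConf jc = new JobConf();
  //   FileOutputFormat.setCompressOutput(jc, true);
  //   FileOutputFormat.setOutputCompressorClass(jc, org.apache.hadoop.io.compress.GzipCodec.class);
  //   String ext = Utilities.getFileExtension(jc, true, new HiveIgnoreKeyTextOutputFormat());
  //   // ext == ".gz" unless hive.output.file.extension is set, in which case that value wins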
  1204. /**
  1205. * Create a sequencefile output stream based on job configuration.
  1206. *
  1207. * @param jc
  1208. * Job configuration
  1209. * @param fs
  1210. * File System to create file in
  1211. * @param file
  1212. * Path to be created
  1213. * @param keyClass
  1214. * Java Class for key
  1215. * @param valClass
  1216. * Java Class for value
  1217. * @return output stream over the created sequencefile
  1218. */
  1219. public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
  1220. Class<?> keyClass, Class<?> valClass, Progressable progressable) throws IOException {
  1221. boolean isCompressed = FileOutputFormat.getCompressOutput(jc);
  1222. return createSequenceWriter(jc, fs, file, keyClass, valClass, isCompressed, progressable);
  1223. }
  1224. /**
  1225. * Create a sequencefile output stream based on job configuration. Uses a user-supplied compression
  1226. * flag (rather than obtaining it from the Job Configuration).
  1227. *
  1228. * @param jc
  1229. * Job configuration
  1230. * @param fs
  1231. * File System to create file in
  1232. * @param file
  1233. * Path to be created
  1234. * @param keyClass
  1235. * Java Class for key
  1236. * @param valClass
  1237. * Java Class for value
  1238. * @return output stream over the created sequencefile
  1239. */
  1240. public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
  1241. Class<?> keyClass, Class<?> valClass, boolean isCompressed, Progressable progressable)
  1242. throws IOException {
  1243. CompressionCodec codec = null;
  1244. CompressionType compressionType = CompressionType.NONE;
  1245. Class codecClass = null;
  1246. if (isCompressed) {
  1247. compressionType = SequenceFileOutputFormat.getOutputCompressionType(jc);
  1248. codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
  1249. codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, jc);
  1250. }
  1251. return (SequenceFile.createWriter(fs, jc, file, keyClass, valClass, compressionType, codec,
  1252. progressable));
  1253. }
  1254. /**
  1255. * Create an RCFile output stream based on job configuration. Uses a user-supplied compression flag
  1256. * (rather than obtaining it from the Job Configuration).
  1257. *
  1258. * @param jc
  1259. * Job configuration
  1260. * @param fs
  1261. * File System to create file in
  1262. * @param file
  1263. * Path to be created
  1264. * @return output stream over the created rcfile
  1265. */
  1266. public static RCFile.Writer createRCFileWriter(JobConf jc, FileSystem fs, Path file,
  1267. boolean isCompressed, Progressable progressable) throws IOException {
  1268. CompressionCodec codec = null;
  1269. Class<?> codecClass = null;
  1270. if (isCompressed) {
  1271. codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
  1272. codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, jc);
  1273. }
  1274. return new RCFile.Writer(fs, jc, file, progressable, codec);
  1275. }
  1276. /**
  1277. * Shamelessly cloned from GenericOptionsParser.
  1278. */
  1279. public static String realFile(String newFile, Configuration conf) throws IOException {
  1280. Path path = new Path(newFile);
  1281. URI pathURI = path.toUri();
  1282. FileSystem fs;
  1283. if (pathURI.getScheme() == null) {
  1284. fs = FileSystem.getLocal(conf);
  1285. } else {
  1286. fs = path.getFileSystem(conf);
  1287. }
  1288. if (!fs.exists(path)) {
  1289. return null;
  1290. }
  1291. String file = path.makeQualified(fs).toString();
  1292. return file;
  1293. }
  1294. public static List<String> mergeUniqElems(List<String> src, List<String> dest) {
  1295. if (dest == null) {
  1296. return src;
  1297. }
  1298. if (src == null) {
  1299. return dest;
  1300. }
  1301. int pos = 0;
  1302. while (pos < dest.size()) {
  1303. if (!src.contains(dest.get(pos))) {
  1304. src.add(dest.get(pos));
  1305. }
  1306. pos++;
  1307. }
  1308. return src;
  1309. }
  1310. private static final String tmpPrefix = "_tmp.";
  1311. private static final String taskTmpPrefix = "_task_tmp.";
  1312. public static Path toTaskTempPath(Path orig) {
  1313. if (orig.getName().indexOf(taskTmpPrefix) == 0) {
  1314. return orig;
  1315. }
  1316. return new Path(orig.getParent(), taskTmpPrefix + orig.getName());
  1317. }
  1318. public static Path toTempPath(Path orig) {
  1319. if (orig.getName().indexOf(tmpPrefix) == 0) {
  1320. return orig;
  1321. }
  1322. return new Path(orig.getParent(), tmpPrefix + orig.getName());
  1323. }
  1324. /**
  1325. * Given a path, convert to a temporary path.
  1326. */
  1327. public static Path toTempPath(String orig) {
  1328. return toTempPath(new Path(orig));
  1329. }
  1330. /**
  1331. * Detect if the supplied file is a temporary path.
  1332. */
  1333. public static boolean isTempPath(FileStatus file) {
  1334. String name = file.getPath().getName();
  1335. // in addition to detecting hive temporary files, we also check hadoop
  1336. // temporary folders that used to show up in older releases
  1337. return (name.startsWith("_task") || name.startsWith(tmpPrefix));
  1338. }
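  // Illustrative examples of the tmp-path helpers (paths are arbitrary samples):
  //   toTempPath(new Path("/warehouse/t/000000_0"))     -> /warehouse/t/_tmp.000000_0
  //   toTaskTempPath(new Path("/warehouse/t/000000_0")) -> /warehouse/t/_task_tmp.000000_0
  //   isTempPath() is true for both results above, and for legacy "_task*" names.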
  1339. /**
  1340. * Rename src to dst.
  1341. * Throws a HiveException if the underlying FileSystem rename fails.
  1342. *
  1343. * @param fs
  1344. * the FileSystem where src and dst are on.
  1345. * @param src
  1346. * the src directory
  1347. * @param dst
  1348. * the target directory
  1349. * @throws IOException
  1350. */
  1351. public static void rename(FileSystem fs, Path src, Path dst) throws IOException, HiveException {
  1352. if (!fs.rename(src, dst)) {
  1353. throw new HiveException("Unable to move: " + src + " to: " + dst);
  1354. }
  1355. }
  1356. /**
  1357. * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
  1358. * existing file with the same name, the new file's name will be appended with "_1", "_2", etc.
  1359. *
  1360. * @param fs
  1361. * the FileSystem where src and dst are on.
  1362. * @param src
  1363. * the src directory
  1364. * @param dst
  1365. * the target directory
  1366. * @throws IOException
  1367. */
  1368. public static void renameOrMoveFiles(FileSystem fs, Path src, Path dst) throws IOException,
  1369. HiveException {
  1370. if (!fs.exists(dst)) {
  1371. if (!fs.rename(src, dst)) {
  1372. throw new HiveException("Unable to move: " + src + " to: " + dst);
  1373. }
  1374. } else {
  1375. // move file by file
  1376. FileStatus[] files = fs.listStatus(src);
  1377. for (FileStatus file : files) {
  1378. Path srcFilePath = file.getPath();
  1379. String fileName = srcFilePath.getName();
  1380. Path dstFilePath = new Path(dst, fileName);
  1381. if (file.isDir()) {
  1382. renameOrMoveFiles(fs, srcFilePath, dstFilePath);
  1383. }
  1384. else {
  1385. if (fs.exists(dstFilePath)) {
  1386. int suffix = 0;
  1387. do {
  1388. suffix++;
  1389. dstFilePath = new Path(dst, fileName + "_" + suffix);
  1390. } while (fs.exists(dstFilePath));
  1391. }
  1392. if (!fs.rename(srcFilePath, dstFilePath)) {
  1393. throw new HiveException("Unable to move: " + src + " to: " + dst);
  1394. }
  1395. }
  1396. }
  1397. }
  1398. }
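// Illustrative behavior sketch (made-up paths, not part of the original file): if
// /warehouse/t does not exist, the whole of /warehouse/_tmp.t is renamed to it; if it
// does exist and already contains a file named 000000_0, an incoming 000000_0 from the
// source directory is stored as 000000_0_1 (then _2, and so on).
//
//   Utilities.renameOrMoveFiles(fs, new Path("/warehouse/_tmp.t"), new Path("/warehouse/t"));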
  1399. /**
  1400. * The first group will contain the task id. The second group is the optional extension. The file
  1401. * name looks like: "0_0" or "0_0.gz". There may be a leading prefix (tmp_). Since getTaskId() can
  1402. * return an integer only - this should match a pure integer as well. {1,6} is used to limit
  1403. * matching for attempts #'s 0-999999.
  1404. */
  1405. private static final Pattern FILE_NAME_TO_TASK_ID_REGEX =
  1406. Pattern.compile("^.*?([0-9]+)(_[0-9]{1,6})?(\\..*)?$");
  1407. /**
1408. * This returns the prefix part + taskID for bucket map join on a partitioned table
  1409. */
  1410. private static final Pattern FILE_NAME_PREFIXED_TASK_ID_REGEX =
  1411. Pattern.compile("^.*?((\\(.*\\))?[0-9]+)(_[0-9]{1,6})?(\\..*)?$");
  1412. /**
  1413. * This breaks a prefixed bucket number into the prefix and the taskID
  1414. */
  1415. private static final Pattern PREFIXED_TASK_ID_REGEX =
  1416. Pattern.compile("^(.*?\\(.*\\))?([0-9]+)$");
  1417. /**
  1418. * Get the task id from the filename. It is assumed that the filename is derived from the output
  1419. * of getTaskId
  1420. *
  1421. * @param filename
  1422. * filename to extract taskid from
  1423. */
  1424. public static String getTaskIdFromFilename(String filename) {
  1425. return getIdFromFilename(filename, FILE_NAME_TO_TASK_ID_REGEX);
  1426. }
  1427. /**
  1428. * Get the part-spec + task id from the filename. It is assumed that the filename is derived
  1429. * from the output of getTaskId
  1430. *
  1431. * @param filename
  1432. * filename to extract taskid from
  1433. */
  1434. public static String getPrefixedTaskIdFromFilename(String filename) {
  1435. return getIdFromFilename(filename, FILE_NAME_PREFIXED_TASK_ID_REGEX);
  1436. }
  1437. private static String getIdFromFilename(String filename, Pattern pattern) {
  1438. String taskId = filename;
  1439. int dirEnd = filename.lastIndexOf(Path.SEPARATOR);
  1440. if (dirEnd != -1) {
  1441. taskId = filename.substring(dirEnd + 1);
  1442. }
  1443. Matcher m = pattern.matcher(taskId);
  1444. if (!m.matches()) {
  1445. LOG.warn("Unable to get task id from file name: " + filename + ". Using last component"
  1446. + taskId + " as task id.");
  1447. } else {
  1448. taskId = m.group(1);
  1449. }
  1450. LOG.debug("TaskId for " + filename + " = " + taskId);
  1451. return taskId;
  1452. }
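// Illustrative examples (made-up file names) for the regexes and helpers above; the
// "(ds=1)" prefix is only a made-up example of the optional prefix matched by
// FILE_NAME_PREFIXED_TASK_ID_REGEX:
//
//   Utilities.getTaskIdFromFilename("000001_0.gz");            // -> "000001"
//   Utilities.getTaskIdFromFilename("/warehouse/t/000123_1");  // -> "000123"
//   Utilities.getPrefixedTaskIdFromFilename("(ds=1)000001_0"); // -> "(ds=1)000001"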
  1453. public static String getFileNameFromDirName(String dirName) {
  1454. int dirEnd = dirName.lastIndexOf(Path.SEPARATOR);
  1455. if (dirEnd != -1) {
  1456. return dirName.substring(dirEnd + 1);
  1457. }
  1458. return dirName;
  1459. }
  1460. /**
  1461. * Replace the task id from the filename. It is assumed that the filename is derived from the
  1462. * output of getTaskId
  1463. *
  1464. * @param filename
  1465. * filename to replace taskid "0_0" or "0_0.gz" by 33 to "33_0" or "33_0.gz"
  1466. */
  1467. public static String replaceTaskIdFromFilename(String filename, int bucketNum) {
  1468. return replaceTaskIdFromFilename(filename, String.valueOf(bucketNum));
  1469. }
  1470. public static String replaceTaskIdFromFilename(String filename, String fileId) {
  1471. String taskId = getTaskIdFromFilename(filename);
  1472. String newTaskId = replaceTaskId(taskId, fileId);
  1473. String ret = replaceTaskIdFromFilename(filename, taskId, newTaskId);
  1474. return (ret);
  1475. }
  1476. private static String replaceTaskId(String taskId, int bucketNum) {
  1477. return replaceTaskId(taskId, String.valueOf(bucketNum));
  1478. }
  1479. /**
  1480. * Returns strBucketNum with enough 0's prefixing the task ID portion of the String to make it
  1481. * equal in length to taskId
  1482. *
  1483. * @param taskId - the taskId used as a template for length
  1484. * @param strBucketNum - the bucket number of the output, may or may not be prefixed
1485. * @return strBucketNum with its task ID portion zero-padded to the length of taskId
  1486. */
  1487. private static String replaceTaskId(String taskId, String strBucketNum) {
  1488. Matcher m = PREFIXED_TASK_ID_REGEX.matcher(strBucketNum);
  1489. if (!m.matches()) {
  1490. LOG.warn("Unable to determine bucket number from file ID: " + strBucketNum + ". Using " +
  1491. "file ID as bucket number.");
  1492. return adjustBucketNumLen(strBucketNum, taskId);
  1493. } else {
  1494. String adjustedBucketNum = adjustBucketNumLen(m.group(2), taskId);
  1495. return (m.group(1) == null ? "" : m.group(1)) + adjustedBucketNum;
  1496. }
  1497. }
  1498. /**
  1499. * Adds 0's to the beginning of bucketNum until bucketNum and taskId are the same length.
  1500. *
  1501. * @param bucketNum - the bucket number, should not be prefixed
  1502. * @param taskId - the taskId used as a template for length
1503. * @return bucketNum left-padded with 0's to the length of taskId
  1504. */
  1505. private static String adjustBucketNumLen(String bucketNum, String taskId) {
  1506. int bucketNumLen = bucketNum.length();
  1507. int taskIdLen = taskId.length();
  1508. StringBuffer s = new StringBuffer();
  1509. for (int i = 0; i < taskIdLen - bucketNumLen; i++) {
  1510. s.append("0");
  1511. }
  1512. return s.toString() + bucketNum;
  1513. }
  1514. /**
  1515. * Replace the oldTaskId appearing in the filename by the newTaskId. The string oldTaskId could
  1516. * appear multiple times, we should only replace the last one.
  1517. *
  1518. * @param filename
  1519. * @param oldTaskId
  1520. * @param newTaskId
1521. * @return filename with the last occurrence of oldTaskId replaced by newTaskId
  1522. */
  1523. private static String replaceTaskIdFromFilename(String filename, String oldTaskId,
  1524. String newTaskId) {
  1525. String[] spl = filename.split(oldTaskId);
  1526. if ((spl.length == 0) || (spl.length == 1)) {
  1527. return filename.replaceAll(oldTaskId, newTaskId);
  1528. }
  1529. StringBuffer snew = new StringBuffer();
  1530. for (int idx = 0; idx < spl.length - 1; idx++) {
  1531. if (idx > 0) {
  1532. snew.append(oldTaskId);
  1533. }
  1534. snew.append(spl[idx]);
  1535. }
  1536. snew.append(newTaskId);
  1537. snew.append(spl[spl.length - 1]);
  1538. return snew.toString();
  1539. }
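// Illustrative example (made-up file name) tying the helpers above together; the bucket
// number 3 is zero-padded by adjustBucketNumLen to match the width of "000001":
//
//   Utilities.replaceTaskIdFromFilename("000001_0.gz", 3); // -> "000003_0.gz"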
  1540. /**
1541. * Returns null if the path does not exist.
  1542. */
  1543. public static FileStatus[] listStatusIfExists(Path path, FileSystem fs) throws IOException {
  1544. try {
  1545. return fs.listStatus(path);
  1546. } catch (FileNotFoundException e) {
  1547. // FS in hadoop 2.0 throws FNF instead of returning null
  1548. return null;
  1549. }
  1550. }
  1551. public static void mvFileToFinalPath(Path specPath, Configuration hconf,
  1552. boolean success, Log log, DynamicPartitionCtx dpCtx, FileSinkDesc conf,
  1553. Reporter reporter) throws IOException,
  1554. HiveException {
  1555. FileSystem fs = specPath.getFileSystem(hconf);
  1556. Path tmpPath = Utilities.toTempPath(specPath);
  1557. Path taskTmpPath = Utilities.toTaskTempPath(specPath);
  1558. if (success) {
  1559. if (fs.exists(tmpPath)) {
  1560. // remove any tmp file or double-committed output files
  1561. ArrayList<String> emptyBuckets =
  1562. Utilities.removeTempOrDuplicateFiles(fs, tmpPath, dpCtx);
  1563. // create empty buckets if necessary
  1564. if (emptyBuckets.size() > 0) {
  1565. createEmptyBuckets(hconf, emptyBuckets, conf, reporter);
  1566. }
  1567. // move to the file destination
  1568. log.info("Moving tmp dir: " + tmpPath + " to: " + specPath);
  1569. Utilities.renameOrMoveFiles(fs, tmpPath, specPath);
  1570. }
  1571. } else {
  1572. fs.delete(tmpPath, true);
  1573. }
  1574. fs.delete(taskTmpPath, true);
  1575. }
  1576. /**
  1577. * Check the existence of buckets according to bucket specification. Create empty buckets if
  1578. * needed.
  1579. *
  1580. * @param hconf
  1581. * @param paths A list of empty buckets to create
  1582. * @param conf The definition of the FileSink.
  1583. * @param reporter The mapreduce reporter object
  1584. * @throws HiveException
  1585. * @throws IOException
  1586. */
  1587. private static void createEmptyBuckets(Configuration hconf, ArrayList<String> paths,
  1588. FileSinkDesc conf, Reporter reporter)
  1589. throws HiveException, IOException {
  1590. JobConf jc;
  1591. if (hconf instanceof JobConf) {
  1592. jc = new JobConf(hconf);
  1593. } else {
  1594. // test code path
  1595. jc = new JobConf(hconf);
  1596. }
  1597. HiveOutputFormat<?, ?> hiveOutputFormat = null;
  1598. Class<? extends Writable> outputClass = null;
  1599. boolean isCompressed = conf.getCompressed();
  1600. TableDesc tableInfo = conf.getTableInfo();
  1601. try {
  1602. Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
  1603. serializer.initialize(null, tableInfo.getProperties());
  1604. outputClass = serializer.getSerializedClass();
  1605. hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance();
  1606. } catch (SerDeException e) {
  1607. throw new HiveException(e);
  1608. } catch (InstantiationException e) {
  1609. throw new HiveException(e);
  1610. } catch (IllegalAccessException e) {
  1611. throw new HiveException(e);
  1612. }
  1613. for (String p : paths) {
  1614. Path path = new Path(p);
  1615. RecordWriter writer = HiveFileFormatUtils.getRecordWriter(
  1616. jc, hiveOutputFormat, outputClass, isCompressed,
  1617. tableInfo.getProperties(), path, reporter);
  1618. writer.close(false);
  1619. LOG.info("created empty bucket for enforcing bucketing at " + path);
  1620. }
  1621. }
  1622. /**
  1623. * Remove all temporary files and duplicate (double-committed) files from a given directory.
  1624. */
  1625. public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws IOException {
  1626. removeTempOrDuplicateFiles(fs, path, null);
  1627. }
  1628. /**
  1629. * Remove all temporary files and duplicate (double-committed) files from a given directory.
  1630. *
  1631. * @return a list of path names corresponding to should-be-created empty buckets.
  1632. */
  1633. public static ArrayList<String> removeTempOrDuplicateFiles(FileSystem fs, Path path,
  1634. DynamicPartitionCtx dpCtx) throws IOException {
  1635. if (path == null) {
  1636. return null;
  1637. }
  1638. ArrayList<String> result = new ArrayList<String>();
  1639. if (dpCtx != null) {
  1640. FileStatus parts[] = HiveStatsUtils.getFileStatusRecurse(path, dpCtx.getNumDPCols(), fs);
  1641. HashMap<String, FileStatus> taskIDToFile = null;
  1642. for (int i = 0; i < parts.length; ++i) {
  1643. assert parts[i].isDir() : "dynamic partition " + parts[i].getPath()
  1644. + " is not a direcgtory";
  1645. FileStatus[] items = fs.listStatus(parts[i].getPath());
  1646. // remove empty directory since DP insert should not generate empty partitions.
  1647. // empty directories could be generated by crashed Task/ScriptOperator
  1648. if (items.length == 0) {
  1649. if (!fs.delete(parts[i].getPath(), true)) {
  1650. LOG.error("Cannot delete empty directory " + parts[i].getPath());
  1651. throw new IOException("Cannot delete empty directory " + parts[i].getPath());
  1652. }
  1653. }
  1654. taskIDToFile = removeTempOrDuplicateFiles(items, fs);
  1655. // if the table is bucketed and enforce bucketing, we should check and generate all buckets
  1656. if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null) {
  1657. // refresh the file list
  1658. items = fs.listStatus(parts[i].getPath());
  1659. // get the missing buckets and generate empty buckets
  1660. String taskID1 = taskIDToFile.keySet().iterator().next();
  1661. Path bucketPath = taskIDToFile.values().iterator().next().getPath();
  1662. for (int j = 0; j < dpCtx.getNumBuckets(); ++j) {
  1663. String taskID2 = replaceTaskId(taskID1, j);
  1664. if (!taskIDToFile.containsKey(taskID2)) {
  1665. // create empty bucket, file name should be derived from taskID2
  1666. String path2 = replaceTaskIdFromFilename(bucketPath.toUri().getPath().toString(), j);
  1667. result.add(path2);
  1668. }
  1669. }
  1670. }
  1671. }
  1672. } else {
  1673. FileStatus[] items = fs.listStatus(path);
  1674. removeTempOrDuplicateFiles(items, fs);
  1675. }
  1676. return result;
  1677. }
  1678. public static HashMap<String, FileStatus> removeTempOrDuplicateFiles(FileStatus[] items,
  1679. FileSystem fs) throws IOException {
  1680. if (items == null || fs == null) {
  1681. return null;
  1682. }
  1683. HashMap<String, FileStatus> taskIdToFile = new HashMap<String, FileStatus>();
  1684. for (FileStatus one : items) {
  1685. if (isTempPath(one)) {
  1686. if (!fs.delete(one.getPath(), true)) {
  1687. throw new IOException("Unable to delete tmp file: " + one.getPath());
  1688. }
  1689. } else {
  1690. String taskId = getPrefixedTaskIdFromFilename(one.getPath().getName());
  1691. FileStatus otherFile = taskIdToFile.get(taskId);
  1692. if (otherFile == null) {
  1693. taskIdToFile.put(taskId, one);
  1694. } else {
  1695. // Compare the file sizes of all the attempt files for the same task, the largest win
  1696. // any attempt files could contain partial results (due to task failures or
  1697. // speculative runs), but the largest should be the correct one since the result
  1698. // of a successful run should never be smaller than a failed/speculative run.
  1699. FileStatus toDelete = null;
  1700. if (otherFile.getLen() >= one.getLen()) {
  1701. toDelete = one;
  1702. } else {
  1703. toDelete = otherFile;
  1704. taskIdToFile.put(taskId, one);
  1705. }
  1706. long len1 = toDelete.getLen();
  1707. long len2 = taskIdToFile.get(taskId).getLen();
  1708. if (!fs.delete(toDelete.getPath(), true)) {
  1709. throw new IOException("Unable to delete duplicate file: " + toDelete.getPath()
  1710. + ". Existing file: " + taskIdToFile.get(taskId).getPath());
  1711. } else {
  1712. LOG.warn("Duplicate taskid file removed: " + toDelete.getPath() + " with length "
  1713. + len1 + ". Existing file: " + taskIdToFile.get(taskId).getPath() + " with length "
  1714. + len2);
  1715. }
  1716. }
  1717. }
  1718. }
  1719. return taskIdToFile;
  1720. }
  1721. public static String getNameMessage(Exception e) {
  1722. return e.getClass().getName() + "(" + e.getMessage() + ")";
  1723. }
  1724. public static String getResourceFiles(Configuration conf, SessionState.ResourceType t) {
  1725. // fill in local files to be added to the task environment
  1726. SessionState ss = SessionState.get();
  1727. Set<String> files = (ss == null) ? null : ss.list_resource(t, null);
  1728. if (files != null) {
  1729. List<String> realFiles = new ArrayList<String>(files.size());
  1730. for (String one : files) {
  1731. try {
  1732. realFiles.add(realFile(one, conf));
  1733. } catch (IOException e) {
  1734. throw new RuntimeException("Cannot validate file " + one + "due to exception: "
  1735. + e.getMessage(), e);
  1736. }
  1737. }
  1738. return StringUtils.join(realFiles, ",");
  1739. } else {
  1740. return "";
  1741. }
  1742. }
  1743. /**
  1744. * Create a URL from a string representing a path to a local file.
  1745. * The path string can be just a path, or can start with file:/, file:///
  1746. * @param onestr path string
1747. * @return a URL for the path, or null if the string cannot be converted
  1748. */
  1749. private static URL urlFromPathString(String onestr) {
  1750. URL oneurl = null;
  1751. try {
  1752. if (StringUtils.indexOf(onestr, "file:/") == 0) {
  1753. oneurl = new URL(onestr);
  1754. } else {
  1755. oneurl = new File(onestr).toURL();
  1756. }
  1757. } catch (Exception err) {
  1758. LOG.error("Bad URL " + onestr + ", ignoring path");
  1759. }
  1760. return oneurl;
  1761. }
  1762. /**
  1763. * Add new elements to the classpath.
  1764. *
  1765. * @param newPaths
  1766. * Array of classpath elements
  1767. */
  1768. public static ClassLoader addToClassPath(ClassLoader cloader, String[] newPaths) throws Exception {
  1769. URLClassLoader loader = (URLClassLoader) cloader;
  1770. List<URL> curPath = Arrays.asList(loader.getURLs());
  1771. ArrayList<URL> newPath = new ArrayList<URL>();
  1772. // get a list with the current classpath components
  1773. for (URL onePath : curPath) {
  1774. newPath.add(onePath);
  1775. }
  1776. curPath = newPath;
  1777. for (String onestr : newPaths) {
  1778. URL oneurl = urlFromPathString(onestr);
  1779. if (oneurl != null && !curPath.contains(oneurl)) {
  1780. curPath.add(oneurl);
  1781. }
  1782. }
  1783. return new URLClassLoader(curPath.toArray(new URL[0]), loader);
  1784. }
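// Illustrative usage sketch (the jar path is made up, not part of the original file):
//
//   ClassLoader current = Thread.currentThread().getContextClassLoader();
//   ClassLoader augmented =
//       Utilities.addToClassPath(current, new String[] {"/tmp/my-udf.jar"});
//   Thread.currentThread().setContextClassLoader(augmented);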
  1785. /**
  1786. * remove elements from the classpath.
  1787. *
  1788. * @param pathsToRemove
  1789. * Array of classpath elements
  1790. */
  1791. public static void removeFromClassPath(String[] pathsToRemove) throws Exception {
  1792. Thread curThread = Thread.currentThread();
  1793. URLClassLoader loader = (URLClassLoader) curThread.getContextClassLoader();
  1794. Set<URL> newPath = new HashSet<URL>(Arrays.asList(loader.getURLs()));
  1795. for (String onestr : pathsToRemove) {
  1796. URL oneurl = urlFromPathString(onestr);
  1797. if (oneurl != null) {
  1798. newPath.remove(oneurl);
  1799. }
  1800. }
  1801. loader = new URLClassLoader(newPath.toArray(new URL[0]));
  1802. curThread.setContextClassLoader(loader);
  1803. SessionState.get().getConf().setClassLoader(loader);
  1804. }
  1805. public static String formatBinaryString(byte[] array, int start, int length) {
  1806. StringBuilder sb = new StringBuilder();
  1807. for (int i = start; i < start + length; i++) {
  1808. sb.append("x");
  1809. sb.append(array[i] < 0 ? array[i] + 256 : array[i] + 0);
  1810. }
  1811. return sb.toString();
  1812. }
  1813. public static List<String> getColumnNamesFromSortCols(List<Order> sortCols) {
  1814. List<String> names = new ArrayList<String>();
  1815. for (Order o : sortCols) {
  1816. names.add(o.getCol());
  1817. }
  1818. return names;
  1819. }
  1820. public static List<String> getColumnNamesFromFieldSchema(List<FieldSchema> partCols) {
  1821. List<String> names = new ArrayList<String>();
  1822. for (FieldSchema o : partCols) {
  1823. names.add(o.getName());
  1824. }
  1825. return names;
  1826. }
  1827. public static List<String> getInternalColumnNamesFromSignature(List<ColumnInfo> colInfos) {
  1828. List<String> names = new ArrayList<String>();
  1829. for (ColumnInfo ci : colInfos) {
  1830. names.add(ci.getInternalName());
  1831. }
  1832. return names;
  1833. }
  1834. public static List<String> getColumnNames(Properties props) {
  1835. List<String> names = new ArrayList<String>();
  1836. String colNames = props.getProperty(serdeConstants.LIST_COLUMNS);
  1837. String[] cols = colNames.trim().split(",");
  1838. if (cols != null) {
  1839. for (String col : cols) {
  1840. if (col != null && !col.trim().equals("")) {
  1841. names.add(col);
  1842. }
  1843. }
  1844. }
  1845. return names;
  1846. }
  1847. public static List<String> getColumnTypes(Properties props) {
  1848. List<String> names = new ArrayList<String>();
  1849. String colNames = props.getProperty(serdeConstants.LIST_COLUMN_TYPES);
  1850. String[] cols = colNames.trim().split(",");
  1851. if (cols != null) {
  1852. for (String col : cols) {
  1853. if (col != null && !col.trim().equals("")) {
  1854. names.add(col);
  1855. }
  1856. }
  1857. }
  1858. return names;
  1859. }
  1860. /**
  1861. * Extract db and table name from dbtable string, where db and table are separated by "."
1862. * If there is no db name part, the current session's default db is used.
  1863. * @param dbtable
  1864. * @return String array with two elements, first is db name, second is table name
  1865. * @throws HiveException
  1866. */
  1867. public static String[] getDbTableName(String dbtable) throws HiveException{
  1868. if(dbtable == null){
  1869. return new String[2];
  1870. }
  1871. String[] names = dbtable.split("\\.");
  1872. switch (names.length) {
  1873. case 2:
  1874. return names;
  1875. case 1:
  1876. return new String [] {SessionState.get().getCurrentDatabase(), dbtable};
  1877. default:
  1878. throw new HiveException(ErrorMsg.INVALID_TABLE_NAME, dbtable);
  1879. }
  1880. }
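// Illustrative examples (made-up names):
//
//   Utilities.getDbTableName("sales.orders"); // -> {"sales", "orders"}
//   Utilities.getDbTableName("orders");       // -> {<current session db>, "orders"}
//   Utilities.getDbTableName("a.b.c");        // throws HiveException (INVALID_TABLE_NAME)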
  1881. public static void validateColumnNames(List<String> colNames, List<String> checkCols)
  1882. throws SemanticException {
  1883. Iterator<String> checkColsIter = checkCols.iterator();
  1884. while (checkColsIter.hasNext()) {
  1885. String toCheck = checkColsIter.next();
  1886. boolean found = false;
  1887. Iterator<String> colNamesIter = colNames.iterator();
  1888. while (colNamesIter.hasNext()) {
  1889. String colName = colNamesIter.next();
  1890. if (toCheck.equalsIgnoreCase(colName)) {
  1891. found = true;
  1892. break;
  1893. }
  1894. }
  1895. if (!found) {
  1896. throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg());
  1897. }
  1898. }
  1899. }
  1900. /**
  1901. * Gets the default notification interval to send progress updates to the tracker. Useful for
  1902. * operators that may not output data for a while.
  1903. *
  1904. * @param hconf
  1905. * @return the interval in milliseconds
  1906. */
  1907. public static int getDefaultNotificationInterval(Configuration hconf) {
  1908. int notificationInterval;
1909. String expInterval = hconf.get("mapred.tasktracker.expiry.interval");
1910. if (expInterval != null) {
1911. notificationInterval = Integer.decode(expInterval).intValue() / 2;
  1912. } else {
  1913. // 5 minutes
  1914. notificationInterval = 5 * 60 * 1000;
  1915. }
  1916. return notificationInterval;
  1917. }
  1918. /**
  1919. * Copies the storage handler properties configured for a table descriptor to a runtime job
  1920. * configuration.
  1921. *
  1922. * @param tbl
  1923. * table descriptor from which to read
  1924. *
  1925. * @param job
  1926. * configuration which receives configured properties
  1927. */
  1928. public static void copyTableJobPropertiesToConf(TableDesc tbl, JobConf job) {
  1929. String bucketString = tbl.getProperties()
  1930. .getProperty(hive_metastoreConstants.BUCKET_COUNT);
  1931. // copy the bucket count
  1932. if (bucketString != null) {
  1933. job.set(hive_metastoreConstants.BUCKET_COUNT, bucketString);
  1934. }
  1935. Map<String, String> jobProperties = tbl.getJobProperties();
  1936. if (jobProperties == null) {
  1937. return;
  1938. }
  1939. for (Map.Entry<String, String> entry : jobProperties.entrySet()) {
  1940. job.set(entry.getKey(), entry.getValue());
  1941. }
  1942. }
  1943. private static final Object INPUT_SUMMARY_LOCK = new Object();
  1944. /**
  1945. * Calculate the total size of input files.
  1946. *
  1947. * @param ctx
  1948. * the hadoop job context
  1949. * @param work
  1950. * map reduce job plan
  1951. * @param filter
  1952. * filter to apply to the input paths before calculating size
  1953. * @return the summary of all the input paths.
  1954. * @throws IOException
  1955. */
  1956. public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter)
  1957. throws IOException {
  1958. PerfLogger perfLogger = PerfLogger.getPerfLogger();
  1959. perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
  1960. long[] summary = {0, 0, 0};
  1961. final List<String> pathNeedProcess = new ArrayList<String>();
1962. // Since multiple threads could call this method concurrently, locking
1963. // here keeps the number of spawned threads under control.
  1964. synchronized (INPUT_SUMMARY_LOCK) {
  1965. // For each input path, calculate the total size.
  1966. for (String path : work.getPathToAliases().keySet()) {
  1967. Path p = new Path(path);
  1968. if (filter != null && !filter.accept(p)) {
  1969. continue;
  1970. }
  1971. ContentSummary cs = ctx.getCS(path);
  1972. if (cs == null) {
  1973. if (path == null) {
  1974. continue;
  1975. }
  1976. pathNeedProcess.add(path);
  1977. } else {
  1978. summary[0] += cs.getLength();
  1979. summary[1] += cs.getFileCount();
  1980. summary[2] += cs.getDirectoryCount();
  1981. }
  1982. }
  1983. // Process the case when name node call is needed
  1984. final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
  1985. ArrayList<Future<?>> results = new ArrayList<Future<?>>();
  1986. final ThreadPoolExecutor executor;
  1987. int maxThreads = ctx.getConf().getInt("mapred.dfsclient.parallelism.max", 0);
  1988. if (pathNeedProcess.size() > 1 && maxThreads > 1) {
  1989. int numExecutors = Math.min(pathNeedProcess.size(), maxThreads);
  1990. LOG.info("Using " + numExecutors + " threads for getContentSummary");
  1991. executor = new ThreadPoolExecutor(numExecutors, numExecutors, 60, TimeUnit.SECONDS,
  1992. new LinkedBlockingQueue<Runnable>());
  1993. } else {
  1994. executor = null;
  1995. }
  1996. HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
  1997. @Override
  1998. public void interrupt() {
  1999. for (String path : pathNeedProcess) {
  2000. try {
  2001. new Path(path).getFileSystem(ctx.getConf()).close();
  2002. } catch (IOException ignore) {
  2003. LOG.debug(ignore);
  2004. }
  2005. }
  2006. if (executor != null) {
  2007. executor.shutdownNow();
  2008. }
  2009. }
  2010. });
  2011. try {
  2012. Configuration conf = ctx.getConf();
  2013. JobConf jobConf = new JobConf(conf);
  2014. for (String path : pathNeedProcess) {
  2015. final Path p = new Path(path);
  2016. final String pathStr = path;
  2017. // All threads share the same Configuration and JobConf based on the
  2018. // assumption that they are thread safe if only read operations are
2019. // executed. Hadoop's javadoc does not state this explicitly, but the source
2020. // code clearly shows an effort to make these objects thread safe, so we
2021. // rely on that assumption here. Revisit this code if the assumption
2022. // turns out to be incorrect.
  2023. final Configuration myConf = conf;
  2024. final JobConf myJobConf = jobConf;
  2025. final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
  2026. final Map<String, ArrayList<String>> pathToAlias = work.getPathToAliases();
  2027. final PartitionDesc partDesc = work.getPathToPartitionInfo().get(
  2028. p.toString());
  2029. Runnable r = new Runnable() {
  2030. @Override
  2031. public void run() {
  2032. try {
  2033. Class<? extends InputFormat> inputFormatCls = partDesc
  2034. .getInputFileFormatClass();
  2035. InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
  2036. inputFormatCls, myJobConf);
  2037. if (inputFormatObj instanceof ContentSummaryInputFormat) {
  2038. ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
  2039. resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
  2040. return;
  2041. }
  2042. HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf,
  2043. SerDeUtils.createOverlayedProperties(
  2044. partDesc.getTableDesc().getProperties(),
  2045. partDesc.getProperties())
  2046. .getProperty(hive_metastoreConstants.META_TABLE_STORAGE));
  2047. if (handler instanceof InputEstimator) {
  2048. long total = 0;
  2049. TableDesc tableDesc = partDesc.getTableDesc();
  2050. InputEstimator estimator = (InputEstimator) handler;
  2051. for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
  2052. JobConf jobConf = new JobConf(myJobConf);
  2053. TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
  2054. Utilities.setColumnNameList(jobConf, scanOp, true);
  2055. Utilities.setColumnTypeList(jobConf, scanOp, true);
  2056. PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
  2057. Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
  2058. total += estimator.estimate(myJobConf, scanOp, -1).getTotalLength();
  2059. }
  2060. resultMap.put(pathStr, new ContentSummary(total, -1, -1));
  2061. }
  2062. // todo: should nullify summary for non-native tables,
  2063. // not to be selected as a mapjoin target
  2064. FileSystem fs = p.getFileSystem(myConf);
  2065. resultMap.put(pathStr, fs.getContentSummary(p));
  2066. } catch (Exception e) {
  2067. // We safely ignore this exception for summary data.
  2068. // We don't update the cache to protect it from polluting other
  2069. // usages. The worst case is that IOException will always be
  2070. // retried for another getInputSummary(), which is fine as
  2071. // IOException is not considered as a common case.
  2072. LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
  2073. }
  2074. }
  2075. };
  2076. if (executor == null) {
  2077. r.run();
  2078. } else {
  2079. Future<?> result = executor.submit(r);
  2080. results.add(result);
  2081. }
  2082. }
  2083. if (executor != null) {
  2084. for (Future<?> result : results) {
  2085. boolean executorDone = false;
  2086. do {
  2087. try {
  2088. result.get();
  2089. executorDone = true;
  2090. } catch (InterruptedException e) {
  2091. LOG.info("Interrupted when waiting threads: ", e);
  2092. Thread.currentThread().interrupt();
  2093. break;
  2094. } catch (ExecutionException e) {
  2095. throw new IOException(e);
  2096. }
  2097. } while (!executorDone);
  2098. }
  2099. executor.shutdown();
  2100. }
  2101. HiveInterruptUtils.checkInterrupted();
  2102. for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
  2103. ContentSummary cs = entry.getValue();
  2104. summary[0] += cs.getLength();
  2105. summary[1] += cs.getFileCount();
  2106. summary[2] += cs.getDirectoryCount();
  2107. ctx.addCS(entry.getKey(), cs);
  2108. LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength()
  2109. + " file count: "
  2110. + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
  2111. }
  2112. perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
  2113. return new ContentSummary(summary[0], summary[1], summary[2]);
  2114. } finally {
  2115. HiveInterruptUtils.remove(interrup);
  2116. }
  2117. }
  2118. }
  2119. public static long sumOf(Map<String, Long> aliasToSize, Set<String> aliases) {
  2120. return sumOfExcept(aliasToSize, aliases, null);
  2121. }
2122. // Returns the sum of sizes for the given aliases, excluding those in 'excepts'. Returns -1 if the size of any remaining alias is unknown.
  2123. public static long sumOfExcept(Map<String, Long> aliasToSize,
  2124. Set<String> aliases, Set<String> excepts) {
  2125. long total = 0;
  2126. for (String alias : aliases) {
  2127. if (excepts != null && excepts.contains(alias)) {
  2128. continue;
  2129. }
  2130. Long size = aliasToSize.get(alias);
  2131. if (size == null) {
  2132. return -1;
  2133. }
  2134. total += size;
  2135. }
  2136. return total;
  2137. }
  2138. public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx)
  2139. throws Exception {
  2140. ContentSummary cs = ctx.getCS(dirPath);
  2141. if (cs != null) {
  2142. LOG.info("Content Summary " + dirPath + "length: " + cs.getLength() + " num files: "
  2143. + cs.getFileCount() + " num directories: " + cs.getDirectoryCount());
  2144. return (cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1);
  2145. } else {
  2146. LOG.info("Content Summary not cached for " + dirPath);
  2147. }
  2148. return isEmptyPath(job, dirPath);
  2149. }
  2150. public static boolean isEmptyPath(JobConf job, Path dirPath) throws Exception {
  2151. FileSystem inpFs = dirPath.getFileSystem(job);
  2152. if (inpFs.exists(dirPath)) {
  2153. FileStatus[] fStats = inpFs.listStatus(dirPath);
  2154. if (fStats.length > 0) {
  2155. return false;
  2156. }
  2157. }
  2158. return true;
  2159. }
  2160. public static List<TezTask> getTezTasks(List<Task<? extends Serializable>> tasks) {
  2161. List<TezTask> tezTasks = new ArrayList<TezTask>();
  2162. if (tasks != null) {
  2163. getTezTasks(tasks, tezTasks);
  2164. }
  2165. return tezTasks;
  2166. }
  2167. private static void getTezTasks(List<Task<? extends Serializable>> tasks, List<TezTask> tezTasks) {
  2168. for (Task<? extends Serializable> task : tasks) {
  2169. if (task instanceof TezTask && !tezTasks.contains(task)) {
  2170. tezTasks.add((TezTask) task);
  2171. }
  2172. if (task.getDependentTasks() != null) {
  2173. getTezTasks(task.getDependentTasks(), tezTasks);
  2174. }
  2175. }
  2176. }
  2177. public static List<ExecDriver> getMRTasks(List<Task<? extends Serializable>> tasks) {
  2178. List<ExecDriver> mrTasks = new ArrayList<ExecDriver>();
  2179. if (tasks != null) {
  2180. getMRTasks(tasks, mrTasks);
  2181. }
  2182. return mrTasks;
  2183. }
  2184. private static void getMRTasks(List<Task<? extends Serializable>> tasks, List<ExecDriver> mrTasks) {
  2185. for (Task<? extends Serializable> task : tasks) {
  2186. if (task instanceof ExecDriver && !mrTasks.contains(task)) {
  2187. mrTasks.add((ExecDriver) task);
  2188. }
  2189. if (task.getDependentTasks() != null) {
  2190. getMRTasks(task.getDependentTasks(), mrTasks);
  2191. }
  2192. }
  2193. }
  2194. /**
  2195. * Construct a list of full partition spec from Dynamic Partition Context and the directory names
  2196. * corresponding to these dynamic partitions.
  2197. */
  2198. public static List<LinkedHashMap<String, String>> getFullDPSpecs(Configuration conf,
  2199. DynamicPartitionCtx dpCtx) throws HiveException {
  2200. try {
  2201. Path loadPath = dpCtx.getRootPath();
  2202. FileSystem fs = loadPath.getFileSystem(conf);
  2203. int numDPCols = dpCtx.getNumDPCols();
  2204. FileStatus[] status = HiveStatsUtils.getFileStatusRecurse(loadPath, numDPCols, fs);
  2205. if (status.length == 0) {
  2206. LOG.warn("No partition is generated by dynamic partitioning");
  2207. return null;
  2208. }
  2209. // partial partition specification
  2210. Map<String, String> partSpec = dpCtx.getPartSpec();
  2211. // list of full partition specification
  2212. List<LinkedHashMap<String, String>> fullPartSpecs = new ArrayList<LinkedHashMap<String, String>>();
  2213. // for each dynamically created DP directory, construct a full partition spec
  2214. // and load the partition based on that
  2215. for (int i = 0; i < status.length; ++i) {
  2216. // get the dynamically created directory
  2217. Path partPath = status[i].getPath();
  2218. assert fs.getFileStatus(partPath).isDir() : "partitions " + partPath
  2219. + " is not a directory !";
  2220. // generate a full partition specification
  2221. LinkedHashMap<String, String> fullPartSpec = new LinkedHashMap<String, String>(partSpec);
  2222. Warehouse.makeSpecFromName(fullPartSpec, partPath);
  2223. fullPartSpecs.add(fullPartSpec);
  2224. }
  2225. return fullPartSpecs;
  2226. } catch (IOException e) {
  2227. throw new HiveException(e);
  2228. }
  2229. }
  2230. public static StatsPublisher getStatsPublisher(JobConf jc) {
  2231. StatsFactory factory = StatsFactory.newFactory(jc);
  2232. return factory == null ? null : factory.getStatsPublisher();
  2233. }
  2234. /**
  2235. * If statsPrefix's length is greater than maxPrefixLength and maxPrefixLength > 0,
  2236. * then it returns an MD5 hash of statsPrefix followed by path separator, otherwise
  2237. * it returns statsPrefix
  2238. *
  2239. * @param statsPrefix prefix of stats key
  2240. * @param maxPrefixLength max length of stats key
  2241. * @return if the length of prefix is longer than max, return MD5 hashed value of the prefix
  2242. */
  2243. public static String getHashedStatsPrefix(String statsPrefix, int maxPrefixLength) {
  2244. // todo: this might return possibly longer prefix than
  2245. // maxPrefixLength (if set) when maxPrefixLength - postfixLength < 17,
  2246. // which would make stat values invalid (especially for 'counter' type)
  2247. if (maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength) {
  2248. try {
  2249. MessageDigest digester = MessageDigest.getInstance("MD5");
  2250. digester.update(statsPrefix.getBytes());
  2251. return new String(digester.digest()) + Path.SEPARATOR; // 17 byte
  2252. } catch (NoSuchAlgorithmException e) {
  2253. throw new RuntimeException(e);
  2254. }
  2255. }
  2256. return statsPrefix.endsWith(Path.SEPARATOR) ? statsPrefix : statsPrefix + Path.SEPARATOR;
  2257. }
  2258. public static String join(String... elements) {
  2259. StringBuilder builder = new StringBuilder();
  2260. for (String element : elements) {
  2261. if (element == null || element.isEmpty()) {
  2262. continue;
  2263. }
  2264. builder.append(element);
  2265. if (!element.endsWith(Path.SEPARATOR)) {
  2266. builder.append(Path.SEPARATOR);
  2267. }
  2268. }
  2269. return builder.toString();
  2270. }
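// Illustrative example (made-up path): empty or null elements are skipped and a
// trailing separator is always appended:
//
//   Utilities.join("hdfs://nn:8020/tmp", "stats", ""); // -> "hdfs://nn:8020/tmp/stats/"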
  2271. public static void setColumnNameList(JobConf jobConf, Operator op) {
  2272. setColumnNameList(jobConf, op, false);
  2273. }
  2274. public static void setColumnNameList(JobConf jobConf, Operator op, boolean excludeVCs) {
  2275. RowSchema rowSchema = op.getSchema();
  2276. if (rowSchema == null) {
  2277. return;
  2278. }
  2279. StringBuilder columnNames = new StringBuilder();
  2280. for (ColumnInfo colInfo : rowSchema.getSignature()) {
  2281. if (excludeVCs && colInfo.getIsVirtualCol()) {
  2282. continue;
  2283. }
  2284. if (columnNames.length() > 0) {
  2285. columnNames.append(",");
  2286. }
  2287. columnNames.append(colInfo.getInternalName());
  2288. }
  2289. String columnNamesString = columnNames.toString();
  2290. jobConf.set(serdeConstants.LIST_COLUMNS, columnNamesString);
  2291. }
  2292. public static void setColumnTypeList(JobConf jobConf, Operator op) {
  2293. setColumnTypeList(jobConf, op, false);
  2294. }
  2295. public static void setColumnTypeList(JobConf jobConf, Operator op, boolean excludeVCs) {
  2296. RowSchema rowSchema = op.getSchema();
  2297. if (rowSchema == null) {
  2298. return;
  2299. }
  2300. StringBuilder columnTypes = new StringBuilder();
  2301. for (ColumnInfo colInfo : rowSchema.getSignature()) {
  2302. if (excludeVCs && colInfo.getIsVirtualCol()) {
  2303. continue;
  2304. }
  2305. if (columnTypes.length() > 0) {
  2306. columnTypes.append(",");
  2307. }
  2308. columnTypes.append(colInfo.getTypeName());
  2309. }
  2310. String columnTypesString = columnTypes.toString();
  2311. jobConf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypesString);
  2312. }
  2313. public static String suffix = ".hashtable";
  2314. public static Path generatePath(Path basePath, String dumpFilePrefix,
  2315. Byte tag, String bigBucketFileName) {
  2316. return new Path(basePath, "MapJoin-" + dumpFilePrefix + tag +
  2317. "-" + bigBucketFileName + suffix);
  2318. }
  2319. public static String generateFileName(Byte tag, String bigBucketFileName) {
2320. String fileName = "MapJoin-" + tag + "-" + bigBucketFileName + suffix;
  2321. return fileName;
  2322. }
  2323. public static Path generateTmpPath(Path basePath, String id) {
  2324. return new Path(basePath, "HashTable-" + id);
  2325. }
  2326. public static Path generateTarPath(Path basePath, String filename) {
  2327. return new Path(basePath, filename + ".tar.gz");
  2328. }
  2329. public static String generateTarFileName(String name) {
  2330. return name + ".tar.gz";
  2331. }
  2332. public static String generatePath(Path baseURI, String filename) {
2333. String path = baseURI + Path.SEPARATOR + filename;
  2334. return path;
  2335. }
  2336. public static String now() {
  2337. Calendar cal = Calendar.getInstance();
2338. SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  2339. return sdf.format(cal.getTime());
  2340. }
  2341. public static double showTime(long time) {
  2342. double result = (double) time / (double) 1000;
  2343. return result;
  2344. }
  2345. /**
2346. * The check here is not especially clean. It first loops over all input formats
2347. * and collects the ones that implement ReworkMapredInputFormat into a set, and
2348. * then iterates over that set, calling rework on each one.
2349. *
2350. * Technically all of this could be avoided if every input format used by Hive
2351. * shared a common interface. That is not possible in today's Hive and Hadoop,
2352. * because many of the input formats Hive relies on live in Hadoop's code base,
2353. * and most of Hadoop's input formats simply implement the InputFormat
2354. * interface.
  2355. *
  2356. * @param task
  2357. * @param reworkMapredWork
  2358. * @param conf
  2359. * @throws SemanticException
  2360. */
  2361. public static void reworkMapRedWork(Task<? extends Serializable> task,
  2362. boolean reworkMapredWork, HiveConf conf) throws SemanticException {
  2363. if (reworkMapredWork && (task instanceof MapRedTask)) {
  2364. try {
  2365. MapredWork mapredWork = ((MapRedTask) task).getWork();
  2366. Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
  2367. for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
  2368. Class<? extends InputFormat> inputFormatCls = part
  2369. .getInputFileFormatClass();
  2370. if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
  2371. reworkInputFormats.add(inputFormatCls);
  2372. }
  2373. }
  2374. if (reworkInputFormats.size() > 0) {
  2375. for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
  2376. ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtils
  2377. .newInstance(inputFormatCls, null);
  2378. inst.rework(conf, mapredWork);
  2379. }
  2380. }
  2381. } catch (IOException e) {
  2382. throw new SemanticException(e);
  2383. }
  2384. }
  2385. }
  2386. public static class SQLCommand<T> {
  2387. public T run(PreparedStatement stmt) throws SQLException {
  2388. return null;
  2389. }
  2390. }
  2391. /**
  2392. * Retry SQL execution with random backoff (same as the one implemented in HDFS-767).
  2393. * This function only retries when the SQL query throws a SQLTransientException (which
  2394. * might be able to succeed with a simple retry). It doesn't retry when the exception
  2395. * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
  2396. * the caller needs to reconnect to the database and restart the whole transaction.
  2397. *
  2398. * @param cmd the SQL command
  2399. * @param stmt the prepared statement of SQL.
  2400. * @param baseWindow The base time window (in milliseconds) before the next retry.
  2401. * see {@link #getRandomWaitTime} for details.
  2402. * @param maxRetries the maximum # of retries when getting a SQLTransientException.
  2403. * @throws SQLException throws SQLRecoverableException or SQLNonTransientException the
  2404. * first time it is caught, or SQLTransientException when the maxRetries has reached.
  2405. */
  2406. public static <T> T executeWithRetry(SQLCommand<T> cmd, PreparedStatement stmt,
  2407. int baseWindow, int maxRetries) throws SQLException {
  2408. Random r = new Random();
  2409. T result = null;
  2410. // retry with # of maxRetries before throwing exception
  2411. for (int failures = 0; ; failures++) {
  2412. try {
  2413. result = cmd.run(stmt);
  2414. return result;
  2415. } catch (SQLTransientException e) {
  2416. LOG.warn("Failure and retry #" + failures + " with exception " + e.getMessage());
  2417. if (failures >= maxRetries) {
  2418. throw e;
  2419. }
  2420. long waitTime = getRandomWaitTime(baseWindow, failures, r);
  2421. try {
  2422. Thread.sleep(waitTime);
  2423. } catch (InterruptedException iex) {
  2424. }
  2425. } catch (SQLException e) {
  2426. // throw other types of SQLExceptions (SQLNonTransientException / SQLRecoverableException)
  2427. throw e;
  2428. }
  2429. }
  2430. }
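// Illustrative usage sketch (the SQL text, table name and retry settings are made up;
// conn would typically come from connectWithRetry below):
//
//   SQLCommand<Integer> updateCmd = new SQLCommand<Integer>() {
//     @Override
//     public Integer run(PreparedStatement stmt) throws SQLException {
//       return stmt.executeUpdate();
//     }
//   };
//   PreparedStatement stmt =
//       Utilities.prepareWithRetry(conn, "UPDATE MY_STATS SET N = N + 1", 300, 5);
//   int updated = Utilities.executeWithRetry(updateCmd, stmt, 300, 5);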
  2431. /**
  2432. * Retry connecting to a database with random backoff (same as the one implemented in HDFS-767).
  2433. * This function only retries when the SQL query throws a SQLTransientException (which
  2434. * might be able to succeed with a simple retry). It doesn't retry when the exception
  2435. * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
  2436. * the caller needs to reconnect to the database and restart the whole transaction.
  2437. *
  2438. * @param connectionString the JDBC connection string.
  2439. * @param waitWindow The base time window (in milliseconds) before the next retry.
  2440. * see {@link #getRandomWaitTime} for details.
  2441. * @param maxRetries the maximum # of retries when getting a SQLTransientException.
  2442. * @throws SQLException throws SQLRecoverableException or SQLNonTransientException the
  2443. * first time it is caught, or SQLTransientException when the maxRetries has reached.
  2444. */
  2445. public static Connection connectWithRetry(String connectionString,
  2446. int waitWindow, int maxRetries) throws SQLException {
  2447. Random r = new Random();
  2448. // retry with # of maxRetries before throwing exception
  2449. for (int failures = 0; ; failures++) {
  2450. try {
  2451. Connection conn = DriverManager.getConnection(connectionString);
  2452. return conn;
  2453. } catch (SQLTransientException e) {
  2454. if (failures >= maxRetries) {
  2455. LOG.error("Error during JDBC connection. " + e);
  2456. throw e;
  2457. }
  2458. long waitTime = Utilities.getRandomWaitTime(waitWindow, failures, r);
  2459. try {
  2460. Thread.sleep(waitTime);
  2461. } catch (InterruptedException e1) {
  2462. }
  2463. } catch (SQLException e) {
  2464. // just throw other types (SQLNonTransientException / SQLRecoverableException)
  2465. throw e;
  2466. }
  2467. }
  2468. }
  2469. /**
  2470. * Retry preparing a SQL statement with random backoff (same as the one implemented in HDFS-767).
  2471. * This function only retries when the SQL query throws a SQLTransientException (which
  2472. * might be able to succeed with a simple retry). It doesn't retry when the exception
  2473. * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
  2474. * the caller needs to reconnect to the database and restart the whole transaction.
  2475. *
  2476. * @param conn a JDBC connection.
  2477. * @param stmt the SQL statement to be prepared.
  2478. * @param waitWindow The base time window (in milliseconds) before the next retry.
  2479. * see {@link #getRandomWaitTime} for details.
  2480. * @param maxRetries the maximum # of retries when getting a SQLTransientException.
  2481. * @throws SQLException throws SQLRecoverableException or SQLNonTransientException the
  2482. * first time it is caught, or SQLTransientException when the maxRetries has reached.
  2483. */
  2484. public static PreparedStatement prepareWithRetry(Connection conn, String stmt,
  2485. int waitWindow, int maxRetries) throws SQLException {
  2486. Random r = new Random();
  2487. // retry with # of maxRetries before throwing exception
  2488. for (int failures = 0; ; failures++) {
  2489. try {
  2490. return conn.prepareStatement(stmt);
  2491. } catch (SQLTransientException e) {
  2492. if (failures >= maxRetries) {
  2493. LOG.error("Error preparing JDBC Statement " + stmt + " :" + e);
  2494. throw e;
  2495. }
  2496. long waitTime = Utilities.getRandomWaitTime(waitWindow, failures, r);
  2497. try {
  2498. Thread.sleep(waitTime);
  2499. } catch (InterruptedException e1) {
  2500. }
  2501. } catch (SQLException e) {
  2502. // just throw other types (SQLNonTransientException / SQLRecoverableException)
  2503. throw e;
  2504. }
  2505. }
  2506. }
  2507. /**
  2508. * Introducing a random factor to the wait time before another retry.
  2509. * The wait time is dependent on # of failures and a random factor.
2510. * The first time an exception is caught, the wait time
  2511. * is a random number between 0..baseWindow msec. If the first retry
  2512. * still fails, we will wait baseWindow msec grace period before the 2nd retry.
  2513. * Also at the second retry, the waiting window is expanded to 2*baseWindow msec
  2514. * alleviating the request rate from the server. Similarly the 3rd retry
  2515. * will wait 2*baseWindow msec. grace period before retry and the waiting window is
  2516. * expanded to 3*baseWindow msec and so on.
  2517. * @param baseWindow the base waiting window.
  2518. * @param failures number of failures so far.
  2519. * @param r a random generator.
  2520. * @return number of milliseconds for the next wait time.
  2521. */
  2522. public static long getRandomWaitTime(int baseWindow, int failures, Random r) {
  2523. return (long) (
  2524. baseWindow * failures + // grace period for the last round of attempt
  2525. baseWindow * (failures + 1) * r.nextDouble()); // expanding time window for each failure
  2526. }
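// Worked example of the expanding window described above, with baseWindow = 300 ms
// (values are illustrative):
//   failures = 0 -> wait chosen from [0, 300) ms
//   failures = 1 -> wait chosen from [300, 900) ms
//   failures = 2 -> wait chosen from [600, 1500) ms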
  2527. public static final char sqlEscapeChar = '\\';
  2528. /**
  2529. * Escape the '_', '%', as well as the escape characters inside the string key.
  2530. * @param key the string that will be used for the SQL LIKE operator.
  2531. * @return a string with escaped '_' and '%'.
  2532. */
  2533. public static String escapeSqlLike(String key) {
  2534. StringBuffer sb = new StringBuffer(key.length());
  2535. for (char c: key.toCharArray()) {
  2536. switch(c) {
  2537. case '_':
  2538. case '%':
  2539. case sqlEscapeChar:
  2540. sb.append(sqlEscapeChar);
  2541. // fall through
  2542. default:
  2543. sb.append(c);
  2544. break;
  2545. }
  2546. }
  2547. return sb.toString();
  2548. }
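// Illustrative example (made-up key): '%', '_' and the escape character itself are
// each preceded by a backslash in the result:
//
//   Utilities.escapeSqlLike("100%_raise"); // -> "100\%\_raise"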
  2549. /**
  2550. * Format number of milliseconds to strings
  2551. *
  2552. * @param msec milliseconds
  2553. * @return a formatted string like "x days y hours z minutes a seconds b msec"
  2554. */
  2555. public static String formatMsecToStr(long msec) {
  2556. long day = -1, hour = -1, minute = -1, second = -1;
  2557. long ms = msec % 1000;
  2558. long timeLeft = msec / 1000;
  2559. if (timeLeft > 0) {
  2560. second = timeLeft % 60;
  2561. timeLeft /= 60;
  2562. if (timeLeft > 0) {
  2563. minute = timeLeft % 60;
  2564. timeLeft /= 60;
  2565. if (timeLeft > 0) {
  2566. hour = timeLeft % 24;
  2567. day = timeLeft / 24;
  2568. }
  2569. }
  2570. }
  2571. StringBuilder sb = new StringBuilder();
  2572. if (day != -1) {
  2573. sb.append(day + " days ");
  2574. }
  2575. if (hour != -1) {
  2576. sb.append(hour + " hours ");
  2577. }
  2578. if (minute != -1) {
  2579. sb.append(minute + " minutes ");
  2580. }
  2581. if (second != -1) {
  2582. sb.append(second + " seconds ");
  2583. }
  2584. sb.append(ms + " msec");
  2585. return sb.toString();
  2586. }
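// Illustrative examples (values are made up):
//
//   Utilities.formatMsecToStr(93784005L); // -> "1 days 2 hours 3 minutes 4 seconds 5 msec"
//   Utilities.formatMsecToStr(42L);       // -> "42 msec"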
  2587. /**
  2588. * Estimate the number of reducers needed for this job, based on job input,
  2589. * and configuration parameters.
  2590. *
  2591. * The output of this method should only be used if the output of this
  2592. * MapRedTask is not being used to populate a bucketed table and the user
  2593. * has not specified the number of reducers to use.
  2594. *
  2595. * @return the number of reducers.
  2596. */
  2597. public static int estimateNumberOfReducers(HiveConf conf, ContentSummary inputSummary,
  2598. MapWork work, boolean finalMapRed) throws IOException {
  2599. long bytesPerReducer = conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
  2600. int maxReducers = conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
  2601. double samplePercentage = getHighestSamplePercentage(work);
  2602. long totalInputFileSize = getTotalInputFileSize(inputSummary, work, samplePercentage);
  2603. // if all inputs are sampled, we should shrink the size of reducers accordingly.
  2604. if (totalInputFileSize != inputSummary.getLength()) {
  2605. LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
  2606. + maxReducers + " estimated totalInputFileSize=" + totalInputFileSize);
  2607. } else {
  2608. LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
  2609. + maxReducers + " totalInputFileSize=" + totalInputFileSize);
  2610. }
  2611. // If this map reduce job writes final data to a table and bucketing is being inferred,
  2612. // and the user has configured Hive to do this, make sure the number of reducers is a
  2613. // power of two
  2614. boolean powersOfTwo = conf.getBoolVar(HiveConf.ConfVars.HIVE_INFER_BUCKET_SORT_NUM_BUCKETS_POWER_TWO) &&
  2615. finalMapRed && !work.getBucketedColsByDirectory().isEmpty();
  2616. return estimateReducers(totalInputFileSize, bytesPerReducer, maxReducers, powersOfTwo);
  2617. }
  2618. public static int estimateReducers(long totalInputFileSize, long bytesPerReducer,
  2619. int maxReducers, boolean powersOfTwo) {
  2620. int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer);
  2621. reducers = Math.max(1, reducers);
  2622. reducers = Math.min(maxReducers, reducers);
  2623. int reducersLog = (int)(Math.log(reducers) / Math.log(2)) + 1;
  2624. int reducersPowerTwo = (int)Math.pow(2, reducersLog);
  2625. if (powersOfTwo) {
  2626. // If the original number of reducers was a power of two, use that
  2627. if (reducersPowerTwo / 2 == reducers) {
  2628. // nothing to do
  2629. } else if (reducersPowerTwo > maxReducers) {
  2630. // If the next power of two greater than the original number of reducers is greater
  2631. // than the max number of reducers, use the preceding power of two, which is strictly
  2632. // less than the original number of reducers and hence the max
  2633. reducers = reducersPowerTwo / 2;
  2634. } else {
  2635. // Otherwise use the smallest power of two greater than the original number of reducers
  2636. reducers = reducersPowerTwo;
  2637. }
  2638. }
  2639. return reducers;
  2640. }
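// Worked example (made-up sizes): 10 GB of input with bytesPerReducer = 1 GB yields
// 10 reducers. With powersOfTwo enabled and maxReducers = 999 this rounds up to 16;
// if maxReducers were 12, it would round down to 8 instead.
//
//   Utilities.estimateReducers(10000000000L, 1000000000L, 999, true); // -> 16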
  2641. /**
  2642. * Computes the total input file size. If block sampling was used it will scale this
  2643. * value by the highest sample percentage (as an estimate for input).
  2644. *
  2645. * @param inputSummary
  2646. * @param work
  2647. * @param highestSamplePercentage
  2648. * @return estimated total input size for job
  2649. */
  2650. public static long getTotalInputFileSize (ContentSummary inputSummary, MapWork work,
  2651. double highestSamplePercentage) {
  2652. long totalInputFileSize = inputSummary.getLength();
  2653. if (work.getNameToSplitSample() == null || work.getNameToSplitSample().isEmpty()) {
  2654. // If percentage block sampling wasn't used, we don't need to do any estimation
  2655. return totalInputFileSize;
  2656. }
  2657. if (highestSamplePercentage >= 0) {
  2658. totalInputFileSize = Math.min((long) (totalInputFileSize * highestSamplePercentage / 100D)
  2659. , totalInputFileSize);
  2660. }
  2661. return totalInputFileSize;
  2662. }
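// Worked example (made-up numbers): with inputSummary.getLength() == 1,000,000 bytes
// and a highest block-sample percentage of 10, the estimated total input size used for
// reducer estimation becomes 100,000 bytes.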
  2663. /**
  2664. * Computes the total number of input files. If block sampling was used it will scale this
  2665. * value by the highest sample percentage (as an estimate for # input files).
  2666. *
  2667. * @param inputSummary
  2668. * @param work
  2669. * @param highestSamplePercentage
2670. * @return estimated number of input files for the job
  2671. */
  2672. public static long getTotalInputNumFiles (ContentSummary inputSummary, MapWork work,
  2673. double highestSamplePercentage) {
  2674. long totalInputNumFiles = inputSummary.getFileCount();
  2675. if (work.getNameToSplitSample() == null || work.getNameToSplitSample().isEmpty()) {
  2676. // If percentage block sampling wasn't used, we don't need to do any estimation
  2677. return totalInputNumFiles;
  2678. }
  2679. if (highestSamplePercentage >= 0) {
  2680. totalInputNumFiles = Math.min((long) (totalInputNumFiles * highestSamplePercentage / 100D)
  2681. , totalInputNumFiles);
  2682. }
  2683. return totalInputNumFiles;
  2684. }
  2685. /**
2686. * Returns the highest sample percentage of any alias in the given MapWork, or -1 if any alias is not sampled.
  2687. */
  2688. public static double getHighestSamplePercentage (MapWork work) {
  2689. double highestSamplePercentage = 0;
  2690. for (String alias : work.getAliasToWork().keySet()) {
  2691. if (work.getNameToSplitSample().containsKey(alias)) {
  2692. Double rate = work.getNameToSplitSample().get(alias).getPercent();
  2693. if (rate != null && rate > highestSamplePercentage) {
  2694. highestSamplePercentage = rate;
  2695. }
  2696. } else {
  2697. highestSamplePercentage = -1;
  2698. break;
  2699. }
  2700. }
  2701. return highestSamplePercentage;
  2702. }
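// --- Editor's illustrative example (not part of the original source) ---
// A minimal sketch showing how the three sampling helpers above are combined to
// derive scaled input statistics for a block-sampled query; the ContentSummary and
// MapWork are assumed to be supplied by the caller.
private static void exampleScaledInputStats(ContentSummary inputSummary, MapWork work) {
  double samplePercentage = getHighestSamplePercentage(work); // -1 if any alias is unsampled
  long scaledSize = getTotalInputFileSize(inputSummary, work, samplePercentage);
  long scaledFiles = getTotalInputNumFiles(inputSummary, work, samplePercentage);
  LOG.info("Estimated input after sampling: " + scaledSize + " bytes in "
      + scaledFiles + " files");
}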
  2703. /**
2704. * On Tez we do not create dummy files when getting/setting input paths;
2705. * we let Tez handle empty inputs. We also set the paths in the AM,
2706. * so we do not want to depend on the scratch dir and context.
  2707. */
  2708. public static List<Path> getInputPathsTez(JobConf job, MapWork work) throws Exception {
  2709. List<Path> paths = getInputPaths(job, work, null, null);
  2710. return paths;
  2711. }
  2712. /**
  2713. * Computes a list of all input paths needed to compute the given MapWork. All aliases
  2714. * are considered and a merged list of input paths is returned. If any input path points
  2715. * to an empty table or partition a dummy file in the scratch dir is instead created and
  2716. * added to the list. This is needed to avoid special casing the operator pipeline for
  2717. * these cases.
  2718. *
  2719. * @param job JobConf used to run the job
  2720. * @param work MapWork encapsulating the info about the task
  2721. * @param hiveScratchDir The tmp dir used to create dummy files if needed
  2722. * @param ctx Context object
  2723. * @return List of paths to process for the given MapWork
  2724. * @throws Exception
  2725. */
  2726. public static List<Path> getInputPaths(JobConf job, MapWork work, Path hiveScratchDir, Context ctx)
  2727. throws Exception {
  2728. int sequenceNumber = 0;
  2729. Set<Path> pathsProcessed = new HashSet<Path>();
  2730. List<Path> pathsToAdd = new LinkedList<Path>();
  2731. // AliasToWork contains all the aliases
  2732. for (String alias : work.getAliasToWork().keySet()) {
  2733. LOG.info("Processing alias " + alias);
  2734. // The alias may not have any path
  2735. Path path = null;
  2736. for (String file : new LinkedList<String>(work.getPathToAliases().keySet())) {
  2737. List<String> aliases = work.getPathToAliases().get(file);
  2738. if (aliases.contains(alias)) {
  2739. path = new Path(file);
  2740. // Multiple aliases can point to the same path - it should be
  2741. // processed only once
  2742. if (pathsProcessed.contains(path)) {
  2743. continue;
  2744. }
  2745. pathsProcessed.add(path);
  2746. LOG.info("Adding input file " + path);
  2747. if (!HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")
  2748. && isEmptyPath(job, path, ctx)) {
  2749. path = createDummyFileForEmptyPartition(path, job, work,
  2750. hiveScratchDir, alias, sequenceNumber++);
  2751. }
  2752. pathsToAdd.add(path);
  2753. }
  2754. }
2755. // If the query references non-existent partitions,
2756. // we need to add an empty file; it is not acceptable to change the
2757. // operator tree.
  2758. // Consider the query:
  2759. // select * from (select count(1) from T union all select count(1) from
  2760. // T2) x;
  2761. // If T is empty and T2 contains 100 rows, the user expects: 0, 100 (2
  2762. // rows)
  2763. if (path == null
  2764. && !HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
  2765. path = createDummyFileForEmptyTable(job, work, hiveScratchDir,
  2766. alias, sequenceNumber++);
  2767. pathsToAdd.add(path);
  2768. }
  2769. }
  2770. return pathsToAdd;
  2771. }
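// --- Editor's illustrative example (not part of the original source) ---
// A minimal sketch of the usual MR-side sequence: resolve the input paths for a
// MapWork (creating dummy files for empty inputs) and register them on the JobConf.
// All arguments are assumed to be supplied by the caller.
private static void exampleConfigureInputPaths(JobConf job, MapWork work,
    Path hiveScratchDir, Context ctx) throws Exception {
  List<Path> inputPaths = getInputPaths(job, work, hiveScratchDir, ctx);
  setInputPaths(job, inputPaths);
}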
  2772. @SuppressWarnings({"rawtypes", "unchecked"})
  2773. private static Path createEmptyFile(Path hiveScratchDir,
  2774. Class<? extends HiveOutputFormat> outFileFormat, JobConf job,
  2775. int sequenceNumber, Properties props, boolean dummyRow)
  2776. throws IOException, InstantiationException, IllegalAccessException {
  2777. // create a dummy empty file in a new directory
  2778. String newDir = hiveScratchDir + Path.SEPARATOR + sequenceNumber;
  2779. Path newPath = new Path(newDir);
  2780. FileSystem fs = newPath.getFileSystem(job);
  2781. fs.mkdirs(newPath);
2782. // Qualify the path against the file system. The user-configured path might contain a default port that is
2783. // omitted from the file status. This makes sure that every path that goes into PathToPartitionInfo matches
2784. // the listed file status path.
  2785. newPath = fs.makeQualified(newPath);
  2786. String newFile = newDir + Path.SEPARATOR + "emptyFile";
  2787. Path newFilePath = new Path(newFile);
  2788. RecordWriter recWriter = outFileFormat.newInstance().getHiveRecordWriter(job, newFilePath,
  2789. Text.class, false, props, null);
  2790. if (dummyRow) {
2791. // empty files are omitted by CombineHiveInputFormat.
2792. // for a metadata-only query, this effectively makes the partition columns disappear.
2793. // this could be fixed in other ways, but this seemed to be the easiest (HIVE-2955)
  2794. recWriter.write(new Text("empty")); // written via HiveIgnoreKeyTextOutputFormat
  2795. }
  2796. recWriter.close(false);
  2797. return newPath;
  2798. }
  2799. @SuppressWarnings("rawtypes")
  2800. private static Path createDummyFileForEmptyPartition(Path path, JobConf job, MapWork work,
  2801. Path hiveScratchDir, String alias, int sequenceNumber)
  2802. throws IOException, InstantiationException, IllegalAccessException {
  2803. String strPath = path.toString();
2804. // The input file does not exist; replace it with an empty file
  2805. PartitionDesc partDesc = work.getPathToPartitionInfo().get(strPath);
  2806. boolean nonNative = partDesc.getTableDesc().isNonNative();
  2807. boolean oneRow = partDesc.getInputFileFormatClass() == OneNullRowInputFormat.class;
  2808. Properties props = partDesc.getProperties();
  2809. Class<? extends HiveOutputFormat> outFileFormat = partDesc.getOutputFileFormatClass();
  2810. if (nonNative) {
  2811. // if this isn't a hive table we can't create an empty file for it.
  2812. return path;
  2813. }
  2814. Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job,
  2815. sequenceNumber, props, oneRow);
  2816. LOG.info("Changed input file to " + newPath);
  2817. // update the work
  2818. String strNewPath = newPath.toString();
  2819. LinkedHashMap<String, ArrayList<String>> pathToAliases = work.getPathToAliases();
  2820. pathToAliases.put(strNewPath, pathToAliases.get(strPath));
  2821. pathToAliases.remove(strPath);
  2822. work.setPathToAliases(pathToAliases);
  2823. LinkedHashMap<String, PartitionDesc> pathToPartitionInfo = work.getPathToPartitionInfo();
  2824. pathToPartitionInfo.put(strNewPath, pathToPartitionInfo.get(strPath));
  2825. pathToPartitionInfo.remove(strPath);
  2826. work.setPathToPartitionInfo(pathToPartitionInfo);
  2827. return newPath;
  2828. }
  2829. @SuppressWarnings("rawtypes")
  2830. private static Path createDummyFileForEmptyTable(JobConf job, MapWork work,
  2831. Path hiveScratchDir, String alias, int sequenceNumber)
  2832. throws IOException, InstantiationException, IllegalAccessException {
  2833. TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
  2834. Properties props = tableDesc.getProperties();
  2835. boolean nonNative = tableDesc.isNonNative();
  2836. Class<? extends HiveOutputFormat> outFileFormat = tableDesc.getOutputFileFormatClass();
  2837. if (nonNative) {
  2838. // if this isn't a hive table we can't create an empty file for it.
  2839. return null;
  2840. }
  2841. Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job,
  2842. sequenceNumber, props, false);
  2843. LOG.info("Changed input file to " + newPath.toString());
  2844. // update the work
  2845. LinkedHashMap<String, ArrayList<String>> pathToAliases = work.getPathToAliases();
  2846. ArrayList<String> newList = new ArrayList<String>();
  2847. newList.add(alias);
  2848. pathToAliases.put(newPath.toUri().toString(), newList);
  2849. work.setPathToAliases(pathToAliases);
  2850. LinkedHashMap<String, PartitionDesc> pathToPartitionInfo = work.getPathToPartitionInfo();
  2851. PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
  2852. pathToPartitionInfo.put(newPath.toUri().toString(), pDesc);
  2853. work.setPathToPartitionInfo(pathToPartitionInfo);
  2854. return newPath;
  2855. }
  2856. /**
2857. * setInputPaths adds all the paths in the provided list to the Job conf object
  2858. * as input paths for the job.
  2859. *
2860. * @param job the JobConf to update
2861. * @param pathsToAdd the paths to add as input paths for the job
  2862. */
  2863. public static void setInputPaths(JobConf job, List<Path> pathsToAdd) {
  2864. Path[] addedPaths = FileInputFormat.getInputPaths(job);
  2865. if (addedPaths == null) {
  2866. addedPaths = new Path[0];
  2867. }
  2868. Path[] combined = new Path[addedPaths.length + pathsToAdd.size()];
  2869. System.arraycopy(addedPaths, 0, combined, 0, addedPaths.length);
  2870. int i = 0;
  2871. for(Path p: pathsToAdd) {
  2872. combined[addedPaths.length + (i++)] = p;
  2873. }
  2874. FileInputFormat.setInputPaths(job, combined);
  2875. }
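// --- Editor's illustrative example (not part of the original source) ---
// A small sketch emphasizing that setInputPaths appends to, rather than replaces,
// any input paths already registered on the JobConf; the path below is hypothetical.
private static void exampleAppendInputPath(JobConf job) {
  setInputPaths(job, Arrays.asList(new Path("/tmp/extra/input")));
  // FileInputFormat.getInputPaths(job) now returns the previous paths plus the new one.
}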
  2876. /**
  2877. * Set hive input format, and input format file if necessary.
  2878. */
  2879. public static void setInputAttributes(Configuration conf, MapWork mWork) {
  2880. HiveConf.ConfVars var = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") ?
  2881. HiveConf.ConfVars.HIVETEZINPUTFORMAT : HiveConf.ConfVars.HIVEINPUTFORMAT;
  2882. if (mWork.getInputformat() != null) {
  2883. HiveConf.setVar(conf, var, mWork.getInputformat());
  2884. }
  2885. if (mWork.getIndexIntermediateFile() != null) {
  2886. conf.set(ConfVars.HIVE_INDEX_COMPACT_FILE.varname, mWork.getIndexIntermediateFile());
  2887. conf.set(ConfVars.HIVE_INDEX_BLOCKFILTER_FILE.varname, mWork.getIndexIntermediateFile());
  2888. }
  2889. // Intentionally overwrites anything the user may have put here
  2890. conf.setBoolean("hive.input.format.sorted", mWork.isInputFormatSorted());
  2891. }
  2892. /**
  2893. * Hive uses tmp directories to capture the output of each FileSinkOperator.
2894. * This method creates all necessary tmp directories for FileSinks in the MapWork.
  2895. *
  2896. * @param conf Used to get the right FileSystem
  2897. * @param mWork Used to find FileSinkOperators
  2898. * @throws IOException
  2899. */
  2900. public static void createTmpDirs(Configuration conf, MapWork mWork)
  2901. throws IOException {
  2902. Map<String, ArrayList<String>> pa = mWork.getPathToAliases();
  2903. if (pa != null) {
  2904. List<Operator<? extends OperatorDesc>> ops =
  2905. new ArrayList<Operator<? extends OperatorDesc>>();
  2906. for (List<String> ls : pa.values()) {
  2907. for (String a : ls) {
  2908. ops.add(mWork.getAliasToWork().get(a));
  2909. }
  2910. }
  2911. createTmpDirs(conf, ops);
  2912. }
  2913. }
  2914. /**
  2915. * Hive uses tmp directories to capture the output of each FileSinkOperator.
  2916. * This method creates all necessary tmp directories for FileSinks in the ReduceWork.
  2917. *
  2918. * @param conf Used to get the right FileSystem
  2919. * @param rWork Used to find FileSinkOperators
  2920. * @throws IOException
  2921. */
  2922. @SuppressWarnings("unchecked")
  2923. public static void createTmpDirs(Configuration conf, ReduceWork rWork)
  2924. throws IOException {
  2925. if (rWork == null) {
  2926. return;
  2927. }
  2928. List<Operator<? extends OperatorDesc>> ops
  2929. = new LinkedList<Operator<? extends OperatorDesc>>();
  2930. ops.add(rWork.getReducer());
  2931. createTmpDirs(conf, ops);
  2932. }
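// --- Editor's illustrative example (not part of the original source) ---
// A minimal sketch of creating the FileSink tmp directories for both sides of a
// MapReduce job before it is launched; a null ReduceWork is handled as a no-op.
private static void exampleCreateTmpDirs(Configuration conf, MapWork mWork, ReduceWork rWork)
    throws IOException {
  createTmpDirs(conf, mWork);
  createTmpDirs(conf, rWork);
}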
  2933. private static void createTmpDirs(Configuration conf,
  2934. List<Operator<? extends OperatorDesc>> ops) throws IOException {
  2935. FsPermission fsPermission = new FsPermission((short)00777);
  2936. while (!ops.isEmpty()) {
  2937. Operator<? extends OperatorDesc> op = ops.remove(0);
  2938. if (op instanceof FileSinkOperator) {
  2939. FileSinkDesc fdesc = ((FileSinkOperator) op).getConf();
  2940. Path tempDir = fdesc.getDirName();
  2941. if (tempDir != null) {
  2942. Path tempPath = Utilities.toTempPath(tempDir);
  2943. createDirsWithPermission(conf, tempPath, fsPermission);
  2944. }
  2945. }
  2946. if (op.getChildOperators() != null) {
  2947. ops.addAll(op.getChildOperators());
  2948. }
  2949. }
  2950. }
  2951. /**
  2952. * Returns true if a plan is both configured for vectorized execution
  2953. * and vectorization is allowed. The plan may be configured for vectorization
2954. * but vectorization disallowed, e.g. for FetchOperator execution.
  2955. */
  2956. public static boolean isVectorMode(Configuration conf) {
  2957. if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) &&
  2958. Utilities.getPlanPath(conf) != null && Utilities
  2959. .getMapRedWork(conf).getMapWork().getVectorMode()) {
  2960. return true;
  2961. }
  2962. return false;
  2963. }
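// --- Editor's illustrative example (not part of the original source) ---
// A small sketch of how a caller might branch on the vectorization check above when
// choosing between row-mode and vectorized operator pipelines.
private static String exampleDescribeExecMode(Configuration conf) {
  return isVectorMode(conf) ? "vectorized" : "row-mode";
}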
  2964. public static void clearWorkMapForConf(Configuration conf) {
  2965. // Remove cached query plans for the current query only
  2966. Path mapPath = getPlanPath(conf, MAP_PLAN_NAME);
  2967. Path reducePath = getPlanPath(conf, REDUCE_PLAN_NAME);
  2968. if (mapPath != null) {
  2969. gWorkMap.remove(mapPath);
  2970. }
  2971. if (reducePath != null) {
  2972. gWorkMap.remove(reducePath);
  2973. }
  2974. }
  2975. public static void clearWorkMap() {
  2976. gWorkMap.clear();
  2977. }
  2978. /**
2979. * Create a temp dir in the specified baseDir.
2980. * This can go away once Hive supports only JDK 7
2981. * and can use Files.createTempDirectory;
2982. * Guava's Files.createTempDir() does not take a base dir.
  2983. * @param baseDir - directory under which new temp dir will be created
  2984. * @return File object for new temp dir
  2985. */
  2986. public static File createTempDir(String baseDir){
  2987. //try creating the temp dir MAX_ATTEMPTS times
2988. final int MAX_ATTEMPTS = 30;
2989. for (int i = 0; i < MAX_ATTEMPTS; i++) {
  2990. //pick a random file name
  2991. String tempDirName = "tmp_" + ((int)(100000 * Math.random()));
  2992. //return if dir could successfully be created with that file name
  2993. File tempDir = new File(baseDir, tempDirName);
  2994. if(tempDir.mkdir()){
  2995. return tempDir;
  2996. }
  2997. }
2998. throw new IllegalStateException("Failed to create a temp dir under "
2999. + baseDir + ". Giving up after " + MAX_ATTEMPTS + " attempts");
  3000. }
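// --- Editor's illustrative example (not part of the original source) ---
// A minimal sketch of createTempDir usage; the base directory is hypothetical and
// the caller is responsible for cleaning up the returned directory.
private static File exampleCreateLocalTempDir() {
  return createTempDir("/tmp/hive-local-scratch");
}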
  3001. /**
  3002. * Skip header lines in the table file when reading the record.
  3003. *
  3004. * @param currRecReader
  3005. * Record reader.
  3006. *
  3007. * @param headerCount
  3008. * Header line number of the table files.
  3009. *
  3010. * @param key
  3011. * Key of current reading record.
  3012. *
  3013. * @param value
  3014. * Value of current reading record.
  3015. *
3016. * @return true if the file contains at least {@code headerCount} records, i.e. zero or
3017. *         more records remain after skipping the headers; false otherwise.
  3018. */
  3019. public static boolean skipHeader(RecordReader<WritableComparable, Writable> currRecReader,
  3020. int headerCount, WritableComparable key, Writable value) throws IOException {
  3021. while (headerCount > 0) {
  3022. if (!currRecReader.next(key, value))
  3023. return false;
  3024. headerCount--;
  3025. }
  3026. return true;
  3027. }
  3028. /**
  3029. * Get header line count for a table.
  3030. *
  3031. * @param table
  3032. * Table description for target table.
  3033. *
  3034. */
  3035. public static int getHeaderCount(TableDesc table) throws IOException {
  3036. int headerCount;
  3037. try {
  3038. headerCount = Integer.parseInt(table.getProperties().getProperty(serdeConstants.HEADER_COUNT, "0"));
  3039. } catch (NumberFormatException nfe) {
  3040. throw new IOException(nfe);
  3041. }
  3042. return headerCount;
  3043. }
  3044. /**
  3045. * Get footer line count for a table.
  3046. *
  3047. * @param table
  3048. * Table description for target table.
  3049. *
  3050. * @param job
  3051. * Job configuration for current job.
  3052. */
  3053. public static int getFooterCount(TableDesc table, JobConf job) throws IOException {
  3054. int footerCount;
  3055. try {
  3056. footerCount = Integer.parseInt(table.getProperties().getProperty(serdeConstants.FOOTER_COUNT, "0"));
  3057. if (footerCount > HiveConf.getIntVar(job, HiveConf.ConfVars.HIVE_FILE_MAX_FOOTER)) {
  3058. throw new IOException("footer number exceeds the limit defined in hive.file.max.footer");
  3059. }
  3060. } catch (NumberFormatException nfe) {
  3061. // Footer line number must be set as an integer.
  3062. throw new IOException(nfe);
  3063. }
  3064. return footerCount;
  3065. }
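// --- Editor's illustrative example (not part of the original source) ---
// A minimal sketch combining the header/footer helpers above while reading a table
// file; the record reader, key and value are assumed to come from the input format,
// and the footer rows are assumed to be dropped elsewhere by the reader logic.
private static void exampleSkipHeaderRows(TableDesc table, JobConf job,
    RecordReader<WritableComparable, Writable> reader,
    WritableComparable key, Writable value) throws IOException {
  int headerCount = getHeaderCount(table);
  int footerCount = getFooterCount(table, job); // validated against hive.file.max.footer
  LOG.debug("Skipping " + headerCount + " header rows; expecting " + footerCount + " footer rows");
  if (!skipHeader(reader, headerCount, key, value)) {
    LOG.info("File contains fewer rows than the configured header count");
  }
}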
  3066. /**
  3067. * @param conf the configuration used to derive the filesystem to create the path
  3068. * @param mkdir the path to be created
3069. * @param fsPermission ignored if this is a HiveServer2 session with doAs enabled
  3070. * @return true if successfully created the directory else false
  3071. * @throws IOException if hdfs experiences any error conditions
  3072. */
  3073. public static boolean createDirsWithPermission(Configuration conf, Path mkdir,
  3074. FsPermission fsPermission) throws IOException {
  3075. boolean recursive = false;
  3076. if (SessionState.get() != null) {
  3077. recursive = SessionState.get().isHiveServerQuery() &&
  3078. conf.getBoolean(HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS.varname,
  3079. HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS.defaultBoolVal);
3080. // we reset the permission for HiveServer2 sessions with doAs enabled because
3081. // the shared /tmp/hive-hive location is currently used as the scratch directory.
  3082. // However, with doAs enabled, the first user to create this directory would
  3083. // own the directory and subsequent users cannot access the scratch directory.
  3084. // The right fix is to have scratch dir per user.
  3085. fsPermission = new FsPermission((short)00777);
  3086. }
  3087. // if we made it so far without exception we are good!
  3088. return createDirsWithPermission(conf, mkdir, fsPermission, recursive);
  3089. }
  3090. private static void resetConfAndCloseFS (Configuration conf, boolean unsetUmask,
  3091. String origUmask, FileSystem fs) throws IOException {
  3092. if (unsetUmask) {
  3093. if (origUmask != null) {
  3094. conf.set("fs.permissions.umask-mode", origUmask);
  3095. } else {
  3096. conf.unset("fs.permissions.umask-mode");
  3097. }
  3098. }
  3099. fs.close();
  3100. }
  3101. public static boolean createDirsWithPermission(Configuration conf, Path mkdirPath,
  3102. FsPermission fsPermission, boolean recursive) throws IOException {
  3103. String origUmask = null;
  3104. LOG.debug("Create dirs " + mkdirPath + " with permission " + fsPermission + " recursive " +
  3105. recursive);
  3106. if (recursive) {
  3107. origUmask = conf.get("fs.permissions.umask-mode");
3108. // this umask is required because by default the hdfs umask is 022, resulting in
3109. // all parents getting fsPermission & ~022 instead of fsPermission
  3110. conf.set("fs.permissions.umask-mode", "000");
  3111. }
  3112. FileSystem fs = ShimLoader.getHadoopShims().getNonCachedFileSystem(mkdirPath.toUri(), conf);
  3113. boolean retval = false;
  3114. try {
  3115. retval = fs.mkdirs(mkdirPath, fsPermission);
  3116. resetConfAndCloseFS(conf, recursive, origUmask, fs);
  3117. } catch (IOException ioe) {
  3118. try {
  3119. resetConfAndCloseFS(conf, recursive, origUmask, fs);
  3120. }
  3121. catch (IOException e) {
  3122. // do nothing - double failure
  3123. }
  3124. }
  3125. return retval;
  3126. }
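// --- Editor's illustrative example (not part of the original source) ---
// A minimal sketch of creating a hypothetical scratch directory with open permissions;
// passing recursive=true also forces a 000 umask so that the intermediate parent
// directories receive the same permission as the leaf.
private static boolean exampleCreateScratchDir(Configuration conf) throws IOException {
  Path scratch = new Path("/tmp/hive-example-scratch"); // hypothetical location
  return createDirsWithPermission(conf, scratch, new FsPermission((short) 00777), true);
}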
  3127. /**
  3128. * Convert path to qualified path.
  3129. *
  3130. * @param conf
  3131. * Hive configuration.
  3132. * @param path
  3133. * Path to convert.
  3134. * @return Qualified path
  3135. */
  3136. public static String getQualifiedPath(HiveConf conf, Path path) throws HiveException {
  3137. FileSystem fs;
  3138. if (path == null) {
  3139. return null;
  3140. }
  3141. try {
  3142. fs = path.getFileSystem(conf);
  3143. return fs.makeQualified(path).toString();
  3144. }
  3145. catch (IOException e) {
  3146. throw new HiveException(e);
  3147. }
  3148. }
  3149. /**
3150. * Checks whether the current Hive script is running against the default NameNode,
3151. * i.e. the HADOOPFS setting has not been overridden.
3152. * @return true if the default NameNode is in use, false otherwise
  3153. */
  3154. public static boolean isDefaultNameNode(HiveConf conf) {
  3155. return !conf.getChangedProperties().containsKey(HiveConf.ConfVars.HADOOPFS.varname);
  3156. }
  3157. }