
/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java

https://github.com/mkgobaco/hive
Java | 3525 lines | 2393 code | 368 blank | 764 comment | 448 complexity | 606b47d493fe363a4cdb507f15147d83 MD5
Possible License(s): Apache-2.0


  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.exec;
  19. import java.beans.DefaultPersistenceDelegate;
  20. import java.beans.Encoder;
  21. import java.beans.ExceptionListener;
  22. import java.beans.Expression;
  23. import java.beans.PersistenceDelegate;
  24. import java.beans.Statement;
  25. import java.beans.XMLDecoder;
  26. import java.beans.XMLEncoder;
  27. import java.io.BufferedReader;
  28. import java.io.ByteArrayInputStream;
  29. import java.io.ByteArrayOutputStream;
  30. import java.io.DataInput;
  31. import java.io.EOFException;
  32. import java.io.File;
  33. import java.io.FileInputStream;
  34. import java.io.FileNotFoundException;
  35. import java.io.IOException;
  36. import java.io.InputStream;
  37. import java.io.InputStreamReader;
  38. import java.io.OutputStream;
  39. import java.io.PrintStream;
  40. import java.io.Serializable;
  41. import java.io.UnsupportedEncodingException;
  42. import java.net.URI;
  43. import java.net.URL;
  44. import java.net.URLClassLoader;
  45. import java.security.MessageDigest;
  46. import java.security.NoSuchAlgorithmException;
  47. import java.sql.Connection;
  48. import java.sql.DriverManager;
  49. import java.sql.PreparedStatement;
  50. import java.sql.SQLException;
  51. import java.sql.SQLTransientException;
  52. import java.sql.Timestamp;
  53. import java.text.SimpleDateFormat;
  54. import java.util.ArrayList;
  55. import java.util.Arrays;
  56. import java.util.Calendar;
  57. import java.util.Collection;
  58. import java.util.Collections;
  59. import java.util.Date;
  60. import java.util.HashMap;
  61. import java.util.HashSet;
  62. import java.util.Iterator;
  63. import java.util.LinkedHashMap;
  64. import java.util.LinkedList;
  65. import java.util.List;
  66. import java.util.Map;
  67. import java.util.Properties;
  68. import java.util.Random;
  69. import java.util.Set;
  70. import java.util.UUID;
  71. import java.util.concurrent.ConcurrentHashMap;
  72. import java.util.concurrent.ExecutionException;
  73. import java.util.concurrent.Future;
  74. import java.util.concurrent.LinkedBlockingQueue;
  75. import java.util.concurrent.ThreadPoolExecutor;
  76. import java.util.concurrent.TimeUnit;
  77. import java.util.regex.Matcher;
  78. import java.util.regex.Pattern;
  79. import java.util.zip.Deflater;
  80. import java.util.zip.DeflaterOutputStream;
  81. import java.util.zip.InflaterInputStream;
  82. import org.antlr.runtime.CommonToken;
  83. import org.apache.commons.codec.binary.Base64;
  84. import org.apache.commons.lang.StringUtils;
  85. import org.apache.commons.lang.WordUtils;
  86. import org.apache.commons.logging.Log;
  87. import org.apache.commons.logging.LogFactory;
  88. import org.apache.hadoop.conf.Configuration;
  89. import org.apache.hadoop.filecache.DistributedCache;
  90. import org.apache.hadoop.fs.ContentSummary;
  91. import org.apache.hadoop.fs.FileStatus;
  92. import org.apache.hadoop.fs.FileSystem;
  93. import org.apache.hadoop.fs.Path;
  94. import org.apache.hadoop.fs.PathFilter;
  95. import org.apache.hadoop.fs.permission.FsPermission;
  96. import org.apache.hadoop.hive.common.HiveInterruptCallback;
  97. import org.apache.hadoop.hive.common.HiveInterruptUtils;
  98. import org.apache.hadoop.hive.common.HiveStatsUtils;
  99. import org.apache.hadoop.hive.conf.HiveConf;
  100. import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
  101. import org.apache.hadoop.hive.metastore.Warehouse;
  102. import org.apache.hadoop.hive.metastore.api.FieldSchema;
  103. import org.apache.hadoop.hive.metastore.api.Order;
  104. import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
  105. import org.apache.hadoop.hive.ql.Context;
  106. import org.apache.hadoop.hive.ql.ErrorMsg;
  107. import org.apache.hadoop.hive.ql.QueryPlan;
  108. import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
  109. import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
  110. import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
  111. import org.apache.hadoop.hive.ql.exec.mr.ExecReducer;
  112. import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
  113. import org.apache.hadoop.hive.ql.exec.tez.TezTask;
  114. import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
  115. import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
  116. import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
  117. import org.apache.hadoop.hive.ql.io.HiveInputFormat;
  118. import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
  119. import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
  120. import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
  121. import org.apache.hadoop.hive.ql.io.RCFile;
  122. import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
  123. import org.apache.hadoop.hive.ql.io.rcfile.merge.MergeWork;
  124. import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileMergeMapper;
  125. import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanMapper;
  126. import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork;
  127. import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateMapper;
  128. import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateWork;
  129. import org.apache.hadoop.hive.ql.log.PerfLogger;
  130. import org.apache.hadoop.hive.ql.metadata.HiveException;
  131. import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
  132. import org.apache.hadoop.hive.ql.metadata.HiveUtils;
  133. import org.apache.hadoop.hive.ql.metadata.InputEstimator;
  134. import org.apache.hadoop.hive.ql.metadata.Partition;
  135. import org.apache.hadoop.hive.ql.metadata.Table;
  136. import org.apache.hadoop.hive.ql.parse.SemanticException;
  137. import org.apache.hadoop.hive.ql.plan.BaseWork;
  138. import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
  139. import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
  140. import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
  141. import org.apache.hadoop.hive.ql.plan.GroupByDesc;
  142. import org.apache.hadoop.hive.ql.plan.MapWork;
  143. import org.apache.hadoop.hive.ql.plan.MapredWork;
  144. import org.apache.hadoop.hive.ql.plan.OperatorDesc;
  145. import org.apache.hadoop.hive.ql.plan.PartitionDesc;
  146. import org.apache.hadoop.hive.ql.plan.PlanUtils;
  147. import org.apache.hadoop.hive.ql.plan.PlanUtils.ExpressionTypes;
  148. import org.apache.hadoop.hive.ql.plan.ReduceWork;
  149. import org.apache.hadoop.hive.ql.plan.TableDesc;
  150. import org.apache.hadoop.hive.ql.plan.api.Adjacency;
  151. import org.apache.hadoop.hive.ql.plan.api.Graph;
  152. import org.apache.hadoop.hive.ql.session.SessionState;
  153. import org.apache.hadoop.hive.ql.stats.StatsFactory;
  154. import org.apache.hadoop.hive.ql.stats.StatsPublisher;
  155. import org.apache.hadoop.hive.serde.serdeConstants;
  156. import org.apache.hadoop.hive.serde2.SerDeException;
  157. import org.apache.hadoop.hive.serde2.SerDeUtils;
  158. import org.apache.hadoop.hive.serde2.Serializer;
  159. import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
  160. import org.apache.hadoop.hive.shims.ShimLoader;
  161. import org.apache.hadoop.io.IOUtils;
  162. import org.apache.hadoop.io.SequenceFile;
  163. import org.apache.hadoop.io.SequenceFile.CompressionType;
  164. import org.apache.hadoop.io.Text;
  165. import org.apache.hadoop.io.Writable;
  166. import org.apache.hadoop.io.WritableComparable;
  167. import org.apache.hadoop.io.compress.CompressionCodec;
  168. import org.apache.hadoop.io.compress.DefaultCodec;
  169. import org.apache.hadoop.mapred.FileInputFormat;
  170. import org.apache.hadoop.mapred.FileOutputFormat;
  171. import org.apache.hadoop.mapred.InputFormat;
  172. import org.apache.hadoop.mapred.JobConf;
  173. import org.apache.hadoop.mapred.RecordReader;
  174. import org.apache.hadoop.mapred.Reporter;
  175. import org.apache.hadoop.mapred.SequenceFileInputFormat;
  176. import org.apache.hadoop.mapred.SequenceFileOutputFormat;
  177. import org.apache.hadoop.util.Progressable;
  178. import org.apache.hadoop.util.ReflectionUtils;
  179. import org.apache.hadoop.util.Shell;
  180. import com.esotericsoftware.kryo.Kryo;
  181. import com.esotericsoftware.kryo.io.Input;
  182. import com.esotericsoftware.kryo.io.Output;
  183. import com.esotericsoftware.kryo.serializers.FieldSerializer;
  184. import com.esotericsoftware.shaded.org.objenesis.strategy.StdInstantiatorStrategy;
  185. /**
  186. * Utilities.
  187. *
  188. */
  189. @SuppressWarnings("nls")
  190. public final class Utilities {
  191. /**
  192. * The object in the reducer are composed of these top level fields.
  193. */
  194. public static String HADOOP_LOCAL_FS = "file:///";
  195. public static String MAP_PLAN_NAME = "map.xml";
  196. public static String REDUCE_PLAN_NAME = "reduce.xml";
  197. public static final String MAPRED_MAPPER_CLASS = "mapred.mapper.class";
  198. public static final String MAPRED_REDUCER_CLASS = "mapred.reducer.class";
  199. /**
  200. * ReduceField:
  201. * KEY: record key
  202. * VALUE: record value
  203. */
  204. public static enum ReduceField {
  205. KEY, VALUE
  206. };
  207. public static List<String> reduceFieldNameList;
  208. static {
  209. reduceFieldNameList = new ArrayList<String>();
  210. for (ReduceField r : ReduceField.values()) {
  211. reduceFieldNameList.add(r.toString());
  212. }
  213. }
  214. public static String removeValueTag(String column) {
  215. if (column.startsWith(ReduceField.VALUE + ".")) {
  216. return column.substring(6);
  217. }
  218. return column;
  219. }
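// Illustrative example (hypothetical column names, not from the source): reducer-side value
// columns arrive prefixed with the ReduceField name, and removeValueTag strips that prefix:
//   Utilities.removeValueTag("VALUE._col0");          // -> "_col0"
//   Utilities.removeValueTag("KEY.reducesinkkey0");   // -> unchanged; only the VALUE. prefix is stripped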
  220. private Utilities() {
  221. // prevent instantiation
  222. }
  223. private static Map<Path, BaseWork> gWorkMap = Collections
  224. .synchronizedMap(new HashMap<Path, BaseWork>());
  225. private static final String CLASS_NAME = Utilities.class.getName();
  226. private static final Log LOG = LogFactory.getLog(CLASS_NAME);
  227. public static void clearWork(Configuration conf) {
  228. Path mapPath = getPlanPath(conf, MAP_PLAN_NAME);
  229. Path reducePath = getPlanPath(conf, REDUCE_PLAN_NAME);
  230. // if the plan path hasn't been initialized just return, nothing to clean.
  231. if (mapPath == null && reducePath == null) {
  232. return;
  233. }
  234. try {
  235. FileSystem fs = mapPath.getFileSystem(conf);
  236. if (fs.exists(mapPath)) {
  237. fs.delete(mapPath, true);
  238. }
  239. if (fs.exists(reducePath)) {
  240. fs.delete(reducePath, true);
  241. }
  242. } catch (Exception e) {
  243. LOG.warn("Failed to clean-up tmp directories.", e);
  244. } finally {
  245. // where a single process works with multiple plans - we must clear
  246. // the cache before working with the next plan.
  247. clearWorkMapForConf(conf);
  248. }
  249. }
  250. public static MapredWork getMapRedWork(Configuration conf) {
  251. MapredWork w = new MapredWork();
  252. w.setMapWork(getMapWork(conf));
  253. w.setReduceWork(getReduceWork(conf));
  254. return w;
  255. }
  256. public static void setMapWork(Configuration conf, MapWork work) {
  257. setBaseWork(conf, MAP_PLAN_NAME, work);
  258. }
  259. public static MapWork getMapWork(Configuration conf) {
  260. return (MapWork) getBaseWork(conf, MAP_PLAN_NAME);
  261. }
  262. public static void setReduceWork(Configuration conf, ReduceWork work) {
  263. setBaseWork(conf, REDUCE_PLAN_NAME, work);
  264. }
  265. public static ReduceWork getReduceWork(Configuration conf) {
  266. return (ReduceWork) getBaseWork(conf, REDUCE_PLAN_NAME);
  267. }
  268. /**
  269. * Pushes work into the global work map
  270. */
  271. public static void setBaseWork(Configuration conf, String name, BaseWork work) {
  272. Path path = getPlanPath(conf, name);
  273. gWorkMap.put(path, work);
  274. }
  275. /**
  276. * Returns the Map or Reduce plan
  277. * Side effect: the BaseWork returned is also placed in the gWorkMap
  278. * @param conf
  279. * @param name
  280. * @return the BaseWork (MapWork or ReduceWork) for the supplied plan name; returns null if no plan could be found
  281. * @throws RuntimeException if the configuration is invalid or the plan cannot be loaded
  282. */
  283. private static BaseWork getBaseWork(Configuration conf, String name) {
  284. BaseWork gWork = null;
  285. Path path = null;
  286. InputStream in = null;
  287. try {
  288. path = getPlanPath(conf, name);
  289. assert path != null;
  290. if (!gWorkMap.containsKey(path)) {
  291. Path localPath;
  292. if (ShimLoader.getHadoopShims().isLocalMode(conf)) {
  293. localPath = path;
  294. } else {
  295. localPath = new Path(name);
  296. }
  297. if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  298. LOG.debug("Loading plan from string: "+path.toUri().getPath());
  299. String planString = conf.get(path.toUri().getPath());
  300. if (planString == null) {
  301. LOG.info("Could not find plan string in conf");
  302. return null;
  303. }
  304. byte[] planBytes = Base64.decodeBase64(planString);
  305. in = new ByteArrayInputStream(planBytes);
  306. in = new InflaterInputStream(in);
  307. } else {
  308. in = new FileInputStream(localPath.toUri().getPath());
  309. }
  310. if(MAP_PLAN_NAME.equals(name)){
  311. if (ExecMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))){
  312. gWork = deserializePlan(in, MapWork.class, conf);
  313. } else if(RCFileMergeMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  314. gWork = deserializePlan(in, MergeWork.class, conf);
  315. } else if(ColumnTruncateMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  316. gWork = deserializePlan(in, ColumnTruncateWork.class, conf);
  317. } else if(PartialScanMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
  318. gWork = deserializePlan(in, PartialScanWork.class,conf);
  319. } else {
  320. throw new RuntimeException("unable to determine work from configuration. "
  321. + MAPRED_MAPPER_CLASS + " was " + conf.get(MAPRED_MAPPER_CLASS));
  322. }
  323. } else if (REDUCE_PLAN_NAME.equals(name)) {
  324. if(ExecReducer.class.getName().equals(conf.get(MAPRED_REDUCER_CLASS))) {
  325. gWork = deserializePlan(in, ReduceWork.class, conf);
  326. } else {
  327. throw new RuntimeException("unable to determine work from configuration. "
  328. + MAPRED_REDUCER_CLASS + " was " + conf.get(MAPRED_REDUCER_CLASS));
  329. }
  330. }
  331. gWorkMap.put(path, gWork);
  332. } else {
  333. LOG.debug("Found plan in cache.");
  334. gWork = gWorkMap.get(path);
  335. }
  336. return gWork;
  337. } catch (FileNotFoundException fnf) {
  338. // happens. e.g.: no reduce work.
  339. LOG.info("No plan file found: "+path);
  340. return null;
  341. } catch (Exception e) {
  342. LOG.error("Failed to load plan: "+path, e);
  343. throw new RuntimeException(e);
  344. } finally {
  345. if (in != null) {
  346. try {
  347. in.close();
  348. } catch (IOException cantBlameMeForTrying) { }
  349. }
  350. }
  351. }
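// Illustrative usage sketch (hypothetical task-side caller; assumes `job` is the task's JobConf,
// with ConfVars.PLAN pointing at the plan directory written by setMapWork/setReduceWork and
// mapred.mapper.class set to ExecMapper): this is how a running task pulls its plan, hitting
// the gWorkMap cache on repeated calls:
//   MapWork mapWork = Utilities.getMapWork(job);       // deserializes map.xml on first use
//   ReduceWork redWork = Utilities.getReduceWork(job); // null when the job has no reduce stage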
  352. public static void setWorkflowAdjacencies(Configuration conf, QueryPlan plan) {
  353. try {
  354. Graph stageGraph = plan.getQueryPlan().getStageGraph();
  355. if (stageGraph == null) {
  356. return;
  357. }
  358. List<Adjacency> adjList = stageGraph.getAdjacencyList();
  359. if (adjList == null) {
  360. return;
  361. }
  362. for (Adjacency adj : adjList) {
  363. List<String> children = adj.getChildren();
  364. if (children == null || children.isEmpty()) {
  365. return;
  366. }
  367. conf.setStrings("mapreduce.workflow.adjacency."+adj.getNode(),
  368. children.toArray(new String[children.size()]));
  369. }
  370. } catch (IOException e) {
  371. }
  372. }
  373. public static List<String> getFieldSchemaString(List<FieldSchema> fl) {
  374. if (fl == null) {
  375. return null;
  376. }
  377. ArrayList<String> ret = new ArrayList<String>();
  378. for (FieldSchema f : fl) {
  379. ret.add(f.getName() + " " + f.getType()
  380. + (f.getComment() != null ? (" " + f.getComment()) : ""));
  381. }
  382. return ret;
  383. }
  384. /**
  385. * Java 1.5 workaround. From http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=5015403
  386. */
  387. public static class EnumDelegate extends DefaultPersistenceDelegate {
  388. @Override
  389. protected Expression instantiate(Object oldInstance, Encoder out) {
  390. return new Expression(Enum.class, "valueOf", new Object[] {oldInstance.getClass(),
  391. ((Enum<?>) oldInstance).name()});
  392. }
  393. @Override
  394. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  395. return oldInstance == newInstance;
  396. }
  397. }
  398. public static class MapDelegate extends DefaultPersistenceDelegate {
  399. @Override
  400. protected Expression instantiate(Object oldInstance, Encoder out) {
  401. Map oldMap = (Map) oldInstance;
  402. HashMap newMap = new HashMap(oldMap);
  403. return new Expression(newMap, HashMap.class, "new", new Object[] {});
  404. }
  405. @Override
  406. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  407. return false;
  408. }
  409. @Override
  410. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  411. java.util.Collection oldO = (java.util.Collection) oldInstance;
  412. java.util.Collection newO = (java.util.Collection) newInstance;
  413. if (newO.size() != 0) {
  414. out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
  415. }
  416. for (Iterator i = oldO.iterator(); i.hasNext();) {
  417. out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
  418. }
  419. }
  420. }
  421. public static class SetDelegate extends DefaultPersistenceDelegate {
  422. @Override
  423. protected Expression instantiate(Object oldInstance, Encoder out) {
  424. Set oldSet = (Set) oldInstance;
  425. HashSet newSet = new HashSet(oldSet);
  426. return new Expression(newSet, HashSet.class, "new", new Object[] {});
  427. }
  428. @Override
  429. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  430. return false;
  431. }
  432. @Override
  433. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  434. java.util.Collection oldO = (java.util.Collection) oldInstance;
  435. java.util.Collection newO = (java.util.Collection) newInstance;
  436. if (newO.size() != 0) {
  437. out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
  438. }
  439. for (Iterator i = oldO.iterator(); i.hasNext();) {
  440. out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
  441. }
  442. }
  443. }
  444. public static class ListDelegate extends DefaultPersistenceDelegate {
  445. @Override
  446. protected Expression instantiate(Object oldInstance, Encoder out) {
  447. List oldList = (List) oldInstance;
  448. ArrayList newList = new ArrayList(oldList);
  449. return new Expression(newList, ArrayList.class, "new", new Object[] {});
  450. }
  451. @Override
  452. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  453. return false;
  454. }
  455. @Override
  456. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  457. java.util.Collection oldO = (java.util.Collection) oldInstance;
  458. java.util.Collection newO = (java.util.Collection) newInstance;
  459. if (newO.size() != 0) {
  460. out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
  461. }
  462. for (Iterator i = oldO.iterator(); i.hasNext();) {
  463. out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
  464. }
  465. }
  466. }
  467. /**
  468. * DatePersistenceDelegate. Needed to serialize java.util.Date
  469. * since it is not serialization friendly.
  470. * Also works for java.sql.Date since it derives from java.util.Date.
  471. */
  472. public static class DatePersistenceDelegate extends PersistenceDelegate {
  473. @Override
  474. protected Expression instantiate(Object oldInstance, Encoder out) {
  475. Date dateVal = (Date)oldInstance;
  476. Object[] args = { dateVal.getTime() };
  477. return new Expression(dateVal, dateVal.getClass(), "new", args);
  478. }
  479. @Override
  480. protected boolean mutatesTo(Object oldInstance, Object newInstance) {
  481. if (oldInstance == null || newInstance == null) {
  482. return false;
  483. }
  484. return oldInstance.getClass() == newInstance.getClass();
  485. }
  486. }
  487. /**
  488. * TimestampPersistenceDelegate. Needed to serialize java.sql.Timestamp since
  489. * it is not serialization friendly.
  490. */
  491. public static class TimestampPersistenceDelegate extends DatePersistenceDelegate {
  492. @Override
  493. protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
  494. Timestamp ts = (Timestamp)oldInstance;
  495. Object[] args = { ts.getNanos() };
  496. Statement stmt = new Statement(oldInstance, "setNanos", args);
  497. out.writeStatement(stmt);
  498. }
  499. }
  500. /**
  501. * Need to serialize org.antlr.runtime.CommonToken
  502. */
  503. public static class CommonTokenDelegate extends PersistenceDelegate {
  504. @Override
  505. protected Expression instantiate(Object oldInstance, Encoder out) {
  506. CommonToken ct = (CommonToken)oldInstance;
  507. Object[] args = {ct.getType(), ct.getText()};
  508. return new Expression(ct, ct.getClass(), "new", args);
  509. }
  510. }
  511. public static class PathDelegate extends PersistenceDelegate {
  512. @Override
  513. protected Expression instantiate(Object oldInstance, Encoder out) {
  514. Path p = (Path)oldInstance;
  515. Object[] args = {p.toString()};
  516. return new Expression(p, p.getClass(), "new", args);
  517. }
  518. }
  519. public static void setMapRedWork(Configuration conf, MapredWork w, Path hiveScratchDir) {
  520. setMapWork(conf, w.getMapWork(), hiveScratchDir, true);
  521. if (w.getReduceWork() != null) {
  522. setReduceWork(conf, w.getReduceWork(), hiveScratchDir, true);
  523. }
  524. }
  525. public static Path setMapWork(Configuration conf, MapWork w, Path hiveScratchDir, boolean useCache) {
  526. return setBaseWork(conf, w, hiveScratchDir, MAP_PLAN_NAME, useCache);
  527. }
  528. public static Path setReduceWork(Configuration conf, ReduceWork w, Path hiveScratchDir, boolean useCache) {
  529. return setBaseWork(conf, w, hiveScratchDir, REDUCE_PLAN_NAME, useCache);
  530. }
  531. private static Path setBaseWork(Configuration conf, BaseWork w, Path hiveScratchDir, String name, boolean useCache) {
  532. try {
  533. setPlanPath(conf, hiveScratchDir);
  534. Path planPath = getPlanPath(conf, name);
  535. OutputStream out;
  536. if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
  537. // add it to the conf
  538. ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
  539. out = new DeflaterOutputStream(byteOut, new Deflater(Deflater.BEST_SPEED));
  540. serializePlan(w, out, conf);
  541. LOG.info("Setting plan: "+planPath.toUri().getPath());
  542. conf.set(planPath.toUri().getPath(),
  543. Base64.encodeBase64String(byteOut.toByteArray()));
  544. } else {
  545. // use the default file system of the conf
  546. FileSystem fs = planPath.getFileSystem(conf);
  547. out = fs.create(planPath);
  548. serializePlan(w, out, conf);
  549. // Serialize the plan to the default hdfs instance
  550. // Except for hadoop local mode execution where we should be
  551. // able to get the plan directly from the cache
  552. if (useCache && !ShimLoader.getHadoopShims().isLocalMode(conf)) {
  553. // Set up distributed cache
  554. if (!DistributedCache.getSymlink(conf)) {
  555. DistributedCache.createSymlink(conf);
  556. }
  557. String uriWithLink = planPath.toUri().toString() + "#" + name;
  558. DistributedCache.addCacheFile(new URI(uriWithLink), conf);
  559. // set replication of the plan file to a high number. we use the same
  560. // replication factor as used by the hadoop jobclient for job.xml etc.
  561. short replication = (short) conf.getInt("mapred.submit.replication", 10);
  562. fs.setReplication(planPath, replication);
  563. }
  564. }
  565. // Cache the plan in this process
  566. gWorkMap.put(planPath, w);
  567. return planPath;
  568. } catch (Exception e) {
  569. e.printStackTrace();
  570. throw new RuntimeException(e);
  571. }
  572. }
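// Illustrative usage sketch (hypothetical driver-side caller; assumes a MapredWork `work`, a
// JobConf `job`, and the session scratch directory `scratchDir`): writes map.xml/reduce.xml
// under a fresh UUID-named plan directory and, for non-local jobs, registers them in the
// distributed cache:
//   Utilities.setMapRedWork(job, work, scratchDir);
//   Path planDir = Utilities.getPlanPath(job);   // ConfVars.PLAN now points here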
  573. private static Path getPlanPath(Configuration conf, String name) {
  574. Path planPath = getPlanPath(conf);
  575. if (planPath == null) {
  576. return null;
  577. }
  578. return new Path(planPath, name);
  579. }
  580. private static void setPlanPath(Configuration conf, Path hiveScratchDir) throws IOException {
  581. if (getPlanPath(conf) == null) {
  582. // this is the unique conf ID, which is kept in JobConf as part of the plan file name
  583. String jobID = UUID.randomUUID().toString();
  584. Path planPath = new Path(hiveScratchDir, jobID);
  585. FileSystem fs = planPath.getFileSystem(conf);
  586. fs.mkdirs(planPath);
  587. HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, planPath.toUri().toString());
  588. }
  589. }
  590. public static Path getPlanPath(Configuration conf) {
  591. String plan = HiveConf.getVar(conf, HiveConf.ConfVars.PLAN);
  592. if (plan != null && !plan.isEmpty()) {
  593. return new Path(plan);
  594. }
  595. return null;
  596. }
  597. /**
  598. * Serializes expression via Kryo.
  599. * @param expr Expression.
  600. * @return Bytes.
  601. */
  602. public static byte[] serializeExpressionToKryo(ExprNodeGenericFuncDesc expr) {
  603. return serializeObjectToKryo(expr);
  604. }
  605. /**
  606. * Deserializes expression from Kryo.
  607. * @param bytes Bytes containing the expression.
  608. * @return Expression; null if deserialization succeeded, but the result type is incorrect.
  609. */
  610. public static ExprNodeGenericFuncDesc deserializeExpressionFromKryo(byte[] bytes) {
  611. return deserializeObjectFromKryo(bytes, ExprNodeGenericFuncDesc.class);
  612. }
  613. public static String serializeExpression(ExprNodeGenericFuncDesc expr) {
  614. try {
  615. return new String(Base64.encodeBase64(serializeExpressionToKryo(expr)), "UTF-8");
  616. } catch (UnsupportedEncodingException ex) {
  617. throw new RuntimeException("UTF-8 support required", ex);
  618. }
  619. }
  620. public static ExprNodeGenericFuncDesc deserializeExpression(String s) {
  621. byte[] bytes;
  622. try {
  623. bytes = Base64.decodeBase64(s.getBytes("UTF-8"));
  624. } catch (UnsupportedEncodingException ex) {
  625. throw new RuntimeException("UTF-8 support required", ex);
  626. }
  627. return deserializeExpressionFromKryo(bytes);
  628. }
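// Illustrative round trip (hypothetical caller; assumes `predicate` is an
// ExprNodeGenericFuncDesc, e.g. a filter expression handed to a storage handler): the
// expression travels as a Base64 string wrapping the Kryo bytes:
//   String encoded = Utilities.serializeExpression(predicate);
//   ExprNodeGenericFuncDesc decoded = Utilities.deserializeExpression(encoded);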
  629. private static byte[] serializeObjectToKryo(Serializable object) {
  630. ByteArrayOutputStream baos = new ByteArrayOutputStream();
  631. Output output = new Output(baos);
  632. runtimeSerializationKryo.get().writeObject(output, object);
  633. output.close();
  634. return baos.toByteArray();
  635. }
  636. private static <T extends Serializable> T deserializeObjectFromKryo(byte[] bytes, Class<T> clazz) {
  637. Input inp = new Input(new ByteArrayInputStream(bytes));
  638. T func = runtimeSerializationKryo.get().readObject(inp, clazz);
  639. inp.close();
  640. return func;
  641. }
  642. public static String serializeObject(Serializable expr) {
  643. try {
  644. return new String(Base64.encodeBase64(serializeObjectToKryo(expr)), "UTF-8");
  645. } catch (UnsupportedEncodingException ex) {
  646. throw new RuntimeException("UTF-8 support required", ex);
  647. }
  648. }
  649. public static <T extends Serializable> T deserializeObject(String s, Class<T> clazz) {
  650. try {
  651. return deserializeObjectFromKryo(Base64.decodeBase64(s.getBytes("UTF-8")), clazz);
  652. } catch (UnsupportedEncodingException ex) {
  653. throw new RuntimeException("UTF-8 support required", ex);
  654. }
  655. }
  656. public static class CollectionPersistenceDelegate extends DefaultPersistenceDelegate {
  657. @Override
  658. protected Expression instantiate(Object oldInstance, Encoder out) {
  659. return new Expression(oldInstance, oldInstance.getClass(), "new", null);
  660. }
  661. @Override
  662. protected void initialize(Class type, Object oldInstance, Object newInstance, Encoder out) {
  663. Iterator ite = ((Collection) oldInstance).iterator();
  664. while (ite.hasNext()) {
  665. out.writeStatement(new Statement(oldInstance, "add", new Object[] {ite.next()}));
  666. }
  667. }
  668. }
  669. /**
  670. * Kryo serializer for timestamp.
  671. */
  672. private static class TimestampSerializer extends
  673. com.esotericsoftware.kryo.Serializer<Timestamp> {
  674. @Override
  675. public Timestamp read(Kryo kryo, Input input, Class<Timestamp> clazz) {
  676. Timestamp ts = new Timestamp(input.readLong());
  677. ts.setNanos(input.readInt());
  678. return ts;
  679. }
  680. @Override
  681. public void write(Kryo kryo, Output output, Timestamp ts) {
  682. output.writeLong(ts.getTime());
  683. output.writeInt(ts.getNanos());
  684. }
  685. }
  686. /** Custom Kryo serializer for sql date, otherwise Kryo gets confused between
  687. java.sql.Date and java.util.Date while deserializing
  688. */
  689. private static class SqlDateSerializer extends
  690. com.esotericsoftware.kryo.Serializer<java.sql.Date> {
  691. @Override
  692. public java.sql.Date read(Kryo kryo, Input input, Class<java.sql.Date> clazz) {
  693. return new java.sql.Date(input.readLong());
  694. }
  695. @Override
  696. public void write(Kryo kryo, Output output, java.sql.Date sqlDate) {
  697. output.writeLong(sqlDate.getTime());
  698. }
  699. }
  700. private static class CommonTokenSerializer extends com.esotericsoftware.kryo.Serializer<CommonToken> {
  701. @Override
  702. public CommonToken read(Kryo kryo, Input input, Class<CommonToken> clazz) {
  703. return new CommonToken(input.readInt(), input.readString());
  704. }
  705. @Override
  706. public void write(Kryo kryo, Output output, CommonToken token) {
  707. output.writeInt(token.getType());
  708. output.writeString(token.getText());
  709. }
  710. }
  711. private static class PathSerializer extends com.esotericsoftware.kryo.Serializer<Path> {
  712. @Override
  713. public void write(Kryo kryo, Output output, Path path) {
  714. output.writeString(path.toUri().toString());
  715. }
  716. @Override
  717. public Path read(Kryo kryo, Input input, Class<Path> type) {
  718. return new Path(URI.create(input.readString()));
  719. }
  720. }
  721. public static Set<Operator<?>> cloneOperatorTree(Configuration conf, Set<Operator<?>> roots) {
  722. ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
  723. serializePlan(roots, baos, conf, true);
  724. Set<Operator<?>> result = deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
  725. roots.getClass(), conf, true);
  726. return result;
  727. }
  728. private static void serializePlan(Object plan, OutputStream out, Configuration conf, boolean cloningPlan) {
  729. PerfLogger perfLogger = PerfLogger.getPerfLogger();
  730. perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
  731. String serializationType = conf.get(HiveConf.ConfVars.PLAN_SERIALIZATION.varname, "kryo");
  732. LOG.info("Serializing " + plan.getClass().getSimpleName() + " via " + serializationType);
  733. if("javaXML".equalsIgnoreCase(serializationType)) {
  734. serializeObjectByJavaXML(plan, out);
  735. } else {
  736. if(cloningPlan) {
  737. serializeObjectByKryo(cloningQueryPlanKryo.get(), plan, out);
  738. } else {
  739. serializeObjectByKryo(runtimeSerializationKryo.get(), plan, out);
  740. }
  741. }
  742. perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
  743. }
  744. /**
  745. * Serializes the plan.
  746. * @param plan The plan, such as QueryPlan, MapredWork, etc.
  747. * @param out The stream to write to.
  748. * @param conf to pick which serialization format is desired.
  749. */
  750. public static void serializePlan(Object plan, OutputStream out, Configuration conf) {
  751. serializePlan(plan, out, conf, false);
  752. }
  753. private static <T> T deserializePlan(InputStream in, Class<T> planClass, Configuration conf, boolean cloningPlan) {
  754. PerfLogger perfLogger = PerfLogger.getPerfLogger();
  755. perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  756. T plan;
  757. String serializationType = conf.get(HiveConf.ConfVars.PLAN_SERIALIZATION.varname, "kryo");
  758. LOG.info("Deserializing " + planClass.getSimpleName() + " via " + serializationType);
  759. if("javaXML".equalsIgnoreCase(serializationType)) {
  760. plan = deserializeObjectByJavaXML(in);
  761. } else {
  762. if(cloningPlan) {
  763. plan = deserializeObjectByKryo(cloningQueryPlanKryo.get(), in, planClass);
  764. } else {
  765. plan = deserializeObjectByKryo(runtimeSerializationKryo.get(), in, planClass);
  766. }
  767. }
  768. perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  769. return plan;
  770. }
  771. /**
  772. * Deserializes the plan.
  773. * @param in The stream to read from.
  774. * @param planClass class of plan
  775. * @param conf configuration
  776. * @return The plan, such as QueryPlan, MapredWork, etc.
  777. */
  778. public static <T> T deserializePlan(InputStream in, Class<T> planClass, Configuration conf) {
  779. return deserializePlan(in, planClass, conf, false);
  780. }
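// Illustrative round trip (hypothetical caller; assumes a MapWork `work` and a HiveConf `conf`;
// ConfVars.PLAN_SERIALIZATION selects between the default "kryo" path and "javaXML"):
//   ByteArrayOutputStream bos = new ByteArrayOutputStream();
//   Utilities.serializePlan(work, bos, conf);
//   MapWork copy = Utilities.deserializePlan(
//       new ByteArrayInputStream(bos.toByteArray()), MapWork.class, conf);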
  781. /**
  782. * Clones using the powers of XML. Do not use unless necessary.
  783. * @param plan The plan.
  784. * @return The clone.
  785. */
  786. public static MapredWork clonePlan(MapredWork plan) {
  787. // TODO: need proper clone. Meanwhile, let's at least keep this horror in one place
  788. PerfLogger perfLogger = PerfLogger.getPerfLogger();
  789. perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
  790. ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
  791. Configuration conf = new HiveConf();
  792. serializePlan(plan, baos, conf, true);
  793. MapredWork newPlan = deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
  794. MapredWork.class, conf, true);
  795. perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
  796. return newPlan;
  797. }
  798. /**
  799. * Serialize the object. This helper function mainly makes sure that enums,
  800. * counters, etc are handled properly.
  801. */
  802. private static void serializeObjectByJavaXML(Object plan, OutputStream out) {
  803. XMLEncoder e = new XMLEncoder(out);
  804. e.setExceptionListener(new ExceptionListener() {
  805. @Override
  806. public void exceptionThrown(Exception e) {
  807. LOG.warn(org.apache.hadoop.util.StringUtils.stringifyException(e));
  808. throw new RuntimeException("Cannot serialize object", e);
  809. }
  810. });
  811. // workaround for java 1.5
  812. e.setPersistenceDelegate(ExpressionTypes.class, new EnumDelegate());
  813. e.setPersistenceDelegate(GroupByDesc.Mode.class, new EnumDelegate());
  814. e.setPersistenceDelegate(java.sql.Date.class, new DatePersistenceDelegate());
  815. e.setPersistenceDelegate(Timestamp.class, new TimestampPersistenceDelegate());
  816. e.setPersistenceDelegate(org.datanucleus.store.types.backed.Map.class, new MapDelegate());
  817. e.setPersistenceDelegate(org.datanucleus.store.types.backed.List.class, new ListDelegate());
  818. e.setPersistenceDelegate(CommonToken.class, new CommonTokenDelegate());
  819. e.setPersistenceDelegate(Path.class, new PathDelegate());
  820. e.writeObject(plan);
  821. e.close();
  822. }
  823. /**
  824. * @param plan Usually of type MapredWork, MapredLocalWork etc.
  825. * @param out stream in which serialized plan is written into
  826. */
  827. private static void serializeObjectByKryo(Kryo kryo, Object plan, OutputStream out) {
  828. Output output = new Output(out);
  829. kryo.writeObject(output, plan);
  830. output.close();
  831. }
  832. /**
  833. * De-serialize an object. This helper function mainly makes sure that enums,
  834. * counters, etc are handled properly.
  835. */
  836. @SuppressWarnings("unchecked")
  837. private static <T> T deserializeObjectByJavaXML(InputStream in) {
  838. XMLDecoder d = null;
  839. try {
  840. d = new XMLDecoder(in, null, null);
  841. return (T) d.readObject();
  842. } finally {
  843. if (null != d) {
  844. d.close();
  845. }
  846. }
  847. }
  848. private static <T> T deserializeObjectByKryo(Kryo kryo, InputStream in, Class<T> clazz ) {
  849. Input inp = new Input(in);
  850. T t = kryo.readObject(inp,clazz);
  851. inp.close();
  852. return t;
  853. }
  854. // Kryo is not thread-safe, and creating a new Kryo instance is expensive,
  855. // so we keep a single configured instance per thread.
  856. public static ThreadLocal<Kryo> runtimeSerializationKryo = new ThreadLocal<Kryo>() {
  857. @Override
  858. protected synchronized Kryo initialValue() {
  859. Kryo kryo = new Kryo();
  860. kryo.setClassLoader(Thread.currentThread().getContextClassLoader());
  861. kryo.register(java.sql.Date.class, new SqlDateSerializer());
  862. kryo.register(java.sql.Timestamp.class, new TimestampSerializer());
  863. kryo.register(Path.class, new PathSerializer());
  864. kryo.setInstantiatorStrategy(new StdInstantiatorStrategy());
  865. removeField(kryo, Operator.class, "colExprMap");
  866. removeField(kryo, ColumnInfo.class, "objectInspector");
  867. removeField(kryo, MapWork.class, "opParseCtxMap");
  868. removeField(kryo, MapWork.class, "joinTree");
  869. return kryo;
  870. };
  871. };
  872. @SuppressWarnings("rawtypes")
  873. protected static void removeField(Kryo kryo, Class type, String fieldName) {
  874. FieldSerializer fld = new FieldSerializer(kryo, type);
  875. fld.removeField(fieldName);
  876. kryo.register(type, fld);
  877. }
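// Sketch of the effect (assumes `op` is some Operator in a plan being written with the
// thread-local Kryo above): removeField registered a FieldSerializer without "colExprMap",
// so that field is skipped on write and comes back unset after read, i.e. it is treated
// like a transient field rather than shipped with the plan:
//   Kryo kryo = Utilities.runtimeSerializationKryo.get();
//   Output out = new Output(new ByteArrayOutputStream());
//   kryo.writeObject(out, op);   // colExprMap (and the other removed fields) are not written
//   out.close();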
  878. private static ThreadLocal<Kryo> cloningQueryPlanKryo = new ThreadLocal<Kryo>() {
  879. @Override
  880. protected synchronized Kryo initialValue() {
  881. Kryo kryo = new Kryo();
  882. kryo.setClassLoader(Thread.currentThread().getContextClassLoader());
  883. kryo.register(CommonToken.class, new CommonTokenSerializer());
  884. kryo.register(java.sql.Date.class, new SqlDateSerializer());
  885. kryo.register(java.sql.Timestamp.class, new TimestampSerializer());
  886. kryo.register(Path.class, new PathSerializer());
  887. kryo.setInstantiatorStrategy(new StdInstantiatorStrategy());
  888. return kryo;
  889. };
  890. };
  891. public static TableDesc defaultTd;
  892. static {
  893. // by default we expect ^A separated strings
  894. // This tableDesc does not provide column names. We should always use
  895. // PlanUtils.getDefaultTableDesc(String separatorCode, String columns)
  896. // or getBinarySortableTableDesc(List<FieldSchema> fieldSchemas) when
  897. // we know the column names.
  898. defaultTd = PlanUtils.getDefaultTableDesc("" + Utilities.ctrlaCode);
  899. }
  900. public static final int carriageReturnCode = 13;
  901. public static final int newLineCode = 10;
  902. public static final int tabCode = 9;
  903. public static final int ctrlaCode = 1;
  904. public static final String INDENT = " ";
  905. // Note: When DDL supports specifying what string to represent null,
  906. // we should specify "NULL" to represent null in the temp table, and then
  907. // we can make the following translation deprecated.
  908. public static String nullStringStorage = "\\N";
  909. public static String nullStringOutput = "NULL";
  910. public static Random randGen = new Random();
  911. /**
  912. * Gets the task id if we are running as a Hadoop job. Gets a random number otherwise.
  913. */
  914. public static String getTaskId(Configuration hconf) {
  915. String taskid = (hconf == null) ? null : hconf.get("mapred.task.id");
  916. if ((taskid == null) || taskid.equals("")) {
  917. return ("" + Math.abs(randGen.nextInt()));
  918. } else {
  919. /*
  920. * extract the task and attempt id from the hadoop taskid. in version 17 the leading component
  921. * was 'task_'. thereafter the leading component is 'attempt_'. in 17 - hadoop also seems to
  922. * have used _map_ and _reduce_ to denote map/reduce task types
  923. */
  924. String ret = taskid.replaceAll(".*_[mr]_", "").replaceAll(".*_(map|reduce)_", "");
  925. return (ret);
  926. }
  927. }
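// Illustrative example (hypothetical attempt id and Configuration `job`): for
// "attempt_201401011200_0001_m_000007_0" the two replaceAll calls strip everything up to and
// including "_m_"/"_map_", leaving "000007_0"; outside of a Hadoop task (no mapred.task.id set)
// a random non-negative number is returned instead:
//   String id = Utilities.getTaskId(job);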
  928. public static HashMap makeMap(Object... olist) {
  929. HashMap ret = new HashMap();
  930. for (int i = 0; i < olist.length; i += 2) {
  931. ret.put(olist[i], olist[i + 1]);
  932. }
  933. return (ret);
  934. }
  935. public static Properties makeProperties(String... olist) {
  936. Properties ret = new Properties();
  937. for (int i = 0; i < olist.length; i += 2) {
  938. ret.setProperty(olist[i], olist[i + 1]);
  939. }
  940. return (ret);
  941. }
  942. public static ArrayList makeList(Object... olist) {
  943. ArrayList ret = new ArrayList();
  944. for (Object element : olist) {
  945. ret.add(element);
  946. }
  947. return (ret);
  948. }
  949. /**
  950. * StreamPrinter.
  951. *
  952. */
  953. public static class StreamPrinter extends Thread {
  954. InputStream is;
  955. String type;
  956. PrintStream os;
  957. public StreamPrinter(InputStream is, String type, PrintStream os) {
  958. this.is = is;
  959. this.type = type;
  960. this.os = os;
  961. }
  962. @Override
  963. public void run() {
  964. BufferedReader br = null;
  965. try {
  966. InputStreamReader isr = new InputStreamReader(is);
  967. br = new BufferedReader(isr);
  968. String line = null;
  969. if (type != null) {
  970. while ((line = br.readLine()) != null) {
  971. os.println(type + ">" + line);
  972. }
  973. } else {
  974. while ((line = br.readLine()) != null) {
  975. os.println(line);
  976. }
  977. }
  978. br.close();
  979. br=null;
  980. } catch (IOException ioe) {
  981. ioe.printStackTrace();
  982. }finally{
  983. IOUtils.closeStream(br);
  984. }
  985. }
  986. }
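// Illustrative usage (hypothetical external command `cmd`): drain a child process's stdout and
// stderr on separate threads so the process does not block on full pipes:
//   Process p = Runtime.getRuntime().exec(cmd);
//   new Utilities.StreamPrinter(p.getInputStream(), null, System.out).start();
//   new Utilities.StreamPrinter(p.getErrorStream(), "ERROR", System.err).start();
//   int rc = p.waitFor();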
  987. public static TableDesc getTableDesc(Table tbl) {
  988. Properties props = tbl.getMetadata();
  989. props.put(serdeConstants.SERIALIZATION_LIB, tbl.getDeserializer().getClass().getName());
  990. return (new TableDesc(tbl.getInputFormatClass(), tbl
  991. .getOutputFormatClass(), props));
  992. }
  993. // column names and column types are all delimited by comma
  994. public static TableDesc getTableDesc(String cols, String colTypes) {
  995. return (new TableDesc(SequenceFileInputFormat.class,
  996. HiveSequenceFileOutputFormat.class, Utilities.makeProperties(
  997. serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode,
  998. serdeConstants.LIST_COLUMNS, cols,
  999. serdeConstants.LIST_COLUMN_TYPES, colTypes,
  1000. serdeConstants.SERIALIZATION_LIB,LazySimpleSerDe.class.getName())));
  1001. }
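// Illustrative example (hypothetical schema): builds a ^A-delimited LazySimpleSerDe descriptor
// backed by SequenceFile input/output formats:
//   TableDesc td = Utilities.getTableDesc("key,value", "string,int");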
  1002. public static PartitionDesc getPartitionDesc(Partition part) throws HiveException {
  1003. return (new PartitionDesc(part));
  1004. }
  1005. public static PartitionDesc getPartitionDescFromTableDesc(TableDesc tblDesc, Partition part)
  1006. throws HiveException {
  1007. return new PartitionDesc(part, tblDesc);
  1008. }
  1009. private static String getOpTreeSkel_helper(Operator<?> op, String indent) {
  1010. if (op == null) {
  1011. return "";
  1012. }
  1013. StringBuilder sb = new StringBuilder();
  1014. sb.append(indent);
  1015. sb.append(op.toString());
  1016. sb.append("\n");
  1017. if (op.getChildOperators() != null) {
  1018. for (Object child : op.getChildOperators()) {
  1019. sb.append(getOpTreeSkel_helper((Operator<?>) child, indent + " "));
  1020. }
  1021. }
  1022. return sb.toString();
  1023. }
  1024. public static String getOpTreeSkel(Operator<?> op) {
  1025. return getOpTreeSkel_helper(op, "");
  1026. }
  1027. private static boolean isWhitespace(int c) {
  1028. if (c == -1) {
  1029. return false;
  1030. }
  1031. return Character.isWhitespace((char) c);
  1032. }
  1033. public static boolean contentsEqual(InputStream is1, InputStream is2, boolean ignoreWhitespace)
  1034. throws IOException {
  1035. try {
  1036. if ((is1 == is2) || (is1 == null && is2 == null)) {
  1037. return true;
  1038. }
  1039. if (is1 == null || is2 == null) {
  1040. return false;
  1041. }
  1042. while (true) {
  1043. int c1 = is1.read();
  1044. while (ignoreWhitespace && isWhitespace(c1)) {
  1045. c1 = is1.read();
  1046. }
  1047. int c2 = is2.read();
  1048. while (ignoreWhitespace && isWhitespace(c2)) {
  1049. c2 = is2.read();
  1050. }
  1051. if (c1 == -1 && c2 == -1) {
  1052. return true;
  1053. }
  1054. if (c1 != c2) {
  1055. break;
  1056. }
  1057. }
  1058. } catch (FileNotFoundException e) {
  1059. e.printStackTrace();
  1060. }
  1061. return false;
  1062. }
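// Illustrative usage (hypothetical file names): byte-wise comparison of two streams, optionally
// skipping whitespace on both sides:
//   boolean same = Utilities.contentsEqual(
//       new FileInputStream("expected.txt"), new FileInputStream("actual.txt"), true);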
  1063. /**
  1064. * convert "From src insert blah blah" to "From src insert ... blah"
  1065. */
  1066. public static String abbreviate(String str, int max) {
  1067. str = str.trim();
  1068. int len = str.length();
  1069. int suffixlength = 20;
  1070. if (len <= max) {
  1071. return str;
  1072. }
  1073. suffixlength = Math.min(suffixlength, (max - 3) / 2);
  1074. String rev = StringUtils.reverse(str);
  1075. // get the last few words
  1076. String suffix = WordUtils.abbreviate(rev, 0, suffixlength, "");
  1077. suffix = StringUtils.reverse(suffix);
  1078. // first few ..
  1079. String prefix = StringUtils.abbreviate(str, max - suffix.length());
  1080. return prefix + suffix;
  1081. }
  1082. public static final String NSTR = "";
  1083. /**
  1084. * StreamStatus.
  1085. *
  1086. */
  1087. public static enum StreamStatus {
  1088. EOF, TERMINATED
  1089. }
  1090. public static StreamStatus readColumn(DataInput in, OutputStream out) throws IOException {
  1091. boolean foundCrChar = false;
  1092. while (true) {
  1093. int b;
  1094. try {
  1095. b = in.readByte();
  1096. } catch (EOFException e) {
  1097. return StreamStatus.EOF;
  1098. }
  1099. // Default new line characters on windows are "CRLF" so detect if there are any windows
  1100. // native newline characters and handle them.
  1101. if (Shell.WINDOWS) {
  1102. // if the CR is not followed by the LF on windows then add it back to the stream and
  1103. // proceed with next characters in the input stream.
  1104. if (foundCrChar && b != Utilities.newLineCode) {
  1105. out.write(Utilities.carriageReturnCode);
  1106. foundCrChar = false;
  1107. }
  1108. if (b == Utilities.carriageReturnCode) {
  1109. foundCrChar = true;
  1110. continue;
  1111. }
  1112. }
  1113. if (b == Utilities.newLineCode) {
  1114. return StreamStatus.TERMINATED;
  1115. }
  1116. out.write(b);
  1117. }
  1118. // Unreachable
  1119. }
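// Illustrative reading loop (assumptions: `in` is a DataInput over newline-terminated values,
// `colBytes` is a ByteArrayOutputStream, and handleColumn is a hypothetical callback): each call
// copies bytes up to the next newline, with CRLF collapsing to the terminator on Windows:
//   while (Utilities.readColumn(in, colBytes) != Utilities.StreamStatus.EOF) {
//     handleColumn(colBytes.toByteArray());
//     colBytes.reset();
//   }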
  1120. /**
  1121. * Convert an output stream to a compressed output stream based on codecs and compression options
  1122. * specified in the Job Configuration.
  1123. *
  1124. * @param jc
  1125. * Job Configuration
  1126. * @param out
  1127. * Output Stream to be converted into compressed output stream
  1128. * @return compressed output stream
  1129. */
  1130. public static OutputStream createCompressedStream(JobConf jc, OutputStream out)
  1131. throws IOException {
  1132. boolean isCompressed = FileOutputFormat.getCompressOutput(jc);
  1133. return createCompressedStream(jc, out, isCompressed);
  1134. }
  1135. /**
  1136. * Convert an output stream to a compressed output stream based on the codecs in the Job
  1137. * Configuration. The caller specifies directly whether the file is compressed or not.
  1138. *
  1139. * @param jc
  1140. * Job Configuration
  1141. * @param out
  1142. * Output Stream to be converted into compressed output stream
  1143. * @param isCompressed
  1144. * whether the output stream needs to be compressed or not
  1145. * @return compressed output stream
  1146. */
  1147. public static OutputStream createCompressedStream(JobConf jc, OutputStream out,
  1148. boolean isCompressed) throws IOException {
  1149. if (isCompressed) {
  1150. Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
  1151. DefaultCodec.class);
  1152. CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jc);
  1153. return codec.createOutputStream(out);
  1154. } else {
  1155. return (out);
  1156. }
  1157. }
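// Illustrative usage (hypothetical FileSystem `fs`, Path `outPath`, and JobConf `job`): wraps a
// raw file stream with the job's configured codec only when output compression is enabled;
// otherwise the original stream is returned unchanged:
//   OutputStream raw = fs.create(outPath);
//   OutputStream out = Utilities.createCompressedStream(job, raw);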
  1158. /**
  1159. * Based on compression option and configured output codec - get extension for output file. This
  1160. * is only required for text files - not sequencefiles
  1161. *
  1162. * @param jc
  1163. * Job Configuration
  1164. * @param isCompressed
  1165. * Whether the output file is compressed or not
  1166. * @return the required file extension (example: .gz)
  1167. * @deprecated Use {@link #getFileExtension(JobConf, boolean, HiveOutputFormat)}
  1168. */
  1169. @Deprecated
  1170. public static String getFileExtension(JobConf jc, boolean isCompressed) {
  1171. return getFileExtension(jc, isCompressed, new HiveIgnoreKeyTextOutputFormat());
  1172. }
  1173. /**
  1174. * Based on compression option, output format, and configured output codec -
  1175. * get extension for output file. Text files require an extension, whereas
  1176. * others, like sequence files, do not.
  1177. * <p>
  1178. * The property <code>hive.output.file.extension</code> is used to determine
  1179. * the extension - if set, it will override other logic for choosing an
  1180. * extension.
  1181. *
  1182. * @param jc
  1183. * Job Configuration
  1184. * @param isCompressed
  1185. * Whether the output file is compressed or not
  1186. * @param hiveOutputFormat
  1187. * The output format, used to detect if the format is text
  1188. * @return the required file extension (example: .gz)
  1189. */
  1190. public static String getFileExtension(JobConf jc, boolean isCompressed,
  1191. HiveOutputFormat<?, ?> hiveOutputFormat) {
  1192. String extension = HiveConf.getVar(jc, HiveConf.ConfVars.OUTPUT_FILE_EXTENSION);
  1193. if (!StringUtils.isEmpty(extension)) {
  1194. return extension;
  1195. }
  1196. if ((hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) && isCompressed) {
  1197. Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
  1198. DefaultCodec.class);
  1199. CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jc);
  1200. return codec.getDefaultExtension();
  1201. }
  1202. return "";
  1203. }
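// Illustrative example (hypothetical JobConf `job`): hive.output.file.extension, when set, wins
// outright; otherwise a compressed text output gets the codec's default extension (".deflate"
// for DefaultCodec), and non-text formats get "":
//   String ext = Utilities.getFileExtension(job, true, new HiveIgnoreKeyTextOutputFormat());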
  1204. /**
  1205. * Create a sequencefile output stream based on job configuration.
  1206. *
  1207. * @param jc
  1208. * Job configuration
  1209. * @param fs
  1210. * File System to create file in
  1211. * @param file
  1212. * Path to be created
  1213. * @param keyClass
  1214. * Java Class for key
  1215. * @param valClass
  1216. * Java Class for value
  1217. * @return output stream over the created sequencefile
  1218. */
  1219. public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
  1220. Class<?> keyClass, Class<?> valClass, Progressable progressable) throws IOException {
  1221. boolean isCompressed = FileOutputFormat.getCompressOutput(jc);
  1222. return createSequenceWriter(jc, fs, file, keyClass, valClass, isCompressed, progressable);
  1223. }
  1224. /**
  1225. * Create a sequencefile output stream based on job configuration. Uses a user-supplied
  1226. * compression flag (rather than obtaining it from the Job Configuration).
  1227. *
  1228. * @param jc
  1229. * Job configuration
  1230. * @param fs
  1231. * File System to create file in
  1232. * @param file
  1233. * Path to be created
  1234. * @param keyClass
  1235. * Java Class for key
  1236. * @param valClass
  1237. * Java Class for value
  1238. * @return output stream over the created sequencefile
  1239. */
  1240. public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
  1241. Class<?> keyClass, Class<?> valClass, boolean isCompressed, Progressable progressable)
  1242. throws IOException {
  1243. CompressionCodec codec = null;
  1244. CompressionType compressionType = CompressionType.NONE;
  1245. Class codecClass = null;
  1246. if (isCompressed) {
  1247. compressionType = SequenceFileOutputFormat.getOutputCompressionType(jc);
  1248. codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
  1249. codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, jc);
  1250. }
  1251. return (SequenceFile.createWriter(fs, jc, file, keyClass, valClass, compressionType, codec,
  1252. progressable));
  1253. }
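// Illustrative usage (hypothetical FileSystem `fs`, Path `outPath`, JobConf `job`, and
// Progressable `reporter`; the key/value classes are placeholders): the explicit boolean decides
// whether compression type and codec are read from the JobConf at all:
//   SequenceFile.Writer writer = Utilities.createSequenceWriter(
//       job, fs, outPath, Text.class, Text.class, true, reporter);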
  1254. /**
  1255. * Create an RCFile output stream based on job configuration. Uses a user-supplied compression
  1256. * flag (rather than obtaining it from the Job Configuration).
  1257. *
  1258. * @param jc
  1259. * Job configuration
  1260. * @par…

Large files are truncated; only the first portion of this file is shown above.