
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java

  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.exec;
  19. import java.io.File;
  20. import java.io.IOException;
  21. import java.io.InputStream;
  22. import java.io.Serializable;
  23. import java.io.UnsupportedEncodingException;
  24. import java.lang.management.ManagementFactory;
  25. import java.lang.management.MemoryMXBean;
  26. import java.net.URL;
  27. import java.net.URLDecoder;
  28. import java.net.URLEncoder;
  29. import java.text.SimpleDateFormat;
  30. import java.util.ArrayList;
  31. import java.util.Calendar;
  32. import java.util.Collections;
  33. import java.util.Enumeration;
  34. import java.util.HashMap;
  35. import java.util.HashSet;
  36. import java.util.LinkedHashMap;
  37. import java.util.List;
  38. import java.util.Map;
  39. import java.util.Properties;
  40. import java.util.Set;
  41. import org.apache.commons.lang.StringUtils;
  42. import org.apache.commons.logging.Log;
  43. import org.apache.commons.logging.LogFactory;
  44. import org.apache.hadoop.conf.Configuration;
  45. import org.apache.hadoop.filecache.DistributedCache;
  46. import org.apache.hadoop.fs.FileStatus;
  47. import org.apache.hadoop.fs.FileSystem;
  48. import org.apache.hadoop.fs.Path;
  49. import org.apache.hadoop.hive.common.FileUtils;
  50. import org.apache.hadoop.hive.conf.HiveConf;
  51. import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
  52. import org.apache.hadoop.hive.ql.Context;
  53. import org.apache.hadoop.hive.ql.DriverContext;
  54. import org.apache.hadoop.hive.ql.QueryPlan;
  55. import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
  56. import org.apache.hadoop.hive.ql.exec.Operator.ProgressCounter;
  57. import org.apache.hadoop.hive.ql.exec.errors.ErrorAndSolution;
  58. import org.apache.hadoop.hive.ql.exec.errors.TaskLogProcessor;
  59. import org.apache.hadoop.hive.ql.history.HiveHistory.Keys;
  60. import org.apache.hadoop.hive.ql.io.HiveKey;
  61. import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
  62. import org.apache.hadoop.hive.ql.io.IOPrepareCache;
  63. import org.apache.hadoop.hive.ql.metadata.HiveException;
  64. import org.apache.hadoop.hive.ql.plan.FetchWork;
  65. import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
  66. import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
  67. import org.apache.hadoop.hive.ql.plan.MapredWork;
  68. import org.apache.hadoop.hive.ql.plan.PartitionDesc;
  69. import org.apache.hadoop.hive.ql.plan.TableDesc;
  70. import org.apache.hadoop.hive.ql.plan.api.StageType;
  71. import org.apache.hadoop.hive.ql.session.SessionState;
  72. import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
  73. import org.apache.hadoop.hive.ql.stats.StatsFactory;
  74. import org.apache.hadoop.hive.ql.stats.StatsPublisher;
  75. import org.apache.hadoop.hive.shims.ShimLoader;
  76. import org.apache.hadoop.io.BytesWritable;
  77. import org.apache.hadoop.io.Text;
  78. import org.apache.hadoop.mapred.Counters;
  79. import org.apache.hadoop.mapred.FileInputFormat;
  80. import org.apache.hadoop.mapred.InputFormat;
  81. import org.apache.hadoop.mapred.JobClient;
  82. import org.apache.hadoop.mapred.JobConf;
  83. import org.apache.hadoop.mapred.Partitioner;
  84. import org.apache.hadoop.mapred.RunningJob;
  85. import org.apache.hadoop.mapred.TaskCompletionEvent;
  86. import org.apache.log4j.Appender;
  87. import org.apache.log4j.BasicConfigurator;
  88. import org.apache.log4j.FileAppender;
  89. import org.apache.log4j.LogManager;
  90. import org.apache.log4j.PropertyConfigurator;
  91. import org.apache.log4j.varia.NullAppender;
  92. /**
  93. * ExecDriver: configures, submits and monitors a Hadoop map-reduce job for a MapredWork plan.
  94. * It can also be invoked as a standalone utility via main().
  95. */
  96. public class ExecDriver extends Task<MapredWork> implements Serializable {
  97. private static final long serialVersionUID = 1L;
  98. protected transient JobConf job;
  99. protected transient int mapProgress = 0;
  100. protected transient int reduceProgress = 0;
  101. public transient String jobId;
  102. public String getJobId() {
  103. return jobId;
  104. }
  105. public void setJobId(String jobId) {
  106. this.jobId = jobId;
  107. }
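// Heap-usage bean for a child JVM; initialized in main() when this process runs a local task.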
  108. public static MemoryMXBean memoryMXBean;
  109. /**
  110. * Constructor when invoked from QL.
  111. */
  112. public ExecDriver() {
  113. super();
  114. }
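// Resolve the session's added resources of the given type (FILE/JAR/ARCHIVE) to local paths and
// return them as a comma-separated list suitable for a JobConf property.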
  115. protected static String getResourceFiles(Configuration conf, SessionState.ResourceType t) {
  116. // fill in local files to be added to the task environment
  117. SessionState ss = SessionState.get();
  118. Set<String> files = (ss == null) ? null : ss.list_resource(t, null);
  119. if (files != null) {
  120. List<String> realFiles = new ArrayList<String>(files.size());
  121. for (String one : files) {
  122. try {
  123. realFiles.add(Utilities.realFile(one, conf));
  124. } catch (IOException e) {
  125. throw new RuntimeException("Cannot validate file " + one + " due to exception: "
  126. + e.getMessage(), e);
  127. }
  128. }
  129. return StringUtils.join(realFiles, ",");
  130. } else {
  131. return "";
  132. }
  133. }
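// Set a comma-separated file list on the given JobConf property (e.g. "tmpfiles", "tmpjars")
// so that Hadoop ships the listed files with the submitted job.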
  134. private void initializeFiles(String prop, String files) {
  135. if (files != null && files.length() > 0) {
  136. job.set(prop, files);
  137. ShimLoader.getHadoopShims().setTmpFiles(prop, files);
  138. }
  139. }
  140. /**
  141. * Initialization when invoked from QL.
  142. */
  143. @Override
  144. public void initialize(HiveConf conf, QueryPlan queryPlan, DriverContext driverContext) {
  145. super.initialize(conf, queryPlan, driverContext);
  146. job = new JobConf(conf, ExecDriver.class);
  147. // NOTE: initialize is only called if it is in non-local mode.
  148. // In case it's in non-local mode, we need to move the SessionState files
  149. // and jars to jobConf.
  150. // In case it's in local mode, MapRedTask will set the jobConf.
  151. //
  152. // "tmpfiles" and "tmpjars" are set by the method ExecDriver.execute(),
  153. // which will be called by both local and NON-local mode.
  154. String addedFiles = getResourceFiles(job, SessionState.ResourceType.FILE);
  155. if (StringUtils.isNotBlank(addedFiles)) {
  156. HiveConf.setVar(job, ConfVars.HIVEADDEDFILES, addedFiles);
  157. }
  158. String addedJars = getResourceFiles(job, SessionState.ResourceType.JAR);
  159. if (StringUtils.isNotBlank(addedJars)) {
  160. HiveConf.setVar(job, ConfVars.HIVEADDEDJARS, addedJars);
  161. }
  162. String addedArchives = getResourceFiles(job, SessionState.ResourceType.ARCHIVE);
  163. if (StringUtils.isNotBlank(addedArchives)) {
  164. HiveConf.setVar(job, ConfVars.HIVEADDEDARCHIVES, addedArchives);
  165. }
  166. }
  167. /**
  168. * Constructor/Initialization for invocation as independent utility.
  169. */
  170. public ExecDriver(MapredWork plan, JobConf job, boolean isSilent) throws HiveException {
  171. setWork(plan);
  172. this.job = job;
  173. LOG = LogFactory.getLog(this.getClass().getName());
  174. console = new LogHelper(LOG, isSilent);
  175. }
  176. /**
  177. * A map from job ID to kill URL for the currently running jobs spawned in this Hive instance,
  178. * used to kill all running jobs in the event of an unexpected shutdown - i.e., the JVM shuts
  179. * down while there are still jobs running.
  180. */
  181. private static Map<String, String> runningJobKillURIs = Collections
  182. .synchronizedMap(new HashMap<String, String>());
  183. /**
  184. * In Hive, when the user control-c's the command line, any running jobs spawned from that command
  185. * line are best-effort killed.
  186. *
  187. * This static initializer registers a shutdown hook that iterates over all the running job
  188. * kill URLs and issues a kill request to each of them.
  189. *
  190. */
  191. static {
  192. if (new org.apache.hadoop.conf.Configuration()
  193. .getBoolean("webinterface.private.actions", false)) {
  194. Runtime.getRuntime().addShutdownHook(new Thread() {
  195. @Override
  196. public void run() {
  197. synchronized (runningJobKillURIs) {
  198. for (String uri : runningJobKillURIs.values()) {
  199. try {
  200. System.err.println("killing job with: " + uri);
  201. java.net.HttpURLConnection conn = (java.net.HttpURLConnection) new java.net.URL(uri)
  202. .openConnection();
  203. conn.setRequestMethod("POST");
  204. int retCode = conn.getResponseCode();
  205. if (retCode != 200) {
  206. System.err.println("Got an error trying to kill job with URI: " + uri + " = "
  207. + retCode);
  208. }
  209. } catch (Exception e) {
  210. System.err.println("trying to kill job, caught: " + e);
  211. // do nothing
  212. }
  213. }
  214. }
  215. }
  216. });
  217. }
  218. }
  219. /**
  220. * from StreamJob.java.
  221. */
  222. private void jobInfo(RunningJob rj) {
  223. if (job.get("mapred.job.tracker", "local").equals("local")) {
  224. console.printInfo("Job running in-process (local Hadoop)");
  225. } else {
  226. String hp = job.get("mapred.job.tracker");
  227. if (SessionState.get() != null) {
  228. SessionState.get().getHiveHistory().setTaskProperty(SessionState.get().getQueryId(),
  229. getId(), Keys.TASK_HADOOP_ID, rj.getJobID());
  230. }
  231. console.printInfo(ExecDriver.getJobStartMsg(rj.getJobID()) + ", Tracking URL = "
  232. + rj.getTrackingURL());
  233. console.printInfo("Kill Command = " + HiveConf.getVar(job, HiveConf.ConfVars.HADOOPBIN)
  234. + " job -Dmapred.job.tracker=" + hp + " -kill " + rj.getJobID());
  235. }
  236. }
  237. /**
  238. * This class contains the state of the running task. Going forward, we will return this handle
  239. * from execute and Driver can split execute into start, monitorProgress and postProcess.
  240. */
  241. private static class ExecDriverTaskHandle extends TaskHandle {
  242. JobClient jc;
  243. RunningJob rj;
  244. JobClient getJobClient() {
  245. return jc;
  246. }
  247. RunningJob getRunningJob() {
  248. return rj;
  249. }
  250. public ExecDriverTaskHandle(JobClient jc, RunningJob rj) {
  251. this.jc = jc;
  252. this.rj = rj;
  253. }
  254. public void setRunningJob(RunningJob job) {
  255. rj = job;
  256. }
  257. @Override
  258. public Counters getCounters() throws IOException {
  259. return rj.getCounters();
  260. }
  261. }
  262. /**
  263. * Fatal errors are those errors that cannot be recovered by retries. These are application
  264. * dependent. Examples of fatal errors include: the small table in a map-side join being too
  265. * large to be handled by one mapper. The job should fail and the user should be
  266. * warned to use regular joins rather than map-side joins. Fatal errors are indicated by counters
  267. * that are set at execution time. If the counter is non-zero, a fatal error occurred. The value
  268. * of the counter indicates the error type.
  269. *
  270. * @return true if fatal errors happened during job execution, false otherwise.
  271. */
  272. private boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) {
  273. if (ctrs == null) {
  274. // hadoop might return null if it cannot locate the job.
  275. // we may still be able to retrieve the job status - so ignore
  276. return false;
  277. }
  278. // check for number of created files
  279. long numFiles = ctrs.getCounter(ProgressCounter.CREATED_FILES);
  280. long upperLimit = HiveConf.getLongVar(job, HiveConf.ConfVars.MAXCREATEDFILES);
  281. if (numFiles > upperLimit) {
  282. errMsg.append("total number of created files exceeds ").append(upperLimit);
  283. return true;
  284. }
  285. for (Operator<? extends Serializable> op : work.getAliasToWork().values()) {
  286. if (op.checkFatalErrors(ctrs, errMsg)) {
  287. return true;
  288. }
  289. }
  290. if (work.getReducer() != null) {
  291. if (work.getReducer().checkFatalErrors(ctrs, errMsg)) {
  292. return true;
  293. }
  294. }
  295. return false;
  296. }
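// Poll the running job until completion: refresh counters, report progress to the console and
// Hive history, kill the job if a fatal-error counter is raised, and return whether it succeeded.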
  297. private boolean progress(ExecDriverTaskHandle th) throws IOException {
  298. JobClient jc = th.getJobClient();
  299. RunningJob rj = th.getRunningJob();
  300. String lastReport = "";
  301. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
  302. long reportTime = System.currentTimeMillis();
  303. long maxReportInterval = 60 * 1000; // One minute
  304. boolean fatal = false;
  305. StringBuilder errMsg = new StringBuilder();
  306. long pullInterval = HiveConf.getLongVar(job, HiveConf.ConfVars.HIVECOUNTERSPULLINTERVAL);
  307. boolean initializing = true;
  308. while (!rj.isComplete()) {
  309. try {
  310. Thread.sleep(pullInterval);
  311. } catch (InterruptedException e) {
  312. }
  313. if (initializing && ShimLoader.getHadoopShims().isJobPreparing(rj)) {
  314. // No reason to poll until the job is initialized
  315. continue;
  316. } else {
  317. // By now the job is initialized so no reason to do
  318. // rj.getJobState() again and we do not want to do an extra RPC call
  319. initializing = false;
  320. }
  321. RunningJob newRj = jc.getJob(rj.getJobID());
  322. if (newRj == null) {
  323. // under exceptional load, hadoop may not be able to look up status
  324. // of finished jobs (because it has purged them from memory). From
  325. // hive's perspective - it's equivalent to the job having failed.
  326. // So raise a meaningful exception
  327. throw new IOException("Could not find status of job: " + rj.getJobID());
  328. } else {
  329. th.setRunningJob(newRj);
  330. rj = newRj;
  331. }
  332. // If fatal errors happen we should kill the job immediately rather than
  333. // letting it retry several times, which eventually leads to failure.
  334. if (fatal) {
  335. continue; // wait until rj.isComplete
  336. }
  337. Counters ctrs = th.getCounters();
  338. if (fatal = checkFatalErrors(ctrs, errMsg)) {
  339. console.printError("[Fatal Error] " + errMsg.toString() + ". Killing the job.");
  340. rj.killJob();
  341. continue;
  342. }
  343. errMsg.setLength(0);
  344. updateCounters(ctrs, rj);
  345. String report = " " + getId() + " map = " + mapProgress + "%, reduce = " + reduceProgress
  346. + "%";
  347. if (!report.equals(lastReport)
  348. || System.currentTimeMillis() >= reportTime + maxReportInterval) {
  349. // write out serialized plan with counters to log file
  350. // LOG.info(queryPlan);
  351. String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
  352. SessionState ss = SessionState.get();
  353. if (ss != null) {
  354. ss.getHiveHistory().setTaskCounters(SessionState.get().getQueryId(), getId(), ctrs);
  355. ss.getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), getId(),
  356. Keys.TASK_HADOOP_PROGRESS, output);
  357. ss.getHiveHistory().progressTask(SessionState.get().getQueryId(), this);
  358. ss.getHiveHistory().logPlanProgress(queryPlan);
  359. }
  360. console.printInfo(output);
  361. lastReport = report;
  362. reportTime = System.currentTimeMillis();
  363. }
  364. }
  365. boolean success;
  366. Counters ctrs = th.getCounters();
  367. if (fatal) {
  368. success = false;
  369. } else {
  370. // check for fatal error again in case it occurred after
  371. // the last check before the job is completed
  372. if (checkFatalErrors(ctrs, errMsg)) {
  373. console.printError("[Fatal Error] " + errMsg.toString());
  374. success = false;
  375. } else {
  376. success = rj.isSuccessful();
  377. }
  378. }
  379. setDone();
  380. // update based on the final value of the counters
  381. updateCounters(ctrs, rj);
  382. SessionState ss = SessionState.get();
  383. if (ss != null) {
  384. ss.getHiveHistory().logPlanProgress(queryPlan);
  385. }
  386. // LOG.info(queryPlan);
  387. return (success);
  388. }
  389. /**
  390. * Update counters relevant to this task.
  391. */
  392. private void updateCounters(Counters ctrs, RunningJob rj) throws IOException {
  393. mapProgress = Math.round(rj.mapProgress() * 100);
  394. reduceProgress = Math.round(rj.reduceProgress() * 100);
  395. taskCounters.put("CNTR_NAME_" + getId() + "_MAP_PROGRESS", Long.valueOf(mapProgress));
  396. taskCounters.put("CNTR_NAME_" + getId() + "_REDUCE_PROGRESS", Long.valueOf(reduceProgress));
  397. if (ctrs == null) {
  398. // hadoop might return null if it cannot locate the job.
  399. // we may still be able to retrieve the job status - so ignore
  400. return;
  401. }
  402. for (Operator<? extends Serializable> op : work.getAliasToWork().values()) {
  403. op.updateCounters(ctrs);
  404. }
  405. if (work.getReducer() != null) {
  406. work.getReducer().updateCounters(ctrs);
  407. }
  408. }
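// Progress helpers based on the most recently polled map/reduce completion percentages.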
  409. public boolean mapStarted() {
  410. return mapProgress > 0;
  411. }
  412. public boolean reduceStarted() {
  413. return reduceProgress > 0;
  414. }
  415. public boolean mapDone() {
  416. return mapProgress == 100;
  417. }
  418. public boolean reduceDone() {
  419. return reduceProgress == 100;
  420. }
  421. /**
  422. * Execute a query plan using Hadoop.
  423. */
  424. @Override
  425. public int execute(DriverContext driverContext) {
  426. IOPrepareCache ioPrepareCache = IOPrepareCache.get();
  427. ioPrepareCache.clear();
  428. boolean success = true;
  429. String invalidReason = work.isInvalid();
  430. if (invalidReason != null) {
  431. throw new RuntimeException("Plan invalid, Reason: " + invalidReason);
  432. }
  433. Context ctx = driverContext.getCtx();
  434. boolean ctxCreated = false;
  435. String emptyScratchDirStr;
  436. Path emptyScratchDir;
  437. try {
  438. if (ctx == null) {
  439. ctx = new Context(job);
  440. ctxCreated = true;
  441. }
  442. emptyScratchDirStr = ctx.getMRTmpFileURI();
  443. emptyScratchDir = new Path(emptyScratchDirStr);
  444. FileSystem fs = emptyScratchDir.getFileSystem(job);
  445. fs.mkdirs(emptyScratchDir);
  446. } catch (IOException e) {
  447. e.printStackTrace();
  448. console.printError("Error launching map-reduce job", "\n"
  449. + org.apache.hadoop.util.StringUtils.stringifyException(e));
  450. return 5;
  451. }
  452. ShimLoader.getHadoopShims().setNullOutputFormat(job);
  453. job.setMapperClass(ExecMapper.class);
  454. job.setMapOutputKeyClass(HiveKey.class);
  455. job.setMapOutputValueClass(BytesWritable.class);
  456. try {
  457. job.setPartitionerClass((Class<? extends Partitioner>) (Class.forName(HiveConf.getVar(job,
  458. HiveConf.ConfVars.HIVEPARTITIONER))));
  459. } catch (ClassNotFoundException e) {
  460. throw new RuntimeException(e.getMessage());
  461. }
  462. if (work.getNumMapTasks() != null) {
  463. job.setNumMapTasks(work.getNumMapTasks().intValue());
  464. }
  465. if (work.getMinSplitSize() != null) {
  466. HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, work.getMinSplitSize().longValue());
  467. }
  468. job.setNumReduceTasks(work.getNumReduceTasks().intValue());
  469. job.setReducerClass(ExecReducer.class);
  470. if (work.getInputformat() != null) {
  471. HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, work.getInputformat());
  472. }
  473. // Turn on speculative execution for reducers
  474. boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job,
  475. HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
  476. HiveConf.setBoolVar(job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS,
  477. useSpeculativeExecReducers);
  478. String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
  479. if ((inpFormat == null) || (!StringUtils.isNotBlank(inpFormat))) {
  480. inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
  481. }
  482. LOG.info("Using " + inpFormat);
  483. try {
  484. job.setInputFormat((Class<? extends InputFormat>) (Class.forName(inpFormat)));
  485. } catch (ClassNotFoundException e) {
  486. throw new RuntimeException(e.getMessage());
  487. }
  488. // No-Op - we don't really write anything here ..
  489. job.setOutputKeyClass(Text.class);
  490. job.setOutputValueClass(Text.class);
  491. // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands
  492. // it
  493. String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS);
  494. String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS);
  495. if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) {
  496. String allJars = StringUtils.isNotBlank(auxJars) ? (StringUtils.isNotBlank(addedJars) ? addedJars
  497. + "," + auxJars
  498. : auxJars)
  499. : addedJars;
  500. LOG.info("adding libjars: " + allJars);
  501. initializeFiles("tmpjars", allJars);
  502. }
  503. // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it
  504. String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES);
  505. if (StringUtils.isNotBlank(addedFiles)) {
  506. initializeFiles("tmpfiles", addedFiles);
  507. }
  508. int returnVal = 0;
  509. RunningJob rj = null;
  510. boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME));
  511. if (noName) {
  512. // This is for a special case to ensure unit tests pass
  513. HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt());
  514. }
  515. String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES);
  516. // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it
  517. if (StringUtils.isNotBlank(addedArchives)) {
  518. initializeFiles("tmparchives", addedArchives);
  519. }
  520. try {
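// Map-join small-table files: when not running against a local job tracker, tar the hashtable
// files produced by the map-local work, upload the archive to HDFS and register it in the
// DistributedCache (with a symlink) so that map tasks can access it locally.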
  521. MapredLocalWork localwork = work.getMapLocalWork();
  522. if (localwork != null) {
  523. boolean localMode = HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJT).equals("local");
  524. if (!localMode) {
  525. Path localPath = new Path(localwork.getTmpFileURI());
  526. Path hdfsPath = new Path(work.getTmpHDFSFileURI());
  527. FileSystem hdfs = hdfsPath.getFileSystem(job);
  528. FileSystem localFS = localPath.getFileSystem(job);
  529. FileStatus[] hashtableFiles = localFS.listStatus(localPath);
  530. int fileNumber = hashtableFiles.length;
  531. String[] fileNames = new String[fileNumber];
  532. for (int i = 0; i < fileNumber; i++) {
  533. fileNames[i] = hashtableFiles[i].getPath().getName();
  534. }
  535. // package and compress all the hashtable files to an archive file
  536. String parentDir = localPath.toUri().getPath();
  537. String stageId = this.getId();
  538. String archiveFileURI = Utilities.generateTarURI(parentDir, stageId);
  539. String archiveFileName = Utilities.generateTarFileName(stageId);
  540. localwork.setStageID(stageId);
  541. FileUtils.tar(parentDir, fileNames, archiveFileName);
  542. Path archivePath = new Path(archiveFileURI);
  543. LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archiveFileURI);
  544. // upload archive file to hdfs
  545. String hdfsFile = Utilities.generateTarURI(hdfsPath, stageId);
  546. Path hdfsFilePath = new Path(hdfsFile);
  547. short replication = (short) job.getInt("mapred.submit.replication", 10);
  548. hdfs.setReplication(hdfsFilePath, replication);
  549. hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
  550. LOG.info("Upload 1 archive file from" + archivePath + " to: " + hdfsFilePath);
  551. //add the archive file to distributed cache
  552. DistributedCache.createSymlink(job);
  553. DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
  554. LOG.info("Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
  555. }
  556. }
  557. addInputPaths(job, work, emptyScratchDirStr);
  558. Utilities.setMapRedWork(job, work, ctx.getMRTmpFileURI());
  559. // remove the password from the conf file so that the job tracker doesn't
  560. // show it in its logs
  561. String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
  562. if (pwd != null) {
  563. HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
  564. }
  565. JobClient jc = new JobClient(job);
  566. // make this client wait if the job tracker is not behaving well.
  567. Throttle.checkJobTracker(job, LOG);
  568. if (work.isGatheringStats()) {
  569. // initialize stats publishing table
  570. StatsPublisher statsPublisher;
  571. String statsImplementationClass = HiveConf.getVar(job, HiveConf.ConfVars.HIVESTATSDBCLASS);
  572. if (StatsFactory.setImplementation(statsImplementationClass, job)) {
  573. statsPublisher = StatsFactory.getStatsPublisher();
  574. statsPublisher.init(job); // creating stats table if not exists
  575. }
  576. }
  577. // Finally SUBMIT the JOB!
  578. rj = jc.submitJob(job);
  579. jobId = rj.getJobID();
  580. // replace it back
  581. if (pwd != null) {
  582. HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd);
  583. }
  584. // add to list of running jobs to kill in case of abnormal shutdown
  585. runningJobKillURIs.put(rj.getJobID(), rj.getTrackingURL() + "&action=kill");
  586. ExecDriverTaskHandle th = new ExecDriverTaskHandle(jc, rj);
  587. jobInfo(rj);
  588. success = progress(th);
  589. String statusMesg = getJobEndMsg(rj.getJobID());
  590. if (!success) {
  591. statusMesg += " with errors";
  592. returnVal = 2;
  593. console.printError(statusMesg);
  594. if (HiveConf.getBoolVar(job, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) {
  595. showJobFailDebugInfo(job, rj);
  596. }
  597. } else {
  598. console.printInfo(statusMesg);
  599. }
  600. } catch (Exception e) {
  601. e.printStackTrace();
  602. String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
  603. if (rj != null) {
  604. mesg = "Ended Job = " + rj.getJobID() + mesg;
  605. } else {
  606. mesg = "Job Submission failed" + mesg;
  607. }
  608. // Has to use full name to make sure it does not conflict with
  609. // org.apache.commons.lang.StringUtils
  610. console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
  611. success = false;
  612. returnVal = 1;
  613. } finally {
  614. Utilities.clearMapRedWork(job);
  615. try {
  616. if (ctxCreated) {
  617. ctx.clear();
  618. }
  619. if (rj != null) {
  620. if (returnVal != 0) {
  621. rj.killJob();
  622. }
  623. runningJobKillURIs.remove(rj.getJobID());
  624. }
  625. } catch (Exception e) {
  626. }
  627. }
  628. // get the list of Dynamic partition paths
  629. try {
  630. if (rj != null) {
  631. JobCloseFeedBack feedBack = new JobCloseFeedBack();
  632. if (work.getAliasToWork() != null) {
  633. for (Operator<? extends Serializable> op : work.getAliasToWork().values()) {
  634. op.jobClose(job, success, feedBack);
  635. }
  636. }
  637. if (work.getReducer() != null) {
  638. work.getReducer().jobClose(job, success, feedBack);
  639. }
  640. }
  641. } catch (Exception e) {
  642. // jobClose needs to execute successfully otherwise fail task
  643. if (success) {
  644. success = false;
  645. returnVal = 3;
  646. String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
  647. console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
  648. }
  649. }
  650. return (returnVal);
  651. }
  652. /**
  653. * This msg pattern is used to track when a job is started.
  654. *
  655. * @param jobId
  656. * @return
  657. */
  658. private static String getJobStartMsg(String jobId) {
  659. return "Starting Job = " + jobId;
  660. }
  661. /**
  662. * This msg pattern is used to track when a job has successfully completed.
  663. *
  664. * @param jobId
  665. * @return
  666. */
  667. public static String getJobEndMsg(String jobId) {
  668. return "Ended Job = " + jobId;
  669. }
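// Build the tasktracker HTTP URL that serves the complete log of a given task attempt.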
  670. private String getTaskAttemptLogUrl(String taskTrackerHttpAddress, String taskAttemptId) {
  671. return taskTrackerHttpAddress + "/tasklog?taskid=" + taskAttemptId + "&all=true";
  672. }
  673. // Used for showJobFailDebugInfo
  674. private static class TaskInfo {
  675. String jobId;
  676. HashSet<String> logUrls;
  677. public TaskInfo(String jobId) {
  678. this.jobId = jobId;
  679. logUrls = new HashSet<String>();
  680. }
  681. public void addLogUrl(String logUrl) {
  682. logUrls.add(logUrl);
  683. }
  684. public HashSet<String> getLogUrls() {
  685. return logUrls;
  686. }
  687. public String getJobId() {
  688. return jobId;
  689. }
  690. }
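// Collect the job's task completion events, find the task with the most failed attempts, and
// print its ID, a JobTracker URL and any error/solution hints extracted from its task logs.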
  691. @SuppressWarnings("deprecation")
  692. private void showJobFailDebugInfo(JobConf conf, RunningJob rj) throws IOException {
  693. // Mapping from task ID to the number of failures
  694. Map<String, Integer> failures = new HashMap<String, Integer>();
  695. // Successful task ID's
  696. Set<String> successes = new HashSet<String>();
  697. Map<String, TaskInfo> taskIdToInfo = new HashMap<String, TaskInfo>();
  698. int startIndex = 0;
  699. // Loop to get all task completion events because getTaskCompletionEvents
  700. // only returns a subset per call
  701. while (true) {
  702. TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex);
  703. if (taskCompletions == null || taskCompletions.length == 0) {
  704. break;
  705. }
  706. boolean more = true;
  707. for (TaskCompletionEvent t : taskCompletions) {
  708. // getTaskJobIDs returns Strings for compatibility with Hadoop versions
  709. // without TaskID or TaskAttemptID
  710. String[] taskJobIds = ShimLoader.getHadoopShims().getTaskJobIDs(t);
  711. if (taskJobIds == null) {
  712. console.printError("Task attempt info is unavailable in this Hadoop version");
  713. more = false;
  714. break;
  715. }
  716. // For each task completion event, get the associated task id, job id
  717. // and the logs
  718. String taskId = taskJobIds[0];
  719. String jobId = taskJobIds[1];
  720. TaskInfo ti = taskIdToInfo.get(taskId);
  721. if (ti == null) {
  722. ti = new TaskInfo(jobId);
  723. taskIdToInfo.put(taskId, ti);
  724. }
  725. // These tasks should have come from the same job.
  726. assert (ti.getJobId().equals(jobId));
  727. ti.getLogUrls().add(getTaskAttemptLogUrl(t.getTaskTrackerHttp(), t.getTaskId()));
  728. // If a task failed, then keep track of the total number of failures
  729. // for that task (typically, a task gets re-run up to 4 times if it
  730. // fails).
  731. if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) {
  732. Integer failAttempts = failures.get(taskId);
  733. if (failAttempts == null) {
  734. failAttempts = Integer.valueOf(0);
  735. }
  736. failAttempts = Integer.valueOf(failAttempts.intValue() + 1);
  737. failures.put(taskId, failAttempts);
  738. } else {
  739. successes.add(taskId);
  740. }
  741. }
  742. if (!more) {
  743. break;
  744. }
  745. startIndex += taskCompletions.length;
  746. }
  747. // Remove failures for tasks that succeeded
  748. for (String task : successes) {
  749. failures.remove(task);
  750. }
  751. if (failures.keySet().size() == 0) {
  752. return;
  753. }
  754. // Find the highest failure count
  755. int maxFailures = 0;
  756. for (Integer failCount : failures.values()) {
  757. if (maxFailures < failCount.intValue()) {
  758. maxFailures = failCount.intValue();
  759. }
  760. }
  761. // Display Error Message for tasks with the highest failure count
  762. String jtUrl = JobTrackerURLResolver.getURL(conf);
  763. for (String task : failures.keySet()) {
  764. if (failures.get(task).intValue() == maxFailures) {
  765. TaskInfo ti = taskIdToInfo.get(task);
  766. String jobId = ti.getJobId();
  767. String taskUrl = jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString();
  768. TaskLogProcessor tlp = new TaskLogProcessor(conf);
  769. for (String logUrl : ti.getLogUrls()) {
  770. tlp.addTaskAttemptLogUrl(logUrl);
  771. }
  772. List<ErrorAndSolution> errors = tlp.getErrors();
  773. StringBuilder sb = new StringBuilder();
  774. // We use a StringBuilder and then call printError only once as
  775. // printError will write to both stderr and the error log file. In
  776. // situations where both the stderr and the log file output is
  777. // simultaneously output to a single stream, this will look cleaner.
  778. sb.append("\n");
  779. sb.append("Task with the most failures(" + maxFailures + "): \n");
  780. sb.append("-----\n");
  781. sb.append("Task ID:\n " + task + "\n\n");
  782. sb.append("URL:\n " + taskUrl + "\n");
  783. for (ErrorAndSolution e : errors) {
  784. sb.append("\n");
  785. sb.append("Possible error:\n " + e.getError() + "\n\n");
  786. sb.append("Solution:\n " + e.getSolution() + "\n");
  787. }
  788. sb.append("-----\n");
  789. console.printError(sb.toString());
  790. // Only print out one task because that's good enough for debugging.
  791. break;
  792. }
  793. }
  794. return;
  795. }
  796. private static void printUsage() {
  797. System.err.println("ExecDriver -plan <plan-file> [-jobconf k1=v1 [-jobconf k2=v2] ...] "
  798. + "[-files <file1>[,<file2>] ...]");
  799. System.exit(1);
  800. }
  801. /**
  802. * We are running the hadoop job via a sub-command. This typically happens when we are running
  803. * jobs in local mode. The log4j configuration in this mode is controlled as follows: 1. if the
  804. * admin provides a log4j properties file specifically for execution mode, we pick that up; 2.
  805. * otherwise, we default to the regular hive log4j properties, if one is supplied; 3. if neither
  806. * of the above applies, we do nothing - the log4j properties are likely determined by hadoop.
  807. *
  808. * The intention behind providing a separate option #1 is to be able to collect hive run time logs
  809. * generated in local mode in a separate (centralized) location if desired. This mimics the
  810. * behavior of hive run time logs when running against a hadoop cluster where they are available
  811. * on the tasktracker nodes.
  812. */
  813. private static void setupChildLog4j(Configuration conf) {
  814. URL hive_l4j = ExecDriver.class.getClassLoader().getResource(SessionState.HIVE_EXEC_L4J);
  815. if (hive_l4j == null) {
  816. hive_l4j = ExecDriver.class.getClassLoader().getResource(SessionState.HIVE_L4J);
  817. }
  818. if (hive_l4j != null) {
  819. // setting queryid so that log4j configuration can use it to generate
  820. // per query log file
  821. System.setProperty(HiveConf.ConfVars.HIVEQUERYID.toString(), HiveConf.getVar(conf,
  822. HiveConf.ConfVars.HIVEQUERYID));
  823. LogManager.resetConfiguration();
  824. PropertyConfigurator.configure(hive_l4j);
  825. }
  826. }
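// Child-JVM entry point. Recognized arguments (see printUsage): -plan <plan-file>, -jobconf k=v,
// -files <list>, -nolog and -localtask. An illustrative invocation (jar and plan locations are
// assumptions):
//   hadoop jar hive-exec.jar org.apache.hadoop.hive.ql.exec.ExecDriver -plan /tmp/plan.xml -jobconf mapred.job.tracker=local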
  827. public static void main(String[] args) throws IOException, HiveException {
  828. String planFileName = null;
  829. ArrayList<String> jobConfArgs = new ArrayList<String>();
  830. boolean noLog = false;
  831. String files = null;
  832. boolean localtask = false;
  833. try {
  834. for (int i = 0; i < args.length; i++) {
  835. if (args[i].equals("-plan")) {
  836. planFileName = args[++i];
  837. } else if (args[i].equals("-jobconf")) {
  838. jobConfArgs.add(args[++i]);
  839. } else if (args[i].equals("-nolog")) {
  840. noLog = true;
  841. } else if (args[i].equals("-files")) {
  842. files = args[++i];
  843. } else if (args[i].equals("-localtask")) {
  844. localtask = true;
  845. }
  846. }
  847. } catch (IndexOutOfBoundsException e) {
  848. System.err.println("Missing argument to option");
  849. printUsage();
  850. }
  851. JobConf conf;
  852. if (localtask) {
  853. conf = new JobConf(MapredLocalTask.class);
  854. } else {
  855. conf = new JobConf(ExecDriver.class);
  856. }
  857. StringBuilder sb = new StringBuilder("JobConf:\n");
  858. for (String one : jobConfArgs) {
  859. int eqIndex = one.indexOf('=');
  860. if (eqIndex != -1) {
  861. try {
  862. String key = one.substring(0, eqIndex);
  863. String value = URLDecoder.decode(one.substring(eqIndex + 1), "UTF-8");
  864. conf.set(key, value);
  865. sb.append(key).append("=").append(value).append("\n");
  866. } catch (UnsupportedEncodingException e) {
  867. System.err.println("Unexpected error " + e.getMessage() + " while encoding "
  868. + one.substring(eqIndex + 1));
  869. System.exit(3);
  870. }
  871. }
  872. }
  873. if (files != null) {
  874. conf.set("tmpfiles", files);
  875. }
  876. boolean isSilent = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESESSIONSILENT);
  877. if (noLog) {
  878. // If started from main(), and noLog is on, we should not output
  879. // any logs. To turn the log on, please set -Dtest.silent=false
  880. BasicConfigurator.resetConfiguration();
  881. BasicConfigurator.configure(new NullAppender());
  882. } else {
  883. setupChildLog4j(conf);
  884. }
  885. Log LOG = LogFactory.getLog(ExecDriver.class.getName());
  886. LogHelper console = new LogHelper(LOG, isSilent);
  887. if (planFileName == null) {
  888. console.printError("Must specify Plan File Name");
  889. printUsage();
  890. }
  891. // print out the location of the log file for the user so
  892. // that it's easy to find the reason for local-mode execution failures
  893. for (Appender appender : Collections.list((Enumeration<Appender>) LogManager.getRootLogger()
  894. .getAllAppenders())) {
  895. if (appender instanceof FileAppender) {
  896. console.printInfo("Execution log at: " + ((FileAppender) appender).getFile());
  897. }
  898. }
  899. // log the list of job conf parameters for reference
  900. LOG.info(sb.toString());
  901. // the plan file should always be in a local directory
  902. Path p = new Path(planFileName);
  903. FileSystem fs = FileSystem.getLocal(conf);
  904. InputStream pathData = fs.open(p);
  905. // this is a workaround for hadoop-17 - libjars are not added to the classpath of the
  906. // child process, so we add them here explicitly
  907. String auxJars = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS);
  908. String addedJars = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEADDEDJARS);
  909. try {
  910. // see also - code in CliDriver.java
  911. ClassLoader loader = conf.getClassLoader();
  912. if (StringUtils.isNotBlank(auxJars)) {
  913. loader = Utilities.addToClassPath(loader, StringUtils.split(auxJars, ","));
  914. }
  915. if (StringUtils.isNotBlank(addedJars)) {
  916. loader = Utilities.addToClassPath(loader, StringUtils.split(addedJars, ","));
  917. }
  918. conf.setClassLoader(loader);
  919. // Also set this to the Thread ContextClassLoader, so new threads will
  920. // inherit
  921. // this class loader, and propagate into newly created Configurations by
  922. // those
  923. // new threads.
  924. Thread.currentThread().setContextClassLoader(loader);
  925. } catch (Exception e) {
  926. throw new HiveException(e.getMessage(), e);
  927. }
  928. int ret;
  929. if (localtask) {
  930. memoryMXBean = ManagementFactory.getMemoryMXBean();
  931. MapredLocalWork plan = Utilities.deserializeMapRedLocalWork(pathData, conf);
  932. MapredLocalTask ed = new MapredLocalTask(plan, conf, isSilent);
  933. ret = ed.executeFromChildJVM(new DriverContext());
  934. } else {
  935. MapredWork plan = Utilities.deserializeMapRedWork(pathData, conf);
  936. ExecDriver ed = new ExecDriver(plan, conf, isSilent);
  937. ret = ed.execute(new DriverContext());
  938. }
  939. if (ret != 0) {
  940. System.exit(2);
  941. }
  942. }
  943. /**
  944. * Given a Hive Configuration object - generate a command line fragment for passing such
  945. * configuration information to ExecDriver.
  946. */
  947. public static String generateCmdLine(HiveConf hconf) {
  948. try {
  949. StringBuilder sb = new StringBuilder();
  950. Properties deltaP = hconf.getChangedProperties();
  951. boolean hadoopLocalMode = hconf.getVar(HiveConf.ConfVars.HADOOPJT).equals("local");
  952. String hadoopSysDir = "mapred.system.dir";
  953. String hadoopWorkDir = "mapred.local.dir";
  954. for (Object one : deltaP.keySet()) {
  955. String oneProp = (String) one;
  956. if (hadoopLocalMode && (oneProp.equals(hadoopSysDir) || oneProp.equals(hadoopWorkDir))) {
  957. continue;
  958. }
  959. String oneValue = deltaP.getProperty(oneProp);
  960. sb.append("-jobconf ");
  961. sb.append(oneProp);
  962. sb.append("=");
  963. sb.append(URLEncoder.encode(oneValue, "UTF-8"));
  964. sb.append(" ");
  965. }
  966. // Multiple concurrent local mode job submissions can cause collisions in
  967. // working dirs
  968. // Workaround is to rename map red working dir to a temp dir in such cases
  969. if (hadoopLocalMode) {
  970. sb.append("-jobconf ");
  971. sb.append(hadoopSysDir);
  972. sb.append("=");
  973. sb.append(URLEncoder.encode(hconf.get(hadoopSysDir) + "/" + Utilities.randGen.nextInt(),
  974. "UTF-8"));
  975. sb.append(" ");
  976. sb.append("-jobconf ");
  977. sb.append(hadoopWorkDir);
  978. sb.append("=");
  979. sb.append(URLEncoder.encode(hconf.get(hadoopWorkDir) + "/" + Utilities.randGen.nextInt(),
  980. "UTF-8"));
  981. }
  982. return sb.toString();
  983. } catch (UnsupportedEncodingException e) {
  984. throw new RuntimeException(e);
  985. }
  986. }
  987. @Override
  988. public boolean isMapRedTask() {
  989. return true;
  990. }
  991. @Override
  992. public boolean hasReduce() {
  993. MapredWork w = getWork();
  994. return w.getReducer() != null;
  995. }
  996. /**
  997. * Handle an empty/null path for a given alias.
  998. */
  999. private int addInputPath(String path, JobConf job, MapredWork work, String hiveScratchDir,
  1000. int numEmptyPaths, boolean isEmptyPath, String alias) throws Exception {
  1001. // either the directory does not exist or it is empty
  1002. assert path == null || isEmptyPath;
  1003. // The input file does not exist; replace it with an empty file
  1004. Class<? extends HiveOutputFormat> outFileFormat = null;
  1005. boolean nonNative = true;
  1006. if (isEmptyPath) {
  1007. PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
  1008. outFileFormat = partDesc.getOutputFileFormatClass();
  1009. nonNative = partDesc.getTableDesc().isNonNative();
  1010. } else {
  1011. TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
  1012. outFileFormat = tableDesc.getOutputFileFormatClass();
  1013. nonNative = tableDesc.isNonNative();
  1014. }
  1015. if (nonNative) {
  1016. FileInputFormat.addInputPaths(job, path);
  1017. LOG.info("Add a non-native table " + path);
  1018. return numEmptyPaths;
  1019. }
  1020. // create a dummy empty file in a new directory
  1021. String newDir = hiveScratchDir + File.separator + (++numEmptyPaths);
  1022. Path newPath = new Path(newDir);
  1023. FileSystem fs = newPath.getFileSystem(job);
  1024. fs.mkdirs(newPath);
  1025. // Qualify the path against the filesystem. The user-configured path might contain a default port,
  1026. // which is dropped in the file status. This makes sure that all paths that go into PathToPartitionInfo
  1027. // always match the listed-status file path.
  1028. newPath = fs.makeQualified(newPath);
  1029. String newFile = newDir + File.separator + "emptyFile";
  1030. Path newFilePath = new Path(newFile);
  1031. LOG.info("Changed input file to " + newPath.toString());
  1032. // rewrite the work so that the new empty path replaces the original one
  1033. LinkedHashMap<String, ArrayList<String>> pathToAliases = work.getPathToAliases();
  1034. if (isEmptyPath) {
  1035. assert path != null;
  1036. pathToAliases.put(newPath.toUri().toString(), pathToAliases.get(path));
  1037. pathToAliases.remove(path);
  1038. } else {
  1039. assert path == null;
  1040. ArrayList<String> newList = new ArrayList<String>();
  1041. newList.add(alias);
  1042. pathToAliases.put(newPath.toUri().toString(), newList);
  1043. }
  1044. work.setPathToAliases(pathToAliases);
  1045. LinkedHashMap<String, PartitionDesc> pathToPartitionInfo = work.getPathToPartitionInfo();
  1046. if (isEmptyPath) {
  1047. pathToPartitionInfo.put(newPath.toUri().toString(), pathToPartitionInfo.get(path));
  1048. pathToPartitionInfo.remove(path);
  1049. } else {
  1050. PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
  1051. pathToPartitionInfo.put(newPath.toUri().toString(), pDesc);
  1052. }
  1053. work.setPathToPartitionInfo(pathToPartitionInfo);
  1054. String onefile = newPath.toString();
  1055. RecordWriter recWriter = outFileFormat.newInstance().getHiveRecordWriter(job, newFilePath,
  1056. Text.class, false, new Properties(), null);
  1057. recWriter.close(false);
  1058. FileInputFormat.addInputPaths(job, onefile);
  1059. return numEmptyPaths;
  1060. }
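// Add every input directory referenced by the plan to the job. Directories that are missing or
// empty are replaced by a freshly created empty file (see addInputPath) so that the alias still
// contributes an input to the job.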
  1061. private void addInputPaths(JobConf job, MapredWork work, String hiveScratchDir) throws Exception {
  1062. int numEmptyPaths = 0;
  1063. List<String> pathsProcessed = new ArrayList<String>();
  1064. // AliasToWork contains all the aliases
  1065. for (String oneAlias : work.getAliasToWork().keySet()) {
  1066. LOG.info("Processing alias " + oneAlias);
  1067. List<String> emptyPaths = new ArrayList<String>();
  1068. // The alias may not have any path
  1069. String path = null;
  1070. for (String onefile : work.getPathToAliases().keySet()) {
  1071. List<String> aliases = work.getPathToAliases().get(onefile);
  1072. if (aliases.contains(oneAlias)) {
  1073. path = onefile;
  1074. // Multiple aliases can point to the same path - it should be
  1075. // processed only once
  1076. if (pathsProcessed.contains(path)) {
  1077. continue;
  1078. }
  1079. pathsProcessed.add(path);
  1080. LOG.info("Adding input file " + path);
  1081. Path dirPath = new Path(path);
  1082. if (!Utilities.isEmptyPath(job, dirPath)) {
  1083. FileInputFormat.addInputPath(job, dirPath);
  1084. } else {
  1085. emptyPaths.add(path);
  1086. }
  1087. }
  1088. }
  1089. // Create an empty file if the directory is empty
  1090. for (String emptyPath : emptyPaths) {
  1091. numEmptyPaths = addInputPath(emptyPath, job, work, hiveScratchDir, numEmptyPaths, true,
  1092. oneAlias);
  1093. }
  1094. // If the query references non-existent partitions,
  1095. // we need to add an empty file; it is not acceptable to change the
  1096. // operator tree.
  1097. // Consider the query:
  1098. // select * from (select count(1) from T union all select count(1) from
  1099. // T2) x;
  1100. // If T is empty and T2 contains 100 rows, the user expects: 0, 100 (2
  1101. // rows)
  1102. if (path == null) {
  1103. numEmptyPaths = addInputPath(null, job, work, hiveScratchDir, numEmptyPaths, false,
  1104. oneAlias);
  1105. }
  1106. }
  1107. }
  1108. @Override
  1109. public StageType getType() {
  1110. return StageType.MAPRED;
  1111. }
  1112. @Override
  1113. public String getName() {
  1114. return "MAPRED";
  1115. }
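// Rewrite any MR scratch-directory URIs referenced by the plan (input paths, map-local fetch
// directories and FileSink output directories) to their localized equivalents.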
  1116. @Override
  1117. protected void localizeMRTmpFilesImpl(Context ctx) {
  1118. // localize any map-reduce input paths
  1119. ctx.localizeKeys((Map<String, Object>) ((Object) work.getPathToAliases()));
  1120. ctx.localizeKeys((Map<String, Object>) ((Object) work.getPathToPartitionInfo()));
  1121. // localize any input paths for maplocal work
  1122. MapredLocalWork l = work.getMapLocalWork();
  1123. if (l != null) {
  1124. Map<String, FetchWork> m = l.getAliasToFetchWork();
  1125. if (m != null) {
  1126. for (FetchWork fw : m.values()) {
  1127. String s = fw.getTblDir();
  1128. if ((s != null) && ctx.isMRTmpFileURI(s)) {
  1129. fw.setTblDir(ctx.localizeMRTmpFileURI(s));
  1130. }
  1131. }
  1132. }
  1133. }
  1134. // fix up outputs
  1135. Map<String, ArrayList<String>> pa = work.getPathToAliases();
  1136. if (pa != null) {
  1137. for (List<String> ls : pa.values()) {
  1138. for (String a : ls) {
  1139. ArrayList<Operator<? extends Serializable>> opList = new ArrayList<Operator<? extends Serializable>>();
  1140. opList.add(work.getAliasToWork().get(a));
  1141. while (!opList.isEmpty()) {
  1142. Operator<? extends Serializable> op = opList.remove(0);
  1143. if (op instanceof FileSinkOperator) {
  1144. FileSinkDesc fdesc = ((FileSinkOperator) op).getConf();
  1145. String s = fdesc.getDirName();
  1146. if ((s != null) && ctx.isMRTmpFileURI(s)) {
  1147. fdesc.setDirName(ctx.localizeMRTmpFileURI(s));
  1148. }
  1149. ((FileSinkOperator) op).setConf(fdesc);
  1150. }
  1151. if (op.getChildOperators() != null) {
  1152. opList.addAll(op.getChildOperators());
  1153. }
  1154. }
  1155. }
  1156. }
  1157. }
  1158. }
  1159. }