
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java

   1/**
   2 * Licensed to the Apache Software Foundation (ASF) under one
   3 * or more contributor license agreements.  See the NOTICE file
   4 * distributed with this work for additional information
   5 * regarding copyright ownership.  The ASF licenses this file
   6 * to you under the Apache License, Version 2.0 (the
   7 * "License"); you may not use this file except in compliance
   8 * with the License.  You may obtain a copy of the License at
   9 *
  10 *     http://www.apache.org/licenses/LICENSE-2.0
  11 *
  12 * Unless required by applicable law or agreed to in writing, software
  13 * distributed under the License is distributed on an "AS IS" BASIS,
  14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 * See the License for the specific language governing permissions and
  16 * limitations under the License.
  17 */
  18
  19package org.apache.hadoop.hive.ql.exec;
  20
  21import java.io.File;
  22import java.io.IOException;
  23import java.io.InputStream;
  24import java.io.Serializable;
  25import java.io.UnsupportedEncodingException;
  26import java.lang.management.ManagementFactory;
  27import java.lang.management.MemoryMXBean;
  28import java.net.URL;
  29import java.net.URLDecoder;
  30import java.net.URLEncoder;
  31import java.text.SimpleDateFormat;
  32import java.util.ArrayList;
  33import java.util.Calendar;
  34import java.util.Collections;
  35import java.util.Enumeration;
  36import java.util.HashMap;
  37import java.util.HashSet;
  38import java.util.LinkedHashMap;
  39import java.util.List;
  40import java.util.Map;
  41import java.util.Properties;
  42import java.util.Set;
  43
  44import org.apache.commons.lang.StringUtils;
  45import org.apache.commons.logging.Log;
  46import org.apache.commons.logging.LogFactory;
  47import org.apache.hadoop.conf.Configuration;
  48import org.apache.hadoop.filecache.DistributedCache;
  49import org.apache.hadoop.fs.FileStatus;
  50import org.apache.hadoop.fs.FileSystem;
  51import org.apache.hadoop.fs.Path;
  52import org.apache.hadoop.hive.common.FileUtils;
  53import org.apache.hadoop.hive.conf.HiveConf;
  54import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
  55import org.apache.hadoop.hive.ql.Context;
  56import org.apache.hadoop.hive.ql.DriverContext;
  57import org.apache.hadoop.hive.ql.QueryPlan;
  58import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
  59import org.apache.hadoop.hive.ql.exec.Operator.ProgressCounter;
  60import org.apache.hadoop.hive.ql.exec.errors.ErrorAndSolution;
  61import org.apache.hadoop.hive.ql.exec.errors.TaskLogProcessor;
  62import org.apache.hadoop.hive.ql.history.HiveHistory.Keys;
  63import org.apache.hadoop.hive.ql.io.HiveKey;
  64import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
  65import org.apache.hadoop.hive.ql.io.IOPrepareCache;
  66import org.apache.hadoop.hive.ql.metadata.HiveException;
  67import org.apache.hadoop.hive.ql.plan.FetchWork;
  68import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
  69import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
  70import org.apache.hadoop.hive.ql.plan.MapredWork;
  71import org.apache.hadoop.hive.ql.plan.PartitionDesc;
  72import org.apache.hadoop.hive.ql.plan.TableDesc;
  73import org.apache.hadoop.hive.ql.plan.api.StageType;
  74import org.apache.hadoop.hive.ql.session.SessionState;
  75import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
  76import org.apache.hadoop.hive.ql.stats.StatsFactory;
  77import org.apache.hadoop.hive.ql.stats.StatsPublisher;
  78import org.apache.hadoop.hive.shims.ShimLoader;
  79import org.apache.hadoop.io.BytesWritable;
  80import org.apache.hadoop.io.Text;
  81import org.apache.hadoop.mapred.Counters;
  82import org.apache.hadoop.mapred.FileInputFormat;
  83import org.apache.hadoop.mapred.InputFormat;
  84import org.apache.hadoop.mapred.JobClient;
  85import org.apache.hadoop.mapred.JobConf;
  86import org.apache.hadoop.mapred.Partitioner;
  87import org.apache.hadoop.mapred.RunningJob;
  88import org.apache.hadoop.mapred.TaskCompletionEvent;
  89import org.apache.log4j.Appender;
  90import org.apache.log4j.BasicConfigurator;
  91import org.apache.log4j.FileAppender;
  92import org.apache.log4j.LogManager;
  93import org.apache.log4j.PropertyConfigurator;
  94import org.apache.log4j.varia.NullAppender;
  95
  96/**
  97 * ExecDriver.
  98 *
  99 */
 100public class ExecDriver extends Task<MapredWork> implements Serializable {
 101
 102  private static final long serialVersionUID = 1L;
 103
 104  protected transient JobConf job;
 105  protected transient int mapProgress = 0;
 106  protected transient int reduceProgress = 0;
 107  public transient String jobId;
 108
 109  public String getJobId() {
 110    return jobId;
 111  }
 112
 113  public void setJobId(String jobId) {
 114    this.jobId = jobId;
 115  }
 116
 117  public static MemoryMXBean memoryMXBean;
 118
 119  /**
 120   * Constructor when invoked from QL.
 121   */
 122  public ExecDriver() {
 123    super();
 124  }
 125
 126  protected static String getResourceFiles(Configuration conf, SessionState.ResourceType t) {
 127    // fill in local files to be added to the task environment
 128    SessionState ss = SessionState.get();
 129    Set<String> files = (ss == null) ? null : ss.list_resource(t, null);
 130    if (files != null) {
 131      List<String> realFiles = new ArrayList<String>(files.size());
 132      for (String one : files) {
 133        try {
 134          realFiles.add(Utilities.realFile(one, conf));
 135        } catch (IOException e) {
  136          throw new RuntimeException("Cannot validate file " + one + " due to exception: "
 137              + e.getMessage(), e);
 138        }
 139      }
 140      return StringUtils.join(realFiles, ",");
 141    } else {
 142      return "";
 143    }
 144  }
 145
 146  private void initializeFiles(String prop, String files) {
 147    if (files != null && files.length() > 0) {
 148      job.set(prop, files);
 149      ShimLoader.getHadoopShims().setTmpFiles(prop, files);
 150    }
 151  }
 152
 153  /**
 154   * Initialization when invoked from QL.
 155   */
 156  @Override
 157  public void initialize(HiveConf conf, QueryPlan queryPlan, DriverContext driverContext) {
 158    super.initialize(conf, queryPlan, driverContext);
 159
 160    job = new JobConf(conf, ExecDriver.class);
 161
 162    // NOTE: initialize is only called if it is in non-local mode.
  163    // In non-local mode, we need to move the SessionState files
  164    // and jars to the jobConf.
  165    // In local mode, MapRedTask will set the jobConf.
 166    //
 167    // "tmpfiles" and "tmpjars" are set by the method ExecDriver.execute(),
 168    // which will be called by both local and NON-local mode.
 169    String addedFiles = getResourceFiles(job, SessionState.ResourceType.FILE);
 170    if (StringUtils.isNotBlank(addedFiles)) {
 171      HiveConf.setVar(job, ConfVars.HIVEADDEDFILES, addedFiles);
 172    }
 173    String addedJars = getResourceFiles(job, SessionState.ResourceType.JAR);
 174    if (StringUtils.isNotBlank(addedJars)) {
 175      HiveConf.setVar(job, ConfVars.HIVEADDEDJARS, addedJars);
 176    }
 177    String addedArchives = getResourceFiles(job, SessionState.ResourceType.ARCHIVE);
 178    if (StringUtils.isNotBlank(addedArchives)) {
 179      HiveConf.setVar(job, ConfVars.HIVEADDEDARCHIVES, addedArchives);
 180    }
 181  }
 182
 183  /**
 184   * Constructor/Initialization for invocation as independent utility.
 185   */
 186  public ExecDriver(MapredWork plan, JobConf job, boolean isSilent) throws HiveException {
 187    setWork(plan);
 188    this.job = job;
 189    LOG = LogFactory.getLog(this.getClass().getName());
 190    console = new LogHelper(LOG, isSilent);
 191  }
 192
 193  /**
  194   * A map from job ID to kill URI for the jobs spawned by this Hive instance, used to kill all
 195   * running jobs in the event of an unexpected shutdown - i.e., the JVM shuts down while there are
 196   * still jobs running.
 197   */
 198  private static Map<String, String> runningJobKillURIs = Collections
 199      .synchronizedMap(new HashMap<String, String>());
 200
 201  /**
 202   * In Hive, when the user control-c's the command line, any running jobs spawned from that command
 203   * line are best-effort killed.
 204   *
  205   * This static initializer registers a shutdown hook to iterate over all the running job kill
  206   * URLs and issue an HTTP POST to each.
 207   *
 208   */
 209  static {
 210    if (new org.apache.hadoop.conf.Configuration()
 211        .getBoolean("webinterface.private.actions", false)) {
 212      Runtime.getRuntime().addShutdownHook(new Thread() {
 213        @Override
 214        public void run() {
 215          synchronized (runningJobKillURIs) {
 216            for (String uri : runningJobKillURIs.values()) {
 217              try {
 218                System.err.println("killing job with: " + uri);
 219                java.net.HttpURLConnection conn = (java.net.HttpURLConnection) new java.net.URL(uri)
 220                    .openConnection();
 221                conn.setRequestMethod("POST");
 222                int retCode = conn.getResponseCode();
 223                if (retCode != 200) {
 224                  System.err.println("Got an error trying to kill job with URI: " + uri + " = "
 225                      + retCode);
 226                }
 227              } catch (Exception e) {
 228                System.err.println("trying to kill job, caught: " + e);
 229                // do nothing
 230              }
 231            }
 232          }
 233        }
 234      });
 235    }
 236  }
 237
 238  /**
 239   * from StreamJob.java.
 240   */
 241  private void jobInfo(RunningJob rj) {
 242    if (job.get("mapred.job.tracker", "local").equals("local")) {
 243      console.printInfo("Job running in-process (local Hadoop)");
 244    } else {
 245      String hp = job.get("mapred.job.tracker");
 246      if (SessionState.get() != null) {
 247        SessionState.get().getHiveHistory().setTaskProperty(SessionState.get().getQueryId(),
 248            getId(), Keys.TASK_HADOOP_ID, rj.getJobID());
 249      }
 250      console.printInfo(ExecDriver.getJobStartMsg(rj.getJobID()) + ", Tracking URL = "
 251          + rj.getTrackingURL());
 252      console.printInfo("Kill Command = " + HiveConf.getVar(job, HiveConf.ConfVars.HADOOPBIN)
 253          + " job  -Dmapred.job.tracker=" + hp + " -kill " + rj.getJobID());
 254    }
 255  }
 256
 257  /**
  258   * This class contains the state of the running task. Going forward, we will return this handle
  259   * from execute and Driver can split execute into start, monitorProgress and postProcess.
 260   */
 261  private static class ExecDriverTaskHandle extends TaskHandle {
 262    JobClient jc;
 263    RunningJob rj;
 264
 265    JobClient getJobClient() {
 266      return jc;
 267    }
 268
 269    RunningJob getRunningJob() {
 270      return rj;
 271    }
 272
 273    public ExecDriverTaskHandle(JobClient jc, RunningJob rj) {
 274      this.jc = jc;
 275      this.rj = rj;
 276    }
 277
 278    public void setRunningJob(RunningJob job) {
 279      rj = job;
 280    }
 281
 282    @Override
 283    public Counters getCounters() throws IOException {
 284      return rj.getCounters();
 285    }
 286  }
 287
 288  /**
 289   * Fatal errors are those errors that cannot be recovered by retries. These are application
  290   * dependent. An example of a fatal error is a small table in a map-side join that is too
  291   * large to be handled by a single mapper. The job should fail and the user should be
 292   * warned to use regular joins rather than map-side joins. Fatal errors are indicated by counters
 293   * that are set at execution time. If the counter is non-zero, a fatal error occurred. The value
 294   * of the counter indicates the error type.
 295   *
 296   * @return true if fatal errors happened during job execution, false otherwise.
 297   */
 298  private boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) {
 299    if (ctrs == null) {
 300      // hadoop might return null if it cannot locate the job.
 301      // we may still be able to retrieve the job status - so ignore
 302      return false;
 303    }
 304    // check for number of created files
 305    long numFiles = ctrs.getCounter(ProgressCounter.CREATED_FILES);
 306    long upperLimit = HiveConf.getLongVar(job, HiveConf.ConfVars.MAXCREATEDFILES);
 307    if (numFiles > upperLimit) {
 308      errMsg.append("total number of created files exceeds ").append(upperLimit);
 309      return true;
 310    }
 311
 312    for (Operator<? extends Serializable> op : work.getAliasToWork().values()) {
 313      if (op.checkFatalErrors(ctrs, errMsg)) {
 314        return true;
 315      }
 316    }
 317    if (work.getReducer() != null) {
 318      if (work.getReducer().checkFatalErrors(ctrs, errMsg)) {
 319        return true;
 320      }
 321    }
 322    return false;
 323  }
 324
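       /**
        * Poll the running job until it completes: sleep between counter pulls, refresh the
        * RunningJob handle on every iteration, kill the job as soon as a fatal-error counter is
        * seen, and periodically report map/reduce progress to the console and the Hive history.
        *
        * @return true if the job completed successfully and no fatal errors were detected.
        */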
 325  private boolean progress(ExecDriverTaskHandle th) throws IOException {
 326    JobClient jc = th.getJobClient();
 327    RunningJob rj = th.getRunningJob();
 328    String lastReport = "";
 329    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
 330    long reportTime = System.currentTimeMillis();
 331    long maxReportInterval = 60 * 1000; // One minute
 332    boolean fatal = false;
 333    StringBuilder errMsg = new StringBuilder();
 334    long pullInterval = HiveConf.getLongVar(job, HiveConf.ConfVars.HIVECOUNTERSPULLINTERVAL);
 335    boolean initializing = true;
 336    while (!rj.isComplete()) {
 337      try {
 338        Thread.sleep(pullInterval);
 339      } catch (InterruptedException e) {
 340      }
 341
 342      if (initializing && ShimLoader.getHadoopShims().isJobPreparing(rj)) {
  343        // No reason to poll until the job is initialized
 344        continue;
 345      } else {
 346        // By now the job is initialized so no reason to do
 347        // rj.getJobState() again and we do not want to do an extra RPC call
 348        initializing = false;
 349      }
 350
 351      RunningJob newRj = jc.getJob(rj.getJobID());
 352      if (newRj == null) {
 353        // under exceptional load, hadoop may not be able to look up status
 354        // of finished jobs (because it has purged them from memory). From
 355        // hive's perspective - it's equivalent to the job having failed.
 356        // So raise a meaningful exception
  357        throw new IOException("Could not find status of job: " + rj.getJobID());
 358      } else {
 359        th.setRunningJob(newRj);
 360        rj = newRj;
 361      }
 362
 363      // If fatal errors happen we should kill the job immediately rather than
  364      // let the job retry several times, which eventually leads to failure.
 365      if (fatal) {
 366        continue; // wait until rj.isComplete
 367      }
 368
 369      Counters ctrs = th.getCounters();
 370
 371      if (fatal = checkFatalErrors(ctrs, errMsg)) {
 372        console.printError("[Fatal Error] " + errMsg.toString() + ". Killing the job.");
 373        rj.killJob();
 374        continue;
 375      }
 376      errMsg.setLength(0);
 377
 378      updateCounters(ctrs, rj);
 379
 380      String report = " " + getId() + " map = " + mapProgress + "%,  reduce = " + reduceProgress
 381          + "%";
 382
 383      if (!report.equals(lastReport)
 384          || System.currentTimeMillis() >= reportTime + maxReportInterval) {
 385
 386        // write out serialized plan with counters to log file
 387        // LOG.info(queryPlan);
 388        String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
 389        SessionState ss = SessionState.get();
 390        if (ss != null) {
 391          ss.getHiveHistory().setTaskCounters(SessionState.get().getQueryId(), getId(), ctrs);
 392          ss.getHiveHistory().setTaskProperty(SessionState.get().getQueryId(), getId(),
 393              Keys.TASK_HADOOP_PROGRESS, output);
 394          ss.getHiveHistory().progressTask(SessionState.get().getQueryId(), this);
 395          ss.getHiveHistory().logPlanProgress(queryPlan);
 396        }
 397        console.printInfo(output);
 398        lastReport = report;
 399        reportTime = System.currentTimeMillis();
 400      }
 401    }
 402
 403    boolean success;
 404    Counters ctrs = th.getCounters();
 405
 406    if (fatal) {
 407      success = false;
 408    } else {
 409      // check for fatal error again in case it occurred after
 410      // the last check before the job is completed
 411      if (checkFatalErrors(ctrs, errMsg)) {
 412        console.printError("[Fatal Error] " + errMsg.toString());
 413        success = false;
 414      } else {
 415        success = rj.isSuccessful();
 416      }
 417    }
 418
 419    setDone();
 420    // update based on the final value of the counters
 421    updateCounters(ctrs, rj);
 422
 423    SessionState ss = SessionState.get();
 424    if (ss != null) {
 425      ss.getHiveHistory().logPlanProgress(queryPlan);
 426    }
 427    // LOG.info(queryPlan);
 428    return (success);
 429  }
 430
 431  /**
 432   * Update counters relevant to this task.
 433   */
 434  private void updateCounters(Counters ctrs, RunningJob rj) throws IOException {
 435    mapProgress = Math.round(rj.mapProgress() * 100);
 436    reduceProgress = Math.round(rj.reduceProgress() * 100);
 437    taskCounters.put("CNTR_NAME_" + getId() + "_MAP_PROGRESS", Long.valueOf(mapProgress));
 438    taskCounters.put("CNTR_NAME_" + getId() + "_REDUCE_PROGRESS", Long.valueOf(reduceProgress));
 439    if (ctrs == null) {
 440      // hadoop might return null if it cannot locate the job.
 441      // we may still be able to retrieve the job status - so ignore
 442      return;
 443    }
 444    for (Operator<? extends Serializable> op : work.getAliasToWork().values()) {
 445      op.updateCounters(ctrs);
 446    }
 447    if (work.getReducer() != null) {
 448      work.getReducer().updateCounters(ctrs);
 449    }
 450  }
 451
 452  public boolean mapStarted() {
 453    return mapProgress > 0;
 454  }
 455
 456  public boolean reduceStarted() {
 457    return reduceProgress > 0;
 458  }
 459
 460  public boolean mapDone() {
 461    return mapProgress == 100;
 462  }
 463
 464  public boolean reduceDone() {
 465    return reduceProgress == 100;
 466  }
 467
 468  /**
 469   * Execute a query plan using Hadoop.
 470   */
 471  @Override
 472  public int execute(DriverContext driverContext) {
 473
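         // Overall flow: create a scratch dir, configure the JobConf (mapper, reducer, input
         // format, added files/jars/archives), ship any map-join hash tables through the
         // DistributedCache, submit the job, monitor it to completion, and finally run jobClose
         // on all operators.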
 474    IOPrepareCache ioPrepareCache = IOPrepareCache.get();
 475    ioPrepareCache.clear();
 476
 477    boolean success = true;
 478
 479    String invalidReason = work.isInvalid();
 480    if (invalidReason != null) {
 481      throw new RuntimeException("Plan invalid, Reason: " + invalidReason);
 482    }
 483
 484    Context ctx = driverContext.getCtx();
 485    boolean ctxCreated = false;
 486    String emptyScratchDirStr;
 487    Path emptyScratchDir;
 488
 489    try {
 490      if (ctx == null) {
 491        ctx = new Context(job);
 492        ctxCreated = true;
 493      }
 494
 495      emptyScratchDirStr = ctx.getMRTmpFileURI();
 496      emptyScratchDir = new Path(emptyScratchDirStr);
 497      FileSystem fs = emptyScratchDir.getFileSystem(job);
 498      fs.mkdirs(emptyScratchDir);
 499    } catch (IOException e) {
 500      e.printStackTrace();
 501      console.printError("Error launching map-reduce job", "\n"
 502          + org.apache.hadoop.util.StringUtils.stringifyException(e));
 503      return 5;
 504    }
 505
 506    ShimLoader.getHadoopShims().setNullOutputFormat(job);
 507    job.setMapperClass(ExecMapper.class);
 508
 509    job.setMapOutputKeyClass(HiveKey.class);
 510    job.setMapOutputValueClass(BytesWritable.class);
 511
 512    try {
 513      job.setPartitionerClass((Class<? extends Partitioner>) (Class.forName(HiveConf.getVar(job,
 514          HiveConf.ConfVars.HIVEPARTITIONER))));
 515    } catch (ClassNotFoundException e) {
 516      throw new RuntimeException(e.getMessage());
 517    }
 518
 519    if (work.getNumMapTasks() != null) {
 520      job.setNumMapTasks(work.getNumMapTasks().intValue());
 521    }
 522    if (work.getMinSplitSize() != null) {
 523      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, work.getMinSplitSize().longValue());
 524    }
 525    job.setNumReduceTasks(work.getNumReduceTasks().intValue());
 526    job.setReducerClass(ExecReducer.class);
 527
 528    if (work.getInputformat() != null) {
 529      HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, work.getInputformat());
 530    }
 531
 532    // Turn on speculative execution for reducers
 533    boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job,
 534        HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
 535    HiveConf.setBoolVar(job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS,
 536        useSpeculativeExecReducers);
 537
 538    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
  539    if (StringUtils.isBlank(inpFormat)) {
 540      inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
 541    }
 542
 543    LOG.info("Using " + inpFormat);
 544
 545    try {
 546      job.setInputFormat((Class<? extends InputFormat>) (Class.forName(inpFormat)));
 547    } catch (ClassNotFoundException e) {
 548      throw new RuntimeException(e.getMessage());
 549    }
 550
 551
 552    // No-Op - we don't really write anything here ..
 553    job.setOutputKeyClass(Text.class);
 554    job.setOutputValueClass(Text.class);
 555
 556    // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands
 557    // it
 558    String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS);
 559    String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS);
 560    if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) {
 561      String allJars = StringUtils.isNotBlank(auxJars) ? (StringUtils.isNotBlank(addedJars) ? addedJars
 562          + "," + auxJars
 563          : auxJars)
 564          : addedJars;
 565      LOG.info("adding libjars: " + allJars);
 566      initializeFiles("tmpjars", allJars);
 567    }
 568
 569    // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it
 570    String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES);
 571    if (StringUtils.isNotBlank(addedFiles)) {
 572      initializeFiles("tmpfiles", addedFiles);
 573    }
 574    int returnVal = 0;
 575    RunningJob rj = null;
 576    boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME));
 577
 578    if (noName) {
 579      // This is for a special case to ensure unit tests pass
 580      HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt());
 581    }
 582    String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES);
 583    // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it
 584    if (StringUtils.isNotBlank(addedArchives)) {
 585      initializeFiles("tmparchives", addedArchives);
 586    }
 587
  588    try {
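           // For map-join local work running against a real cluster, package the locally
           // generated hash table files into a tar archive, upload it to HDFS with the configured
           // submit replication, and register it with the DistributedCache so every task can read it.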
 589      MapredLocalWork localwork = work.getMapLocalWork();
 590      if (localwork != null) {
 591        boolean localMode = HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJT).equals("local");
 592        if (!localMode) {
 593          Path localPath = new Path(localwork.getTmpFileURI());
 594          Path hdfsPath = new Path(work.getTmpHDFSFileURI());
 595
 596          FileSystem hdfs = hdfsPath.getFileSystem(job);
 597          FileSystem localFS = localPath.getFileSystem(job);
 598          FileStatus[] hashtableFiles = localFS.listStatus(localPath);
 599          int fileNumber = hashtableFiles.length;
 600          String[] fileNames = new String[fileNumber];
 601
  602          for (int i = 0; i < fileNumber; i++) {
 603            fileNames[i] = hashtableFiles[i].getPath().getName();
 604          }
 605
 606          //package and compress all the hashtable files to an archive file
 607          String parentDir = localPath.toUri().getPath();
 608          String stageId = this.getId();
 609          String archiveFileURI = Utilities.generateTarURI(parentDir, stageId);
 610          String archiveFileName = Utilities.generateTarFileName(stageId);
 611          localwork.setStageID(stageId);
 612
  613          FileUtils.tar(parentDir, fileNames, archiveFileName);
 614          Path archivePath = new Path(archiveFileURI);
  615          LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archiveFileURI);
 616
 617          //upload archive file to hdfs
  618          String hdfsFile = Utilities.generateTarURI(hdfsPath, stageId);
 619          Path hdfsFilePath = new Path(hdfsFile);
 620          short replication = (short) job.getInt("mapred.submit.replication", 10);
 621          hdfs.setReplication(hdfsFilePath, replication);
 622          hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
  623          LOG.info("Upload 1 archive file from " + archivePath + " to: " + hdfsFilePath);
 624
 625          //add the archive file to distributed cache
 626          DistributedCache.createSymlink(job);
 627          DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
 628          LOG.info("Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
 629        }
 630      }
 631
 632      addInputPaths(job, work, emptyScratchDirStr);
 633
 634      Utilities.setMapRedWork(job, work, ctx.getMRTmpFileURI());
  635      // remove the pwd from the conf file so that the job tracker doesn't show
  636      // it in its logs
 637      String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
 638      if (pwd != null) {
 639        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
 640      }
 641      JobClient jc = new JobClient(job);
 642
  643      // make this client wait if the job tracker is not behaving well.
 644      Throttle.checkJobTracker(job, LOG);
 645
 646      if (work.isGatheringStats()) {
 647        // initialize stats publishing table
 648        StatsPublisher statsPublisher;
 649        String statsImplementationClass = HiveConf.getVar(job, HiveConf.ConfVars.HIVESTATSDBCLASS);
 650        if (StatsFactory.setImplementation(statsImplementationClass, job)) {
 651          statsPublisher = StatsFactory.getStatsPublisher();
 652          statsPublisher.init(job); // creating stats table if not exists
 653        }
 654      }
 655
 656      // Finally SUBMIT the JOB!
 657      rj = jc.submitJob(job);
 658
 659      jobId = rj.getJobID();
 660
 661      // replace it back
 662      if (pwd != null) {
 663        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd);
 664      }
 665
 666      // add to list of running jobs to kill in case of abnormal shutdown
 667
 668      runningJobKillURIs.put(rj.getJobID(), rj.getTrackingURL() + "&action=kill");
 669
 670      ExecDriverTaskHandle th = new ExecDriverTaskHandle(jc, rj);
 671      jobInfo(rj);
 672      success = progress(th);
 673
 674      String statusMesg = getJobEndMsg(rj.getJobID());
 675      if (!success) {
 676        statusMesg += " with errors";
 677        returnVal = 2;
 678        console.printError(statusMesg);
 679        if (HiveConf.getBoolVar(job, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) {
 680          showJobFailDebugInfo(job, rj);
 681        }
 682      } else {
 683        console.printInfo(statusMesg);
 684      }
 685
 686
 687    } catch (Exception e) {
 688      e.printStackTrace();
 689      String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
 690      if (rj != null) {
 691        mesg = "Ended Job = " + rj.getJobID() + mesg;
 692      } else {
 693        mesg = "Job Submission failed" + mesg;
 694      }
 695
 696      // Has to use full name to make sure it does not conflict with
 697      // org.apache.commons.lang.StringUtils
 698      console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
 699
 700      success = false;
 701      returnVal = 1;
 702    } finally {
 703      Utilities.clearMapRedWork(job);
 704      try {
 705        if (ctxCreated) {
 706          ctx.clear();
 707        }
 708
 709        if (rj != null) {
 710          if (returnVal != 0) {
 711            rj.killJob();
 712          }
 713          runningJobKillURIs.remove(rj.getJobID());
 714        }
 715      } catch (Exception e) {
 716      }
 717    }
 718
 719    // get the list of Dynamic partition paths
 720    try {
 721      if (rj != null) {
 722        JobCloseFeedBack feedBack = new JobCloseFeedBack();
 723        if (work.getAliasToWork() != null) {
 724          for (Operator<? extends Serializable> op : work.getAliasToWork().values()) {
 725            op.jobClose(job, success, feedBack);
 726          }
 727        }
 728        if (work.getReducer() != null) {
 729          work.getReducer().jobClose(job, success, feedBack);
 730        }
 731      }
 732    } catch (Exception e) {
 733      // jobClose needs to execute successfully otherwise fail task
 734      if (success) {
 735        success = false;
 736        returnVal = 3;
 737        String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
 738        console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
 739      }
 740    }
 741
 742    return (returnVal);
 743  }
 744
 745  /**
 746   * This msg pattern is used to track when a job is started.
 747   *
 748   * @param jobId
 749   * @return
 750   */
 751  private static String getJobStartMsg(String jobId) {
 752    return "Starting Job = " + jobId;
 753  }
 754
 755  /**
  756   * This msg pattern is used to track when a job is successfully done.
 757   *
 758   * @param jobId
 759   * @return
 760   */
 761  public static String getJobEndMsg(String jobId) {
 762    return "Ended Job = " + jobId;
 763  }
 764
 765  private String getTaskAttemptLogUrl(String taskTrackerHttpAddress, String taskAttemptId) {
 766    return taskTrackerHttpAddress + "/tasklog?taskid=" + taskAttemptId + "&all=true";
 767  }
 768
 769  // Used for showJobFailDebugInfo
 770  private static class TaskInfo {
 771    String jobId;
 772    HashSet<String> logUrls;
 773
 774    public TaskInfo(String jobId) {
 775      this.jobId = jobId;
 776      logUrls = new HashSet<String>();
 777    }
 778
 779    public void addLogUrl(String logUrl) {
 780      logUrls.add(logUrl);
 781    }
 782
 783    public HashSet<String> getLogUrls() {
 784      return logUrls;
 785    }
 786
 787    public String getJobId() {
 788      return jobId;
 789    }
 790  }
 791
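       /**
        * Walk through all task completion events of the job, count failures per task while
        * discarding tasks that eventually succeeded, and print the task ID, job tracker URL and
        * any errors/solutions extracted from the task logs for one task with the most failures.
        */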
 792  @SuppressWarnings("deprecation")
 793  private void showJobFailDebugInfo(JobConf conf, RunningJob rj) throws IOException {
 794    // Mapping from task ID to the number of failures
 795    Map<String, Integer> failures = new HashMap<String, Integer>();
 796    // Successful task ID's
 797    Set<String> successes = new HashSet<String>();
 798
 799    Map<String, TaskInfo> taskIdToInfo = new HashMap<String, TaskInfo>();
 800
 801    int startIndex = 0;
 802
 803    // Loop to get all task completion events because getTaskCompletionEvents
 804    // only returns a subset per call
 805    while (true) {
 806      TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex);
 807
 808      if (taskCompletions == null || taskCompletions.length == 0) {
 809        break;
 810      }
 811
 812      boolean more = true;
 813      for (TaskCompletionEvent t : taskCompletions) {
 814        // getTaskJobIDs returns Strings for compatibility with Hadoop versions
 815        // without TaskID or TaskAttemptID
 816        String[] taskJobIds = ShimLoader.getHadoopShims().getTaskJobIDs(t);
 817
 818        if (taskJobIds == null) {
 819          console.printError("Task attempt info is unavailable in this Hadoop version");
 820          more = false;
 821          break;
 822        }
 823
 824        // For each task completion event, get the associated task id, job id
 825        // and the logs
 826        String taskId = taskJobIds[0];
 827        String jobId = taskJobIds[1];
 828
 829        TaskInfo ti = taskIdToInfo.get(taskId);
 830        if (ti == null) {
 831          ti = new TaskInfo(jobId);
 832          taskIdToInfo.put(taskId, ti);
 833        }
 834        // These tasks should have come from the same job.
  835        assert ti.getJobId().equals(jobId);
 836        ti.getLogUrls().add(getTaskAttemptLogUrl(t.getTaskTrackerHttp(), t.getTaskId()));
 837
 838        // If a task failed, then keep track of the total number of failures
 839        // for that task (typically, a task gets re-run up to 4 times if it
  840        // fails)
 841
 842        if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) {
 843          Integer failAttempts = failures.get(taskId);
 844          if (failAttempts == null) {
 845            failAttempts = Integer.valueOf(0);
 846          }
 847          failAttempts = Integer.valueOf(failAttempts.intValue() + 1);
 848          failures.put(taskId, failAttempts);
 849        } else {
 850          successes.add(taskId);
 851        }
 852      }
 853      if (!more) {
 854        break;
 855      }
 856      startIndex += taskCompletions.length;
 857    }
 858    // Remove failures for tasks that succeeded
 859    for (String task : successes) {
 860      failures.remove(task);
 861    }
 862
 863    if (failures.keySet().size() == 0) {
 864      return;
 865    }
 866
 867    // Find the highest failure count
 868    int maxFailures = 0;
 869    for (Integer failCount : failures.values()) {
 870      if (maxFailures < failCount.intValue()) {
 871        maxFailures = failCount.intValue();
 872      }
 873    }
 874
 875    // Display Error Message for tasks with the highest failure count
 876    String jtUrl = JobTrackerURLResolver.getURL(conf);
 877
 878    for (String task : failures.keySet()) {
 879      if (failures.get(task).intValue() == maxFailures) {
 880        TaskInfo ti = taskIdToInfo.get(task);
 881        String jobId = ti.getJobId();
 882        String taskUrl = jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString();
 883
 884        TaskLogProcessor tlp = new TaskLogProcessor(conf);
 885        for (String logUrl : ti.getLogUrls()) {
 886          tlp.addTaskAttemptLogUrl(logUrl);
 887        }
 888
 889        List<ErrorAndSolution> errors = tlp.getErrors();
 890
 891        StringBuilder sb = new StringBuilder();
 892        // We use a StringBuilder and then call printError only once as
 893        // printError will write to both stderr and the error log file. In
 894        // situations where both the stderr and the log file output is
 895        // simultaneously output to a single stream, this will look cleaner.
 896        sb.append("\n");
  897        sb.append("Task with the most failures (" + maxFailures + "): \n");
 898        sb.append("-----\n");
 899        sb.append("Task ID:\n  " + task + "\n\n");
 900        sb.append("URL:\n  " + taskUrl + "\n");
 901
 902        for (ErrorAndSolution e : errors) {
 903          sb.append("\n");
 904          sb.append("Possible error:\n  " + e.getError() + "\n\n");
 905          sb.append("Solution:\n  " + e.getSolution() + "\n");
 906        }
 907        sb.append("-----\n");
 908
 909        console.printError(sb.toString());
 910
 911        // Only print out one task because that's good enough for debugging.
 912        break;
 913      }
 914    }
 915    return;
 916
 917  }
 918
 919  private static void printUsage() {
 920    System.err.println("ExecDriver -plan <plan-file> [-jobconf k1=v1 [-jobconf k2=v2] ...] "
 921        + "[-files <file1>[,<file2>] ...]");
 922    System.exit(1);
 923  }
 924
 925  /**
  926   * We are running the hadoop job via a sub-command. This typically happens when we are running
  927   * jobs in local mode. The log4j in this mode is controlled as follows: 1. if the admin provides a
  928   * log4j properties file specifically for execution mode, we pick that up; 2. otherwise, we
  929   * default to the regular hive log4j properties if one is supplied; 3. if neither of the above
  930   * applies, we do nothing and the log4j properties are likely determined by hadoop.
 931   *
 932   * The intention behind providing a separate option #1 is to be able to collect hive run time logs
 933   * generated in local mode in a separate (centralized) location if desired. This mimics the
 934   * behavior of hive run time logs when running against a hadoop cluster where they are available
 935   * on the tasktracker nodes.
 936   */
 937
 938  private static void setupChildLog4j(Configuration conf) {
 939    URL hive_l4j = ExecDriver.class.getClassLoader().getResource(SessionState.HIVE_EXEC_L4J);
 940    if (hive_l4j == null) {
 941      hive_l4j = ExecDriver.class.getClassLoader().getResource(SessionState.HIVE_L4J);
 942    }
 943
 944    if (hive_l4j != null) {
 945      // setting queryid so that log4j configuration can use it to generate
 946      // per query log file
 947      System.setProperty(HiveConf.ConfVars.HIVEQUERYID.toString(), HiveConf.getVar(conf,
 948          HiveConf.ConfVars.HIVEQUERYID));
 949      LogManager.resetConfiguration();
 950      PropertyConfigurator.configure(hive_l4j);
 951    }
 952  }
 953
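       /**
        * Entry point when the map-reduce work runs in a separate JVM (typically local mode).
        * Parses -plan, -jobconf, -files, -nolog and -localtask, configures log4j, deserializes
        * the plan file and then runs either a MapredLocalTask or this ExecDriver, exiting with a
        * non-zero status on failure.
        */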
 954  public static void main(String[] args) throws IOException, HiveException {
 955
 956    String planFileName = null;
 957    ArrayList<String> jobConfArgs = new ArrayList<String>();
 958    boolean noLog = false;
 959    String files = null;
 960    boolean localtask = false;
 961    try {
 962      for (int i = 0; i < args.length; i++) {
 963        if (args[i].equals("-plan")) {
 964          planFileName = args[++i];
 965        } else if (args[i].equals("-jobconf")) {
 966          jobConfArgs.add(args[++i]);
 967        } else if (args[i].equals("-nolog")) {
 968          noLog = true;
 969        } else if (args[i].equals("-files")) {
 970          files = args[++i];
 971        } else if (args[i].equals("-localtask")) {
 972          localtask = true;
 973        }
 974      }
 975    } catch (IndexOutOfBoundsException e) {
 976      System.err.println("Missing argument to option");
 977      printUsage();
 978    }
 979
 980    JobConf conf;
 981    if (localtask) {
 982      conf = new JobConf(MapredLocalTask.class);
 983    } else {
 984      conf = new JobConf(ExecDriver.class);
 985    }
 986    StringBuilder sb = new StringBuilder("JobConf:\n");
 987
 988    for (String one : jobConfArgs) {
 989      int eqIndex = one.indexOf('=');
 990      if (eqIndex != -1) {
 991        try {
 992          String key = one.substring(0, eqIndex);
 993          String value = URLDecoder.decode(one.substring(eqIndex + 1), "UTF-8");
 994          conf.set(key, value);
 995          sb.append(key).append("=").append(value).append("\n");
 996        } catch (UnsupportedEncodingException e) {
  997          System.err.println("Unexpected error " + e.getMessage() + " while decoding "
 998              + one.substring(eqIndex + 1));
 999          System.exit(3);
1000        }
1001      }
1002    }
1003
1004    if (files != null) {
1005      conf.set("tmpfiles", files);
1006    }
1007
1008    boolean isSilent = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESESSIONSILENT);
1009
1010    if (noLog) {
1011      // If started from main(), and noLog is on, we should not output
1012      // any logs. To turn the log on, please set -Dtest.silent=false
1013      BasicConfigurator.resetConfiguration();
1014      BasicConfigurator.configure(new NullAppender());
1015    } else {
1016      setupChildLog4j(conf);
1017    }
1018
1019    Log LOG = LogFactory.getLog(ExecDriver.class.getName());
1020    LogHelper console = new LogHelper(LOG, isSilent);
1021
1022    if (planFileName == null) {
1023      console.printError("Must specify Plan File Name");
1024      printUsage();
1025    }
1026
1027    // print out the location of the log file for the user so
 1028    // that it's easy to find the reason for local mode execution failures
1029    for (Appender appender : Collections.list((Enumeration<Appender>) LogManager.getRootLogger()
1030        .getAllAppenders())) {
1031      if (appender instanceof FileAppender) {
1032        console.printInfo("Execution log at: " + ((FileAppender) appender).getFile());
1033      }
1034    }
1035
1036    // log the list of job conf parameters for reference
1037    LOG.info(sb.toString());
1038
 1039    // the plan file should always be in the local directory
1040    Path p = new Path(planFileName);
1041    FileSystem fs = FileSystem.getLocal(conf);
1042    InputStream pathData = fs.open(p);
1043
 1044    // this is a workaround for HADOOP-17 - libjars are not added to the classpath of the
 1045    // child process, so we add them here explicitly
1046
1047    String auxJars = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS);
1048    String addedJars = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEADDEDJARS);
1049    try {
1050      // see also - code in CliDriver.java
1051      ClassLoader loader = conf.getClassLoader();
1052      if (StringUtils.isNotBlank(auxJars)) {
1053        loader = Utilities.addToClassPath(loader, StringUtils.split(auxJars, ","));
1054      }
1055      if (StringUtils.isNotBlank(addedJars)) {
1056        loader = Utilities.addToClassPath(loader, StringUtils.split(addedJars, ","));
1057      }
1058      conf.setClassLoader(loader);
 1059      // Also set this as the thread context class loader, so new threads will inherit
 1060      // this class loader and propagate it into newly created Configurations by those
 1061      // new threads.
1064      Thread.currentThread().setContextClassLoader(loader);
1065    } catch (Exception e) {
1066      throw new HiveException(e.getMessage(), e);
1067    }
1068    int ret;
1069    if (localtask) {
1070      memoryMXBean = ManagementFactory.getMemoryMXBean();
1071      MapredLocalWork plan = Utilities.deserializeMapRedLocalWork(pathData, conf);
1072      MapredLocalTask ed = new MapredLocalTask(plan, conf, isSilent);
1073      ret = ed.executeFromChildJVM(new DriverContext());
1074
1075    } else {
1076      MapredWork plan = Utilities.deserializeMapRedWork(pathData, conf);
1077      ExecDriver ed = new ExecDriver(plan, conf, isSilent);
1078      ret = ed.execute(new DriverContext());
1079    }
1080
1081    if (ret != 0) {
1082      System.exit(2);
1083    }
1084  }
1085
1086  /**
1087   * Given a Hive Configuration object - generate a command line fragment for passing such
1088   * configuration information to ExecDriver.
1089   */
1090  public static String generateCmdLine(HiveConf hconf) {
1091    try {
1092      StringBuilder sb = new StringBuilder();
1093      Properties deltaP = hconf.getChangedProperties();
1094      boolean hadoopLocalMode = hconf.getVar(HiveConf.ConfVars.HADOOPJT).equals("local");
1095      String hadoopSysDir = "mapred.system.dir";
1096      String hadoopWorkDir = "mapred.local.dir";
1097
1098      for (Object one : deltaP.keySet()) {
1099        String oneProp = (String) one;
1100
1101        if (hadoopLocalMode && (oneProp.equals(hadoopSysDir) || oneProp.equals(hadoopWorkDir))) {
1102          continue;
1103        }
1104
1105        String oneValue = deltaP.getProperty(oneProp);
1106
1107        sb.append("-jobconf ");
1108        sb.append(oneProp);
1109        sb.append("=");
1110        sb.append(URLEncoder.encode(oneValue, "UTF-8"));
1111        sb.append(" ");
1112      }
1113
 1114      // Multiple concurrent local mode job submissions can cause collisions in
 1115      // working dirs. The workaround is to rename the map-red working dir to a
 1116      // temp dir in such cases.
1117
1118      if (hadoopLocalMode) {
1119        sb.append("-jobconf ");
1120        sb.append(hadoopSysDir);
1121        sb.append("=");
1122        sb.append(URLEncoder.encode(hconf.get(hadoopSysDir) + "/" + Utilities.randGen.nextInt(),
1123            "UTF-8"));
1124
1125        sb.append(" ");
1126        sb.append("-jobconf ");
1127        sb.append(hadoopWorkDir);
1128        sb.append("=");
1129        sb.append(URLEncoder.encode(hconf.get(hadoopWorkDir) + "/" + Utilities.randGen.nextInt(),
1130            "UTF-8"));
1131      }
1132
1133      return sb.toString();
1134    } catch (UnsupportedEncodingException e) {
1135      throw new RuntimeException(e);
1136    }
1137  }
1138
1139  @Override
1140  public boolean isMapRedTask() {
1141    return true;
1142  }
1143
1144  @Override
1145  public boolean hasReduce() {
1146    MapredWork w = getWork();
1147    return w.getReducer() != null;
1148  }
1149
1150  /**
 1151   * Handle an empty/null path for a given alias.
1152   */
1153  private int addInputPath(String path, JobConf job, MapredWork work, String hiveScratchDir,
1154      int numEmptyPaths, boolean isEmptyPath, String alias) throws Exception {
1155    // either the directory does not exist or it is empty
1156    assert path == null || isEmptyPath;
1157
 1158    // The input file does not exist, so replace it with an empty file
1159    Class<? extends HiveOutputFormat> outFileFormat = null;
1160    boolean nonNative = true;
1161    if (isEmptyPath) {
1162      PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
1163      outFileFormat = partDesc.getOutputFileFormatClass();
1164      nonNative = partDesc.getTableDesc().isNonNative();
1165    } else {
1166      TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
1167      outFileFormat = tableDesc.getOutputFileFormatClass();
1168      nonNative = tableDesc.isNonNative();
1169    }
1170
1171    if (nonNative) {
1172      FileInputFormat.addInputPaths(job, path);
1173      LOG.info("Add a non-native table " + path);
1174      return numEmptyPaths;
1175    }
1176
1177    // create a dummy empty file in a new directory
1178    String newDir = hiveScratchDir + File.separator + (++numEmptyPaths);
1179    Path newPath = new Path(newDir);
1180    FileSystem fs = newPath.getFileSystem(job);
1181    fs.mkdirs(newPath);
 1182    // Qualify the path against the filesystem. The user-configured path might contain the default port,
 1183    // which is dropped in the FileStatus. This makes sure that all paths that go into pathToPartitionInfo
 1184    // match the listed-status file path.
1185    newPath = fs.makeQualified(newPath);
1186    String newFile = newDir + File.separator + "emptyFile";
1187    Path newFilePath = new Path(newFile);
1188
1189    LOG.info("Changed input file to " + newPath.toString());
1190
 1191    // update the work so it points at the new (empty) input location
1192
1193    LinkedHashMap<String, ArrayList<String>> pathToAliases = work.getPathToAliases();
1194
1195    if (isEmptyPath) {
1196      assert path != null;
1197      pathToAliases.put(newPath.toUri().toString(), pathToAliases.get(path));
1198      pathToAliases.remove(path);
1199    } else {
1200      assert path == null;
1201      ArrayList<String> newList = new ArrayList<String>();
1202      newList.add(alias);
1203      pathToAliases.put(newPath.toUri().toString(), newList);
1204    }
1205
1206    work.setPathToAliases(pathToAliases);
1207
1208    LinkedHashMap<String, PartitionDesc> pathToPartitionInfo = work.getPathToPartitionInfo();
1209    if (isEmptyPath) {
1210      pathToPartitionInfo.put(newPath.toUri().toString(), pathToPartitionInfo.get(path));
1211      pathToPartitionInfo.remove(path);
1212    } else {
1213      PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
1214      pathToPartitionInfo.put(newPath.toUri().toString(), pDesc);
1215    }
1216    work.setPathToPartitionInfo(pathToPartitionInfo);
1217
1218    String onefile = newPath.toString();
1219    RecordWriter recWriter = outFileFormat.newInstance().getHiveRecordWriter(job, newFilePath,
1220        Text.class, false, new Properties(), null);
1221    recWriter.close(false);
1222    FileInputFormat.addInputPaths(job, onefile);
1223    return numEmptyPaths;
1224  }
1225
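       /**
        * Add the input directories for every alias in the plan to the job. Directories that are
        * empty, and aliases that have no path at all, are replaced by a dummy empty file (via
        * addInputPath) so the operator tree still sees the alias and produces the expected rows.
        */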
1226  private void addInputPaths(JobConf job, MapredWork work, String hiveScratchDir) throws Exception {
1227    int numEmptyPaths = 0;
1228
1229    List<String> pathsProcessed = new ArrayList<String>();
1230
1231    // AliasToWork contains all the aliases
1232    for (String oneAlias : work.getAliasToWork().keySet()) {
1233      LOG.info("Processing alias " + oneAlias);
1234      List<String> emptyPaths = new ArrayList<String>();
1235
1236      // The alias may not have any path
1237      String path = null;
1238      for (String onefile : work.getPathToAliases().keySet()) {
1239        List<String> aliases = work.getPathToAliases().get(onefile);
1240        if (aliases.contains(oneAlias)) {
1241          path = onefile;
1242
1243          // Multiple aliases can point to the same path - it should be
1244          // processed only once
1245          if (pathsProcessed.contains(path)) {
1246            continue;
1247          }
1248          pathsProcessed.add(path);
1249
1250          LOG.info("Adding input file " + path);
1251
1252          Path dirPath = new Path(path);
1253          if (!Utilities.isEmptyPath(job, dirPath)) {
1254            FileInputFormat.addInputPath(job, dirPath);
1255          } else {
1256            emptyPaths.add(path);
1257          }
1258        }
1259      }
1260
 1261      // Create an empty file if the directory is empty
1262      for (String emptyPath : emptyPaths) {
1263        numEmptyPaths = addInputPath(emptyPath, job, work, hiveScratchDir, numEmptyPaths, true,
1264            oneAlias);
1265      }
1266
 1267      // If the query references non-existent partitions,
 1268      // we need to add an empty file; it is not acceptable to change the
1269      // operator tree
1270      // Consider the query:
1271      // select * from (select count(1) from T union all select count(1) from
1272      // T2) x;
1273      // If T is empty and T2 contains 100 rows, the user expects: 0, 100 (2
1274      // rows)
1275      if (path == null) {
1276        numEmptyPaths = addInputPath(null, job, work, hiveScratchDir, numEmptyPaths, false,
1277            oneAlias);
1278      }
1279    }
1280  }
1281
1282  @Override
1283  public StageType getType() {
1284    return StageType.MAPRED;
1285  }
1286
1287  @Override
1288  public String getName() {
1289    return "MAPRED";
1290  }
1291
1292  @Override
1293  protected void localizeMRTmpFilesImpl(Context ctx) {
1294
1295    // localize any map-reduce input paths
1296    ctx.localizeKeys((Map<String, Object>) ((Object) work.getPathToAliases()));
1297    ctx.localizeKeys((Map<String, Object>) ((Object) work.getPathToPartitionInfo()));
1298
1299    // localize any input paths for maplocal work
1300    MapredLocalWork l = work.getMapLocalWork();
1301    if (l != null) {
1302      Map<String, FetchWork> m = l.getAliasToFetchWork();
1303      if (m != null) {
1304        for (FetchWork fw : m.values()) {
1305          String s = fw.getTblDir();
1306          if ((s != null) && ctx.isMRTmpFileURI(s)) {
1307            fw.setTblDir(ctx.localizeMRTmpFileURI(s));
1308          }
1309        }
1310      }
1311    }
1312
1313    // fix up outputs
1314    Map<String, ArrayList<String>> pa = work.getPathToAliases();
1315    if (pa != null) {
1316      for (List<String> ls : pa.values()) {
1317        for (String a : ls) {
1318          ArrayList<Operator<? extends Serializable>> opList = new ArrayList<Operator<? extends Serializable>>();
1319          opList.add(work.getAliasToWork().get(a));
1320
1321          while (!opList.isEmpty()) {
1322            Operator<? extends Serializable> op = opList.remove(0);
1323
1324            if (op instanceof FileSinkOperator) {
1325              FileSinkDesc fdesc = ((FileSinkOperator) op).getConf();
1326              String s = fdesc.getDirName();
1327              if ((s != null) && ctx.isMRTmpFileURI(s)) {
1328                fdesc.setDirName(ctx.localizeMRTmpFileURI(s));
1329              }
1330              ((FileSinkOperator) op).setConf(fdesc);
1331            }
1332
1333            if (op.getChildOperators() != null) {
1334              opList.addAll(op.getChildOperators());
1335            }
1336          }
1337        }
1338      }
1339    }
1340  }
1341}