
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.ConditionalTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.MapRedTask;
import org.apache.hadoop.hive.ql.exec.MoveTask;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRMapJoinCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.RowResolver;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory;
import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles;
import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx;
import org.apache.hadoop.hive.ql.plan.ConditionalWork;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExtractDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

/**
 * Processor for the rule - file sink operator encountered.
 */
public class GenMRFileSink1 implements NodeProcessor {

  static final private Log LOG = LogFactory.getLog(GenMRFileSink1.class.getName());

  public GenMRFileSink1() {
  }

  /**
   * File Sink Operator encountered.
   *
   * @param nd
   *          the file sink operator encountered
   * @param opProcCtx
   *          context
   */
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx,
      Object... nodeOutputs) throws SemanticException {
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    FileSinkOperator fsOp = (FileSinkOperator) nd;
    boolean isInsertTable = // is INSERT OVERWRITE TABLE
      fsOp.getConf().getTableInfo().getTableName() != null &&
      parseCtx.getQB().getParseInfo().isInsertToTable();
    HiveConf hconf = parseCtx.getConf();

    // Has the user enabled merging of files for map-only jobs or for all jobs?
    if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) {
      List<Task<? extends Serializable>> mvTasks = ctx.getMvTask();

      // In case of unions or map-joins, it is possible that the file has
      // already been seen, so there is no need to attempt to merge the
      // files again.
      if ((ctx.getSeenFileSinkOps() == null)
          || (!ctx.getSeenFileSinkOps().contains(nd))) {

        // no need to merge if the move is to a local file system
        MoveTask mvTask = (MoveTask) findMoveTask(mvTasks, fsOp);

        if (isInsertTable &&
            hconf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
          addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf());
        }

        if ((mvTask != null) && !mvTask.isLocal()) {
          // There are separate configuration parameters to control whether to
          // merge for a map-only job or for a map-reduce job.
          MapredWork currWork = (MapredWork) currTask.getWork();
          boolean mergeMapOnly =
            hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) &&
            currWork.getReducer() == null;
          boolean mergeMapRed =
            hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES) &&
            currWork.getReducer() != null;
          if (mergeMapOnly || mergeMapRed) {
            chDir = true;
          }
        }
      }
    }

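    // processFS redirects this file sink to a temporary directory when a merge
    // is needed (chDir == true) and returns the original destination directory.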
    String finalName = processFS(nd, stack, opProcCtx, chDir);

    // need to merge the files in the destination table/partitions
    if (chDir && (finalName != null)) {
      createMergeJob((FileSinkOperator) nd, ctx, finalName);
    }

    return null;
  }

  /**
   * Add the StatsTask as a dependent task of the MoveTask
   * because StatsTask will change the Table/Partition metadata. For atomicity, we
   * should not change the metadata before the data is actually put in place by the MoveTask.
   * @param nd the FileSinkOperator whose results are taken care of by the MoveTask.
   * @param mvTask The MoveTask that moves the FileSinkOperator's results.
   * @param currTask The MapRedTask that the FileSinkOperator belongs to.
   * @param hconf HiveConf
   */
  private void addStatsTask(FileSinkOperator nd, MoveTask mvTask,
      Task<? extends Serializable> currTask, HiveConf hconf) {

    MoveWork mvWork = ((MoveTask) mvTask).getWork();
    StatsWork statsWork = new StatsWork(mvWork.getLoadTableWork());
    MapredWork mrWork = (MapredWork) currTask.getWork();

    // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix
    // in FileSinkDesc is used for stats publishing. They should be consistent.
    statsWork.setAggKey(((FileSinkOperator) nd).getConf().getStatsAggPrefix());
    Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf);

    // mark the MapredWork and FileSinkOperator for gathering stats
    nd.getConf().setGatherStats(true);
    mrWork.setGatheringStats(true);
    // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName());

    // subscribe to feeds from the MoveTask so that the MoveTask can forward the
    // list of dynamic partitions to the StatsTask
    mvTask.addDependentTask(statsTask);
    statsTask.subscribeFeed(mvTask);
  }

  private void createMapReduce4Merge(FileSinkOperator fsOp, GenMRProcContext ctx, String finalName)
      throws SemanticException {
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    RowSchema inputRS = fsOp.getSchema();

    // key columns for the reduce sink - a single rand() expression
    ArrayList<ExprNodeDesc> keyCols = new ArrayList<ExprNodeDesc>();
    keyCols.add(TypeCheckProcFactory.DefaultExprProcessor
        .getFuncExprNodeDesc("rand"));

    // value is all the columns in the FileSink operator input
    ArrayList<ExprNodeDesc> valueCols = new ArrayList<ExprNodeDesc>();
    for (ColumnInfo ci : inputRS.getSignature()) {
      valueCols.add(new ExprNodeColumnDesc(ci.getType(), ci.getInternalName(),
          ci.getTabAlias(), ci.getIsVirtualCol()));
    }

    // create a dummy TableScan operator
    Operator<? extends Serializable> tsMerge = OperatorFactory.get(
        TableScanDesc.class, inputRS);

    ArrayList<String> outputColumns = new ArrayList<String>();
    for (int i = 0; i < valueCols.size(); i++) {
      outputColumns.add(SemanticAnalyzer.getColumnInternalName(i));
    }

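    // reduce sink descriptor: no key columns; all input columns are passed
    // through as values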
    ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(
        new ArrayList<ExprNodeDesc>(), valueCols, outputColumns, false, -1, -1,
        -1);
    OperatorFactory.getAndMakeChild(rsDesc, inputRS, tsMerge);
    ParseContext parseCtx = ctx.getParseCtx();
    FileSinkDesc fsConf = fsOp.getConf();

    // Add the extract operator to get the value fields
    RowResolver out_rwsch = new RowResolver();
    RowResolver interim_rwsch = ctx.getParseCtx().getOpParseCtx().get(fsOp).getRowResolver();
    Integer pos = Integer.valueOf(0);
    for (ColumnInfo colInfo : interim_rwsch.getColumnInfos()) {
      String[] info = interim_rwsch.reverseLookup(colInfo.getInternalName());
      out_rwsch.put(info[0], info[1], new ColumnInfo(pos.toString(), colInfo
          .getType(), info[0], colInfo.getIsVirtualCol(), colInfo.isHiddenVirtualCol()));
      pos = Integer.valueOf(pos.intValue() + 1);
    }

    Operator<ExtractDesc> extract = OperatorFactory.getAndMakeChild(new ExtractDesc(
        new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
            Utilities.ReduceField.VALUE.toString(), "", false)),
            new RowSchema(out_rwsch.getColumnInfos()));

    TableDesc ts = (TableDesc) fsConf.getTableInfo().clone();
    fsConf.getTableInfo().getProperties().remove(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);

    FileSinkDesc newFSD = new FileSinkDesc(finalName, ts, parseCtx.getConf()
        .getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
    FileSinkOperator newOutput = (FileSinkOperator) OperatorFactory.
      getAndMakeChild(newFSD, inputRS, extract);

    HiveConf conf = parseCtx.getConf();
    MapredWork cplan = createMergeTask(conf, tsMerge, fsConf);
    cplan.setReducer(extract);

    // NOTE: we should gather stats in MR1 (rather than the merge MR job)
    // since it is unknown if the merge MR will be triggered at execution time.

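    // dummy MoveWork: moves the merged output from the file sink's (temporary)
    // directory to the final destination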
    MoveWork dummyMv = new MoveWork(null, null, null,
        new LoadFileDesc(fsConf.getDirName(), finalName, true, null, null), false);

    ConditionalTask cndTsk = createCondTask(conf, currTask, dummyMv, cplan,
        fsConf.getDirName());

    LinkMoveTask(ctx, newOutput, cndTsk);
  }

  /**
   * Create a MapReduce job for a particular partition if the Hadoop version is pre 0.20,
   * otherwise create a map-only job using CombineHiveInputFormat for all partitions.
   * @param fsOp The FileSink operator.
   * @param ctx The MR processing context.
   * @param finalName the final destination path the merge job should output.
   * @throws SemanticException
   */
  private void createMergeJob(FileSinkOperator fsOp, GenMRProcContext ctx, String finalName)
      throws SemanticException {

    // if the Hadoop version supports CombineFileInputFormat (version >= 0.20),
    // create a map-only job for the merge, otherwise create a MapReduce merge job.
    ParseContext parseCtx = ctx.getParseCtx();
    HiveConf conf = parseCtx.getConf();
    if (conf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPONLY) &&
        Utilities.supportCombineFileInputFormat()) {
      // create map-only merge job
      createMap4Merge(fsOp, ctx, finalName);
      LOG.info("use CombineHiveInputformat for the merge job");
    } else {
      createMapReduce4Merge(fsOp, ctx, finalName);
      LOG.info("use HiveInputFormat for the merge job");
    }
  }

  /**
   * Create a map-only merge job with the following operators:
   * @param fsInput
   * @param ctx
   * @param finalName
   *
   *  MR job J0:
   *          ...
   *              |
   *              v
   *         FileSinkOperator_1 (fsInput)
   *             |
   *             v
   *  Merge job J1:
   *             |
   *             v
   *         TableScan (using CombineHiveInputFormat) (tsMerge)
   *             |
   *             v
   *         FileSinkOperator (fsMerge)
   *
   * Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths do
   * not contain the dynamic partitions (their parent). So after the dynamic partitions are
   * created (after the first job finishes and before the moveTask or ConditionalTask starts),
   * we need to change the pathToPartitionInfo & pathToAlias to include the dynamic partition
   * directories.
   *
   */
  private void createMap4Merge(FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) {

    //
    // 1. create the operator tree
    //
    ParseContext parseCtx = ctx.getParseCtx();
    FileSinkDesc fsInputDesc = fsInput.getConf();

    // Create a TableScan operator
    RowSchema inputRS = fsInput.getSchema();
    Operator<? extends Serializable> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

    // Create a FileSink operator
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
        parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(
        fsOutputDesc, inputRS, tsMerge);

    // If the input FileSinkOperator is dynamic-partition enabled, the tsMerge input schema
    // needs to include the partition columns, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
      // adding DP ColumnInfo to the RowSchema signature
      ArrayList<ColumnInfo> signature = inputRS.getSignature();
      String tblAlias = fsInputDesc.getTableInfo().getTableName();
      LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
      StringBuilder partCols = new StringBuilder();
      for (String dpCol : dpCtx.getDPColNames()) {
        ColumnInfo colInfo = new ColumnInfo(dpCol,
            TypeInfoFactory.stringTypeInfo, // all partition column types should be string
            tblAlias, true); // partition column is a virtual column
        signature.add(colInfo);
        colMap.put(dpCol, dpCol); // input and output have the same column name
        partCols.append(dpCol).append('/');
      }
      partCols.setLength(partCols.length() - 1); // remove the last '/'
      inputRS.setSignature(signature);

      // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
      DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
      dpCtx2.setInputToDPCols(colMap);
      fsOutputDesc.setDynPartCtx(dpCtx2);

      // update the FileSinkOperator to include partition columns
      fsInputDesc.getTableInfo().getProperties().setProperty(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS,
        partCols.toString()); // list of dynamic partition column names
    } else {
      // non-partitioned table
      fsInputDesc.getTableInfo().getProperties().remove(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    }

    //
    // 2. Construct a conditional task consisting of a move task and a map-reduce task
    //
    MapRedTask currTask = (MapRedTask) ctx.getCurrTask();
    MoveWork dummyMv = new MoveWork(null, null, null,
        new LoadFileDesc(fsInputDesc.getDirName(), finalName, true, null, null), false);
    MapredWork cplan = createMergeTask(ctx.getConf(), tsMerge, fsInputDesc);
    // use CombineHiveInputFormat for map-only merging
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than the merge job MR2, since we don't
    // know whether MR2 will be triggered at execution time
    ConditionalTask cndTsk = createCondTask(ctx.getConf(), ctx.getCurrTask(), dummyMv, cplan,
        fsInputDesc.getDirName());

    // keep the dynamic partition context in the conditional task's resolver context
    ConditionalResolverMergeFilesCtx mrCtx =
      (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());

    //
    // 3. make the moveTask a dependent of every branch of the conditional task
    //
    LinkMoveTask(ctx, fsOutput, cndTsk);
  }

  private void LinkMoveTask(GenMRProcContext ctx, FileSinkOperator newOutput,
      ConditionalTask cndTsk) {

    List<Task<? extends Serializable>> mvTasks = ctx.getMvTask();
    Task<? extends Serializable> mvTask = findMoveTask(mvTasks, newOutput);

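    // make the move task a dependent of every branch of the conditional task,
    // so the final move runs whether or not the merge job was chosen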
    if (mvTask != null) {
      for (Task<? extends Serializable> tsk : cndTsk.getListTasks()) {
        tsk.addDependentTask(mvTask);
      }
    }
  }

  /**
   * Create a MapredWork based on the input path, the top operator and the input
   * table descriptor.
   * @param conf
   * @param topOp the table scan operator that is the root of the MapReduce task.
   * @param fsDesc the file sink descriptor that serves as the input to this merge task.
   * @return the MapredWork
   */
  private MapredWork createMergeTask(HiveConf conf, Operator<? extends Serializable> topOp,
      FileSinkDesc fsDesc) {

    ArrayList<String> aliases = new ArrayList<String>();
    String inputDir = fsDesc.getDirName();
    TableDesc tblDesc = fsDesc.getTableInfo();
    aliases.add(inputDir); // dummy alias: just use the input path

    // constructing the default MapredWork
    MapredWork cplan = GenMapRedUtils.getMapRedWork(conf);
    cplan.getPathToAliases().put(inputDir, aliases);
    cplan.getPathToPartitionInfo().put(inputDir, new PartitionDesc(tblDesc, null));
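    // map-only job: no reduce phase, and a single mapper is not allowed to span
    // multiple partitions of the input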
    cplan.setNumReduceTasks(0);
    cplan.getAliasToWork().put(inputDir, topOp);
    cplan.setMapperCannotSpanPartns(true);

    return cplan;
  }

  /**
   * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
   * @param conf HiveConf
   * @param currTask current leaf task
   * @param mvWork MoveWork for the move task
   * @param mergeWork MapredWork for the merge task.
   * @param inputPath the input directory of the merge/move task
   * @return The conditional task
   */
  private ConditionalTask createCondTask(HiveConf conf,
      Task<? extends Serializable> currTask, MoveWork mvWork,
      MapredWork mergeWork, String inputPath) {

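    // the conditional task holds two alternatives: move the existing files as
    // they are, or run the merge job first; the resolver picks one at run time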
    Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> moveTask = TaskFactory.get(mvWork, conf);
    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(mvWork);
    listWorks.add(mergeWork);

    ConditionalWork cndWork = new ConditionalWork(listWorks);

    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(moveTask);
    listTasks.add(mergeTask);

    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
    cndTsk.setListTasks(listTasks);

    // create resolver
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    ConditionalResolverMergeFilesCtx mrCtx =
      new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
    cndTsk.setResolverCtx(mrCtx);

    // make the conditional task a child of the current leaf task
    currTask.addDependentTask(cndTsk);

    return cndTsk;
  }

  private Task<? extends Serializable> findMoveTask(
      List<Task<? extends Serializable>> mvTasks, FileSinkOperator fsOp) {
    // find the move task
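    // a move task matches this file sink when its load work's source directory
    // equals the file sink's output directory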
    for (Task<? extends Serializable> mvTsk : mvTasks) {
      MoveWork mvWork = (MoveWork) mvTsk.getWork();
      String srcDir = null;
      if (mvWork.getLoadFileWork() != null) {
        srcDir = mvWork.getLoadFileWork().getSourceDir();
      } else if (mvWork.getLoadTableWork() != null) {
        srcDir = mvWork.getLoadTableWork().getSourceDir();
      }

      if ((srcDir != null)
          && (srcDir.equalsIgnoreCase(fsOp.getConf().getDirName()))) {
        return mvTsk;
      }
    }
    return null;
  }

  /**
   * Process the FileSink operator to generate a MoveTask if necessary.
   * @param nd current FileSink operator
   * @param stack parent operators
   * @param opProcCtx
   * @param chDir whether the operator's output should first go to a tmp dir and then be
   *        merged into the final dir later
   * @return the final destination the FileSinkOperator should store its output to.
   * @throws SemanticException
   */
  private String processFS(Node nd, Stack<Node> stack,
      NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException {

    // Is this the dummy file sink after the map-join?
    FileSinkOperator fsOp = (FileSinkOperator) nd;
    if ((fsOp.getParentOperators().size() == 1)
        && (fsOp.getParentOperators().get(0) instanceof MapJoinOperator)) {
      return null;
    }

    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps();
    if (seenFSOps == null) {
      seenFSOps = new ArrayList<FileSinkOperator>();
    }
    if (!seenFSOps.contains(fsOp)) {
      seenFSOps.add(fsOp);
    }
    ctx.setSeenFileSinkOps(seenFSOps);

    Task<? extends Serializable> currTask = ctx.getCurrTask();

    // If the directory needs to be changed, redirect the output to a new directory
    String dest = null;

    if (chDir) {
      dest = fsOp.getConf().getDirName();

      // generate the temporary file;
      // it must be on the same file system as the current destination
      ParseContext parseCtx = ctx.getParseCtx();
      Context baseCtx = parseCtx.getContext();
      String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri());

      fsOp.getConf().setDirName(tmpDir);
    }

    Task<? extends Serializable> mvTask = null;

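    // when the output is not redirected to a tmp dir, the existing move task
    // (if any) is made a dependent of the current task below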
    if (!chDir) {
      mvTask = findMoveTask(ctx.getMvTask(), fsOp);
    }

    Operator<? extends Serializable> currTopOp = ctx.getCurrTopOp();
    String currAliasId = ctx.getCurrAliasId();
    HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
      ctx.getOpTaskMap();
    List<Operator<? extends Serializable>> seenOps = ctx.getSeenOps();
    List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks();

    // Set the move task to be dependent on the current task
    if (mvTask != null) {
      currTask.addDependentTask(mvTask);
    }

    // In case of a multi-table insert, the path-to-alias mapping is needed for
    // all the sources. Since there is no reducer, treat it as a plan with a null
    // reducer. If it is a map-only job, the task needs to be processed.
    if (currTopOp != null) {
      Task<? extends Serializable> mapTask = opTaskMap.get(null);
      if (mapTask == null) {
        assert (!seenOps.contains(currTopOp));
        seenOps.add(currTopOp);
        GenMapRedUtils.setTaskPlan(currAliasId, currTopOp,
            (MapredWork) currTask.getWork(), false, ctx);
        opTaskMap.put(null, currTask);
        rootTasks.add(currTask);
      } else {
        if (!seenOps.contains(currTopOp)) {
          seenOps.add(currTopOp);
          GenMapRedUtils.setTaskPlan(currAliasId, currTopOp,
              (MapredWork) mapTask.getWork(), false, ctx);
        }
        // mapTask and currTask should have been merged by a join/union operator
        // (e.g., GenMRUnion1) which has multiple topOps.
        assert mapTask == currTask : "mapTask.id = " + mapTask.getId()
            + "; currTask.id = " + currTask.getId();
      }

      return dest;
    }

    UnionOperator currUnionOp = ctx.getCurrUnionOp();

    if (currUnionOp != null) {
      opTaskMap.put(null, currTask);
      GenMapRedUtils.initUnionPlan(ctx, currTask, false);
      return dest;
    }

    AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = ctx.getCurrMapJoinOp();

    if (currMapJoinOp != null) {
      opTaskMap.put(null, currTask);
      GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(currMapJoinOp);
      MapredWork plan = (MapredWork) currTask.getWork();

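      // register the map-join's task-local tmp directory as an input path/alias
      // of the current plan, rooted at the map-join's root operator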
      String taskTmpDir = mjCtx.getTaskTmpDir();
      TableDesc tt_desc = mjCtx.getTTDesc();
      assert plan.getPathToAliases().get(taskTmpDir) == null;
      plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>());
      plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir);
      plan.getPathToPartitionInfo().put(taskTmpDir,
          new PartitionDesc(tt_desc, null));
      plan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp());
      return dest;
    }

    return dest;
  }
}