
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.lineage;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.ForwardOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.hooks.LineageInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyType;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.TableAliasInfo;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Utils;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;

/**
 * Operator factory for the rule processors for lineage.
 */
public class OpProcFactory {

  /**
   * Returns the parent operator in the walk path to the current operator.
   *
   * @param stack The stack encoding the path.
   *
   * @return Operator The parent operator in the current path.
   */
  protected static Operator<? extends Serializable> getParent(Stack<Node> stack) {
    return (Operator<? extends Serializable>) Utils.getNthAncestor(stack, 1);
  }

  /**
   * Processor for Script and UDTF Operators.
   */
  public static class TransformLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;

      // The operators
      Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;
      Operator<? extends Serializable> inpOp = getParent(stack);

      // Create a single dependency list by concatenating the dependencies of
      // all the input columns.
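      // Since the external script or UDTF is a black box, every output column
      // is treated as depending on every input column.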
      Dependency dep = new Dependency();
      DependencyType new_type = LineageInfo.DependencyType.SCRIPT;
      // TODO: Fix this to a non null value.
      dep.setExpr(null);

      LinkedHashSet<BaseColumnInfo> col_set = new LinkedHashSet<BaseColumnInfo>();
      for (ColumnInfo ci : inpOp.getSchema().getSignature()) {
        Dependency d = lCtx.getIndex().getDependency(inpOp, ci);
        if (d != null) {
          new_type = LineageCtx.getNewDependencyType(d.getType(), new_type);
          col_set.addAll(d.getBaseCols());
        }
      }

      dep.setType(new_type);
      dep.setBaseCols(new ArrayList<BaseColumnInfo>(col_set));

      // This dependency is then set for all the column infos of the script operator.
      for (ColumnInfo ci : op.getSchema().getSignature()) {
        lCtx.getIndex().putDependency(op, ci, dep);
      }

      return null;
    }

  }

  /**
   * Processor for TableScan Operator. This creates the base column mappings.
   */
  public static class TableScanLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      ParseContext pctx = lCtx.getParseCtx();

      // Table scan operator.
      TableScanOperator top = (TableScanOperator) nd;
      org.apache.hadoop.hive.ql.metadata.Table t = pctx.getTopToTable().get(top);
      Table tab = t.getTTable();

      // Generate the mappings
      RowSchema rs = top.getSchema();
      List<FieldSchema> cols = t.getAllCols();
      Map<String, FieldSchema> fieldSchemaMap = new HashMap<String, FieldSchema>();
      for (FieldSchema col : cols) {
        fieldSchemaMap.put(col.getName(), col);
      }

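      // Register the virtual columns (e.g. INPUT__FILE__NAME) as synthetic
      // FieldSchemas as well, so that lineage can be reported for them too.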
      Iterator<VirtualColumn> vcs = VirtualColumn.registry.values().iterator();
      while (vcs.hasNext()) {
        VirtualColumn vc = vcs.next();
        fieldSchemaMap.put(vc.getName(), new FieldSchema(vc.getName(),
            vc.getTypeInfo().getTypeName(), ""));
      }

      TableAliasInfo tai = new TableAliasInfo();
      tai.setAlias(top.getConf().getAlias());
      tai.setTable(tab);
      for (ColumnInfo ci : rs.getSignature()) {
        // Create a dependency
        Dependency dep = new Dependency();
        BaseColumnInfo bci = new BaseColumnInfo();
        bci.setTabAlias(tai);
        bci.setColumn(fieldSchemaMap.get(ci.getInternalName()));

        // Populate the dependency
        dep.setType(LineageInfo.DependencyType.SIMPLE);
        // TODO: Find out how to get the expression here.
        dep.setExpr(null);
        dep.setBaseCols(new ArrayList<BaseColumnInfo>());
        dep.getBaseCols().add(bci);

        // Put the dependency in the map
        lCtx.getIndex().putDependency(top, ci, dep);
      }

      return null;
    }

  }

  /**
   * Processor for Join Operator.
   */
  public static class JoinLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should
      // never be called for leaf nodes.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      JoinOperator op = (JoinOperator) nd;
      JoinDesc jd = op.getConf();

      // The input operator to the join is always a reduce sink operator
      ReduceSinkOperator inpOp = (ReduceSinkOperator) getParent(stack);
      ReduceSinkDesc rd = inpOp.getConf();
      int tag = rd.getTag();

      // Iterate over the outputs of the join operator and merge the
      // dependencies of the columns corresponding to the tag.
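      // getReversedExprs() maps each output column name to the tag of the
      // parent branch that produced it; since this processor runs once per
      // parent ReduceSink, only the columns for the current tag are handled.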
      int cnt = 0;
      List<ExprNodeDesc> exprs = jd.getExprs().get((byte) tag);
      for (ColumnInfo ci : op.getSchema().getSignature()) {
        if (jd.getReversedExprs().get(ci.getInternalName()) != tag) {
          continue;
        }

        // Otherwise look up the expression corresponding to this ci
        ExprNodeDesc expr = exprs.get(cnt++);
        lCtx.getIndex().mergeDependency(op, ci,
            ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
      }

      return null;
    }

  }

  /**
   * Processor for Lateral View Join Operator.
   */
  public static class LateralViewJoinLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should
      // never be called for leaf nodes.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      LateralViewJoinOperator op = (LateralViewJoinOperator) nd;
      boolean isUdtfPath = true;
      Operator<? extends Serializable> inpOp = getParent(stack);
      ArrayList<ColumnInfo> cols = inpOp.getSchema().getSignature();

      if (inpOp instanceof SelectOperator) {
        isUdtfPath = false;
      }

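      // This processor is invoked once per parent (once for the select branch
      // and once for the UDTF branch); each visit merges that branch's
      // dependencies into its slice of the output schema.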
      // Dirty hack!!
      // For the select path the columns are the ones at the end of the
      // current operator's schema, and for the UDTF path the columns are
      // at the beginning of the operator schema.
      ArrayList<ColumnInfo> out_cols = op.getSchema().getSignature();
      int out_cols_size = out_cols.size();
      int cols_size = cols.size();
      if (isUdtfPath) {
        int cnt = 0;
        while (cnt < cols_size) {
          lCtx.getIndex().mergeDependency(op, out_cols.get(cnt),
              lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
          cnt++;
        }
      } else {
        int cnt = cols_size - 1;
        while (cnt >= 0) {
          lCtx.getIndex().mergeDependency(op, out_cols.get(out_cols_size - cols_size + cnt),
              lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
          cnt--;
        }
      }
      return null;
    }

  }

  /**
   * Processor for Select operator.
   */
  public static class SelectLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      LineageCtx lctx = (LineageCtx) procCtx;
      SelectOperator sop = (SelectOperator) nd;

      // If this is a selStarNoCompute, then this select operator
      // is treated like a default operator, so just call the superclass's
      // process method.
      if (sop.getConf().isSelStarNoCompute()) {
        return super.process(nd, stack, procCtx, nodeOutputs);
      }

      // Otherwise we treat this as a normal select operator and look at
      // the expressions.
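      // Each expression in the column list is positionally aligned with the
      // operator's output schema, so output column i depends on expression i.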

      ArrayList<ColumnInfo> col_infos = sop.getSchema().getSignature();
      int cnt = 0;
      for (ExprNodeDesc expr : sop.getConf().getColList()) {
        lctx.getIndex().putDependency(sop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lctx, getParent(stack), expr));
      }

      return null;
    }

  }

  /**
   * Processor for GroupBy operator.
   */
  public static class GroupByLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      LineageCtx lctx = (LineageCtx) procCtx;
      GroupByOperator gop = (GroupByOperator) nd;
      ArrayList<ColumnInfo> col_infos = gop.getSchema().getSignature();
      Operator<? extends Serializable> inpOp = getParent(stack);
      int cnt = 0;

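      // In the group by output schema the grouping keys come first, followed
      // by the aggregate columns, so the single counter cnt indexes both loops.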
      for (ExprNodeDesc expr : gop.getConf().getKeys()) {
        lctx.getIndex().putDependency(gop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lctx, inpOp, expr));
      }

      for (AggregationDesc agg : gop.getConf().getAggregators()) {
        // Concatenate the dependencies of all the parameters to
        // create the new dependency.
        Dependency dep = new Dependency();
        DependencyType new_type = LineageInfo.DependencyType.EXPRESSION;
        // TODO: Get the actual string here.
        dep.setExpr(null);
        LinkedHashSet<BaseColumnInfo> bci_set = new LinkedHashSet<BaseColumnInfo>();
        for (ExprNodeDesc expr : agg.getParameters()) {
          Dependency expr_dep = ExprProcFactory.getExprDependency(lctx, inpOp, expr);
          if (expr_dep != null) {
            new_type = LineageCtx.getNewDependencyType(expr_dep.getType(), new_type);
            bci_set.addAll(expr_dep.getBaseCols());
          }
        }

        // If bci_set is empty, the inputs to this aggregate function were
        // all constants (e.g. count(1)). In that case the aggregate function
        // depends on all the tables that appear in the dependency list of
        // the input operator.
        if (bci_set.isEmpty()) {
          Set<TableAliasInfo> tai_set = new LinkedHashSet<TableAliasInfo>();
          if (inpOp.getSchema() != null && inpOp.getSchema().getSignature() != null) {
            for (ColumnInfo ci : inpOp.getSchema().getSignature()) {
              Dependency inp_dep = lctx.getIndex().getDependency(inpOp, ci);
              // The dependency can be null as some of the input column infos
              // may not have been set in the case of joins.
              if (inp_dep != null) {
                for (BaseColumnInfo bci : inp_dep.getBaseCols()) {
                  new_type = LineageCtx.getNewDependencyType(inp_dep.getType(), new_type);
                  tai_set.add(bci.getTabAlias());
                }
              }
            }
          }

          // Create the BaseColumnInfos and set them in the bci_set
          for (TableAliasInfo tai : tai_set) {
            BaseColumnInfo bci = new BaseColumnInfo();
            bci.setTabAlias(tai);
            // The column is set to null to reflect that the dependency is not
            // on any particular column of the table.
            bci.setColumn(null);
            bci_set.add(bci);
          }
        }

        dep.setBaseCols(new ArrayList<BaseColumnInfo>(bci_set));
        dep.setType(new_type);
        lctx.getIndex().putDependency(gop, col_infos.get(cnt++), dep);
      }

      return null;
    }

  }

  /**
   * Union processor.
   * In this case we call mergeDependency as opposed to putDependency
   * in order to account for visits from different parents.
   */
  public static class UnionLineage extends DefaultLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Assert that there is at least one item in the stack. This should
      // never be called for leaf nodes.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;

      // Get the row schemas of the current operator and of its parent.
      Operator<? extends Serializable> inpOp = getParent(stack);
      RowSchema rs = op.getSchema();
      ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
      int cnt = 0;
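      // The union's output columns line up positionally with each parent's
      // columns; merging (rather than overwriting) lets a later visit from
      // another parent add its base columns to the same output column.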
      for (ColumnInfo ci : rs.getSignature()) {
        Dependency inp_dep = lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++));
        if (inp_dep != null) {
          lCtx.getIndex().mergeDependency(op, ci, inp_dep);
        }
      }
      return null;
    }
  }

  /**
   * ReduceSink processor.
   */
  public static class ReduceSinkLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Assert that there is at least one item in the stack. This should
      // never be called for leaf nodes.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      ReduceSinkOperator rop = (ReduceSinkOperator) nd;

      ArrayList<ColumnInfo> col_infos = rop.getSchema().getSignature();
      Operator<? extends Serializable> inpOp = getParent(stack);
      int cnt = 0;

      // The keys are included only when the reduce sink feeds into a group by
      // operator, possibly through a chain of forward operators.
      Operator<? extends Serializable> op = rop.getChildOperators().get(0);
      while (op instanceof ForwardOperator) {
        op = op.getChildOperators().get(0);
      }

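      // Only when the consumer is a group by are dependencies recorded for
      // the key columns as well; otherwise just the value columns are tracked.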
      if (op instanceof GroupByOperator) {
        for (ExprNodeDesc expr : rop.getConf().getKeyCols()) {
          lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
              ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
        }
      }

      for (ExprNodeDesc expr : rop.getConf().getValueCols()) {
        lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
      }

      return null;
    }
  }

  /**
   * Default processor. This copies each input column's dependency through
   * to the corresponding output column.
   */
  public static class DefaultLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Assert that there is at least one item in the stack. This should
      // never be called for leaf nodes.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;

      // Get the row schemas of the current operator and of its parent.
      Operator<? extends Serializable> inpOp = getParent(stack);
      RowSchema rs = op.getSchema();
      ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
      int cnt = 0;
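      // Positional copy: output column i receives the dependency recorded
      // for input column i.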
      for (ColumnInfo ci : rs.getSignature()) {
        lCtx.getIndex().putDependency(op, ci,
            lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++)));
      }
      return null;
    }
  }

  public static NodeProcessor getJoinProc() {
    return new JoinLineage();
  }

  public static NodeProcessor getLateralViewJoinProc() {
    return new LateralViewJoinLineage();
  }

  public static NodeProcessor getTSProc() {
    return new TableScanLineage();
  }

  public static NodeProcessor getTransformProc() {
    return new TransformLineage();
  }

  public static NodeProcessor getSelProc() {
    return new SelectLineage();
  }

  public static NodeProcessor getGroupByProc() {
    return new GroupByLineage();
  }

  public static NodeProcessor getUnionProc() {
    return new UnionLineage();
  }

  public static NodeProcessor getReduceSinkProc() {
    return new ReduceSinkLineage();
  }

  public static NodeProcessor getDefaultProc() {
    return new DefaultLineage();
  }

}