/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java
Java | 538 lines | 337 code | 85 blank | 116 comment | 40 complexity | ccdb894c82546742f13a60335ce961aa MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
- /**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.hadoop.hive.ql.optimizer.lineage;
- import java.io.Serializable;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.LinkedHashSet;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
- import java.util.Stack;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.hive.metastore.api.FieldSchema;
- import org.apache.hadoop.hive.metastore.api.Table;
- import org.apache.hadoop.hive.ql.exec.ColumnInfo;
- import org.apache.hadoop.hive.ql.exec.ForwardOperator;
- import org.apache.hadoop.hive.ql.exec.GroupByOperator;
- import org.apache.hadoop.hive.ql.exec.JoinOperator;
- import org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator;
- import org.apache.hadoop.hive.ql.exec.Operator;
- import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
- import org.apache.hadoop.hive.ql.exec.RowSchema;
- import org.apache.hadoop.hive.ql.exec.SelectOperator;
- import org.apache.hadoop.hive.ql.exec.TableScanOperator;
- import org.apache.hadoop.hive.ql.hooks.LineageInfo;
- import org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo;
- import org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency;
- import org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyType;
- import org.apache.hadoop.hive.ql.hooks.LineageInfo.TableAliasInfo;
- import org.apache.hadoop.hive.ql.lib.Node;
- import org.apache.hadoop.hive.ql.lib.NodeProcessor;
- import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
- import org.apache.hadoop.hive.ql.lib.Utils;
- import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
- import org.apache.hadoop.hive.ql.parse.ParseContext;
- import org.apache.hadoop.hive.ql.parse.SemanticException;
- import org.apache.hadoop.hive.ql.plan.AggregationDesc;
- import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
- import org.apache.hadoop.hive.ql.plan.JoinDesc;
- import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
- /**
- * Operator factory for the rule processors for lineage.
- */
- public class OpProcFactory {
- /**
- * Returns the parent operator in the walk path to the current operator.
- *
- * @param stack The stack encoding the path.
- *
- * @return Operator The parent operator in the current path.
- */
- protected static Operator<? extends Serializable> getParent(Stack<Node> stack) {
- return (Operator<? extends Serializable>)Utils.getNthAncestor(stack, 1);
- }
- /**
- * Processor for Script and UDTF Operators.
- */
- public static class TransformLineage extends DefaultLineage implements NodeProcessor {
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- // LineageCTx
- LineageCtx lCtx = (LineageCtx) procCtx;
- // The operators
- Operator<? extends Serializable> op = (Operator<? extends Serializable>)nd;
- Operator<? extends Serializable> inpOp = getParent(stack);
- // Create a single dependency list by concatenating the dependencies of all
- // the cols
- Dependency dep = new Dependency();
- DependencyType new_type = LineageInfo.DependencyType.SCRIPT;
- dep.setType(LineageInfo.DependencyType.SCRIPT);
- // TODO: Fix this to a non null value.
- dep.setExpr(null);
- LinkedHashSet<BaseColumnInfo> col_set = new LinkedHashSet<BaseColumnInfo>();
- for(ColumnInfo ci : inpOp.getSchema().getSignature()) {
- Dependency d = lCtx.getIndex().getDependency(inpOp, ci);
- if (d != null) {
- new_type = LineageCtx.getNewDependencyType(d.getType(), new_type);
- col_set.addAll(d.getBaseCols());
- }
- }
- dep.setType(new_type);
- dep.setBaseCols(new ArrayList<BaseColumnInfo>(col_set));
- // This dependency is then set for all the colinfos of the script operator
- for(ColumnInfo ci : op.getSchema().getSignature()) {
- lCtx.getIndex().putDependency(op, ci, dep);
- }
- return null;
- }
- }
- /**
- * Processor for TableScan Operator. This actually creates the base column mappings.
- */
- public static class TableScanLineage extends DefaultLineage implements NodeProcessor {
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- // LineageCtx
- LineageCtx lCtx = (LineageCtx) procCtx;
- ParseContext pctx = lCtx.getParseCtx();
- // Table scan operator.
- TableScanOperator top = (TableScanOperator)nd;
- org.apache.hadoop.hive.ql.metadata.Table t = pctx.getTopToTable().get(top);
- Table tab = t.getTTable();
- // Generate the mappings
- RowSchema rs = top.getSchema();
- List<FieldSchema> cols = t.getAllCols();
- Map<String, FieldSchema> fieldSchemaMap = new HashMap<String, FieldSchema>();
- for(FieldSchema col : cols) {
- fieldSchemaMap.put(col.getName(), col);
- }
- Iterator<VirtualColumn> vcs = VirtualColumn.registry.values().iterator();
- while (vcs.hasNext()) {
- VirtualColumn vc = vcs.next();
- fieldSchemaMap.put(vc.getName(), new FieldSchema(vc.getName(),
- vc.getTypeInfo().getTypeName(), ""));
- }
- TableAliasInfo tai = new TableAliasInfo();
- tai.setAlias(top.getConf().getAlias());
- tai.setTable(tab);
- for(ColumnInfo ci : rs.getSignature()) {
- // Create a dependency
- Dependency dep = new Dependency();
- BaseColumnInfo bci = new BaseColumnInfo();
- bci.setTabAlias(tai);
- bci.setColumn(fieldSchemaMap.get(ci.getInternalName()));
- // Populate the dependency
- dep.setType(LineageInfo.DependencyType.SIMPLE);
- // TODO: Find out how to get the expression here.
- dep.setExpr(null);
- dep.setBaseCols(new ArrayList<BaseColumnInfo>());
- dep.getBaseCols().add(bci);
- // Put the dependency in the map
- lCtx.getIndex().putDependency(top, ci, dep);
- }
- return null;
- }
- }
- /**
- * Processor for Join Operator.
- */
- public static class JoinLineage extends DefaultLineage implements NodeProcessor {
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- // Assert that there is atleast one item in the stack. This should never
- // be called for leafs.
- assert(!stack.isEmpty());
- // LineageCtx
- LineageCtx lCtx = (LineageCtx) procCtx;
- JoinOperator op = (JoinOperator)nd;
- JoinDesc jd = op.getConf();
- // The input operator to the join is always a reduce sink operator
- ReduceSinkOperator inpOp = (ReduceSinkOperator)getParent(stack);
- ReduceSinkDesc rd = inpOp.getConf();
- int tag = rd.getTag();
- // Iterate over the outputs of the join operator and merge the
- // dependencies of the columns that corresponding to the tag.
- int cnt = 0;
- List<ExprNodeDesc> exprs = jd.getExprs().get((byte)tag);
- for(ColumnInfo ci : op.getSchema().getSignature()) {
- if (jd.getReversedExprs().get(ci.getInternalName()) != tag) {
- continue;
- }
- // Otherwise look up the expression corresponding to this ci
- ExprNodeDesc expr = exprs.get(cnt++);
- lCtx.getIndex().mergeDependency(op, ci,
- ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
- }
- return null;
- }
- }
- /**
- * Processor for Join Operator.
- */
- public static class LateralViewJoinLineage extends DefaultLineage implements NodeProcessor {
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- // Assert that there is atleast one item in the stack. This should never
- // be called for leafs.
- assert(!stack.isEmpty());
- // LineageCtx
- LineageCtx lCtx = (LineageCtx) procCtx;
- LateralViewJoinOperator op = (LateralViewJoinOperator)nd;
- boolean isUdtfPath = true;
- Operator<? extends Serializable> inpOp = getParent(stack);
- ArrayList<ColumnInfo> cols = inpOp.getSchema().getSignature();
- if (inpOp instanceof SelectOperator) {
- isUdtfPath = false;
- }
- // Dirty hack!!
- // For the select path the columns are the ones at the end of the
- // current operators schema and for the udtf path the columns are
- // at the beginning of the operator schema.
- ArrayList<ColumnInfo> out_cols = op.getSchema().getSignature();
- int out_cols_size = out_cols.size();
- int cols_size = cols.size();
- if (isUdtfPath) {
- int cnt = 0;
- while (cnt < cols_size) {
- lCtx.getIndex().mergeDependency(op, out_cols.get(cnt),
- lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
- cnt++;
- }
- }
- else {
- int cnt = cols_size - 1;
- while (cnt >= 0) {
- lCtx.getIndex().mergeDependency(op, out_cols.get(out_cols_size - cols_size + cnt),
- lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
- cnt--;
- }
- }
- return null;
- }
- }
- /**
- * Processor for Select operator.
- */
- public static class SelectLineage extends DefaultLineage implements NodeProcessor {
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- LineageCtx lctx = (LineageCtx)procCtx;
- SelectOperator sop = (SelectOperator)nd;
- // if this is a selStarNoCompute then this select operator
- // is treated like a default operator, so just call the super classes
- // process method.
- if (sop.getConf().isSelStarNoCompute()) {
- return super.process(nd, stack, procCtx, nodeOutputs);
- }
- // Otherwise we treat this as a normal select operator and look at
- // the expressions.
- ArrayList<ColumnInfo> col_infos = sop.getSchema().getSignature();
- int cnt = 0;
- for(ExprNodeDesc expr : sop.getConf().getColList()) {
- lctx.getIndex().putDependency(sop, col_infos.get(cnt++),
- ExprProcFactory.getExprDependency(lctx, getParent(stack), expr));
- }
- return null;
- }
- }
- /**
- * Processor for GroupBy operator.
- */
- public static class GroupByLineage extends DefaultLineage implements NodeProcessor {
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- LineageCtx lctx = (LineageCtx)procCtx;
- GroupByOperator gop = (GroupByOperator)nd;
- ArrayList<ColumnInfo> col_infos = gop.getSchema().getSignature();
- Operator<? extends Serializable> inpOp = getParent(stack);
- int cnt = 0;
- for(ExprNodeDesc expr : gop.getConf().getKeys()) {
- lctx.getIndex().putDependency(gop, col_infos.get(cnt++),
- ExprProcFactory.getExprDependency(lctx, inpOp, expr));
- }
- for(AggregationDesc agg : gop.getConf().getAggregators()) {
- // Concatenate the dependencies of all the parameters to
- // create the new dependency
- Dependency dep = new Dependency();
- DependencyType new_type = LineageInfo.DependencyType.EXPRESSION;
- // TODO: Get the actual string here.
- dep.setExpr(null);
- LinkedHashSet<BaseColumnInfo> bci_set = new LinkedHashSet<BaseColumnInfo>();
- for(ExprNodeDesc expr : agg.getParameters()) {
- Dependency expr_dep = ExprProcFactory.getExprDependency(lctx, inpOp, expr);
- if (expr_dep != null) {
- new_type = LineageCtx.getNewDependencyType(expr_dep.getType(), new_type);
- bci_set.addAll(expr_dep.getBaseCols());
- }
- }
- // If the bci_set is empty, this means that the inputs to this
- // aggregate function were all constants (e.g. count(1)). In this case
- // the aggregate function is just dependent on all the tables that are in
- // the dependency list of the input operator.
- if (bci_set.isEmpty()) {
- Set<TableAliasInfo> tai_set = new LinkedHashSet<TableAliasInfo>();
- if (inpOp.getSchema() != null && inpOp.getSchema().getSignature() != null ) {
- for(ColumnInfo ci : inpOp.getSchema().getSignature()) {
- Dependency inp_dep = lctx.getIndex().getDependency(inpOp, ci);
- // The dependency can be null as some of the input cis may not have
- // been set in case of joins.
- if (inp_dep != null) {
- for(BaseColumnInfo bci : inp_dep.getBaseCols()) {
- new_type = LineageCtx.getNewDependencyType(inp_dep.getType(), new_type);
- tai_set.add(bci.getTabAlias());
- }
- }
- }
- }
- // Create the BaseColumnInfos and set them in the bci_set
- for(TableAliasInfo tai : tai_set) {
- BaseColumnInfo bci = new BaseColumnInfo();
- bci.setTabAlias(tai);
- // This is set to null to reflect that the dependency is not on any
- // particular column of the table.
- bci.setColumn(null);
- bci_set.add(bci);
- }
- }
- dep.setBaseCols(new ArrayList<BaseColumnInfo>(bci_set));
- dep.setType(new_type);
- lctx.getIndex().putDependency(gop, col_infos.get(cnt++), dep);
- }
- return null;
- }
- }
- /**
- * Union processor.
- * In this case we call mergeDependency as opposed to putDependency
- * in order to account for visits from different parents.
- */
- public static class UnionLineage extends DefaultLineage implements NodeProcessor {
- protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());
- @SuppressWarnings("unchecked")
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- // Assert that there is atleast one item in the stack. This should never
- // be called for leafs.
- assert(!stack.isEmpty());
- // LineageCtx
- LineageCtx lCtx = (LineageCtx) procCtx;
- Operator<? extends Serializable> op = (Operator<? extends Serializable>)nd;
- // Get the row schema of the input operator.
- // The row schema of the parent operator
- Operator<? extends Serializable> inpOp = getParent(stack);
- RowSchema rs = op.getSchema();
- ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
- int cnt = 0;
- for(ColumnInfo ci : rs.getSignature()) {
- Dependency inp_dep = lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++));
- if (inp_dep != null) {
- lCtx.getIndex().mergeDependency(op, ci, inp_dep);
- }
- }
- return null;
- }
- }
- /**
- * ReduceSink processor.
- */
- public static class ReduceSinkLineage implements NodeProcessor {
- protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());
- @SuppressWarnings("unchecked")
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- // Assert that there is atleast one item in the stack. This should never
- // be called for leafs.
- assert(!stack.isEmpty());
- // LineageCtx
- LineageCtx lCtx = (LineageCtx) procCtx;
- ReduceSinkOperator rop = (ReduceSinkOperator)nd;
- ArrayList<ColumnInfo> col_infos = rop.getSchema().getSignature();
- Operator<? extends Serializable> inpOp = getParent(stack);
- int cnt = 0;
- // The keys are included only in case the reduce sink feeds into
- // a group by operator through a chain of forward operators
- Operator<? extends Serializable> op = rop.getChildOperators().get(0);
- while (op instanceof ForwardOperator) {
- op = op.getChildOperators().get(0);
- }
- if (op instanceof GroupByOperator) {
- for(ExprNodeDesc expr : rop.getConf().getKeyCols()) {
- lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
- ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
- }
- }
- for(ExprNodeDesc expr : rop.getConf().getValueCols()) {
- lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
- ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
- }
- return null;
- }
- }
- /**
- * Default processor. This basically passes the input dependencies as such
- * to the output dependencies.
- */
- public static class DefaultLineage implements NodeProcessor {
- protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());
- @SuppressWarnings("unchecked")
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- // Assert that there is atleast one item in the stack. This should never
- // be called for leafs.
- assert(!stack.isEmpty());
- // LineageCtx
- LineageCtx lCtx = (LineageCtx) procCtx;
- Operator<? extends Serializable> op = (Operator<? extends Serializable>)nd;
- // Get the row schema of the input operator.
- // The row schema of the parent operator
- Operator<? extends Serializable> inpOp = getParent(stack);
- RowSchema rs = op.getSchema();
- ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
- int cnt = 0;
- for(ColumnInfo ci : rs.getSignature()) {
- lCtx.getIndex().putDependency(op, ci,
- lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++)));
- }
- return null;
- }
- }
- public static NodeProcessor getJoinProc() {
- return new JoinLineage();
- }
- public static NodeProcessor getLateralViewJoinProc() {
- return new LateralViewJoinLineage();
- }
- public static NodeProcessor getTSProc() {
- return new TableScanLineage();
- }
- public static NodeProcessor getTransformProc() {
- return new TransformLineage();
- }
- public static NodeProcessor getSelProc() {
- return new SelectLineage();
- }
- public static NodeProcessor getGroupByProc() {
- return new GroupByLineage();
- }
- public static NodeProcessor getUnionProc() {
- return new UnionLineage();
- }
- public static NodeProcessor getReduceSinkProc() {
- return new ReduceSinkLineage();
- }
- public static NodeProcessor getDefaultProc() {
- return new DefaultLineage();
- }
- }