/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java
Java | 267 lines | 213 code | 26 blank | 28 comment | 36 complexity | 3092d96f410dba8f3cedee86ef2cfaa4 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
- /**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.hadoop.hive.ql.optimizer;
- import java.io.Serializable;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.LinkedHashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.Stack;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.hive.metastore.api.Order;
- import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
- import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
- import org.apache.hadoop.hive.ql.exec.Operator;
- import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
- import org.apache.hadoop.hive.ql.exec.TableScanOperator;
- import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
- import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
- import org.apache.hadoop.hive.ql.lib.Dispatcher;
- import org.apache.hadoop.hive.ql.lib.GraphWalker;
- import org.apache.hadoop.hive.ql.lib.Node;
- import org.apache.hadoop.hive.ql.lib.NodeProcessor;
- import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
- import org.apache.hadoop.hive.ql.lib.Rule;
- import org.apache.hadoop.hive.ql.lib.RuleRegExp;
- import org.apache.hadoop.hive.ql.metadata.HiveException;
- import org.apache.hadoop.hive.ql.metadata.Partition;
- import org.apache.hadoop.hive.ql.metadata.Table;
- import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
- import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
- import org.apache.hadoop.hive.ql.parse.ParseContext;
- import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
- import org.apache.hadoop.hive.ql.parse.QBJoinTree;
- import org.apache.hadoop.hive.ql.parse.SemanticException;
- import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
- import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
- import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
- import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
- import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
- //try to replace a bucket map join with a sorted merge map join
- public class SortedMergeBucketMapJoinOptimizer implements Transform {
- private static final Log LOG = LogFactory
- .getLog(SortedMergeBucketMapJoinOptimizer.class.getName());
- public SortedMergeBucketMapJoinOptimizer() {
- }
- @Override
- public ParseContext transform(ParseContext pctx) throws SemanticException {
- Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
- // go through all map joins and find out all which have enabled bucket map
- // join.
- opRules.put(new RuleRegExp("R1", "MAPJOIN%"),
- getSortedMergeBucketMapjoinProc(pctx));
- // The dispatcher fires the processor corresponding to the closest matching
- // rule and passes the context along
- Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
- GraphWalker ogw = new DefaultGraphWalker(disp);
- // Create a list of topop nodes
- ArrayList<Node> topNodes = new ArrayList<Node>();
- topNodes.addAll(pctx.getTopOps().values());
- ogw.startWalking(topNodes, null);
- return pctx;
- }
- private NodeProcessor getSortedMergeBucketMapjoinProc(ParseContext pctx) {
- return new SortedMergeBucketMapjoinProc(pctx);
- }
- private NodeProcessor getDefaultProc() {
- return new NodeProcessor() {
- @Override
- public Object process(Node nd, Stack<Node> stack,
- NodeProcessorCtx procCtx, Object... nodeOutputs)
- throws SemanticException {
- return null;
- }
- };
- }
- class SortedMergeBucketMapjoinProc implements NodeProcessor {
- ParseContext pGraphContext;
- public SortedMergeBucketMapjoinProc(ParseContext pctx) {
- this.pGraphContext = pctx;
- }
- public SortedMergeBucketMapjoinProc() {
- }
- @Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- if (nd instanceof SMBMapJoinOperator) {
- return null;
- }
- MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
- if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null
- || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) {
- return null;
- }
- boolean tableSorted = true;
- QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext()
- .get(mapJoinOp);
- if (joinCxt == null) {
- return null;
- }
- String[] srcs = joinCxt.getBaseSrc();
- int pos = 0;
- for (String src : srcs) {
- tableSorted = tableSorted
- && isTableSorted(this.pGraphContext, mapJoinOp, joinCxt, src, pos);
- pos++;
- }
- if (!tableSorted) {
- //this is a mapjoin but not suit for a sort merge bucket map join. check outer joins
- MapJoinProcessor.checkMapJoin(((MapJoinOperator) nd).getConf().getPosBigTable(),
- ((MapJoinOperator) nd).getConf().getConds());
- return null;
- }
- // convert a bucket map join operator to a sorted merge bucket map join
- // operator
- convertToSMBJoin(mapJoinOp, srcs);
- return null;
- }
- private SMBMapJoinOperator convertToSMBJoin(MapJoinOperator mapJoinOp,
- String[] srcs) {
- SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
- SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
- smbJop.setConf(smbJoinDesc);
- HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
- for (int i = 0; i < srcs.length; i++) {
- tagToAlias.put((byte) i, srcs[i]);
- }
- smbJoinDesc.setTagToAlias(tagToAlias);
- int indexInListMapJoinNoReducer = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
- if(indexInListMapJoinNoReducer >= 0 ) {
- this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
- this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
- }
- List<? extends Operator> parentOperators = mapJoinOp.getParentOperators();
- for (int i = 0; i < parentOperators.size(); i++) {
- Operator par = parentOperators.get(i);
- int index = par.getChildOperators().indexOf(mapJoinOp);
- par.getChildOperators().remove(index);
- par.getChildOperators().add(index, smbJop);
- }
- List<? extends Operator> childOps = mapJoinOp.getChildOperators();
- for (int i = 0; i < childOps.size(); i++) {
- Operator child = childOps.get(i);
- int index = child.getParentOperators().indexOf(mapJoinOp);
- child.getParentOperators().remove(index);
- child.getParentOperators().add(index, smbJop);
- }
- return smbJop;
- }
- private boolean isTableSorted(ParseContext pctx, MapJoinOperator op,
- QBJoinTree joinTree, String alias, int pos) throws SemanticException {
- Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext
- .getTopOps();
- Map<TableScanOperator, Table> topToTable = this.pGraphContext
- .getTopToTable();
- TableScanOperator tso = (TableScanOperator) topOps.get(alias);
- if (tso == null) {
- return false;
- }
- List<ExprNodeDesc> keys = op.getConf().getKeys().get((byte) pos);
- // get all join columns from join keys stored in MapJoinDesc
- List<String> joinCols = new ArrayList<String>();
- List<ExprNodeDesc> joinKeys = new ArrayList<ExprNodeDesc>();
- joinKeys.addAll(keys);
- while (joinKeys.size() > 0) {
- ExprNodeDesc node = joinKeys.remove(0);
- if (node instanceof ExprNodeColumnDesc) {
- joinCols.addAll(node.getCols());
- } else if (node instanceof ExprNodeGenericFuncDesc) {
- ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
- GenericUDF udf = udfNode.getGenericUDF();
- if (!FunctionRegistry.isDeterministic(udf)) {
- return false;
- }
- joinKeys.addAll(0, udfNode.getChildExprs());
- }
- }
- Table tbl = topToTable.get(tso);
- if (tbl.isPartitioned()) {
- PrunedPartitionList prunedParts = null;
- try {
- prunedParts = pGraphContext.getOpToPartList().get(tso);
- if (prunedParts == null) {
- prunedParts = PartitionPruner.prune(tbl, pGraphContext
- .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
- pGraphContext.getPrunedPartitions());
- pGraphContext.getOpToPartList().put(tso, prunedParts);
- }
- } catch (HiveException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- throw new SemanticException(e.getMessage(), e);
- }
- boolean ret = true;
- for (Partition p : prunedParts.getConfirmedPartns()) {
- ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols);
- if (!ret) {
- return false;
- }
- }
- for (Partition p : prunedParts.getUnknownPartns()) {
- ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols);
- if (!ret) {
- return false;
- }
- }
- } else {
- return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols);
- }
- return true;
- }
- private boolean checkSortColsAndJoinCols(List<Order> sortCols,
- List<String> joinCols) {
- // require all sort columns are asc, right now only support asc
- List<String> sortColNames = new ArrayList<String>();
- for (Order o : sortCols) {
- if (o.getOrder() != BaseSemanticAnalyzer.HIVE_COLUMN_ORDER_ASC) {
- return false;
- }
- sortColNames.add(o.getCol());
- }
- return sortColNames.containsAll(joinCols)
- && sortColNames.size() == joinCols.size();
- }
- }
- }