
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;

// try to replace a bucket map join with a sorted merge bucket map join
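// For illustration, the shape of query this optimizer targets (the table
// names and DDL here are hypothetical, not taken from this file): both join
// sides must be bucketed and sorted on the join key, e.g.
//
//   CREATE TABLE t1 (key INT, value STRING)
//     CLUSTERED BY (key) SORTED BY (key ASC) INTO 4 BUCKETS;
//   CREATE TABLE t2 (key INT, value STRING)
//     CLUSTERED BY (key) SORTED BY (key ASC) INTO 4 BUCKETS;
//
//   SELECT /*+ MAPJOIN(t2) */ t1.key, t2.value
//   FROM t1 JOIN t2 ON t1.key = t2.key;
//
// When such a bucket map join qualifies, the MapJoinOperator is replaced
// below by an SMBMapJoinOperator, which streams matching sorted buckets
// instead of building an in-memory hash table.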
public class SortedMergeBucketMapJoinOptimizer implements Transform {

  private static final Log LOG = LogFactory
      .getLog(SortedMergeBucketMapJoinOptimizer.class.getName());

  public SortedMergeBucketMapJoinOptimizer() {
  }

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    // go through all map joins and find out all which have enabled bucket map
    // join.
    opRules.put(new RuleRegExp("R1", "MAPJOIN%"),
        getSortedMergeBucketMapjoinProc(pctx));
    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top operator nodes to walk
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pctx;
  }

  private NodeProcessor getSortedMergeBucketMapjoinProc(ParseContext pctx) {
    return new SortedMergeBucketMapjoinProc(pctx);
  }

  private NodeProcessor getDefaultProc() {
    return new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
          NodeProcessorCtx procCtx, Object... nodeOutputs)
          throws SemanticException {
        return null;
      }
    };
  }

  class SortedMergeBucketMapjoinProc implements NodeProcessor {
    ParseContext pGraphContext;

    public SortedMergeBucketMapjoinProc(ParseContext pctx) {
      this.pGraphContext = pctx;
    }

    public SortedMergeBucketMapjoinProc() {
    }

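    // process() bails out early when the node is already an SMBMapJoinOperator,
    // when the map join has no alias-to-bucket-file mapping (bucket map join
    // was never enabled for it), or when no join context is recorded; the
    // conversion happens only once every source is verified to be sorted on
    // the join keys.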
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      if (nd instanceof SMBMapJoinOperator) {
        return null;
      }
      MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
      if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null
          || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) {
        return null;
      }

      boolean tableSorted = true;
      QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext()
          .get(mapJoinOp);
      if (joinCxt == null) {
        return null;
      }
      String[] srcs = joinCxt.getBaseSrc();
      int pos = 0;
      for (String src : srcs) {
        tableSorted = tableSorted
            && isTableSorted(this.pGraphContext, mapJoinOp, joinCxt, src, pos);
        pos++;
      }
      if (!tableSorted) {
        // this is a map join, but not suitable for a sort merge bucket map
        // join; check outer joins
        MapJoinProcessor.checkMapJoin(((MapJoinOperator) nd).getConf().getPosBigTable(),
            ((MapJoinOperator) nd).getConf().getConds());
        return null;
      }
      // convert a bucket map join operator to a sorted merge bucket map join
      // operator
      convertToSMBJoin(mapJoinOp, srcs);
      return null;
    }

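    // Builds an SMBMapJoinOperator carrying an SMBJoinDesc copied from the map
    // join's descriptor, tags each join source with its alias, swaps the
    // operator in the no-reducer map join list, and splices the new operator
    // into the DAG by replacing the map join in every parent's child list and
    // every child's parent list.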
    private SMBMapJoinOperator convertToSMBJoin(MapJoinOperator mapJoinOp,
        String[] srcs) {
      SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
      SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
      smbJop.setConf(smbJoinDesc);
      HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
      for (int i = 0; i < srcs.length; i++) {
        tagToAlias.put((byte) i, srcs[i]);
      }
      smbJoinDesc.setTagToAlias(tagToAlias);

      int indexInListMapJoinNoReducer =
          this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
      if (indexInListMapJoinNoReducer >= 0) {
        this.pGraphContext.getListMapJoinOpsNoReducer()
            .remove(indexInListMapJoinNoReducer);
        this.pGraphContext.getListMapJoinOpsNoReducer()
            .add(indexInListMapJoinNoReducer, smbJop);
      }

      List<? extends Operator> parentOperators = mapJoinOp.getParentOperators();
      for (int i = 0; i < parentOperators.size(); i++) {
        Operator par = parentOperators.get(i);
        int index = par.getChildOperators().indexOf(mapJoinOp);
        par.getChildOperators().remove(index);
        par.getChildOperators().add(index, smbJop);
      }
      List<? extends Operator> childOps = mapJoinOp.getChildOperators();
      for (int i = 0; i < childOps.size(); i++) {
        Operator child = childOps.get(i);
        int index = child.getParentOperators().indexOf(mapJoinOp);
        child.getParentOperators().remove(index);
        child.getParentOperators().add(index, smbJop);
      }
      return smbJop;
    }

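    // A source qualifies only if its join keys reduce to plain column
    // references (walking through deterministic UDFs only) and the table, or
    // every confirmed and unknown partition of a partitioned table, is sorted
    // on exactly those columns.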
    private boolean isTableSorted(ParseContext pctx, MapJoinOperator op,
        QBJoinTree joinTree, String alias, int pos) throws SemanticException {
      Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext
          .getTopOps();
      Map<TableScanOperator, Table> topToTable = this.pGraphContext
          .getTopToTable();
      TableScanOperator tso = (TableScanOperator) topOps.get(alias);
      if (tso == null) {
        return false;
      }

      List<ExprNodeDesc> keys = op.getConf().getKeys().get((byte) pos);
      // get all join columns from join keys stored in MapJoinDesc
      List<String> joinCols = new ArrayList<String>();
      List<ExprNodeDesc> joinKeys = new ArrayList<ExprNodeDesc>();
      joinKeys.addAll(keys);
      while (joinKeys.size() > 0) {
        ExprNodeDesc node = joinKeys.remove(0);
        if (node instanceof ExprNodeColumnDesc) {
          joinCols.addAll(node.getCols());
        } else if (node instanceof ExprNodeGenericFuncDesc) {
          ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
          GenericUDF udf = udfNode.getGenericUDF();
          if (!FunctionRegistry.isDeterministic(udf)) {
            return false;
          }
          joinKeys.addAll(0, udfNode.getChildExprs());
        }
      }

      Table tbl = topToTable.get(tso);
      if (tbl.isPartitioned()) {
        PrunedPartitionList prunedParts = null;
        try {
          prunedParts = pGraphContext.getOpToPartList().get(tso);
          if (prunedParts == null) {
            prunedParts = PartitionPruner.prune(tbl, pGraphContext
                .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
                pGraphContext.getPrunedPartitions());
            pGraphContext.getOpToPartList().put(tso, prunedParts);
          }
        } catch (HiveException e) {
          LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
          throw new SemanticException(e.getMessage(), e);
        }
        boolean ret = true;
        for (Partition p : prunedParts.getConfirmedPartns()) {
          ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols);
          if (!ret) {
            return false;
          }
        }
        for (Partition p : prunedParts.getUnknownPartns()) {
          ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols);
          if (!ret) {
            return false;
          }
        }
      } else {
        return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols);
      }
      return true;
    }

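    // For example (hypothetical columns): sort columns [key ASC] match join
    // columns [key]; [key DESC] fails the ascending check; and sort columns
    // [key, value] against join columns [key] fail the size check, since the
    // sort columns must be exactly the join columns.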
    private boolean checkSortColsAndJoinCols(List<Order> sortCols,
        List<String> joinCols) {
      // require all sort columns to be ascending; only ascending order is
      // supported right now
      List<String> sortColNames = new ArrayList<String>();
      for (Order o : sortCols) {
        if (o.getOrder() != BaseSemanticAnalyzer.HIVE_COLUMN_ORDER_ASC) {
          return false;
        }
        sortColNames.add(o.getCol());
      }

      return sortColNames.containsAll(joinCols)
          && sortColNames.size() == joinCols.size();
    }
  }

}