PageRenderTime 45ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java

#
Java | 267 lines | 213 code | 26 blank | 28 comment | 36 complexity | 3092d96f410dba8f3cedee86ef2cfaa4 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements.  See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership.  The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License.  You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.optimizer;
  19. import java.io.Serializable;
  20. import java.util.ArrayList;
  21. import java.util.HashMap;
  22. import java.util.LinkedHashMap;
  23. import java.util.List;
  24. import java.util.Map;
  25. import java.util.Stack;
  26. import org.apache.commons.logging.Log;
  27. import org.apache.commons.logging.LogFactory;
  28. import org.apache.hadoop.hive.metastore.api.Order;
  29. import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
  30. import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
  31. import org.apache.hadoop.hive.ql.exec.Operator;
  32. import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
  33. import org.apache.hadoop.hive.ql.exec.TableScanOperator;
  34. import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
  35. import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
  36. import org.apache.hadoop.hive.ql.lib.Dispatcher;
  37. import org.apache.hadoop.hive.ql.lib.GraphWalker;
  38. import org.apache.hadoop.hive.ql.lib.Node;
  39. import org.apache.hadoop.hive.ql.lib.NodeProcessor;
  40. import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
  41. import org.apache.hadoop.hive.ql.lib.Rule;
  42. import org.apache.hadoop.hive.ql.lib.RuleRegExp;
  43. import org.apache.hadoop.hive.ql.metadata.HiveException;
  44. import org.apache.hadoop.hive.ql.metadata.Partition;
  45. import org.apache.hadoop.hive.ql.metadata.Table;
  46. import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
  47. import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
  48. import org.apache.hadoop.hive.ql.parse.ParseContext;
  49. import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
  50. import org.apache.hadoop.hive.ql.parse.QBJoinTree;
  51. import org.apache.hadoop.hive.ql.parse.SemanticException;
  52. import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
  53. import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
  54. import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
  55. import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
  56. import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
  57. //try to replace a bucket map join with a sorted merge map join
  58. public class SortedMergeBucketMapJoinOptimizer implements Transform {
  59. private static final Log LOG = LogFactory
  60. .getLog(SortedMergeBucketMapJoinOptimizer.class.getName());
  61. public SortedMergeBucketMapJoinOptimizer() {
  62. }
  63. @Override
  64. public ParseContext transform(ParseContext pctx) throws SemanticException {
  65. Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  66. // go through all map joins and find out all which have enabled bucket map
  67. // join.
  68. opRules.put(new RuleRegExp("R1", "MAPJOIN%"),
  69. getSortedMergeBucketMapjoinProc(pctx));
  70. // The dispatcher fires the processor corresponding to the closest matching
  71. // rule and passes the context along
  72. Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
  73. GraphWalker ogw = new DefaultGraphWalker(disp);
  74. // Create a list of topop nodes
  75. ArrayList<Node> topNodes = new ArrayList<Node>();
  76. topNodes.addAll(pctx.getTopOps().values());
  77. ogw.startWalking(topNodes, null);
  78. return pctx;
  79. }
  80. private NodeProcessor getSortedMergeBucketMapjoinProc(ParseContext pctx) {
  81. return new SortedMergeBucketMapjoinProc(pctx);
  82. }
  83. private NodeProcessor getDefaultProc() {
  84. return new NodeProcessor() {
  85. @Override
  86. public Object process(Node nd, Stack<Node> stack,
  87. NodeProcessorCtx procCtx, Object... nodeOutputs)
  88. throws SemanticException {
  89. return null;
  90. }
  91. };
  92. }
  93. class SortedMergeBucketMapjoinProc implements NodeProcessor {
  94. ParseContext pGraphContext;
  95. public SortedMergeBucketMapjoinProc(ParseContext pctx) {
  96. this.pGraphContext = pctx;
  97. }
  98. public SortedMergeBucketMapjoinProc() {
  99. }
  100. @Override
  101. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  102. Object... nodeOutputs) throws SemanticException {
  103. if (nd instanceof SMBMapJoinOperator) {
  104. return null;
  105. }
  106. MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
  107. if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null
  108. || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) {
  109. return null;
  110. }
  111. boolean tableSorted = true;
  112. QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext()
  113. .get(mapJoinOp);
  114. if (joinCxt == null) {
  115. return null;
  116. }
  117. String[] srcs = joinCxt.getBaseSrc();
  118. int pos = 0;
  119. for (String src : srcs) {
  120. tableSorted = tableSorted
  121. && isTableSorted(this.pGraphContext, mapJoinOp, joinCxt, src, pos);
  122. pos++;
  123. }
  124. if (!tableSorted) {
  125. //this is a mapjoin but not suit for a sort merge bucket map join. check outer joins
  126. MapJoinProcessor.checkMapJoin(((MapJoinOperator) nd).getConf().getPosBigTable(),
  127. ((MapJoinOperator) nd).getConf().getConds());
  128. return null;
  129. }
  130. // convert a bucket map join operator to a sorted merge bucket map join
  131. // operator
  132. convertToSMBJoin(mapJoinOp, srcs);
  133. return null;
  134. }
  135. private SMBMapJoinOperator convertToSMBJoin(MapJoinOperator mapJoinOp,
  136. String[] srcs) {
  137. SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
  138. SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
  139. smbJop.setConf(smbJoinDesc);
  140. HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
  141. for (int i = 0; i < srcs.length; i++) {
  142. tagToAlias.put((byte) i, srcs[i]);
  143. }
  144. smbJoinDesc.setTagToAlias(tagToAlias);
  145. int indexInListMapJoinNoReducer = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
  146. if(indexInListMapJoinNoReducer >= 0 ) {
  147. this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
  148. this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
  149. }
  150. List<? extends Operator> parentOperators = mapJoinOp.getParentOperators();
  151. for (int i = 0; i < parentOperators.size(); i++) {
  152. Operator par = parentOperators.get(i);
  153. int index = par.getChildOperators().indexOf(mapJoinOp);
  154. par.getChildOperators().remove(index);
  155. par.getChildOperators().add(index, smbJop);
  156. }
  157. List<? extends Operator> childOps = mapJoinOp.getChildOperators();
  158. for (int i = 0; i < childOps.size(); i++) {
  159. Operator child = childOps.get(i);
  160. int index = child.getParentOperators().indexOf(mapJoinOp);
  161. child.getParentOperators().remove(index);
  162. child.getParentOperators().add(index, smbJop);
  163. }
  164. return smbJop;
  165. }
  166. private boolean isTableSorted(ParseContext pctx, MapJoinOperator op,
  167. QBJoinTree joinTree, String alias, int pos) throws SemanticException {
  168. Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext
  169. .getTopOps();
  170. Map<TableScanOperator, Table> topToTable = this.pGraphContext
  171. .getTopToTable();
  172. TableScanOperator tso = (TableScanOperator) topOps.get(alias);
  173. if (tso == null) {
  174. return false;
  175. }
  176. List<ExprNodeDesc> keys = op.getConf().getKeys().get((byte) pos);
  177. // get all join columns from join keys stored in MapJoinDesc
  178. List<String> joinCols = new ArrayList<String>();
  179. List<ExprNodeDesc> joinKeys = new ArrayList<ExprNodeDesc>();
  180. joinKeys.addAll(keys);
  181. while (joinKeys.size() > 0) {
  182. ExprNodeDesc node = joinKeys.remove(0);
  183. if (node instanceof ExprNodeColumnDesc) {
  184. joinCols.addAll(node.getCols());
  185. } else if (node instanceof ExprNodeGenericFuncDesc) {
  186. ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
  187. GenericUDF udf = udfNode.getGenericUDF();
  188. if (!FunctionRegistry.isDeterministic(udf)) {
  189. return false;
  190. }
  191. joinKeys.addAll(0, udfNode.getChildExprs());
  192. }
  193. }
  194. Table tbl = topToTable.get(tso);
  195. if (tbl.isPartitioned()) {
  196. PrunedPartitionList prunedParts = null;
  197. try {
  198. prunedParts = pGraphContext.getOpToPartList().get(tso);
  199. if (prunedParts == null) {
  200. prunedParts = PartitionPruner.prune(tbl, pGraphContext
  201. .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
  202. pGraphContext.getPrunedPartitions());
  203. pGraphContext.getOpToPartList().put(tso, prunedParts);
  204. }
  205. } catch (HiveException e) {
  206. LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
  207. throw new SemanticException(e.getMessage(), e);
  208. }
  209. boolean ret = true;
  210. for (Partition p : prunedParts.getConfirmedPartns()) {
  211. ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols);
  212. if (!ret) {
  213. return false;
  214. }
  215. }
  216. for (Partition p : prunedParts.getUnknownPartns()) {
  217. ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols);
  218. if (!ret) {
  219. return false;
  220. }
  221. }
  222. } else {
  223. return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols);
  224. }
  225. return true;
  226. }
  227. private boolean checkSortColsAndJoinCols(List<Order> sortCols,
  228. List<String> joinCols) {
  229. // require all sort columns are asc, right now only support asc
  230. List<String> sortColNames = new ArrayList<String>();
  231. for (Order o : sortCols) {
  232. if (o.getOrder() != BaseSemanticAnalyzer.HIVE_COLUMN_ORDER_ASC) {
  233. return false;
  234. }
  235. sortColNames.add(o.getCol());
  236. }
  237. return sortColNames.containsAll(joinCols)
  238. && sortColNames.size() == joinCols.size();
  239. }
  240. }
  241. }