
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
/**
 * This transformation does bucket group by optimization.
 */
public class GroupByOptimizer implements Transform {

  private static final Log LOG = LogFactory.getLog(GroupByOptimizer.class
      .getName());

  public GroupByOptimizer() {
  }

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    GroupByOptProcCtx groupByOptimizeCtx = new GroupByOptProcCtx();

    // process group-by pattern
    opRules.put(new RuleRegExp("R1", "GBY%RS%GBY%"),
        getMapAggreSortedGroupbyProc(pctx));
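    // "GBY%RS%GBY%" matches a GroupByOperator followed by a ReduceSinkOperator
    // followed by another GroupByOperator, i.e. the map-side hash group-by,
    // the shuffle, and the reduce-side group-by produced for a GROUP BY query.
    // The processor registered above inspects the map-side GroupByOperator of
    // that chain.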

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
        groupByOptimizeCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of topop nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pctx;
  }
  private NodeProcessor getDefaultProc() {
    return new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
          NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
        return null;
      }
    };
  }

  private NodeProcessor getMapAggreSortedGroupbyProc(ParseContext pctx) {
    return new BucketGroupByProcessor(pctx);
  }
  /**
   * BucketGroupByProcessor.
   *
   */
  public class BucketGroupByProcessor implements NodeProcessor {

    protected ParseContext pGraphContext;

    public BucketGroupByProcessor(ParseContext pGraphContext) {
      this.pGraphContext = pGraphContext;
    }

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // GBY,RS,GBY... (top to bottom)
      GroupByOperator op = (GroupByOperator) stack.get(stack.size() - 3);
      checkBucketGroupBy(op);
      return null;
    }
    private void checkBucketGroupBy(GroupByOperator curr)
        throws SemanticException {

      // if this is not a HASH groupby, return
      if (curr.getConf().getMode() != GroupByDesc.Mode.HASH) {
        return;
      }

      Set<String> tblNames = pGraphContext.getGroupOpToInputTables().get(curr);
      if (tblNames == null || tblNames.size() == 0) {
        return;
      }

      boolean bucketGroupBy = true;
      GroupByDesc desc = curr.getConf();
      List<ExprNodeDesc> groupByKeys = new LinkedList<ExprNodeDesc>();
      groupByKeys.addAll(desc.getKeys());

      // compute groupby columns from groupby keys
      List<String> groupByCols = new ArrayList<String>();
      while (groupByKeys.size() > 0) {
        ExprNodeDesc node = groupByKeys.remove(0);
        if (node instanceof ExprNodeColumnDesc) {
          groupByCols.addAll(node.getCols());
        } else if ((node instanceof ExprNodeConstantDesc)
            || (node instanceof ExprNodeNullDesc)) {
          // nothing
        } else if (node instanceof ExprNodeFieldDesc) {
          groupByKeys.add(0, ((ExprNodeFieldDesc) node).getDesc());
          continue;
        } else if (node instanceof ExprNodeGenericFuncDesc) {
          ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
          GenericUDF udf = udfNode.getGenericUDF();
          if (!FunctionRegistry.isDeterministic(udf)) {
            return;
          }
          groupByKeys.addAll(0, udfNode.getChildExprs());
        } else {
          return;
        }
      }

      if (groupByCols.size() == 0) {
        return;
      }
      for (String table : tblNames) {
        Operator<? extends Serializable> topOp = pGraphContext.getTopOps().get(
            table);
        if (topOp == null || (!(topOp instanceof TableScanOperator))) {
          // this is in a sub-query.
          // In the future, we need to infer the subquery's columns properly. For example,
          // "select key, count(1)
          // from (from clustergroupby select key, value where ds='210') group by key, 3;",
          // even though the group by op is in a subquery, it can be changed to a
          // bucket groupby.
          return;
        }
        TableScanOperator ts = (TableScanOperator) topOp;
        Table destTable = pGraphContext.getTopToTable().get(ts);
        if (destTable == null) {
          return;
        }
        if (!destTable.isPartitioned()) {
          List<String> bucketCols = destTable.getBucketCols();
          List<String> sortCols = Utilities
              .getColumnNamesFromSortCols(destTable.getSortCols());
          bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
              sortCols);
          if (!bucketGroupBy) {
            return;
          }
        } else {
          PrunedPartitionList partsList = null;
          try {
            partsList = pGraphContext.getOpToPartList().get(ts);
            if (partsList == null) {
              partsList = PartitionPruner.prune(destTable, pGraphContext
                  .getOpToPartPruner().get(ts), pGraphContext.getConf(), table,
                  pGraphContext.getPrunedPartitions());
              pGraphContext.getOpToPartList().put(ts, partsList);
            }
          } catch (HiveException e) {
            // Has to use full name to make sure it does not conflict with
            // org.apache.commons.lang.StringUtils
            LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
            throw new SemanticException(e.getMessage(), e);
          }
          List<Partition> parts = new ArrayList<Partition>();
          parts.addAll(partsList.getConfirmedPartns());
          parts.addAll(partsList.getUnknownPartns());
          for (Partition part : parts) {
            List<String> bucketCols = part.getBucketCols();
            List<String> sortCols = part.getSortColNames();
            bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
                sortCols);
            if (!bucketGroupBy) {
              return;
            }
          }
        }
      }

      curr.getConf().setBucketGroup(bucketGroupBy);
    }
    /**
     * Given the group by keys, bucket columns, and sort columns, this method
     * determines whether we can use sorted group by or not.
     *
     * We use bucket columns only when the sorted column set is empty and all
     * group by columns are contained in the bucket columns.
     *
     * If we cannot decide by looking at the bucketed columns and the table
     * has sort columns, we resort to the sort columns. We can use bucket group
     * by if the group by column set is an exact prefix match of the sort
     * columns.
     *
     * @param groupByCols
     * @param bucketCols
     * @param sortCols
     * @return
     * @throws SemanticException
     */
    private boolean matchBucketOrSortedColumns(List<String> groupByCols,
        List<String> bucketCols, List<String> sortCols) throws SemanticException {
      boolean ret = false;
      if (sortCols == null || sortCols.size() == 0) {
        ret = matchBucketColumns(groupByCols, bucketCols);
      }
      if (!ret && sortCols != null && sortCols.size() >= groupByCols.size()) {
        // check sort columns: if groupByCols is a prefix subset of sort
        // columns, we will use sorted group by. For example, if data is sorted
        // by columns a, b, c, and a query wants to group by b,a, we will use
        // sorted group by. But if the query wants to group by b,c, then sorted
        // group by can not be used.
        int num = groupByCols.size();
        for (int i = 0; i < num; i++) {
          if (sortCols.indexOf(groupByCols.get(i)) > (num - 1)) {
            return false;
          }
        }
        return true;
      }
      return ret;
    }
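
    // Worked example of the prefix check above (illustrative only): with data
    // sorted by (a, b, c) and a query grouping by (b, a), num is 2 and the
    // sort positions of b and a are 1 and 0, both <= num - 1, so sorted group
    // by is used. Grouping by (b, c) puts c at sort position 2 > num - 1, so
    // sorted group by is rejected.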

    /*
     * All group by columns should be contained in the bucket column set. And
     * the number of group by columns should be equal to the number of bucket
     * columns.
     */
    private boolean matchBucketColumns(List<String> grpCols,
        List<String> tblBucketCols) throws SemanticException {
      if (tblBucketCols == null || tblBucketCols.size() == 0
          || grpCols.size() == 0 || grpCols.size() != tblBucketCols.size()) {
        return false;
      }
      for (int i = 0; i < grpCols.size(); i++) {
        String tblCol = grpCols.get(i);
        if (!tblBucketCols.contains(tblCol)) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * GroupByOptProcCtx.
   *
   */
  public class GroupByOptProcCtx implements NodeProcessorCtx {
  }
}
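
// A minimal usage sketch, based only on the Transform interface shown in this
// file (the exact call site in the driver is an assumption and not part of
// this file):
//
//   ParseContext pctx = ...;               // produced by semantic analysis
//   Transform groupByOpt = new GroupByOptimizer();
//   pctx = groupByOpt.transform(pctx);     // flags eligible HASH group-bys
//                                          // via GroupByDesc.setBucketGroup(true)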