
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;

/**
 * This transformation does bucket group by optimization: when the keys of a
 * map-side hash group by line up with the bucketing or sorting columns of
 * every input table, the group by is flagged as a bucket group by.
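 *
 * Illustrative only (the table and column names here are made up, not from
 * this codebase): a table created with
 *
 *   CREATE TABLE clustered_tbl (key STRING, value STRING)
 *   CLUSTERED BY (key) SORTED BY (key) INTO 32 BUCKETS;
 *
 * makes a query such as
 *
 *   SELECT key, count(1) FROM clustered_tbl GROUP BY key;
 *
 * a candidate for this optimization.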
 */
public class GroupByOptimizer implements Transform {

  private static final Log LOG = LogFactory.getLog(GroupByOptimizer.class
      .getName());

  public GroupByOptimizer() {
  }

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    GroupByOptProcCtx groupByOptimizeCtx = new GroupByOptProcCtx();

    // process group-by pattern
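    // "GBY%RS%GBY%" matches a GroupByOperator followed by a ReduceSinkOperator
    // followed by another GroupByOperator, i.e. the map-side/reduce-side
    // group by pair produced during plan generation.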
    opRules.put(new RuleRegExp("R1", "GBY%RS%GBY%"),
        getMapAggreSortedGroupbyProc(pctx));

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
        groupByOptimizeCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top operator nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pctx;
  }

  private NodeProcessor getDefaultProc() {
    return new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
          NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
        return null;
      }
    };
  }

  private NodeProcessor getMapAggreSortedGroupbyProc(ParseContext pctx) {
    return new BucketGroupByProcessor(pctx);
  }

  /**
   * BucketGroupByProcessor.
   *
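   * Inspects each GBY-RS-GBY match and flags the map-side group by as a
   * bucket group by when the group by columns line up with the bucket or
   * sort columns of every input table.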
   */
  public class BucketGroupByProcessor implements NodeProcessor {

    protected ParseContext pGraphContext;

    public BucketGroupByProcessor(ParseContext pGraphContext) {
      this.pGraphContext = pGraphContext;
    }

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // The matched pattern is GBY-RS-GBY (top to bottom), so the map-side
      // group by sits three entries from the top of the operator stack.
      GroupByOperator op = (GroupByOperator) stack.get(stack.size() - 3);
      checkBucketGroupBy(op);
      return null;
    }

    private void checkBucketGroupBy(GroupByOperator curr)
        throws SemanticException {

      // if this is not a HASH group by, return
      if (curr.getConf().getMode() != GroupByDesc.Mode.HASH) {
        return;
      }

      Set<String> tblNames = pGraphContext.getGroupOpToInputTables().get(curr);
      if (tblNames == null || tblNames.size() == 0) {
        return;
      }

      boolean bucketGroupBy = true;
      GroupByDesc desc = curr.getConf();
      List<ExprNodeDesc> groupByKeys = new LinkedList<ExprNodeDesc>();
      groupByKeys.addAll(desc.getKeys());
      // compute group by columns from group by keys
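      // Constants and nulls are ignored; field accesses and deterministic
      // UDF calls are decomposed into their underlying columns; any other
      // expression (or a non-deterministic UDF) disqualifies the rewrite.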
      List<String> groupByCols = new ArrayList<String>();
      while (groupByKeys.size() > 0) {
        ExprNodeDesc node = groupByKeys.remove(0);
        if (node instanceof ExprNodeColumnDesc) {
          groupByCols.addAll(node.getCols());
        } else if ((node instanceof ExprNodeConstantDesc)
            || (node instanceof ExprNodeNullDesc)) {
          // nothing
        } else if (node instanceof ExprNodeFieldDesc) {
          groupByKeys.add(0, ((ExprNodeFieldDesc) node).getDesc());
          continue;
        } else if (node instanceof ExprNodeGenericFuncDesc) {
          ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
          GenericUDF udf = udfNode.getGenericUDF();
          if (!FunctionRegistry.isDeterministic(udf)) {
            return;
          }
          groupByKeys.addAll(0, udfNode.getChildExprs());
        } else {
          return;
        }
      }

      if (groupByCols.size() == 0) {
        return;
      }

      for (String table : tblNames) {
        Operator<? extends Serializable> topOp = pGraphContext.getTopOps().get(
            table);
        if (topOp == null || (!(topOp instanceof TableScanOperator))) {
          // this is in a sub-query.
          // In the future, we need to infer the subquery's columns properly.
          // For example, in
          // "select key, count(1)
          // from (from clustergroupby select key, value where ds='210') group by key, 3;",
          // even though the group by op is in a subquery, it could be changed
          // to a bucket group by.
          return;
        }
        TableScanOperator ts = (TableScanOperator) topOp;
        Table destTable = pGraphContext.getTopToTable().get(ts);
        if (destTable == null) {
          return;
        }
        if (!destTable.isPartitioned()) {
          List<String> bucketCols = destTable.getBucketCols();
          List<String> sortCols = Utilities
              .getColumnNamesFromSortCols(destTable.getSortCols());
          bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
              sortCols);
          if (!bucketGroupBy) {
            return;
          }
        } else {
          PrunedPartitionList partsList = null;
          try {
            partsList = pGraphContext.getOpToPartList().get(ts);
            if (partsList == null) {
              partsList = PartitionPruner.prune(destTable, pGraphContext
                .getOpToPartPruner().get(ts), pGraphContext.getConf(), table,
                pGraphContext.getPrunedPartitions());
              pGraphContext.getOpToPartList().put(ts, partsList);
            }
          } catch (HiveException e) {
            // Has to use full name to make sure it does not conflict with
            // org.apache.commons.lang.StringUtils
            LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
            throw new SemanticException(e.getMessage(), e);
          }
          List<Partition> parts = new ArrayList<Partition>();
          parts.addAll(partsList.getConfirmedPartns());
          parts.addAll(partsList.getUnknownPartns());
          for (Partition part : parts) {
            List<String> bucketCols = part.getBucketCols();
            List<String> sortCols = part.getSortColNames();
            bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
                sortCols);
            if (!bucketGroupBy) {
              return;
            }
          }
        }
      }

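      // Every input table (and every pruned partition) matched on its bucket
      // or sort columns, so the map-side hash group by can be marked as a
      // bucket group by.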
      curr.getConf().setBucketGroup(bucketGroupBy);
    }

    /**
     * Given the group by keys, bucket columns, and sort columns, this method
     * determines whether we can use sorted group by or not.
     *
     * We use bucket columns only when the sorted column set is empty and all
     * group by columns are contained in the bucket columns.
     *
     * If we cannot determine a match by looking at the bucketed columns and
     * the table has sort columns, we fall back to the sort columns. We can
     * use bucket group by if the group by column set is an exact prefix match
     * of the sort columns.
     *
     * @param groupByCols
     * @param bucketCols
     * @param sortCols
     * @return
     * @throws SemanticException
     */
    private boolean matchBucketOrSortedColumns(List<String> groupByCols,
        List<String> bucketCols, List<String> sortCols) throws SemanticException {
      boolean ret = false;

      if (sortCols == null || sortCols.size() == 0) {
        ret = matchBucketColumns(groupByCols, bucketCols);
      }

      if (!ret && sortCols != null && sortCols.size() >= groupByCols.size()) {
        // check sort columns: if groupByCols is a prefix subset of the sort
        // columns, we will use sorted group by. For example, if data is
        // sorted by columns a, b, c, and a query wants to group by b, a, we
        // will use sorted group by. But if the query wants to group by b, c,
        // then sorted group by cannot be used.
        int num = groupByCols.size();
        for (int i = 0; i < num; i++) {
          // indexOf returns -1 when the group by column is not a sort column
          // at all; treat that as a failed match rather than letting it slip
          // through the prefix check.
          int idx = sortCols.indexOf(groupByCols.get(i));
          if (idx < 0 || idx > (num - 1)) {
            return false;
          }
        }
        return true;
      }

      return ret;
    }

    /*
     * All group by columns should be contained in the bucket column set, and
     * the number of group by columns should equal the number of bucket
     * columns.
     */
    private boolean matchBucketColumns(List<String> grpCols,
        List<String> tblBucketCols) throws SemanticException {

      if (tblBucketCols == null || tblBucketCols.size() == 0
          || grpCols.size() == 0 || grpCols.size() != tblBucketCols.size()) {
        return false;
      }

      for (int i = 0; i < grpCols.size(); i++) {
        String grpCol = grpCols.get(i);
        if (!tblBucketCols.contains(grpCol)) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * GroupByOptProcCtx.
   *
   */
  public class GroupByOptProcCtx implements NodeProcessorCtx {
  }
}