
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;

/**
 * This transformation does bucket group by optimization: when the keys of a
 * map-side hash group by line up with the bucketing or sorting columns of
 * every input table, the group by is flagged as a bucket group by.
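 *
 * Illustrative only (the table and column names here are made up, not from
 * this codebase): a table created with
 *
 *   CREATE TABLE clustered_tbl (key STRING, value STRING)
 *   CLUSTERED BY (key) SORTED BY (key) INTO 32 BUCKETS;
 *
 * makes a query such as
 *
 *   SELECT key, count(1) FROM clustered_tbl GROUP BY key;
 *
 * a candidate for this optimization.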
 */
public class GroupByOptimizer implements Transform {

  private static final Log LOG = LogFactory.getLog(GroupByOptimizer.class
      .getName());

  public GroupByOptimizer() {
  }

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    GroupByOptProcCtx groupByOptimizeCtx = new GroupByOptProcCtx();

    // process group-by pattern
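    // "GBY%RS%GBY%" matches a GroupByOperator followed by a ReduceSinkOperator
    // followed by another GroupByOperator, i.e. the map-side/reduce-side
    // group by pair produced during plan generation.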
    opRules.put(new RuleRegExp("R1", "GBY%RS%GBY%"),
        getMapAggreSortedGroupbyProc(pctx));

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
        groupByOptimizeCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top operator nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pctx;
  }

  private NodeProcessor getDefaultProc() {
    return new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
          NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
        return null;
      }
    };
  }

  private NodeProcessor getMapAggreSortedGroupbyProc(ParseContext pctx) {
    return new BucketGroupByProcessor(pctx);
  }

  /**
   * BucketGroupByProcessor.
   *
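   * Inspects each GBY-RS-GBY match and flags the map-side group by as a
   * bucket group by when the group by columns line up with the bucket or
   * sort columns of every input table.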
   */
  public class BucketGroupByProcessor implements NodeProcessor {

    protected ParseContext pGraphContext;

    public BucketGroupByProcessor(ParseContext pGraphContext) {
      this.pGraphContext = pGraphContext;
    }

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // The matched pattern is GBY-RS-GBY (top to bottom), so the map-side
      // group by sits three entries from the top of the operator stack.
      GroupByOperator op = (GroupByOperator) stack.get(stack.size() - 3);
      checkBucketGroupBy(op);
      return null;
    }

    private void checkBucketGroupBy(GroupByOperator curr)
        throws SemanticException {

      // if this is not a HASH group by, return
      if (curr.getConf().getMode() != GroupByDesc.Mode.HASH) {
        return;
      }

      Set<String> tblNames = pGraphContext.getGroupOpToInputTables().get(curr);
      if (tblNames == null || tblNames.size() == 0) {
        return;
      }

      boolean bucketGroupBy = true;
      GroupByDesc desc = curr.getConf();
      List<ExprNodeDesc> groupByKeys = new LinkedList<ExprNodeDesc>();
      groupByKeys.addAll(desc.getKeys());
      // compute group by columns from group by keys
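      // Constants and nulls are ignored; field accesses and deterministic
      // UDF calls are decomposed into their underlying columns; any other
      // expression (or a non-deterministic UDF) disqualifies the rewrite.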
      List<String> groupByCols = new ArrayList<String>();
      while (groupByKeys.size() > 0) {
        ExprNodeDesc node = groupByKeys.remove(0);
        if (node instanceof ExprNodeColumnDesc) {
          groupByCols.addAll(node.getCols());
        } else if ((node instanceof ExprNodeConstantDesc)
            || (node instanceof ExprNodeNullDesc)) {
          // nothing
        } else if (node instanceof ExprNodeFieldDesc) {
          groupByKeys.add(0, ((ExprNodeFieldDesc) node).getDesc());
          continue;
        } else if (node instanceof ExprNodeGenericFuncDesc) {
          ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
          GenericUDF udf = udfNode.getGenericUDF();
          if (!FunctionRegistry.isDeterministic(udf)) {
            return;
          }
          groupByKeys.addAll(0, udfNode.getChildExprs());
        } else {
          return;
        }
      }

      if (groupByCols.size() == 0) {
        return;
      }

      for (String table : tblNames) {
        Operator<? extends Serializable> topOp = pGraphContext.getTopOps().get(
            table);
        if (topOp == null || (!(topOp instanceof TableScanOperator))) {
          // this is in a sub-query.
          // In the future, we need to infer the subquery's columns properly.
          // For example, in
          // "select key, count(1)
          // from (from clustergroupby select key, value where ds='210') group by key, 3;",
          // even though the group by op is in a subquery, it could be changed
          // to a bucket group by.
          return;
        }
        TableScanOperator ts = (TableScanOperator) topOp;
        Table destTable = pGraphContext.getTopToTable().get(ts);
        if (destTable == null) {
          return;
        }
        if (!destTable.isPartitioned()) {
          List<String> bucketCols = destTable.getBucketCols();
          List<String> sortCols = Utilities
              .getColumnNamesFromSortCols(destTable.getSortCols());
          bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
              sortCols);
          if (!bucketGroupBy) {
            return;
          }
        } else {
          PrunedPartitionList partsList = null;
          try {
            partsList = pGraphContext.getOpToPartList().get(ts);
            if (partsList == null) {
              partsList = PartitionPruner.prune(destTable, pGraphContext
                .getOpToPartPruner().get(ts), pGraphContext.getConf(), table,
                pGraphContext.getPrunedPartitions());
              pGraphContext.getOpToPartList().put(ts, partsList);
            }
          } catch (HiveException e) {
            // Has to use full name to make sure it does not conflict with
            // org.apache.commons.lang.StringUtils
            LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
            throw new SemanticException(e.getMessage(), e);
          }
          List<Partition> parts = new ArrayList<Partition>();
          parts.addAll(partsList.getConfirmedPartns());
          parts.addAll(partsList.getUnknownPartns());
          for (Partition part : parts) {
            List<String> bucketCols = part.getBucketCols();
            List<String> sortCols = part.getSortColNames();
            bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
                sortCols);
            if (!bucketGroupBy) {
              return;
            }
          }
        }
      }

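      // Every input table (and every pruned partition) matched on its bucket
      // or sort columns, so the map-side hash group by can be marked as a
      // bucket group by.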
      curr.getConf().setBucketGroup(bucketGroupBy);
    }

    /**
     * Given the group by keys, bucket columns, and sort columns, this method
     * determines whether we can use sorted group by or not.
     *
     * We use bucket columns only when the sorted column set is empty and all
     * group by columns are contained in the bucket columns.
     *
     * If we cannot determine a match by looking at the bucketed columns and
     * the table has sort columns, we fall back to the sort columns. We can
     * use bucket group by if the group by column set is an exact prefix match
     * of the sort columns.
     *
     * @param groupByCols
     * @param bucketCols
     * @param sortCols
     * @return
     * @throws SemanticException
     */
    private boolean matchBucketOrSortedColumns(List<String> groupByCols,
        List<String> bucketCols, List<String> sortCols) throws SemanticException {
      boolean ret = false;

      if (sortCols == null || sortCols.size() == 0) {
        ret = matchBucketColumns(groupByCols, bucketCols);
      }

      if (!ret && sortCols != null && sortCols.size() >= groupByCols.size()) {
        // check sort columns: if groupByCols is a prefix subset of the sort
        // columns, we will use sorted group by. For example, if data is
        // sorted by columns a, b, c, and a query wants to group by b, a, we
        // will use sorted group by. But if the query wants to group by b, c,
        // then sorted group by cannot be used.
        int num = groupByCols.size();
        for (int i = 0; i < num; i++) {
          // indexOf returns -1 when the group by column is not a sort column
          // at all; treat that as a failed match rather than letting it slip
          // through the prefix check.
          int idx = sortCols.indexOf(groupByCols.get(i));
          if (idx < 0 || idx > (num - 1)) {
            return false;
          }
        }
        return true;
      }

      return ret;
    }

    /*
     * All group by columns should be contained in the bucket column set, and
     * the number of group by columns should equal the number of bucket
     * columns.
     */
    private boolean matchBucketColumns(List<String> grpCols,
        List<String> tblBucketCols) throws SemanticException {

      if (tblBucketCols == null || tblBucketCols.size() == 0
          || grpCols.size() == 0 || grpCols.size() != tblBucketCols.size()) {
        return false;
      }

      for (int i = 0; i < grpCols.size(); i++) {
        String grpCol = grpCols.get(i);
        if (!tblBucketCols.contains(grpCol)) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * GroupByOptProcCtx.
   *
   */
  public class GroupByOptProcCtx implements NodeProcessorCtx {
  }
}