
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SamplePruner.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;

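// Illustrative example (not part of this file): given a table created as
//   CREATE TABLE src (key INT, value STRING) CLUSTERED BY (key) INTO 4 BUCKETS;
// a query such as
//   SELECT * FROM src TABLESAMPLE (BUCKET 1 OUT OF 2 ON key) s;
// only needs to read bucket files 0 and 2, and this transform records that
// pruning opportunity for each qualifying table scan.
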
/**
 * The transformation step that does sample pruning.
 */
public class SamplePruner implements Transform {

  /**
   * SamplePrunerCtx.
   */
  public static class SamplePrunerCtx implements NodeProcessorCtx {
    HashMap<TableScanOperator, sampleDesc> opToSamplePruner;

    public SamplePrunerCtx(
        HashMap<TableScanOperator, sampleDesc> opToSamplePruner) {
      this.opToSamplePruner = opToSamplePruner;
    }

    /**
     * @return the opToSamplePruner
     */
    public HashMap<TableScanOperator, sampleDesc> getOpToSamplePruner() {
      return opToSamplePruner;
    }

    /**
     * @param opToSamplePruner
     *          the opToSamplePruner to set
     */
    public void setOpToSamplePruner(
        HashMap<TableScanOperator, sampleDesc> opToSamplePruner) {
      this.opToSamplePruner = opToSamplePruner;
    }
  }

  // The log
  private static final Log LOG = LogFactory
      .getLog("hive.ql.optimizer.SamplePruner");

  /*
   * (non-Javadoc)
   *
   * @see
   * org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop
   * .hive.ql.parse.ParseContext)
   */
  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    // create the context for walking operators
    SamplePrunerCtx samplePrunerCtx = new SamplePrunerCtx(pctx
        .getOpToSamplePruner());

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
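    // "TS%FIL%FIL%" matches a TableScanOperator followed by two
    // FilterOperators on the walked operator path; the TABLESAMPLE
    // predicate, when present, sits on that filter chain.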
    opRules.put(new RuleRegExp("R1", "(TS%FIL%FIL%)"), getFilterProc());

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
        samplePrunerCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top operator nodes to start walking from
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
  }

  /**
   * FilterPPR filter processor.
   */
  public static class FilterPPR implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FilterOperator filOp = (FilterOperator) nd;
      FilterDesc filOpDesc = filOp.getConf();
      sampleDesc sampleDescr = filOpDesc.getSampleDescr();

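      // Only a filter that carries a sampling descriptor which allows input
      // pruning is of interest here; anything else is left untouched.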
      if ((sampleDescr == null) || !sampleDescr.getInputPruning()) {
        return null;
      }

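      // The rule "TS%FIL%FIL%" guarantees the stack is exactly
      // [TableScanOperator, FilterOperator, FilterOperator], so the table
      // scan feeding this filter is at the bottom of the stack.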
      assert stack.size() == 3;
      TableScanOperator tsOp = (TableScanOperator) stack.get(0);
      ((SamplePrunerCtx) procCtx).getOpToSamplePruner().put(tsOp, sampleDescr);
      return null;
    }
  }

  public static NodeProcessor getFilterProc() {
    return new FilterPPR();
  }

  /**
   * DefaultPPR default processor which does nothing.
   */
  public static class DefaultPPR implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Nothing needs to be done.
      return null;
    }
  }

  public static NodeProcessor getDefaultProc() {
    return new DefaultPPR();
  }

  /**
   * Prunes to get all the files in the partition that satisfy the TABLESAMPLE
   * clause.
   *
   * @param part
   *          the partition to prune
   * @param sampleDescr
   *          the sampling descriptor (numerator, denominator and whether
   *          input pruning is allowed)
   * @return the paths of the files that need to be scanned
   * @throws SemanticException
   */
  @SuppressWarnings("nls")
  public static Path[] prune(Partition part, sampleDesc sampleDescr)
      throws SemanticException {
    int num = sampleDescr.getNumerator();
    int den = sampleDescr.getDenominator();
    int bucketCount = part.getBucketCount();
    String fullScanMsg = "";

    // check if input pruning is possible
    if (sampleDescr.getInputPruning()) {
      LOG.trace("numerator = " + num);
      LOG.trace("denominator = " + den);
      LOG.trace("bucket count = " + bucketCount);
      if (bucketCount == den) {
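        // One bucket holds exactly the sample: e.g. BUCKET 2 OUT OF 4 on a
        // 4-bucket table reads only bucket file num - 1 = 1 (0-based).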
        Path[] ret = new Path[1];
        ret[0] = part.getBucketPath(num - 1);
        return ret;
      } else if (bucketCount > den && bucketCount % den == 0) {
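        // The sample spans every den-th bucket: e.g. BUCKET 2 OUT OF 4 on an
        // 8-bucket table reads bucket files 1 and 5 (indices i * den + num - 1).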
        int numPathsInSample = bucketCount / den;
        Path[] ret = new Path[numPathsInSample];
        for (int i = 0; i < numPathsInSample; i++) {
          ret[i] = part.getBucketPath(i * den + num - 1);
        }
        return ret;
      } else if (bucketCount < den && den % bucketCount == 0) {
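        // One bucket contains a superset of the sample: e.g. BUCKET 3 OUT OF 4
        // on a 2-bucket table reads bucket file (3 - 1) % 2 = 0; the filter
        // operator still drops the rows that fall outside the sample.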
        Path[] ret = new Path[1];
        ret[0] = part.getBucketPath((num - 1) % bucketCount);
        return ret;
      } else {
        // need to do full scan
        fullScanMsg = "Tablesample denominator " + den
            + " is not multiple/divisor of bucket count " + bucketCount
            + " of table " + part.getTable().getTableName();
      }
    } else {
      // need to do full scan
      fullScanMsg = "Tablesample not on clustered columns";
    }
    LOG.warn(fullScanMsg + ", using full table scan");
    Path[] ret = part.getPath();
    return ret;
  }

}