
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SamplePruner.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
/**
 * The transformation step that does sample pruning.
 */
public class SamplePruner implements Transform {

  /**
   * SamplePrunerCtx.
   */
  public static class SamplePrunerCtx implements NodeProcessorCtx {
    HashMap<TableScanOperator, sampleDesc> opToSamplePruner;

    public SamplePrunerCtx(
        HashMap<TableScanOperator, sampleDesc> opToSamplePruner) {
      this.opToSamplePruner = opToSamplePruner;
    }

    /**
     * @return the opToSamplePruner
     */
    public HashMap<TableScanOperator, sampleDesc> getOpToSamplePruner() {
      return opToSamplePruner;
    }

    /**
     * @param opToSamplePruner
     *          the opToSamplePruner to set
     */
    public void setOpToSamplePruner(
        HashMap<TableScanOperator, sampleDesc> opToSamplePruner) {
      this.opToSamplePruner = opToSamplePruner;
    }
  }
  // The log
  private static final Log LOG = LogFactory
      .getLog("hive.ql.optimizer.SamplePruner");

  /*
   * (non-Javadoc)
   *
   * @see
   * org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop
   * .hive.ql.parse.ParseContext)
   */
  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    // create the context for walking operators
    SamplePrunerCtx samplePrunerCtx = new SamplePrunerCtx(pctx
        .getOpToSamplePruner());

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", "(TS%FIL%FIL%)"), getFilterProc());

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
        samplePrunerCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top op nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
  }
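  // Note: the rule pattern "(TS%FIL%FIL%)" above matches a TableScanOperator
  // followed by two FilterOperators on the walked operator stack, which is
  // why FilterPPR.process below can assert stack.size() == 3 and read the
  // table scan back from stack.get(0). (This reading of the RuleRegExp
  // pattern is inferred from that assert, not documented in this file.)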
  /**
   * FilterPPR filter processor.
   */
  public static class FilterPPR implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FilterOperator filOp = (FilterOperator) nd;
      FilterDesc filOpDesc = filOp.getConf();
      sampleDesc sampleDescr = filOpDesc.getSampleDescr();

      if ((sampleDescr == null) || !sampleDescr.getInputPruning()) {
        return null;
      }

      assert stack.size() == 3;
      TableScanOperator tsOp = (TableScanOperator) stack.get(0);
      ((SamplePrunerCtx) procCtx).getOpToSamplePruner().put(tsOp, sampleDescr);
      return null;
    }
  }

  public static NodeProcessor getFilterProc() {
    return new FilterPPR();
  }
  /**
   * DefaultPPR default processor which does nothing.
   */
  public static class DefaultPPR implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Nothing needs to be done.
      return null;
    }
  }

  public static NodeProcessor getDefaultProc() {
    return new DefaultPPR();
  }
  /**
   * Prunes to get all the files in the partition that satisfy the TABLESAMPLE
   * clause.
   *
   * @param part
   *          the partition to prune
   * @param sampleDescr
   *          the sample descriptor from the TABLESAMPLE clause
   * @return Path[]
   * @throws SemanticException
   */
  @SuppressWarnings("nls")
  public static Path[] prune(Partition part, sampleDesc sampleDescr)
      throws SemanticException {
    int num = sampleDescr.getNumerator();
    int den = sampleDescr.getDenominator();
    int bucketCount = part.getBucketCount();
    String fullScanMsg = "";

    // check if input pruning is possible
    if (sampleDescr.getInputPruning()) {
      LOG.trace("numerator = " + num);
      LOG.trace("denominator = " + den);
      LOG.trace("bucket count = " + bucketCount);
      if (bucketCount == den) {
        // the sample maps to exactly one bucket file
        Path[] ret = new Path[1];
        ret[0] = part.getBucketPath(num - 1);
        return (ret);
      } else if (bucketCount > den && bucketCount % den == 0) {
        // the sample maps to every den-th bucket, starting from bucket num - 1
        int numPathsInSample = bucketCount / den;
        Path[] ret = new Path[numPathsInSample];
        for (int i = 0; i < numPathsInSample; i++) {
          ret[i] = part.getBucketPath(i * den + num - 1);
        }
        return ret;
      } else if (bucketCount < den && den % bucketCount == 0) {
        // the sample is a fraction of a single bucket; scan only that bucket
        Path[] ret = new Path[1];
        ret[0] = part.getBucketPath((num - 1) % bucketCount);
        return ret;
      } else {
        // need to do full scan
        fullScanMsg = "Tablesample denominator " + den
            + " is not multiple/divisor of bucket count " + bucketCount
            + " of table " + part.getTable().getTableName();
      }
    } else {
      // need to do full scan
      fullScanMsg = "Tablesample not on clustered columns";
    }
    LOG.warn(fullScanMsg + ", using full table scan");
    Path[] ret = part.getPath();
    return ret;
  }
}
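
To make the bucket arithmetic in prune() concrete, here is a minimal standalone sketch. It is not part of the Hive source: the class BucketSampleDemo and the method sampledBuckets are invented for illustration, and plain integer indices stand in for Partition bucket paths. Only the index arithmetic is taken from the method above; it prints which 0-based bucket files a TABLESAMPLE(BUCKET num OUT OF den) query would read.

import java.util.ArrayList;
import java.util.List;

/** Hypothetical demo mirroring SamplePruner.prune()'s bucket index arithmetic. */
public class BucketSampleDemo {

  /** Returns the 0-based bucket indices the sample would read; empty means full scan. */
  static List<Integer> sampledBuckets(int num, int den, int bucketCount) {
    List<Integer> buckets = new ArrayList<Integer>();
    if (bucketCount == den) {
      // Case 1: one bucket per sample group, e.g. 2 OUT OF 4 on 4 buckets -> bucket 1
      buckets.add(num - 1);
    } else if (bucketCount > den && bucketCount % den == 0) {
      // Case 2: every den-th bucket, e.g. 2 OUT OF 4 on 8 buckets -> buckets 1 and 5
      for (int i = 0; i < bucketCount / den; i++) {
        buckets.add(i * den + num - 1);
      }
    } else if (bucketCount < den && den % bucketCount == 0) {
      // Case 3: a fraction of one bucket, e.g. 5 OUT OF 8 on 4 buckets -> bucket 0
      buckets.add((num - 1) % bucketCount);
    }
    // falling through with an empty list mirrors prune()'s full-scan fallback
    return buckets;
  }

  public static void main(String[] args) {
    System.out.println(sampledBuckets(2, 4, 4)); // [1]
    System.out.println(sampledBuckets(2, 4, 8)); // [1, 5]
    System.out.println(sampledBuckets(5, 8, 4)); // [0]
    System.out.println(sampledBuckets(2, 3, 4)); // [] -> full scan
  }
}

An empty result corresponds to the full-scan branch in prune(): when the denominator is neither a multiple nor a divisor of the bucket count, the bucket files no longer line up with the sample groups, so every file must be scanned.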