PageRenderTime 58ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java

#
Java | 543 lines | 368 code | 47 blank | 128 comment | 77 complexity | 9664a05a34c0860216b1ea9e462bebf4 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.ppd;
  19. import java.io.Serializable;
  20. import java.util.ArrayList;
  21. import java.util.HashSet;
  22. import java.util.Iterator;
  23. import java.util.List;
  24. import java.util.Map;
  25. import java.util.Set;
  26. import java.util.Stack;
  27. import java.util.Map.Entry;
  28. import org.apache.commons.logging.Log;
  29. import org.apache.commons.logging.LogFactory;
  30. import org.apache.hadoop.hive.conf.HiveConf;
  31. import org.apache.hadoop.hive.ql.exec.FilterOperator;
  32. import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
  33. import org.apache.hadoop.hive.ql.exec.JoinOperator;
  34. import org.apache.hadoop.hive.ql.exec.Operator;
  35. import org.apache.hadoop.hive.ql.exec.OperatorFactory;
  36. import org.apache.hadoop.hive.ql.exec.RowSchema;
  37. import org.apache.hadoop.hive.ql.exec.TableScanOperator;
  38. import org.apache.hadoop.hive.ql.exec.Utilities;
  39. import org.apache.hadoop.hive.ql.lib.Node;
  40. import org.apache.hadoop.hive.ql.lib.NodeProcessor;
  41. import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
  42. import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
  43. import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
  44. import org.apache.hadoop.hive.ql.metadata.HiveUtils;
  45. import org.apache.hadoop.hive.ql.metadata.Table;
  46. import org.apache.hadoop.hive.ql.parse.OpParseContext;
  47. import org.apache.hadoop.hive.ql.parse.RowResolver;
  48. import org.apache.hadoop.hive.ql.parse.SemanticException;
  49. import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
  50. import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
  51. import org.apache.hadoop.hive.ql.plan.FilterDesc;
  52. import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
  53. import org.apache.hadoop.hive.ql.plan.JoinDesc;
  54. import org.apache.hadoop.hive.ql.plan.TableScanDesc;
  55. import org.apache.hadoop.hive.serde2.Deserializer;
  56. import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
  57. import org.apache.hadoop.mapred.JobConf;
  58. /**
  59. * Operator factory for predicate pushdown processing of the operator graph.
  60. * Each operator determines the pushdown predicates by walking the expression
  61. * tree and merges its own pushdown predicates with those of its children.
  62. * Finally, the TableScan operator gathers all the predicates and inserts a
  63. * filter operator after itself. TODO: Further optimizations: 1) Multi-insert
  64. * case; 2) Create a filter operator for those predicates that couldn't be
  65. * pushed to the previous operators in the data flow; 3) Merge multiple
  66. * sequential filter predicates into one so that plans are more readable;
  67. * 4) Remove predicates from filter operators that have been pushed. Currently
  68. * these pushed predicates are evaluated twice.
  69. */
  70. public final class OpProcFactory {
  71. protected static final Log LOG = LogFactory.getLog(OpProcFactory.class
  72. .getName());
  73. /**
  74. * Processor for Script Operator Prevents any predicates being pushed.
  75. */
  76. public static class ScriptPPD extends DefaultPPD implements NodeProcessor {
  77. @Override
  78. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  79. Object... nodeOutputs) throws SemanticException {
  80. LOG.info("Processing for " + nd.getName() + "("
  81. + ((Operator) nd).getIdentifier() + ")");
  82. // script operator is a black-box to hive so no optimization here
  83. // assuming that nothing can be pushed above the script op
  84. // same with LIMIT op
  85. return null;
  86. }
  87. }
  88. public static class LateralViewForwardPPD extends DefaultPPD implements NodeProcessor {
  89. @Override
  90. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  91. Object... nodeOutputs) throws SemanticException {
  92. LOG.info("Processing for " + nd.getName() + "("
  93. + ((Operator) nd).getIdentifier() + ")");
  94. OpWalkerInfo owi = (OpWalkerInfo) procCtx;
  95. ExprWalkerInfo childPreds = owi
  96. .getPrunedPreds((Operator<? extends Serializable>) nd.getChildren()
  97. .get(0));
  98. owi.putPrunedPreds((Operator<? extends Serializable>) nd, childPreds);
  99. return null;
  100. }
  101. }
  102. /**
  103. * Combines predicates of its child into a single expression and adds a filter
  104. * op as new child.
  105. */
  106. public static class TableScanPPD extends DefaultPPD implements NodeProcessor {
  107. @Override
  108. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  109. Object... nodeOutputs) throws SemanticException {
  110. LOG.info("Processing for " + nd.getName() + "("
  111. + ((Operator) nd).getIdentifier() + ")");
  112. OpWalkerInfo owi = (OpWalkerInfo) procCtx;
  113. TableScanOperator tsOp = (TableScanOperator) nd;
  114. mergeWithChildrenPred(tsOp, owi, null, null, false);
  115. ExprWalkerInfo pushDownPreds = owi.getPrunedPreds(tsOp);
  116. return createFilter(tsOp, pushDownPreds, owi);
  117. }
  118. }
  119. /**
  120. * Determines the push down predicates in its where expression and then
  121. * combines it with the push down predicates that are passed from its children.
  122. */
  123. public static class FilterPPD extends DefaultPPD implements NodeProcessor {
  124. @Override
  125. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  126. Object... nodeOutputs) throws SemanticException {
  127. LOG.info("Processing for " + nd.getName() + "("
  128. + ((Operator) nd).getIdentifier() + ")");
  129. OpWalkerInfo owi = (OpWalkerInfo) procCtx;
  130. Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;
  131. ExprNodeDesc predicate = (((FilterOperator) nd).getConf()).getPredicate();
  132. // get pushdown predicates for this operator's predicate
  133. ExprWalkerInfo ewi = ExprWalkerProcFactory.extractPushdownPreds(owi, op,
  134. predicate);
  135. if (!ewi.isDeterministic()) {
  136. /* predicate is not deterministic */
  137. if (op.getChildren() != null && op.getChildren().size() == 1) {
  138. createFilter(op, owi
  139. .getPrunedPreds((Operator<? extends Serializable>) (op
  140. .getChildren().get(0))), owi);
  141. }
  142. return null;
  143. }
  144. logExpr(nd, ewi);
  145. owi.putPrunedPreds(op, ewi);
  146. // merge it with children predicates
  147. mergeWithChildrenPred(op, owi, ewi, null, false);
  148. return null;
  149. }
  150. }
  151. /**
  152. * Determines predicates for which alias can be pushed to it's parents. See
  153. * the comments for getQualifiedAliases function.
  154. */
  155. public static class JoinPPD extends DefaultPPD implements NodeProcessor {
  156. @Override
  157. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  158. Object... nodeOutputs) throws SemanticException {
  159. LOG.info("Processing for " + nd.getName() + "("
  160. + ((Operator) nd).getIdentifier() + ")");
  161. OpWalkerInfo owi = (OpWalkerInfo) procCtx;
  162. Set<String> aliases = getQualifiedAliases((JoinOperator) nd, owi
  163. .getRowResolver(nd));
  164. mergeWithChildrenPred(nd, owi, null, aliases, false);
  165. return null;
  166. }
  167. /**
  168. * Figures out the aliases for whom it is safe to push predicates based on
  169. * ANSI SQL semantics For inner join, all predicates for all aliases can be
  170. * pushed For full outer join, none of the predicates can be pushed as that
  171. * would limit the number of rows for join For left outer join, all the
  172. * predicates on the left side aliases can be pushed up For right outer
  173. * join, all the predicates on the right side aliases can be pushed up Joins
  174. * chain containing both left and right outer joins are treated as full
  175. * outer join. TODO: further optimization opportunity for the case a.c1 =
  176. * b.c1 and b.c2 = c.c2 a and b are first joined and then the result with c.
  177. * But the second join op currently treats a and b as separate aliases and
  178. * thus disallowing predicate expr containing both tables a and b (such as
  179. * a.c3 + a.c4 > 20). Such predicates also can be pushed just above the
  180. * second join and below the first join
  181. *
  182. * @param op
  183. * Join Operator
  184. * @param rr
  185. * Row resolver
  186. * @return set of qualified aliases
  187. */
  188. private Set<String> getQualifiedAliases(JoinOperator op, RowResolver rr) {
  189. Set<String> aliases = new HashSet<String>();
  190. int loj = Integer.MAX_VALUE;
  191. int roj = -1;
  192. boolean oj = false;
  193. JoinCondDesc[] conds = op.getConf().getConds();
  194. Map<Integer, Set<String>> posToAliasMap = op.getPosToAliasMap();
  195. for (JoinCondDesc jc : conds) {
  196. if (jc.getType() == JoinDesc.FULL_OUTER_JOIN) {
  197. oj = true;
  198. break;
  199. } else if (jc.getType() == JoinDesc.LEFT_OUTER_JOIN) {
  200. if (jc.getLeft() < loj) {
  201. loj = jc.getLeft();
  202. }
  203. } else if (jc.getType() == JoinDesc.RIGHT_OUTER_JOIN) {
  204. if (jc.getRight() > roj) {
  205. roj = jc.getRight();
  206. }
  207. }
  208. }
  209. if (oj || (loj != Integer.MAX_VALUE && roj != -1)) {
  210. return aliases;
  211. }
  212. for (Entry<Integer, Set<String>> pa : posToAliasMap.entrySet()) {
  213. if (loj != Integer.MAX_VALUE) {
  214. if (pa.getKey() <= loj) {
  215. aliases.addAll(pa.getValue());
  216. }
  217. } else if (roj != -1) {
  218. if (pa.getKey() >= roj) {
  219. aliases.addAll(pa.getValue());
  220. }
  221. } else {
  222. aliases.addAll(pa.getValue());
  223. }
  224. }
  225. Set<String> aliases2 = rr.getTableNames();
  226. aliases.retainAll(aliases2);
  227. return aliases;
  228. }
  229. }
  230. /**
  231. * Processor for ReduceSink operator.
  232. *
  233. */
  234. public static class ReduceSinkPPD extends DefaultPPD implements NodeProcessor {
  235. @Override
  236. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  237. Object... nodeOutputs) throws SemanticException {
  238. LOG.info("Processing for " + nd.getName() + "("
  239. + ((Operator) nd).getIdentifier() + ")");
  240. OpWalkerInfo owi = (OpWalkerInfo) procCtx;
  241. Set<String> aliases = owi.getRowResolver(nd).getTableNames();
  242. boolean ignoreAliases = false;
  243. if (aliases.size() == 1 && aliases.contains("")) {
  244. // Reduce sink of group by operator
  245. ignoreAliases = true;
  246. }
  247. mergeWithChildrenPred(nd, owi, null, aliases, ignoreAliases);
  248. return null;
  249. }
  250. }
  251. /**
  252. * Default processor which just merges its children.
  253. */
  254. public static class DefaultPPD implements NodeProcessor {
  255. @Override
  256. public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
  257. Object... nodeOutputs) throws SemanticException {
  258. LOG.info("Processing for " + nd.getName() + "("
  259. + ((Operator) nd).getIdentifier() + ")");
  260. mergeWithChildrenPred(nd, (OpWalkerInfo) procCtx, null, null, false);
  261. return null;
  262. }
  263. /**
  264. * @param nd
  265. * @param ewi
  266. */
  267. protected void logExpr(Node nd, ExprWalkerInfo ewi) {
  268. for (Entry<String, List<ExprNodeDesc>> e : ewi.getFinalCandidates()
  269. .entrySet()) {
  270. LOG.info("Pushdown Predicates of " + nd.getName() + " For Alias : "
  271. + e.getKey());
  272. for (ExprNodeDesc n : e.getValue()) {
  273. LOG.info("\t" + n.getExprString());
  274. }
  275. }
  276. }
  277. /**
  278. * Take current operators pushdown predicates and merges them with
  279. * children's pushdown predicates.
  280. *
  281. * @param nd
  282. * current operator
  283. * @param owi
  284. * operator context during this walk
  285. * @param ewi
  286. * pushdown predicates (part of expression walker info)
  287. * @param aliases
  288. * aliases that this operator can pushdown. null means that all
  289. * aliases can be pushed down
  290. * @param ignoreAliases
  291. * @throws SemanticException
  292. */
  293. protected void mergeWithChildrenPred(Node nd, OpWalkerInfo owi,
  294. ExprWalkerInfo ewi, Set<String> aliases, boolean ignoreAliases)
  295. throws SemanticException {
  296. if (nd.getChildren() == null || nd.getChildren().size() > 1) {
  297. // ppd for multi-insert query is not yet implemented
  298. // no-op for leafs
  299. return;
  300. }
  301. Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;
  302. ExprWalkerInfo childPreds = owi
  303. .getPrunedPreds((Operator<? extends Serializable>) nd.getChildren()
  304. .get(0));
  305. if (childPreds == null) {
  306. return;
  307. }
  308. if (ewi == null) {
  309. ewi = new ExprWalkerInfo();
  310. }
  311. for (Entry<String, List<ExprNodeDesc>> e : childPreds
  312. .getFinalCandidates().entrySet()) {
  313. if (ignoreAliases || aliases == null || aliases.contains(e.getKey())
  314. || e.getKey() == null) {
  315. // e.getKey() (alias) can be null in case of constant expressions. see
  316. // input8.q
  317. ExprWalkerInfo extractPushdownPreds = ExprWalkerProcFactory
  318. .extractPushdownPreds(owi, op, e.getValue());
  319. ewi.merge(extractPushdownPreds);
  320. logExpr(nd, extractPushdownPreds);
  321. }
  322. }
  323. owi.putPrunedPreds((Operator<? extends Serializable>) nd, ewi);
  324. }
  325. }
  326. protected static Object createFilter(Operator op,
  327. ExprWalkerInfo pushDownPreds, OpWalkerInfo owi) {
  328. if (pushDownPreds == null || pushDownPreds.getFinalCandidates() == null
  329. || pushDownPreds.getFinalCandidates().size() == 0) {
  330. return null;
  331. }
  332. RowResolver inputRR = owi.getRowResolver(op);
  333. // combine all predicates into a single expression
  334. List<ExprNodeDesc> preds = null;
  335. ExprNodeDesc condn = null;
  336. Iterator<List<ExprNodeDesc>> iterator = pushDownPreds.getFinalCandidates()
  337. .values().iterator();
  338. while (iterator.hasNext()) {
  339. preds = iterator.next();
  340. int i = 0;
  341. if (condn == null) {
  342. condn = preds.get(0);
  343. i++;
  344. }
  345. for (; i < preds.size(); i++) {
  346. List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>(2);
  347. children.add(condn);
  348. children.add(preds.get(i));
  349. condn = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
  350. FunctionRegistry.getGenericUDFForAnd(), children);
  351. }
  352. }
  353. if (condn == null) {
  354. return null;
  355. }
  356. if (op instanceof TableScanOperator) {
  357. boolean pushFilterToStorage;
  358. HiveConf hiveConf = owi.getParseContext().getConf();
  359. pushFilterToStorage =
  360. hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTPPD_STORAGE);
  361. if (pushFilterToStorage) {
  362. condn = pushFilterToStorageHandler(
  363. (TableScanOperator) op,
  364. condn,
  365. owi,
  366. hiveConf);
  367. if (condn == null) {
  368. // we pushed the whole thing down
  369. return null;
  370. }
  371. }
  372. }
  373. // add new filter op
  374. List<Operator<? extends Serializable>> originalChilren = op
  375. .getChildOperators();
  376. op.setChildOperators(null);
  377. Operator<FilterDesc> output = OperatorFactory.getAndMakeChild(
  378. new FilterDesc(condn, false), new RowSchema(inputRR.getColumnInfos()),
  379. op);
  380. output.setChildOperators(originalChilren);
  381. for (Operator<? extends Serializable> ch : originalChilren) {
  382. List<Operator<? extends Serializable>> parentOperators = ch
  383. .getParentOperators();
  384. int pos = parentOperators.indexOf(op);
  385. assert pos != -1;
  386. parentOperators.remove(pos);
  387. parentOperators.add(pos, output); // add the new op as the old
  388. }
  389. OpParseContext ctx = new OpParseContext(inputRR);
  390. owi.put(output, ctx);
  391. return output;
  392. }
  393. /**
  394. * Attempts to push a predicate down into a storage handler. For
  395. * native tables, this is a no-op.
  396. *
  397. * @param tableScanOp table scan against which predicate applies
  398. *
  399. * @param originalPredicate predicate to be pushed down
  400. *
  401. * @param owi object walk info
  402. *
  403. * @param hiveConf Hive configuration
  404. *
  405. * @return portion of predicate which needs to be evaluated
  406. * by Hive as a post-filter, or null if it was possible
  407. * to push down the entire predicate
  408. */
  409. private static ExprNodeDesc pushFilterToStorageHandler(
  410. TableScanOperator tableScanOp,
  411. ExprNodeDesc originalPredicate,
  412. OpWalkerInfo owi,
  413. HiveConf hiveConf) {
  414. TableScanDesc tableScanDesc = tableScanOp.getConf();
  415. Table tbl = owi.getParseContext().getTopToTable().get(tableScanOp);
  416. if (!tbl.isNonNative()) {
  417. return originalPredicate;
  418. }
  419. HiveStorageHandler storageHandler = tbl.getStorageHandler();
  420. if (!(storageHandler instanceof HiveStoragePredicateHandler)) {
  421. // The storage handler does not provide predicate decomposition
  422. // support, so we'll implement the entire filter in Hive. However,
  423. // we still provide the full predicate to the storage handler in
  424. // case it wants to do any of its own prefiltering.
  425. tableScanDesc.setFilterExpr(originalPredicate);
  426. return originalPredicate;
  427. }
  428. HiveStoragePredicateHandler predicateHandler =
  429. (HiveStoragePredicateHandler) storageHandler;
  430. JobConf jobConf = new JobConf(owi.getParseContext().getConf());
  431. Utilities.setColumnNameList(jobConf, tableScanOp);
  432. Utilities.copyTableJobPropertiesToConf(
  433. Utilities.getTableDesc(tbl),
  434. jobConf);
  435. Deserializer deserializer = tbl.getDeserializer();
  436. HiveStoragePredicateHandler.DecomposedPredicate decomposed =
  437. predicateHandler.decomposePredicate(
  438. jobConf,
  439. deserializer,
  440. originalPredicate);
  441. if (decomposed == null) {
  442. // not able to push anything down
  443. if (LOG.isDebugEnabled()) {
  444. LOG.debug("No pushdown possible for predicate: "
  445. + originalPredicate.getExprString());
  446. }
  447. return originalPredicate;
  448. }
  449. if (LOG.isDebugEnabled()) {
  450. LOG.debug("Original predicate: "
  451. + originalPredicate.getExprString());
  452. if (decomposed.pushedPredicate != null) {
  453. LOG.debug(
  454. "Pushed predicate: "
  455. + decomposed.pushedPredicate.getExprString());
  456. }
  457. if (decomposed.residualPredicate != null) {
  458. LOG.debug(
  459. "Residual predicate: "
  460. + decomposed.residualPredicate.getExprString());
  461. }
  462. }
  463. tableScanDesc.setFilterExpr(decomposed.pushedPredicate);
  464. return decomposed.residualPredicate;
  465. }
  /**
   * @return a new processor for Filter operators
   */
  public static NodeProcessor getFilterProc() {
    return new FilterPPD();
  }

  /**
   * @return a new processor for Join operators
   */
  public static NodeProcessor getJoinProc() {
    return new JoinPPD();
  }

  /**
   * @return a new processor for ReduceSink operators
   */
  public static NodeProcessor getRSProc() {
    return new ReduceSinkPPD();
  }

  /**
   * @return a new processor for TableScan operators
   */
  public static NodeProcessor getTSProc() {
    return new TableScanPPD();
  }

  /**
   * @return a new default processor, which only merges child predicates
   */
  public static NodeProcessor getDefaultProc() {
    return new DefaultPPD();
  }

  /**
   * @return a new processor for Script operators
   */
  public static NodeProcessor getSCRProc() {
    return new ScriptPPD();
  }

  /**
   * @return a new processor for Limit operators; the same kind of processor
   *         as for Script operators
   */
  public static NodeProcessor getLIMProc() {
    return getSCRProc();
  }

  /**
   * @return a new processor for UDTF operators; the same kind of processor
   *         as for Script operators
   */
  public static NodeProcessor getUDTFProc() {
    return getSCRProc();
  }

  /**
   * @return a new processor for LateralViewForward operators
   */
  public static NodeProcessor getLVFProc() {
    return new LateralViewForwardPPD();
  }

  /** Static factory only; prevents instantiation. */
  private OpProcFactory() {
  }
  496. }