
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.lineage;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.ForwardOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.hooks.LineageInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyType;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.TableAliasInfo;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Utils;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;

/**
 * Operator factory for the rule processors for lineage.
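 *
 * <p>Typical wiring (an illustrative sketch, not code from this file; the rule
 * pattern strings are assumptions): the lineage Generator binds operator patterns
 * to these processors in a rule map and walks the plan with a rule dispatcher,
 * roughly:
 *
 * <pre>
 *   Map&lt;Rule, NodeProcessor&gt; opRules = new LinkedHashMap&lt;Rule, NodeProcessor&gt;();
 *   opRules.put(new RuleRegExp("R1", "TS%"), OpProcFactory.getTSProc());
 *   opRules.put(new RuleRegExp("R2", "SEL%"), OpProcFactory.getSelProc());
 *   Dispatcher disp = new DefaultRuleDispatcher(OpProcFactory.getDefaultProc(),
 *       opRules, lineageCtx);
 * </pre>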
 */
public class OpProcFactory {

  /**
   * Returns the parent operator in the walk path to the current operator.
   *
   * @param stack The stack encoding the path.
   *
   * @return Operator The parent operator in the current path.
   */
  protected static Operator<? extends Serializable> getParent(Stack<Node> stack) {
    return (Operator<? extends Serializable>) Utils.getNthAncestor(stack, 1);
  }

  /**
   * Processor for Script and UDTF Operators.
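   *
   * <p>Example (illustrative): for a query like
   * {@code SELECT TRANSFORM(a, b) USING '/bin/cat' AS (x, y) FROM t}, the script is
   * an opaque boundary, so both x and y receive one shared SCRIPT dependency on the
   * union of the base columns feeding a and b.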
   */
  public static class TransformLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;

      // The operators
      Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;
      Operator<? extends Serializable> inpOp = getParent(stack);

      // Create a single dependency list by concatenating the dependencies of all
      // the cols
      Dependency dep = new Dependency();
      DependencyType new_type = LineageInfo.DependencyType.SCRIPT;
      dep.setType(LineageInfo.DependencyType.SCRIPT);
      // TODO: Fix this to a non null value.
      dep.setExpr(null);

      LinkedHashSet<BaseColumnInfo> col_set = new LinkedHashSet<BaseColumnInfo>();
      for (ColumnInfo ci : inpOp.getSchema().getSignature()) {
        Dependency d = lCtx.getIndex().getDependency(inpOp, ci);
        if (d != null) {
          new_type = LineageCtx.getNewDependencyType(d.getType(), new_type);
          col_set.addAll(d.getBaseCols());
        }
      }
      dep.setType(new_type);
      dep.setBaseCols(new ArrayList<BaseColumnInfo>(col_set));

      // This dependency is then set for all the colinfos of the script operator
      for (ColumnInfo ci : op.getSchema().getSignature()) {
        lCtx.getIndex().putDependency(op, ci, dep);
      }

      return null;
    }
  }

  /**
   * Processor for TableScan Operator. This actually creates the base column mappings.
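   *
   * <p>Example (illustrative): scanning a table {@code src(key, value)} seeds each
   * output column with a SIMPLE dependency on the matching table column, e.g.
   * {@code key -> src.key}; virtual columns are mapped the same way.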
   */
  public static class TableScanLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      ParseContext pctx = lCtx.getParseCtx();

      // Table scan operator.
      TableScanOperator top = (TableScanOperator) nd;
      org.apache.hadoop.hive.ql.metadata.Table t = pctx.getTopToTable().get(top);
      Table tab = t.getTTable();

      // Generate the mappings
      RowSchema rs = top.getSchema();
      List<FieldSchema> cols = t.getAllCols();
      Map<String, FieldSchema> fieldSchemaMap = new HashMap<String, FieldSchema>();
      for (FieldSchema col : cols) {
        fieldSchemaMap.put(col.getName(), col);
      }

      Iterator<VirtualColumn> vcs = VirtualColumn.registry.values().iterator();
      while (vcs.hasNext()) {
        VirtualColumn vc = vcs.next();
        fieldSchemaMap.put(vc.getName(), new FieldSchema(vc.getName(),
            vc.getTypeInfo().getTypeName(), ""));
      }

      TableAliasInfo tai = new TableAliasInfo();
      tai.setAlias(top.getConf().getAlias());
      tai.setTable(tab);
      for (ColumnInfo ci : rs.getSignature()) {
        // Create a dependency
        Dependency dep = new Dependency();
        BaseColumnInfo bci = new BaseColumnInfo();
        bci.setTabAlias(tai);
        bci.setColumn(fieldSchemaMap.get(ci.getInternalName()));

        // Populate the dependency
        dep.setType(LineageInfo.DependencyType.SIMPLE);
        // TODO: Find out how to get the expression here.
        dep.setExpr(null);
        dep.setBaseCols(new ArrayList<BaseColumnInfo>());
        dep.getBaseCols().add(bci);

        // Put the dependency in the map
        lCtx.getIndex().putDependency(top, ci, dep);
      }

      return null;
    }
  }

  /**
   * Processor for Join Operator.
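   *
   * <p>Example (illustrative): in {@code FROM a JOIN b ON ...}, the walker visits the
   * join once per parent ReduceSink; the tag (0 for a, 1 for b) selects which side's
   * output columns get their dependencies merged on this visit.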
   */
  public static class JoinLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should never
      // be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      JoinOperator op = (JoinOperator) nd;
      JoinDesc jd = op.getConf();

      // The input operator to the join is always a reduce sink operator
      ReduceSinkOperator inpOp = (ReduceSinkOperator) getParent(stack);
      ReduceSinkDesc rd = inpOp.getConf();
      int tag = rd.getTag();

      // Iterate over the outputs of the join operator and merge the
      // dependencies of the columns that correspond to the tag.
      int cnt = 0;
      List<ExprNodeDesc> exprs = jd.getExprs().get((byte) tag);
      for (ColumnInfo ci : op.getSchema().getSignature()) {
        if (jd.getReversedExprs().get(ci.getInternalName()) != tag) {
          continue;
        }
        // Otherwise look up the expression corresponding to this ci
        ExprNodeDesc expr = exprs.get(cnt++);
        lCtx.getIndex().mergeDependency(op, ci,
            ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
      }

      return null;
    }
  }

  /**
   * Processor for Lateral View Join Operator.
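   *
   * <p>Example (illustrative): for
   * {@code SELECT ... FROM t LATERAL VIEW explode(arr) x AS c}, this processor is
   * visited once from the select branch and once from the UDTF branch, and each visit
   * maps its branch's slice of the join's output schema (one slice at the front, one
   * at the back) onto the corresponding input columns.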
   */
  public static class LateralViewJoinLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should never
      // be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      LateralViewJoinOperator op = (LateralViewJoinOperator) nd;
      boolean isUdtfPath = true;
      Operator<? extends Serializable> inpOp = getParent(stack);
      ArrayList<ColumnInfo> cols = inpOp.getSchema().getSignature();

      if (inpOp instanceof SelectOperator) {
        isUdtfPath = false;
      }

      // Dirty hack!!
      // For the select path the columns are the ones at the end of the
      // current operator's schema and for the udtf path the columns are
      // at the beginning of the operator schema.
      ArrayList<ColumnInfo> out_cols = op.getSchema().getSignature();
      int out_cols_size = out_cols.size();
      int cols_size = cols.size();
      if (isUdtfPath) {
        int cnt = 0;
        while (cnt < cols_size) {
          lCtx.getIndex().mergeDependency(op, out_cols.get(cnt),
              lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
          cnt++;
        }
      } else {
        int cnt = cols_size - 1;
        while (cnt >= 0) {
          lCtx.getIndex().mergeDependency(op, out_cols.get(out_cols_size - cols_size + cnt),
              lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
          cnt--;
        }
      }

      return null;
    }
  }

  /**
   * Processor for Select operator.
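   *
   * <p>Example (illustrative): for {@code SELECT key + 1, value FROM src}, the first
   * output column gets an EXPRESSION dependency on src.key and the second a SIMPLE
   * dependency on src.value, as computed by ExprProcFactory.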
   */
  public static class SelectLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      LineageCtx lctx = (LineageCtx) procCtx;
      SelectOperator sop = (SelectOperator) nd;

      // If this is a selStarNoCompute then this select operator
      // is treated like a default operator, so just call the super class's
      // process method.
      if (sop.getConf().isSelStarNoCompute()) {
        return super.process(nd, stack, procCtx, nodeOutputs);
      }

      // Otherwise we treat this as a normal select operator and look at
      // the expressions.
      ArrayList<ColumnInfo> col_infos = sop.getSchema().getSignature();
      int cnt = 0;
      for (ExprNodeDesc expr : sop.getConf().getColList()) {
        lctx.getIndex().putDependency(sop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lctx, getParent(stack), expr));
      }

      return null;
    }
  }

  /**
   * Processor for GroupBy operator.
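   *
   * <p>Example (illustrative): for {@code SELECT key, count(1) FROM src GROUP BY key},
   * the key column keeps its dependency on src.key, while count(1) has no column
   * inputs at all, so it is marked as depending on the table src itself (a
   * BaseColumnInfo with a null column).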
   */
  public static class GroupByLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      LineageCtx lctx = (LineageCtx) procCtx;
      GroupByOperator gop = (GroupByOperator) nd;
      ArrayList<ColumnInfo> col_infos = gop.getSchema().getSignature();
      Operator<? extends Serializable> inpOp = getParent(stack);
      int cnt = 0;

      for (ExprNodeDesc expr : gop.getConf().getKeys()) {
        lctx.getIndex().putDependency(gop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lctx, inpOp, expr));
      }

      for (AggregationDesc agg : gop.getConf().getAggregators()) {
        // Concatenate the dependencies of all the parameters to
        // create the new dependency
        Dependency dep = new Dependency();
        DependencyType new_type = LineageInfo.DependencyType.EXPRESSION;
        // TODO: Get the actual string here.
        dep.setExpr(null);
        LinkedHashSet<BaseColumnInfo> bci_set = new LinkedHashSet<BaseColumnInfo>();
        for (ExprNodeDesc expr : agg.getParameters()) {
          Dependency expr_dep = ExprProcFactory.getExprDependency(lctx, inpOp, expr);
          if (expr_dep != null) {
            new_type = LineageCtx.getNewDependencyType(expr_dep.getType(), new_type);
            bci_set.addAll(expr_dep.getBaseCols());
          }
        }

        // If the bci_set is empty, this means that the inputs to this
        // aggregate function were all constants (e.g. count(1)). In this case
        // the aggregate function is just dependent on all the tables that are in
        // the dependency list of the input operator.
        if (bci_set.isEmpty()) {
          Set<TableAliasInfo> tai_set = new LinkedHashSet<TableAliasInfo>();
          if (inpOp.getSchema() != null && inpOp.getSchema().getSignature() != null) {
            for (ColumnInfo ci : inpOp.getSchema().getSignature()) {
              Dependency inp_dep = lctx.getIndex().getDependency(inpOp, ci);
              // The dependency can be null as some of the input cis may not have
              // been set in case of joins.
              if (inp_dep != null) {
                for (BaseColumnInfo bci : inp_dep.getBaseCols()) {
                  new_type = LineageCtx.getNewDependencyType(inp_dep.getType(), new_type);
                  tai_set.add(bci.getTabAlias());
                }
              }
            }
          }

          // Create the BaseColumnInfos and set them in the bci_set
          for (TableAliasInfo tai : tai_set) {
            BaseColumnInfo bci = new BaseColumnInfo();
            bci.setTabAlias(tai);
            // This is set to null to reflect that the dependency is not on any
            // particular column of the table.
            bci.setColumn(null);
            bci_set.add(bci);
          }
        }

        dep.setBaseCols(new ArrayList<BaseColumnInfo>(bci_set));
        dep.setType(new_type);
        lctx.getIndex().putDependency(gop, col_infos.get(cnt++), dep);
      }

      return null;
    }
  }

  /**
   * Union processor.
   * In this case we call mergeDependency as opposed to putDependency
   * in order to account for visits from different parents.
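   *
   * <p>Example (illustrative): for {@code SELECT key FROM a UNION ALL SELECT key FROM b},
   * the union's output column is visited once per branch, so its dependency ends up
   * as the merge of a.key and b.key rather than whichever branch was walked last.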
   */
  public static class UnionLineage extends DefaultLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should never
      // be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;

      // Get the row schema of the current operator and the columns of its parent.
      Operator<? extends Serializable> inpOp = getParent(stack);
      RowSchema rs = op.getSchema();
      ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
      int cnt = 0;
      for (ColumnInfo ci : rs.getSignature()) {
        Dependency inp_dep = lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++));
        if (inp_dep != null) {
          lCtx.getIndex().mergeDependency(op, ci, inp_dep);
        }
      }

      return null;
    }
  }

  /**
   * ReduceSink processor.
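   *
   * <p>Example (illustrative): for {@code SELECT key, count(1) FROM src GROUP BY key},
   * the map-side reduce sink feeds a group by, so its key columns (here key) get
   * dependencies too; for a join's reduce sink only the value columns are mapped.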
   */
  public static class ReduceSinkLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should never
      // be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      ReduceSinkOperator rop = (ReduceSinkOperator) nd;
      ArrayList<ColumnInfo> col_infos = rop.getSchema().getSignature();
      Operator<? extends Serializable> inpOp = getParent(stack);
      int cnt = 0;

      // The keys are included only in case the reduce sink feeds into
      // a group by operator through a chain of forward operators
      Operator<? extends Serializable> op = rop.getChildOperators().get(0);
      while (op instanceof ForwardOperator) {
        op = op.getChildOperators().get(0);
      }

      if (op instanceof GroupByOperator) {
        for (ExprNodeDesc expr : rop.getConf().getKeyCols()) {
          lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
              ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
        }
      }

      for (ExprNodeDesc expr : rop.getConf().getValueCols()) {
        lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
      }

      return null;
    }
  }

  /**
   * Default processor. This basically passes the input dependencies as such
   * to the output dependencies.
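   *
   * <p>Note (illustrative): this assumes the operator's output schema lines up
   * positionally with its parent's, as it does for schema-preserving operators such
   * as Filter or Limit; column i of the output simply inherits the dependency of
   * column i of the input.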
   */
  public static class DefaultLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should never
      // be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      Operator<? extends Serializable> op = (Operator<? extends Serializable>) nd;

      // Get the row schema of the current operator and the columns of its parent.
      Operator<? extends Serializable> inpOp = getParent(stack);
      RowSchema rs = op.getSchema();
      ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
      int cnt = 0;
      for (ColumnInfo ci : rs.getSignature()) {
        lCtx.getIndex().putDependency(op, ci,
            lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++)));
      }

      return null;
    }
  }

  public static NodeProcessor getJoinProc() {
    return new JoinLineage();
  }

  public static NodeProcessor getLateralViewJoinProc() {
    return new LateralViewJoinLineage();
  }

  public static NodeProcessor getTSProc() {
    return new TableScanLineage();
  }

  public static NodeProcessor getTransformProc() {
    return new TransformLineage();
  }

  public static NodeProcessor getSelProc() {
    return new SelectLineage();
  }

  public static NodeProcessor getGroupByProc() {
    return new GroupByLineage();
  }

  public static NodeProcessor getUnionProc() {
    return new UnionLineage();
  }

  public static NodeProcessor getReduceSinkProc() {
    return new ReduceSinkLineage();
  }

  public static NodeProcessor getDefaultProc() {
    return new DefaultLineage();
  }
}