
/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/CombinerOptimizer.java

https://github.com/dorefiend/pig
Java | 1010 lines | 547 code | 126 blank | 337 comment | 106 complexity
Possible License(s): Apache-2.0, CPL-1.0

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.PigException;
import org.apache.pig.FuncSpec;
import org.apache.pig.PigWarning;
import org.apache.pig.data.DataType;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROperPlan;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROpPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.ConstantExpression;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PODistinct;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POFilter;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLimit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCombinerPackage;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPartialAgg;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPreCombinerLocalRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSort;
import org.apache.pig.impl.plan.CompilationMessageCollector;
import org.apache.pig.impl.plan.DependencyOrderWalker;
import org.apache.pig.impl.plan.DepthFirstWalker;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.PlanWalker;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.plan.CompilationMessageCollector.MessageType;
import org.apache.pig.impl.plan.optimizer.OptimizerException;
import org.apache.pig.impl.util.Pair;

/**
 * Optimize map reduce plans to use the combiner where possible.
 * Algebraic functions and distinct in the nested plan of a foreach are partially
 * computed in the map and combine phases.
 * A new foreach statement with the initial and intermediate forms of the algebraic
 * functions is added to the map and combine plans respectively.
 *
 * If the bag portion of the group-by result is projected, or a non-algebraic
 * expression/udf has a bag as input, the combiner will not be used. This is because
 * the use of the combiner in such cases is likely to degrade performance, as there
 * will not be enough reduction in data size in the combine stage to offset the cost
 * of the additional (de)serialization passes.
 *
 * Major areas for enhancement:
 * 1. use of combiner in cogroup
 * 2. queries with order-by, limit or sort in a nested foreach after group-by
 * 3. case where group-by is followed by filter that has algebraic expression
 *
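 * For example (illustrative; the aliases below are not from this file), a query like
 *
 *   b = group a by key;
 *   c = foreach b generate group, COUNT(a);
 *
 * is a typical candidate: after this pass the map plan evaluates the Initial form of
 * COUNT over single-tuple bags, the combine plan evaluates the Intermediate form, and
 * the reduce plan evaluates the Final form over the partially aggregated values.
 *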
 */
public class CombinerOptimizer extends MROpPlanVisitor {

    private static final String DISTINCT_UDF_CLASSNAME = org.apache.pig.builtin.Distinct.class.getName();

    private Log log = LogFactory.getLog(getClass());

    private CompilationMessageCollector messageCollector = null;

    private boolean doMapAgg;

    public CombinerOptimizer(MROperPlan plan, boolean doMapAgg) {
        this(plan, doMapAgg, new CompilationMessageCollector());
    }

    public CombinerOptimizer(MROperPlan plan, boolean doMapAgg,
            CompilationMessageCollector messageCollector) {
        super(plan, new DepthFirstWalker<MapReduceOper, MROperPlan>(plan));
        this.messageCollector = messageCollector;
        this.doMapAgg = doMapAgg;
    }

    public CompilationMessageCollector getMessageCollector() {
        return messageCollector;
    }
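
    // Typical usage, as a sketch (this visitor is normally driven by the MapReduce
    // compilation pipeline; the variable names below are illustrative):
    //
    //   CombinerOptimizer co = new CombinerOptimizer(mrPlan, doMapAgg);
    //   co.visit();   // walks the MROperPlan and rewrites eligible jobs in place
    //   // any warnings accumulated during the pass are available from
    //   // co.getMessageCollector()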

    @Override
    public void visitMROp(MapReduceOper mr) throws VisitorException {
        log.trace("Entering CombinerOptimizer.visitMROp");
        if (mr.reducePlan.isEmpty()) return;

        // part one - check if this MR job represents a group-by + foreach
        // Find the POLocalRearrange in the map. I'll need it later.
        List<PhysicalOperator> mapLeaves = mr.mapPlan.getLeaves();
        if (mapLeaves == null || mapLeaves.size() != 1) {
            messageCollector.collect("Expected map to have single leaf!", MessageType.Warning, PigWarning.MULTI_LEAF_MAP);
            return;
        }
        PhysicalOperator mapLeaf = mapLeaves.get(0);
        if (!(mapLeaf instanceof POLocalRearrange)) {
            return;
        }
        POLocalRearrange rearrange = (POLocalRearrange)mapLeaf;

        List<PhysicalOperator> reduceRoots = mr.reducePlan.getRoots();
        if (reduceRoots.size() != 1) {
            messageCollector.collect("Expected reduce to have single leaf", MessageType.Warning, PigWarning.MULTI_LEAF_REDUCE);
            return;
        }

        // I expect that the first root should always be a POPackage. If
        // not, I don't know what's going on, so I'm out of here.
        PhysicalOperator root = reduceRoots.get(0);
        if (!(root instanceof POPackage)) {
            messageCollector.collect("Expected reduce root to be a POPackage", MessageType.Warning, PigWarning.NON_PACKAGE_REDUCE_PLAN_ROOT);
            return;
        }
        POPackage pack = (POPackage)root;

        List<PhysicalOperator> packSuccessors =
            mr.reducePlan.getSuccessors(root);
        if (packSuccessors == null || packSuccessors.size() != 1) return;
        PhysicalOperator successor = packSuccessors.get(0);

        if (successor instanceof POLimit) {
            // POLimit is acceptable, as long as it has a single foreach
            // as successor
            List<PhysicalOperator> limitSucs =
                mr.reducePlan.getSuccessors(successor);
            if (limitSucs != null && limitSucs.size() == 1 &&
                    limitSucs.get(0) instanceof POForEach) {
                // the code below will now further examine
                // the foreach
                successor = limitSucs.get(0);
            }
        }
        if (successor instanceof POForEach) {
            POForEach foreach = (POForEach)successor;
            List<PhysicalPlan> feInners = foreach.getInputPlans();

            // find algebraic operators and also check if the foreach statement
            // is suitable for combiner use
            List<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps =
                findAlgebraicOps(feInners);
            if (algebraicOps == null || algebraicOps.size() == 0) {
                // the plan is not combinable or there is nothing to combine
                // we're done
                return;
            }
            if (mr.combinePlan.getRoots().size() != 0) {
                messageCollector.collect("Wasn't expecting to find anything already "
                        + "in the combiner!", MessageType.Warning, PigWarning.NON_EMPTY_COMBINE_PLAN);
                return;
            }
            log.info("Choosing to move algebraic foreach to combiner");

            try {
                // replace PODistinct->Project[*] with distinct udf (which is Algebraic)
                for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
                    if (!(op2plan.first instanceof PODistinct))
                        continue;
                    DistinctPatcher distinctPatcher = new DistinctPatcher(op2plan.second);
                    distinctPatcher.visit();
                    if (distinctPatcher.getDistinct() == null) {
                        int errCode = 2073;
                        String msg = "Problem with replacing distinct operator with distinct built-in function.";
                        throw new PlanException(msg, errCode, PigException.BUG);
                    }
                    op2plan.first = distinctPatcher.getDistinct();
                }

                // create new map foreach
                POForEach mfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());
                Map<PhysicalOperator, Integer> op2newpos =
                    new HashMap<PhysicalOperator, Integer>();
                Integer pos = 1;
                // create plan for each algebraic udf and add as inner plan in map-foreach
                for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
                    PhysicalPlan udfPlan = createPlanWithPredecessors(op2plan.first, op2plan.second);
                    mfe.addInputPlan(udfPlan, false);
                    op2newpos.put(op2plan.first, pos++);
                }
                changeFunc(mfe, POUserFunc.INITIAL);

                // since we will only be creating SingleTupleBag as input to
                // the map foreach, we should flag the POProjects in the map
                // foreach inner plans to also use SingleTupleBag
                for (PhysicalPlan mpl : mfe.getInputPlans()) {
                    try {
                        new fixMapProjects(mpl).visit();
                    } catch (VisitorException e) {
                        int errCode = 2089;
                        String msg = "Unable to flag project operator to use single tuple bag.";
                        throw new PlanException(msg, errCode, PigException.BUG, e);
                    }
                }

                // create new combine foreach
                POForEach cfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());
                // add algebraic functions with appropriate projection
                addAlgebraicFuncToCombineFE(cfe, op2newpos);
                changeFunc(cfe, POUserFunc.INTERMEDIATE);

                // fix projection and function type for algebraic functions in reduce foreach
                for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
                    setProjectInput(op2plan.first, op2plan.second, op2newpos.get(op2plan.first));
                    ((POUserFunc)op2plan.first).setAlgebraicFunction(POUserFunc.FINAL);
                }

                // we have modified the foreach inner plans - so set them
                // again for the foreach so that foreach can do any re-initialization
                // around them.
                // FIXME - this is a necessary evil right now because the leaves are explicitly
                // stored in the POForeach as a list rather than computed each time at
                // run time from the plans for optimization. Do we want to have the Foreach
                // compute the leaves each time and have Java optimize it (will Java optimize?)?
                mfe.setInputPlans(mfe.getInputPlans());
                cfe.setInputPlans(cfe.getInputPlans());
                foreach.setInputPlans(foreach.getInputPlans());

                // tell POCombinerPackage which fields need to be projected and
                // which placed in bags. The first field is a simple project;
                // the rest need to go into bags.
                int numFields = algebraicOps.size() + 1; // algebraic funcs + group key
                boolean[] bags = new boolean[numFields];
                bags[0] = false;
                for (int i = 1; i < numFields; i++) {
                    bags[i] = true;
                }

                // Use the POCombinerPackage in the combine plan
                // as it needs to act differently than the regular
                // package operator.
                mr.combinePlan = new PhysicalPlan();
                POCombinerPackage combinePack =
                    new POCombinerPackage(pack, bags);
                mr.combinePlan.add(combinePack);
                mr.combinePlan.add(cfe);
                mr.combinePlan.connect(combinePack, cfe);
                // No need to connect projections in cfe to cp, because
                // PigCombiner directly attaches output from package to
                // root of remaining plan.

                POLocalRearrange mlr = getNewRearrange(rearrange);

                POPartialAgg mapAgg = null;
                if (doMapAgg) {
                    mapAgg = createPartialAgg(cfe);
                }

                // A specialized local rearrange operator will replace
                // the normal local rearrange in the map plan. This behaves
                // like the regular local rearrange in the getNext()
                // as far as getting its input and constructing the
                // "key" out of the input. It then returns a tuple with
                // two fields - the key in the first position and the
                // "value" inside a bag in the second position. This output
                // format resembles the format out of a Package. This output
                // will feed to the map foreach which expects this format.
                // If the key field isn't in the project of the combiner or map foreach,
                // it is added to the end (This is required so that we can
                // set up the inner plan of the new Local Rearrange leaf in the map
                // and combine plan to contain just the project of the key).
                patchUpMap(mr.mapPlan, getPreCombinerLR(rearrange), mfe, mapAgg, mlr);
                POLocalRearrange clr = getNewRearrange(rearrange);
                mr.combinePlan.add(clr);
                mr.combinePlan.connect(cfe, clr);

                // Change the package operator in the reduce plan to
                // be the POCombinerPackage, as it needs to act
                // differently than the regular package operator.
                POCombinerPackage newReducePack =
                    new POCombinerPackage(pack, bags);
                mr.reducePlan.replace(pack, newReducePack);

                // the replace() above only changes
                // the plan and does not change "inputs" to
                // operators.
                // Set up "inputs" for the operator after
                // package correctly.
                List<PhysicalOperator> packList = new ArrayList<PhysicalOperator>();
                packList.add(newReducePack);
                List<PhysicalOperator> sucs = mr.reducePlan.getSuccessors(newReducePack);
                // there should be only one successor to package
                sucs.get(0).setInputs(packList);
            } catch (Exception e) {
                int errCode = 2018;
                String msg = "Internal error. Unable to introduce the combiner for optimization.";
                throw new OptimizerException(msg, errCode, PigException.BUG, e);
            }
        }
    }
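
    // A sketch (derived from visitMROp above) of the plan shapes after the rewrite is
    // applied to an eligible group-by + foreach job:
    //
    //   map plan:     ... -> POPreCombinerLocalRearrange -> POForEach(INITIAL)
    //                     -> [POPartialAgg, only if doMapAgg] -> POLocalRearrange
    //   combine plan: POCombinerPackage -> POForEach(INTERMEDIATE) -> POLocalRearrange
    //   reduce plan:  POCombinerPackage -> POForEach(FINAL) -> ...
    //
    // In every case the first field of the foreach output is the group key and the
    // remaining fields are the partially aggregated values of the algebraic functions.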

    /**
     * Translate POForEach in combiner into a POPartialAgg
     * @param combineFE
     * @return partial aggregate operator
     * @throws CloneNotSupportedException
     */
    private POPartialAgg createPartialAgg(POForEach combineFE)
            throws CloneNotSupportedException {
        String scope = combineFE.getOperatorKey().scope;
        POPartialAgg poAgg = new POPartialAgg(new OperatorKey(scope,
                NodeIdGenerator.getGenerator().getNextNodeId(scope)));
        poAgg.addOriginalLocation(combineFE.getAlias(), combineFE.getOriginalLocations());
        poAgg.setResultType(combineFE.getResultType());

        // first plan in combine foreach is the group key
        poAgg.setKeyPlan(combineFE.getInputPlans().get(0).clone());

        List<PhysicalPlan> valuePlans = new ArrayList<PhysicalPlan>();
        for (int i = 1; i < combineFE.getInputPlans().size(); i++) {
            valuePlans.add(combineFE.getInputPlans().get(i).clone());
        }
        poAgg.setValuePlans(valuePlans);
        return poAgg;
    }
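
    // Note (summarizing the code above and patchUpMap below): the POPartialAgg built
    // here reuses the combine foreach's key plan and value plans, and is placed in the
    // map plan between the INITIAL foreach and the local rearrange, but only when
    // doMapAgg is true, i.e. when map-side partial aggregation is enabled.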

    /**
     * Find algebraic operators and also check if the foreach statement
     * is suitable for combiner use.
     * @param feInners inner plans of foreach
     * @return null if plan is not combinable, otherwise list of combinable operators
     * @throws VisitorException
     */
    private List<Pair<PhysicalOperator, PhysicalPlan>>
        findAlgebraicOps(List<PhysicalPlan> feInners)
            throws VisitorException {
        ArrayList<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps = new ArrayList<Pair<PhysicalOperator, PhysicalPlan>>();

        // check each foreach inner plan
        for (PhysicalPlan pplan : feInners) {
            // check for presence of non combinable operators
            AlgebraicPlanChecker algChecker = new AlgebraicPlanChecker(pplan);
            algChecker.visit();
            if (algChecker.sawNonAlgebraic) {
                return null;
            }

            // if we found a combinable distinct, add that to the list
            if (algChecker.sawDistinctAgg) {
                algebraicOps.add(new Pair<PhysicalOperator, PhysicalPlan>(algChecker.getDistinct(), pplan));
                continue;
            }

            List<PhysicalOperator> roots = pplan.getRoots();
            // Combinable operators have to be attached to POProject root(s).
            // If a root does not have a successor that is combinable, the project
            // has to be projecting the group column. Otherwise this MR job
            // is considered not combinable, as we don't want to use the combiner for
            // cases where this foreach statement is projecting bags (likely to be
            // bad for performance because of additional (de)serialization costs).
            for (PhysicalOperator root : roots) {
                if (root instanceof ConstantExpression) {
                    continue;
                }
                if (!(root instanceof POProject)) {
                    // how can this happen? - expect root of inner plan to be
                    // constant or project. not combining it
                    // TODO: Warn
                    return null;
                }
                POProject proj = (POProject)root;
                POUserFunc combineUdf = getAlgebraicSuccessor(proj, pplan);
                if (combineUdf == null) {
                    if (proj.isProjectToEnd()) {
                        // project-star or project to end
                        // not combinable
                        return null;
                    }
                    // Check to see if this is a projection of the grouping column.
                    // If so, it will be a projection of col 0
                    List<Integer> cols = proj.getColumns();
                    if (cols != null && cols.size() == 1 && cols.get(0) == 0) {
                        // it is a project of the grouping column, so the plan is still
                        // combinable
                        continue;
                    } else {
                        // not combinable
                        return null;
                    }
                }

                // The algebraic udf can have more than one input. Add the udf only once
                boolean exist = false;
                for (Pair<PhysicalOperator, PhysicalPlan> pair : algebraicOps) {
                    if (pair.first.equals(combineUdf)) {
                        exist = true;
                        break;
                    }
                }
                if (!exist)
                    algebraicOps.add(new Pair<PhysicalOperator, PhysicalPlan>(combineUdf, pplan));
            }
        }
        return algebraicOps;
    }
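
    // How findAlgebraicOps classifies typical foreach inner plans (illustrative
    // Pig Latin, not taken from this file):
    //
    //   c = foreach b generate group, SUM(a.x);  -- combinable: the projected bag feeds
    //                                               an algebraic udf
    //   c = foreach b generate group, a;         -- not combinable: the bag itself is
    //                                               projected
    //   c = foreach b generate group, MyUdf(a);  -- combinable only if MyUdf reports
    //                                               itself algebraic (combinable())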

    /**
     * Look for an algebraic POUserFunc as successor to this project, called
     * recursively to skip any other projects seen on the way.
     * @param proj project
     * @param pplan physical plan
     * @return null if any operator other than POProject or an algebraic POUserFunc is
     *         found while going down the plan, otherwise the algebraic POUserFunc is returned
     */
    private POUserFunc getAlgebraicSuccessor(POProject proj, PhysicalPlan pplan) {
        // check if root is followed by combinable operator
        List<PhysicalOperator> succs = pplan.getSuccessors(proj);
        if (succs == null || succs.size() == 0) {
            return null;
        }
        if (succs.size() > 1) {
            // project shared by more than one operator - does not happen
            // in plans generated today
            // won't try to combine this
            return null;
        }

        PhysicalOperator succ = succs.get(0);
        if (succ instanceof POProject) {
            return getAlgebraicSuccessor((POProject) succ, pplan);
        }
        if (succ instanceof POUserFunc && ((POUserFunc)succ).combinable()) {
            return (POUserFunc)succ;
        }

        // some other operator ? can't combine
        return null;
    }

    /**
     * Create a new foreach with the same scope and alias as the given foreach, and
     * add an inner plan that projects the group column, which is going to be
     * the first input.
     * @param foreach source foreach
     * @param keyType type for group-by key
     * @return new POForEach
     */
    private POForEach createForEachWithGrpProj(POForEach foreach, byte keyType) {
        String scope = foreach.getOperatorKey().scope;
        POForEach newFE = new POForEach(createOperatorKey(scope), new ArrayList<PhysicalPlan>());
        newFE.addOriginalLocation(foreach.getAlias(), foreach.getOriginalLocations());
        newFE.setResultType(foreach.getResultType());
        // create plan that projects the group column
        PhysicalPlan grpProjPlan = new PhysicalPlan();
        // group by column is the first column
        POProject proj = new POProject(createOperatorKey(scope), 1, 0);
        proj.setResultType(keyType);
        grpProjPlan.add(proj);
        newFE.addInputPlan(grpProjPlan, false);
        return newFE;
    }

    /**
     * Create new plan and add to it the clones of operator algeOp and its
     * predecessors from the physical plan pplan.
     * @param algeOp algebraic operator
     * @param pplan physical plan that has algeOp
     * @return new plan
     * @throws CloneNotSupportedException
     * @throws PlanException
     */
    private PhysicalPlan createPlanWithPredecessors(PhysicalOperator algeOp, PhysicalPlan pplan)
            throws CloneNotSupportedException, PlanException {
        PhysicalPlan newplan = new PhysicalPlan();
        addPredecessorsToPlan(algeOp, pplan, newplan);
        return newplan;
    }

    /**
     * Recursively clone op and its predecessors from pplan and add them to newplan
     * @param op
     * @param pplan
     * @param newplan
     * @return
     * @throws CloneNotSupportedException
     * @throws PlanException
     */
    private PhysicalOperator addPredecessorsToPlan(PhysicalOperator op, PhysicalPlan pplan,
            PhysicalPlan newplan)
            throws CloneNotSupportedException, PlanException {
        PhysicalOperator newOp = op.clone();
        newplan.add(newOp);
        if (pplan.getPredecessors(op) == null || pplan.getPredecessors(op).size() == 0) {
            return newOp;
        }
        for (PhysicalOperator pred : pplan.getPredecessors(op)) {
            PhysicalOperator newPred = addPredecessorsToPlan(pred, pplan, newplan);
            newplan.connect(newPred, newOp);
        }
        return newOp;
    }

    /**
     * Add algebraic functions with appropriate projection to the new foreach in the combiner
     * @param cfe - the new foreach in combiner
     * @param op2newpos - mapping of physical operator to position in input
     * @throws CloneNotSupportedException
     * @throws PlanException
     */
    private void addAlgebraicFuncToCombineFE(POForEach cfe, Map<PhysicalOperator, Integer> op2newpos)
            throws CloneNotSupportedException, PlanException {
        // an array that we will first populate with physical operators in order
        // of their position in input. Used while adding plans to the combine foreach
        // so that the output of the combine foreach keeps the same positions as its
        // input. That means the same operator-to-position mapping can be used by the
        // reduce as well.
        PhysicalOperator[] opsInOrder = new PhysicalOperator[op2newpos.size() + 1];
        for (Map.Entry<PhysicalOperator, Integer> op2pos : op2newpos.entrySet()) {
            opsInOrder[op2pos.getValue()] = op2pos.getKey();
        }

        // first position is used by the group column and a plan has been added for it,
        // so start with 1
        for (int i = 1; i < opsInOrder.length; i++) {
            // create new inner plan for foreach:
            // add a cloned copy of the given physical operator and a new project.
            // Even if the udf in the query takes multiple inputs, only one project
            // needs to be added because the input to this udf
            // will be the INITIAL version of the udf evaluated in the map.
            PhysicalPlan newPlan = new PhysicalPlan();
            PhysicalOperator newOp = opsInOrder[i].clone();
            newPlan.add(newOp);
            POProject proj = new POProject(
                    createOperatorKey(cfe.getOperatorKey().getScope()),
                    1, i
            );
            proj.setResultType(DataType.BAG);
            newPlan.add(proj);
            newPlan.connect(proj, newOp);
            cfe.addInputPlan(newPlan, false);
        }
    }
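
    // Shape of the combine foreach's inner plans after the steps above (a sketch based
    // on createForEachWithGrpProj and addAlgebraicFuncToCombineFE):
    //
    //   plan 0:  POProject[col 0]                      -- the group key
    //   plan i:  POProject[bag][col i] -> POUserFunc   -- the i-th algebraic function,
    //                                                      later switched to INTERMEDIATE
    //                                                      by changeFunc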

    /**
     * Replace old POLocalRearrange with new pre-combine LR,
     * add new map foreach, new map-local-rearrange, and connect them
     *
     * @param mapPlan
     * @param preCombinerLR
     * @param mfe
     * @param mapAgg
     * @param mlr
     * @throws PlanException
     */
    private void patchUpMap(PhysicalPlan mapPlan, POPreCombinerLocalRearrange preCombinerLR,
            POForEach mfe, POPartialAgg mapAgg, POLocalRearrange mlr)
            throws PlanException {
        POLocalRearrange oldLR = (POLocalRearrange)mapPlan.getLeaves().get(0);
        mapPlan.replace(oldLR, preCombinerLR);

        mapPlan.add(mfe);
        mapPlan.connect(preCombinerLR, mfe);

        // the operator before local rearrange
        PhysicalOperator opBeforeLR = mfe;

        if (mapAgg != null) {
            mapPlan.add(mapAgg);
            mapPlan.connect(mfe, mapAgg);
            opBeforeLR = mapAgg;
        }

        mapPlan.add(mlr);
        mapPlan.connect(opBeforeLR, mlr);
    }

    /**
     * @param rearrange
     * @return
     */
    private POPreCombinerLocalRearrange getPreCombinerLR(POLocalRearrange rearrange) {
        String scope = rearrange.getOperatorKey().scope;
        POPreCombinerLocalRearrange pclr = new POPreCombinerLocalRearrange(
                createOperatorKey(scope),
                rearrange.getRequestedParallelism(), rearrange.getInputs());
        pclr.setPlans(rearrange.getPlans());
        return pclr;
    }

    private OperatorKey createOperatorKey(String scope) {
        return new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope));
    }

    /**
     * @param op
     * @param index
     * @param plan
     * @throws PlanException
     */
    private void setProjectInput(PhysicalOperator op, PhysicalPlan plan, int index) throws PlanException {
        String scope = op.getOperatorKey().scope;
        POProject proj = new POProject(new OperatorKey(scope,
                NodeIdGenerator.getGenerator().getNextNodeId(scope)),
                op.getRequestedParallelism(), index);
        proj.setResultType(DataType.BAG);
        // Remove old connections and elements from the plan
        plan.trimAbove(op);
        plan.add(proj);
        plan.connect(proj, op);
        List<PhysicalOperator> inputs =
            new ArrayList<PhysicalOperator>(1);
        inputs.add(proj);
        op.setInputs(inputs);
    }

    /**
     * Change the algebraic function type for algebraic functions in the map and combine.
     * In the map and combine the algebraic functions will be the leaf of the plan.
     * @param fe
     * @param type
     * @throws PlanException
     */
    private void changeFunc(POForEach fe, byte type) throws PlanException {
        for (PhysicalPlan plan : fe.getInputPlans()) {
            List<PhysicalOperator> leaves = plan.getLeaves();
            if (leaves == null || leaves.size() != 1) {
                int errCode = 2019;
                String msg = "Expected to find plan with single leaf. Found " + leaves.size() + " leaves.";
                throw new PlanException(msg, errCode, PigException.BUG);
            }

            PhysicalOperator leaf = leaves.get(0);
            if (leaf instanceof POProject) {
                continue;
            }
            if (!(leaf instanceof POUserFunc)) {
                int errCode = 2020;
                String msg = "Expected to find plan with UDF or project leaf. Found " + leaf.getClass().getSimpleName();
                throw new PlanException(msg, errCode, PigException.BUG);
            }

            POUserFunc func = (POUserFunc)leaf;
            try {
                func.setAlgebraicFunction(type);
            } catch (ExecException e) {
                int errCode = 2075;
                String msg = "Could not set algebraic function type.";
                throw new PlanException(msg, errCode, PigException.BUG, e);
            }
        }
    }

    /**
     * Create a new local rearrange by cloning the existing rearrange and
     * add a plan for projecting the key
     * @param rearrange
     * @return
     * @throws PlanException
     * @throws CloneNotSupportedException
     */
    private POLocalRearrange getNewRearrange(POLocalRearrange rearrange)
            throws PlanException, CloneNotSupportedException {
        POLocalRearrange newRearrange = rearrange.clone();

        // Set the projection to be the key
        PhysicalPlan newPlan = new PhysicalPlan();
        String scope = newRearrange.getOperatorKey().scope;
        POProject proj = new POProject(new OperatorKey(scope,
                NodeIdGenerator.getGenerator().getNextNodeId(scope)), -1, 0);
        proj.setResultType(newRearrange.getKeyType());
        newPlan.add(proj);

        List<PhysicalPlan> plans = new ArrayList<PhysicalPlan>(1);
        plans.add(newPlan);
        newRearrange.setPlansFromCombiner(plans);

        return newRearrange;
    }

    /**
     * Checks if there is something that prevents the use of the algebraic interface,
     * and looks for the PODistinct that can be used as algebraic.
     */
    private static class AlgebraicPlanChecker extends PhyPlanVisitor {
        boolean sawNonAlgebraic = false;
        boolean sawDistinctAgg = false;
        private boolean sawForeach = false;
        private PODistinct distinct = null;

        AlgebraicPlanChecker(PhysicalPlan plan) {
            super(plan, new DependencyOrderWalker<PhysicalOperator, PhysicalPlan>(plan));
        }

        /* (non-Javadoc)
         * @see org.apache.pig.impl.plan.PlanVisitor#visit()
         */
        @Override
        public void visit() throws VisitorException {
            super.visit();
            // if we saw a foreach and a distinct agg, it's ok;
            // else if we only saw foreach, mark it as non algebraic
            if (sawForeach && !sawDistinctAgg) {
                sawNonAlgebraic = true;
            }
        }

        @Override
        public void visitDistinct(PODistinct distinct) throws VisitorException {
            this.distinct = distinct;
            if (sawDistinctAgg) {
                // we want to combine only in the case where there is only
                // one PODistinct which is the only input to an agg.
                // We apparently have seen a PODistinct before, so let's not
                // combine.
                sawNonAlgebraic = true;
                return;
            }
            // check that this distinct is the only input to an agg
            // We could have the following two cases
            // script 1:
            // ..
            // b = group a by ...
            // c = foreach b { x = distinct a; generate AGG(x), ...}
            // The above script leads to the following plan for AGG(x):
            // POUserFunc(org.apache.pig.builtin.COUNT)[long]
            // |
            // |---Project[bag][*]
            //     |
            //     |---PODistinct[bag]
            //         |
            //         |---Project[tuple][1]
            // script 2:
            // ..
            // b = group a by ...
            // c = foreach b { x = distinct a; generate AGG(x.$1), ...}
            // The above script leads to the following plan for AGG(x.$1):
            // POUserFunc(org.apache.pig.builtin.IntSum)[long]
            // |
            // |---Project[bag][1]
            //     |
            //     |---Project[bag][*]
            //         |
            //         |---PODistinct[bag]
            //             |
            //             |---Project[tuple][1]
            // So tracing from the PODistinct to its successors up to the leaf, we should
            // see a Project[bag][*] as the immediate successor and an optional Project[bag]
            // as the next successor till we see the leaf.
            PhysicalOperator leaf = mPlan.getLeaves().get(0);
            // the leaf has to be a POUserFunc (need not be algebraic)
            if (leaf instanceof POUserFunc) {
                // we want to combine only in the case where there is only
                // one PODistinct which is the only input to an agg.
                // Do not combine if there are additional inputs.
                List<PhysicalOperator> preds = mPlan.getPredecessors(leaf);
                if (preds.size() > 1) {
                    sawNonAlgebraic = true;
                    return;
                }
                List<PhysicalOperator> immediateSuccs = mPlan.getSuccessors(distinct);
                if (immediateSuccs.size() == 1 && immediateSuccs.get(0) instanceof POProject) {
                    if (checkSuccessorIsLeaf(leaf, immediateSuccs.get(0))) { // script 1 above
                        sawDistinctAgg = true;
                        return;
                    } else { // check for script 2 scenario above
                        List<PhysicalOperator> nextSuccs = mPlan.getSuccessors(immediateSuccs.get(0));
                        if (nextSuccs.size() == 1) {
                            PhysicalOperator op = nextSuccs.get(0);
                            if (op instanceof POProject) {
                                if (checkSuccessorIsLeaf(leaf, op)) {
                                    sawDistinctAgg = true;
                                    return;
                                }
                            }
                        }
                    }
                }
            }
            // if we did not return above, that means we did not see
            // the pattern we expected
            sawNonAlgebraic = true;
        }

        /**
         * @return the distinct
         */
        public PODistinct getDistinct() {
            if (sawNonAlgebraic)
                return null;
            return distinct;
        }

        @Override
        public void visitLimit(POLimit limit) throws VisitorException {
            sawNonAlgebraic = true;
        }

        private boolean checkSuccessorIsLeaf(PhysicalOperator leaf, PhysicalOperator opToCheck) {
            List<PhysicalOperator> succs = mPlan.getSuccessors(opToCheck);
            if (succs.size() == 1) {
                PhysicalOperator op = succs.get(0);
                if (op == leaf) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public void visitFilter(POFilter filter) throws VisitorException {
            sawNonAlgebraic = true;
        }

        @Override
        public void visitPOForEach(POForEach fe) throws VisitorException {
            // we need to allow foreach as input for distinct
            // but don't want it for other things (why?). So let's
            // flag the presence of Foreach and if this is present
            // with a distinct agg, it will be allowed.
            sawForeach = true;
        }

        @Override
        public void visitSort(POSort sort) throws VisitorException {
            sawNonAlgebraic = true;
        }
    }

    /**
     * A visitor to replace
     * Project[bag][*]
     * |
     * |---PODistinct[bag]
     * with
     * POUserFunc(org.apache.pig.builtin.Distinct)[DataBag]
     */
    private static class DistinctPatcher extends PhyPlanVisitor {

        private POUserFunc distinct = null;

        /**
         * @param plan
         * @param walker
         */
        public DistinctPatcher(PhysicalPlan plan,
                PlanWalker<PhysicalOperator, PhysicalPlan> walker) {
            super(plan, walker);
        }

        /**
         * @param physicalPlan
         */
        public DistinctPatcher(PhysicalPlan physicalPlan) {
            this(physicalPlan, new DependencyOrderWalker<PhysicalOperator, PhysicalPlan>(physicalPlan));
        }

        /* (non-Javadoc)
         * @see org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor#visitProject(org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject)
         */
        @Override
        public void visitProject(POProject proj) throws VisitorException {
            // check if this project is preceded by PODistinct and
            // has the return type bag
            List<PhysicalOperator> preds = mPlan.getPredecessors(proj);
            if (preds == null) return; // this is a leaf project and so not interesting for patching
            PhysicalOperator pred = preds.get(0);
            if (preds.size() == 1 && pred instanceof PODistinct) {
                if (distinct != null) {
                    // we should not already have been patched since the
                    // Project-Distinct pair should occur only once
                    int errCode = 2076;
                    String msg = "Unexpected Project-Distinct pair while trying to set up plans for use with combiner.";
                    throw new OptimizerException(msg, errCode, PigException.BUG);
                }
                // we have to stick in the POUserFunc(org.apache.pig.builtin.Distinct)[DataBag]
                // in place of the Project-PODistinct pair
                PhysicalOperator distinctPredecessor = mPlan.getPredecessors(pred).get(0);

                POUserFunc func = null;

                try {
                    String scope = proj.getOperatorKey().scope;
                    List<PhysicalOperator> funcInput = new ArrayList<PhysicalOperator>();
                    FuncSpec fSpec = new FuncSpec(DISTINCT_UDF_CLASSNAME);
                    funcInput.add(distinctPredecessor);
                    // explicitly set distinctPredecessor's result type to
                    // be tuple - this is relevant when distinctPredecessor is
                    // originally a POForeach with return type BAG - we need to
                    // set it to tuple so we get a stream of tuples.
                    distinctPredecessor.setResultType(DataType.TUPLE);
                    func = new POUserFunc(new OperatorKey(scope,
                            NodeIdGenerator.getGenerator().getNextNodeId(scope)), -1, funcInput, fSpec);
                    func.setResultType(DataType.BAG);
                    mPlan.replace(proj, func);
                    mPlan.remove(pred);
                    // connect the newly added "func" to
                    // the predecessor of the earlier PODistinct
                    mPlan.connect(distinctPredecessor, func);
                } catch (PlanException e) {
                    int errCode = 2077;
                    String msg = "Problem with reconfiguring plan to add distinct built-in function.";
                    throw new OptimizerException(msg, errCode, PigException.BUG, e);
                }
                distinct = func;
            }
        }

        POUserFunc getDistinct() {
            return distinct;
        }
    }

    private static class fixMapProjects extends PhyPlanVisitor {

        public fixMapProjects(PhysicalPlan plan) {
            this(plan, new DepthFirstWalker<PhysicalOperator, PhysicalPlan>(
                    plan));
        }

        /**
         * @param plan
         * @param walker
         */
        public fixMapProjects(PhysicalPlan plan,
                PlanWalker<PhysicalOperator, PhysicalPlan> walker) {
            super(plan, walker);
        }

        /*
         * (non-Javadoc)
         *
         * @see org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor#visitProject(org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject)
         */
        @Override
        public void visitProject(POProject proj) throws VisitorException {
            if (proj.getResultType() == DataType.BAG) {
                // IMPORTANT ASSUMPTION:
                // we should be calling this visitor only for
                // fixing up the projects in the map's foreach
                // inner plan. In the map side, we are dealing
                // with single tuple bags - so set the flag in
                // the project to use single tuple bags. If in
                // future we don't have single tuple bags in the
                // input to map's foreach, we should NOT be doing
                // this!
                proj.setResultSingleTupleBag(true);
            }
        }
    }
}