PageRenderTime 53ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java

#
Java | 917 lines | 674 code | 118 blank | 125 comment | 159 complexity | d6fd62d3562776f7b3c7c429521ca302 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.exec;
  19. import java.io.Serializable;
  20. import java.util.ArrayList;
  21. import java.util.HashMap;
  22. import java.util.Iterator;
  23. import java.util.List;
  24. import java.util.Map;
  25. import java.util.Set;
  26. import org.apache.commons.logging.Log;
  27. import org.apache.commons.logging.LogFactory;
  28. import org.apache.hadoop.conf.Configuration;
  29. import org.apache.hadoop.hive.conf.HiveConf;
  30. import org.apache.hadoop.hive.ql.exec.persistence.AbstractRowContainer;
  31. import org.apache.hadoop.hive.ql.exec.persistence.RowContainer;
  32. import org.apache.hadoop.hive.ql.metadata.HiveException;
  33. import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
  34. import org.apache.hadoop.hive.ql.plan.JoinDesc;
  35. import org.apache.hadoop.hive.ql.plan.TableDesc;
  36. import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
  37. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
  38. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
  39. import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
  40. import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
  41. import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
  42. import org.apache.hadoop.io.BooleanWritable;
  43. /**
  44. * Join operator implementation.
  45. */
  46. public abstract class CommonJoinOperator<T extends JoinDesc> extends
  47. Operator<T> implements Serializable {
  48. private static final long serialVersionUID = 1L;
  49. protected static final Log LOG = LogFactory.getLog(CommonJoinOperator.class
  50. .getName());
  51. /**
  52. * IntermediateObject.
  53. *
  54. */
  55. public static class IntermediateObject {
  56. ArrayList<Object>[] objs;
  57. int curSize;
  58. public IntermediateObject(ArrayList<Object>[] objs, int curSize) {
  59. this.objs = objs;
  60. this.curSize = curSize;
  61. }
  62. public ArrayList<Object>[] getObjs() {
  63. return objs;
  64. }
  65. public int getCurSize() {
  66. return curSize;
  67. }
  68. public void pushObj(ArrayList<Object> newObj) {
  69. objs[curSize++] = newObj;
  70. }
  71. public void popObj() {
  72. curSize--;
  73. }
  74. public Object topObj() {
  75. return objs[curSize - 1];
  76. }
  77. }
  78. protected transient int numAliases; // number of aliases
  79. /**
  80. * The expressions for join inputs.
  81. */
  82. protected transient Map<Byte, List<ExprNodeEvaluator>> joinValues;
  83. /**
  84. * The filters for join
  85. */
  86. protected transient Map<Byte, List<ExprNodeEvaluator>> joinFilters;
  87. /**
  88. * The ObjectInspectors for the join inputs.
  89. */
  90. protected transient Map<Byte, List<ObjectInspector>> joinValuesObjectInspectors;
  91. /**
  92. * The ObjectInspectors for join filters.
  93. */
  94. protected transient
  95. Map<Byte, List<ObjectInspector>> joinFilterObjectInspectors;
  96. /**
  97. * The standard ObjectInspectors for the join inputs.
  98. */
  99. protected transient Map<Byte, List<ObjectInspector>> joinValuesStandardObjectInspectors;
  100. /**
  101. * The standard ObjectInspectors for the row container.
  102. */
  103. protected transient
  104. Map<Byte, List<ObjectInspector>> rowContainerStandardObjectInspectors;
  105. protected transient Byte[] order; // order in which the results should
  106. // be output
  107. protected transient JoinCondDesc[] condn;
  108. public transient boolean noOuterJoin;
  109. protected transient Object[] dummyObj; // for outer joins, contains the
  110. // potential nulls for the concerned
  111. // aliases
  112. protected transient RowContainer<ArrayList<Object>>[] dummyObjVectors; // empty
  113. // rows
  114. // for
  115. // each
  116. // table
  117. protected transient int totalSz; // total size of the composite object
  118. // keys are the column names. basically this maps the position of the column
  119. // in
  120. // the output of the CommonJoinOperator to the input columnInfo.
  121. private transient Map<Integer, Set<String>> posToAliasMap;
  122. transient LazyBinarySerDe[] spillTableSerDe;
  123. protected transient Map<Byte, TableDesc> spillTableDesc; // spill tables are
  124. // used if the join
  125. // input is too large
  126. // to fit in memory
  127. HashMap<Byte, AbstractRowContainer<ArrayList<Object>>> storage; // map b/w table alias
  128. // to RowContainer
  129. int joinEmitInterval = -1;
  130. int joinCacheSize = 0;
  131. int nextSz = 0;
  132. transient Byte lastAlias = null;
  133. transient boolean handleSkewJoin = false;
  134. protected transient int countAfterReport;
  135. protected transient int heartbeatInterval;
  136. protected static final int NOTSKIPBIGTABLE = -1;
  /** Creates an operator whose fields all keep their declared defaults. */
  public CommonJoinOperator() {
  }

  /**
   * Copy constructor. Every listed field is copied by plain assignment, so
   * the new operator shares the referenced objects (child and parent
   * operators, storage, counters, ...) with {@code clone}; nothing is deep
   * copied.
   *
   * <p>Fields that are not listed below - for example {@code joinValues},
   * {@code order} and the join value ObjectInspector maps - are not copied.
   *
   * @param clone the operator to copy from
   */
  public CommonJoinOperator(CommonJoinOperator<T> clone) {
    this.joinEmitInterval = clone.joinEmitInterval;
    this.joinCacheSize = clone.joinCacheSize;
    this.nextSz = clone.nextSz;
    this.childOperators = clone.childOperators;
    this.parentOperators = clone.parentOperators;
    this.counterNames = clone.counterNames;
    this.counterNameToEnum = clone.counterNameToEnum;
    this.done = clone.done;
    this.operatorId = clone.operatorId;
    this.storage = clone.storage;
    this.condn = clone.condn;
    this.conf = clone.getConf();
    this.setSchema(clone.getSchema());
    this.alias = clone.alias;
    this.beginTime = clone.beginTime;
    this.inputRows = clone.inputRows;
    this.childOperatorsArray = clone.childOperatorsArray;
    this.childOperatorsTag = clone.childOperatorsTag;
    this.colExprMap = clone.colExprMap;
    this.counters = clone.counters;
    this.dummyObj = clone.dummyObj;
    this.dummyObjVectors = clone.dummyObjVectors;
    this.forwardCache = clone.forwardCache;
    this.groupKeyObject = clone.groupKeyObject;
    this.handleSkewJoin = clone.handleSkewJoin;
    this.hconf = clone.hconf;
    this.id = clone.id;
    this.inputObjInspectors = clone.inputObjInspectors;
    this.noOuterJoin = clone.noOuterJoin;
    this.numAliases = clone.numAliases;
    this.posToAliasMap = clone.posToAliasMap;
    this.spillTableDesc = clone.spillTableDesc;
    this.statsMap = clone.statsMap;
    this.joinFilters = clone.joinFilters;
    this.joinFilterObjectInspectors = clone.joinFilterObjectInspectors;
  }
  178. protected static <T extends JoinDesc> ObjectInspector getJoinOutputObjectInspector(
  179. Byte[] order, Map<Byte, List<ObjectInspector>> aliasToObjectInspectors,
  180. T conf) {
  181. ArrayList<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
  182. for (Byte alias : order) {
  183. List<ObjectInspector> oiList = aliasToObjectInspectors.get(alias);
  184. structFieldObjectInspectors.addAll(oiList);
  185. }
  186. StructObjectInspector joinOutputObjectInspector = ObjectInspectorFactory
  187. .getStandardStructObjectInspector(conf.getOutputColumnNames(),
  188. structFieldObjectInspectors);
  189. return joinOutputObjectInspector;
  190. }
  191. Configuration hconf;
  192. @Override
  193. protected void initializeOp(Configuration hconf) throws HiveException {
  194. this.handleSkewJoin = conf.getHandleSkewJoin();
  195. this.hconf = hconf;
  196. heartbeatInterval = HiveConf.getIntVar(hconf,
  197. HiveConf.ConfVars.HIVESENDHEARTBEAT);
  198. countAfterReport = 0;
  199. totalSz = 0;
  200. // Map that contains the rows for each alias
  201. storage = new HashMap<Byte, AbstractRowContainer<ArrayList<Object>>>();
  202. numAliases = conf.getExprs().size();
  203. joinValues = new HashMap<Byte, List<ExprNodeEvaluator>>();
  204. joinFilters = new HashMap<Byte, List<ExprNodeEvaluator>>();
  205. order = conf.getTagOrder();
  206. condn = conf.getConds();
  207. noOuterJoin = conf.isNoOuterJoin();
  208. totalSz = JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(),
  209. order,NOTSKIPBIGTABLE);
  210. //process join filters
  211. joinFilters = new HashMap<Byte, List<ExprNodeEvaluator>>();
  212. JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(),order,NOTSKIPBIGTABLE);
  213. joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues,
  214. inputObjInspectors,NOTSKIPBIGTABLE);
  215. joinFilterObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinFilters,
  216. inputObjInspectors,NOTSKIPBIGTABLE);
  217. joinValuesStandardObjectInspectors = JoinUtil.getStandardObjectInspectors(
  218. joinValuesObjectInspectors,NOTSKIPBIGTABLE);
  219. if (noOuterJoin) {
  220. rowContainerStandardObjectInspectors = joinValuesStandardObjectInspectors;
  221. } else {
  222. Map<Byte, List<ObjectInspector>> rowContainerObjectInspectors =
  223. new HashMap<Byte, List<ObjectInspector>>();
  224. for (Byte alias : order) {
  225. ArrayList<ObjectInspector> rcOIs = new ArrayList<ObjectInspector>();
  226. rcOIs.addAll(joinValuesObjectInspectors.get(alias));
  227. // for each alias, add object inspector for boolean as the last element
  228. rcOIs.add(
  229. PrimitiveObjectInspectorFactory.writableBooleanObjectInspector);
  230. rowContainerObjectInspectors.put(alias, rcOIs);
  231. }
  232. rowContainerStandardObjectInspectors =
  233. JoinUtil.getStandardObjectInspectors(rowContainerObjectInspectors,NOTSKIPBIGTABLE);
  234. }
  235. dummyObj = new Object[numAliases];
  236. dummyObjVectors = new RowContainer[numAliases];
  237. joinEmitInterval = HiveConf.getIntVar(hconf,
  238. HiveConf.ConfVars.HIVEJOINEMITINTERVAL);
  239. joinCacheSize = HiveConf.getIntVar(hconf,
  240. HiveConf.ConfVars.HIVEJOINCACHESIZE);
  241. // construct dummy null row (indicating empty table) and
  242. // construct spill table serde which is used if input is too
  243. // large to fit into main memory.
  244. byte pos = 0;
  245. for (Byte alias : order) {
  246. int sz = conf.getExprs().get(alias).size();
  247. ArrayList<Object> nr = new ArrayList<Object>(sz);
  248. for (int j = 0; j < sz; j++) {
  249. nr.add(null);
  250. }
  251. if (!noOuterJoin) {
  252. // add whether the row is filtered or not
  253. // this value does not matter for the dummyObj
  254. // because the join values are already null
  255. nr.add(new BooleanWritable(false));
  256. }
  257. dummyObj[pos] = nr;
  258. // there should be only 1 dummy object in the RowContainer
  259. RowContainer<ArrayList<Object>> values = JoinUtil.getRowContainer(hconf,
  260. rowContainerStandardObjectInspectors.get((byte)pos),
  261. alias, 1, spillTableDesc, conf, noOuterJoin);
  262. values.add((ArrayList<Object>) dummyObj[pos]);
  263. dummyObjVectors[pos] = values;
  264. // if serde is null, the input doesn't need to be spilled out
  265. // e.g., the output columns does not contains the input table
  266. RowContainer rc = JoinUtil.getRowContainer(hconf,
  267. rowContainerStandardObjectInspectors.get((byte)pos),
  268. alias, joinCacheSize,spillTableDesc, conf,noOuterJoin);
  269. storage.put(pos, rc);
  270. pos++;
  271. }
  272. forwardCache = new Object[totalSz];
  273. outputObjInspector = getJoinOutputObjectInspector(order,
  274. joinValuesStandardObjectInspectors, conf);
  275. LOG.info("JOIN "
  276. + ((StructObjectInspector) outputObjInspector).getTypeName()
  277. + " totalsz = " + totalSz);
  278. }
  279. transient boolean newGroupStarted = false;
  280. @Override
  281. public void startGroup() throws HiveException {
  282. LOG.trace("Join: Starting new group");
  283. newGroupStarted = true;
  284. for (AbstractRowContainer<ArrayList<Object>> alw : storage.values()) {
  285. alw.clear();
  286. }
  287. }
  288. protected int getNextSize(int sz) {
  289. // A very simple counter to keep track of join entries for a key
  290. if (sz >= 100000) {
  291. return sz + 100000;
  292. }
  293. return 2 * sz;
  294. }
  295. protected transient Byte alias;
  296. transient Object[] forwardCache;
  297. private void createForwardJoinObject(IntermediateObject intObj,
  298. boolean[] nullsArr) throws HiveException {
  299. int p = 0;
  300. for (int i = 0; i < numAliases; i++) {
  301. Byte alias = order[i];
  302. int sz = joinValues.get(alias).size();
  303. if (nullsArr[i]) {
  304. for (int j = 0; j < sz; j++) {
  305. forwardCache[p++] = null;
  306. }
  307. } else {
  308. ArrayList<Object> obj = intObj.getObjs()[i];
  309. for (int j = 0; j < sz; j++) {
  310. forwardCache[p++] = obj.get(j);
  311. }
  312. }
  313. }
  314. forward(forwardCache, outputObjInspector);
  315. countAfterReport = 0;
  316. }
  317. private void copyOldArray(boolean[] src, boolean[] dest) {
  318. for (int i = 0; i < src.length; i++) {
  319. dest[i] = src[i];
  320. }
  321. }
  322. private ArrayList<boolean[]> joinObjectsInnerJoin(
  323. ArrayList<boolean[]> resNulls, ArrayList<boolean[]> inputNulls,
  324. ArrayList<Object> newObj, IntermediateObject intObj, int left,
  325. boolean newObjNull) {
  326. if (newObjNull) {
  327. return resNulls;
  328. }
  329. Iterator<boolean[]> nullsIter = inputNulls.iterator();
  330. while (nullsIter.hasNext()) {
  331. boolean[] oldNulls = nullsIter.next();
  332. boolean oldObjNull = oldNulls[left];
  333. if (!oldObjNull) {
  334. boolean[] newNulls = new boolean[intObj.getCurSize()];
  335. copyOldArray(oldNulls, newNulls);
  336. newNulls[oldNulls.length] = false;
  337. resNulls.add(newNulls);
  338. }
  339. }
  340. return resNulls;
  341. }
  342. /**
  343. * Implement semi join operator.
  344. */
  345. private ArrayList<boolean[]> joinObjectsLeftSemiJoin(
  346. ArrayList<boolean[]> resNulls, ArrayList<boolean[]> inputNulls,
  347. ArrayList<Object> newObj, IntermediateObject intObj, int left,
  348. boolean newObjNull) {
  349. if (newObjNull) {
  350. return resNulls;
  351. }
  352. Iterator<boolean[]> nullsIter = inputNulls.iterator();
  353. while (nullsIter.hasNext()) {
  354. boolean[] oldNulls = nullsIter.next();
  355. boolean oldObjNull = oldNulls[left];
  356. if (!oldObjNull) {
  357. boolean[] newNulls = new boolean[intObj.getCurSize()];
  358. copyOldArray(oldNulls, newNulls);
  359. newNulls[oldNulls.length] = false;
  360. resNulls.add(newNulls);
  361. }
  362. }
  363. return resNulls;
  364. }
  365. private ArrayList<boolean[]> joinObjectsLeftOuterJoin(
  366. ArrayList<boolean[]> resNulls, ArrayList<boolean[]> inputNulls,
  367. ArrayList<Object> newObj, IntermediateObject intObj, int left,
  368. boolean newObjNull) {
  369. // newObj is null if is already null or
  370. // if the row corresponding to the left alias does not pass through filter
  371. int filterIndex = joinValues.get(order[left]).size();
  372. if(filterIndex < intObj.getObjs()[left].size()) {
  373. newObjNull = newObjNull || ((BooleanWritable) (intObj.getObjs()[left].get(filterIndex))).get();
  374. }
  375. Iterator<boolean[]> nullsIter = inputNulls.iterator();
  376. while (nullsIter.hasNext()) {
  377. boolean[] oldNulls = nullsIter.next();
  378. boolean oldObjNull = oldNulls[left];
  379. boolean[] newNulls = new boolean[intObj.getCurSize()];
  380. copyOldArray(oldNulls, newNulls);
  381. if (oldObjNull) {
  382. newNulls[oldNulls.length] = true;
  383. } else {
  384. newNulls[oldNulls.length] = newObjNull;
  385. }
  386. resNulls.add(newNulls);
  387. }
  388. return resNulls;
  389. }
  390. private ArrayList<boolean[]> joinObjectsRightOuterJoin(
  391. ArrayList<boolean[]> resNulls, ArrayList<boolean[]> inputNulls,
  392. ArrayList<Object> newObj, IntermediateObject intObj, int left,
  393. boolean newObjNull, boolean firstRow) {
  394. if (newObjNull) {
  395. return resNulls;
  396. }
  397. if (inputNulls.isEmpty() && firstRow) {
  398. boolean[] newNulls = new boolean[intObj.getCurSize()];
  399. for (int i = 0; i < intObj.getCurSize() - 1; i++) {
  400. newNulls[i] = true;
  401. }
  402. newNulls[intObj.getCurSize() - 1] = newObjNull;
  403. resNulls.add(newNulls);
  404. return resNulls;
  405. }
  406. boolean allOldObjsNull = firstRow;
  407. Iterator<boolean[]> nullsIter = inputNulls.iterator();
  408. while (nullsIter.hasNext()) {
  409. boolean[] oldNulls = nullsIter.next();
  410. if (!oldNulls[left]) {
  411. allOldObjsNull = false;
  412. break;
  413. }
  414. }
  415. // if the row does not pass through filter, all old Objects are null
  416. if (((BooleanWritable)newObj.get(newObj.size()-1)).get()) {
  417. allOldObjsNull = true;
  418. }
  419. nullsIter = inputNulls.iterator();
  420. while (nullsIter.hasNext()) {
  421. boolean[] oldNulls = nullsIter.next();
  422. boolean oldObjNull = oldNulls[left] || allOldObjsNull;
  423. if (!oldObjNull) {
  424. boolean[] newNulls = new boolean[intObj.getCurSize()];
  425. copyOldArray(oldNulls, newNulls);
  426. newNulls[oldNulls.length] = newObjNull;
  427. resNulls.add(newNulls);
  428. } else if (allOldObjsNull) {
  429. boolean[] newNulls = new boolean[intObj.getCurSize()];
  430. for (int i = 0; i < intObj.getCurSize() - 1; i++) {
  431. newNulls[i] = true;
  432. }
  433. newNulls[oldNulls.length] = newObjNull;
  434. resNulls.add(newNulls);
  435. return resNulls;
  436. }
  437. }
  438. return resNulls;
  439. }
  440. private ArrayList<boolean[]> joinObjectsFullOuterJoin(
  441. ArrayList<boolean[]> resNulls, ArrayList<boolean[]> inputNulls,
  442. ArrayList<Object> newObj, IntermediateObject intObj, int left,
  443. boolean newObjNull, boolean firstRow) {
  444. if (newObjNull) {
  445. Iterator<boolean[]> nullsIter = inputNulls.iterator();
  446. while (nullsIter.hasNext()) {
  447. boolean[] oldNulls = nullsIter.next();
  448. boolean[] newNulls = new boolean[intObj.getCurSize()];
  449. copyOldArray(oldNulls, newNulls);
  450. newNulls[oldNulls.length] = newObjNull;
  451. resNulls.add(newNulls);
  452. }
  453. return resNulls;
  454. }
  455. if (inputNulls.isEmpty() && firstRow) {
  456. boolean[] newNulls = new boolean[intObj.getCurSize()];
  457. for (int i = 0; i < intObj.getCurSize() - 1; i++) {
  458. newNulls[i] = true;
  459. }
  460. newNulls[intObj.getCurSize() - 1] = newObjNull;
  461. resNulls.add(newNulls);
  462. return resNulls;
  463. }
  464. boolean allOldObjsNull = firstRow;
  465. Iterator<boolean[]> nullsIter = inputNulls.iterator();
  466. while (nullsIter.hasNext()) {
  467. boolean[] oldNulls = nullsIter.next();
  468. if (!oldNulls[left]) {
  469. allOldObjsNull = false;
  470. break;
  471. }
  472. }
  473. // if the row does not pass through filter, all old Objects are null
  474. if (((BooleanWritable)newObj.get(newObj.size()-1)).get()) {
  475. allOldObjsNull = true;
  476. }
  477. boolean rhsPreserved = false;
  478. nullsIter = inputNulls.iterator();
  479. while (nullsIter.hasNext()) {
  480. boolean[] oldNulls = nullsIter.next();
  481. // old obj is null even if the row corresponding to the left alias
  482. // does not pass through filter
  483. boolean oldObjNull = oldNulls[left] || ((BooleanWritable)
  484. (intObj.getObjs()[left].get(joinValues.get(order[left]).size()))).get()
  485. || allOldObjsNull;
  486. if (!oldObjNull) {
  487. boolean[] newNulls = new boolean[intObj.getCurSize()];
  488. copyOldArray(oldNulls, newNulls);
  489. newNulls[oldNulls.length] = newObjNull;
  490. resNulls.add(newNulls);
  491. } else if (oldObjNull) {
  492. boolean[] newNulls = new boolean[intObj.getCurSize()];
  493. copyOldArray(oldNulls, newNulls);
  494. newNulls[oldNulls.length] = true;
  495. resNulls.add(newNulls);
  496. if (allOldObjsNull && !rhsPreserved) {
  497. newNulls = new boolean[intObj.getCurSize()];
  498. for (int i = 0; i < oldNulls.length; i++) {
  499. newNulls[i] = true;
  500. }
  501. newNulls[oldNulls.length] = false;
  502. resNulls.add(newNulls);
  503. rhsPreserved = true;
  504. }
  505. }
  506. }
  507. return resNulls;
  508. }
  509. /*
  510. * The new input is added to the list of existing inputs. Each entry in the
  511. * array of inputNulls denotes the entries in the intermediate object to be
  512. * used. The intermediate object is augmented with the new object, and list of
  513. * nulls is changed appropriately. The list will contain all non-nulls for a
  514. * inner join. The outer joins are processed appropriately.
  515. */
  516. private ArrayList<boolean[]> joinObjects(ArrayList<boolean[]> inputNulls,
  517. ArrayList<Object> newObj, IntermediateObject intObj, int joinPos,
  518. boolean firstRow) {
  519. ArrayList<boolean[]> resNulls = new ArrayList<boolean[]>();
  520. boolean newObjNull = newObj == dummyObj[joinPos] ? true : false;
  521. if (joinPos == 0) {
  522. if (newObjNull) {
  523. return null;
  524. }
  525. boolean[] nulls = new boolean[1];
  526. nulls[0] = newObjNull;
  527. resNulls.add(nulls);
  528. return resNulls;
  529. }
  530. int left = condn[joinPos - 1].getLeft();
  531. int type = condn[joinPos - 1].getType();
  532. // process all nulls for RIGHT and FULL OUTER JOINS
  533. if (((type == JoinDesc.RIGHT_OUTER_JOIN) || (type == JoinDesc.FULL_OUTER_JOIN))
  534. && !newObjNull && (inputNulls == null) && firstRow) {
  535. boolean[] newNulls = new boolean[intObj.getCurSize()];
  536. for (int i = 0; i < newNulls.length - 1; i++) {
  537. newNulls[i] = true;
  538. }
  539. newNulls[newNulls.length - 1] = false;
  540. resNulls.add(newNulls);
  541. return resNulls;
  542. }
  543. if (inputNulls == null) {
  544. return null;
  545. }
  546. if (type == JoinDesc.INNER_JOIN) {
  547. return joinObjectsInnerJoin(resNulls, inputNulls, newObj, intObj, left,
  548. newObjNull);
  549. } else if (type == JoinDesc.LEFT_OUTER_JOIN) {
  550. return joinObjectsLeftOuterJoin(resNulls, inputNulls, newObj, intObj,
  551. left, newObjNull);
  552. } else if (type == JoinDesc.RIGHT_OUTER_JOIN) {
  553. return joinObjectsRightOuterJoin(resNulls, inputNulls, newObj, intObj,
  554. left, newObjNull, firstRow);
  555. } else if (type == JoinDesc.LEFT_SEMI_JOIN) {
  556. return joinObjectsLeftSemiJoin(resNulls, inputNulls, newObj, intObj,
  557. left, newObjNull);
  558. }
  559. assert (type == JoinDesc.FULL_OUTER_JOIN);
  560. return joinObjectsFullOuterJoin(resNulls, inputNulls, newObj, intObj, left,
  561. newObjNull, firstRow);
  562. }
  563. /*
  564. * genObject is a recursive function. For the inputs, a array of bitvectors is
  565. * maintained (inputNulls) where each entry denotes whether the element is to
  566. * be used or not (whether it is null or not). The size of the bitvector is
  567. * same as the number of inputs under consideration currently. When all inputs
  568. * are accounted for, the output is forwarded appropriately.
  569. */
  570. private void genObject(ArrayList<boolean[]> inputNulls, int aliasNum,
  571. IntermediateObject intObj, boolean firstRow) throws HiveException {
  572. boolean childFirstRow = firstRow;
  573. boolean skipping = false;
  574. if (aliasNum < numAliases) {
  575. // search for match in the rhs table
  576. AbstractRowContainer<ArrayList<Object>> aliasRes = storage.get(order[aliasNum]);
  577. for (ArrayList<Object> newObj = aliasRes.first(); newObj != null; newObj = aliasRes
  578. .next()) {
  579. // check for skipping in case of left semi join
  580. if (aliasNum > 0
  581. && condn[aliasNum - 1].getType() == JoinDesc.LEFT_SEMI_JOIN
  582. && newObj != dummyObj[aliasNum]) { // successful match
  583. skipping = true;
  584. }
  585. intObj.pushObj(newObj);
  586. // execute the actual join algorithm
  587. ArrayList<boolean[]> newNulls = joinObjects(inputNulls, newObj, intObj,
  588. aliasNum, childFirstRow);
  589. // recursively call the join the other rhs tables
  590. genObject(newNulls, aliasNum + 1, intObj, firstRow);
  591. intObj.popObj();
  592. firstRow = false;
  593. // if left-semi-join found a match, skipping the rest of the rows in the
  594. // rhs table of the semijoin
  595. if (skipping) {
  596. break;
  597. }
  598. }
  599. } else {
  600. if (inputNulls == null) {
  601. return;
  602. }
  603. Iterator<boolean[]> nullsIter = inputNulls.iterator();
  604. while (nullsIter.hasNext()) {
  605. boolean[] nullsVec = nullsIter.next();
  606. createForwardJoinObject(intObj, nullsVec);
  607. }
  608. }
  609. }
  610. /**
  611. * Forward a record of join results.
  612. *
  613. * @throws HiveException
  614. */
  615. @Override
  616. public void endGroup() throws HiveException {
  617. LOG.trace("Join Op: endGroup called: numValues=" + numAliases);
  618. checkAndGenObject();
  619. }
  620. private void genUniqueJoinObject(int aliasNum, int forwardCachePos)
  621. throws HiveException {
  622. AbstractRowContainer<ArrayList<Object>> alias = storage.get(order[aliasNum]);
  623. for (ArrayList<Object> row = alias.first(); row != null; row = alias.next()) {
  624. int sz = joinValues.get(order[aliasNum]).size();
  625. int p = forwardCachePos;
  626. for (int j = 0; j < sz; j++) {
  627. forwardCache[p++] = row.get(j);
  628. }
  629. if (aliasNum == numAliases - 1) {
  630. forward(forwardCache, outputObjInspector);
  631. countAfterReport = 0;
  632. } else {
  633. genUniqueJoinObject(aliasNum + 1, p);
  634. }
  635. }
  636. }
  637. private void genAllOneUniqueJoinObject()
  638. throws HiveException {
  639. int p = 0;
  640. for (int i = 0; i < numAliases; i++) {
  641. int sz = joinValues.get(order[i]).size();
  642. ArrayList<Object> obj = storage.get(order[i]).first();
  643. for (int j = 0; j < sz; j++) {
  644. forwardCache[p++] = obj.get(j);
  645. }
  646. }
  647. forward(forwardCache, outputObjInspector);
  648. countAfterReport = 0;
  649. }
  650. protected void checkAndGenObject() throws HiveException {
  651. if (condn[0].getType() == JoinDesc.UNIQUE_JOIN) {
  652. new IntermediateObject(new ArrayList[numAliases], 0);
  653. // Check if results need to be emitted.
  654. // Results only need to be emitted if there is a non-null entry in a table
  655. // that is preserved or if there are no non-null entries
  656. boolean preserve = false; // Will be true if there is a non-null entry
  657. // in a preserved table
  658. boolean hasNulls = false; // Will be true if there are null entries
  659. boolean allOne = true;
  660. for (int i = 0; i < numAliases; i++) {
  661. Byte alias = order[i];
  662. AbstractRowContainer<ArrayList<Object>> alw = storage.get(alias);
  663. if (alw.size() != 1) {
  664. allOne = false;
  665. }
  666. if (alw.size() == 0) {
  667. alw.add((ArrayList<Object>) dummyObj[i]);
  668. hasNulls = true;
  669. } else if (condn[i].getPreserved()) {
  670. preserve = true;
  671. }
  672. }
  673. if (hasNulls && !preserve) {
  674. return;
  675. }
  676. if (allOne) {
  677. LOG.info("calling genAllOneUniqueJoinObject");
  678. genAllOneUniqueJoinObject();
  679. LOG.info("called genAllOneUniqueJoinObject");
  680. } else {
  681. LOG.trace("calling genUniqueJoinObject");
  682. genUniqueJoinObject(0, 0);
  683. LOG.trace("called genUniqueJoinObject");
  684. }
  685. } else {
  686. // does any result need to be emitted
  687. boolean mayHasMoreThanOne = false;
  688. boolean hasEmpty = false;
  689. for (int i = 0; i < numAliases; i++) {
  690. Byte alias = order[i];
  691. AbstractRowContainer<ArrayList<Object>> alw = storage.get(alias);
  692. if (noOuterJoin) {
  693. if (alw.size() == 0) {
  694. LOG.trace("No data for alias=" + i);
  695. return;
  696. } else if (alw.size() > 1) {
  697. mayHasMoreThanOne = true;
  698. }
  699. } else {
  700. if (alw.size() == 0) {
  701. hasEmpty = true;
  702. alw.add((ArrayList<Object>) dummyObj[i]);
  703. } else if (!hasEmpty && alw.size() == 1) {
  704. ArrayList<Object> row = alw.first();
  705. int numValues = joinValues.get(alias).size();
  706. if (row == dummyObj[alias]
  707. || (row.size() > numValues && ((BooleanWritable) (row.get(numValues))).get())) {
  708. hasEmpty = true;
  709. }
  710. } else {
  711. mayHasMoreThanOne = true;
  712. if (!hasEmpty) {
  713. int numValues = joinValues.get(alias).size();
  714. for (ArrayList<Object> row = alw.first(); row != null; row = alw.next()) {
  715. if (row == dummyObj[alias]
  716. || (row.size() > numValues && ((BooleanWritable) (row.get(numValues))).get())) {
  717. hasEmpty = true;
  718. break;
  719. }
  720. }
  721. }
  722. }
  723. }
  724. }
  725. if (!hasEmpty && !mayHasMoreThanOne) {
  726. LOG.trace("calling genAllOneUniqueJoinObject");
  727. genAllOneUniqueJoinObject();
  728. LOG.trace("called genAllOneUniqueJoinObject");
  729. } else if (!hasEmpty) {
  730. LOG.trace("calling genUniqueJoinObject");
  731. genUniqueJoinObject(0, 0);
  732. LOG.trace("called genUniqueJoinObject");
  733. } else {
  734. LOG.trace("calling genObject");
  735. genObject(null, 0, new IntermediateObject(new ArrayList[numAliases], 0),
  736. true);
  737. LOG.trace("called genObject");
  738. }
  739. }
  740. }
  741. protected void reportProgress() {
  742. // Send some status periodically
  743. countAfterReport++;
  744. if ((countAfterReport % heartbeatInterval) == 0
  745. && (reporter != null)) {
  746. reporter.progress();
  747. countAfterReport = 0;
  748. }
  749. }
  750. /**
  751. * Returns true if the row does not pass through filters.
  752. */
  753. protected static Boolean isFiltered(Object row,
  754. List<ExprNodeEvaluator> filters, List<ObjectInspector> ois)
  755. throws HiveException {
  756. // apply join filters on the row.
  757. Boolean ret = false;
  758. for (int j = 0; j < filters.size(); j++) {
  759. Object condition = filters.get(j).evaluate(row);
  760. ret = (Boolean) ((PrimitiveObjectInspector)
  761. ois.get(j)).getPrimitiveJavaObject(condition);
  762. if (ret == null || !ret) {
  763. return true;
  764. }
  765. }
  766. return false;
  767. }
  768. /**
  769. * All done.
  770. *
  771. */
  772. @Override
  773. public void closeOp(boolean abort) throws HiveException {
  774. LOG.trace("Join Op close");
  775. for (AbstractRowContainer<ArrayList<Object>> alw : storage.values()) {
  776. if (alw != null) {
  777. alw.clear(); // clean up the temp files
  778. }
  779. }
  780. storage.clear();
  781. }
  782. @Override
  783. public String getName() {
  784. return "JOIN";
  785. }
  786. /**
  787. * @return the posToAliasMap
  788. */
  789. public Map<Integer, Set<String>> getPosToAliasMap() {
  790. return posToAliasMap;
  791. }
  792. /**
  793. * @param posToAliasMap
  794. * the posToAliasMap to set
  795. */
  796. public void setPosToAliasMap(Map<Integer, Set<String>> posToAliasMap) {
  797. this.posToAliasMap = posToAliasMap;
  798. }
  799. }