
/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java

https://github.com/apache/hive
Java | 283 lines | 185 code | 28 blank | 70 comment | 35 complexity | 017c82023d3f68be7cd7143e4bfe6747 MD5
Possible License(s): Apache-2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.io.Serializable;
import java.util.List;

import org.slf4j.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.LongWritable;
/**
 * Join operator implementation.
 */
public class JoinOperator extends CommonJoinOperator<JoinDesc> implements Serializable {

  private static final long serialVersionUID = 1L;

  private transient SkewJoinHandler skewJoinKeyContext = null;

  /**
   * SkewkeyTableCounter.
   *
   */
  public static enum SkewkeyTableCounter {
    SKEWJOINFOLLOWUPJOBS
  }

  private final transient LongWritable skewjoin_followup_jobs = new LongWritable(0);

  /** Kryo ctor. */
  protected JoinOperator() {
    super();
  }

  public JoinOperator(CompilationOpContext ctx) {
    super(ctx);
  }
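  /**
   * Sets up the skew join handler when skew join handling is enabled and
   * registers the SKEWJOINFOLLOWUPJOBS counter with the operator's stats map.
   */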
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    if (handleSkewJoin) {
      skewJoinKeyContext = new SkewJoinHandler(this);
      skewJoinKeyContext.initiliaze(hconf);
      skewJoinKeyContext.setSkewJoinJobCounter(skewjoin_followup_jobs);
    }
    statsMap.put(SkewkeyTableCounter.SKEWJOINFOLLOWUPJOBS.toString(), skewjoin_followup_jobs);
  }
  @Override
  public void process(Object row, int tag) throws HiveException {
    try {
      reportProgress();
      lastAlias = alias;
      alias = (byte) tag;

      List<Object> nr = getFilteredValue(alias, row);

      addToAliasFilterTags(alias, nr, false);

      if (handleSkewJoin) {
        skewJoinKeyContext.handleSkew(tag);
      }

      // number of rows for the key in the given table
      final long sz = storage[alias].rowCount();
      StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[tag];
      StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY
          .toString());
      List keyObject = (List) soi.getStructFieldData(row, sf);

      // Are we consuming too much memory
      if (alias == numAliases - 1 && !(handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) &&
          !hasLeftSemiJoin && !hasLeftAntiSemiJoin) {
        if (sz == joinEmitInterval && !hasFilter(condn[alias - 1].getLeft()) &&
            !hasFilter(condn[alias - 1].getRight())) {
          // The input is sorted by alias, so if we are already in the last join
          // operand, we can emit some results now.
          // Note this has to be done before adding the current row to the
          // storage, to preserve the correctness for outer joins.
          checkAndGenObject();
          storage[alias].clearRows();
        }
      }

      // The input is sorted by alias, so when an alias change is detected,
      // reset the counter for the next join key in the stream
      if (!alias.equals(lastAlias)) {
        nextSz = getNextSize(0L);
      }

      if (sz == nextSz) {
        LOG.info("Table {} has {} rows for join key {}", alias, sz, keyObject);
        nextSz = getNextSize(nextSz);
      }

      // Add the value to the vector
      // if join-key is null, process each row in different group.
      StructObjectInspector inspector =
          (StructObjectInspector) sf.getFieldObjectInspector();
      if (SerDeUtils.hasAnyNullObject(keyObject, inspector, nullsafes)) {
        endGroup();
        startGroup();
      }
      storage[alias].addRow(nr);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }
  @Override
  public OperatorType getType() {
    return OperatorType.JOIN;
  }

  /**
   * All done.
   *
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (handleSkewJoin) {
      skewJoinKeyContext.close(abort);
    }
    super.closeOp(abort);
  }
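  /**
   * At job close, commits the big-key and small-key skew directories for every
   * alias pair (moving tmp output to its final location), then flattens the
   * committed task output directories on success.
   */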
  @Override
  public void jobCloseOp(Configuration hconf, boolean success)
      throws HiveException {
    int numAliases = conf.getExprs().size();
    if (conf.getHandleSkewJoin()) {
      try {
        for (int i = 0; i < numAliases; i++) {
          Path specPath = conf.getBigKeysDirMap().get((byte) i);
          mvFileToFinalPath(specPath, hconf, success, LOG);
          for (int j = 0; j < numAliases; j++) {
            if (j == i) {
              continue;
            }
            specPath = getConf().getSmallKeysDirMap().get((byte) i).get(
                (byte) j);
            mvFileToFinalPath(specPath, hconf, success, LOG);
          }
        }

        if (success) {
          // move up files
          for (int i = 0; i < numAliases; i++) {
            Path specPath = conf.getBigKeysDirMap().get((byte) i);
            moveUpFiles(specPath, hconf, LOG);
            for (int j = 0; j < numAliases; j++) {
              if (j == i) {
                continue;
              }
              specPath = getConf().getSmallKeysDirMap().get((byte) i).get(
                  (byte) j);
              moveUpFiles(specPath, hconf, LOG);
            }
          }
        }
      } catch (IOException e) {
        throw new HiveException(e);
      }
    }
    super.jobCloseOp(hconf, success);
  }
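  /**
   * Moves the files under each task output directory in specPath directly up
   * into specPath itself, then removes the emptied directories.
   */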
  private void moveUpFiles(Path specPath, Configuration hconf, Logger log)
      throws IOException, HiveException {
    FileSystem fs = specPath.getFileSystem(hconf);

    if (fs.exists(specPath)) {
      FileStatus[] taskOutputDirs = fs.listStatus(specPath);
      if (taskOutputDirs != null) {
        for (FileStatus dir : taskOutputDirs) {
          Utilities.renameOrMoveFiles(fs, dir.getPath(), specPath);
          fs.delete(dir.getPath(), true);
        }
      }
    }
  }
  /**
   * This is a similar implementation of FileSinkOperator.moveFileToFinalPath.
   * @param specPath
   * @param hconf
   * @param success
   * @param log
   * @throws IOException
   * @throws HiveException
   */
  private void mvFileToFinalPath(Path specPath, Configuration hconf,
      boolean success, Logger log) throws IOException, HiveException {

    FileSystem fs = specPath.getFileSystem(hconf);
    Path tmpPath = Utilities.toTempPath(specPath);
    Path intermediatePath = new Path(tmpPath.getParent(), tmpPath.getName()
        + ".intermediate");
    if (success) {
      if (fs.exists(tmpPath)) {
        // Step1: rename tmp output folder to intermediate path. After this
        // point, updates from speculative tasks still writing to tmpPath
        // will not appear in finalPath.
        Utilities.FILE_OP_LOGGER.info("Moving tmp dir: " + tmpPath + " to: " + intermediatePath + "(spec " + specPath + ")");
        Utilities.rename(fs, tmpPath, intermediatePath);
        // Step2: remove any tmp file or double-committed output files
        Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, hconf, false);
        // Step3: move to the file destination
        Utilities.FILE_OP_LOGGER.info("Moving tmp dir: " + intermediatePath + " to: " + specPath);
        Utilities.renameOrMoveFiles(fs, intermediatePath, specPath);
      }
    } else {
      fs.delete(tmpPath, true);
    }
  }
  /**
   * Forward a record of join results.
   *
   * @throws HiveException
   */
  @Override
  public void endGroup() throws HiveException {
    // if this is a skew key, we need to handle it in a separate map reduce job.
    if (handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) {
      try {
        skewJoinKeyContext.endGroup();
      } catch (IOException e) {
        LOG.error(e.getMessage(), e);
        throw new HiveException(e);
      }
      return;
    } else {
      checkAndGenObject();
    }
  }

  @Override
  public boolean supportSkewJoinOptimization() {
    // Since skew join optimization makes a copy of the tree above joins, and
    // there is no multi-query optimization in place, let us not use skew join
    // optimizations for now.
    return false;
  }
  @Override
  public boolean opAllowedBeforeSortMergeJoin() {
    // If a join occurs before the sort-merge join, it is not useful to convert the sort-merge
    // join to a mapjoin. It might be simpler to perform the join and then a sort-merge
    // join. By converting the sort-merge join to a map-join, the job will be executed in 2
    // mapjoins in the best case. The number of inputs for the join is more than 1, so it would
    // be difficult to figure out the big table for the mapjoin.
    return false;
  }
}