/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java
Java | 283 lines | 185 code | 28 blank | 70 comment | 35 complexity | 017c82023d3f68be7cd7143e4bfe6747 MD5 | raw file
Possible License(s): Apache-2.0
- /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.hadoop.hive.ql.exec;
- import java.io.IOException;
- import java.io.Serializable;
- import java.util.List;
- import org.slf4j.Logger;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileStatus;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.hive.ql.CompilationOpContext;
- import org.apache.hadoop.hive.ql.metadata.HiveException;
- import org.apache.hadoop.hive.ql.plan.JoinDesc;
- import org.apache.hadoop.hive.ql.plan.api.OperatorType;
- import org.apache.hadoop.hive.serde2.SerDeUtils;
- import org.apache.hadoop.hive.serde2.objectinspector.StructField;
- import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
- import org.apache.hadoop.io.LongWritable;
- /**
- * Join operator implementation.
- */
- public class JoinOperator extends CommonJoinOperator<JoinDesc> implements Serializable {
- private static final long serialVersionUID = 1L;
- private transient SkewJoinHandler skewJoinKeyContext = null;
- /**
- * SkewkeyTableCounter.
- *
- */
- public static enum SkewkeyTableCounter {
- SKEWJOINFOLLOWUPJOBS
- }
- private final transient LongWritable skewjoin_followup_jobs = new LongWritable(0);
- /** Kryo ctor. */
- protected JoinOperator() {
- super();
- }
- public JoinOperator(CompilationOpContext ctx) {
- super(ctx);
- }
- @Override
- protected void initializeOp(Configuration hconf) throws HiveException {
- super.initializeOp(hconf);
- if (handleSkewJoin) {
- skewJoinKeyContext = new SkewJoinHandler(this);
- skewJoinKeyContext.initiliaze(hconf);
- skewJoinKeyContext.setSkewJoinJobCounter(skewjoin_followup_jobs);
- }
- statsMap.put(SkewkeyTableCounter.SKEWJOINFOLLOWUPJOBS.toString(), skewjoin_followup_jobs);
- }
- @Override
- public void process(Object row, int tag) throws HiveException {
- try {
- reportProgress();
- lastAlias = alias;
- alias = (byte) tag;
- List<Object> nr = getFilteredValue(alias, row);
- addToAliasFilterTags(alias, nr, false);
- if (handleSkewJoin) {
- skewJoinKeyContext.handleSkew(tag);
- }
- // number of rows for the key in the given table
- final long sz = storage[alias].rowCount();
- StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[tag];
- StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY
- .toString());
- List keyObject = (List) soi.getStructFieldData(row, sf);
- // Are we consuming too much memory
- if (alias == numAliases - 1 && !(handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) &&
- !hasLeftSemiJoin && !hasLeftAntiSemiJoin) {
- if (sz == joinEmitInterval && !hasFilter(condn[alias-1].getLeft()) &&
- !hasFilter(condn[alias-1].getRight())) {
- // The input is sorted by alias, so if we are already in the last join
- // operand,
- // we can emit some results now.
- // Note this has to be done before adding the current row to the
- // storage,
- // to preserve the correctness for outer joins.
- checkAndGenObject();
- storage[alias].clearRows();
- }
- }
- // The input is sorted by alias, so when an alias change is detected,
- // reset the counter for the next join key in the stream
- if (!alias.equals(lastAlias)) {
- nextSz = getNextSize(0L);
- }
- if (sz == nextSz) {
- LOG.info("Table {} has {} rows for join key {}", alias, sz, keyObject);
- nextSz = getNextSize(nextSz);
- }
- // Add the value to the vector
- // if join-key is null, process each row in different group.
- StructObjectInspector inspector =
- (StructObjectInspector) sf.getFieldObjectInspector();
- if (SerDeUtils.hasAnyNullObject(keyObject, inspector, nullsafes)) {
- endGroup();
- startGroup();
- }
- storage[alias].addRow(nr);
- } catch (Exception e) {
- throw new HiveException(e);
- }
- }
- @Override
- public OperatorType getType() {
- return OperatorType.JOIN;
- }
- /**
- * All done.
- *
- */
- @Override
- public void closeOp(boolean abort) throws HiveException {
- if (handleSkewJoin) {
- skewJoinKeyContext.close(abort);
- }
- super.closeOp(abort);
- }
- @Override
- public void jobCloseOp(Configuration hconf, boolean success)
- throws HiveException {
- int numAliases = conf.getExprs().size();
- if (conf.getHandleSkewJoin()) {
- try {
- for (int i = 0; i < numAliases; i++) {
- Path specPath = conf.getBigKeysDirMap().get((byte) i);
- mvFileToFinalPath(specPath, hconf, success, LOG);
- for (int j = 0; j < numAliases; j++) {
- if (j == i) {
- continue;
- }
- specPath = getConf().getSmallKeysDirMap().get((byte) i).get(
- (byte) j);
- mvFileToFinalPath(specPath, hconf, success, LOG);
- }
- }
- if (success) {
- // move up files
- for (int i = 0; i < numAliases; i++) {
- Path specPath = conf.getBigKeysDirMap().get((byte) i);
- moveUpFiles(specPath, hconf, LOG);
- for (int j = 0; j < numAliases; j++) {
- if (j == i) {
- continue;
- }
- specPath = getConf().getSmallKeysDirMap().get((byte) i).get(
- (byte) j);
- moveUpFiles(specPath, hconf, LOG);
- }
- }
- }
- } catch (IOException e) {
- throw new HiveException(e);
- }
- }
- super.jobCloseOp(hconf, success);
- }
- private void moveUpFiles(Path specPath, Configuration hconf, Logger log)
- throws IOException, HiveException {
- FileSystem fs = specPath.getFileSystem(hconf);
- if (fs.exists(specPath)) {
- FileStatus[] taskOutputDirs = fs.listStatus(specPath);
- if (taskOutputDirs != null) {
- for (FileStatus dir : taskOutputDirs) {
- Utilities.renameOrMoveFiles(fs, dir.getPath(), specPath);
- fs.delete(dir.getPath(), true);
- }
- }
- }
- }
- /**
- * This is a similar implementation of FileSinkOperator.moveFileToFinalPath.
- * @param specPath
- * @param hconf
- * @param success
- * @param log
- * @throws IOException
- * @throws HiveException
- */
- private void mvFileToFinalPath(Path specPath, Configuration hconf,
- boolean success, Logger log) throws IOException, HiveException {
- FileSystem fs = specPath.getFileSystem(hconf);
- Path tmpPath = Utilities.toTempPath(specPath);
- Path intermediatePath = new Path(tmpPath.getParent(), tmpPath.getName()
- + ".intermediate");
- if (success) {
- if (fs.exists(tmpPath)) {
- // Step1: rename tmp output folder to intermediate path. After this
- // point, updates from speculative tasks still writing to tmpPath
- // will not appear in finalPath.
- Utilities.FILE_OP_LOGGER.info("Moving tmp dir: " + tmpPath + " to: " + intermediatePath + "(spec " + specPath + ")");
- Utilities.rename(fs, tmpPath, intermediatePath);
- // Step2: remove any tmp file or double-committed output files
- Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, hconf, false);
- // Step3: move to the file destination
- Utilities.FILE_OP_LOGGER.info("Moving tmp dir: " + intermediatePath + " to: " + specPath);
- Utilities.renameOrMoveFiles(fs, intermediatePath, specPath);
- }
- } else {
- fs.delete(tmpPath, true);
- }
- }
- /**
- * Forward a record of join results.
- *
- * @throws HiveException
- */
- @Override
- public void endGroup() throws HiveException {
- // if this is a skew key, we need to handle it in a separate map reduce job.
- if (handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) {
- try {
- skewJoinKeyContext.endGroup();
- } catch (IOException e) {
- LOG.error(e.getMessage(), e);
- throw new HiveException(e);
- }
- return;
- } else {
- checkAndGenObject();
- }
- }
- @Override
- public boolean supportSkewJoinOptimization() {
- // Since skew join optimization makes a copy of the tree above joins, and
- // there is no multi-query optimization in place, let us not use skew join
- // optimizations for now.
- return false;
- }
- @Override
- public boolean opAllowedBeforeSortMergeJoin() {
- // If a join occurs before the sort-merge join, it is not useful to convert the the sort-merge
- // join to a mapjoin. It might be simpler to perform the join and then a sort-merge join
- // join. By converting the sort-merge join to a map-join, the job will be executed in 2
- // mapjoins in the best case. The number of inputs for the join is more than 1 so it would
- // be difficult to figure out the big table for the mapjoin.
- return false;
- }
- }