
/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java

https://github.com/apache/hive
Java | 283 lines | 185 code | 28 blank | 70 comment | 35 complexity | 017c82023d3f68be7cd7143e4bfe6747 MD5
Possible License(s): Apache-2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.io.Serializable;
import java.util.List;

import org.slf4j.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.LongWritable;
/**
 * Join operator implementation.
 */
public class JoinOperator extends CommonJoinOperator<JoinDesc> implements Serializable {

  private static final long serialVersionUID = 1L;

  private transient SkewJoinHandler skewJoinKeyContext = null;

  /**
   * SkewkeyTableCounter.
   *
   */
  public static enum SkewkeyTableCounter {
    SKEWJOINFOLLOWUPJOBS
  }

  private final transient LongWritable skewjoin_followup_jobs = new LongWritable(0);

  /** Kryo ctor. */
  protected JoinOperator() {
    super();
  }

  public JoinOperator(CompilationOpContext ctx) {
    super(ctx);
  }
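  /**
   * Sets up the skew join handler when skew join handling is enabled and
   * registers the SKEWJOINFOLLOWUPJOBS counter with the operator's stats map.
   */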
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    if (handleSkewJoin) {
      skewJoinKeyContext = new SkewJoinHandler(this);
      skewJoinKeyContext.initiliaze(hconf);
      skewJoinKeyContext.setSkewJoinJobCounter(skewjoin_followup_jobs);
    }
    statsMap.put(SkewkeyTableCounter.SKEWJOINFOLLOWUPJOBS.toString(), skewjoin_followup_jobs);
  }
  @Override
  public void process(Object row, int tag) throws HiveException {
    try {
      reportProgress();
      lastAlias = alias;
      alias = (byte) tag;

      List<Object> nr = getFilteredValue(alias, row);

      addToAliasFilterTags(alias, nr, false);

      if (handleSkewJoin) {
        skewJoinKeyContext.handleSkew(tag);
      }

      // number of rows for the key in the given table
      final long sz = storage[alias].rowCount();
      StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[tag];
      StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY
          .toString());
      List keyObject = (List) soi.getStructFieldData(row, sf);

      // Are we consuming too much memory
      if (alias == numAliases - 1 && !(handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) &&
          !hasLeftSemiJoin && !hasLeftAntiSemiJoin) {
        if (sz == joinEmitInterval && !hasFilter(condn[alias - 1].getLeft()) &&
            !hasFilter(condn[alias - 1].getRight())) {
          // The input is sorted by alias, so if we are already in the last join
          // operand, we can emit some results now.
          // Note this has to be done before adding the current row to the
          // storage, to preserve the correctness for outer joins.
          checkAndGenObject();
          storage[alias].clearRows();
        }
      }

      // The input is sorted by alias, so when an alias change is detected,
      // reset the counter for the next join key in the stream
      if (!alias.equals(lastAlias)) {
        nextSz = getNextSize(0L);
      }

      if (sz == nextSz) {
        LOG.info("Table {} has {} rows for join key {}", alias, sz, keyObject);
        nextSz = getNextSize(nextSz);
      }

      // Add the value to the vector
      // if join-key is null, process each row in different group.
      StructObjectInspector inspector =
          (StructObjectInspector) sf.getFieldObjectInspector();
      if (SerDeUtils.hasAnyNullObject(keyObject, inspector, nullsafes)) {
        endGroup();
        startGroup();
      }
      storage[alias].addRow(nr);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }
  @Override
  public OperatorType getType() {
    return OperatorType.JOIN;
  }

  /**
   * All done.
   *
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (handleSkewJoin) {
      skewJoinKeyContext.close(abort);
    }
    super.closeOp(abort);
  }
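  /**
   * At job close, commits the big-key and small-key skew directories for every
   * alias pair (moving tmp output to its final location), then flattens the
   * committed task output directories on success.
   */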
  @Override
  public void jobCloseOp(Configuration hconf, boolean success)
      throws HiveException {
    int numAliases = conf.getExprs().size();
    if (conf.getHandleSkewJoin()) {
      try {
        for (int i = 0; i < numAliases; i++) {
          Path specPath = conf.getBigKeysDirMap().get((byte) i);
          mvFileToFinalPath(specPath, hconf, success, LOG);
          for (int j = 0; j < numAliases; j++) {
            if (j == i) {
              continue;
            }
            specPath = getConf().getSmallKeysDirMap().get((byte) i).get(
                (byte) j);
            mvFileToFinalPath(specPath, hconf, success, LOG);
          }
        }

        if (success) {
          // move up files
          for (int i = 0; i < numAliases; i++) {
            Path specPath = conf.getBigKeysDirMap().get((byte) i);
            moveUpFiles(specPath, hconf, LOG);
            for (int j = 0; j < numAliases; j++) {
              if (j == i) {
                continue;
              }
              specPath = getConf().getSmallKeysDirMap().get((byte) i).get(
                  (byte) j);
              moveUpFiles(specPath, hconf, LOG);
            }
          }
        }
      } catch (IOException e) {
        throw new HiveException(e);
      }
    }
    super.jobCloseOp(hconf, success);
  }
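  /**
   * Moves the files under each task output directory in specPath directly up
   * into specPath itself, then removes the emptied directories.
   */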
  private void moveUpFiles(Path specPath, Configuration hconf, Logger log)
      throws IOException, HiveException {
    FileSystem fs = specPath.getFileSystem(hconf);

    if (fs.exists(specPath)) {
      FileStatus[] taskOutputDirs = fs.listStatus(specPath);
      if (taskOutputDirs != null) {
        for (FileStatus dir : taskOutputDirs) {
          Utilities.renameOrMoveFiles(fs, dir.getPath(), specPath);
          fs.delete(dir.getPath(), true);
        }
      }
    }
  }
  /**
   * This is a similar implementation of FileSinkOperator.moveFileToFinalPath.
   * @param specPath
   * @param hconf
   * @param success
   * @param log
   * @throws IOException
   * @throws HiveException
   */
  private void mvFileToFinalPath(Path specPath, Configuration hconf,
      boolean success, Logger log) throws IOException, HiveException {

    FileSystem fs = specPath.getFileSystem(hconf);
    Path tmpPath = Utilities.toTempPath(specPath);
    Path intermediatePath = new Path(tmpPath.getParent(), tmpPath.getName()
        + ".intermediate");
    if (success) {
      if (fs.exists(tmpPath)) {
        // Step1: rename tmp output folder to intermediate path. After this
        // point, updates from speculative tasks still writing to tmpPath
        // will not appear in finalPath.
        Utilities.FILE_OP_LOGGER.info("Moving tmp dir: " + tmpPath + " to: " + intermediatePath + "(spec " + specPath + ")");
        Utilities.rename(fs, tmpPath, intermediatePath);
        // Step2: remove any tmp file or double-committed output files
        Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, hconf, false);
        // Step3: move to the file destination
        Utilities.FILE_OP_LOGGER.info("Moving tmp dir: " + intermediatePath + " to: " + specPath);
        Utilities.renameOrMoveFiles(fs, intermediatePath, specPath);
      }
    } else {
      fs.delete(tmpPath, true);
    }
  }
  /**
   * Forward a record of join results.
   *
   * @throws HiveException
   */
  @Override
  public void endGroup() throws HiveException {
    // if this is a skew key, we need to handle it in a separate map reduce job.
    if (handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) {
      try {
        skewJoinKeyContext.endGroup();
      } catch (IOException e) {
        LOG.error(e.getMessage(), e);
        throw new HiveException(e);
      }
      return;
    } else {
      checkAndGenObject();
    }
  }

  @Override
  public boolean supportSkewJoinOptimization() {
    // Since skew join optimization makes a copy of the tree above joins, and
    // there is no multi-query optimization in place, let us not use skew join
    // optimizations for now.
    return false;
  }
  @Override
  public boolean opAllowedBeforeSortMergeJoin() {
    // If a join occurs before the sort-merge join, it is not useful to convert the sort-merge
    // join to a mapjoin. It might be simpler to perform the join and then a sort-merge
    // join. By converting the sort-merge join to a map-join, the job will be executed in 2
    // mapjoins in the best case. The number of inputs for the join is more than 1, so it would
    // be difficult to figure out the big table for the mapjoin.
    return false;
  }
}