/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
import org.apache.hadoop.util.StringUtils;
/**
 * StatsTask implementation.
 **/
public class StatsTask extends Task<StatsWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  private Table table;
  private List<LinkedHashMap<String, String>> dpPartSpecs;

  public StatsTask() {
    super();
    dpPartSpecs = null;
  }
  /**
   *
   * Partition Level Statistics.
   *
   */
  class PartitionStatistics {
    int numFiles;  // number of files in the partition
    long numRows;  // number of rows in the partition
    long size;     // total size in bytes of the partition

    public PartitionStatistics() {
      numFiles = 0;
      numRows = 0L;
      size = 0L;
    }

    public PartitionStatistics(int nf, long nr, long sz) {
      numFiles = nf;
      numRows = nr;
      size = sz;
    }

    public int getNumFiles() {
      return numFiles;
    }

    public long getNumRows() {
      return numRows;
    }

    public long getSize() {
      return size;
    }

    public void setNumFiles(int nf) {
      numFiles = nf;
    }

    public void setNumRows(long nr) {
      numRows = nr;
    }

    public void setSize(long sz) {
      size = sz;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_files: ").append(numFiles).append(", ");
      sb.append("num_rows: ").append(numRows).append(", ");
      sb.append("total_size: ").append(size);
      return sb.toString();
    }
  }
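
  // Illustrative sketch, not part of the original class (the values below are
  // invented): how a populated PartitionStatistics renders via toString().
  //
  //   PartitionStatistics ps = new PartitionStatistics(3, 150L, 2048L);
  //   ps.toString(); // -> "num_files: 3, num_rows: 150, total_size: 2048"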
  /**
   *
   * Table Level Statistics.
   *
   */
  class TableStatistics extends PartitionStatistics {
    int numPartitions; // number of partitions

    public TableStatistics() {
      super();
      numPartitions = 0;
    }

    public void setNumPartitions(int np) {
      numPartitions = np;
    }

    public int getNumPartitions() {
      return numPartitions;
    }

    /**
     * Incrementally update the table statistics according to the old and new
     * partition level statistics.
     *
     * @param oldStats The old statistics of a partition.
     * @param newStats The new statistics of a partition.
     */
    public void updateStats(PartitionStatistics oldStats, PartitionStatistics newStats) {
      deletePartitionStats(oldStats);
      addPartitionStats(newStats);
    }

    /**
     * Update the table level statistics when a new partition is added.
     *
     * @param newStats the new partition statistics.
     */
    public void addPartitionStats(PartitionStatistics newStats) {
      this.numFiles += newStats.getNumFiles();
      this.numRows += newStats.getNumRows();
      this.size += newStats.getSize();
      this.numPartitions++;
    }

    /**
     * Update the table level statistics when an old partition is dropped.
     *
     * @param oldStats the old partition statistics.
     */
    public void deletePartitionStats(PartitionStatistics oldStats) {
      this.numFiles -= oldStats.getNumFiles();
      this.numRows -= oldStats.getNumRows();
      this.size -= oldStats.getSize();
      this.numPartitions--;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_partitions: ").append(numPartitions).append(", ");
      sb.append(super.toString());
      return sb.toString();
    }
  }
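
  // Illustrative sketch, values invented: updateStats() replaces one partition's
  // contribution without rescanning the whole table. deletePartitionStats()
  // decrements numPartitions and addPartitionStats() increments it again, so an
  // in-place update leaves the partition count unchanged.
  //
  //   TableStatistics ts = new TableStatistics();
  //   ts.addPartitionStats(new PartitionStatistics(2, 100L, 1024L)); // 1 partition
  //   ts.updateStats(new PartitionStatistics(2, 100L, 1024L),        // old stats
  //                  new PartitionStatistics(3, 150L, 2048L));       // new stats
  //   // still 1 partition; now 3 files, 150 rows, 2048 bytes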
  @Override
  @SuppressWarnings("unchecked")
  protected void receiveFeed(FeedType feedType, Object feedValue) {
    // this method should be called by MoveTask when there are dynamic partitions generated
    if (feedType == FeedType.DYNAMIC_PARTITIONS) {
      assert feedValue instanceof List<?>;
      dpPartSpecs = (List<LinkedHashMap<String, String>>) feedValue;
    }
  }
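
  // Illustrative sketch, spec values invented: MoveTask is expected to feed the
  // dynamic partition specs it created, one LinkedHashMap per partition, e.g.
  //
  //   LinkedHashMap<String, String> spec = new LinkedHashMap<String, String>();
  //   spec.put("ds", "2010-04-01");
  //   spec.put("hr", "12");
  //   List<LinkedHashMap<String, String>> specs =
  //       new ArrayList<LinkedHashMap<String, String>>();
  //   specs.add(spec);
  //   receiveFeed(FeedType.DYNAMIC_PARTITIONS, specs);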
  @Override
  public int execute(DriverContext driverContext) {
    // Make sure that it is either an ANALYZE command or an INSERT OVERWRITE command
    assert (work.getLoadTableDesc() != null && work.getTableSpecs() == null ||
        work.getLoadTableDesc() == null && work.getTableSpecs() != null);

    String tableName = "";
    try {
      if (work.getLoadTableDesc() != null) {
        tableName = work.getLoadTableDesc().getTable().getTableName();
      } else {
        tableName = work.getTableSpecs().tableName;
      }
      table = db.getTable(tableName);
    } catch (HiveException e) {
      LOG.error("Cannot get table " + tableName, e);
      console.printError("Cannot get table " + tableName, e.toString());
    }
    return aggregateStats();
  }

  @Override
  public StageType getType() {
    return StageType.STATS;
  }

  @Override
  public String getName() {
    return "STATS";
  }

  @Override
  protected void localizeMRTmpFilesImpl(Context ctx) {
    // Nothing to do for StatsTask here.
  }
  private int aggregateStats() {
    String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, conf);
    StatsAggregator statsAggregator = StatsFactory.getStatsAggregator();

    try {
      // Stats setup:
      Warehouse wh = new Warehouse(conf);
      FileSystem fileSys;
      FileStatus[] fileStatus;

      // manufacture a StatsAggregator
      if (!statsAggregator.connect(conf)) {
        throw new HiveException("StatsAggregator connect failed " + statsImplementationClass);
      }

      TableStatistics tblStats = new TableStatistics();

      //
      // For a partitioned table, get the old table statistics for incremental update
      //
      if (table.isPartitioned()) {
        org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
        Map<String, String> parameters = tTable.getParameters();
        if (parameters.containsKey(StatsSetupConst.ROW_COUNT)) {
          tblStats.setNumRows(Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_PARTITIONS)) {
          tblStats.setNumPartitions(Integer.parseInt(parameters.get(StatsSetupConst.NUM_PARTITIONS)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_FILES)) {
          tblStats.setNumFiles(Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)));
        }
        if (parameters.containsKey(StatsSetupConst.TOTAL_SIZE)) {
          tblStats.setSize(Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)));
        }
      }

      List<Partition> partitions = getPartitionsList();

      if (partitions == null) {
        // non-partitioned table:
        Path tablePath = wh.getDefaultTablePath(table.getDbName(), table.getTableName());
        fileSys = tablePath.getFileSystem(conf);
        fileStatus = Utilities.getFileStatusRecurse(tablePath, 1, fileSys);
        tblStats.setNumFiles(fileStatus.length);
        long tableSize = 0L;
        for (int i = 0; i < fileStatus.length; i++) {
          tableSize += fileStatus[i].getLen();
        }
        tblStats.setSize(tableSize);

        // In the case of a non-partitioned table, the key for the stats temporary store is "rootDir"
        String rows = statsAggregator.aggregateStats(work.getAggKey(), StatsSetupConst.ROW_COUNT);
        if (rows != null) {
          tblStats.setNumRows(Long.parseLong(rows));
        } else {
          if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
            throw new HiveException("StatsAggregator failed to get numRows.");
          }
        }
      } else {
        // Partitioned table:
        // Need to get the old stats of the partition
        // and update the table stats based on the old and new stats.
        for (Partition partn : partitions) {
          //
          // get the new partition stats
          //
          PartitionStatistics newPartStats = new PartitionStatistics();

          // In the case of a partition, the key for the stats temporary store is
          // "rootDir/[dynamic_partition_specs/]%"
          String partitionID = work.getAggKey() + Warehouse.makePartPath(partn.getSpec());

          String rows = statsAggregator.aggregateStats(partitionID, StatsSetupConst.ROW_COUNT);
          if (rows != null) {
            newPartStats.setNumRows(Long.parseLong(rows));
          } else {
            if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
              throw new HiveException("StatsAggregator failed to get numRows.");
            }
          }

          fileSys = partn.getPartitionPath().getFileSystem(conf);
          fileStatus = Utilities.getFileStatusRecurse(partn.getPartitionPath(), 1, fileSys);
          newPartStats.setNumFiles(fileStatus.length);

          long partitionSize = 0L;
          for (int i = 0; i < fileStatus.length; i++) {
            partitionSize += fileStatus[i].getLen();
          }
          newPartStats.setSize(partitionSize);

          //
          // get the old partition stats
          //
          org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
          Map<String, String> parameters = tPart.getParameters();

          boolean hasStats =
              parameters.containsKey(StatsSetupConst.NUM_FILES) ||
              parameters.containsKey(StatsSetupConst.ROW_COUNT) ||
              parameters.containsKey(StatsSetupConst.TOTAL_SIZE);

          int nf = parameters.containsKey(StatsSetupConst.NUM_FILES) ?
              Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)) :
              0;
          long nr = parameters.containsKey(StatsSetupConst.ROW_COUNT) ?
              Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)) :
              0L;
          long sz = parameters.containsKey(StatsSetupConst.TOTAL_SIZE) ?
              Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)) :
              0L;

          if (hasStats) {
            PartitionStatistics oldPartStats = new PartitionStatistics(nf, nr, sz);
            tblStats.updateStats(oldPartStats, newPartStats);
          } else {
            tblStats.addPartitionStats(newPartStats);
          }

          //
          // update the metastore
          //
          parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(newPartStats.getNumRows()));
          parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(newPartStats.getNumFiles()));
          parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(newPartStats.getSize()));
          tPart.setParameters(parameters);
          String tableFullName = table.getDbName() + "." + table.getTableName();
          db.alterPartition(tableFullName, new Partition(table, tPart));

          console.printInfo("Partition " + tableFullName + partn.getSpec() +
              " stats: [" + newPartStats.toString() + ']');
        }
      }

      //
      // write table stats to metastore
      //
      org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
      Map<String, String> parameters = tTable.getParameters();
      parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(tblStats.getNumRows()));
      parameters.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(tblStats.getNumPartitions()));
      parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(tblStats.getNumFiles()));
      parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tblStats.getSize()));
      tTable.setParameters(parameters);

      String tableFullName = table.getDbName() + "." + table.getTableName();
      db.alterTable(tableFullName, new Table(tTable));

      console.printInfo("Table " + tableFullName + " stats: [" + tblStats.toString() + ']');
    } catch (Exception e) {
      // return 0 since StatsTask should not fail the whole job
      console.printInfo("[Warning] could not update stats.",
          "Failed with exception " + e.getMessage() + "\n"
              + StringUtils.stringifyException(e));
    } finally {
      statsAggregator.closeConnection();
    }
    // StatsTask always returns 0 so that the whole job won't fail
    return 0;
  }
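
  // Illustrative sketch, values invented: how the per-partition aggregation key
  // above is assembled. Warehouse.makePartPath() renders a partition spec as a
  // path of key=value components, so for an aggregation key prefix of
  // "/tmp/job_1/" and the spec {ds=2010-04-01, hr=12}, the lookup key is roughly
  //
  //   work.getAggKey() + Warehouse.makePartPath(partn.getSpec())
  //   // ~ "/tmp/job_1/ds=2010-04-01/hr=12/"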
  /**
   * Get the list of partitions that need to update statistics.
   * TODO: we should reuse the Partitions generated at compile time
   * since getting the list of partitions is quite expensive.
   *
   * @return a list of partitions that need to update statistics.
   * @throws HiveException
   */
  private List<Partition> getPartitionsList() throws HiveException {

    List<Partition> list = new ArrayList<Partition>();

    if (work.getTableSpecs() != null) {

      // ANALYZE command
      tableSpec tblSpec = work.getTableSpecs();
      table = tblSpec.tableHandle;
      if (!table.isPartitioned()) {
        return null;
      }
      // get all partitions that match the partition spec
      List<Partition> partitions = tblSpec.partitions;
      if (partitions != null) {
        for (Partition partn : partitions) {
          list.add(partn);
        }
      }
    } else if (work.getLoadTableDesc() != null) {

      // INSERT OVERWRITE command
      LoadTableDesc tbd = work.getLoadTableDesc();
      table = db.getTable(tbd.getTable().getTableName());
      if (!table.isPartitioned()) {
        return null;
      }
      DynamicPartitionCtx dpCtx = tbd.getDPCtx();
      if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
        // load the list of DP partitions and return the list of partition specs
        for (LinkedHashMap<String, String> partSpec : dpPartSpecs) {
          Partition partn = db.getPartition(table, partSpec, false);
          list.add(partn);
        }
      } else { // static partition
        Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
        list.add(partn);
      }
    }
    return list;
  }
  /**
   * This method is static as it is called from the shutdown hook at the ExecDriver.
   */
  public static void cleanUp(String jobID, Configuration config) {
    StatsAggregator statsAggregator;
    String statsImplementationClass = HiveConf.getVar(config, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, config);
    statsAggregator = StatsFactory.getStatsAggregator();
    if (statsAggregator.connect(config)) {
      // Adding the path separator to avoid an ID being a prefix of another ID
      statsAggregator.cleanUp(jobID + Path.SEPARATOR);
    }
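    // Illustrative note, IDs invented: the trailing separator makes the cleanup
    // prefix unambiguous. Cleaning up with the bare prefix "job_1" could also
    // match keys written under "job_12"; "job_1" + Path.SEPARATOR matches only
    // keys under "job_1/".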
  }
}