/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
import org.apache.hadoop.util.StringUtils;
/**
 * StatsTask implementation.
 **/
public class StatsTask extends Task<StatsWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  private Table table;
  private List<LinkedHashMap<String, String>> dpPartSpecs;

  public StatsTask() {
    super();
    dpPartSpecs = null;
  }
  /**
   *
   * Partition Level Statistics.
   *
   */
  class PartitionStatistics {
    int numFiles;  // number of files in the partition
    long numRows;  // number of rows in the partition
    long size;     // total size in bytes of the partition

    public PartitionStatistics() {
      numFiles = 0;
      numRows = 0L;
      size = 0L;
    }

    public PartitionStatistics(int nf, long nr, long sz) {
      numFiles = nf;
      numRows = nr;
      size = sz;
    }

    public int getNumFiles() {
      return numFiles;
    }

    public long getNumRows() {
      return numRows;
    }

    public long getSize() {
      return size;
    }

    public void setNumFiles(int nf) {
      numFiles = nf;
    }

    public void setNumRows(long nr) {
      numRows = nr;
    }

    public void setSize(long sz) {
      size = sz;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_files: ").append(numFiles).append(", ");
      sb.append("num_rows: ").append(numRows).append(", ");
      sb.append("total_size: ").append(size);
      return sb.toString();
    }
  }
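
  // Illustrative sketch, not part of the original class (the values below are
  // invented): how a populated PartitionStatistics renders via toString().
  //
  //   PartitionStatistics ps = new PartitionStatistics(3, 150L, 2048L);
  //   ps.toString(); // -> "num_files: 3, num_rows: 150, total_size: 2048"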
  /**
   *
   * Table Level Statistics.
   *
   */
  class TableStatistics extends PartitionStatistics {
    int numPartitions; // number of partitions

    public TableStatistics() {
      super();
      numPartitions = 0;
    }

    public void setNumPartitions(int np) {
      numPartitions = np;
    }

    public int getNumPartitions() {
      return numPartitions;
    }

    /**
     * Incrementally update the table statistics according to the old and new
     * partition level statistics.
     *
     * @param oldStats The old statistics of a partition.
     * @param newStats The new statistics of a partition.
     */
    public void updateStats(PartitionStatistics oldStats, PartitionStatistics newStats) {
      deletePartitionStats(oldStats);
      addPartitionStats(newStats);
    }

    /**
     * Update the table level statistics when a new partition is added.
     *
     * @param newStats the new partition statistics.
     */
    public void addPartitionStats(PartitionStatistics newStats) {
      this.numFiles += newStats.getNumFiles();
      this.numRows += newStats.getNumRows();
      this.size += newStats.getSize();
      this.numPartitions++;
    }

    /**
     * Update the table level statistics when an old partition is dropped.
     *
     * @param oldStats the old partition statistics.
     */
    public void deletePartitionStats(PartitionStatistics oldStats) {
      this.numFiles -= oldStats.getNumFiles();
      this.numRows -= oldStats.getNumRows();
      this.size -= oldStats.getSize();
      this.numPartitions--;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_partitions: ").append(numPartitions).append(", ");
      sb.append(super.toString());
      return sb.toString();
    }
  }
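
  // Illustrative sketch, values invented: updateStats() replaces one partition's
  // contribution without rescanning the whole table. deletePartitionStats()
  // decrements numPartitions and addPartitionStats() increments it again, so an
  // in-place update leaves the partition count unchanged.
  //
  //   TableStatistics ts = new TableStatistics();
  //   ts.addPartitionStats(new PartitionStatistics(2, 100L, 1024L)); // 1 partition
  //   ts.updateStats(new PartitionStatistics(2, 100L, 1024L),        // old stats
  //                  new PartitionStatistics(3, 150L, 2048L));       // new stats
  //   // still 1 partition; now 3 files, 150 rows, 2048 bytes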
  @Override
  @SuppressWarnings("unchecked")
  protected void receiveFeed(FeedType feedType, Object feedValue) {
    // this method should be called by MoveTask when there are dynamic partitions generated
    if (feedType == FeedType.DYNAMIC_PARTITIONS) {
      assert feedValue instanceof List<?>;
      dpPartSpecs = (List<LinkedHashMap<String, String>>) feedValue;
    }
  }
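
  // Illustrative sketch, spec values invented: MoveTask is expected to feed the
  // dynamic partition specs it created, one LinkedHashMap per partition, e.g.
  //
  //   LinkedHashMap<String, String> spec = new LinkedHashMap<String, String>();
  //   spec.put("ds", "2010-04-01");
  //   spec.put("hr", "12");
  //   List<LinkedHashMap<String, String>> specs =
  //       new ArrayList<LinkedHashMap<String, String>>();
  //   specs.add(spec);
  //   receiveFeed(FeedType.DYNAMIC_PARTITIONS, specs);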
  @Override
  public int execute(DriverContext driverContext) {
    // Make sure that it is either an ANALYZE command or an INSERT OVERWRITE command
    assert (work.getLoadTableDesc() != null && work.getTableSpecs() == null ||
        work.getLoadTableDesc() == null && work.getTableSpecs() != null);

    String tableName = "";
    try {
      if (work.getLoadTableDesc() != null) {
        tableName = work.getLoadTableDesc().getTable().getTableName();
      } else {
        tableName = work.getTableSpecs().tableName;
      }
      table = db.getTable(tableName);
    } catch (HiveException e) {
      LOG.error("Cannot get table " + tableName, e);
      console.printError("Cannot get table " + tableName, e.toString());
    }
    return aggregateStats();
  }

  @Override
  public StageType getType() {
    return StageType.STATS;
  }

  @Override
  public String getName() {
    return "STATS";
  }

  @Override
  protected void localizeMRTmpFilesImpl(Context ctx) {
    // Nothing to do for StatsTask here.
  }
  private int aggregateStats() {
    String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, conf);
    StatsAggregator statsAggregator = StatsFactory.getStatsAggregator();

    try {
      // Stats setup:
      Warehouse wh = new Warehouse(conf);
      FileSystem fileSys;
      FileStatus[] fileStatus;

      // manufacture a StatsAggregator
      if (!statsAggregator.connect(conf)) {
        throw new HiveException("StatsAggregator connect failed " + statsImplementationClass);
      }

      TableStatistics tblStats = new TableStatistics();

      //
      // For a partitioned table, get the old table statistics for incremental update
      //
      if (table.isPartitioned()) {
        org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
        Map<String, String> parameters = tTable.getParameters();
        if (parameters.containsKey(StatsSetupConst.ROW_COUNT)) {
          tblStats.setNumRows(Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_PARTITIONS)) {
          tblStats.setNumPartitions(Integer.parseInt(parameters.get(StatsSetupConst.NUM_PARTITIONS)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_FILES)) {
          tblStats.setNumFiles(Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)));
        }
        if (parameters.containsKey(StatsSetupConst.TOTAL_SIZE)) {
          tblStats.setSize(Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)));
        }
      }

      List<Partition> partitions = getPartitionsList();

      if (partitions == null) {
        // non-partitioned table:
        Path tablePath = wh.getDefaultTablePath(table.getDbName(), table.getTableName());
        fileSys = tablePath.getFileSystem(conf);
        fileStatus = Utilities.getFileStatusRecurse(tablePath, 1, fileSys);
        tblStats.setNumFiles(fileStatus.length);
        long tableSize = 0L;
        for (int i = 0; i < fileStatus.length; i++) {
          tableSize += fileStatus[i].getLen();
        }
        tblStats.setSize(tableSize);

        // In the case of a non-partitioned table, the key for the stats temporary store is "rootDir"
        String rows = statsAggregator.aggregateStats(work.getAggKey(), StatsSetupConst.ROW_COUNT);
        if (rows != null) {
          tblStats.setNumRows(Long.parseLong(rows));
        } else {
          if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
            throw new HiveException("StatsAggregator failed to get numRows.");
          }
        }
      } else {
        // Partitioned table:
        // Need to get the old stats of the partition
        // and update the table stats based on the old and new stats.
        for (Partition partn : partitions) {
          //
          // get the new partition stats
          //
          PartitionStatistics newPartStats = new PartitionStatistics();

          // In the case of a partition, the key for the stats temporary store is
          // "rootDir/[dynamic_partition_specs/]%"
          String partitionID = work.getAggKey() + Warehouse.makePartPath(partn.getSpec());

          String rows = statsAggregator.aggregateStats(partitionID, StatsSetupConst.ROW_COUNT);
          if (rows != null) {
            newPartStats.setNumRows(Long.parseLong(rows));
          } else {
            if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
              throw new HiveException("StatsAggregator failed to get numRows.");
            }
          }

          fileSys = partn.getPartitionPath().getFileSystem(conf);
          fileStatus = Utilities.getFileStatusRecurse(partn.getPartitionPath(), 1, fileSys);
          newPartStats.setNumFiles(fileStatus.length);

          long partitionSize = 0L;
          for (int i = 0; i < fileStatus.length; i++) {
            partitionSize += fileStatus[i].getLen();
          }
          newPartStats.setSize(partitionSize);

          //
          // get the old partition stats
          //
          org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
          Map<String, String> parameters = tPart.getParameters();

          boolean hasStats =
              parameters.containsKey(StatsSetupConst.NUM_FILES) ||
              parameters.containsKey(StatsSetupConst.ROW_COUNT) ||
              parameters.containsKey(StatsSetupConst.TOTAL_SIZE);

          int nf = parameters.containsKey(StatsSetupConst.NUM_FILES) ?
              Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)) :
              0;
          long nr = parameters.containsKey(StatsSetupConst.ROW_COUNT) ?
              Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)) :
              0L;
          long sz = parameters.containsKey(StatsSetupConst.TOTAL_SIZE) ?
              Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)) :
              0L;

          if (hasStats) {
            PartitionStatistics oldPartStats = new PartitionStatistics(nf, nr, sz);
            tblStats.updateStats(oldPartStats, newPartStats);
          } else {
            tblStats.addPartitionStats(newPartStats);
          }

          //
          // update the metastore
          //
          parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(newPartStats.getNumRows()));
          parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(newPartStats.getNumFiles()));
          parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(newPartStats.getSize()));
          tPart.setParameters(parameters);
          String tableFullName = table.getDbName() + "." + table.getTableName();
          db.alterPartition(tableFullName, new Partition(table, tPart));

          console.printInfo("Partition " + tableFullName + partn.getSpec() +
              " stats: [" + newPartStats.toString() + ']');
        }
      }

      //
      // write table stats to metastore
      //
      org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
      Map<String, String> parameters = tTable.getParameters();
      parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(tblStats.getNumRows()));
      parameters.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(tblStats.getNumPartitions()));
      parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(tblStats.getNumFiles()));
      parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tblStats.getSize()));
      tTable.setParameters(parameters);

      String tableFullName = table.getDbName() + "." + table.getTableName();
      db.alterTable(tableFullName, new Table(tTable));

      console.printInfo("Table " + tableFullName + " stats: [" + tblStats.toString() + ']');
    } catch (Exception e) {
      // return 0 since StatsTask should not fail the whole job
      console.printInfo("[Warning] could not update stats.",
          "Failed with exception " + e.getMessage() + "\n"
              + StringUtils.stringifyException(e));
    } finally {
      statsAggregator.closeConnection();
    }
    // StatsTask always returns 0 so that the whole job won't fail
    return 0;
  }
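
  // Illustrative sketch, values invented: how the per-partition aggregation key
  // above is assembled. Warehouse.makePartPath() renders a partition spec as a
  // path of key=value components, so for an aggregation key prefix of
  // "/tmp/job_1/" and the spec {ds=2010-04-01, hr=12}, the lookup key is roughly
  //
  //   work.getAggKey() + Warehouse.makePartPath(partn.getSpec())
  //   // ~ "/tmp/job_1/ds=2010-04-01/hr=12/"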
  /**
   * Get the list of partitions that need to update statistics.
   * TODO: we should reuse the Partitions generated at compile time
   * since getting the list of partitions is quite expensive.
   *
   * @return a list of partitions that need to update statistics.
   * @throws HiveException
   */
  private List<Partition> getPartitionsList() throws HiveException {

    List<Partition> list = new ArrayList<Partition>();

    if (work.getTableSpecs() != null) {

      // ANALYZE command
      tableSpec tblSpec = work.getTableSpecs();
      table = tblSpec.tableHandle;
      if (!table.isPartitioned()) {
        return null;
      }
      // get all partitions that match the partition spec
      List<Partition> partitions = tblSpec.partitions;
      if (partitions != null) {
        for (Partition partn : partitions) {
          list.add(partn);
        }
      }
    } else if (work.getLoadTableDesc() != null) {

      // INSERT OVERWRITE command
      LoadTableDesc tbd = work.getLoadTableDesc();
      table = db.getTable(tbd.getTable().getTableName());
      if (!table.isPartitioned()) {
        return null;
      }
      DynamicPartitionCtx dpCtx = tbd.getDPCtx();
      if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
        // load the list of DP partitions and return the list of partition specs
        for (LinkedHashMap<String, String> partSpec : dpPartSpecs) {
          Partition partn = db.getPartition(table, partSpec, false);
          list.add(partn);
        }
      } else { // static partition
        Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
        list.add(partn);
      }
    }
    return list;
  }
  /**
   * This method is static as it is called from the shutdown hook at the ExecDriver.
   */
  public static void cleanUp(String jobID, Configuration config) {
    StatsAggregator statsAggregator;
    String statsImplementationClass = HiveConf.getVar(config, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, config);
    statsAggregator = StatsFactory.getStatsAggregator();
    if (statsAggregator.connect(config)) {
      // Adding the path separator to avoid an ID being a prefix of another ID
      statsAggregator.cleanUp(jobID + Path.SEPARATOR);
    }
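    // Illustrative note, IDs invented: the trailing separator makes the cleanup
    // prefix unambiguous. Cleaning up with the bare prefix "job_1" could also
    // match keys written under "job_12"; "job_1" + Path.SEPARATOR matches only
    // keys under "job_1/".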
  }
}