
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
import org.apache.hadoop.util.StringUtils;

/**
 * StatsTask implementation. Updates table and partition level statistics in the
 * metastore after an ANALYZE or INSERT OVERWRITE command.
 */
public class StatsTask extends Task<StatsWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  private Table table;
  private List<LinkedHashMap<String, String>> dpPartSpecs;

  public StatsTask() {
    super();
    dpPartSpecs = null;
  }

  /**
   * Partition Level Statistics.
   */
  class PartitionStatistics {
    int numFiles; // number of files in the partition
    long numRows;  // number of rows in the partition
    long size;    // total size in bytes of the partition

    public PartitionStatistics() {
      numFiles = 0;
      numRows = 0L;
      size = 0L;
    }

    public PartitionStatistics(int nf, long nr, long sz) {
      numFiles = nf;
      numRows = nr;
      size = sz;
    }

    public int getNumFiles() {
      return numFiles;
    }

    public long getNumRows() {
      return numRows;
    }

    public long getSize() {
      return size;
    }

    public void setNumFiles(int nf) {
      numFiles = nf;
    }

    public void setNumRows(long nr) {
      numRows = nr;
    }

    public void setSize(long sz) {
      size = sz;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_files: ").append(numFiles).append(", ");
      sb.append("num_rows: ").append(numRows).append(", ");
      sb.append("total_size: ").append(size);
      return sb.toString();
    }
  }

  /**
   * Table Level Statistics.
   */
  class TableStatistics extends PartitionStatistics {
    int numPartitions; // number of partitions

    public TableStatistics() {
      super();
      numPartitions = 0;
    }

    public void setNumPartitions(int np) {
      numPartitions = np;
    }

    public int getNumPartitions() {
      return numPartitions;
    }

    /**
     * Incrementally update the table statistics according to the old and new
     * partition level statistics.
     * @param oldStats The old statistics of a partition.
     * @param newStats The new statistics of a partition.
     */
    public void updateStats(PartitionStatistics oldStats, PartitionStatistics newStats) {
      deletePartitionStats(oldStats);
      addPartitionStats(newStats);
    }

    /**
     * Update the table level statistics when a new partition is added.
     * @param newStats the new partition statistics.
     */
    public void addPartitionStats(PartitionStatistics newStats) {
      this.numFiles += newStats.getNumFiles();
      this.numRows += newStats.getNumRows();
      this.size += newStats.getSize();
      this.numPartitions++;
    }

    /**
     * Update the table level statistics when an old partition is dropped.
     * @param oldStats the old partition statistics.
     */
    public void deletePartitionStats(PartitionStatistics oldStats) {
      this.numFiles -= oldStats.getNumFiles();
      this.numRows -= oldStats.getNumRows();
      this.size -= oldStats.getSize();
      this.numPartitions--;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_partitions: ").append(numPartitions).append(", ");
      sb.append(super.toString());
      return sb.toString();
    }
  }

  @Override
  @SuppressWarnings("unchecked")
  protected void receiveFeed(FeedType feedType, Object feedValue) {
    // This method is called by MoveTask when dynamic partitions have been generated.
    if (feedType == FeedType.DYNAMIC_PARTITIONS) {
      assert feedValue instanceof List<?>;
      dpPartSpecs = (List<LinkedHashMap<String, String>>) feedValue;
    }
  }

  @Override
  public int execute(DriverContext driverContext) {

    // Make sure that it is either an ANALYZE command or an INSERT OVERWRITE command
    assert (work.getLoadTableDesc() != null && work.getTableSpecs() == null ||
            work.getLoadTableDesc() == null && work.getTableSpecs() != null);
    String tableName = "";
    try {
      if (work.getLoadTableDesc() != null) {
        tableName = work.getLoadTableDesc().getTable().getTableName();
      } else {
        tableName = work.getTableSpecs().tableName;
      }
      table = db.getTable(tableName);
    } catch (HiveException e) {
      LOG.error("Cannot get table " + tableName, e);
      console.printError("Cannot get table " + tableName, e.toString());
    }
    return aggregateStats();
  }

  @Override
  public StageType getType() {
    return StageType.STATS;
  }

  @Override
  public String getName() {
    return "STATS";
  }

  @Override
  protected void localizeMRTmpFilesImpl(Context ctx) {
    // Nothing to do for StatsTask here.
  }

  private int aggregateStats() {

    String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, conf);
    StatsAggregator statsAggregator = StatsFactory.getStatsAggregator();

    try {
      // Stats setup:
      Warehouse wh = new Warehouse(conf);
      FileSystem fileSys;
      FileStatus[] fileStatus;

      // connect to the temporary stats store through the configured StatsAggregator
      if (!statsAggregator.connect(conf)) {
        throw new HiveException("StatsAggregator connect failed " + statsImplementationClass);
      }

      TableStatistics tblStats = new TableStatistics();

      //
      // For a partitioned table, get the old table-level statistics so they can be updated incrementally
      //
      if (table.isPartitioned()) {
        org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
        Map<String, String> parameters = tTable.getParameters();
        if (parameters.containsKey(StatsSetupConst.ROW_COUNT)) {
          tblStats.setNumRows(Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_PARTITIONS)) {
          tblStats.setNumPartitions(Integer.parseInt(parameters.get(StatsSetupConst.NUM_PARTITIONS)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_FILES)) {
          tblStats.setNumFiles(Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)));
        }
        if (parameters.containsKey(StatsSetupConst.TOTAL_SIZE)) {
          tblStats.setSize(Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)));
        }
      }

      List<Partition> partitions = getPartitionsList();

      if (partitions == null) {
        // non-partitioned tables:

        Path tablePath = wh.getDefaultTablePath(table.getDbName(), table.getTableName());
        fileSys = tablePath.getFileSystem(conf);
        fileStatus = Utilities.getFileStatusRecurse(tablePath, 1, fileSys);
        tblStats.setNumFiles(fileStatus.length);
        long tableSize = 0L;
        for (int i = 0; i < fileStatus.length; i++) {
          tableSize += fileStatus[i].getLen();
        }
        tblStats.setSize(tableSize);

        // For a non-partitioned table, the key into the temporary stats store is "rootDir"
        String rows = statsAggregator.aggregateStats(work.getAggKey(), StatsSetupConst.ROW_COUNT);
        if (rows != null) {
          tblStats.setNumRows(Long.parseLong(rows));
        } else {
          if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
            throw new HiveException("StatsAggregator failed to get numRows.");
          }
        }
      } else {
        // Partitioned table:
        // Need to get the old stats of each partition
        // and update the table stats based on the old and new stats.
        for (Partition partn : partitions) {
          //
          // get the new partition stats
          //
          PartitionStatistics newPartStats = new PartitionStatistics();

          // In the case of a partition, the key into the temporary stats store is "rootDir/[dynamic_partition_specs/]%"
          String partitionID = work.getAggKey() + Warehouse.makePartPath(partn.getSpec());

          String rows = statsAggregator.aggregateStats(partitionID, StatsSetupConst.ROW_COUNT);
          if (rows != null) {
            newPartStats.setNumRows(Long.parseLong(rows));
          } else {
            if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
              throw new HiveException("StatsAggregator failed to get numRows.");
            }
          }

          fileSys = partn.getPartitionPath().getFileSystem(conf);
          fileStatus = Utilities.getFileStatusRecurse(partn.getPartitionPath(), 1, fileSys);
          newPartStats.setNumFiles(fileStatus.length);

          long partitionSize = 0L;
          for (int i = 0; i < fileStatus.length; i++) {
            partitionSize += fileStatus[i].getLen();
          }
          newPartStats.setSize(partitionSize);

          //
          // get the old partition stats
          //
          org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
          Map<String, String> parameters = tPart.getParameters();

          boolean hasStats =
            parameters.containsKey(StatsSetupConst.NUM_FILES) ||
            parameters.containsKey(StatsSetupConst.ROW_COUNT) ||
            parameters.containsKey(StatsSetupConst.TOTAL_SIZE);

          int  nf = parameters.containsKey(StatsSetupConst.NUM_FILES) ?
                    Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)) :
                    0;
          long nr = parameters.containsKey(StatsSetupConst.ROW_COUNT) ?
                    Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)) :
                    0L;
          long sz = parameters.containsKey(StatsSetupConst.TOTAL_SIZE) ?
                    Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)) :
                    0L;
          if (hasStats) {
            PartitionStatistics oldPartStats = new PartitionStatistics(nf, nr, sz);
            tblStats.updateStats(oldPartStats, newPartStats);
          } else {
            tblStats.addPartitionStats(newPartStats);
          }

          //
          // update the metastore
          //
          parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(newPartStats.getNumRows()));
          parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(newPartStats.getNumFiles()));
          parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(newPartStats.getSize()));

          tPart.setParameters(parameters);
          String tableFullName = table.getDbName() + "." + table.getTableName();
          db.alterPartition(tableFullName, new Partition(table, tPart));

          console.printInfo("Partition " + tableFullName + partn.getSpec() +
              " stats: [" + newPartStats.toString() + ']');
        }
      }

      //
      // write table stats to metastore
      //
      org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
      Map<String, String> parameters = tTable.getParameters();
      parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(tblStats.getNumRows()));
      parameters.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(tblStats.getNumPartitions()));
      parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(tblStats.getNumFiles()));
      parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tblStats.getSize()));
      tTable.setParameters(parameters);

      String tableFullName = table.getDbName() + "." + table.getTableName();

      db.alterTable(tableFullName, new Table(tTable));

      console.printInfo("Table " + tableFullName + " stats: [" + tblStats.toString() + ']');

    } catch (Exception e) {
      // return 0 since StatsTask should not fail the whole job
      console.printInfo("[Warning] could not update stats.",
          "Failed with exception " + e.getMessage() + "\n"
          + StringUtils.stringifyException(e));
    } finally {
      statsAggregator.closeConnection();
    }
    // StatsTask always returns 0 so that the whole job won't fail
    return 0;
  }

  /**
   * Get the list of partitions whose statistics need to be updated.
   * TODO: we should reuse the Partitions generated at compile time
   * since getting the list of partitions is quite expensive.
   * @return a list of partitions whose statistics need to be updated.
   * @throws HiveException
   */
  private List<Partition> getPartitionsList() throws HiveException {

    List<Partition> list = new ArrayList<Partition>();

    if (work.getTableSpecs() != null) {

      // ANALYZE command
      tableSpec tblSpec = work.getTableSpecs();
      table = tblSpec.tableHandle;
      if (!table.isPartitioned()) {
        return null;
      }
      // get all partitions that match the partition spec
      List<Partition> partitions = tblSpec.partitions;
      if (partitions != null) {
        for (Partition partn : partitions) {
          list.add(partn);
        }
      }
    } else if (work.getLoadTableDesc() != null) {

      // INSERT OVERWRITE command
      LoadTableDesc tbd = work.getLoadTableDesc();
      table = db.getTable(tbd.getTable().getTableName());
      if (!table.isPartitioned()) {
        return null;
      }
      DynamicPartitionCtx dpCtx = tbd.getDPCtx();
      if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
        // dpPartSpecs was fed to this task by MoveTask via receiveFeed();
        // look up the corresponding partitions
        for (LinkedHashMap<String, String> partSpec : dpPartSpecs) {
          Partition partn = db.getPartition(table, partSpec, false);
          list.add(partn);
        }
      } else { // static partition
        Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
        list.add(partn);
      }
    }
    return list;
  }

  /**
   * This method is static as it is called from the shutdown hook in ExecDriver.
   */
  public static void cleanUp(String jobID, Configuration config) {
    StatsAggregator statsAggregator;
    String statsImplementationClass = HiveConf.getVar(config, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, config);
    statsAggregator = StatsFactory.getStatsAggregator();
    if (statsAggregator.connect(config)) {
      // append the path separator so that one job ID cannot be a prefix of another
      statsAggregator.cleanUp(jobID + Path.SEPARATOR);
    }
  }
}