
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
import org.apache.hadoop.util.StringUtils;

/**
 * StatsTask implementation.
 **/
public class StatsTask extends Task<StatsWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  private Table table;
  private List<LinkedHashMap<String, String>> dpPartSpecs;

  public StatsTask() {
    super();
    dpPartSpecs = null;
  }

  /**
   *
   * Partition Level Statistics.
   *
   */
  class PartitionStatistics {
    int numFiles;  // number of files in the partition
    long numRows;  // number of rows in the partition
    long size;     // total size in bytes of the partition

    public PartitionStatistics() {
      numFiles = 0;
      numRows = 0L;
      size = 0L;
    }

    public PartitionStatistics(int nf, long nr, long sz) {
      numFiles = nf;
      numRows = nr;
      size = sz;
    }

    public int getNumFiles() {
      return numFiles;
    }

    public long getNumRows() {
      return numRows;
    }

    public long getSize() {
      return size;
    }

    public void setNumFiles(int nf) {
      numFiles = nf;
    }

    public void setNumRows(long nr) {
      numRows = nr;
    }

    public void setSize(long sz) {
      size = sz;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_files: ").append(numFiles).append(", ");
      sb.append("num_rows: ").append(numRows).append(", ");
      sb.append("total_size: ").append(size);
      return sb.toString();
    }
  }

  /**
   *
   * Table Level Statistics.
   *
   */
  class TableStatistics extends PartitionStatistics {
    int numPartitions;  // number of partitions

    public TableStatistics() {
      super();
      numPartitions = 0;
    }

    public void setNumPartitions(int np) {
      numPartitions = np;
    }

    public int getNumPartitions() {
      return numPartitions;
    }

    /**
     * Incrementally update the table statistics according to the old and new
     * partition level statistics.
     *
     * @param oldStats The old statistics of a partition.
     * @param newStats The new statistics of a partition.
     */
    public void updateStats(PartitionStatistics oldStats, PartitionStatistics newStats) {
      deletePartitionStats(oldStats);
      addPartitionStats(newStats);
    }

    /**
     * Update the table level statistics when a new partition is added.
     *
     * @param newStats the new partition statistics.
     */
    public void addPartitionStats(PartitionStatistics newStats) {
      this.numFiles += newStats.getNumFiles();
      this.numRows += newStats.getNumRows();
      this.size += newStats.getSize();
      this.numPartitions++;
    }

    /**
     * Update the table level statistics when an old partition is dropped.
     *
     * @param oldStats the old partition statistics.
     */
    public void deletePartitionStats(PartitionStatistics oldStats) {
      this.numFiles -= oldStats.getNumFiles();
      this.numRows -= oldStats.getNumRows();
      this.size -= oldStats.getSize();
      this.numPartitions--;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("num_partitions: ").append(numPartitions).append(", ");
      sb.append(super.toString());
      return sb.toString();
    }
  }
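
  // Illustrative example of the incremental update above (all values hypothetical, not
  // taken from the original source): if a partition previously had
  // {numFiles: 2, numRows: 100, totalSize: 2048} and is overwritten with
  // {numFiles: 3, numRows: 150, totalSize: 4096}, then updateStats(oldStats, newStats)
  // first subtracts the old values (numPartitions drops by one) and then adds the new
  // ones, so the table totals change by {+1 file, +50 rows, +2048 bytes} while
  // numPartitions ends up unchanged.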

  @Override
  protected void receiveFeed(FeedType feedType, Object feedValue) {
    // this method should be called by MoveTask when there are dynamic partitions generated
    if (feedType == FeedType.DYNAMIC_PARTITIONS) {
      assert feedValue instanceof List<?>;
      dpPartSpecs = (List<LinkedHashMap<String, String>>) feedValue;
    }
  }

  @Override
  public int execute(DriverContext driverContext) {
    // Make sure that it is either an ANALYZE command or an INSERT OVERWRITE command
    assert (work.getLoadTableDesc() != null && work.getTableSpecs() == null ||
        work.getLoadTableDesc() == null && work.getTableSpecs() != null);

    String tableName = "";
    try {
      if (work.getLoadTableDesc() != null) {
        tableName = work.getLoadTableDesc().getTable().getTableName();
      } else {
        tableName = work.getTableSpecs().tableName;
      }
      table = db.getTable(tableName);
    } catch (HiveException e) {
      LOG.error("Cannot get table " + tableName, e);
      console.printError("Cannot get table " + tableName, e.toString());
    }
    return aggregateStats();
  }

  @Override
  public StageType getType() {
    return StageType.STATS;
  }

  @Override
  public String getName() {
    return "STATS";
  }

  @Override
  protected void localizeMRTmpFilesImpl(Context ctx) {
    // Nothing to do for StatsTask here.
  }

  private int aggregateStats() {
    String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, conf);
    StatsAggregator statsAggregator = StatsFactory.getStatsAggregator();

    try {
      // Stats setup:
      Warehouse wh = new Warehouse(conf);
      FileSystem fileSys;
      FileStatus[] fileStatus;

      // manufacture a StatsAggregator
      if (!statsAggregator.connect(conf)) {
        throw new HiveException("StatsAggregator connect failed " + statsImplementationClass);
      }

      TableStatistics tblStats = new TableStatistics();

      //
      // For partitioned table get the old table statistics for incremental update
      //
      if (table.isPartitioned()) {
        org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
        Map<String, String> parameters = tTable.getParameters();
        if (parameters.containsKey(StatsSetupConst.ROW_COUNT)) {
          tblStats.setNumRows(Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_PARTITIONS)) {
          tblStats.setNumPartitions(Integer.parseInt(parameters.get(StatsSetupConst.NUM_PARTITIONS)));
        }
        if (parameters.containsKey(StatsSetupConst.NUM_FILES)) {
          tblStats.setNumFiles(Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)));
        }
        if (parameters.containsKey(StatsSetupConst.TOTAL_SIZE)) {
          tblStats.setSize(Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)));
        }
      }

      List<Partition> partitions = getPartitionsList();

      if (partitions == null) {
        // non-partitioned tables:
        Path tablePath = wh.getDefaultTablePath(table.getDbName(), table.getTableName());
        fileSys = tablePath.getFileSystem(conf);
        fileStatus = Utilities.getFileStatusRecurse(tablePath, 1, fileSys);

        tblStats.setNumFiles(fileStatus.length);
        long tableSize = 0L;
        for (int i = 0; i < fileStatus.length; i++) {
          tableSize += fileStatus[i].getLen();
        }
        tblStats.setSize(tableSize);

        // In case of a non-partitioned table, the key for stats temporary store is "rootDir"
        String rows = statsAggregator.aggregateStats(work.getAggKey(), StatsSetupConst.ROW_COUNT);
        if (rows != null) {
          tblStats.setNumRows(Long.parseLong(rows));
        } else {
          if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
            throw new HiveException("StatsAggregator failed to get numRows.");
          }
        }
      } else {
        // Partitioned table:
        // Need to get the old stats of the partition
        // and update the table stats based on the old and new stats.
        for (Partition partn : partitions) {
          //
          // get the new partition stats
          //
          PartitionStatistics newPartStats = new PartitionStatistics();

          // In the case of a partition, the key for the stats temporary store is
          // "rootDir/[dynamic_partition_specs/]%"
          String partitionID = work.getAggKey() + Warehouse.makePartPath(partn.getSpec());
          String rows = statsAggregator.aggregateStats(partitionID, StatsSetupConst.ROW_COUNT);
          if (rows != null) {
            newPartStats.setNumRows(Long.parseLong(rows));
          } else {
            if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC)) {
              throw new HiveException("StatsAggregator failed to get numRows.");
            }
          }

          fileSys = partn.getPartitionPath().getFileSystem(conf);
          fileStatus = Utilities.getFileStatusRecurse(partn.getPartitionPath(), 1, fileSys);
          newPartStats.setNumFiles(fileStatus.length);

          long partitionSize = 0L;
          for (int i = 0; i < fileStatus.length; i++) {
            partitionSize += fileStatus[i].getLen();
          }
          newPartStats.setSize(partitionSize);

          //
          // get the old partition stats
          //
          org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
          Map<String, String> parameters = tPart.getParameters();

          boolean hasStats =
              parameters.containsKey(StatsSetupConst.NUM_FILES) ||
              parameters.containsKey(StatsSetupConst.ROW_COUNT) ||
              parameters.containsKey(StatsSetupConst.TOTAL_SIZE);

          int nf = parameters.containsKey(StatsSetupConst.NUM_FILES) ?
              Integer.parseInt(parameters.get(StatsSetupConst.NUM_FILES)) : 0;
          long nr = parameters.containsKey(StatsSetupConst.ROW_COUNT) ?
              Long.parseLong(parameters.get(StatsSetupConst.ROW_COUNT)) : 0L;
          long sz = parameters.containsKey(StatsSetupConst.TOTAL_SIZE) ?
              Long.parseLong(parameters.get(StatsSetupConst.TOTAL_SIZE)) : 0L;

          if (hasStats) {
            PartitionStatistics oldPartStats = new PartitionStatistics(nf, nr, sz);
            tblStats.updateStats(oldPartStats, newPartStats);
          } else {
            tblStats.addPartitionStats(newPartStats);
          }

          //
          // update the metastore
          //
          parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(newPartStats.getNumRows()));
          parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(newPartStats.getNumFiles()));
          parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(newPartStats.getSize()));
          tPart.setParameters(parameters);
          String tableFullName = table.getDbName() + "." + table.getTableName();
          db.alterPartition(tableFullName, new Partition(table, tPart));

          console.printInfo("Partition " + tableFullName + partn.getSpec() +
              " stats: [" + newPartStats.toString() + ']');
        }
      }

      //
      // write table stats to metastore
      //
      org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
      Map<String, String> parameters = tTable.getParameters();
      parameters.put(StatsSetupConst.ROW_COUNT, Long.toString(tblStats.getNumRows()));
      parameters.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(tblStats.getNumPartitions()));
      parameters.put(StatsSetupConst.NUM_FILES, Integer.toString(tblStats.getNumFiles()));
      parameters.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tblStats.getSize()));
      tTable.setParameters(parameters);
      String tableFullName = table.getDbName() + "." + table.getTableName();
      db.alterTable(tableFullName, new Table(tTable));

      console.printInfo("Table " + tableFullName + " stats: [" + tblStats.toString() + ']');
    } catch (Exception e) {
      // return 0 since StatsTask should not fail the whole job
      console.printInfo("[Warning] could not update stats.",
          "Failed with exception " + e.getMessage() + "\n"
              + StringUtils.stringifyException(e));
    } finally {
      statsAggregator.closeConnection();
    }

    // StatsTask always returns 0 so that the whole job won't fail
    return 0;
  }
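
  // Illustrative note on aggregation keys (example values are hypothetical): for a
  // non-partitioned table, the lookup key passed to the StatsAggregator is just
  // work.getAggKey() (the "rootDir" mentioned above), e.g. something like
  // "hdfs://namenode:8020/tmp/hive-user/stagingdir/"; for each partition, the key is
  // that rootDir plus the partition path built by Warehouse.makePartPath(partn.getSpec()),
  // e.g. ".../stagingdir/ds=2010-01-01/hr=12/".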

  /**
   * Get the list of partitions whose statistics need to be updated.
   * TODO: we should reuse the Partitions generated at compile time
   * since getting the list of partitions is quite expensive.
   *
   * @return a list of partitions whose statistics need to be updated.
   * @throws HiveException
   */
  private List<Partition> getPartitionsList() throws HiveException {
    List<Partition> list = new ArrayList<Partition>();

    if (work.getTableSpecs() != null) {
      // ANALYZE command
      tableSpec tblSpec = work.getTableSpecs();
      table = tblSpec.tableHandle;
      if (!table.isPartitioned()) {
        return null;
      }
      // get all partitions that match the partition spec
      List<Partition> partitions = tblSpec.partitions;
      if (partitions != null) {
        for (Partition partn : partitions) {
          list.add(partn);
        }
      }
    } else if (work.getLoadTableDesc() != null) {
      // INSERT OVERWRITE command
      LoadTableDesc tbd = work.getLoadTableDesc();
      table = db.getTable(tbd.getTable().getTableName());
      if (!table.isPartitioned()) {
        return null;
      }
      DynamicPartitionCtx dpCtx = tbd.getDPCtx();
      if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
        // load the list of DP partitions and return the list of partition specs
        for (LinkedHashMap<String, String> partSpec : dpPartSpecs) {
          Partition partn = db.getPartition(table, partSpec, false);
          list.add(partn);
        }
      } else { // static partition
        Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
        list.add(partn);
      }
    }
    return list;
  }

  /**
   * This method is static as it is called from the shutdown hook at the ExecDriver.
   */
  public static void cleanUp(String jobID, Configuration config) {
    StatsAggregator statsAggregator;

    String statsImplementationClass = HiveConf.getVar(config, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory.setImplementation(statsImplementationClass, config);
    statsAggregator = StatsFactory.getStatsAggregator();
    if (statsAggregator.connect(config)) {
      // add the path separator so that one job ID cannot be a prefix of another job ID
      statsAggregator.cleanUp(jobID + Path.SEPARATOR);
    }
  }
}
  393. }