
/tags/release-0.1-rc2/hive/external/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseTableInputFormat.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.WhileMatchFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormatBase;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Writables;

import org.apache.hadoop.hive.ql.exec.ExprNodeConstantEvaluator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/**
 * HiveHBaseTableInputFormat implements InputFormat for HBase storage handler
 * tables, decorating an underlying HBase TableInputFormat with extra Hive logic
 * such as column pruning and filter pushdown.
 */
public class HiveHBaseTableInputFormat extends TableInputFormatBase
    implements InputFormat<ImmutableBytesWritable, Result> {

  static final Log LOG = LogFactory.getLog(HiveHBaseTableInputFormat.class);

  @Override
  public RecordReader<ImmutableBytesWritable, Result> getRecordReader(
      InputSplit split,
      JobConf jobConf,
      final Reporter reporter) throws IOException {

    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getSplit();
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte []> hbaseColumnFamiliesBytes = new ArrayList<byte []>();
    List<byte []> hbaseColumnQualifiersBytes = new ArrayList<byte []>();

    int iKey;
    try {
      iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
          hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
      throw new IOException(se);
    }

    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

    if (hbaseColumnFamilies.size() < readColIDs.size()) {
      throw new IOException("Cannot read more columns than the given table contains.");
    }

    boolean addAll = (readColIDs.size() == 0);
    Scan scan = new Scan();
    boolean empty = true;

    if (!addAll) {
      for (int i : readColIDs) {
        if (i == iKey) {
          continue;
        }

        if (hbaseColumnQualifiers.get(i) == null) {
          scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
          scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }

        empty = false;
      }
    }

    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty, i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // table's column projection.
    if (empty) {
      for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (i == iKey) {
          continue;
        }

        if (hbaseColumnQualifiers.get(i) == null) {
          scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
          scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }

        if (!addAll) {
          break;
        }
      }
    }

    // If Hive's optimizer gave us a filter to process, convert it to the
    // HBase scan form now.
    tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);

    setScan(scan);

    Job job = new Job(jobConf);
    TaskAttemptContext tac =
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {

          @Override
          public void progress() {
            reporter.progress();
          }
        };

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result>
        recordReader = createRecordReader(tableSplit, tac);

    return new RecordReader<ImmutableBytesWritable, Result>() {

      @Override
      public void close() throws IOException {
        recordReader.close();
      }

      @Override
      public ImmutableBytesWritable createKey() {
        return new ImmutableBytesWritable();
      }

      @Override
      public Result createValue() {
        return new Result();
      }

      @Override
      public long getPos() throws IOException {
        return 0;
      }

      @Override
      public float getProgress() throws IOException {
        float progress = 0.0F;

        try {
          progress = recordReader.getProgress();
        } catch (InterruptedException e) {
          throw new IOException(e);
        }

        return progress;
      }

      @Override
      public boolean next(ImmutableBytesWritable rowKey, Result value) throws IOException {
        boolean next = false;

        try {
          next = recordReader.nextKeyValue();

          if (next) {
            rowKey.set(recordReader.getCurrentValue().getRow());
            Writables.copyWritable(recordReader.getCurrentValue(), value);
          }
        } catch (InterruptedException e) {
          throw new IOException(e);
        }

        return next;
      }
    };
  }
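
  // Illustrative example (hypothetical mapping, not part of the original file):
  // with hbase.columns.mapping = ":key,cf1:name,cf2:" the Hive columns map to
  //   col 0 -> the HBase row key (iKey == 0)
  //   col 1 -> qualifier cf1:name   -> scan.addColumn("cf1", "name")
  //   col 2 -> whole family cf2     -> scan.addFamily("cf2")
  // If a query selects only Hive columns 0 and 1, readColIDs is [0, 1]; column 0
  // is the key and is skipped, so only cf1:name ends up in the Scan. If a query
  // selects only the key column, the Scan would otherwise be empty, which is the
  // corner case handled by the "empty" block in getRecordReader above.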

  /**
   * Converts a filter (which has been pushed down from Hive's optimizer)
   * into corresponding restrictions on the HBase scan. The
   * filter should already be in a form which can be fully converted.
   *
   * @param jobConf configuration for the scan
   *
   * @param scan the HBase scan object to restrict
   *
   * @param tableSplit the HBase table split to restrict, or null
   * if calculating splits
   *
   * @param iKey 0-based offset of key column within Hive table
   *
   * @return converted table split if any
   */
  private TableSplit convertFilter(
      JobConf jobConf,
      Scan scan,
      TableSplit tableSplit,
      int iKey)
      throws IOException {

    String filterExprSerialized =
        jobConf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
    if (filterExprSerialized == null) {
      return tableSplit;
    }
    ExprNodeDesc filterExpr =
        Utilities.deserializeExpression(filterExprSerialized, jobConf);

    String columnNameProperty = jobConf.get(Constants.LIST_COLUMNS);
    List<String> columnNames =
        Arrays.asList(columnNameProperty.split(","));

    IndexPredicateAnalyzer analyzer =
        newIndexPredicateAnalyzer(columnNames.get(iKey));
    List<IndexSearchCondition> searchConditions =
        new ArrayList<IndexSearchCondition>();
    ExprNodeDesc residualPredicate =
        analyzer.analyzePredicate(filterExpr, searchConditions);

    // There should be no residual since we already negotiated
    // that earlier in HBaseStorageHandler.decomposePredicate.
    if (residualPredicate != null) {
      throw new RuntimeException(
          "Unexpected residual predicate " + residualPredicate.getExprString());
    }

    // There should be exactly one predicate since we already
    // negotiated that also.
    if (searchConditions.size() != 1) {
      throw new RuntimeException(
          "Exactly one search condition expected in push down");
    }

    // Convert the search condition into a restriction on the HBase scan
    IndexSearchCondition sc = searchConditions.get(0);
    ExprNodeConstantEvaluator eval =
        new ExprNodeConstantEvaluator(sc.getConstantDesc());
    byte [] startRow;
    try {
      ObjectInspector objInspector = eval.initialize(null);
      Object writable = eval.evaluate(null);
      ByteStream.Output serializeStream = new ByteStream.Output();
      LazyUtils.writePrimitiveUTF8(
          serializeStream,
          writable,
          (PrimitiveObjectInspector) objInspector,
          false,
          (byte) 0,
          null);
      startRow = new byte[serializeStream.getCount()];
      System.arraycopy(
          serializeStream.getData(), 0,
          startRow, 0, serializeStream.getCount());
    } catch (HiveException ex) {
      throw new IOException(ex);
    }

    // stopRow is exclusive, so pad it with a trailing 0 byte to
    // make it compare as the very next value after startRow
    byte [] stopRow = new byte[startRow.length + 1];
    System.arraycopy(startRow, 0, stopRow, 0, startRow.length);

    if (tableSplit != null) {
      tableSplit = new TableSplit(
          tableSplit.getTableName(),
          startRow,
          stopRow,
          tableSplit.getRegionLocation());
    }

    scan.setStartRow(startRow);
    scan.setStopRow(stopRow);

    // Add a WhileMatchFilter to make the scan terminate as soon
    // as we see a non-matching key. This is probably redundant
    // since the stopRow above should already take care of it for us.
    scan.setFilter(
        new WhileMatchFilter(
            new RowFilter(
                CompareFilter.CompareOp.EQUAL,
                new BinaryComparator(startRow))));

    return tableSplit;
  }
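
  // Worked example (illustrative, assuming a string-typed key and the
  // hypothetical pushed-down predicate key = "row5"): the serialized constant
  // becomes
  //   startRow = UTF-8 bytes of "row5"            -> { 0x72, 0x6f, 0x77, 0x35 }
  //   stopRow  = startRow plus a trailing 0 byte  -> { 0x72, 0x6f, 0x77, 0x35, 0x00 }
  // Since stopRow is exclusive and is the smallest byte string greater than
  // startRow, the scan range covers exactly the rows whose key equals "row5",
  // and the WhileMatchFilter(RowFilter(EQUAL, ...)) ends the scan at the first
  // non-matching key.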

  /**
   * Instantiates a new predicate analyzer suitable for
   * determining how to push a filter down into the HBase scan,
   * based on the rules for what kinds of pushdown we currently support.
   *
   * @param keyColumnName name of the Hive column mapped to the HBase row key
   *
   * @return preconfigured predicate analyzer
   */
  static IndexPredicateAnalyzer newIndexPredicateAnalyzer(
      String keyColumnName) {

    IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();

    // for now, we only support equality comparisons
    analyzer.addComparisonOp(
        "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual");

    // and only on the key column
    analyzer.clearAllowedColumnNames();
    analyzer.allowColumnName(keyColumnName);

    return analyzer;
  }
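
  // Illustrative sketch (assumed caller, not part of the original file): the
  // negotiation mentioned in convertFilter is expected to happen in
  // HBaseStorageHandler.decomposePredicate, which would run the same analyzer
  // so that only predicates this class can convert get pushed down, e.g.
  //
  //   IndexPredicateAnalyzer analyzer = newIndexPredicateAnalyzer(keyColumnName);
  //   List<IndexSearchCondition> conditions = new ArrayList<IndexSearchCondition>();
  //   ExprNodeDesc residual = analyzer.analyzePredicate(predicate, conditions);
  //   // "conditions" holds the key-equality term; "residual" stays in Hive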

  @Override
  public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {

    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    if (hbaseColumnsMapping == null) {
      throw new IOException("hbase.columns.mapping required for HBase Table.");
    }

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte []> hbaseColumnFamiliesBytes = new ArrayList<byte []>();
    List<byte []> hbaseColumnQualifiersBytes = new ArrayList<byte []>();

    int iKey;
    try {
      iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
          hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
      throw new IOException(se);
    }

    Scan scan = new Scan();

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    convertFilter(jobConf, scan, null, iKey);

    // REVIEW: are we supposed to be applying the same getReadColumnIDs
    // projection here as in getRecordReader?
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
      if (i == iKey) {
        continue;
      }

      if (hbaseColumnQualifiers.get(i) == null) {
        scan.addFamily(hbaseColumnFamiliesBytes.get(i));
      } else {
        scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
      }
    }

    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path [] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits =
        super.getSplits(jobContext);
    InputSplit [] results = new InputSplit[splits.size()];

    for (int i = 0; i < splits.size(); i++) {
      results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }

    return results;
  }
}
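
// Illustrative usage sketch (assumed wiring, not part of the original file):
// HBaseStorageHandler is expected to return this class as the table's input
// format, and Hive populates the JobConf with the properties read above before
// planning a scan, roughly along these lines (table name and mapping are
// hypothetical):
//
//   JobConf jobConf = new JobConf();
//   jobConf.set(HBaseSerDe.HBASE_TABLE_NAME, "my_hbase_table");
//   jobConf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,cf1:name");
//   // Hive also sets the table's storage location as the job's input path,
//   // which getSplits() reads back via FileInputFormat.getInputPaths().
//   InputFormat<ImmutableBytesWritable, Result> inputFormat =
//       new HiveHBaseTableInputFormat();
//   // Hive then calls getSplits()/getRecordReader() as with any InputFormat.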