/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/PTFRowContainer.java

http://github.com/apache/hive
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec.persistence;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PTFDeserializer;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.hadoop.util.Progressable;
/**
 * Extends the RowContainer functionality to provide random access <code>getAt(i)</code>.
 * It extends RowContainer behavior in the following ways:
 * <ol>
 * <li> You must continue to call <b>first</b> to signal the transition from writing to the
 * Container to reading from it.
 * <li> As rows are being added, the position at which a <i>spill</i> occurs is captured as a
 * BlockInfo object. At this point it captures the offset in the File at which the current
 * Block will be written.
 * <li> When first is called, we associate with each BlockInfo the File Split that it
 * occurs in.
 * <li> So in order to read a random row from the Container we do the following:
 * <ul>
 * <li> Convert the row index into a block number. This is easy because all blocks are
 * the same size, given by the <code>blockSize</code>.
 * <li> The corresponding BlockInfo tells us the Split that this block starts in. Also,
 * by looking at the next Block in the BlockInfos list, we know which Split this block ends in.
 * <li> So we arrange to read all the Splits that contain rows for this block. For the first
 * Split we seek to the startOffset that we captured in BlockInfo.
 * <li> After reading the Splits, all rows in this block are in the 'currentReadBlock'.
 * </ul>
 * <li> We track the span of the currentReadBlock, using
 * <code>currentReadBlockStartRow, blockSize</code>. So if a row is requested in this span,
 * we don't need to read rows from disk.
 * <li> If the requested row is in the 'last' block, we point the currentReadBlock to
 * the currentWriteBlock; the same as what RowContainer does.
 * <li> <code>getAt</code> leaves the Container in the same state as a
 * <code>next</code> call, so getAt and next calls can be interspersed.
 * </ol>
 */
public class PTFRowContainer<Row extends List<Object>> extends RowContainer<Row> {

  private ArrayList<BlockInfo> blockInfos;
  private int currentReadBlockStartRow;

  public PTFRowContainer(int bs, Configuration jc, Reporter reporter) throws HiveException {
    super(bs, jc, reporter);
    blockInfos = new ArrayList<PTFRowContainer.BlockInfo>();
  }

  @Override
  public void addRow(Row t) throws HiveException {
    if ( willSpill() ) {
      setupWriter();
      PTFRecordWriter rw = (PTFRecordWriter) getRecordWriter();
      BlockInfo blkInfo = new BlockInfo();
      try {
        blkInfo.startOffset = rw.outStream.getLength();
        blockInfos.add(blkInfo);
      } catch(IOException e) {
        clearRows();
        LOG.error(e.toString(), e);
        throw new HiveException(e);
      }
    }
    super.addRow(t);
  }

  @Override
  public Row first() throws HiveException {
    Row r = super.first();

    if ( blockInfos.size() > 0 ) {
      InputSplit[] inputSplits = getInputSplits();
      FileSplit fS = null;
      BlockInfo bI = blockInfos.get(0);
      bI.startingSplit = 0;
      int i = 1;
      bI = i < blockInfos.size() ? blockInfos.get(i) : null;
      for (int j = 1; j < inputSplits.length && bI != null; j++) {
        fS = (FileSplit) inputSplits[j];
        while (bI != null && bI.startOffset < fS.getStart() ) {
          bI.startingSplit = j - 1;
          i++;
          bI = i < blockInfos.size() ? blockInfos.get(i) : null;
        }
      }

      while ( i < blockInfos.size() ) {
        bI = blockInfos.get(i);
        bI.startingSplit = inputSplits.length - 1;
        i++;
      }
    }

    currentReadBlockStartRow = 0;
    return r;
  }

  @Override
  public Row next() throws HiveException {
    boolean endOfCurrBlock = endOfCurrentReadBlock();
    if ( endOfCurrBlock ) {
      currentReadBlockStartRow += getCurrentReadBlockSize();
    }
    return super.next();
  }

  @Override
  public void clearRows() throws HiveException {
    super.clearRows();
    resetReadBlocks();
    blockInfos = new ArrayList<PTFRowContainer.BlockInfo>();
  }

  @Override
  public void close() throws HiveException {
    super.close();
    blockInfos = null;
  }

  public Row getAt(int rowIdx) throws HiveException {
    int blockSize = getBlockSize();
    if ( rowIdx < currentReadBlockStartRow || rowIdx >= currentReadBlockStartRow + blockSize ) {
      readBlock(getBlockNum(rowIdx));
    }
    return getReadBlockRow(rowIdx - currentReadBlockStartRow);
  }

  private int numBlocks() {
    return blockInfos.size() + 1;
  }

  private int getBlockNum(int rowIdx) {
    int blockSize = getBlockSize();
    return rowIdx / blockSize;
  }
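
  // Illustrative note (not in the original source): block lookup is plain integer
  // division. With getBlockSize() == 100, getAt(250) maps to block 250 / 100 = 2;
  // readBlock(2) then loads rows 200-299 into currentReadBlock (or points it at the
  // in-memory write block when block 2 is the last block), and the requested row is
  // returned at offset 250 - currentReadBlockStartRow = 50.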
  private void readBlock(int blockNum) throws HiveException {
    currentReadBlockStartRow = getBlockSize() * blockNum;

    if ( blockNum == numBlocks() - 1 ) {
      setWriteBlockAsReadBlock();
      return;
    }

    resetCurrentReadBlockToFirstReadBlock();

    BlockInfo bI = blockInfos.get(blockNum);
    int startSplit = bI.startingSplit;
    int endSplit;
    if ( blockNum != blockInfos.size() - 1 ) {
      endSplit = blockInfos.get(blockNum + 1).startingSplit;
    } else {
      endSplit = getLastActualSplit();
    }

    try {
      int readIntoOffset = 0;
      for (int i = startSplit; i <= endSplit && readIntoOffset < getBlockSize(); i++ ) {
        org.apache.hadoop.mapred.RecordReader rr = setReaderAtSplit(i);
        if ( i == startSplit ) {
          ((PTFSequenceFileRecordReader)rr).seek(bI.startOffset);
        }
        nextBlock(readIntoOffset);
        readIntoOffset = getCurrentReadBlockSize();
      }
    } catch(Exception e) {
      clearRows();
      LOG.error(e.toString(), e);
      if ( e instanceof HiveException ) {
        throw (HiveException) e;
      }
      throw new HiveException(e);
    }
  }

  private static class BlockInfo {
    // position in file where the first row in this block starts
    long startOffset;
    // inputSplitNum that contains the first row in this block.
    int startingSplit;
  }

  public static TableDesc createTableDesc(StructObjectInspector oI) {
    Map<String,String> props = new HashMap<String,String>();
    PTFDeserializer.addOIPropertiestoSerDePropsMap(oI, props);
    String colNames = props.get(serdeConstants.LIST_COLUMNS);
    String colTypes = props.get(serdeConstants.LIST_COLUMN_TYPES);
    TableDesc tblDesc = new TableDesc(
        PTFSequenceFileInputFormat.class, PTFHiveSequenceFileOutputFormat.class,
        Utilities.makeProperties(
            serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode,
            serdeConstants.LIST_COLUMNS, colNames.toString(),
            serdeConstants.LIST_COLUMN_TYPES, colTypes.toString(),
            hive_metastoreConstants.TABLE_BUCKETING_VERSION, "-1",
            serdeConstants.SERIALIZATION_LIB, LazyBinarySerDe.class.getName()));
    return tblDesc;
  }

  private static class PTFRecordWriter implements RecordWriter {
    BytesWritable EMPTY_KEY = new BytesWritable();

    SequenceFile.Writer outStream;

    public PTFRecordWriter(SequenceFile.Writer outStream) {
      this.outStream = outStream;
    }

    @Override
    public void write(Writable r) throws IOException {
      outStream.append(EMPTY_KEY, r);
    }

    @Override
    public void close(boolean abort) throws IOException {
      outStream.close();
    }
  }

  public static class PTFHiveSequenceFileOutputFormat<K,V>
      extends HiveSequenceFileOutputFormat<K,V> {

    @Override
    public RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath,
        Class<? extends Writable> valueClass, boolean isCompressed,
        Properties tableProperties, Progressable progress) throws IOException {
      FileSystem fs = finalOutPath.getFileSystem(jc);
      final SequenceFile.Writer outStream = Utilities.createSequenceWriter(jc, fs, finalOutPath,
          BytesWritable.class, valueClass, isCompressed, progress);
      return new PTFRecordWriter(outStream);
    }
  }

  public static class PTFSequenceFileInputFormat<K, V> extends SequenceFileInputFormat<K, V> {

    public PTFSequenceFileInputFormat() {
      super();
    }

    @Override
    public org.apache.hadoop.mapred.RecordReader<K, V> getRecordReader(InputSplit split,
        JobConf job, Reporter reporter) throws IOException {
      reporter.setStatus(split.toString());
      return new PTFSequenceFileRecordReader<K, V>(job, (FileSplit) split);
    }
  }

  public static class PTFSequenceFileRecordReader<K,V> extends SequenceFileRecordReader<K, V> {

    public PTFSequenceFileRecordReader(Configuration conf, FileSplit split)
        throws IOException {
      super(conf, split);
    }

    @Override
    public void seek(long pos) throws IOException {
      super.seek(pos);
    }
  }
}
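
A minimal usage sketch, not part of the file above: it drives the container only through the public methods defined here -- addRow to append, first() to signal the write-to-read transition, then getAt(i) for random access interleaved with next(). The class name PTFRowContainerSketch and the row counts are illustrative. The example deliberately stays below one block so no spill to disk occurs, because the SerDe/TableDesc wiring that the RowContainer base class needs before writing SequenceFiles (set up in Hive by PTFPartition, using createTableDesc above) is omitted.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.persistence.PTFRowContainer;
import org.apache.hadoop.hive.ql.metadata.HiveException;

public class PTFRowContainerSketch {
  public static void main(String[] args) throws HiveException {
    Configuration conf = new Configuration();

    // Block size of 1000 rows; a null Reporter mirrors how PTFPartition constructs it.
    PTFRowContainer<List<Object>> rows =
        new PTFRowContainer<List<Object>>(1000, conf, null);

    // Stay below one block (no spill), since this sketch skips the SerDe/TableDesc
    // setup that the parent RowContainer requires before it can write to disk.
    for (int i = 0; i < 100; i++) {
      List<Object> row = new ArrayList<Object>();
      row.add(Integer.valueOf(i));
      rows.addRow(row);
    }

    rows.first();                        // switch from writing to reading
    List<Object> r = rows.getAt(42);     // random access by row index
    List<Object> nxt = rows.next();      // sequential reads can be interleaved with getAt
    System.out.println(r + " then " + nxt);

    rows.close();
  }
}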