PageRenderTime 47ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/tags/release-0.2.0-rc0/hive/external/ql/src/test/org/apache/hadoop/hive/ql/io/PerformTestRCFileAndSeqFile.java

#
Java | 394 lines | 295 code | 61 blank | 38 comment | 46 complexity | 0df830a1c117dc61563b9e86262d5a34 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.io;
  19. import java.io.IOException;
  20. import java.util.Random;
  21. import junit.framework.TestCase;
  22. import org.apache.hadoop.conf.Configuration;
  23. import org.apache.hadoop.fs.FileSystem;
  24. import org.apache.hadoop.fs.Path;
  25. import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
  26. import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
  27. import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
  28. import org.apache.hadoop.hive.serde2.io.ByteWritable;
  29. import org.apache.hadoop.io.LongWritable;
  30. import org.apache.hadoop.io.SequenceFile;
  31. import org.apache.hadoop.io.SequenceFile.CompressionType;
  32. import org.apache.hadoop.io.compress.CompressionCodec;
  33. import org.apache.hadoop.io.compress.DefaultCodec;
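/**
 * PerformTestRCFileAndSeqFile: a command-line driver (see {@code main}) that
 * writes the same pseudo-random rows to an RCFile and to a block-compressed
 * SequenceFile, then prints how long each file takes to write and to read
 * back (first column only, first and last columns, all columns). Because the
 * random generators use fixed seeds, the rows can be regenerated while
 * reading to verify the data that was written.
 */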
  38. public class PerformTestRCFileAndSeqFile extends TestCase {
  39. private final Configuration conf = new Configuration();
  40. private Path testRCFile;
  41. private Path testSeqFile;
  42. private FileSystem fs;
  43. int columnMaxSize = 30;
  44. Random randomCharGenerator = new Random(3);
  45. Random randColLenGenerator = new Random(20);
  46. public PerformTestRCFileAndSeqFile(boolean local, String file)
  47. throws IOException {
  48. if (local) {
  49. fs = FileSystem.getLocal(conf);
  50. } else {
  51. fs = FileSystem.get(conf);
  52. }
  53. conf.setInt(RCFile.Writer.COLUMNS_BUFFER_SIZE_CONF_STR, 1 * 1024 * 1024);
  54. if (file == null) {
  55. Path dir = new Path(System.getProperty("test.data.dir", ".") + "/mapred");
  56. testRCFile = new Path(dir, "test_rcfile");
  57. testSeqFile = new Path(dir, "test_seqfile");
  58. } else {
  59. testRCFile = new Path(file + "-rcfile");
  60. testSeqFile = new Path(file + "-seqfile");
  61. }
  62. fs.delete(testRCFile, true);
  63. fs.delete(testSeqFile, true);
  64. System.out.println("RCFile:" + testRCFile.toString());
  65. System.out.println("SequenceFile:" + testSeqFile.toString());
  66. }
  67. private void writeSeqenceFileTest(FileSystem fs, int rowCount, Path file,
  68. int columnNum, CompressionCodec codec) throws IOException {
  69. byte[][] columnRandom;
  70. resetRandomGenerators();
  71. BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
  72. columnRandom = new byte[columnNum][];
  73. for (int i = 0; i < columnNum; i++) {
  74. BytesRefWritable cu = new BytesRefWritable();
  75. bytes.set(i, cu);
  76. }
  77. // zero length key is not allowed by block compress writer, so we use a byte
  78. // writable
  79. ByteWritable key = new ByteWritable();
  80. SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, file,
  81. ByteWritable.class, BytesRefArrayWritable.class, CompressionType.BLOCK,
  82. codec);
  83. for (int i = 0; i < rowCount; i++) {
  84. nextRandomRow(columnRandom, bytes);
  85. seqWriter.append(key, bytes);
  86. }
  87. seqWriter.close();
  88. }
  89. private void resetRandomGenerators() {
  90. randomCharGenerator = new Random(3);
  91. randColLenGenerator = new Random(20);
  92. }
  93. private void writeRCFileTest(FileSystem fs, int rowCount, Path file,
  94. int columnNum, CompressionCodec codec) throws IOException {
  95. fs.delete(file, true);
  96. resetRandomGenerators();
  97. RCFileOutputFormat.setColumnNumber(conf, columnNum);
  98. RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, codec);
  99. byte[][] columnRandom;
  100. BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
  101. columnRandom = new byte[columnNum][];
  102. for (int i = 0; i < columnNum; i++) {
  103. BytesRefWritable cu = new BytesRefWritable();
  104. bytes.set(i, cu);
  105. }
  106. for (int i = 0; i < rowCount; i++) {
  107. nextRandomRow(columnRandom, bytes);
  108. writer.append(bytes);
  109. }
  110. writer.close();
  111. }
  112. private void nextRandomRow(byte[][] row, BytesRefArrayWritable bytes) {
  113. bytes.resetValid(row.length);
  114. for (int i = 0; i < row.length; i++) {
  115. int len = Math.abs(randColLenGenerator.nextInt(columnMaxSize));
  116. row[i] = new byte[len];
  117. for (int j = 0; j < len; j++) {
  118. row[i][j] = getRandomChar(randomCharGenerator);
  119. }
  120. bytes.get(i).set(row[i], 0, len);
  121. }
  122. }
  123. private static int CHAR_END = 122 - 7;
  124. private byte getRandomChar(Random random) {
  125. byte b = 0;
  126. do {
  127. b = (byte) random.nextInt(CHAR_END);
  128. } while ((b < 65));
  129. if (b > 90) {
  130. b += 7;
  131. }
  132. return b;
  133. }
  134. public static void main(String[] args) throws Exception {
  135. int count = 1000;
  136. String file = null;
  137. try {
  138. for (int i = 0; i < args.length; ++i) { // parse command line
  139. if (args[i] == null) {
  140. continue;
  141. } else if (args[i].equals("-count")) {
  142. count = Integer.parseInt(args[++i]);
  143. } else {
  144. // file is required parameter
  145. file = args[i];
  146. }
  147. }
  148. // change it to choose the appropriate file system
  149. boolean isLocalFS = true;
  150. PerformTestRCFileAndSeqFile testcase = new PerformTestRCFileAndSeqFile(
  151. isLocalFS, file);
  152. // change these parameters
  153. boolean checkCorrect = true;
  154. CompressionCodec codec = new DefaultCodec();
  155. testcase.columnMaxSize = 30;
  156. // testcase.testWithColumnNumber(count, 2, checkCorrect, codec);
  157. // testcase.testWithColumnNumber(count, 10, checkCorrect, codec);
  158. // testcase.testWithColumnNumber(count, 25, checkCorrect, codec);
  159. testcase.testWithColumnNumber(count, 40, checkCorrect, codec);
  160. // testcase.testWithColumnNumber(count, 50, checkCorrect, codec);
  161. // testcase.testWithColumnNumber(count, 80, checkCorrect, codec);
  162. } finally {
  163. }
  164. }
  165. private void testWithColumnNumber(int rowCount, int columnNum,
  166. boolean checkCorrect, CompressionCodec codec) throws IOException {
  167. // rcfile
  168. // rcfile write
  169. long start = System.currentTimeMillis();
  170. writeRCFileTest(fs, rowCount, testRCFile, columnNum, codec);
  171. long cost = System.currentTimeMillis() - start;
  172. long fileLen = fs.getFileStatus(testRCFile).getLen();
  173. System.out.println("Write RCFile with " + columnNum
  174. + " random string columns and " + rowCount + " rows cost " + cost
  175. + " milliseconds. And the file's on disk size is " + fileLen);
  176. // sequence file write
  177. start = System.currentTimeMillis();
  178. writeSeqenceFileTest(fs, rowCount, testSeqFile, columnNum, codec);
  179. cost = System.currentTimeMillis() - start;
  180. fileLen = fs.getFileStatus(testSeqFile).getLen();
  181. System.out.println("Write SequenceFile with " + columnNum
  182. + " random string columns and " + rowCount + " rows cost " + cost
  183. + " milliseconds. And the file's on disk size is " + fileLen);
  184. // rcfile read
  185. start = System.currentTimeMillis();
  186. int readRows = performRCFileReadFirstColumnTest(fs, testRCFile, columnNum,
  187. checkCorrect);
  188. cost = System.currentTimeMillis() - start;
  189. System.out.println("Read only one column of a RCFile with " + columnNum
  190. + " random string columns and " + rowCount + " rows cost " + cost
  191. + " milliseconds.");
  192. if (rowCount != readRows) {
  193. throw new IllegalStateException("Compare read and write row count error.");
  194. }
  195. assertEquals("", rowCount, readRows);
  196. if (isLocalFileSystem() && !checkCorrect) {
  197. // make some noisy to avoid disk caches data.
  198. performSequenceFileRead(fs, rowCount, testSeqFile);
  199. }
  200. start = System.currentTimeMillis();
  201. readRows = performRCFileReadFirstAndLastColumnTest(fs, testRCFile,
  202. columnNum, checkCorrect);
  203. cost = System.currentTimeMillis() - start;
  204. System.out.println("Read only first and last columns of a RCFile with "
  205. + columnNum + " random string columns and " + rowCount + " rows cost "
  206. + cost + " milliseconds.");
  207. if (rowCount != readRows) {
  208. throw new IllegalStateException("Compare read and write row count error.");
  209. }
  210. assertEquals("", rowCount, readRows);
  211. if (isLocalFileSystem() && !checkCorrect) {
  212. // make some noisy to avoid disk caches data.
  213. performSequenceFileRead(fs, rowCount, testSeqFile);
  214. }
  215. start = System.currentTimeMillis();
  216. performRCFileFullyReadColumnTest(fs, testRCFile, columnNum, checkCorrect);
  217. cost = System.currentTimeMillis() - start;
  218. System.out.println("Read all columns of a RCFile with " + columnNum
  219. + " random string columns and " + rowCount + " rows cost " + cost
  220. + " milliseconds.");
  221. if (rowCount != readRows) {
  222. throw new IllegalStateException("Compare read and write row count error.");
  223. }
  224. assertEquals("", rowCount, readRows);
  225. // sequence file read
  226. start = System.currentTimeMillis();
  227. performSequenceFileRead(fs, rowCount, testSeqFile);
  228. cost = System.currentTimeMillis() - start;
  229. System.out.println("Read SequenceFile with " + columnNum
  230. + " random string columns and " + rowCount + " rows cost " + cost
  231. + " milliseconds.");
  232. }
  233. public boolean isLocalFileSystem() {
  234. return fs.getUri().toString().startsWith("file://");
  235. }
  236. public void performSequenceFileRead(FileSystem fs, int count, Path file) throws IOException {
  237. SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
  238. ByteWritable key = new ByteWritable();
  239. BytesRefArrayWritable val = new BytesRefArrayWritable();
  240. for (int i = 0; i < count; i++) {
  241. reader.next(key, val);
  242. }
  243. }
  244. public int performRCFileReadFirstColumnTest(FileSystem fs, Path file,
  245. int allColumnsNumber, boolean chechCorrect) throws IOException {
  246. byte[][] checkBytes = null;
  247. BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  248. if (chechCorrect) {
  249. resetRandomGenerators();
  250. checkBytes = new byte[allColumnsNumber][];
  251. }
  252. int actualReadCount = 0;
  253. java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
  254. readCols.add(Integer.valueOf(0));
  255. ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
  256. RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  257. LongWritable rowID = new LongWritable();
  258. BytesRefArrayWritable cols = new BytesRefArrayWritable();
  259. while (reader.next(rowID)) {
  260. reader.getCurrentRow(cols);
  261. boolean ok = true;
  262. if (chechCorrect) {
  263. nextRandomRow(checkBytes, checkRow);
  264. ok = ok && (checkRow.get(0).equals(cols.get(0)));
  265. }
  266. if (!ok) {
  267. throw new IllegalStateException("Compare read and write error.");
  268. }
  269. actualReadCount++;
  270. }
  271. return actualReadCount;
  272. }
  273. public int performRCFileReadFirstAndLastColumnTest(FileSystem fs, Path file,
  274. int allColumnsNumber, boolean chechCorrect) throws IOException {
  275. byte[][] checkBytes = null;
  276. BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  277. if (chechCorrect) {
  278. resetRandomGenerators();
  279. checkBytes = new byte[allColumnsNumber][];
  280. }
  281. int actualReadCount = 0;
  282. java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
  283. readCols.add(Integer.valueOf(0));
  284. readCols.add(Integer.valueOf(allColumnsNumber - 1));
  285. ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
  286. RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  287. LongWritable rowID = new LongWritable();
  288. BytesRefArrayWritable cols = new BytesRefArrayWritable();
  289. while (reader.next(rowID)) {
  290. reader.getCurrentRow(cols);
  291. boolean ok = true;
  292. if (chechCorrect) {
  293. nextRandomRow(checkBytes, checkRow);
  294. ok = ok && (checkRow.get(0).equals(cols.get(0)));
  295. ok = ok
  296. && checkRow.get(allColumnsNumber - 1).equals(
  297. cols.get(allColumnsNumber - 1));
  298. }
  299. if (!ok) {
  300. throw new IllegalStateException("Compare read and write error.");
  301. }
  302. actualReadCount++;
  303. }
  304. return actualReadCount;
  305. }
  306. public int performRCFileFullyReadColumnTest(FileSystem fs, Path file,
  307. int allColumnsNumber, boolean chechCorrect) throws IOException {
  308. byte[][] checkBytes = null;
  309. BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  310. if (chechCorrect) {
  311. resetRandomGenerators();
  312. checkBytes = new byte[allColumnsNumber][];
  313. }
  314. int actualReadCount = 0;
  315. ColumnProjectionUtils.setFullyReadColumns(conf);
  316. RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  317. LongWritable rowID = new LongWritable();
  318. BytesRefArrayWritable cols = new BytesRefArrayWritable();
  319. while (reader.next(rowID)) {
  320. reader.getCurrentRow(cols);
  321. boolean ok = true;
  322. if (chechCorrect) {
  323. nextRandomRow(checkBytes, checkRow);
  324. ok = ok && checkRow.equals(cols);
  325. }
  326. if (!ok) {
  327. throw new IllegalStateException("Compare read and write error.");
  328. }
  329. actualReadCount++;
  330. }
  331. return actualReadCount;
  332. }
  333. }