
/src/test/mapred/org/apache/hadoop/mapred/TestJobCounters.java

https://github.com/RS1999ent/hadoop-mapreduce

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.Formatter;
import java.util.StringTokenizer;

import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.junit.Assert.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

/**
 * This is a wordcount application that verifies the number of records
 * spilled to disk. It generates simple text input files, then runs the
 * wordcount map/reduce application on (1) 3 input files (with 3 maps and
 * 1 reduce) and verifies the counters, (2) 4 input files (with 4 maps and
 * 1 reduce) and verifies the counters, and (3) 5 input files (with 5 maps
 * and 1 reduce) and verifies the counters. The wordcount application reads
 * the text input files, breaks each line into words and counts them. The
 * output is a locally sorted list of words and the count of how often they
 * occurred.
 */
public class TestJobCounters {

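  /**
   * Asserts that the SPILLED_RECORDS, MAP_INPUT_RECORDS and
   * MAP_OUTPUT_RECORDS counters match the expected values.
   */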
  private void validateCounters(Counters counter, long spillRecCnt,
      long mapInputRecords, long mapOutputRecords) {
    // Check if the number of Spilled Records is same as expected
    assertEquals(spillRecCnt,
        counter.findCounter(TaskCounter.SPILLED_RECORDS).getCounter());
    assertEquals(mapInputRecords,
        counter.findCounter(TaskCounter.MAP_INPUT_RECORDS).getCounter());
    assertEquals(mapOutputRecords,
        counter.findCounter(TaskCounter.MAP_OUTPUT_RECORDS).getCounter());
  }
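
  /**
   * Removes the given input file if it exists, so tests can vary the number
   * of input files (and hence maps) between runs.
   */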
  private void removeWordsFile(Path inpFile, Configuration conf)
      throws IOException {
    final FileSystem fs = inpFile.getFileSystem(conf);
    if (fs.exists(inpFile) && !fs.delete(inpFile, false)) {
      throw new IOException("Failed to delete " + inpFile);
    }
  }
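
  /**
   * Creates an input file of numbered words (4096 unique words, repeated
   * 5 times) unless it already exists.
   */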
  private static void createWordsFile(Path inpFile, Configuration conf)
      throws IOException {
    final FileSystem fs = inpFile.getFileSystem(conf);
    if (fs.exists(inpFile)) {
      return;
    }
    FSDataOutputStream out = fs.create(inpFile);
    try {
      // 1024 lines * 4 unique words/line, repeated 5 times => 20480 words per file
      int REPLICAS = 5, NUMLINES = 1024, NUMWORDSPERLINE = 4;
      final String WORD = "zymurgy"; // 7 bytes + 4 id bytes
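      // Each generated line therefore looks like
      // "zymurgy0001 zymurgy0002 zymurgy0003 zymurgy0004 "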
      final Formatter fmt = new Formatter(new StringBuilder());
      for (int i = 0; i < REPLICAS; i++) {
        for (int j = 1; j <= NUMLINES * NUMWORDSPERLINE; j += NUMWORDSPERLINE) {
          ((StringBuilder) fmt.out()).setLength(0);
          for (int k = 0; k < NUMWORDSPERLINE; ++k) {
            fmt.format("%s%04d ", WORD, j + k);
          }
          ((StringBuilder) fmt.out()).append("\n");
          out.writeBytes(fmt.toString());
        }
      }
    } finally {
      out.close();
    }
  }

  private static Path IN_DIR = null;
  private static Path OUT_DIR = null;
  private static Path testdir = null;

  @BeforeClass
  public static void initPaths() throws IOException {
    final Configuration conf = new Configuration();
    final Path TEST_ROOT_DIR =
        new Path(System.getProperty("test.build.data", "/tmp"));
    testdir = new Path(TEST_ROOT_DIR, "spilledRecords.countertest");
    IN_DIR = new Path(testdir, "in");
    OUT_DIR = new Path(testdir, "out");

    FileSystem fs = FileSystem.getLocal(conf);
    testdir = new Path(TEST_ROOT_DIR, "spilledRecords.countertest");
    if (fs.exists(testdir) && !fs.delete(testdir, true)) {
      throw new IOException("Could not delete " + testdir);
    }
    if (!fs.mkdirs(IN_DIR)) {
      throw new IOException("Mkdirs failed to create " + IN_DIR);
    }

    // create 3 input files, each with 5 replicas of 4096 words
    createWordsFile(new Path(IN_DIR, "input5_2k_1"), conf);
    createWordsFile(new Path(IN_DIR, "input5_2k_2"), conf);
    createWordsFile(new Path(IN_DIR, "input5_2k_3"), conf);
  }

  @AfterClass
  public static void cleanup() throws IOException {
    // clean up the input and output files
    final Configuration conf = new Configuration();
    final FileSystem fs = testdir.getFileSystem(conf);
    if (fs.exists(testdir)) {
      fs.delete(testdir, true);
    }
  }
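
  /**
   * Builds an old-API (mapred) wordcount configuration with a deliberately
   * small sort buffer so that map outputs spill to disk.
   */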
  public static JobConf createConfiguration() throws IOException {
    JobConf baseConf = new JobConf(TestJobCounters.class);
    baseConf.setOutputKeyClass(Text.class);
    baseConf.setOutputValueClass(IntWritable.class);
    baseConf.setMapperClass(WordCount.MapClass.class);
    baseConf.setCombinerClass(WordCount.Reduce.class);
    baseConf.setReducerClass(WordCount.Reduce.class);
    baseConf.setNumReduceTasks(1);
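    // A 1 MB sort buffer with a 50% spill threshold, plus a minimum of 3
    // spills before the combiner runs during merges, keeps the expected
    // spill counts in the tests below predictable.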
    baseConf.setInt(JobContext.IO_SORT_MB, 1);
    baseConf.set(JobContext.MAP_SORT_SPILL_PERCENT, "0.50");
    baseConf.setInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
    return baseConf;
  }
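
  /**
   * Builds a new-API (mapreduce) wordcount job with the same sort and spill
   * settings as createConfiguration().
   */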
  public static Job createJob() throws IOException {
    final Configuration conf = new Configuration();
    final Job baseJob = Job.getInstance(new Cluster(conf), conf);
    baseJob.setOutputKeyClass(Text.class);
    baseJob.setOutputValueClass(IntWritable.class);
    baseJob.setMapperClass(NewMapTokenizer.class);
    baseJob.setCombinerClass(NewSummer.class);
    baseJob.setReducerClass(NewSummer.class);
    baseJob.setNumReduceTasks(1);
    baseJob.getConfiguration().setInt(JobContext.IO_SORT_MB, 1);
    baseJob.getConfiguration().set(JobContext.MAP_SORT_SPILL_PERCENT, "0.50");
    baseJob.getConfiguration().setInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
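    // A huge minimum split size keeps each input file in a single split,
    // giving one map task per file.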
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMinInputSplitSize(
        baseJob, Long.MAX_VALUE);
    return baseJob;
  }

  @Test
  public void testOldCounterA() throws Exception {
    JobConf conf = createConfiguration();
    conf.setNumMapTasks(3);
    conf.setInt(JobContext.IO_SORT_FACTOR, 2);
    removeWordsFile(new Path(IN_DIR, "input5_2k_4"), conf);
    removeWordsFile(new Path(IN_DIR, "input5_2k_5"), conf);
    FileInputFormat.setInputPaths(conf, IN_DIR);
    FileOutputFormat.setOutputPath(conf, new Path(OUT_DIR, "outputO0"));
    RunningJob myJob = JobClient.runJob(conf);
    Counters c1 = myJob.getCounters();
    // Each record requires 16 bytes of metadata, 16 bytes per serialized rec
    // (vint word len + word + IntWritable) = (1 + 11 + 4)
    // (2^20 buf * .5 spill pcnt) / 32 bytes/record = 2^14 recs per spill
    // Each file contains 5 replicas of 4096 words, so the first spill will
    // contain 4 (2^14 rec / 2^12 rec/replica) replicas, the second just one.
    // Each map spills twice, with the combiner emitting 4096 records per
    // spill. The merge adds an additional 8192 records, as there are too few
    // spills to combine (2 < 3).
    // Each map spills 2^14 records, so the three maps spill 49152 records
    // combined.
    // The reduce spill count is composed of the read from one segment and
    // the intermediate merge of the other two. The intermediate merge
    // adds 8192 records per segment read; again, there are too few spills to
    // combine, so all 16384 are written to disk (total 32768 spilled records
    // for the intermediate merge). The merge into the reduce includes only
    // the unmerged segment, size 8192. Total spilled records in the reduce
    // is 32768 from the merge + 8192 unmerged segment = 40960 records.
    // Total: map + reduce = 49152 + 40960 = 90112
    // 3 files * 5120 (5 * 1024) records/file = 15360 input records
    // 4 words/line => 61440 output records
    validateCounters(c1, 90112, 15360, 61440);
  }

  @Test
  public void testOldCounterB() throws Exception {
    JobConf conf = createConfiguration();
    createWordsFile(new Path(IN_DIR, "input5_2k_4"), conf);
    removeWordsFile(new Path(IN_DIR, "input5_2k_5"), conf);
    conf.setNumMapTasks(4);
    conf.setInt(JobContext.IO_SORT_FACTOR, 2);
    FileInputFormat.setInputPaths(conf, IN_DIR);
    FileOutputFormat.setOutputPath(conf, new Path(OUT_DIR, "outputO1"));
    RunningJob myJob = JobClient.runJob(conf);
    Counters c1 = myJob.getCounters();
    // As above, each map spills 2^14 records, so 4 maps spill 2^16 records.
    // In the reduce task, there are two intermediate merges before the reduce.
    // 1st merge: read + write = 8192 * 4
    // 2nd merge: read + write = 8192 * 4
    // final merge: 0
    // Total reduce: 65536
    // Total: map + reduce = 2^16 + 2^16 = 131072
    // 4 files * 5120 (5 * 1024) records/file = 20480 input records
    // 4 words/line => 81920 output records
    validateCounters(c1, 131072, 20480, 81920);
  }

  @Test
  public void testOldCounterC() throws Exception {
    JobConf conf = createConfiguration();
    createWordsFile(new Path(IN_DIR, "input5_2k_4"), conf);
    createWordsFile(new Path(IN_DIR, "input5_2k_5"), conf);
    conf.setNumMapTasks(4);
    conf.setInt(JobContext.IO_SORT_FACTOR, 3);
    FileInputFormat.setInputPaths(conf, IN_DIR);
    FileOutputFormat.setOutputPath(conf, new Path(OUT_DIR, "outputO2"));
    RunningJob myJob = JobClient.runJob(conf);
    Counters c1 = myJob.getCounters();
    // As above, each map spills 2^14 records, so 5 maps spill 81920 records.
    // 1st merge: read + write = 6 * 8192 = 49152
    // final merge: unmerged = 2 * 8192 = 16384
    // Total reduce: 49152 + 16384 = 65536
    // Total: map + reduce = 81920 + 65536 = 147456
    // 5 files * 5120 (5 * 1024) records/file = 25600 input records
    // 4 words/line => 102400 output records
    validateCounters(c1, 147456, 25600, 102400);
  }

  @Test
  public void testNewCounterA() throws Exception {
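    // Same inputs and sort/spill settings as testOldCounterA, so the same
    // counter values are expected.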
    final Job job = createJob();
    final Configuration conf = job.getConfiguration();
    conf.setInt(JobContext.IO_SORT_FACTOR, 2);
    removeWordsFile(new Path(IN_DIR, "input5_2k_4"), conf);
    removeWordsFile(new Path(IN_DIR, "input5_2k_5"), conf);
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(
        job, IN_DIR);
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
        job, new Path(OUT_DIR, "outputN0"));
    assertTrue(job.waitForCompletion(true));
    final Counters c1 = Counters.downgrade(job.getCounters());
    validateCounters(c1, 90112, 15360, 61440);
  }

  @Test
  public void testNewCounterB() throws Exception {
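    // Same inputs and sort/spill settings as testOldCounterB.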
    final Job job = createJob();
    final Configuration conf = job.getConfiguration();
    conf.setInt(JobContext.IO_SORT_FACTOR, 2);
    createWordsFile(new Path(IN_DIR, "input5_2k_4"), conf);
    removeWordsFile(new Path(IN_DIR, "input5_2k_5"), conf);
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(
        job, IN_DIR);
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
        job, new Path(OUT_DIR, "outputN1"));
    assertTrue(job.waitForCompletion(true));
    final Counters c1 = Counters.downgrade(job.getCounters());
    validateCounters(c1, 131072, 20480, 81920);
  }

  @Test
  public void testNewCounterC() throws Exception {
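    // Same inputs and sort/spill settings as testOldCounterC.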
    final Job job = createJob();
    final Configuration conf = job.getConfiguration();
    conf.setInt(JobContext.IO_SORT_FACTOR, 3);
    createWordsFile(new Path(IN_DIR, "input5_2k_4"), conf);
    createWordsFile(new Path(IN_DIR, "input5_2k_5"), conf);
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(
        job, IN_DIR);
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
        job, new Path(OUT_DIR, "outputN2"));
    assertTrue(job.waitForCompletion(true));
    final Counters c1 = Counters.downgrade(job.getCounters());
    validateCounters(c1, 147456, 25600, 102400);
  }
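
  /**
   * New-API tokenizer mapper: splits each line into words and emits a
   * (word, 1) pair for each word, mirroring WordCount.MapClass.
   */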
  public static class NewMapTokenizer
      extends org.apache.hadoop.mapreduce.Mapper<Object,Text,Text,IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }
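
  /**
   * New-API summing reducer, also used as the combiner: adds up the counts
   * for each word.
   */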
  public static class NewSummer
      extends org.apache.hadoop.mapreduce.Reducer<Text,IntWritable,
                                                  Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }
}