PageRenderTime 99ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java

https://github.com/RS1999ent/hadoop-hdfs
Java | 278 lines | 147 code | 37 blank | 94 comment | 1 complexity | e96a12c57a23bfa7bb892c9f635f3027 MD5 | raw file
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hdfs.server.datanode;
  19. import java.io.File;
  20. import java.util.ArrayList;
  21. import org.apache.hadoop.conf.Configuration;
  22. import org.apache.hadoop.fs.FileSystem;
  23. import org.apache.hadoop.fs.Path;
  24. import org.apache.hadoop.hdfs.DFSTestUtil;
  25. import org.apache.hadoop.hdfs.MiniDFSCluster;
  26. import org.apache.hadoop.hdfs.server.namenode.DatanodeDescriptor;
  27. import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
  28. import org.apache.hadoop.hdfs.HdfsConfiguration;
  29. import org.apache.hadoop.hdfs.DFSConfigKeys;
  30. import static org.apache.hadoop.test.MetricsAsserts.*;
  31. import org.apache.commons.logging.Log;
  32. import org.apache.commons.logging.LogFactory;
  33. import org.apache.commons.logging.impl.Log4JLogger;
  34. import org.apache.log4j.Level;
  35. import org.junit.After;
  36. import org.junit.Before;
  37. import org.junit.Test;
  38. import static org.junit.Assert.*;
  39. import static org.junit.Assume.assumeTrue;
  40. /**
  41. * Test reporting of DN volume failure counts and metrics.
  42. */
  43. public class TestDataNodeVolumeFailureReporting {
  44. private static final Log LOG = LogFactory.getLog(TestDataNodeVolumeFailureReporting.class);
  45. {
  46. ((Log4JLogger)TestDataNodeVolumeFailureReporting.LOG).getLogger().setLevel(Level.ALL);
  47. }
  48. private FileSystem fs;
  49. private MiniDFSCluster cluster;
  50. private Configuration conf;
  51. private String dataDir;
  52. // Sleep at least 3 seconds (a 1s heartbeat plus padding) to allow
  53. // for heartbeats to propagate from the datanodes to the namenode.
  54. final int WAIT_FOR_HEARTBEATS = 3000;
  55. // Wait at least (2 * re-check + 10 * heartbeat) seconds for
  56. // a datanode to be considered dead by the namenode.
  57. final int WAIT_FOR_DEATH = 15000;
  58. @Before
  59. public void setUp() throws Exception {
  60. conf = new HdfsConfiguration();
  61. conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 512L);
  62. /*
  63. * Lower the DN heartbeat, DF rate, and recheck interval to one second
  64. * so state about failures and datanode death propagates faster.
  65. */
  66. conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
  67. conf.setInt(DFSConfigKeys.DFS_DF_INTERVAL_KEY, 1000);
  68. conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 1000);
  69. // Allow a single volume failure (there are two volumes)
  70. conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
  71. cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
  72. cluster.waitActive();
  73. fs = cluster.getFileSystem();
  74. dataDir = cluster.getDataDirectory();
  75. }
  76. @After
  77. public void tearDown() throws Exception {
  78. for (int i = 0; i < 3; i++) {
  79. new File(dataDir, "data"+(2*i+1)).setExecutable(true);
  80. new File(dataDir, "data"+(2*i+2)).setExecutable(true);
  81. }
  82. cluster.shutdown();
  83. }
  84. /**
  85. * Test that individual volume failures do not cause DNs to fail, that
  86. * all volumes failed on a single datanode do cause it to fail, and
  87. * that the capacities and liveliness is adjusted correctly in the NN.
  88. */
  89. @Test
  90. public void testSuccessiveVolumeFailures() throws Exception {
  91. assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
  92. // Bring up two more datanodes
  93. cluster.startDataNodes(conf, 2, true, null, null);
  94. cluster.waitActive();
  95. /*
  96. * Calculate the total capacity of all the datanodes. Sleep for
  97. * three seconds to be sure the datanodes have had a chance to
  98. * heartbeat their capacities.
  99. */
  100. Thread.sleep(WAIT_FOR_HEARTBEATS);
  101. FSNamesystem ns = cluster.getNamesystem();
  102. long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(ns);
  103. long dnCapacity = DFSTestUtil.getDatanodeCapacity(ns, 0);
  104. File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
  105. File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
  106. File dn3Vol1 = new File(dataDir, "data"+(2*2+1));
  107. File dn3Vol2 = new File(dataDir, "data"+(2*2+2));
  108. /*
  109. * Make the 1st volume directories on the first two datanodes
  110. * non-accessible. We don't make all three 1st volume directories
  111. * readonly since that would cause the entire pipeline to
  112. * fail. The client does not retry failed nodes even though
  113. * perhaps they could succeed because just a single volume failed.
  114. */
  115. assertTrue("Couldn't chmod local vol", dn1Vol1.setExecutable(false));
  116. assertTrue("Couldn't chmod local vol", dn2Vol1.setExecutable(false));
  117. /*
  118. * Create file1 and wait for 3 replicas (ie all DNs can still
  119. * store a block). Then assert that all DNs are up, despite the
  120. * volume failures.
  121. */
  122. Path file1 = new Path("/test1");
  123. DFSTestUtil.createFile(fs, file1, 1024, (short)3, 1L);
  124. DFSTestUtil.waitReplication(fs, file1, (short)3);
  125. ArrayList<DataNode> dns = cluster.getDataNodes();
  126. assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
  127. assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
  128. assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());
  129. /*
  130. * The metrics should confirm the volume failures.
  131. */
  132. assertCounter("VolumeFailures", 1L,
  133. getMetrics(dns.get(0).getMetrics().name()));
  134. assertCounter("VolumeFailures", 1L,
  135. getMetrics(dns.get(1).getMetrics().name()));
  136. assertCounter("VolumeFailures", 0L,
  137. getMetrics(dns.get(2).getMetrics().name()));
  138. // Ensure we wait a sufficient amount of time
  139. assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
  140. // Eventually the NN should report two volume failures
  141. DFSTestUtil.waitForDatanodeStatus(ns, 3, 0, 2,
  142. origCapacity - (1*dnCapacity), WAIT_FOR_HEARTBEATS);
  143. /*
  144. * Now fail a volume on the third datanode. We should be able to get
  145. * three replicas since we've already identified the other failures.
  146. */
  147. assertTrue("Couldn't chmod local vol", dn3Vol1.setExecutable(false));
  148. Path file2 = new Path("/test2");
  149. DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
  150. DFSTestUtil.waitReplication(fs, file2, (short)3);
  151. assertTrue("DN3 should still be up", dns.get(2).isDatanodeUp());
  152. assertCounter("VolumeFailures", 1L,
  153. getMetrics(dns.get(2).getMetrics().name()));
  154. ArrayList<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
  155. ArrayList<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
  156. ns.DFSNodesStatus(live, dead);
  157. live.clear();
  158. dead.clear();
  159. ns.DFSNodesStatus(live, dead);
  160. assertEquals("DN3 should have 1 failed volume",
  161. 1, live.get(2).getVolumeFailures());
  162. /*
  163. * Once the datanodes have a chance to heartbeat their new capacity the
  164. * total capacity should be down by three volumes (assuming the host
  165. * did not grow or shrink the data volume while the test was running).
  166. */
  167. dnCapacity = DFSTestUtil.getDatanodeCapacity(ns, 0);
  168. DFSTestUtil.waitForDatanodeStatus(ns, 3, 0, 3,
  169. origCapacity - (3*dnCapacity), WAIT_FOR_HEARTBEATS);
  170. /*
  171. * Now fail the 2nd volume on the 3rd datanode. All its volumes
  172. * are now failed and so it should report two volume failures
  173. * and that it's no longer up. Only wait for two replicas since
  174. * we'll never get a third.
  175. */
  176. assertTrue("Couldn't chmod local vol", dn3Vol2.setExecutable(false));
  177. Path file3 = new Path("/test3");
  178. DFSTestUtil.createFile(fs, file3, 1024, (short)3, 1L);
  179. DFSTestUtil.waitReplication(fs, file3, (short)2);
  180. // The DN should consider itself dead
  181. DFSTestUtil.waitForDatanodeDeath(dns.get(2));
  182. // And report two failed volumes
  183. assertCounter("VolumeFailures", 2L,
  184. getMetrics(dns.get(2).getMetrics().name()));
  185. // The NN considers the DN dead
  186. DFSTestUtil.waitForDatanodeStatus(ns, 2, 1, 2,
  187. origCapacity - (4*dnCapacity), WAIT_FOR_HEARTBEATS);
  188. /*
  189. * The datanode never tries to restore the failed volume, even if
  190. * it's subsequently repaired, but it should see this volume on
  191. * restart, so file creation should be able to succeed after
  192. * restoring the data directories and restarting the datanodes.
  193. */
  194. assertTrue("Couldn't chmod local vol", dn1Vol1.setExecutable(true));
  195. assertTrue("Couldn't chmod local vol", dn2Vol1.setExecutable(true));
  196. assertTrue("Couldn't chmod local vol", dn3Vol1.setExecutable(true));
  197. assertTrue("Couldn't chmod local vol", dn3Vol2.setExecutable(true));
  198. cluster.restartDataNodes();
  199. cluster.waitActive();
  200. Path file4 = new Path("/test4");
  201. DFSTestUtil.createFile(fs, file4, 1024, (short)3, 1L);
  202. DFSTestUtil.waitReplication(fs, file4, (short)3);
  203. /*
  204. * Eventually the capacity should be restored to its original value,
  205. * and that the volume failure count should be reported as zero by
  206. * both the metrics and the NN.
  207. */
  208. DFSTestUtil.waitForDatanodeStatus(ns, 3, 0, 0, origCapacity,
  209. WAIT_FOR_HEARTBEATS);
  210. }
  211. /**
  212. * Test that the NN re-learns of volume failures after restart.
  213. */
  214. @Test
  215. public void testVolFailureStatsPreservedOnNNRestart() throws Exception {
  216. assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
  217. // Bring up two more datanodes that can tolerate 1 failure
  218. cluster.startDataNodes(conf, 2, true, null, null);
  219. cluster.waitActive();
  220. FSNamesystem ns = cluster.getNamesystem();
  221. long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(ns);
  222. long dnCapacity = DFSTestUtil.getDatanodeCapacity(ns, 0);
  223. // Fail the first volume on both datanodes (we have to keep the
  224. // third healthy so one node in the pipeline will not fail).
  225. File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
  226. File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
  227. assertTrue("Couldn't chmod local vol", dn1Vol1.setExecutable(false));
  228. assertTrue("Couldn't chmod local vol", dn2Vol1.setExecutable(false));
  229. Path file1 = new Path("/test1");
  230. DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
  231. DFSTestUtil.waitReplication(fs, file1, (short)2);
  232. // The NN reports two volumes failures
  233. DFSTestUtil.waitForDatanodeStatus(ns, 3, 0, 2,
  234. origCapacity - (1*dnCapacity), WAIT_FOR_HEARTBEATS);
  235. // After restarting the NN it still see the two failures
  236. cluster.restartNameNode(0);
  237. cluster.waitActive();
  238. DFSTestUtil.waitForDatanodeStatus(ns, 3, 0, 2,
  239. origCapacity - (1*dnCapacity), WAIT_FOR_HEARTBEATS);
  240. }
  241. }