PageRenderTime 60ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 1ms

/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java

https://github.com/bmahe/hadoop-common
Java | 294 lines | 235 code | 33 blank | 26 comment | 12 complexity | 6915bb6b77791b41465a8584e31300af MD5 | raw file
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.yarn.server.nodemanager;
  19. import java.net.InetAddress;
  20. import java.net.InetSocketAddress;
  21. import java.util.ArrayList;
  22. import java.util.Iterator;
  23. import java.util.List;
  24. import java.util.Map.Entry;
  25. import org.apache.avro.AvroRuntimeException;
  26. import org.apache.commons.logging.Log;
  27. import org.apache.commons.logging.LogFactory;
  28. import org.apache.hadoop.NodeHealthCheckerService;
  29. import org.apache.hadoop.conf.Configuration;
  30. import org.apache.hadoop.net.NetUtils;
  31. import org.apache.hadoop.security.SecurityInfo;
  32. import org.apache.hadoop.security.UserGroupInformation;
  33. import org.apache.hadoop.yarn.api.records.ApplicationId;
  34. import org.apache.hadoop.yarn.api.records.ContainerId;
  35. import org.apache.hadoop.yarn.api.records.ContainerState;
  36. import org.apache.hadoop.yarn.api.records.ContainerStatus;
  37. import org.apache.hadoop.yarn.api.records.NodeHealthStatus;
  38. import org.apache.hadoop.yarn.api.records.NodeId;
  39. import org.apache.hadoop.yarn.api.records.Resource;
  40. import org.apache.hadoop.yarn.conf.YarnConfiguration;
  41. import org.apache.hadoop.yarn.event.Dispatcher;
  42. import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
  43. import org.apache.hadoop.yarn.factories.RecordFactory;
  44. import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
  45. import org.apache.hadoop.yarn.ipc.YarnRPC;
  46. import org.apache.hadoop.yarn.server.RMNMSecurityInfoClass;
  47. import org.apache.hadoop.yarn.server.api.ResourceTracker;
  48. import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
  49. import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
  50. import org.apache.hadoop.yarn.server.api.records.HeartbeatResponse;
  51. import org.apache.hadoop.yarn.server.api.records.NodeStatus;
  52. import org.apache.hadoop.yarn.server.api.records.RegistrationResponse;
  53. import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
  54. import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
  55. import org.apache.hadoop.yarn.server.security.ContainerTokenSecretManager;
  56. import org.apache.hadoop.yarn.service.AbstractService;
  57. import org.apache.hadoop.yarn.util.Records;
  58. public class NodeStatusUpdaterImpl extends AbstractService implements
  59. NodeStatusUpdater {
  60. private static final Log LOG = LogFactory.getLog(NodeStatusUpdaterImpl.class);
  61. private final Object heartbeatMonitor = new Object();
  62. private final Context context;
  63. private final Dispatcher dispatcher;
  64. private ContainerTokenSecretManager containerTokenSecretManager;
  65. private long heartBeatInterval;
  66. private ResourceTracker resourceTracker;
  67. private String rmAddress;
  68. private Resource totalResource;
  69. private String containerManagerBindAddress;
  70. private String hostName;
  71. private int containerManagerPort;
  72. private int httpPort;
  73. private NodeId nodeId;
  74. private byte[] secretKeyBytes = new byte[0];
  75. private boolean isStopped;
  76. private RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null);
  77. private final NodeHealthCheckerService healthChecker;
  78. private final NodeManagerMetrics metrics;
  79. public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher,
  80. NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
  81. ContainerTokenSecretManager containerTokenSecretManager) {
  82. super(NodeStatusUpdaterImpl.class.getName());
  83. this.healthChecker = healthChecker;
  84. this.context = context;
  85. this.dispatcher = dispatcher;
  86. this.metrics = metrics;
  87. this.containerTokenSecretManager = containerTokenSecretManager;
  88. }
  89. @Override
  90. public synchronized void init(Configuration conf) {
  91. this.rmAddress =
  92. conf.get(YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS,
  93. YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS);
  94. this.heartBeatInterval =
  95. conf.getLong(YarnConfiguration.NM_TO_RM_HEARTBEAT_INTERVAL_MS,
  96. YarnConfiguration.DEFAULT_NM_TO_RM_HEARTBEAT_INTERVAL_MS);
  97. int memory = conf.getInt(YarnConfiguration.NM_VMEM_GB, YarnConfiguration.DEFAULT_NM_VMEM_GB);
  98. this.totalResource = recordFactory.newRecordInstance(Resource.class);
  99. this.totalResource.setMemory(memory * 1024);
  100. metrics.addResource(totalResource);
  101. super.init(conf);
  102. }
  103. @Override
  104. public void start() {
  105. String cmBindAddressStr =
  106. getConfig().get(YarnConfiguration.NM_ADDRESS,
  107. YarnConfiguration.DEFAULT_NM_ADDRESS);
  108. InetSocketAddress cmBindAddress =
  109. NetUtils.createSocketAddr(cmBindAddressStr);
  110. String httpBindAddressStr =
  111. getConfig().get(YarnConfiguration.NM_WEBAPP_ADDRESS,
  112. YarnConfiguration.DEFAULT_NM_WEBAPP_ADDRESS);
  113. InetSocketAddress httpBindAddress =
  114. NetUtils.createSocketAddr(httpBindAddressStr);
  115. try {
  116. this.hostName = InetAddress.getLocalHost().getHostAddress();
  117. this.containerManagerPort = cmBindAddress.getPort();
  118. this.httpPort = httpBindAddress.getPort();
  119. this.containerManagerBindAddress =
  120. this.hostName + ":" + this.containerManagerPort;
  121. LOG.info("Configured ContainerManager Address is "
  122. + this.containerManagerBindAddress);
  123. // Registration has to be in start so that ContainerManager can get the
  124. // perNM tokens needed to authenticate ContainerTokens.
  125. registerWithRM();
  126. super.start();
  127. startStatusUpdater();
  128. } catch (Exception e) {
  129. throw new AvroRuntimeException(e);
  130. }
  131. }
  132. @Override
  133. public synchronized void stop() {
  134. // Interrupt the updater.
  135. this.isStopped = true;
  136. super.stop();
  137. }
  138. protected ResourceTracker getRMClient() {
  139. YarnRPC rpc = YarnRPC.create(getConfig());
  140. InetSocketAddress rmAddress = NetUtils.createSocketAddr(this.rmAddress);
  141. Configuration rmClientConf = new Configuration(getConfig());
  142. rmClientConf.setClass(
  143. YarnConfiguration.YARN_SECURITY_INFO,
  144. RMNMSecurityInfoClass.class, SecurityInfo.class);
  145. return (ResourceTracker) rpc.getProxy(ResourceTracker.class, rmAddress,
  146. rmClientConf);
  147. }
  148. private void registerWithRM() throws YarnRemoteException {
  149. this.resourceTracker = getRMClient();
  150. LOG.info("Connected to ResourceManager at " + this.rmAddress);
  151. RegisterNodeManagerRequest request = recordFactory.newRecordInstance(RegisterNodeManagerRequest.class);
  152. this.nodeId = Records.newRecord(NodeId.class);
  153. this.nodeId.setHost(this.hostName);
  154. this.nodeId.setPort(this.containerManagerPort);
  155. request.setHttpPort(this.httpPort);
  156. request.setResource(this.totalResource);
  157. request.setNodeId(this.nodeId);
  158. RegistrationResponse regResponse =
  159. this.resourceTracker.registerNodeManager(request).getRegistrationResponse();
  160. if (UserGroupInformation.isSecurityEnabled()) {
  161. this.secretKeyBytes = regResponse.getSecretKey().array();
  162. }
  163. // do this now so that its set before we start heartbeating to RM
  164. if (UserGroupInformation.isSecurityEnabled()) {
  165. LOG.info("Security enabled - updating secret keys now");
  166. // It is expected that status updater is started by this point and
  167. // RM gives the shared secret in registration during StatusUpdater#start().
  168. this.containerTokenSecretManager.setSecretKey(
  169. this.getContainerManagerBindAddress(),
  170. this.getRMNMSharedSecret());
  171. }
  172. LOG.info("Registered with ResourceManager as " + this.containerManagerBindAddress
  173. + " with total resource of " + this.totalResource);
  174. }
  175. @Override
  176. public String getContainerManagerBindAddress() {
  177. return this.containerManagerBindAddress;
  178. }
  179. @Override
  180. public byte[] getRMNMSharedSecret() {
  181. return this.secretKeyBytes.clone();
  182. }
  183. private NodeStatus getNodeStatus() {
  184. NodeStatus nodeStatus = recordFactory.newRecordInstance(NodeStatus.class);
  185. nodeStatus.setNodeId(this.nodeId);
  186. int numActiveContainers = 0;
  187. List<ContainerStatus> containersStatuses = new ArrayList<ContainerStatus>();
  188. for (Iterator<Entry<ContainerId, Container>> i =
  189. this.context.getContainers().entrySet().iterator(); i.hasNext();) {
  190. Entry<ContainerId, Container> e = i.next();
  191. ContainerId containerId = e.getKey();
  192. Container container = e.getValue();
  193. // Clone the container to send it to the RM
  194. org.apache.hadoop.yarn.api.records.ContainerStatus containerStatus =
  195. container.cloneAndGetContainerStatus();
  196. containersStatuses.add(containerStatus);
  197. ++numActiveContainers;
  198. LOG.info("Sending out status for container: " + containerStatus);
  199. if (containerStatus.getState() == ContainerState.COMPLETE) {
  200. // Remove
  201. i.remove();
  202. LOG.info("Removed completed container " + containerId);
  203. }
  204. }
  205. nodeStatus.setContainersStatuses(containersStatuses);
  206. LOG.debug(this.containerManagerBindAddress + " sending out status for " + numActiveContainers
  207. + " containers");
  208. NodeHealthStatus nodeHealthStatus = this.context.getNodeHealthStatus();
  209. if (this.healthChecker != null) {
  210. this.healthChecker.setHealthStatus(nodeHealthStatus);
  211. }
  212. LOG.debug("Node's health-status : " + nodeHealthStatus.getIsNodeHealthy()
  213. + ", " + nodeHealthStatus.getHealthReport());
  214. nodeStatus.setNodeHealthStatus(nodeHealthStatus);
  215. return nodeStatus;
  216. }
  217. @Override
  218. public void sendOutofBandHeartBeat() {
  219. synchronized (this.heartbeatMonitor) {
  220. this.heartbeatMonitor.notify();
  221. }
  222. }
  223. protected void startStatusUpdater() {
  224. new Thread() {
  225. @Override
  226. public void run() {
  227. int lastHeartBeatID = 0;
  228. while (!isStopped) {
  229. // Send heartbeat
  230. try {
  231. synchronized (heartbeatMonitor) {
  232. heartbeatMonitor.wait(heartBeatInterval);
  233. }
  234. NodeStatus nodeStatus = getNodeStatus();
  235. nodeStatus.setResponseId(lastHeartBeatID);
  236. NodeHeartbeatRequest request = recordFactory.newRecordInstance(NodeHeartbeatRequest.class);
  237. request.setNodeStatus(nodeStatus);
  238. HeartbeatResponse response =
  239. resourceTracker.nodeHeartbeat(request).getHeartbeatResponse();
  240. lastHeartBeatID = response.getResponseId();
  241. List<ContainerId> containersToCleanup = response
  242. .getContainersToCleanupList();
  243. if (containersToCleanup.size() != 0) {
  244. dispatcher.getEventHandler().handle(
  245. new CMgrCompletedContainersEvent(containersToCleanup));
  246. }
  247. List<ApplicationId> appsToCleanup =
  248. response.getApplicationsToCleanupList();
  249. if (appsToCleanup.size() != 0) {
  250. dispatcher.getEventHandler().handle(
  251. new CMgrCompletedAppsEvent(appsToCleanup));
  252. }
  253. } catch (Throwable e) {
  254. LOG.error("Caught exception in status-updater", e);
  255. break;
  256. }
  257. }
  258. }
  259. }.start();
  260. }
  261. }