PageRenderTime 33ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/launcher/ContainerLauncherImpl.java

https://github.com/rbodkin/hadoop-common
Java | 293 lines | 214 code | 33 blank | 46 comment | 9 complexity | f593916e8ede0576c1a34341389a6e14 MD5 | raw file
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.mapreduce.v2.app.launcher;
  19. import java.io.IOException;
  20. import java.nio.ByteBuffer;
  21. import java.security.PrivilegedAction;
  22. import java.util.HashMap;
  23. import java.util.Map;
  24. import java.util.concurrent.BlockingQueue;
  25. import java.util.concurrent.LinkedBlockingQueue;
  26. import java.util.concurrent.ThreadPoolExecutor;
  27. import java.util.concurrent.TimeUnit;
  28. import org.apache.commons.logging.Log;
  29. import org.apache.commons.logging.LogFactory;
  30. import org.apache.hadoop.conf.Configuration;
  31. import org.apache.hadoop.io.Text;
  32. import org.apache.hadoop.mapred.ShuffleHandler;
  33. import org.apache.hadoop.mapreduce.MRJobConfig;
  34. import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
  35. import org.apache.hadoop.mapreduce.v2.app.AppContext;
  36. import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptContainerLaunchedEvent;
  37. import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptDiagnosticsUpdateEvent;
  38. import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
  39. import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
  40. import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocator;
  41. import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocatorEvent;
  42. import org.apache.hadoop.net.NetUtils;
  43. import org.apache.hadoop.security.SecurityInfo;
  44. import org.apache.hadoop.security.UserGroupInformation;
  45. import org.apache.hadoop.security.token.Token;
  46. import org.apache.hadoop.util.StringUtils;
  47. import org.apache.hadoop.yarn.YarnException;
  48. import org.apache.hadoop.yarn.api.ContainerManager;
  49. import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
  50. import org.apache.hadoop.yarn.api.protocolrecords.StartContainerResponse;
  51. import org.apache.hadoop.yarn.api.protocolrecords.StopContainerRequest;
  52. import org.apache.hadoop.yarn.api.records.ContainerId;
  53. import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
  54. import org.apache.hadoop.yarn.api.records.ContainerToken;
  55. import org.apache.hadoop.yarn.conf.YarnConfiguration;
  56. import org.apache.hadoop.yarn.factories.RecordFactory;
  57. import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
  58. import org.apache.hadoop.yarn.ipc.YarnRPC;
  59. import org.apache.hadoop.yarn.security.ContainerManagerSecurityInfo;
  60. import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
  61. import org.apache.hadoop.yarn.service.AbstractService;
  62. /**
  63. * This class is responsible for launching of containers.
  64. */
  65. public class ContainerLauncherImpl extends AbstractService implements
  66. ContainerLauncher {
  67. private static final Log LOG = LogFactory.getLog(ContainerLauncherImpl.class);
  68. private AppContext context;
  69. private ThreadPoolExecutor launcherPool;
  70. private Thread eventHandlingThread;
  71. private BlockingQueue<ContainerLauncherEvent> eventQueue =
  72. new LinkedBlockingQueue<ContainerLauncherEvent>();
  73. private RecordFactory recordFactory;
  74. //have a cache/map of UGIs so as to avoid creating too many RPC
  75. //client connection objects to the same NodeManager
  76. private Map<String, UserGroupInformation> ugiMap =
  77. new HashMap<String, UserGroupInformation>();
  78. public ContainerLauncherImpl(AppContext context) {
  79. super(ContainerLauncherImpl.class.getName());
  80. this.context = context;
  81. }
  82. @Override
  83. public synchronized void init(Configuration conf) {
  84. // Clone configuration for this component so that the SecurityInfo setting
  85. // doesn't affect the original configuration
  86. Configuration myLocalConfig = new Configuration(conf);
  87. myLocalConfig.setClass(
  88. YarnConfiguration.YARN_SECURITY_INFO,
  89. ContainerManagerSecurityInfo.class, SecurityInfo.class);
  90. this.recordFactory = RecordFactoryProvider.getRecordFactory(conf);
  91. super.init(myLocalConfig);
  92. }
  93. public void start() {
  94. launcherPool =
  95. new ThreadPoolExecutor(getConfig().getInt(
  96. MRJobConfig.MR_AM_CONTAINERLAUNCHER_THREAD_COUNT, 10),
  97. Integer.MAX_VALUE, 1, TimeUnit.HOURS,
  98. new LinkedBlockingQueue<Runnable>());
  99. launcherPool.prestartAllCoreThreads(); // Wait for work.
  100. eventHandlingThread = new Thread(new Runnable() {
  101. @Override
  102. public void run() {
  103. ContainerLauncherEvent event = null;
  104. while (!Thread.currentThread().isInterrupted()) {
  105. try {
  106. event = eventQueue.take();
  107. } catch (InterruptedException e) {
  108. LOG.error("Returning, interrupted : " + e);
  109. return;
  110. }
  111. // the events from the queue are handled in parallel
  112. // using a thread pool
  113. launcherPool.execute(new EventProcessor(event));
  114. // TODO: Group launching of multiple containers to a single
  115. // NodeManager into a single connection
  116. }
  117. }
  118. });
  119. eventHandlingThread.start();
  120. super.start();
  121. }
  122. public void stop() {
  123. eventHandlingThread.interrupt();
  124. launcherPool.shutdown();
  125. super.stop();
  126. }
  127. protected ContainerManager getCMProxy(ContainerId containerID,
  128. final String containerManagerBindAddr, ContainerToken containerToken)
  129. throws IOException {
  130. UserGroupInformation user = UserGroupInformation.getCurrentUser();
  131. // TODO: Synchronization problems!!
  132. if (UserGroupInformation.isSecurityEnabled()) {
  133. if(!ugiMap.containsKey(containerManagerBindAddr)) {
  134. Token<ContainerTokenIdentifier> token =
  135. new Token<ContainerTokenIdentifier>(
  136. containerToken.getIdentifier().array(),
  137. containerToken.getPassword().array(), new Text(
  138. containerToken.getKind()), new Text(
  139. containerToken.getService()));
  140. //the user in createRemoteUser in this context is not important
  141. user = UserGroupInformation.createRemoteUser(containerManagerBindAddr);
  142. user.addToken(token);
  143. ugiMap.put(containerManagerBindAddr, user);
  144. } else {
  145. user = ugiMap.get(containerManagerBindAddr);
  146. }
  147. }
  148. ContainerManager proxy =
  149. user.doAs(new PrivilegedAction<ContainerManager>() {
  150. @Override
  151. public ContainerManager run() {
  152. YarnRPC rpc = YarnRPC.create(getConfig());
  153. return (ContainerManager) rpc.getProxy(ContainerManager.class,
  154. NetUtils.createSocketAddr(containerManagerBindAddr),
  155. getConfig());
  156. }
  157. });
  158. return proxy;
  159. }
  160. /**
  161. * Setup and start the container on remote nodemanager.
  162. */
  163. private class EventProcessor implements Runnable {
  164. private ContainerLauncherEvent event;
  165. EventProcessor(ContainerLauncherEvent event) {
  166. this.event = event;
  167. }
  168. @SuppressWarnings("unchecked")
  169. @Override
  170. public void run() {
  171. LOG.info("Processing the event " + event.toString());
  172. // Load ContainerManager tokens before creating a connection.
  173. // TODO: Do it only once per NodeManager.
  174. final String containerManagerBindAddr = event.getContainerMgrAddress();
  175. ContainerId containerID = event.getContainerID();
  176. ContainerToken containerToken = event.getContainerToken();
  177. switch(event.getType()) {
  178. case CONTAINER_REMOTE_LAUNCH:
  179. ContainerRemoteLaunchEvent launchEv = (ContainerRemoteLaunchEvent) event;
  180. TaskAttemptId taskAttemptID = launchEv.getTaskAttemptID();
  181. try {
  182. ContainerManager proxy =
  183. getCMProxy(containerID, containerManagerBindAddr, containerToken);
  184. // Construct the actual Container
  185. ContainerLaunchContext containerLaunchContext =
  186. launchEv.getContainer();
  187. // Now launch the actual container
  188. StartContainerRequest startRequest = recordFactory
  189. .newRecordInstance(StartContainerRequest.class);
  190. startRequest.setContainerLaunchContext(containerLaunchContext);
  191. StartContainerResponse response = proxy.startContainer(startRequest);
  192. ByteBuffer portInfo = response
  193. .getServiceResponse(ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID);
  194. int port = -1;
  195. if(portInfo != null) {
  196. port = ShuffleHandler.deserializeMetaData(portInfo);
  197. }
  198. LOG.info("Shuffle port returned by ContainerManager for "
  199. + taskAttemptID + " : " + port);
  200. if(port < 0) {
  201. throw new IllegalStateException("Invalid shuffle port number "
  202. + port + " returned for " + taskAttemptID);
  203. }
  204. // after launching, send launched event to task attempt to move
  205. // it from ASSIGNED to RUNNING state
  206. context.getEventHandler().handle(
  207. new TaskAttemptContainerLaunchedEvent(taskAttemptID, port));
  208. } catch (Throwable t) {
  209. String message = "Container launch failed for " + containerID
  210. + " : " + StringUtils.stringifyException(t);
  211. LOG.error(message);
  212. context.getEventHandler().handle(
  213. new TaskAttemptDiagnosticsUpdateEvent(taskAttemptID, message));
  214. context.getEventHandler().handle(
  215. new TaskAttemptEvent(taskAttemptID,
  216. TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED));
  217. }
  218. break;
  219. case CONTAINER_REMOTE_CLEANUP:
  220. // We will have to remove the launch (meant "cleanup"? FIXME) event if it is still in eventQueue
  221. // and not yet processed
  222. if (eventQueue.contains(event)) {
  223. eventQueue.remove(event); // TODO: Any synchro needed?
  224. //deallocate the container
  225. context.getEventHandler().handle(
  226. new ContainerAllocatorEvent(event.getTaskAttemptID(),
  227. ContainerAllocator.EventType.CONTAINER_DEALLOCATE));
  228. } else {
  229. try {
  230. ContainerManager proxy =
  231. getCMProxy(containerID, containerManagerBindAddr, containerToken);
  232. // TODO:check whether container is launched
  233. // kill the remote container if already launched
  234. StopContainerRequest stopRequest = recordFactory
  235. .newRecordInstance(StopContainerRequest.class);
  236. stopRequest.setContainerId(event.getContainerID());
  237. proxy.stopContainer(stopRequest);
  238. } catch (Throwable t) {
  239. //ignore the cleanup failure
  240. LOG.warn("cleanup failed for container " + event.getContainerID() ,
  241. t);
  242. }
  243. // after killing, send killed event to taskattempt
  244. context.getEventHandler().handle(
  245. new TaskAttemptEvent(event.getTaskAttemptID(),
  246. TaskAttemptEventType.TA_CONTAINER_CLEANED));
  247. }
  248. break;
  249. }
  250. }
  251. }
  252. @Override
  253. public void handle(ContainerLauncherEvent event) {
  254. try {
  255. eventQueue.put(event);
  256. } catch (InterruptedException e) {
  257. throw new YarnException(e);
  258. }
  259. }
  260. }