/src/plugins/mpi/mpich1_p4/mpich1_p4.c

https://github.com/cfenoy/slurm · C · 370 lines · 260 code · 31 blank · 79 comment · 57 complexity · 4ed5a30692e19473a9b1431e40081e37 MD5 · raw file

  1. /*****************************************************************************\
  2. ** mpich1_p4.c - Library routines for initiating jobs with mpich1_p4
  3. *****************************************************************************
  4. * Copyright (C) 2004-2007 The Regents of the University of California.
  5. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  6. * Written by Morris Jette <jette1@llnl.gov>
  7. * CODE-OCEC-09-009. All rights reserved.
  8. *
  9. * This file is part of SLURM, a resource management program.
  10. * For details, see <http://www.schedmd.com/slurmdocs/>.
  11. * Please also read the included file: DISCLAIMER.
  12. *
  13. * SLURM is free software; you can redistribute it and/or modify it under
  14. * the terms of the GNU General Public License as published by the Free
  15. * Software Foundation; either version 2 of the License, or (at your option)
  16. * any later version.
  17. *
  18. * In addition, as a special exception, the copyright holders give permission
  19. * to link the code of portions of this program with the OpenSSL library under
  20. * certain conditions as described in each individual source file, and
  21. * distribute linked combinations including the two. You must obey the GNU
  22. * General Public License in all respects for all of the code used other than
  23. * OpenSSL. If you modify file(s) with this exception, you may extend this
  24. * exception to your version of the file(s), but you are not obligated to do
  25. * so. If you do not wish to do so, delete this exception statement from your
  26. * version. If you delete this exception statement from all source files in
  27. * the program, then also delete it here.
  28. *
  29. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  30. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  31. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  32. * details.
  33. *
  34. * You should have received a copy of the GNU General Public License along
  35. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  36. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  37. \*****************************************************************************/
  38. #if HAVE_CONFIG_H
  39. # include "config.h"
  40. #endif
  41. #include <fcntl.h>
  42. #include <signal.h>
  43. #include <stdlib.h>
  44. #include <sys/poll.h>
  45. #include <sys/types.h>
  46. #include <sys/socket.h>
  47. #include "slurm/slurm_errno.h"
  48. #include "src/common/env.h"
  49. #include "src/common/fd.h"
  50. #include "src/common/hostlist.h"
  51. #include "src/common/mpi.h"
  52. #include "src/common/net.h"
  53. #include "src/common/xmalloc.h"
  54. #include "src/common/xstring.h"
  55. #include "src/slurmd/slurmstepd/slurmstepd_job.h"
  56. /*
  57. * These variables are required by the generic plugin interface. If they
  58. * are not found in the plugin, the plugin loader will ignore it.
  59. *
  60. * plugin_name - a string giving a human-readable description of the
  61. * plugin. There is no maximum length, but the symbol must refer to
  62. * a valid string.
  63. *
  64. * plugin_type - a string suggesting the type of the plugin or its
  65. * applicability to a particular form of data or method of data handling.
  66. * If the low-level plugin API is used, the contents of this string are
  67. * unimportant and may be anything. SLURM uses the higher-level plugin
  68. * interface which requires this string to be of the form
  69. *
  70. * <application>/<method>
  71. *
  72. * where <application> is a description of the intended application of
  73. * the plugin (e.g., "switch" for SLURM switch) and <method> is a description
  74. * of how this plugin satisfies that application. SLURM will only load
  75. * a switch plugin if the plugin_type string has a prefix of "switch/".
  76. *
  77. * plugin_version - an unsigned 32-bit integer giving the version number
  78. * of the plugin. If major and minor revisions are desired, the major
  79. * version number may be multiplied by a suitable magnitude constant such
  80. * as 100 or 1000. Various SLURM versions will likely require a certain
  81. * minimum version for their plugins as this API matures.
  82. */
  /* Human-readable description of this plugin. */
  const char plugin_name[]      = "mpi MPICH1_P4 plugin";

  /* Plugin identifier in the "<application>/<method>" form. */
  const char plugin_type[]      = "mpi/mpich1_p4";

  /* Plugin version number. */
  const uint32_t plugin_version = 100U;
  /*
   * Communication state for the master port information.
   *
   * p4_tid - id of the helper thread running mpich1_thr(); (pthread_t) -1
   *          means no thread is active and also tells a running thread
   *          to stop.
   * p4_fd1 - datagram socket from which the port number is read.
   * p4_fd2 - stream (listening) socket on which the port number is served.
   */
  pthread_t p4_tid = (pthread_t) -1;
  int p4_fd1 = -1;
  int p4_fd2 = -1;
  /*
   * Shutdown handshake between the main thread and the mpi thread: the
   * variables below are used to break the mpi thread out of a poll call,
   * let it exit, and allow the main thread to do a timed wait for that exit.
   */

  /* Pipe polled by the mpi thread ([0]) and written by the main thread ([1]) */
  static int shutdown_pipe[2];

  /* Set true when the mpi thread is about to exit */
  static bool shutdown_complete;

  /* Number of seconds the main thread waits for the mpi thread to finish */
  static int shutdown_timeout;

  /* Lock and condition protecting/announcing shutdown_complete */
  static pthread_mutex_t shutdown_lock;
  static pthread_cond_t shutdown_cond;
  99. int p_mpi_hook_slurmstepd_prefork(const slurmd_job_t *job, char ***env)
  100. {
  101. debug("mpi/mpich1_p4: slurmstepd prefork");
  102. return SLURM_SUCCESS;
  103. }
  104. int p_mpi_hook_slurmstepd_task (const mpi_plugin_task_info_t *job,
  105. char ***env)
  106. {
  107. char *nodelist, *task_cnt;
  108. nodelist = getenvp(*env, "SLURM_NODELIST");
  109. if (nodelist) {
  110. char *host_str = NULL, *tmp;
  111. hostlist_t hl = hostlist_create(nodelist);
  112. while ((tmp = hostlist_shift(hl))) {
  113. if (host_str)
  114. xstrcat(host_str, ",");
  115. xstrcat(host_str, tmp);
  116. free(tmp);
  117. }
  118. hostlist_destroy(hl);
  119. env_array_overwrite_fmt(env, "SLURM_MPICH_NODELIST", "%s",
  120. host_str);
  121. xfree(host_str);
  122. }
  123. task_cnt = getenvp(*env, "SLURM_TASKS_PER_NODE");
  124. if (task_cnt) {
  125. char *task_str = NULL, tmp_str[32];
  126. int i=0, val, reps;
  127. while (task_cnt[i]) {
  128. if ((task_cnt[i] >= '0') && (task_cnt[i] <= '9'))
  129. val = atoi(&task_cnt[i]);
  130. else
  131. break; /* bad parse */
  132. i++;
  133. while (task_cnt[i]
  134. && (task_cnt[i] != 'x') && (task_cnt[i] != ','))
  135. i++;
  136. if (task_cnt[i] == 'x') {
  137. i++;
  138. reps = atoi(&task_cnt[i]);
  139. while (task_cnt[i] && (task_cnt[i] != ','))
  140. i++;
  141. } else
  142. reps = 1;
  143. if (task_cnt[i] == ',')
  144. i++;
  145. while (reps) {
  146. if (task_str)
  147. xstrcat(task_str, ",");
  148. snprintf(tmp_str, sizeof(tmp_str), "%d", val);
  149. xstrcat(task_str, tmp_str);
  150. reps--;
  151. }
  152. }
  153. env_array_overwrite_fmt(env, "SLURM_MPICH_TASKS", "%s",
  154. task_str);
  155. xfree(task_str);
  156. }
  157. return SLURM_SUCCESS;
  158. }
  159. static void *mpich1_thr(void *arg)
  160. {
  161. int cc, flags;
  162. int new_port, new_fd;
  163. struct pollfd ufds[2];
  164. struct sockaddr cli_addr;
  165. socklen_t cli_len;
  166. char in_buf[128];
  167. debug("waiting for p4 communication");
  168. if ((flags = fcntl(p4_fd1, F_GETFL)) < 0) {
  169. error("mpich_p4: fcntl: %m");
  170. goto done;
  171. }
  172. if (fcntl(p4_fd1, F_SETFL, flags | O_NONBLOCK) < 0) {
  173. error("mpich_p4: fcntl: %m");
  174. goto done;
  175. }
  176. ufds[0].fd = p4_fd1;
  177. ufds[0].events = POLLIN;
  178. ufds[1].fd = shutdown_pipe[0];
  179. ufds[1].events = POLLIN;
  180. while (1) {
  181. if (p4_tid == (pthread_t) -1)
  182. goto done;
  183. cc = read(p4_fd1, &new_port, sizeof(new_port));
  184. if (cc >= 0)
  185. break;
  186. if (errno != EAGAIN) {
  187. error("mpich_p4: read/1: %m");
  188. goto done;
  189. }
  190. cc = poll(ufds, 2, 10000);
  191. if (cc <= 0) {
  192. error("mpich_p4: poll/1: %m");
  193. goto done;
  194. }
  195. if (ufds[1].revents & POLLIN) {
  196. goto done;
  197. }
  198. }
  199. if (cc != sizeof(new_port)) {
  200. error("mpich_p4: read/1 %d bytes", cc);
  201. goto done;
  202. }
  203. debug("mpich_p4 read/1 port %d", new_port);
  204. ufds[0].fd = p4_fd2;
  205. /* send this port number to other tasks on demand */
  206. while (1) {
  207. if (p4_tid == (pthread_t) -1)
  208. goto done;
  209. cc = poll(ufds, 2, -1);
  210. if (cc <= 0) {
  211. error("mpich_p4: poll/2: %m");
  212. goto done;
  213. }
  214. if (ufds[1].revents & POLLIN) {
  215. goto done;
  216. }
  217. new_fd = accept(p4_fd2, &cli_addr, &cli_len);
  218. if (new_fd < 0)
  219. continue;
  220. cc = read(new_fd, in_buf, sizeof(in_buf));
  221. if (cc > 0)
  222. debug("mpich_p4 read/2 port: %s", in_buf);
  223. cc = write(new_fd, &new_port, sizeof(new_port));
  224. if (cc < sizeof(new_port))
  225. error("mpich_p4: write2: %m");
  226. close(new_fd);
  227. }
  228. done:
  229. pthread_mutex_lock(&shutdown_lock);
  230. shutdown_complete = true;
  231. pthread_cond_signal(&shutdown_cond);
  232. pthread_mutex_unlock(&shutdown_lock);
  233. return NULL;
  234. }
  235. mpi_plugin_client_state_t *
  236. p_mpi_hook_client_prelaunch(mpi_plugin_client_info_t *job, char ***env)
  237. {
  238. struct sockaddr_in sin;
  239. pthread_attr_t attr;
  240. socklen_t len = sizeof(sin);
  241. short port1, port2;
  242. debug("Using mpi/mpich1_p4");
  243. if ((p4_fd1 = socket(PF_INET, SOCK_DGRAM, 0)) < 0) {
  244. error("socket: %m");
  245. return NULL;
  246. }
  247. memset(&sin, 0, sizeof(sin));
  248. sin.sin_family = PF_INET;
  249. if (bind(p4_fd1, (struct sockaddr *) &sin, len) < 0) {
  250. error("bind: %m");
  251. return NULL;
  252. }
  253. if (getsockname(p4_fd1, (struct sockaddr *) &sin, &len) < 0) {
  254. error("getsockname: %m");
  255. return NULL;
  256. }
  257. port1 = ntohs(sin.sin_port);
  258. if ((p4_fd2 = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
  259. error("socket: %m");
  260. return NULL;
  261. }
  262. memset(&sin, 0, sizeof(sin));
  263. sin.sin_family = PF_INET;
  264. sin.sin_addr.s_addr = htonl(INADDR_ANY);
  265. if (bind(p4_fd2, (struct sockaddr *) &sin, len) < 0) {
  266. error("bind: %m");
  267. return NULL;
  268. }
  269. if (listen(p4_fd2, 64) < 0)
  270. error("listen: %m");
  271. if (getsockname(p4_fd2, (struct sockaddr *) &sin, &len) < 0) {
  272. error("getsockname: %m");
  273. return NULL;
  274. }
  275. port2 = ntohs(sin.sin_port);
  276. if (pipe(shutdown_pipe) < 0) {
  277. error ("pipe: %m");
  278. return (NULL);
  279. }
  280. shutdown_complete = false;
  281. shutdown_timeout = 5;
  282. slurm_mutex_init(&shutdown_lock);
  283. pthread_cond_init(&shutdown_cond, NULL);
  284. /* Process messages in a separate thread */
  285. slurm_attr_init(&attr);
  286. pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
  287. if (pthread_create(&p4_tid, &attr, &mpich1_thr, NULL)) {
  288. error("pthread_create: %m");
  289. slurm_attr_destroy(&attr);
  290. return NULL;
  291. }
  292. slurm_attr_destroy(&attr);
  293. env_array_overwrite_fmt(env, "SLURM_MPICH_PORT1", "%hu", port1);
  294. env_array_overwrite_fmt(env, "SLURM_MPICH_PORT2", "%hu", port2);
  295. debug("mpich_p4 plugin listening on fd=%d,%d ports=%d,%d",
  296. p4_fd1, p4_fd2, port1, port2);
  297. /* only return NULL on error */
  298. return (void *)0xdeadbeef;
  299. }
  /*
   * p_mpi_hook_client_single_task_per_node - report this plugin's answer
   * to the "single task per node" query.
   *
   * NOTE(review): presumably tells the caller to launch only one task per
   * node for this MPI type -- confirm against the caller.
   *
   * RET true (non-zero) always
   */
  int p_mpi_hook_client_single_task_per_node(void)
  {
      const int single_task = true;

      return single_task;
  }
  304. int p_mpi_hook_client_fini(mpi_plugin_client_state_t *state)
  305. {
  306. if (p4_tid != (pthread_t)-1) {
  307. char tmp = 1;
  308. int n;
  309. /*
  310. * Write to the pipe to break the mpi thread out of a poll
  311. * (or leave the poll immediately after it is called) and exit.
  312. * Do a timed wait for the mpi thread to shut down, or just
  313. * exit if the mpi thread cannot respond.
  314. */
  315. n = write(shutdown_pipe[1], &tmp, 1);
  316. if (n == 1) {
  317. struct timespec ts = {0, 0};
  318. slurm_mutex_lock(&shutdown_lock);
  319. ts.tv_sec = time(NULL) + shutdown_timeout;
  320. while (!shutdown_complete) {
  321. if (time(NULL) >= ts.tv_sec) {
  322. break;
  323. }
  324. pthread_cond_timedwait(
  325. &shutdown_cond,
  326. &shutdown_lock, &ts);
  327. }
  328. slurm_mutex_unlock(&shutdown_lock);
  329. }
  330. if (shutdown_complete) {
  331. close(shutdown_pipe[0]);
  332. close(shutdown_pipe[1]);
  333. slurm_mutex_destroy(&shutdown_lock);
  334. pthread_cond_destroy(&shutdown_cond);
  335. }
  336. p4_tid = (pthread_t) -1;
  337. }
  338. return SLURM_SUCCESS;
  339. }