
/src/plugins/mpi/mpichgm/mpichgm.c

https://github.com/cfenoy/slurm
/*****************************************************************************\
 ** mpichgm.c - srun support for MPICH-GM (GMPI)
 ** $Id$
 *****************************************************************************
 * Copyright (C) 2004 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Takao Hatazaki <takao.hatazaki@hp.com>
 * CODE-OCEC-09-009. All rights reserved.
 *
 * This file is part of SLURM, a resource management program.
 * For details, see <http://www.schedmd.com/slurmdocs/>.
 * Please also read the included file: DISCLAIMER.
 *
 * SLURM is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * In addition, as a special exception, the copyright holders give permission
 * to link the code of portions of this program with the OpenSSL library under
 * certain conditions as described in each individual source file, and
 * distribute linked combinations including the two. You must obey the GNU
 * General Public License in all respects for all of the code used other than
 * OpenSSL. If you modify file(s) with this exception, you may extend this
 * exception to your version of the file(s), but you are not obligated to do
 * so. If you do not wish to do so, delete this exception statement from your
 * version. If you delete this exception statement from all source files in
 * the program, then also delete it here.
 *
 * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along
 * with SLURM; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#ifdef WITH_PTHREADS
#  include <pthread.h>
#endif

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#include "src/common/slurm_xlator.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/net.h"
#include "src/common/mpi.h"

#include "src/plugins/mpi/mpichgm/mpichgm.h"

typedef struct {
	int defined;
	unsigned int port_board_id;
	unsigned int unique_high_id;
	unsigned int unique_low_id;
	unsigned int numanode;
	unsigned int remote_pid;
	unsigned int remote_port;
} gm_slave_t;

#define GMPI_RECV_BUF_LEN 65536

struct gmpi_state {
	pthread_t tid;
	int fd; /* = -1 */
	mpi_plugin_client_info_t *job;
};
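
/*
 * Each slave announces itself with a single init message of the form
 *
 *   <<<magic:id:port_board_id:unique_high_id:unique_low_id:numanode:remote_pid::remote_port>>>
 *
 * (see the sscanf() pattern below), where "magic" must match the SLURM
 * jobid and "id" is the MPI rank of the sender.
 */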
static int _gmpi_parse_init_recv_msg(mpi_plugin_client_info_t *job, char *rbuf,
				     gm_slave_t *slave_data, int *ii)
{
	unsigned int magic, id, port_board_id, unique_high_id,
		unique_low_id, numanode, remote_pid, remote_port;
	int got;
	gm_slave_t *dp;

	got = sscanf(rbuf, "<<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
		     &magic, &id, &port_board_id, &unique_high_id,
		     &unique_low_id, &numanode, &remote_pid, &remote_port);
	if (got != 8) {
		error("GMPI master received invalid init message");
		return -1;
	}
	/* "id" is only meaningful once all eight fields were parsed. */
	*ii = id;
	if (magic != job->jobid) {
		error("GMPI master received invalid magic number");
		return -1;
	}
	if (id >= job->step_layout->task_cnt)
		fatal("GMPI id is out of range");
	if (port_board_id == 0)
		fatal("MPI id=%d was unable to open a GM port", id);

	dp = &slave_data[id];
	if (dp->defined) {
		error("Ignoring the message from MPI id=%d", id);
		return -1;
	}
	dp->defined = 1;
	dp->port_board_id = port_board_id;
	dp->unique_high_id = unique_high_id;
	dp->unique_low_id = unique_low_id;
	dp->numanode = numanode;
	dp->remote_pid = remote_pid;
	dp->remote_port = remote_port;

	debug3("slave_data[%d]: <<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
	       id, magic, id, port_board_id,
	       dp->unique_high_id, dp->unique_low_id, dp->numanode,
	       dp->remote_pid, dp->remote_port);

	return 0;
}
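
/*
 * Gather one init message from every slave, then answer each of them with
 * a map string of the form
 *
 *	[[[<port:high:low:numanode>...|||<j><j'>...]]]
 *
 * i.e. the global map of all ranks, followed by the list of ranks on the
 * same host and NUMA node as the receiver (presumably used for
 * shared-memory communication; GMPI_SHMEM is set to 1 below).
 */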
static int _gmpi_establish_map(gmpi_state_t *st)
{
	mpi_plugin_client_info_t *job = st->job;
	struct sockaddr_in addr;
	in_addr_t *iaddrs;
	socklen_t addrlen;
	int accfd, newfd, rlen, nprocs, i, j, id;
	size_t gmaplen, lmaplen, maplen;
	char *p, *rbuf = NULL, *gmap = NULL, *lmap = NULL, *map = NULL;
	char tmp[128];
	gm_slave_t *slave_data = NULL, *dp;

	/*
	 * Collect info from slaves.
	 * Will never finish unless slaves are GMPI processes.
	 */
	accfd = st->fd;
	addrlen = sizeof(addr);
	nprocs = job->step_layout->task_cnt;
	iaddrs = (in_addr_t *)xmalloc(sizeof(*iaddrs)*nprocs);
	slave_data = (gm_slave_t *)xmalloc(sizeof(*slave_data)*nprocs);
	for (i=0; i<nprocs; i++)
		slave_data[i].defined = 0;
	i = 0;
	rbuf = (char *)xmalloc(GMPI_RECV_BUF_LEN);

	while (i < nprocs) {
		newfd = accept(accfd, (struct sockaddr *)&addr, &addrlen);
		if (newfd == -1) {
			error("accept(2) in GMPI master thread: %m");
			continue;
		}
		/* Leave room for the terminating NUL appended below. */
		rlen = recv(newfd, rbuf, GMPI_RECV_BUF_LEN - 1, 0);
		if (rlen <= 0) {
			error("GMPI master recv returned %d", rlen);
			close(newfd);
			continue;
		} else {
			rbuf[rlen] = 0;
		}
		if (_gmpi_parse_init_recv_msg(job, rbuf, slave_data,
					      &id) == 0) {
			i++;
			iaddrs[id] = ntohl(addr.sin_addr.s_addr);
		}
		close(newfd);
	}
	xfree(rbuf);
	debug2("Received data from all of %d GMPI processes.", i);

	/*
	 * Compose the global map string.
	 */
	gmap = (char *)xmalloc(128*nprocs);
	p = gmap;
	strcpy(p, "[[[");
	p += 3;
	for (i=0; i<nprocs; i++) {
		dp = &slave_data[i];
		sprintf(tmp, "<%u:%u:%u:%u>", dp->port_board_id,
			dp->unique_high_id, dp->unique_low_id, dp->numanode);
		strcpy(p, tmp);
		p += strlen(tmp);
	}
	strcpy(p, "|||");
	p += 3;
	gmaplen = (size_t)(p - gmap);

	/*
	 * Respond to slaves.
	 */
	lmap = (char *)xmalloc(128*nprocs);
	for (i=0; i<nprocs; i++) {
		/*
		 * Compose the string to send.
		 */
		dp = &slave_data[i];
		p = lmap;
		for (j=0; j<nprocs; j++) {
			if (iaddrs[i] == iaddrs[j] &&
			    (dp->numanode == slave_data[j].numanode)) {
				sprintf(tmp, "<%u>", j);
				strcpy(p, tmp);
				p += strlen(tmp);
			}
		}
		lmaplen = (size_t)(p - lmap);
		map = (char *)xmalloc(gmaplen+lmaplen+4);
		strcpy(map, gmap);
		strcpy(map+gmaplen, lmap);
		strcpy(map+gmaplen+lmaplen, "]]]");
		maplen = gmaplen + lmaplen + 3;

		/*
		 * Send it.
		 */
		if ((newfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
			fatal("GMPI master failed to respond");
		}
		j = 1;
		if (setsockopt(newfd, SOL_SOCKET, SO_REUSEADDR,
			       (void *)&j, sizeof(j)))
			error("setsockopt in GMPI master: %m");

		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;
		addr.sin_addr.s_addr = htonl(iaddrs[i]);
		addr.sin_port = htons(dp->remote_port);
		if (connect(newfd, (struct sockaddr *)&addr, sizeof(addr)))
			fatal("GMPI master failed to connect");
		send(newfd, map, maplen, 0);
		close(newfd);
		xfree(map);
	}
	xfree(slave_data);
	xfree(lmap);
	xfree(gmap);
	xfree(iaddrs);

	debug2("GMPI master responded to all GMPI processes");

	return 0;
}
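
/*
 * After the map has been distributed, the master thread blocks here for
 * the remainder of the job.  An aborting rank connects back and sends
 *
 *	<<<ABORT_<jobid>_ABORT>>>
 *
 * upon which the whole step is killed with SIGKILL.
 */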
static void _gmpi_wait_abort(gmpi_state_t *st)
{
	mpi_plugin_client_info_t *job = st->job;
	struct sockaddr_in addr;
	socklen_t addrlen;
	int newfd, rlen;
	unsigned int magic;
	char *rbuf;

	rbuf = (char *)xmalloc(GMPI_RECV_BUF_LEN);
	addrlen = sizeof(addr);
	while (1) {
		newfd = accept(st->fd, (struct sockaddr *)&addr,
			       &addrlen);
		if (newfd == -1) {
			fatal("GMPI master failed to accept (abort-wait)");
		}
		/* Leave room for the terminating NUL appended below. */
		rlen = recv(newfd, rbuf, GMPI_RECV_BUF_LEN - 1, 0);
		if (rlen <= 0) {
			error("GMPI recv (abort-wait) returned %d", rlen);
			close(newfd);
			continue;
		} else {
			rbuf[rlen] = 0;
		}
		if (sscanf(rbuf, "<<<ABORT_%u_ABORT>>>", &magic) != 1) {
			error("GMPI (abort-wait) received spurious message.");
			close(newfd);
			continue;
		}
		if (magic != job->jobid) {
			error("GMPI (abort-wait) received bad magic number.");
			close(newfd);
			continue;
		}
		close(newfd);
		debug("Received ABORT message from an MPI process.");
		slurm_signal_job_step(job->jobid, job->stepid, SIGKILL);
#if 0
		xfree(rbuf);
		close(jgmpi_fd);
		gmpi_fd = -1;
		return;
#endif
	}
}

static void *_gmpi_thr(void *arg)
{
	gmpi_state_t *st;

	st = (gmpi_state_t *) arg;
	debug3("GMPI master thread pid=%lu", (unsigned long) getpid());
	_gmpi_establish_map(st);
	debug3("GMPI master thread is waiting for ABORT message.");
	_gmpi_wait_abort(st);

	return (void *)0;
}

static gmpi_state_t *
gmpi_state_create(const mpi_plugin_client_info_t *job)
{
	gmpi_state_t *state;

	state = (gmpi_state_t *)xmalloc(sizeof(gmpi_state_t));
	state->tid = (pthread_t)-1;
	state->fd = -1;
	state->job = (mpi_plugin_client_info_t *) job;

	return state;
}

static void
gmpi_state_destroy(gmpi_state_t *st)
{
	xfree(st);
}

extern gmpi_state_t *
gmpi_thr_create(const mpi_plugin_client_info_t *job, char ***env)
{
	short port;
	pthread_attr_t attr;
	gmpi_state_t *st = NULL;

	st = gmpi_state_create(job);

	/*
	 * The mpirun command in the MPICH-GM distribution can be modified
	 * to call srun, instead of rsh, for remote process invocations.
	 * In that case we should neither override the environment
	 * variables nor open the master port.
	 */
	if (getenv("GMPI_PORT"))
		return st;

	if (net_stream_listen (&st->fd, &port) < 0) {
		error ("Unable to create GMPI listen port: %m");
		gmpi_state_destroy(st);
		return NULL;
	}

	/*
	 * Accept in a separate thread.
	 */
	slurm_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	if (pthread_create(&st->tid, &attr, &_gmpi_thr, (void *)st)) {
		slurm_attr_destroy(&attr);
		gmpi_state_destroy(st);
		return NULL;
	}
	slurm_attr_destroy(&attr);

	env_array_overwrite_fmt(env, "GMPI_PORT", "%hu", port);
	env_array_overwrite_fmt(env, "GMPI_MAGIC", "%u", job->jobid);
	env_array_overwrite_fmt(env, "GMPI_NP", "%d",
				job->step_layout->task_cnt);
	env_array_overwrite_fmt(env, "GMPI_SHMEM", "1");
	/* FIXME for multi-board config. */
	env_array_overwrite_fmt(env, "GMPI_BOARD", "-1");

	/* For the newer MX version. */
	env_array_overwrite_fmt(env, "MXMPI_PORT", "%hu", port);
	env_array_overwrite_fmt(env, "MXMPI_MAGIC", "%u", job->jobid);
	env_array_overwrite_fmt(env, "MXMPI_NP", "%d",
				job->step_layout->task_cnt);
	/* FIXME for multi-board config. */
	env_array_overwrite_fmt(env, "MXMPI_BOARD", "-1");

	/* For Mac OS X, to override the default malloc. */
	env_array_overwrite_fmt(env, "DYLD_FORCE_FLAT_NAMESPACE", "1");

	debug("Started GMPI master thread (%lu)", (unsigned long) st->tid);

	return st;
}

/*
 * Warning: This pthread_cancel/pthread_join sequence is a little unsafe.
 * The thread is not joinable, so on most systems the join will fail, and
 * the thread's state will then be destroyed, possibly before the thread
 * has actually stopped.  In practice the thread will usually be waiting
 * on an accept call when it gets cancelled.  If the mpi thread holds a
 * mutex when it is cancelled (e.g. while using the "info" or "error"
 * functions for logging), the caller will deadlock.  See mpich1_p4.c or
 * mvapich.c for code that shuts down cleanly by having the mpi thread
 * wait on a poll call, together with a pipe the main thread can write to
 * in order to tell the mpi thread to exit; a sketch of that pattern
 * follows gmpi_thr_destroy() below.  Also see rev 18654 of mpichmx.c, on
 * branches/slurm-2.1.mpi.plugin.cleanup, for an implementation.  There
 * were no myrinet systems available for testing, which is why I couldn't
 * complete the patch for this plugin.  -djb
 */
extern int gmpi_thr_destroy(gmpi_state_t *st)
{
	if (st != NULL) {
		if (st->tid != (pthread_t)-1) {
			pthread_cancel(st->tid);
			pthread_join(st->tid, NULL);
		}
		gmpi_state_destroy(st);
	}
	return SLURM_SUCCESS;
}
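
/*
 * A minimal sketch (not part of the original plugin) of the pipe/poll
 * shutdown pattern described above, kept inside #if 0 like the dead code
 * earlier in this file.  It assumes a hypothetical shutdown_pipe[2] field
 * added to struct gmpi_state and filled in with pipe(2) by
 * gmpi_state_create(), and a master thread created joinable rather than
 * detached.
 */
#if 0
#include <errno.h>
#include <poll.h>
#include <unistd.h>

/*
 * Replacement for the bare accept() calls: block until either a slave
 * connects or the main thread asks us to exit.  Returns the accepted
 * fd, or -1 on shutdown.
 */
static int _gmpi_accept_or_shutdown(gmpi_state_t *st)
{
	struct pollfd pfd[2];

	pfd[0].fd     = st->fd;			/* listening socket */
	pfd[0].events = POLLIN;
	pfd[1].fd     = st->shutdown_pipe[0];	/* read end of the pipe */
	pfd[1].events = POLLIN;

	while (poll(pfd, 2, -1) == -1) {
		if (errno != EINTR)
			return -1;
	}
	if (pfd[1].revents & POLLIN)
		return -1;	/* shutdown requested by the main thread */
	return accept(st->fd, NULL, NULL);
}

/*
 * Called from gmpi_thr_destroy() instead of pthread_cancel(): wake the
 * mpi thread via the pipe, then join it, so its state is only destroyed
 * after the thread has really stopped.
 */
static void _gmpi_request_shutdown(gmpi_state_t *st)
{
	char c = 0;

	if (write(st->shutdown_pipe[1], &c, 1) != 1)
		error("GMPI shutdown pipe write: %m");
	pthread_join(st->tid, NULL);
}
#endif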