PageRenderTime 58ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 0ms

/src/plugins/mpi/mpichmx/mpichmx.c

https://github.com/cfenoy/slurm
C | 424 lines | 293 code | 48 blank | 83 comment | 42 complexity | 2f6693d95f87b2fbf52b3edd3779bfdd MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
  1. /*****************************************************************************\
  2. ** mpichmx.c - srun support for MPICH-MX (based upon MPICH-GM code)
  3. *****************************************************************************
  4. * Copyright (C) 2004 The Regents of the University of California.
  5. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  6. * Written by Takao Hatazaki <takao.hatazaki@hp.com>
  7. * CODE-OCEC-09-009. All rights reserved.
  8. *
  9. * This file is part of SLURM, a resource management program.
  10. * For details, see <http://www.schedmd.com/slurmdocs/>.
  11. * Please also read the included file: DISCLAIMER.
  12. *
  13. * SLURM is free software; you can redistribute it and/or modify it under
  14. * the terms of the GNU General Public License as published by the Free
  15. * Software Foundation; either version 2 of the License, or (at your option)
  16. * any later version.
  17. *
  18. * In addition, as a special exception, the copyright holders give permission
  19. * to link the code of portions of this program with the OpenSSL library under
  20. * certain conditions as described in each individual source file, and
  21. * distribute linked combinations including the two. You must obey the GNU
  22. * General Public License in all respects for all of the code used other than
  23. * OpenSSL. If you modify file(s) with this exception, you may extend this
  24. * exception to your version of the file(s), but you are not obligated to do
  25. * so. If you do not wish to do so, delete this exception statement from your
  26. * version. If you delete this exception statement from all source files in
  27. * the program, then also delete it here.
  28. *
  29. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  30. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  31. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  32. * details.
  33. *
  34. * You should have received a copy of the GNU General Public License along
  35. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  36. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  37. \*****************************************************************************/
  38. #ifdef HAVE_CONFIG_H
  39. # include "config.h"
  40. #endif
  41. #ifdef WITH_PTHREADS
  42. # include <pthread.h>
  43. #endif
  44. #include <signal.h>
  45. #include <stdlib.h>
  46. #include <sys/types.h>
  47. #include <sys/socket.h>
  48. #include <netinet/in.h>
  49. #include <strings.h>
  50. #include "src/common/slurm_xlator.h"
  51. #include "src/common/xmalloc.h"
  52. #include "src/common/xstring.h"
  53. #include "src/common/net.h"
  54. #include "src/common/mpi.h"
  55. #include "src/plugins/mpi/mpichmx/mpichmx.h"
/* Per-task endpoint info reported by each GMPI slave in its init message.
 * Field meanings follow the init-message fields parsed in
 * _gmpi_parse_init_recv_msg(); exact GM/MX semantics are per the
 * MPICH-MX protocol — not all are verifiable from this file alone. */
typedef struct {
	int defined;			/* non-zero once this slot has been filled */
	unsigned int port_board_id;	/* GM port / MX board id (MX allows 0) */
	unsigned int unique_high_id;	/* high word of the NIC's unique id */
	unsigned int unique_low_id;	/* low word of the NIC's unique id */
	unsigned int numanode;		/* NUMA node the task runs on */
	unsigned int remote_pid;	/* pid of the slave process */
	unsigned int remote_port;	/* TCP port the master connects back to */
} gm_slave_t;

/* Size of the buffer used to receive slave messages. */
#define GMPI_RECV_BUF_LEN 65536

/* State for one job step's GMPI master thread. */
struct gmpi_state {
	pthread_t tid;			/* master thread; (pthread_t)-1 if unset */
	int fd; /* = -1 */		/* listening socket for slave check-ins */
	mpi_plugin_client_info_t *job;	/* job step this state serves */
};
  71. static int _gmpi_parse_init_recv_msg(mpi_plugin_client_info_t *job, char *rbuf,
  72. gm_slave_t *slave_data, int *ii)
  73. {
  74. unsigned int magic, id, port_board_id, unique_high_id,
  75. unique_low_id, numanode, remote_pid, remote_port;
  76. int got;
  77. gm_slave_t *dp;
  78. got = sscanf(rbuf, "<<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
  79. &magic, &id, &port_board_id, &unique_high_id,
  80. &unique_low_id, &numanode, &remote_pid, &remote_port);
  81. *ii = id;
  82. if (got != 8) {
  83. error("GMPI master received invalid init message");
  84. return -1;
  85. }
  86. if (magic != job->jobid) {
  87. error("GMPI master received invalid magic number");
  88. return -1;
  89. }
  90. if (id >= job->step_layout->task_cnt)
  91. fatal("GMPI id is out of range");
  92. #if 0
  93. /* Unlike GM ports, MX endpoints can be 0,
  94. * Pere Munt, BSC-CMS */
  95. if (port_board_id == 0)
  96. fatal("MPI id=%d was unable to open a GM port", id);
  97. #endif
  98. dp = &slave_data[id];
  99. if (dp->defined) {
  100. error("Ignoring the message from MPI id=%d", id);
  101. return -1;
  102. }
  103. dp->defined = 1;
  104. dp->port_board_id = port_board_id;
  105. dp->unique_high_id = unique_high_id;
  106. dp->unique_low_id = unique_low_id;
  107. dp->numanode = numanode;
  108. dp->remote_pid = remote_pid;
  109. dp->remote_port = remote_port;
  110. debug3("slave_data[%d]: <<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
  111. id, magic, id, port_board_id,
  112. dp->unique_high_id, dp->unique_low_id, dp->numanode,
  113. dp->remote_pid, dp->remote_port);
  114. return 0;
  115. }
  116. static int _gmpi_establish_map(gmpi_state_t *st)
  117. {
  118. mpi_plugin_client_info_t *job = st->job;
  119. struct sockaddr_in addr;
  120. in_addr_t *iaddrs;
  121. socklen_t addrlen;
  122. int accfd, newfd, rlen, nprocs, i, j, id;
  123. size_t gmaplen, lmaplen, maplen;
  124. char *p, *rbuf = NULL, *gmap = NULL, *lmap = NULL, *map = NULL;
  125. char tmp[128];
  126. gm_slave_t *slave_data = NULL, *dp;
  127. /*
  128. * Collect info from slaves.
  129. * Will never finish unless slaves are GMPI processes.
  130. */
  131. accfd = st->fd;
  132. addrlen = sizeof(addr);
  133. nprocs = job->step_layout->task_cnt;
  134. iaddrs = (in_addr_t *)xmalloc(sizeof(*iaddrs)*nprocs);
  135. slave_data = (gm_slave_t *)xmalloc(sizeof(*slave_data)*nprocs);
  136. for (i=0; i<nprocs; i++)
  137. slave_data[i].defined = 0;
  138. i = 0;
  139. rbuf = (char *)xmalloc(GMPI_RECV_BUF_LEN);
  140. while (i < nprocs) {
  141. newfd = accept(accfd, (struct sockaddr *)&addr, &addrlen);
  142. if (newfd == -1) {
  143. error("accept(2) in GMPI master thread: %m");
  144. continue;
  145. }
  146. rlen = recv(newfd, rbuf, GMPI_RECV_BUF_LEN, 0);
  147. if (rlen <= 0) {
  148. error("GMPI master recv returned %d", rlen);
  149. close(newfd);
  150. continue;
  151. } else {
  152. rbuf[rlen] = 0;
  153. }
  154. if (_gmpi_parse_init_recv_msg(job, rbuf, slave_data,
  155. &id) == 0) {
  156. i++;
  157. iaddrs[id] = ntohl(addr.sin_addr.s_addr);
  158. }
  159. close(newfd);
  160. }
  161. xfree(rbuf);
  162. debug2("Received data from all of %d GMPI processes.", i);
  163. /*
  164. * Compose the global map string.
  165. */
  166. gmap = (char *)xmalloc(128*nprocs);
  167. p = gmap;
  168. strcpy(p, "[[[");
  169. p += 3;
  170. for (i=0; i<nprocs; i++) {
  171. dp = &slave_data[i];
  172. sprintf(tmp, "<%u:%u:%u:%u>", dp->port_board_id,
  173. dp->unique_high_id, dp->unique_low_id, dp->numanode);
  174. strcpy(p, tmp);
  175. p += strlen(tmp);
  176. }
  177. strcpy(p, "|||");
  178. p += 3;
  179. gmaplen = (size_t)(p - gmap);
  180. /*
  181. * Respond to slaves.
  182. */
  183. lmap = (char *)xmalloc(128*nprocs);
  184. for (i=0; i<nprocs; i++) {
  185. /*
  186. * Compose the string to send.
  187. */
  188. dp = &slave_data[i];
  189. p = lmap;
  190. for (j=0; j<nprocs; j++) {
  191. if (iaddrs[i] == iaddrs[j] &&
  192. (dp->numanode == slave_data[j].numanode)) {
  193. sprintf(tmp, "<%u>", j);
  194. strcpy(p, tmp);
  195. p += strlen(tmp);
  196. }
  197. }
  198. lmaplen = (size_t)(p - lmap);
  199. map = (char *)xmalloc(gmaplen+lmaplen+4);
  200. strcpy(map, gmap);
  201. strcpy(map+gmaplen, lmap);
  202. strcpy(map+gmaplen+lmaplen, "]]]");
  203. maplen = gmaplen + lmaplen + 3;
  204. /*
  205. * Send it.
  206. */
  207. if ((newfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
  208. fatal("GMPI master failed to respond");
  209. }
  210. j = 1;
  211. if (setsockopt(newfd, SOL_SOCKET, SO_REUSEADDR,
  212. (void *)&j, sizeof(j)))
  213. error("setsockopt in GMPI master: %m");
  214. memset(&addr, 0, sizeof(addr));
  215. addr.sin_family = AF_INET;
  216. addr.sin_addr.s_addr = htonl(iaddrs[i]);
  217. addr.sin_port = htons(dp->remote_port);
  218. if (connect(newfd, (struct sockaddr *)&addr, sizeof(addr)))
  219. fatal("GMPI master failed to connect");
  220. send(newfd, map, maplen, 0);
  221. close(newfd);
  222. xfree(map);
  223. }
  224. xfree(slave_data);
  225. xfree(lmap);
  226. xfree(gmap);
  227. xfree(iaddrs);
  228. debug2("GMPI master responded to all GMPI processes");
  229. return 0;
  230. }
  231. static void _gmpi_wait_abort(gmpi_state_t *st)
  232. {
  233. mpi_plugin_client_info_t *job = st->job;
  234. struct sockaddr_in addr;
  235. socklen_t addrlen;
  236. int newfd, rlen;
  237. unsigned int magic;
  238. char *rbuf;
  239. rbuf = (char *)xmalloc(GMPI_RECV_BUF_LEN);
  240. addrlen = sizeof(addr);
  241. while (1) {
  242. newfd = accept(st->fd, (struct sockaddr *)&addr,
  243. &addrlen);
  244. if (newfd == -1) {
  245. fatal("GMPI master failed to accept (abort-wait)");
  246. }
  247. rlen = recv(newfd, rbuf, GMPI_RECV_BUF_LEN, 0);
  248. if (rlen <= 0) {
  249. error("GMPI recv (abort-wait) returned %d", rlen);
  250. close(newfd);
  251. continue;
  252. } else {
  253. rbuf[rlen] = 0;
  254. }
  255. if (sscanf(rbuf, "<<<ABORT_%u_ABORT>>>", &magic) != 1) {
  256. error("GMPI (abort-wait) received spurious message.");
  257. close(newfd);
  258. continue;
  259. }
  260. if (magic != job->jobid) {
  261. error("GMPI (abort-wait) received bad magic number.");
  262. close(newfd);
  263. continue;
  264. }
  265. close(newfd);
  266. debug("Received ABORT message from an MPI process.");
  267. slurm_signal_job_step(job->jobid, job->stepid, SIGKILL);
  268. #if 0
  269. xfree(rbuf);
  270. close(jgmpi_fd);
  271. gmpi_fd = -1;
  272. return;
  273. #endif
  274. }
  275. }
  276. static void *_gmpi_thr(void *arg)
  277. {
  278. gmpi_state_t *st;
  279. st = (gmpi_state_t *) arg;
  280. debug3("GMPI master thread pid=%lu", (unsigned long) getpid());
  281. _gmpi_establish_map(st);
  282. debug3("GMPI master thread is waiting for ABORT message.");
  283. _gmpi_wait_abort(st);
  284. return (void *)0;
  285. }
  286. static gmpi_state_t *
  287. gmpi_state_create(const mpi_plugin_client_info_t *job)
  288. {
  289. gmpi_state_t *state;
  290. state = (gmpi_state_t *)xmalloc(sizeof(gmpi_state_t));
  291. state->tid = (pthread_t)-1;
  292. state->fd = -1;
  293. state->job = (mpi_plugin_client_info_t *) job;
  294. return state;
  295. }
/* Release a gmpi_state_t allocated by gmpi_state_create().
 * Note: frees only the struct itself; it does not close st->fd or
 * stop the thread — gmpi_thr_destroy() handles the thread. */
static void
gmpi_state_destroy(gmpi_state_t *st)
{
	xfree(st);
}
  301. extern gmpi_state_t *
  302. gmpi_thr_create(const mpi_plugin_client_info_t *job, char ***env)
  303. {
  304. short port;
  305. pthread_attr_t attr;
  306. gmpi_state_t *st = NULL;
  307. st = gmpi_state_create(job);
  308. /*
  309. * It is possible for one to modify the mpirun command in
  310. * MPICH-GM distribution so that it calls srun, instead of
  311. * rsh, for remote process invocations. In that case, we
  312. * should not override envs nor open the master port.
  313. */
  314. if (getenv("GMPI_PORT"))
  315. return st;
  316. if (net_stream_listen (&st->fd, &port) < 0) {
  317. error ("Unable to create GMPI listen port: %m");
  318. gmpi_state_destroy(st);
  319. return NULL;
  320. }
  321. /*
  322. * Accept in a separate thread.
  323. */
  324. slurm_attr_init(&attr);
  325. pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
  326. if (pthread_create(&st->tid, &attr, &_gmpi_thr, (void *)st)) {
  327. slurm_attr_destroy(&attr);
  328. gmpi_state_destroy(st);
  329. return NULL;
  330. }
  331. slurm_attr_destroy(&attr);
  332. env_array_overwrite_fmt(env, "GMPI_PORT", "%hu", port);
  333. env_array_overwrite_fmt(env, "GMPI_MAGIC", "%u", job->jobid);
  334. env_array_overwrite_fmt(env, "GMPI_NP", "%d",
  335. job->step_layout->task_cnt);
  336. env_array_overwrite_fmt(env, "GMPI_SHMEM", "1");
  337. /* FIXME for multi-board config. */
  338. env_array_overwrite_fmt(env, "GMPI_BOARD", "-1");
  339. /* For new MX version */
  340. env_array_overwrite_fmt(env, "MXMPI_PORT", "%hu", port);
  341. env_array_overwrite_fmt(env, "MXMPI_MAGIC", "%u", job->jobid);
  342. env_array_overwrite_fmt(env, "MXMPI_NP", "%d",
  343. job->step_layout->task_cnt);
  344. /* FIXME for multi-board config. */
  345. env_array_overwrite_fmt(env, "MXMPI_BOARD", "-1");
  346. /* for MACOSX to override default malloc */
  347. env_array_overwrite_fmt(env, "DYLD_FORCE_FLAT_NAMESPACE", "1");
  348. debug("Started GMPI master thread (%lu)", (unsigned long) st->tid);
  349. return st;
  350. }
  351. /*
  352. * Warning: This pthread_cancel/pthread_join is a little unsafe. The thread is
  353. * not joinable, so on most systems the join will fail, then the thread's state
  354. * will be destroyed, possibly before the thread has actually stopped. In
  355. * practice the thread will usually be waiting on an accept call when it gets
  356. * cancelled. If the mpi thread has a mutex locked when it is cancelled--while
  357. * using the "info" or "error" functions for logging--the caller will deadlock.
  358. * See mpich1_p4.c or mvapich.c for code that shuts down cleanly by letting
  359. * the mpi thread wait on a poll call, and creating a pipe that the poll waits
  360. * on, which can be written to by the main thread to tell the mpi thread to
  361. * exit. Also see rev 18654 of mpichmx.c, on
  362. * branches/slurm-2.1.mpi.plugin.cleanup for an implementation. There were no
  363. * myrinet systems available for testing, which is why I couldn't complete the
  364. * patch for this plugin. -djb
  365. */
  366. extern int gmpi_thr_destroy(gmpi_state_t *st)
  367. {
  368. if (st != NULL) {
  369. if (st->tid != (pthread_t)-1) {
  370. pthread_cancel(st->tid);
  371. pthread_join(st->tid, NULL);
  372. }
  373. gmpi_state_destroy(st);
  374. }
  375. return SLURM_SUCCESS;
  376. }