PageRenderTime 49ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/src/plugins/launch/aprun/launch_aprun.c

https://github.com/cfenoy/slurm
C | 743 lines | 470 code | 85 blank | 188 comment | 104 complexity | 94132960f7496de763be480c4348c178 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
  1. /*****************************************************************************\
  2. * launch_aprun.c - Define job launch using Cray's aprun.
  3. *
  4. *****************************************************************************
  5. * Copyright (C) 2012 SchedMD LLC
  6. * Written by Danny Auble <da@schedmd.com>
  7. *
  8. * This file is part of SLURM, a resource management program.
  9. * For details, see <http://www.schedmd.com/slurmdocs/>.
  10. * Please also read the included file: DISCLAIMER.
  11. *
  12. * SLURM is free software; you can redistribute it and/or modify it under
  13. * the terms of the GNU General Public License as published by the Free
  14. * Software Foundation; either version 2 of the License, or (at your option)
  15. * any later version.
  16. *
  17. * In addition, as a special exception, the copyright holders give permission
  18. * to link the code of portions of this program with the OpenSSL library under
  19. * certain conditions as described in each individual source file, and
  20. * distribute linked combinations including the two. You must obey the GNU
  21. * General Public License in all respects for all of the code used other than
  22. * OpenSSL. If you modify file(s) with this exception, you may extend this
  23. * exception to your version of the file(s), but you are not obligated to do
  24. * so. If you do not wish to do so, delete this exception statement from your
  25. * version. If you delete this exception statement from all source files in
  26. * the program, then also delete it here.
  27. *
  28. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  29. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  30. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  31. * details.
  32. *
  33. * You should have received a copy of the GNU General Public License along
  34. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  35. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  36. \*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>

#include "src/common/slurm_xlator.h"
#include "src/common/parse_time.h"
#include "src/srun/libsrun/launch.h"
#include "src/srun/libsrun/multi_prog.h"
#include "src/api/step_ctx.h"
#include "src/api/step_launch.h"
  48. /*
  49. * These variables are required by the generic plugin interface. If they
  50. * are not found in the plugin, the plugin loader will ignore it.
  51. *
  52. * plugin_name - a string giving a human-readable description of the
  53. * plugin. There is no maximum length, but the symbol must refer to
  54. * a valid string.
  55. *
  56. * plugin_type - a string suggesting the type of the plugin or its
  57. * applicability to a particular form of data or method of data handling.
  58. * If the low-level plugin API is used, the contents of this string are
  59. * unimportant and may be anything. SLURM uses the higher-level plugin
  60. * interface which requires this string to be of the form
  61. *
  62. * <application>/<method>
  63. *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "launch" for job launch) and <method> is a description
 * of how this plugin satisfies that application. SLURM will only load
 * a launch plugin if the plugin_type string has a prefix of "launch/".
  68. *
  69. * plugin_version - an unsigned 32-bit integer giving the version number
  70. * of the plugin. If major and minor revisions are desired, the major
  71. * version number may be multiplied by a suitable magnitude constant such
  72. * as 100 or 1000. Various SLURM versions will likely require a certain
  73. * minimum version for their plugins as this API matures.
  74. */
  75. const char plugin_name[] = "launch aprun plugin";
  76. const char plugin_type[] = "launch/aprun";
  77. const uint32_t plugin_version = 101;
  78. static pid_t aprun_pid = 0;
  79. extern void launch_p_fwd_signal(int signal);
  80. /* Convert a SLURM hostlist expression into the equivalent node index
  81. * value expression.
  82. */
  83. static char *_get_nids(char *nodelist)
  84. {
  85. char *nids;
  86. int i = 0, i2 = 0;
  87. if (!nodelist)
  88. return NULL;
  89. // info("got %s", nodelist);
  90. nids = xmalloc(sizeof(char) * strlen(nodelist));
  91. while (nodelist[i] && !isdigit(nodelist[i]))
  92. i++;
  93. while (nodelist[i] && nodelist[i] != ']')
  94. nids[i2++] = nodelist[i++];
  95. // info("returning %s", nids);
  96. return nids;
  97. }
  98. /*
  99. * Parse a multi-prog input file line
  100. * line IN - line to parse
  101. * command_pos IN/OUT - where in opt.argv we are
  102. * count IN - which command we are on
  103. * return 0 if empty line, 1 if added
  104. */
  105. static int _parse_prog_line(char *in_line, int *command_pos, int count)
  106. {
  107. int i, cmd_inx;
  108. int first_task_inx, last_task_inx;
  109. hostset_t hs = NULL;
  110. char *tmp_str = NULL;
  111. xassert(opt.ntasks);
  112. /* Get the task ID string */
  113. for (i = 0; in_line[i]; i++)
  114. if (!isspace(in_line[i]))
  115. break;
  116. if (!in_line[i]) /* empty line */
  117. return 0;
  118. else if (in_line[i] == '#')
  119. return 0;
  120. else if (!isdigit(in_line[i]))
  121. goto bad_line;
  122. first_task_inx = i;
  123. for (i++; in_line[i]; i++) {
  124. if (isspace(in_line[i]))
  125. break;
  126. }
  127. if (!isspace(in_line[i]))
  128. goto bad_line;
  129. last_task_inx = i;
  130. /* Now transfer data to the function arguments */
  131. in_line[last_task_inx] = '\0';
  132. xstrfmtcat(tmp_str, "[%s]", in_line + first_task_inx);
  133. hs = hostset_create(tmp_str);
  134. xfree(tmp_str);
  135. in_line[last_task_inx] = ' ';
  136. if (!hs)
  137. goto bad_line;
  138. if (count) {
  139. opt.argc += 1;
  140. xrealloc(opt.argv, opt.argc * sizeof(char *));
  141. opt.argv[(*command_pos)++] = xstrdup(":");
  142. }
  143. opt.argc += 2;
  144. xrealloc(opt.argv, opt.argc * sizeof(char *));
  145. opt.argv[(*command_pos)++] = xstrdup("-n");
  146. opt.argv[(*command_pos)++] = xstrdup_printf("%d", hostset_count(hs));
  147. hostset_destroy(hs);
  148. /* Get the command */
  149. for (i++; in_line[i]; i++) {
  150. if (!isspace(in_line[i]))
  151. break;
  152. }
  153. if (in_line[i] == '\0')
  154. goto bad_line;
  155. cmd_inx = i;
  156. for ( ; in_line[i]; i++) {
  157. if (isspace(in_line[i])) {
  158. if (i > cmd_inx) {
  159. int diff = i - cmd_inx + 1;
  160. char tmp_char[diff + 1];
  161. snprintf(tmp_char, diff, "%s",
  162. in_line + cmd_inx);
  163. opt.argc += 1;
  164. xrealloc(opt.argv, opt.argc * sizeof(char *));
  165. opt.argv[(*command_pos)++] = xstrdup(tmp_char);
  166. }
  167. cmd_inx = i + 1;
  168. } else if (in_line[i] == '\n')
  169. break;
  170. }
  171. return 1;
  172. bad_line:
  173. error("invalid input line: %s", in_line);
  174. return 0;
  175. }
  176. static void _handle_multi_prog(char *in_file, int *command_pos)
  177. {
  178. char in_line[512];
  179. FILE *fp;
  180. int count = 0;
  181. if (verify_multi_name(in_file, opt.ntasks))
  182. exit(error_exit);
  183. fp = fopen(in_file, "r");
  184. if (!fp) {
  185. fatal("fopen(%s): %m", in_file);
  186. return;
  187. }
  188. while (fgets(in_line, sizeof(in_line), fp)) {
  189. if (_parse_prog_line(in_line, command_pos, count))
  190. count++;
  191. }
  192. return;
  193. }
  194. static void _unblock_signals(void)
  195. {
  196. sigset_t set;
  197. int i;
  198. for (i = 0; sig_array[i]; i++) {
  199. /* eliminate pending signals, then set to default */
  200. xsignal(sig_array[i], SIG_IGN);
  201. xsignal(sig_array[i], SIG_DFL);
  202. }
  203. sigemptyset(&set);
  204. xsignal_set_mask (&set);
  205. }
  206. static void _send_step_complete_rpc(srun_job_t *srun_job, int step_rc)
  207. {
  208. slurm_msg_t req;
  209. step_complete_msg_t msg;
  210. int rc;
  211. memset(&msg, 0, sizeof(step_complete_msg_t));
  212. msg.job_id = srun_job->jobid;
  213. msg.job_step_id = srun_job->stepid;
  214. msg.range_first = 0;
  215. msg.range_last = 0;
  216. msg.step_rc = step_rc;
  217. msg.jobacct = jobacctinfo_create(NULL);
  218. slurm_msg_t_init(&req);
  219. req.msg_type = REQUEST_STEP_COMPLETE;
  220. req.data = &msg;
  221. /* req.address = step_complete.parent_addr; */
  222. debug3("Sending step complete RPC to slurmctld");
  223. if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
  224. error("Error sending step complete RPC to slurmctld");
  225. jobacctinfo_destroy(msg.jobacct);
  226. }
  227. static void _handle_step_complete(srun_job_complete_msg_t *comp_msg)
  228. {
  229. launch_p_fwd_signal(SIGKILL);
  230. return;
  231. }
  232. static void _handle_timeout(srun_timeout_msg_t *timeout_msg)
  233. {
  234. time_t now = time(NULL);
  235. char time_str[24];
  236. if (now < timeout_msg->timeout) {
  237. slurm_make_time_str(&timeout_msg->timeout,
  238. time_str, sizeof(time_str));
  239. debug("step %u.%u will timeout at %s",
  240. timeout_msg->job_id, timeout_msg->step_id, time_str);
  241. return;
  242. }
  243. slurm_make_time_str(&now, time_str, sizeof(time_str));
  244. error("*** STEP %u.%u CANCELLED AT %s DUE TO TIME LIMIT ***",
  245. timeout_msg->job_id, timeout_msg->step_id, time_str);
  246. launch_p_fwd_signal(SIGKILL);
  247. return;
  248. }
  249. static void _handle_msg(slurm_msg_t *msg)
  250. {
  251. static uint32_t slurm_uid = NO_VAL;
  252. uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
  253. uid_t uid = getuid();
  254. job_step_kill_msg_t *ss;
  255. srun_user_msg_t *um;
  256. if (slurm_uid == NO_VAL)
  257. slurm_uid = slurm_get_slurm_user_id();
  258. if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
  259. error ("Security violation, slurm message from uid %u",
  260. (unsigned int) req_uid);
  261. return;
  262. }
  263. switch (msg->msg_type) {
  264. case SRUN_PING:
  265. debug3("slurmctld ping received");
  266. slurm_send_rc_msg(msg, SLURM_SUCCESS);
  267. slurm_free_srun_ping_msg(msg->data);
  268. break;
  269. case SRUN_JOB_COMPLETE:
  270. debug("received job step complete message");
  271. _handle_step_complete(msg->data);
  272. slurm_free_srun_job_complete_msg(msg->data);
  273. break;
  274. case SRUN_USER_MSG:
  275. um = msg->data;
  276. info("%s", um->msg);
  277. slurm_free_srun_user_msg(msg->data);
  278. break;
  279. case SRUN_TIMEOUT:
  280. debug2("received job step timeout message");
  281. _handle_timeout(msg->data);
  282. slurm_free_srun_timeout_msg(msg->data);
  283. break;
  284. case SRUN_STEP_SIGNAL:
  285. ss = msg->data;
  286. debug("received step signal %u RPC", ss->signal);
  287. launch_p_fwd_signal(ss->signal);
  288. slurm_free_job_step_kill_msg(msg->data);
  289. break;
  290. default:
  291. debug("received spurious message type: %u",
  292. msg->msg_type);
  293. break;
  294. }
  295. return;
  296. }
  297. static void *_msg_thr_internal(void *arg)
  298. {
  299. slurm_addr_t cli_addr;
  300. slurm_fd_t newsockfd;
  301. slurm_msg_t *msg;
  302. int *slurmctld_fd_ptr = (int *)arg;
  303. (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
  304. (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
  305. while (!srun_shutdown) {
  306. newsockfd = slurm_accept_msg_conn(*slurmctld_fd_ptr, &cli_addr);
  307. if (newsockfd == SLURM_SOCKET_ERROR) {
  308. if (errno != EINTR)
  309. error("slurm_accept_msg_conn: %m");
  310. continue;
  311. }
  312. msg = xmalloc(sizeof(slurm_msg_t));
  313. if (slurm_receive_msg(newsockfd, msg, 0) != 0) {
  314. error("slurm_receive_msg: %m");
  315. /* close the new socket */
  316. slurm_close_accepted_conn(newsockfd);
  317. continue;
  318. }
  319. _handle_msg(msg);
  320. slurm_free_msg(msg);
  321. slurm_close_accepted_conn(newsockfd);
  322. }
  323. return NULL;
  324. }
  325. static pthread_t _spawn_msg_handler(srun_job_t *job)
  326. {
  327. pthread_attr_t attr;
  328. pthread_t msg_thread;
  329. static int slurmctld_fd;
  330. slurmctld_fd = job->step_ctx->launch_state->slurmctld_socket_fd;
  331. if (slurmctld_fd < 0)
  332. return (pthread_t) 0;
  333. job->step_ctx->launch_state->slurmctld_socket_fd = -1;
  334. slurm_attr_init(&attr);
  335. if (pthread_create(&msg_thread, &attr, _msg_thr_internal,
  336. (void *) &slurmctld_fd))
  337. error("pthread_create of message thread: %m");
  338. slurm_attr_destroy(&attr);
  339. return msg_thread;
  340. }
  341. /*
  342. * init() is called when the plugin is loaded, before any other functions
  343. * are called. Put global initialization here.
  344. */
  345. extern int init(void)
  346. {
  347. verbose("%s loaded", plugin_name);
  348. return SLURM_SUCCESS;
  349. }
  350. /*
  351. * fini() is called when the plugin is removed. Clear any allocated
  352. * storage here.
  353. */
  354. extern int fini(void)
  355. {
  356. return SLURM_SUCCESS;
  357. }
  358. extern int launch_p_setup_srun_opt(char **rest)
  359. {
  360. int command_pos = 0;
  361. if (opt.test_only) {
  362. error("--test-only not supported with aprun");
  363. exit (1);
  364. } else if (opt.no_alloc) {
  365. error("--no-allocate not supported with aprun");
  366. exit (1);
  367. }
  368. if (opt.slurmd_debug != LOG_LEVEL_QUIET) {
  369. error("--slurmd-debug not supported with aprun");
  370. opt.slurmd_debug = LOG_LEVEL_QUIET;
  371. }
  372. opt.argc += 2;
  373. opt.argv = (char **) xmalloc(opt.argc * sizeof(char *));
  374. opt.argv[command_pos++] = xstrdup("aprun");
  375. /* Set default job name to the executable name rather than
  376. * "aprun" */
  377. if (!opt.job_name_set_cmd && (1 < opt.argc)) {
  378. opt.job_name_set_cmd = true;
  379. opt.job_name = xstrdup(rest[0]);
  380. }
  381. if (opt.cpus_per_task) {
  382. opt.argc += 2;
  383. xrealloc(opt.argv, opt.argc * sizeof(char *));
  384. opt.argv[command_pos++] = xstrdup("-d");
  385. opt.argv[command_pos++] = xstrdup_printf(
  386. "%u", opt.cpus_per_task);
  387. }
  388. if (opt.shared != (uint16_t)NO_VAL) {
  389. opt.argc += 2;
  390. xrealloc(opt.argv, opt.argc * sizeof(char *));
  391. opt.argv[command_pos++] = xstrdup("-F");
  392. opt.argv[command_pos++] = xstrdup("share");
  393. } else if (opt.exclusive) {
  394. opt.argc += 2;
  395. xrealloc(opt.argv, opt.argc * sizeof(char *));
  396. opt.argv[command_pos++] = xstrdup("-F");
  397. opt.argv[command_pos++] = xstrdup("exclusive");
  398. }
  399. if (opt.nodelist) {
  400. char *nids = _get_nids(opt.nodelist);
  401. opt.argc += 2;
  402. xrealloc(opt.argv, opt.argc * sizeof(char *));
  403. opt.argv[command_pos++] = xstrdup("-L");
  404. opt.argv[command_pos++] = xstrdup(nids);
  405. xfree(nids);
  406. }
  407. if (opt.mem_per_cpu != NO_VAL) {
  408. opt.argc += 2;
  409. xrealloc(opt.argv, opt.argc * sizeof(char *));
  410. opt.argv[command_pos++] = xstrdup("-m");
  411. opt.argv[command_pos++] = xstrdup_printf("%u", opt.mem_per_cpu);
  412. }
  413. if (opt.ntasks_per_node != NO_VAL) {
  414. opt.argc += 2;
  415. xrealloc(opt.argv, opt.argc * sizeof(char *));
  416. opt.argv[command_pos++] = xstrdup("-N");
  417. opt.argv[command_pos++] = xstrdup_printf(
  418. "%u", opt.ntasks_per_node);
  419. if (!opt.ntasks && opt.min_nodes)
  420. opt.ntasks = opt.ntasks_per_node * opt.min_nodes;
  421. } else if (opt.min_nodes) {
  422. uint32_t tasks_per_node;
  423. if (!opt.ntasks)
  424. opt.ntasks = opt.min_nodes;
  425. tasks_per_node = (opt.ntasks + opt.min_nodes - 1) /
  426. opt.min_nodes;
  427. opt.argc += 2;
  428. xrealloc(opt.argv, opt.argc * sizeof(char *));
  429. opt.argv[command_pos++] = xstrdup("-N");
  430. opt.argv[command_pos++] = xstrdup_printf("%u", tasks_per_node);
  431. }
  432. if (opt.ntasks && !opt.multi_prog) {
  433. opt.argc += 2;
  434. xrealloc(opt.argv, opt.argc * sizeof(char *));
  435. opt.argv[command_pos++] = xstrdup("-n");
  436. opt.argv[command_pos++] = xstrdup_printf("%u", opt.ntasks);
  437. }
  438. if ((_verbose < 3) || opt.quiet) {
  439. opt.argc += 1;
  440. xrealloc(opt.argv, opt.argc * sizeof(char *));
  441. opt.argv[command_pos++] = xstrdup("-q");
  442. }
  443. if (opt.ntasks_per_socket != NO_VAL) {
  444. opt.argc += 2;
  445. xrealloc(opt.argv, opt.argc * sizeof(char *));
  446. opt.argv[command_pos++] = xstrdup("-S");
  447. opt.argv[command_pos++] = xstrdup_printf(
  448. "%u", opt.ntasks_per_socket);
  449. }
  450. if (opt.sockets_per_node != NO_VAL) {
  451. opt.argc += 2;
  452. xrealloc(opt.argv, opt.argc * sizeof(char *));
  453. opt.argv[command_pos++] = xstrdup("-sn");
  454. opt.argv[command_pos++] = xstrdup_printf(
  455. "%u", opt.sockets_per_node);
  456. }
  457. if (opt.mem_bind && strstr(opt.mem_bind, "local")) {
  458. opt.argc += 1;
  459. xrealloc(opt.argv, opt.argc * sizeof(char *));
  460. opt.argv[command_pos++] = xstrdup("-ss");
  461. }
  462. if (opt.time_limit_str) {
  463. opt.argc += 2;
  464. xrealloc(opt.argv, opt.argc * sizeof(char *));
  465. opt.argv[command_pos++] = xstrdup("-t");
  466. opt.argv[command_pos++] = xstrdup_printf(
  467. "%d", time_str2secs(opt.time_limit_str));
  468. }
  469. if (opt.launcher_opts) {
  470. char *save_ptr = NULL, *tok;
  471. char *tmp = xstrdup(opt.launcher_opts);
  472. tok = strtok_r(tmp, " ", &save_ptr);
  473. while (tok) {
  474. opt.argc++;
  475. xrealloc(opt.argv, opt.argc * sizeof(char *));
  476. opt.argv[command_pos++] = xstrdup(tok);
  477. tok = strtok_r(NULL, " ", &save_ptr);
  478. }
  479. xfree(tmp);
  480. }
  481. /* These are srun options that are not supported by aprun, but
  482. here just in case in the future they add them.
  483. if (opt.disable_status) {
  484. xstrcat(cmd_line, " --disable-status");
  485. }
  486. if (opt.epilog) {
  487. xstrfmtcat(cmd_line, " --epilog=", opt.epilog);
  488. }
  489. if (kill_on_bad_exit) {
  490. xstrcat(cmd_line, " --kill-on-bad-exit");
  491. }
  492. if (label) {
  493. xstrcat(cmd_line, " --label");
  494. }
  495. if (opt.mpi_type) {
  496. xstrfmtcat(cmd_line, " --mpi=", opt.mpi_type);
  497. }
  498. if (opt.msg_timeout) {
  499. xstrfmtcat(cmd_line, " --msg-timeout=", opt.msg_timeout);
  500. }
  501. if (no_allocate) {
  502. xstrcat(cmd_line, " --no-allocate");
  503. }
  504. if (opt.open_mode) {
  505. xstrcat(cmd_line, " --open-mode=", opt.open_mode);
  506. }
  507. if (preserve_env) {
  508. xstrcat(cmd_line, " --preserve_env");
  509. }
  510. if (opt.prolog) {
  511. xstrcat(cmd_line, " --prolog=", opt.prolog );
  512. }
  513. if (opt.propagate) {
  514. xstrcat(cmd_line, " --propagate", opt.propagate );
  515. }
  516. if (pty) {
  517. xstrcat(cmd_line, " --pty");
  518. }
  519. if (quit_on_interrupt) {
  520. xstrcat(cmd_line, " --quit-on-interrupt");
  521. }
  522. if (opt.relative) {
  523. xstrfmtcat(cmd_line, " --relative=", opt.relative);
  524. }
  525. if (restart_dir) {
  526. xstrfmtcat(cmd_line, " --restart-dir=", opt.restart_dir);
  527. }
  528. if (resv_port) {
  529. xstrcat(cmd_line, "--resv-port");
  530. }
  531. if (opt.slurm_debug) {
  532. xstrfmtcat(cmd_line, " --slurmd-debug=", opt.slurm_debug);
  533. }
  534. if (opttask_epilog) {
  535. xstrfmtcat(cmd_line, " --task-epilog=", opt.task_epilog);
  536. }
  537. if (opt.task_prolog) {
  538. xstrfmtcat(cmd_line, " --task-prolog", opt.task_prolog);
  539. }
  540. if (test_only) {
  541. xstrcat(cmd_line, " --test-only");
  542. }
  543. if (unbuffered) {
  544. xstrcat(cmd_line, " --unbuffered");
  545. }
  546. */
  547. if (opt.multi_prog) {
  548. _handle_multi_prog(rest[0], &command_pos);
  549. /* just so we don't tack on the script to the aprun line */
  550. command_pos = opt.argc;
  551. }
  552. return command_pos;
  553. }
  554. extern int launch_p_handle_multi_prog_verify(int command_pos)
  555. {
  556. if (opt.multi_prog)
  557. return 1;
  558. return 0;
  559. }
  560. extern int launch_p_create_job_step(srun_job_t *job, bool use_all_cpus,
  561. void (*signal_function)(int),
  562. sig_atomic_t *destroy_job)
  563. {
  564. if (opt.launch_cmd) {
  565. int i = 0;
  566. char *cmd_line = NULL;
  567. while (opt.argv[i])
  568. xstrfmtcat(cmd_line, "%s ", opt.argv[i++]);
  569. printf("%s\n", cmd_line);
  570. xfree(cmd_line);
  571. exit(0);
  572. }
  573. return launch_common_create_job_step(job, use_all_cpus,
  574. signal_function,
  575. destroy_job);
  576. }
  577. extern int launch_p_step_launch(
  578. srun_job_t *job, slurm_step_io_fds_t *cio_fds, uint32_t *global_rc,
  579. slurm_step_launch_callbacks_t *step_callbacks)
  580. {
  581. int rc = 0;
  582. pthread_t msg_thread = _spawn_msg_handler(job);
  583. aprun_pid = fork();
  584. if (aprun_pid < 0) {
  585. error("fork: %m");
  586. return 1;
  587. } else if (aprun_pid > 0) {
  588. if (waitpid(aprun_pid, &rc, 0) < 0)
  589. error("Unable to reap aprun child process");
  590. *global_rc = rc;
  591. /* Just because waitpid returns something doesn't mean
  592. this function failed so always set it back to 0.
  593. */
  594. rc = 0;
  595. } else {
  596. setpgrp();
  597. _unblock_signals();
  598. /* dup stdio onto our open fds */
  599. if ((dup2(cio_fds->in.fd, 0) == -1) ||
  600. (dup2(cio_fds->out.fd, 1) == -1) ||
  601. (dup2(cio_fds->err.fd, 2) == -1)) {
  602. error("dup2: %m");
  603. return 1;
  604. }
  605. execvp(opt.argv[0], opt.argv);
  606. error("execv(aprun) error: %m");
  607. return 1;
  608. }
  609. _send_step_complete_rpc(job, *global_rc);
  610. if (msg_thread) {
  611. srun_shutdown = true;
  612. pthread_cancel(msg_thread);
  613. pthread_join(msg_thread, NULL);
  614. }
  615. return rc;
  616. }
  617. extern int launch_p_step_wait(srun_job_t *job, bool got_alloc)
  618. {
  619. return SLURM_SUCCESS;
  620. }
  621. extern int launch_p_step_terminate(void)
  622. {
  623. return SLURM_SUCCESS;
  624. }
  625. extern void launch_p_print_status(void)
  626. {
  627. }
  628. extern void launch_p_fwd_signal(int signal)
  629. {
  630. if (aprun_pid)
  631. kill(aprun_pid, signal);
  632. }