PageRenderTime 71ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 1ms

/src/plugins/switch/nrt/libpermapi/shr_64.c

https://github.com/cfenoy/slurm
C | 1859 lines | 1325 code | 169 blank | 365 comment | 358 complexity | 6cc10b5ccb557312074b347a5735a7c6 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0

Large files files are truncated, but you can click here to view the full file

  1. /*****************************************************************************\
  2. * shr_64.c - This plug is used by POE to interact with SLURM.
  3. *
  4. *****************************************************************************
  5. * Copyright (C) 2012 SchedMD LLC.
  6. * Written by Danny Auble <da@schedmd.com> et. al.
  7. *
  8. * This file is part of SLURM, a resource management program.
  9. * For details, see <http://www.schedmd.com/slurmdocs/>.
  10. * Please also read the included file: DISCLAIMER.
  11. *
  12. * SLURM is free software; you can redistribute it and/or modify it under
  13. * the terms of the GNU General Public License as published by the Free
  14. * Software Foundation; either version 2 of the License, or (at your option)
  15. * any later version.
  16. *
  17. * In addition, as a special exception, the copyright holders give permission
  18. * to link the code of portions of this program with the OpenSSL library under
  19. * certain conditions as described in each individual source file, and
  20. * distribute linked combinations including the two. You must obey the GNU
  21. * General Public License in all respects for all of the code used other than
  22. * OpenSSL. If you modify file(s) with this exception, you may extend this
  23. * exception to your version of the file(s), but you are not obligated to do
  24. * so. If you do not wish to do so, delete this exception statement from your
  25. * version. If you delete this exception statement from all source files in
  26. * the program, then also delete it here.
  27. *
  28. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  29. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  30. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  31. * details.
  32. *
  33. * You should have received a copy of the GNU General Public License along
  34. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  35. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  36. \*****************************************************************************/
  37. #include <permapi.h>
  38. #include <ctype.h>
  39. #include <dlfcn.h>
  40. #include <fcntl.h>
  41. #include <stdlib.h>
  42. #include <unistd.h>
  43. #include <arpa/inet.h>
  44. #ifdef HAVE_CONFIG_H
  45. # include "config.h"
  46. #endif
  47. #include "src/common/slurm_xlator.h"
  48. #include "slurm/slurm.h"
  49. #include "slurm/slurm_errno.h"
  50. #include "src/api/step_ctx.h"
  51. #include "src/common/hostlist.h"
  52. #include "src/common/list.h"
  53. #include "src/common/log.h"
  54. #include "src/common/parse_time.h"
  55. #include "src/common/plugstack.h"
  56. #include "src/common/slurm_protocol_pack.h"
  57. #include "src/common/xmalloc.h"
  58. #include "src/common/xstring.h"
  59. #include "src/srun/libsrun/allocate.h"
  60. #include "src/srun/libsrun/launch.h"
  61. #include "src/srun/libsrun/opt.h"
  62. #include "src/srun/libsrun/srun_job.h"
  63. #include "src/plugins/switch/nrt/nrt_keys.h"
/* Flags exported to the linked srun launch code */
bool srun_max_timer = false;
bool srun_shutdown = false;
/* POE MPMD command file name; unlinked in pe_rm_free() */
static char *poe_cmd_fname = NULL;
/* dlopen() handle, closed in pe_rm_free() */
static void *my_handle = NULL;
/* Current job record shared by all pe_rm_* entry points */
static srun_job_t *job = NULL;
static bool got_alloc = false;
static bool slurm_started = false;
static log_options_t log_opts = LOG_OPTS_STDERR_ONLY;
/* Table mapping host names to addresses; terminated by a zeroed entry */
static host_usage_t *host_usage = NULL;
/* Accumulated list of every host seen by pe_rm_connect() */
static hostlist_t total_hl = NULL;
/* Zero-terminated list of signals this plugin cares about */
int sig_array[] = {
	SIGINT, SIGQUIT, SIGCONT, SIGTERM, SIGHUP,
	SIGALRM, SIGUSR1, SIGUSR2, SIGPIPE, 0 };
extern char **environ;
/* IBM internal definitions to get information on how and who is
 * calling us.
 */
#define PM_POE 0
#define PM_PMD 1
extern int pm_type;
extern int pmdlog;
extern FILE *pmd_lfp;
/* Argument bundle handed to _agent_thread(); the thread xfree()s it */
typedef struct agent_data {
	uint32_t fe_auth_key;		/* shared secret from _gen_auth_key() */
	slurm_fd_t fe_comm_socket;	/* listening socket for PMD connections */
} agent_data_t;
  90. static char *_name_from_addr(char *addr)
  91. {
  92. host_usage_t *host_ptr;
  93. xassert(host_usage);
  94. host_ptr = host_usage;
  95. while (host_ptr && host_ptr->host_address) {
  96. if (!strcmp(addr, host_ptr->host_address))
  97. return host_ptr->host_name;
  98. host_ptr++;
  99. }
  100. return NULL;
  101. }
  102. static void _pack_srun_ctx(slurm_step_ctx_t *ctx, Buf buffer)
  103. {
  104. uint8_t tmp_8 = 0;
  105. if (ctx)
  106. tmp_8 = 1;
  107. pack8(tmp_8, buffer);
  108. if (!ctx || !ctx->step_req || !ctx->step_resp) {
  109. error("_pack_srun_ctx: ctx is NULL");
  110. return;
  111. }
  112. pack_job_step_create_request_msg(ctx->step_req, buffer,
  113. SLURM_PROTOCOL_VERSION);
  114. pack_job_step_create_response_msg(ctx->step_resp, buffer,
  115. SLURM_PROTOCOL_VERSION);
  116. }
/* Unpack a step context packed by _pack_srun_ctx().
 * OUT step_ctx - newly xmalloc'ed context on success, NULL on failure
 * RET SLURM_SUCCESS or SLURM_ERROR */
static int _unpack_srun_ctx(slurm_step_ctx_t **step_ctx, Buf buffer)
{
	slurm_step_ctx_t *ctx = NULL;
	uint8_t tmp_8;
	int rc;

	*step_ctx = NULL;
	/* Leading flag from _pack_srun_ctx(): 0 means the sender had no
	 * complete context and nothing more follows in the buffer.
	 * Note the safe_unpack* macros jump to unpack_error on failure. */
	safe_unpack8(&tmp_8, buffer);
	if (tmp_8 == 0) {
		error("_unpack_srun_ctx: ctx is NULL");
		return SLURM_ERROR;
	}
	ctx = xmalloc(sizeof(slurm_step_ctx_t));
	ctx->magic = STEP_CTX_MAGIC;
	rc = unpack_job_step_create_request_msg(&ctx->step_req, buffer,
						SLURM_PROTOCOL_VERSION);
	if (rc != SLURM_SUCCESS)
		goto unpack_error;
	rc = unpack_job_step_create_response_msg(&ctx->step_resp, buffer,
						 SLURM_PROTOCOL_VERSION);
	if (rc != SLURM_SUCCESS)
		goto unpack_error;
	*step_ctx = ctx;
	return SLURM_SUCCESS;

unpack_error:
	error("_unpack_srun_ctx: unpack error");
	/* Free whichever messages were unpacked before the failure */
	if (ctx && ctx->step_req)
		slurm_free_job_step_create_request_msg(ctx->step_req);
	if (ctx && ctx->step_resp)
		slurm_free_job_step_create_response_msg(ctx->step_resp);
	xfree(ctx);
	return SLURM_ERROR;
}
/* Pack the global job record plus the host_usage name/address table
 * into a newly allocated buffer for transmission to a PMD.
 * Caller must free_buf() the result.  Unpacked by _unpack_srun_job_rec(). */
static Buf _pack_srun_job_rec(void)
{
	Buf buffer;
	host_usage_t *host_ptr;

	buffer = init_buf(4096);
	pack32(job->nhosts, buffer);
	packstr(job->alias_list, buffer);
	packstr(job->nodelist, buffer);
	_pack_srun_ctx(job->step_ctx, buffer);
	/* Since we can't rely on slurm_conf_get_nodename_from_addr
	 * working on a PERCS machine reliably we will sort all the
	 * IP's as we know them and ship them over if/when a PMD needs to
	 * forward the fanout. */
	xassert(host_usage);
	host_ptr = host_usage;
	/* host_usage is terminated by an entry with a NULL host_name;
	 * the receiver reads exactly nhosts entries back. */
	while (host_ptr && host_ptr->host_name) {
		packstr(host_ptr->host_name, buffer);
		packstr(host_ptr->host_address, buffer);
		host_ptr++;
	}
	return buffer;
}
  172. static srun_job_t * _unpack_srun_job_rec(Buf buffer)
  173. {
  174. uint32_t tmp_32;
  175. srun_job_t *job_data;
  176. host_usage_t *host_ptr;
  177. int i;
  178. job_data = xmalloc(sizeof(srun_job_t));
  179. safe_unpack32(&job_data->nhosts, buffer);
  180. safe_unpackstr_xmalloc(&job_data->alias_list, &tmp_32, buffer);
  181. safe_unpackstr_xmalloc(&job_data->nodelist, &tmp_32, buffer);
  182. if (_unpack_srun_ctx(&job_data->step_ctx, buffer))
  183. goto unpack_error;
  184. host_usage = xmalloc(sizeof(host_usage_t) * (job_data->nhosts+1));
  185. host_ptr = host_usage;
  186. for (i=0; i<job_data->nhosts; i++) {
  187. safe_unpackstr_xmalloc(&host_ptr->host_name, &tmp_32, buffer);
  188. safe_unpackstr_xmalloc(&host_ptr->host_address,
  189. &tmp_32, buffer);
  190. host_ptr++;
  191. }
  192. return job_data;
  193. unpack_error:
  194. error("_unpack_srun_job_rec: unpack error");
  195. xfree(job_data->alias_list);
  196. xfree(job_data->nodelist);
  197. xfree(job_data);
  198. return NULL;
  199. }
/* Validate a message connection.
 * Waits up to 10 seconds for the peer to send a 32-bit key and
 * compares it against auth_key (see _gen_auth_key()).
 * Return: true=valid/authenticated */
static bool _validate_connect(slurm_fd_t socket_conn, uint32_t auth_key)
{
	struct timeval tv;
	fd_set read_fds;
	uint32_t read_key;
	bool valid = false;
	int i, n_fds;

	n_fds = socket_conn;
	while (1) {
		FD_ZERO(&read_fds);
		FD_SET(socket_conn, &read_fds);
		tv.tv_sec = 10;
		tv.tv_usec = 0;
		i = select((n_fds + 1), &read_fds, NULL, NULL, &tv);
		if (i == 0)
			break;	/* timed out, no key arrived */
		if (i < 0) {
			if (errno == EINTR)
				continue;	/* interrupted: retry */
			break;
		}
		/* NOTE(review): assumes the whole key arrives in one read
		 * and that both peers share byte order — confirm */
		i = slurm_read_stream(socket_conn, (char *)&read_key,
				      sizeof(read_key));
		if ((i == sizeof(read_key)) && (read_key == auth_key)) {
			valid = true;
		} else {
			error("error validating incoming socket connection");
			sleep(1); /* Help prevent brute force attack */
		}
		break;
	}
	return valid;
}
  235. /* Process a message from PMD */
  236. static void _agent_proc_connect(slurm_fd_t fe_comm_socket,uint32_t fe_auth_key)
  237. {
  238. slurm_fd_t fe_comm_conn = -1;
  239. slurm_addr_t be_addr;
  240. bool be_connected = false;
  241. Buf buffer = NULL;
  242. uint32_t buf_size;
  243. char *buf_data;
  244. int i, offset = 0;
  245. while (1) {
  246. fe_comm_conn = slurm_accept_stream(fe_comm_socket, &be_addr);
  247. if (fe_comm_conn != SLURM_SOCKET_ERROR) {
  248. if (_validate_connect(fe_comm_conn, fe_auth_key))
  249. be_connected = true;
  250. break;
  251. }
  252. if (errno != EINTR) {
  253. error("slurm_accept_stream: %m");
  254. break;
  255. }
  256. }
  257. if (!be_connected)
  258. goto fini;
  259. buffer = _pack_srun_job_rec();
  260. buf_size = get_buf_offset(buffer);
  261. buf_data = (char *) &buf_size;
  262. i = slurm_write_stream_timeout(fe_comm_conn, buf_data,
  263. sizeof(buf_size), 8000);
  264. if (i < sizeof(buf_size)) {
  265. error("_agent_proc_connect write: %m");
  266. goto fini;
  267. }
  268. buf_data = get_buf_data(buffer);
  269. while (buf_size > offset) {
  270. i = slurm_write_stream_timeout(fe_comm_conn, buf_data + offset,
  271. buf_size - offset, 8000);
  272. if (i < 0) {
  273. if ((errno != EAGAIN) && (errno != EINTR)) {
  274. error("_agent_proc_connect write: %m");
  275. break;
  276. }
  277. } else if (i > 0) {
  278. offset += i;
  279. } else {
  280. error("_agent_proc_connect write: timeout");
  281. break;
  282. }
  283. }
  284. fini: if (fe_comm_conn >= 0)
  285. slurm_close_accepted_conn(fe_comm_conn);
  286. if (buffer)
  287. free_buf(buffer);
  288. }
  289. /* Thread to wait for and process messgaes from PMD (via libpermapi) */
  290. static void *_agent_thread(void *arg)
  291. {
  292. agent_data_t *agent_data_ptr = (agent_data_t *) arg;
  293. uint32_t fe_auth_key = agent_data_ptr->fe_auth_key;
  294. slurm_fd_t fe_comm_socket = agent_data_ptr->fe_comm_socket;
  295. fd_set except_fds, read_fds;
  296. struct timeval tv;
  297. int i, n_fds;
  298. xfree(agent_data_ptr);
  299. n_fds = fe_comm_socket;
  300. while (fe_comm_socket >= 0) {
  301. FD_ZERO(&except_fds);
  302. FD_SET(fe_comm_socket, &except_fds);
  303. FD_ZERO(&read_fds);
  304. FD_SET(fe_comm_socket, &read_fds);
  305. tv.tv_sec = 0;
  306. tv.tv_usec = 0;
  307. i = select((n_fds + 1), &read_fds, NULL, &except_fds, &tv);
  308. if ((i == 0) ||
  309. ((i == -1) && (errno == EINTR))) {
  310. ;
  311. } else if (i == -1) {
  312. error("select(): %m");
  313. break;
  314. } else { /* i > 0, ready for I/O */
  315. _agent_proc_connect(fe_comm_socket, fe_auth_key);;
  316. }
  317. }
  318. slurm_shutdown_msg_engine(fe_comm_socket);
  319. return NULL;
  320. }
  321. /* Generate and return a pseudo-random 32-bit authentication key */
  322. static uint32_t _gen_auth_key(void)
  323. {
  324. struct timeval tv;
  325. uint32_t key;
  326. gettimeofday(&tv, NULL);
  327. key = (tv.tv_sec % 1000) * 1000000;
  328. key += tv.tv_usec;
  329. return key;
  330. }
  331. /* Spawn a shell to receive communications from PMD and spawn additional
  332. * PMD on other nodes using a fanout mechanism other than SLURM. */
  333. static void _spawn_fe_agent(void)
  334. {
  335. char hostname[256];
  336. uint32_t fe_auth_key = 0;
  337. slurm_fd_t fe_comm_socket = -1;
  338. slurm_addr_t comm_addr;
  339. uint16_t comm_port;
  340. pthread_attr_t agent_attr;
  341. pthread_t agent_tid;
  342. agent_data_t *agent_data_ptr;
  343. /* Open socket for back-end program to communicate with */
  344. if ((fe_comm_socket = slurm_init_msg_engine_port(0)) < 0) {
  345. error("init_msg_engine_port: %m");
  346. return;
  347. }
  348. if (slurm_get_stream_addr(fe_comm_socket, &comm_addr) < 0) {
  349. error("slurm_get_stream_addr: %m");
  350. return;
  351. }
  352. comm_port = ntohs(((struct sockaddr_in) comm_addr).sin_port);
  353. fe_auth_key = _gen_auth_key();
  354. if (gethostname_short(hostname, sizeof(hostname)))
  355. fatal("gethostname_short(): %m");
  356. /* Set up environment variables for the plugin (as called by PMD)
  357. * to load job information */
  358. setenvfs("SLURM_FE_KEY=%u", fe_auth_key);
  359. setenvfs("SLURM_FE_SOCKET=%s:%hu", hostname, comm_port);
  360. agent_data_ptr = xmalloc(sizeof(agent_data_t));
  361. agent_data_ptr->fe_auth_key = fe_auth_key;
  362. agent_data_ptr->fe_comm_socket = fe_comm_socket;
  363. slurm_attr_init(&agent_attr);
  364. pthread_attr_setdetachstate(&agent_attr, PTHREAD_CREATE_DETACHED);
  365. while ((pthread_create(&agent_tid, &agent_attr, &_agent_thread,
  366. (void *) agent_data_ptr))) {
  367. if (errno != EAGAIN)
  368. fatal("pthread_create(): %m");
  369. sleep(1);
  370. }
  371. slurm_attr_destroy(&agent_attr);
  372. }
  373. /*
  374. * Return a string representation of an array of uint16_t elements.
  375. * Each value in the array is printed in decimal notation and elements
  376. * are separated by a comma. If sequential elements in the array
  377. * contain the same value, the value is written out just once followed
  378. * by "(xN)", where "N" is the number of times the value is repeated.
  379. *
  380. * Example:
  381. * The array "1, 2, 1, 1, 1, 3, 2" becomes the string "1,2,1(x3),3,2"
  382. *
  383. * Returns an xmalloc'ed string. Free with xfree().
  384. */
  385. static char *_uint16_array_to_str(int array_len, const uint16_t *array)
  386. {
  387. int i;
  388. int previous = 0;
  389. char *sep = ","; /* seperator */
  390. char *str = xstrdup("");
  391. if(array == NULL)
  392. return str;
  393. for (i = 0; i < array_len; i++) {
  394. if ((i+1 < array_len)
  395. && (array[i] == array[i+1])) {
  396. previous++;
  397. continue;
  398. }
  399. if (i == array_len-1) /* last time through loop */
  400. sep = "";
  401. if (previous > 0) {
  402. xstrfmtcat(str, "%u(x%u)%s",
  403. array[i], previous+1, sep);
  404. } else {
  405. xstrfmtcat(str, "%u%s", array[i], sep);
  406. }
  407. previous = 0;
  408. }
  409. return str;
  410. }
  411. srun_job_t * _read_job_srun_agent(void)
  412. {
  413. char *key_str = getenv("SLURM_FE_KEY");
  414. char *sock_str = getenv("SLURM_FE_SOCKET");
  415. char buf[32], *host, *sep;
  416. slurm_fd_t resp_socket;
  417. uint16_t resp_port;
  418. uint32_t resp_auth_key, buf_size;
  419. srun_job_t *srun_job = NULL;
  420. slurm_addr_t resp_addr;
  421. char *job_data;
  422. Buf buffer;
  423. int i, offset = 0;
  424. if (!key_str) {
  425. error("SLURM_FE_KEY environment variable not set");
  426. return NULL;
  427. }
  428. if (!sock_str) {
  429. error("SLURM_FE_SOCKET environment variable not set");
  430. return NULL;
  431. }
  432. host = xstrdup(sock_str);
  433. sep = strchr(host, ':');
  434. if (!sep) {
  435. error("_read_job_srun_agent(): SLURM_FE_SOCKET is invalid: %s",
  436. sock_str);
  437. xfree(host);
  438. return NULL;
  439. }
  440. sep[0] = '\0';
  441. resp_port = atoi(sep + 1);
  442. slurm_set_addr(&resp_addr, resp_port, host);
  443. xfree(host);
  444. resp_socket = slurm_open_stream(&resp_addr);
  445. if (resp_socket < 0) {
  446. error("slurm_open_msg_conn(%s): %m", sock_str);
  447. return NULL;
  448. }
  449. resp_auth_key = atoi(key_str);
  450. memcpy(buf + 0, &resp_auth_key, 4);
  451. i = slurm_write_stream_timeout(resp_socket, buf, 4, 8000);
  452. if (i < 4) {
  453. error("_read_job_srun_agent write: %m");
  454. return NULL;
  455. }
  456. i = slurm_read_stream_timeout(resp_socket, (char *) &buf_size, 4, 8000);
  457. if (i < 4) {
  458. error("_read_job_srun_agent read (i=%d): %m", i);
  459. return NULL;
  460. }
  461. job_data = xmalloc(buf_size);
  462. while (buf_size > offset) {
  463. i = slurm_read_stream_timeout(resp_socket, job_data + offset,
  464. buf_size - offset, 8000);
  465. if (i < 0) {
  466. if ((errno != EAGAIN) && (errno != EINTR)) {
  467. error("_read_job_srun_agent read (buf=%d): %m",
  468. i);
  469. break;
  470. }
  471. } else if (i > 0) {
  472. offset += i;
  473. } else {
  474. error("_read_job_srun_agent read: timeout");
  475. break;
  476. }
  477. }
  478. slurm_shutdown_msg_engine(resp_socket);
  479. buffer = create_buf(job_data, buf_size);
  480. srun_job = _unpack_srun_job_rec(buffer);
  481. free_buf(buffer); /* This does xfree(job_data) */
  482. return srun_job;
  483. }
  484. /* Given a program name, return its communication protocol */
  485. static char *_get_cmd_protocol(char *cmd)
  486. {
  487. int stdout_pipe[2] = {-1, -1}, stderr_pipe[2] = {-1, -1};
  488. int read_size, buf_rem = 16 * 1024, offset = 0, status;
  489. pid_t pid;
  490. char *buf, *protocol = "mpi";
  491. if ((pipe(stdout_pipe) == -1) || (pipe(stderr_pipe) == -1)) {
  492. error("pipe: %m");
  493. return "mpi";
  494. }
  495. pid = fork();
  496. if (pid < 0) {
  497. error("fork: %m");
  498. return "mpi";
  499. } else if (pid == 0) {
  500. if ((dup2(stdout_pipe[1], 1) == -1) ||
  501. (dup2(stderr_pipe[1], 2) == -1)) {
  502. error("dup2: %m");
  503. return NULL;
  504. }
  505. (void) close(0); /* stdin */
  506. (void) close(stdout_pipe[0]);
  507. (void) close(stdout_pipe[1]);
  508. (void) close(stderr_pipe[0]);
  509. (void) close(stderr_pipe[1]);
  510. execlp("/usr/bin/ldd", "ldd", cmd, NULL);
  511. error("execv(ldd) error: %m");
  512. return NULL;
  513. }
  514. (void) close(stdout_pipe[1]);
  515. (void) close(stderr_pipe[1]);
  516. buf = xmalloc(buf_rem);
  517. while ((read_size = read(stdout_pipe[0], &buf[offset], buf_rem))) {
  518. if (read_size > 0) {
  519. buf_rem -= read_size;
  520. offset += read_size;
  521. if (buf_rem == 0)
  522. break;
  523. } else if ((errno != EAGAIN) || (errno != EINTR)) {
  524. error("read(pipe): %m");
  525. break;
  526. }
  527. }
  528. if (strstr(buf, "libmpi"))
  529. protocol = "mpi";
  530. else if (strstr(buf, "libshmem.so"))
  531. protocol = "shmem";
  532. else if (strstr(buf, "libxlpgas.so"))
  533. protocol = "pgas";
  534. else if (strstr(buf, "libpami.so"))
  535. protocol = "pami";
  536. else if (strstr(buf, "liblapi.so"))
  537. protocol = "lapi";
  538. xfree(buf);
  539. while ((waitpid(pid, &status, 0) == -1) && (errno == EINTR))
  540. ;
  541. (void) close(stdout_pipe[0]);
  542. (void) close(stderr_pipe[0]);
  543. return protocol;
  544. }
/*
 * Parse a multi-prog input file line of the form
 *	<task id list> <command> [args...]
 * total_tasks - Number of tasks in the job,
 *		 also size of the cmd, args, and protocol arrays
 * line IN - line to parse (modified in place while scanning, but
 *	     restored before return)
 * cmd OUT - command to execute, caller must xfree this
 * args OUT - arguments to the command, caller must xfree this
 * protocol OUT - communication protocol of the command, do not xfree this
 */
static void _parse_prog_line(int total_tasks, char *in_line, char **cmd,
			     char **args, char **protocol)
{
	int i, task_id;
	int first_arg_inx = 0, last_arg_inx = 0;
	int first_cmd_inx, last_cmd_inx;
	int first_task_inx, last_task_inx;
	hostset_t hs = NULL;
	char *task_id_str, *tmp_str = NULL;
	char *line_args = NULL, *line_cmd = NULL, *line_protocol = NULL;

	/* Get the task ID string: skip leading whitespace */
	for (i = 0; in_line[i]; i++)
		if (!isspace(in_line[i]))
			break;

	if (!in_line[i]) /* empty line */
		return;
	else if (in_line[i] == '#')	/* comment line */
		return;
	else if (!isdigit(in_line[i]))	/* task list must start with a digit */
		goto bad_line;
	first_task_inx = i;
	for (i++; in_line[i]; i++) {
		if (isspace(in_line[i]))
			break;
	}
	/* If the scan hit end-of-string, no command follows */
	if (!isspace(in_line[i]))
		goto bad_line;
	last_task_inx = i;

	/* Get the command */
	for (i++; in_line[i]; i++) {
		if (!isspace(in_line[i]))
			break;
	}
	if (in_line[i] == '\0')
		goto bad_line;
	first_cmd_inx = i;
	for (i++; in_line[i]; i++) {
		if (isspace(in_line[i]))
			break;
	}
	if (!isspace(in_line[i]))
		goto bad_line;
	last_cmd_inx = i;

	/* Get the command's arguments (everything up to the newline) */
	for (i++; in_line[i]; i++) {
		if (!isspace(in_line[i]))
			break;
	}
	if (in_line[i])
		first_arg_inx = i;
	for ( ; in_line[i]; i++) {
		if (in_line[i] == '\n') {
			last_arg_inx = i;
			break;
		}
	}

	/* Now transfer data to the function arguments.  The scanned
	 * spans are temporarily NUL-terminated in place, copied out,
	 * then the original characters are restored. */
	in_line[last_task_inx] = '\0';
	/* Wrap the task list in brackets so hostset can expand ranges */
	xstrfmtcat(tmp_str, "[%s]", in_line + first_task_inx);
	hs = hostset_create(tmp_str);
	xfree(tmp_str);
	in_line[last_task_inx] = ' ';
	if (!hs)
		goto bad_line;

	in_line[last_cmd_inx] = '\0';
	line_cmd = xstrdup(in_line + first_cmd_inx);
	in_line[last_cmd_inx] = ' ';

	if (last_arg_inx)
		in_line[last_arg_inx] = '\0';
	if (first_arg_inx)
		line_args = xstrdup(in_line + first_arg_inx);
	if (last_arg_inx)
		in_line[last_arg_inx] = '\n';

	line_protocol = _get_cmd_protocol(line_cmd);
	/* Assign cmd/args/protocol to every task ID in the set.
	 * NOTE(review): task_id_str from hostset_pop() is presumably
	 * allocated and never freed here — possible small leak; confirm
	 * against the hostset API. */
	while ((task_id_str = hostset_pop(hs))) {
		task_id = strtol(task_id_str, &tmp_str, 10);
		if ((tmp_str[0] != '\0') || (task_id < 0))
			goto bad_line;
		if (task_id >= total_tasks)
			continue;	/* ID out of range: ignore */
		cmd[task_id] = xstrdup(line_cmd);
		args[task_id] = xstrdup(line_args);
		protocol[task_id] = line_protocol;
	}
	xfree(line_args);
	xfree(line_cmd);
	hostset_destroy(hs);
	return;

bad_line:
	error("invalid input line: %s", in_line);
	xfree(line_args);
	xfree(line_cmd);
	if (hs)
		hostset_destroy(hs);
	return;
}
/*
 * Read a line from SLURM MPMD command file or write the equivalent POE line.
 * line IN/OUT - line to read or write
 * length IN - size of line in bytes
 * step_id IN - -1 if input line, otherwise the step ID to output
 * task_id IN - count of tasks in job step (if step_id == -1)
 *		task_id to report (if step_id != -1)
 * RET true if more lines to get
 */
static bool _multi_prog_parse(char *line, int length, int step_id, int task_id)
{
	/* Per-task tables accumulated across input calls (step_id == -1)
	 * and drained/freed across output calls (step_id >= 0) */
	static int total_tasks = 0;
	static char **args = NULL, **cmd = NULL, **protocol = NULL;
	int i;

	if (step_id < 0) {
		/* Input mode: first call sizes the tables from task_id */
		if (!args) {
			args = xmalloc(sizeof(char *) * task_id);
			cmd = xmalloc(sizeof(char *) * task_id);
			protocol = xmalloc(sizeof(char *) * task_id);
			total_tasks = task_id;
		}
		_parse_prog_line(total_tasks, line, cmd, args, protocol);
		return true;
	} else if (task_id >= total_tasks) {
		/* Output mode, past the last task: release all state so a
		 * later invocation can start fresh (xfree NULLs pointers) */
		for (i = 0; i < total_tasks; i++) {
			xfree(args[i]);
			xfree(cmd[i]);
		}
		xfree(args);
		xfree(cmd);
		xfree(protocol);
		total_tasks = 0;
		return false;
	} else if (!cmd[task_id]) {
		error("Configuration file invalid, no record for task id %d",
		      task_id);
		return true;
	} else if (args[task_id]) {
		/* <cmd>@<step_id>%<total_tasks>%<protocol>:<num_tasks> <args...> */
		snprintf(line, length, "%s@%d%c%d%c%s:%d %s",
			 cmd[task_id], step_id, '%', total_tasks, '%',
			 protocol[task_id], 1, args[task_id]);
		return true;
	} else {
		/* <cmd>@<step_id>%<total_tasks>%<protocol>:<num_tasks> */
		snprintf(line, length, "%s@%d%c%d%c%s:%d",
			 cmd[task_id], step_id, '%', total_tasks, '%',
			 protocol[task_id], 1);
		return true;
	}
}
  701. /* Convert a SLURM format MPMD file into a POE MPMD command file */
  702. static void _re_write_cmdfile(char *slurm_cmd_fname, char *poe_cmd_fname,
  703. uint32_t step_id, int task_cnt)
  704. {
  705. char *buf, in_line[512];
  706. int fd, i, j, k;
  707. FILE *fp;
  708. if (!slurm_cmd_fname || !poe_cmd_fname)
  709. return;
  710. buf = xmalloc(1024);
  711. fp = fopen(slurm_cmd_fname, "r");
  712. if (!fp) {
  713. error("fopen(%s): %m", slurm_cmd_fname);
  714. return;
  715. }
  716. /* Read and parse SLURM MPMD format file here */
  717. while (fgets(in_line, sizeof(in_line), fp))
  718. _multi_prog_parse(in_line, 512, -1, task_cnt);
  719. fclose(fp);
  720. /* Write LoadLeveler MPMD format file here */
  721. for (i = 0; ; i++) {
  722. if (!_multi_prog_parse(in_line, 512, step_id, i))
  723. break;
  724. j = xstrfmtcat(buf, "%s\n", in_line);
  725. }
  726. i = 0;
  727. j = strlen(buf);
  728. fd = open(poe_cmd_fname, O_TRUNC | O_RDWR);
  729. if (fd < 0) {
  730. error("open(%s): %m", poe_cmd_fname);
  731. xfree(buf);
  732. return;
  733. }
  734. while ((k = write(fd, &buf[i], j))) {
  735. if (k > 0) {
  736. i += k;
  737. j -= k;
  738. } else if ((errno != EAGAIN) && (errno != EINTR)) {
  739. error("write(cmdfile): %m");
  740. break;
  741. }
  742. }
  743. close(fd);
  744. xfree(buf);
  745. }
  746. void _self_complete(srun_job_complete_msg_t *comp_msg)
  747. {
  748. kill(getpid(), SIGKILL);
  749. }
  750. void _self_signal(int signal)
  751. {
  752. kill(getpid(), signal);
  753. }
  754. void _self_timeout(srun_timeout_msg_t *timeout_msg)
  755. {
  756. time_t now = time(NULL);
  757. char time_str[24];
  758. if (now < timeout_msg->timeout) {
  759. slurm_make_time_str(&timeout_msg->timeout,
  760. time_str, sizeof(time_str));
  761. debug("step %u.%u will timeout at %s",
  762. timeout_msg->job_id, timeout_msg->step_id, time_str);
  763. return;
  764. }
  765. slurm_make_time_str(&now, time_str, sizeof(time_str));
  766. error("*** STEP %u.%u CANCELLED AT %s DUE TO TIME LIMIT ***",
  767. timeout_msg->job_id, timeout_msg->step_id, time_str);
  768. _self_signal(SIGKILL);
  769. }
/************************************/
/* The connection communicates information to and from the resource
 * manager, so that the resource manager can start the parallel task
 * manager, and is available for the caller to communicate directly
 * with the parallel task manager.
 * IN resource_mgr - The resource manager handle returned by pe_rm_init.
 * IN connect_param - Input parameter structure (rm_connect_param)
 *	that contains the following:
 *	machine_count: The count of hosts/machines.
 *	machine_name: The array of machine names on which to connect.
 *	executable: The name of the executable to be started.
 * IN rm_timeout - The integer value that defines a connection timeout
 *	value. This value is defined by the MP_RM_TIMEOUT
 *	environment variable. A value less than zero indicates there
 *	is no timeout. A value equal to zero means to immediately
 *	return with no wait or retry. A value greater than zero
 *	means to wait the specified amount of time (in seconds).
 * OUT rm_sockfds - An array of socket file descriptors, that are
 *	allocated by the caller, to be returned as output, of the connection.
 * OUT error_msg - An error message that explains the error.
 * RET 0 - SUCCESS, nonzero on failure.
 */
extern int pe_rm_connect(rmhandle_t resource_mgr,
			 rm_connect_param *connect_param,
			 int *rm_sockfds, int rm_timeout, char **error_msg)
{
	slurm_step_launch_callbacks_t step_callbacks;
//	srun_job_t *job = *(srun_job_t **)resource_mgr;
	int my_argc = 1;
	char *my_argv[2] = { connect_param->executable, NULL };
//	char *my_argv[2] = { "/bin/hostname", NULL };
	slurm_step_io_fds_t cio_fds = SLURM_STEP_IO_FDS_INITIALIZER;
	uint32_t global_rc = 0, orig_task_num;
	int i, ii = 0, rc, fd_cnt, node_cnt;
	int *ctx_sockfds = NULL;
	char *name = NULL, *total_node_list = NULL;
	/* task_num persists across calls so repeated connects (one per
	 * PMD fanout level) extend rather than restart the numbering */
	static uint32_t task_num = 0;
	hostlist_t hl = NULL;

	xassert(job);

	if (pm_type == PM_PMD) {
		debug("got pe_rm_connect called from PMD");
		/* Set up how many tasks the PMD is going to launch. */
		job->ntasks = 1 + task_num;
	} else if (pm_type == PM_POE) {
		debug("got pe_rm_connect called");
		launch_common_set_stdio_fds(job, &cio_fds);
	} else {
		*error_msg = xstrdup_printf("pe_rm_connect: unknown caller");
		error("%s", *error_msg);
		return -1;
	}

	/* translate the ip to a node list which SLURM uses to send
	   messages instead of IP addresses (at this point anyway)
	*/
	for (i=0; i<connect_param->machine_count; i++) {
		name = _name_from_addr(connect_param->machine_name[i]);
		if (!name) {
			if (hl)
				hostlist_destroy(hl);
			*error_msg = xstrdup_printf(
				"pe_rm_connect: unknown host for ip %s",
				connect_param->machine_name[i]);
			error("%s", *error_msg);
			return -1;
		}
		/* hl holds this call's hosts; total_hl (global)
		 * accumulates every host across calls */
		if (!hl)
			hl = hostlist_create(name);
		else
			hostlist_push_host(hl, name);
		if (!total_hl)
			total_hl = hostlist_create(name);
		else
			hostlist_push_host(total_hl, name);
	}

	if (!hl) {
		*error_msg = xstrdup_printf(
			"pe_rm_connect: machine_count 0? it came in as "
			"%d but we didn't get a hostlist",
			connect_param->machine_count);
		error("%s", *error_msg);
		return -1;
	}
	hostlist_sort(hl);
	xfree(job->nodelist);
	job->nodelist = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
	hostlist_sort(total_hl);
	total_node_list = hostlist_ranged_string_xmalloc(total_hl);
	node_cnt = hostlist_count(total_hl);

	/* Run the user's executable under user-managed I/O so POE/PMD
	 * own the task sockets directly */
	opt.argc = my_argc;
	opt.argv = my_argv;
	opt.user_managed_io = true;

	orig_task_num = task_num;
	if (slurm_step_ctx_daemon_per_node_hack(job->step_ctx,
						total_node_list,
						node_cnt, &task_num)
	    != SLURM_SUCCESS) {
		xfree(total_node_list);
		*error_msg = xstrdup_printf(
			"pe_rm_connect: problem with hack: %s",
			slurm_strerror(errno));
		error("%s", *error_msg);
		return -1;
	}
	xfree(total_node_list);
	job->fir_nodeid = orig_task_num;

	memset(&step_callbacks, 0, sizeof(step_callbacks));
	step_callbacks.step_complete = _self_complete;
	step_callbacks.step_signal = _self_signal;
	step_callbacks.step_timeout = _self_timeout;

	if (launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks)) {
		*error_msg = xstrdup_printf(
			"pe_rm_connect: problem with launch: %s",
			slurm_strerror(errno));
		error("%s", *error_msg);
		return -1;
	}

	rc = slurm_step_ctx_get(job->step_ctx,
				SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
				&fd_cnt, &ctx_sockfds);
	if (ctx_sockfds == NULL) {
		*error_msg = xstrdup_printf(
			"pe_rm_connect: Unable to get pmd IO socket array %d",
			rc);
		error("%s", *error_msg);
		return -1;
	}
	/* NOTE(review): the comparison is against task_num but the
	 * message reports machine_count — one of the two looks wrong */
	if (fd_cnt != task_num) {
		*error_msg = xstrdup_printf(
			"pe_rm_connect: looking for %d sockets but got back %d",
			connect_param->machine_count, fd_cnt);
		error("%s", *error_msg);
		return -1;
	}
	/* Hand back only the sockets created by THIS call */
	ii = 0;
	for (i=orig_task_num; i<fd_cnt; i++)
		rm_sockfds[ii++] = ctx_sockfds[i];
	return 0;
}
  909. /* Releases the resource manager handle, closes the socket that is
  910. * created by the pe_rm_init function, and releases memory
  911. * allocated. When called, pe_rm_free implies the job has completed
  912. * and resources are freed and available for subsequent jobs.
  913. * IN/OUT resource_mgr
  914. *
 * As of PE 1207 pe_rm_free does not always complete. The parent
 * process seems to finish before we do, so you might see erroneous errors.
  917. */
  918. extern void pe_rm_free(rmhandle_t *resource_mgr)
  919. {
  920. uint32_t rc = 0;
  921. if (job && job->step_ctx) {
  922. debug("got pe_rm_free called %p %p", job, job->step_ctx);
  923. /* Since we can't relaunch the step here don't worry about the
  924. return code.
  925. */
  926. launch_g_step_wait(job, got_alloc);
  927. /* We are at the end so don't worry about freeing the
  928. srun_job_t pointer */
  929. fini_srun(job, got_alloc, &rc, slurm_started);
  930. }
  931. if (total_hl) {
  932. hostlist_destroy(total_hl);
  933. total_hl = NULL;
  934. }
  935. *resource_mgr = NULL;
  936. dlclose(my_handle);
  937. if (poe_cmd_fname)
  938. (void) unlink(poe_cmd_fname);
  939. /* remove the hostfile if needed */
  940. if ((poe_cmd_fname = getenv("SRUN_DESTROY_HOSTFILE")))
  941. (void) unlink(poe_cmd_fname);
  942. }
  943. /* The memory that is allocated to events generated by the resource
  944. * manager is released. pe_rm_free_event must be called for every
  945. * event that is received from the resource manager by calling the
  946. * pe_rm_get_event function.
  947. * IN resource_mgr
  948. * IN job_event - The pointer to a job event. The event must have been
  949. * built by calling the pe_rm_get_event function.
  950. * RET 0 - SUCCESS, nonzero on failure.
  951. */
  952. extern int pe_rm_free_event(rmhandle_t resource_mgr, job_event_t ** job_event)
  953. {
  954. if (pm_type == PM_PMD) {
  955. debug("pe_rm_free_event called");
  956. return 0;
  957. } else if (pm_type != PM_POE) {
  958. error("pe_rm_free_event: unknown caller");
  959. return -1;
  960. }
  961. debug("got pe_rm_free_event called");
  962. if (job_event) {
  963. xfree(*job_event);
  964. }
  965. return 0;
  966. }
  967. /* This resource management interface is called to return job event
  968. * information. The pe_rm_get_event function is only called in
  969. * interactive mode.
  970. *
  971. * With interactive jobs, this function reads or selects on the listen
  972. * socket created by the pe_rm_init call. If the listen socket is not
  973. * ready to read, this function selects and waits. POE processes
  974. * should monitor this socket at all times for event notification from
  975. * the resource manager after the job has started running.
  976. *
  977. * This function returns a pointer to the event that was updated by
  978. * the transaction.
  979. * The valid events are:
  980. * JOB_ERROR_EVENT
  981. * Job error messages occurred. In this case, POE displays the
  982. * error and terminates.
  983. * JOB_STATE_EVENT
  984. * A job status change occurred, which results in one of the
  985. * following job states. In this case, the caller may need to take
  986. * appropriate action.
  987. * JOB_STATE_RUNNING
  988. * Indicates that the job has started. POE uses the
  989. * pe_rm_get_job_info function to return the job
  990. * information. When a job state of JOB_STATE_RUNNING has been
  991. * returned, the job has started running and POE can obtain the
  992. * job information by way of the pe_rm_get_job_info function call.
  993. * JOB_STATE_NOTRUN
  994. * Indicates that the job was not run, and POE will terminate.
  995. * JOB_STATE_PREEMPTED
  996. * Indicates that the job was preempted.
  997. * JOB_STATE_RESUMED
  998. * Indicates that the job has resumed.
  999. * JOB_TIMER_EVENT
  1000. * Indicates that no events occurred during the period
  1001. * specified by pe_rm_timeout.
  1002. *
  1003. * IN resource_mgr
  1004. * OUT job_event - The address of the pointer to the job_event_t
  1005. * type. If an event is generated successfully by the resource
  1006. * manager, that event is saved at the location specified, and
  1007. * pe_rm_get_event returns 0 (or a nonzero value, if the event
  1008. * is not generated successfully). Based on the event type that is
  1009. * returned, the appropriate event of the type job_event_t can
  1010. * be accessed. After the event is processed, it should be
  1011. * freed by calling pe_rm_free_event.
  1012. * OUT error_msg - The address of a character string at which the
  1013. * error message that is generated by pe_rm_get_event is
  1014. * stored. The memory for this error message is allocated by
  1015. * the malloc API call. After the error message is processed,
  1016. * the memory allocated should be freed by a calling free function.
  1017. * IN rm_timeout - The integer value that defines a connection timeout
  1018. * value. This value is defined by the MP_RETRY environment
  1019. * variable. A value less than zero indicates there is no
  1020. * timeout. A value equal to zero means to immediately return
  1021. * with no wait or retry. A value greater than zero means to
  1022. * wait the specified amount of time (in seconds).
  1023. * RET 0 - SUCCESS, nonzero on failure.
  1024. */
  1025. extern int pe_rm_get_event(rmhandle_t resource_mgr, job_event_t **job_event,
  1026. int rm_timeout, char ** error_msg)
  1027. {
  1028. job_event_t *ret_event = NULL;
  1029. int *state;
  1030. if (pm_type == PM_PMD) {
  1031. debug("pe_rm_get_event called");
  1032. return 0;
  1033. } else if (pm_type != PM_POE) {
  1034. *error_msg = xstrdup_printf("pe_rm_get_event: unknown caller");
  1035. error("%s", *error_msg);
  1036. return -1;
  1037. }
  1038. debug("got pe_rm_get_event called %d %p %p",
  1039. rm_timeout, job_event, *job_event);
  1040. ret_event = xmalloc(sizeof(job_event_t));
  1041. *job_event = ret_event;
  1042. ret_event->event = JOB_STATE_EVENT;
  1043. state = xmalloc(sizeof(int));
  1044. *state = JOB_STATE_RUNNING;
  1045. ret_event->event_data = (void *)state;
  1046. return 0;
  1047. }
  1048. /* The pe_rm_get_job_info function is called to return job
  1049. * information, after a job has been started. It can be called in
  1050. * either batch or interactive mode. For interactive jobs, it should
  1051. * be called when pe_rm_get_event returns with the JOB_STATE_EVENT
  1052. * event type, indicating the JOB_STATE_RUNNING
  1053. * state. pe_rm_get_job_info provides the job information data values,
  1054. * as defined by the job_info_t structure. It returns with an error if
  1055. * the job is not in a running state. For batch jobs, POE calls
  1056. * pe_rm_get_job_info immediately because, in batch mode, POE is
  1057. * started only after the job has been started. The pe_rm_get_job_info
  1058. * function must be capable of being called multiple times from the
  1059. * same process or a different process, and the same job data must be
  1060. * returned each time. When called from a different process, the
  1061. * environment of that process is guaranteed to be the same as the
  1062. * environment of the process that originally called the function.
  1063. *
  1064. * IN resource_mgr
  1065. * OUT job_info - The address of the pointer to the job_info_t
  1066. * type. The job_info_t type contains the job information
  1067. * returned by the resource manager for the handle that is
  1068. * specified. The caller itself must free the data areas that
  1069. * are returned.
  1070. * OUT error_msg - The address of a character string at which the
  1071. * error message that is generated by pe_rm_get_job_info is
  1072. * stored. The memory for this error message is allocated by the
  1073. * malloc API call. After the error message is processed, the memory
  1074. * allocated should be freed by a calling free function.
  1075. * RET 0 - SUCCESS, nonzero on failure.
  1076. */
extern int pe_rm_get_job_info(rmhandle_t resource_mgr, job_info_t **job_info,
			      char ** error_msg)
{
	/* Build the job_info_t returned to POE.  Caller owns and must
	 * free the returned structure and its members. */
	job_info_t *ret_info = xmalloc(sizeof(job_info_t));
	int i, j;
	slurm_step_layout_t *step_layout;
	hostlist_t hl;
	char *host;
	char *mode = "IP";	/* network mode; may be switched to "US" below */
	host_usage_t *host_ptr;
	int table_cnt;		/* number of NRT tables per task */
	nrt_tableinfo_t *tables, *table_ptr;
	nrt_job_key_t job_key;
	job_step_create_response_msg_t *resp;
	int network_id_cnt = 0;	/* count of distinct network IDs seen */
	nrt_network_id_t *network_id_list;
	char value[32];		/* scratch buffer for numeric env values */

	/* NOTE(review): the two early returns below leak ret_info,
	 * which was already allocated above — consider allocating
	 * after the pm_type checks. */
	if (pm_type == PM_PMD) {
		debug("pe_rm_get_job_info called");
		return 0;
	} else if (pm_type != PM_POE) {
		*error_msg = xstrdup_printf(
			"pe_rm_get_job_info: unknown caller");
		error("%s", *error_msg);
		return -1;
	}

	debug("got pe_rm_get_job_info called");
	if (!job || !job->step_ctx) {
		/* No step context: presumably pe_rm_submit_job was
		 * never called (batch mode?), so create the srun job
		 * here as a fallback. */
		error("pe_rm_get_job_info: It doesn't appear "
		      "pe_rm_submit_job was called. I am guessing "
		      "PE_RM_BATCH is set somehow. It things don't work well "
		      "using this mode unset the env var and retry.");
		create_srun_job(&job, &got_alloc, slurm_started, 0);
		/* make sure we set up a signal handler */
		pre_launch_srun_job(job, slurm_started, 0);
	}

	*job_info = ret_info;
	ret_info->job_name = xstrdup(opt.job_name);
	ret_info->rm_id = NULL;
	ret_info->procs = job->ntasks;
	ret_info->max_instances = 0;
	ret_info->check_pointable = 0;
	ret_info->rset_name = "RSET_NONE";
	ret_info->endpoints = 1;

	/* Pull the step-create response out of the step context; it
	 * carries the switch/NRT job description we report to POE. */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_RESP, &resp);
	if (!resp) {
		*error_msg = xstrdup_printf(
			"pe_rm_get_job_info: no step response in step ctx");
		error("%s", *error_msg);
		return -1;
	}
	slurm_jobinfo_ctx_get(resp->switch_job, NRT_JOBINFO_KEY, &job_key);
	ret_info->job_key = job_key;

	/* Scan the --network option list for a mode keyword
	 * (ip/ipv4/ipv6 -> "IP", us -> "US"); everything else in the
	 * list is ignored here. */
	if (opt.network) {
		char *network_str = xstrdup(opt.network);
		char *save_ptr = NULL,
			*token = strtok_r(network_str, ",", &save_ptr);
		while (token) {
			/* network options */
			if (!strcasecmp(token, "ip") ||
			    !strcasecmp(token, "ipv4") ||
			    !strcasecmp(token, "ipv6")) {
				mode = "IP";
			} else if (!strcasecmp(token, "us")) {
				mode = "US";
			}
			/* Currently ignoring all other options */
			token = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(network_str);
	}

	/* Copy one entry per NRT table into the parallel protocol /
	 * mode / devicename / instance arrays (NULL/-1 terminated). */
	slurm_jobinfo_ctx_get(
		resp->switch_job, NRT_JOBINFO_TABLESPERTASK, &table_cnt);
	ret_info->protocol = xmalloc(sizeof(char *)*(table_cnt+1));
	ret_info->mode = xmalloc(sizeof(char *)*(table_cnt+1));
	ret_info->devicename = xmalloc(sizeof(char *)*(table_cnt+1));
	ret_info->instance = xmalloc(sizeof(int)*(table_cnt+2));

	slurm_jobinfo_ctx_get(resp->switch_job, NRT_JOBINFO_TABLEINFO, &tables);
	debug2("got count of %d", table_cnt);
	network_id_list = xmalloc(sizeof(nrt_network_id_t) * table_cnt);
	for (i=0, table_ptr=tables; i<table_cnt; i++, table_ptr++) {
		/* Track distinct network IDs to compute num_network. */
		for (j = 0; j < network_id_cnt; j++) {
			if (table_ptr->network_id == network_id_list[j])
				break;
		}
		if (j >= network_id_cnt) {
			/* add this new network ID to our table */
			network_id_list[network_id_cnt++] =
				table_ptr->network_id;
		}
/* FIXME: Format of these data structure contents not well defined */
		ret_info->protocol[i] = xstrdup(table_ptr->protocol_name);
		ret_info->mode[i] = xstrdup(mode);
		ret_info->devicename[i] = xstrdup(table_ptr->adapter_name);
		ret_info->instance[i] = table_ptr->instance;
		ret_info->max_instances = MAX(ret_info->max_instances,
					      ret_info->instance[i]);
		debug("%d: %s %s %s %d", i, ret_info->protocol[i],
		      ret_info->mode[i], ret_info->devicename[i],
		      ret_info->instance[i]);
	}
	xfree(network_id_list);
	ret_info->instance[i] = -1;	/* -1 terminator for the instance array */
	ret_info->num_network = network_id_cnt;
	ret_info->host_count = job->nhosts;

	step_layout = launch_common_get_slurm_step_layout(job);

	/* Fill the per-host usage table from the step layout: host
	 * name, resolved address, task count and task IDs. */
	ret_info->hosts = xmalloc(sizeof(host_usage_t)
				  * (ret_info->host_count+1));
	host_ptr = ret_info->hosts;
	i=0;
	hl = hostlist_create(step_layout->node_list);
	while ((host = hostlist_shift(hl))) {
		slurm_addr_t addr;
		/* host_ptr takes ownership of the string returned by
		 * hostlist_shift. */
		host_ptr->host_name = host;
		slurm_conf_get_addr(host, &addr);
		host_ptr->host_address = xstrdup(inet_ntoa(addr.sin_addr));
		host_ptr->task_count = step_layout->tasks[i];
		host_ptr->task_ids =
			xmalloc(sizeof(int) * host_ptr->task_count);
		/* Task ids are already set up in the layout, so just
		   use them.
		*/
		debug2("%s = %s %d tasks",
		       host_ptr->host_name, host_ptr->host_address,
		       host_ptr->task_count);
		for (j=0; j<host_ptr->task_count; j++) {
			host_ptr->task_ids[j] = step_layout->tids[i][j];
			debug2("taskid %d", host_ptr->task_ids[j]);
		}
		i++;
		/* Guard against the hostlist holding more nodes than
		 * job->nhosts promised; the extra slot in the hosts
		 * allocation above keeps this write in bounds. */
		if (i > ret_info->host_count) {
			error("we have more nodes that we bargined for.");
			break;
		}
		host_ptr++;
	}
	hostlist_destroy(hl);
	host_usage = ret_info->hosts;	/* stash for later module use */

	/* Export the SLURM job/step environment expected by
	 * downstream tools (values overwrite any existing ones). */
	if (!got_alloc || !slurm_started) {
		snprintf(value, sizeof(value), "%u", job->jobid);
		setenv("SLURM_JOB_ID", value, 1);
		setenv("SLURM_JOBID", value, 1);
		setenv("SLURM_JOB_NODELIST", job->nodelist, 1);
	}
	if (!opt.preserve_env) {
		snprintf(value, sizeof(value), "%u", job->ntasks);
		setenv("SLURM_NTASKS", value, 1);
		snprintf(value, sizeof(value), "%u", job->nhosts);
		setenv("SLURM_NNODES", value, 1);
		setenv("SLURM_NODELIST", job->nodelist, 1);
	}
	snprintf(value, sizeof(value), "%u", job->stepid);
	setenv("SLURM_STEP_ID", value, 1);
	setenv("SLURM_STEPID", value, 1);
	setenv("SLURM_STEP_NODELIST", step_layout->node_list, 1);
	snprintf(value, sizeof(value), "%u", job->nhosts);
	setenv("SLURM_STEP_NUM_NODES", value, 1);
	snprintf(value, sizeof(value), "%u", job->ntasks);
	setenv("SLURM_STEP_NUM_TASKS", value, 1);
	host = _uint16_array_to_str(step_layout->node_cnt,
				    step_layout->tasks);
	setenv("SLURM_STEP_TASKS_PER_NODE", host, 1);
	xfree(host);

	return 0;
}
  1242. /* The handle to the resource manager is returned to the calling
  1243. * function. The calling process needs to use the resource manager
  1244. * handle in subsequent resource manager API calls.
  1245. *
  1246. * A version will be returned as output in the rmapi_version
  1247. * parameter, after POE supplies it as input. The resource manager
  1248. * returns the version value that is installed and running as output.
  1249. *
  1250. * A resource manager ID can be specified that defines a job that is
  1251. * currently running, and for which POE is initializing the resource
  1252. * manager. When the resource manager ID is null, a value for the
  1253. * resource manager ID is included with the job information that is
  1254. * returned by the pe_rm_get_job_info function. When pe_rm_init is
  1255. * called more than once with a null resource manager ID value, it
  1256. * returns the same ID value on the subsequent pe_rm_get_job_info
  1257. * function call.
  1258. *
  1259. * The resource manager can be initialized in either
  1260. * batch or interactive mode. The resource manager must export the
  1261. * environment variable PE_RM_BATCH=yes when in batch mode.
  1262. *
  1263. * By default, the resource manager error messages and any debugging
  1264. * messages that are generated by this function, or any subsequent
  1265. * resource manager API calls, should be written to STDERR. Errors are
  1266. * returned by way of the error message string parameter.
  1267. *
  1268. * When the resource manager is successfully instantiated and
  1269. * initialized, it returns with a file descriptor for a listen socket,
  1270. * which is used by the resource manager daemon to communicate with
  1271. * the calling process. If a resource manager wants to send
  1272. * information to the calling process, it builds an appropriate event
  1273. * that corresponds to the information and sends that event over the
  1274. * socket to the calling process. The calling process could monitor
  1275. * the socket using the select API and read the event when it is ready.
  1276. *
  1277. * IN/OUT rmapi_version - The resource manager API version level. The
  1278. * value of RM_API_VERSION is defined in permapi.h. Initially,
  1279. * POE provides this as input, and the resource manager will
  1280. * return its version level as output.
  1281. * OUT resource_mgr - Pointer to the rmhandle_t handle returned by the
  1282. * pe_rm_init function. This handle should be used by all other
  1283. * resource manager API calls.
  1284. * IN rm_id - Pointer to a character string that defines a
  1285. * resource manager ID, for checkpoint and restart cases. This
  1286. * pointer can be set to NULL, which means there is no previous
  1287. * resource manager session or job running. When it is set to a
  1288. * value, the resource manager uses the specified ID for
  1289. * returning the proper job information to a subsequent
  1290. * pe_rm_get_job_info function call.
  1291. * OUT error_msg - The address of a character string at which the
  1292. * error messages generated by this function are stored. The
  1293. * memory for this error message is allocated by the malloc API
  1294. * call. After the error message is processed, the memory
  1295. * allocated should be freed by a calling free function.
  1296. * RET - Non-negative integer representing a valid file descriptor
  1297. * number for the socket that will be used by the resource
  1298. * manager to communicate with the calling process. - SUCCESS
  1299. * integer less than 0 - FAILURE
  1300. */
  1301. extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
  1302. char** error_msg)
  1303. {
  1304. char *srun_debug = NULL, *tmp_char = NULL;
  1305. char *myargv[3] = { "poe", NULL, NULL };
  1306. int debug_level = log_opts.logfile_level;
  1307. /* SLURM was originally written against 1300, so we will
  1308. * return that, no matter what comes in so we always work.
  1309. */
  1310. *rmapi_version = 1300;
  1311. *resource_mgr = (void *)&job;
  1312. #ifdef MYSELF_SO
  1313. /* Since POE opens this lib without
  1314. RTLD_LAZY | RTLD_GLOBAL | RTLD_DEEPBIND
  1315. we just open ourself again with those options and bada bing
  1316. bada boom we are good to go with the symbols we need.
  1317. */
  1318. my_handle = dlopen(MYSELF_SO, RTLD_LAZY | RTLD_GLOBAL | RTLD_DEEPBIND);
  1319. if (!my_handle) {
  1320. debug("%s", dlerror());
  1321. return 1;
  1322. }
  1323. #else
  1324. fatal("I haven't been told where I am. This should never happen.");
  1325. #endif
  1326. if (slurm_select_init(1) != SLURM_SUCCESS )
  1327. fatal( "failed to initialize node selection plugin" );
  1328. slurm_set_launch_type("launch/slurm");
  1329. if (getenv("SLURM_STARTED_STEP"))
  1330. slurm_started = true;
  1331. if ((srun_debug = getenv("SRUN_DEBUG")))
  1332. debug_level = atoi(srun_debug);
  1333. if (debug_level) {
  1334. log_opts.stderr_level = log_opts.logfile_level =
  1335. log_opts.syslog_level = debug_level;
  1336. }
  1337. /* This will be used later in the code to set the
  1338. * _verbose level. */
  1339. if (debug_level >= LOG_LEVEL_INFO)
  1340. debug_level -= LOG_LEVEL_INFO;
  1341. if (pm_type == PM_PMD) {
  1342. log_alter_with_fp(log_opts, LOG_DAEMON, pmd_lfp);
  1343. myargv[0] = myargv[1] = "pmd";
  1344. } else {
  1345. char *poe_argv = getenv("MP_I_SAVED_ARGV");
  1346. log_alter(log_opts, LOG_DAEMON, "/dev/null");
  1347. myargv[1] = getenv("SLURM_JOB_NAME");
  1348. if (poe_argv) {
  1349. char *adapter_use = NULL;
  1350. char *bulk_xfer = NULL;
  1351. char *collectives = NULL;
  1352. char *euidevice = NULL;
  1353. char *euilib = NULL;
  1354. char *immediate = NULL;
  1355. char *instances = NULL;
  1356. char *tmp_argv = xstrdup(poe_argv);
  1357. char *tok, *save_ptr = NULL;
  1358. int tok_inx = 0;
  1359. /* Parse the command line
  1360. * Map the following options to their srun equivalent
  1361. * -adapter_use shared | dedicated
  1362. * -collective_groups #
  1363. * -euidevice sn_all | sn_single
  1364. * -euilib ip | us
  1365. * -imm_send_buffers #
  1366. * -instances #
  1367. * -use_bulk_xfer yes | no
  1368. */
  1369. tok = strtok_r(tmp_argv, " ", &save_ptr);
  1370. while (tok) {
  1371. if ((tok_inx == 1) && !myargv[1]) {
  1372. myargv[1] = xstrdup(tok);
  1373. } else if (!strcmp(tok, "-adapter_use")) {
  1374. tok = strtok_r(NULL, " ", &save_ptr);
  1375. if (!tok)
  1376. break;
  1377. adapter_use = xstrdup(tok);
  1378. } else if (!strcmp(tok, "-collective_groups")){
  1379. tok = strtok_r(NULL, " ", &save_ptr);
  1380. if (!tok)
  1381. break;
  1382. collectives = xstrdup(tok);
  1383. } else if (!strcmp(tok, "-euidevice")) {
  1384. tok = strtok_r(NULL, " ", &save_ptr);
  1385. if (!tok)
  1386. break;
  1387. euidevice = xstrdup(tok);
  1388. } else if (!strcmp(tok, "-euilib")) {
  1389. tok = strtok_r(NULL, " ", &save_ptr);
  1390. if (!tok)
  1391. break;
  1392. euilib = xstrdup(tok);
  1393. } else if (!strcmp(tok, "-imm_send_buffers")) {
  1394. tok = strtok_r(NULL, " ", &save_ptr);
  1395. if (!tok)
  1396. break;
  1397. immediate = xstrdup(tok);
  1398. } else if (!strcmp(tok, "-instances")) {
  1399. tok = strtok_r(NULL, " ", &save_ptr);
  1400. if (!tok)
  1401. break;
  1402. instances = xstrdup(tok);
  1403. } else if (!strcmp(tok, "-use_bulk_xfer")) {
  1404. tok = strtok_r(NULL, " ", &save_ptr);
  1405. if (!tok)
  1406. break;
  1407. bulk_xfer = xstrdup(tok);
  1408. }
  1409. tok = strtok_r(NULL, " ", &save_ptr);
  1410. tok_inx++;
  1411. }
  1412. xfree(tmp_argv);
  1413. /* Parse the environment variables */
  1414. if (!adapter_use) {
  1415. char *tmp = getenv("MP_ADAPTER_USE");
  1416. if (tmp)
  1417. adapter_use = xstrdup(tmp);
  1418. }
  1419. if (!collectives) {
  1420. char *tmp = getenv("MP_COLLECTIVE_GROUPS");
  1421. if (tmp)
  1422. collectives = xstrdup(tmp);
  1423. }
  1424. if (!euidevice) {
  1425. char *tmp = getenv("MP_EUIDEVICE");
  1426. if (tmp)
  1427. euidevice = xstrdup(tmp);
  1428. }
  1429. if (!euilib) {
  1430. char *tmp = getenv("MP_EUILIB");
  1431. if (tmp)
  1432. euilib = xstrdup(tmp);
  1433. }
  1434. if (!immediate) {
  1435. char *tmp = getenv("MP_IMM_SEND_BUFFERS");
  1436. if (tmp)
  1437. immediate = xstrdup(tmp);
  1438. }
  1439. if (!instances) {
  1440. char *tmp = getenv("MP_INSTANCES");
  1441. if (tmp)
  1442. instances = xstrdup(tmp);
  1443. }
  1444. if (!bulk_xfer) {
  1445. char *tmp = getenv("MP_USE_BULK_XFER");
  1446. if (tmp)
  1447. bulk_xfer = xstrdup(tmp);
  1448. }
  1449. xfree(opt.network);
  1450. if (adapter_use) {
  1451. if (!strcmp(adapter_use, "dedicated"))
  1452. opt.exclusive = true;
  1453. xfree(adapter_use);
  1454. }
  1455. if (collectives) {
  1456. if (opt.network)
  1457. xstrcat(opt.network, ",");
  1458. xstrcat(opt.network, "cau=");
  1459. xstrcat(opt.network, collectives);
  1460. }
  1461. if (euidevice) {
  1462. if (opt.network)
  1463. xstrcat(opt.network, ",");
  1464. xstrcat(opt.network, "devname=");
  1465. xstrcat(opt.network, euidevice);
  1466. }
  1467. if (euilib) {
  1468. if (opt.network)
  1469. xstrcat(opt.network, ",");
  1470. xstrcat(opt.network, euilib);
  1471. }
  1472. if (immediate) {
  1473. if (opt.network)
  1474. xstrcat(opt.network, ",");
  1475. xstrcat(opt.network, "immed=");
  1476. xstrcat(opt.network, immediate);
  1477. }
  1478. if (instances) {
  1479. if (opt.network)
  1480. xstrcat(opt.network, ",");
  1481. xstrcat(opt.network, "instances=");
  1482. xstrcat(opt.network, instances);
  1483. }
  1484. if (bulk_xfer && !strcmp(bulk_xfer, "yes")) {
  1485. if (opt.network)
  1486. xstrcat(opt.network, ",");
  1487. xstrcat(opt.network, "bulk_xfer");
  1488. }
  1489. xfree(bulk_xfer);
  1490. xfree(collectives);
  1491. xfree(euidevice);
  1492. xfree(euilib);
  1493. xfree(immediate);
  1494. xfree(instances);
  1495. }
  1496. if (!myargv[1])
  1497. myargv[1] = "poe";
  1498. }
  1499. debug("got pe_rm_init called");
  1500. /* This needs to happen before any other threads so we can
  1501. catch the signals correctly. Send in NULL for logopts
  1502. because we just set it up.
  1503. */
  1504. init_srun(2, myargv, NULL, debug_level, 0);
  1505. /* This has to be done after init_srun so as to not get over
  1506. written. */
  1507. if (getenv("SLURM_PRESERVE_ENV"))
  1508. opt.preserve_env = true;
  1509. if ((tmp_char = getenv("SRUN_EXC_NODES")))
  1510. opt.exc_nodes = xstrdup(tmp_char);
  1511. if ((tmp_char = getenv("SRUN_WITH_NODES")))
  1512. opt.nodelist = xstrdup(tmp_char);
  1513. if ((tmp_char = getenv("SRUN_RELATIVE"))) {
  1514. opt.relative = atoi(tmp_char);
  1515. opt.relative_set = 1;
  1516. }
  1517. if (pm_type == PM_PMD) {
  1518. uint32_t job_id = -1, step_id = -1;
  1519. if ((srun_debug = getenv("SLURM_JOB_ID")))
  1520. job_id = atoi(srun_debug);
  1521. if (

Large files files are truncated, but you can click here to view the full file