PageRenderTime 69ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/src/scontrol/info_job.c

https://github.com/cfenoy/slurm
C | 898 lines | 695 code | 99 blank | 104 comment | 232 complexity | 1126f74c552a686c51325774f87df42f MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
  1. /*****************************************************************************\
  2. * info_job.c - job information functions for scontrol.
  3. *****************************************************************************
  4. * Copyright (C) 2002-2007 The Regents of the University of California.
  5. * Copyright (C) 2008-2010 Lawrence Livermore National Security.
  6. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  7. * Written by Morris Jette <jette1@llnl.gov>
  8. * CODE-OCEC-09-009. All rights reserved.
  9. *
  10. * This file is part of SLURM, a resource management program.
  11. * For details, see <http://www.schedmd.com/slurmdocs/>.
  12. * Please also read the included file: DISCLAIMER.
  13. *
  14. * SLURM is free software; you can redistribute it and/or modify it under
  15. * the terms of the GNU General Public License as published by the Free
  16. * Software Foundation; either version 2 of the License, or (at your option)
  17. * any later version.
  18. *
  19. * In addition, as a special exception, the copyright holders give permission
  20. * to link the code of portions of this program with the OpenSSL library under
  21. * certain conditions as described in each individual source file, and
  22. * distribute linked combinations including the two. You must obey the GNU
  23. * General Public License in all respects for all of the code used other than
  24. * OpenSSL. If you modify file(s) with this exception, you may extend this
  25. * exception to your version of the file(s), but you are not obligated to do
  26. * so. If you do not wish to do so, delete this exception statement from your
  27. * version. If you delete this exception statement from all source files in
  28. * the program, then also delete it here.
  29. *
  30. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  31. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  32. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  33. * details.
  34. *
  35. * You should have received a copy of the GNU General Public License along
  36. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  37. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  38. \*****************************************************************************/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>

#include "scontrol.h"
#include "src/common/stepd_api.h"
#include "src/plugins/select/bluegene/bg_enums.h"
  45. #define POLL_SLEEP 3 /* retry interval in seconds */
  46. static bool _in_node_bit_list(int inx, int *node_list_array);
  47. static int _scontrol_load_jobs(job_info_msg_t ** job_buffer_pptr,
  48. uint32_t job_id);
  49. /*
  50. * Determine if a node index is in a node list pair array.
  51. * RET - true if specified index is in the array
  52. */
  53. static bool
  54. _in_node_bit_list(int inx, int *node_list_array)
  55. {
  56. int i;
  57. bool rc = false;
  58. for (i=0; ; i+=2) {
  59. if (node_list_array[i] == -1)
  60. break;
  61. if ((inx >= node_list_array[i]) &&
  62. (inx <= node_list_array[i+1])) {
  63. rc = true;
  64. break;
  65. }
  66. }
  67. return rc;
  68. }
/* Load current job table information into *job_buffer_pptr.
 *
 * Results are cached in the global old_job_info_ptr so that repeated
 * calls only transfer changed data from slurmctld.  If job_id is
 * non-zero, only that job is loaded.
 * RET SLURM_SUCCESS (with *job_buffer_pptr set) or a slurm error code */
static int
_scontrol_load_jobs(job_info_msg_t ** job_buffer_pptr, uint32_t job_id)
{
	int error_code;
	/* Flags used by the previous call; 0xffff cannot match any real
	 * flag set, so the first call always does a full load */
	static uint16_t last_show_flags = 0xffff;
	uint16_t show_flags = 0;
	job_info_msg_t * job_info_ptr = NULL;

	if (all_flag)
		show_flags |= SHOW_ALL;
	if (detail_flag) {
		show_flags |= SHOW_DETAIL;
		if (detail_flag > 1)
			show_flags |= SHOW_DETAIL2;
	}

	if (old_job_info_ptr) {
		/* Changed flags invalidate the cache: zero the timestamp
		 * so slurmctld sends everything again */
		if (last_show_flags != show_flags)
			old_job_info_ptr->last_update = (time_t) 0;
		if (job_id) {
			error_code = slurm_load_job(&job_info_ptr, job_id,
						    show_flags);
		} else {
			error_code = slurm_load_jobs(
				old_job_info_ptr->last_update,
				&job_info_ptr, show_flags);
		}
		if (error_code == SLURM_SUCCESS)
			slurm_free_job_info_msg (old_job_info_ptr);
		else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) {
			/* Nothing changed since last_update: reuse the
			 * cached message and report success */
			job_info_ptr = old_job_info_ptr;
			error_code = SLURM_SUCCESS;
			if (quiet_flag == -1)
				printf ("slurm_load_jobs no change in data\n");
		}
	} else if (job_id) {
		error_code = slurm_load_job(&job_info_ptr, job_id, show_flags);
	} else {
		error_code = slurm_load_jobs((time_t) NULL, &job_info_ptr,
					     show_flags);
	}

	if (error_code == SLURM_SUCCESS) {
		old_job_info_ptr = job_info_ptr;
		/* A single-job response is incomplete; zero its timestamp
		 * so a later full query is not wrongly suppressed */
		if (job_id)
			old_job_info_ptr->last_update = (time_t) 0;
		last_show_flags = show_flags;
		*job_buffer_pptr = job_info_ptr;
	}

	return error_code;
}
  118. /*
  119. * scontrol_pid_info - given a local process id, print the corresponding
  120. * slurm job id and its expected end time
  121. * IN job_pid - the local process id of interest
  122. */
  123. extern void
  124. scontrol_pid_info(pid_t job_pid)
  125. {
  126. int error_code;
  127. uint32_t job_id;
  128. time_t end_time;
  129. long rem_time;
  130. error_code = slurm_pid2jobid (job_pid, &job_id);
  131. if (error_code) {
  132. exit_code = 1;
  133. if (quiet_flag != 1)
  134. slurm_perror ("slurm_pid2jobid error");
  135. return;
  136. }
  137. error_code = slurm_get_end_time(job_id, &end_time);
  138. if (error_code) {
  139. exit_code = 1;
  140. if (quiet_flag != 1)
  141. slurm_perror ("slurm_get_end_time error");
  142. return;
  143. }
  144. printf("Slurm job id %u ends at %s\n", job_id, ctime(&end_time));
  145. rem_time = slurm_get_rem_time(job_id);
  146. printf("slurm_get_rem_time is %ld\n", rem_time);
  147. return;
  148. }
  149. /*
  150. * scontrol_print_completing - print jobs in completing state and
  151. * associated nodes in COMPLETING or DOWN state
  152. */
  153. extern void
  154. scontrol_print_completing (void)
  155. {
  156. int error_code, i;
  157. job_info_msg_t *job_info_msg;
  158. job_info_t *job_info;
  159. node_info_msg_t *node_info_msg;
  160. uint16_t show_flags = 0;
  161. error_code = _scontrol_load_jobs (&job_info_msg, 0);
  162. if (error_code) {
  163. exit_code = 1;
  164. if (quiet_flag != 1)
  165. slurm_perror ("slurm_load_jobs error");
  166. return;
  167. }
  168. /* Must load all nodes including hidden for cross-index
  169. * from job's node_inx to node table to work */
  170. /*if (all_flag) Always set this flag */
  171. show_flags |= SHOW_ALL;
  172. error_code = scontrol_load_nodes (&node_info_msg, show_flags);
  173. if (error_code) {
  174. exit_code = 1;
  175. if (quiet_flag != 1)
  176. slurm_perror ("slurm_load_nodes error");
  177. return;
  178. }
  179. /* Scan the jobs for completing state */
  180. job_info = job_info_msg->job_array;
  181. for (i=0; i<job_info_msg->record_count; i++) {
  182. if (job_info[i].job_state & JOB_COMPLETING)
  183. scontrol_print_completing_job(&job_info[i],
  184. node_info_msg);
  185. }
  186. }
  187. extern void
  188. scontrol_print_completing_job(job_info_t *job_ptr,
  189. node_info_msg_t *node_info_msg)
  190. {
  191. int i;
  192. node_info_t *node_info;
  193. hostlist_t all_nodes, comp_nodes, down_nodes;
  194. char *node_buf;
  195. all_nodes = hostlist_create(job_ptr->nodes);
  196. comp_nodes = hostlist_create("");
  197. down_nodes = hostlist_create("");
  198. for (i=0; i<node_info_msg->record_count; i++) {
  199. node_info = &(node_info_msg->node_array[i]);
  200. if (IS_NODE_COMPLETING(node_info) &&
  201. (_in_node_bit_list(i, job_ptr->node_inx)))
  202. hostlist_push_host(comp_nodes, node_info->name);
  203. else if (IS_NODE_DOWN(node_info) &&
  204. (hostlist_find(all_nodes, node_info->name) != -1))
  205. hostlist_push_host(down_nodes, node_info->name);
  206. }
  207. fprintf(stdout, "JobId=%u ", job_ptr->job_id);
  208. node_buf = hostlist_ranged_string_xmalloc(comp_nodes);
  209. if (node_buf && node_buf[0])
  210. fprintf(stdout, "Nodes(COMPLETING)=%s ", node_buf);
  211. xfree(node_buf);
  212. node_buf = hostlist_ranged_string_xmalloc(down_nodes);
  213. if (node_buf && node_buf[0])
  214. fprintf(stdout, "Nodes(DOWN)=%s ", node_buf);
  215. xfree(node_buf);
  216. fprintf(stdout, "\n");
  217. hostlist_destroy(all_nodes);
  218. hostlist_destroy(comp_nodes);
  219. hostlist_destroy(down_nodes);
  220. }
  221. extern uint16_t
  222. scontrol_get_job_state(uint32_t job_id)
  223. {
  224. job_info_msg_t * job_buffer_ptr = NULL;
  225. int error_code = SLURM_SUCCESS, i;
  226. job_info_t *job_ptr = NULL;
  227. error_code = _scontrol_load_jobs(&job_buffer_ptr, job_id);
  228. if (error_code) {
  229. exit_code = 1;
  230. if (quiet_flag == -1)
  231. slurm_perror ("slurm_load_jobs error");
  232. return (uint16_t) NO_VAL;
  233. }
  234. if (quiet_flag == -1) {
  235. char time_str[32];
  236. slurm_make_time_str((time_t *)&job_buffer_ptr->last_update,
  237. time_str, sizeof(time_str));
  238. printf("last_update_time=%s, records=%d\n",
  239. time_str, job_buffer_ptr->record_count);
  240. }
  241. job_ptr = job_buffer_ptr->job_array ;
  242. for (i = 0; i < job_buffer_ptr->record_count; i++) {
  243. if (job_ptr->job_id == job_id)
  244. return job_ptr->job_state;
  245. }
  246. if (quiet_flag == -1)
  247. printf("Could not find job %u", job_id);
  248. return (uint16_t) NO_VAL;
  249. }
  250. /*
  251. * scontrol_print_job - print the specified job's information
  252. * IN job_id - job's id or NULL to print information about all jobs
  253. */
  254. extern void
  255. scontrol_print_job (char * job_id_str)
  256. {
  257. int error_code = SLURM_SUCCESS, i, print_cnt = 0;
  258. uint32_t job_id = 0;
  259. job_info_msg_t * job_buffer_ptr = NULL;
  260. job_info_t *job_ptr = NULL;
  261. if (job_id_str)
  262. job_id = (uint32_t) strtol (job_id_str, (char **)NULL, 10);
  263. error_code = _scontrol_load_jobs(&job_buffer_ptr, job_id);
  264. if (error_code) {
  265. exit_code = 1;
  266. if (quiet_flag != 1)
  267. slurm_perror ("slurm_load_jobs error");
  268. return;
  269. }
  270. if (quiet_flag == -1) {
  271. char time_str[32];
  272. slurm_make_time_str ((time_t *)&job_buffer_ptr->last_update,
  273. time_str, sizeof(time_str));
  274. printf ("last_update_time=%s, records=%d\n",
  275. time_str, job_buffer_ptr->record_count);
  276. }
  277. job_ptr = job_buffer_ptr->job_array ;
  278. for (i = 0; i < job_buffer_ptr->record_count; i++) {
  279. print_cnt++;
  280. slurm_print_job_info (stdout, & job_ptr[i], one_liner ) ;
  281. }
  282. if (print_cnt == 0) {
  283. if (job_id_str) {
  284. exit_code = 1;
  285. if (quiet_flag != 1)
  286. printf ("Job %u not found\n", job_id);
  287. } else if (quiet_flag != 1)
  288. printf ("No jobs in the system\n");
  289. }
  290. }
/*
 * scontrol_print_step - print the specified job step's information
 * IN job_step_id_str - job step's id ("jobid[.stepid]") or NULL to print
 *	information about all job steps
 */
extern void
scontrol_print_step (char *job_step_id_str)
{
	int error_code, i;
	uint32_t job_id = NO_VAL, step_id = NO_VAL;
	char *next_str;
	job_step_info_response_msg_t *job_step_info_ptr;
	job_step_info_t * job_step_ptr;
	/* Cache of the previous query so repeated calls for the same
	 * job/step only transfer changed data */
	static uint32_t last_job_id = 0, last_step_id = 0;
	static job_step_info_response_msg_t *old_job_step_info_ptr = NULL;
	/* 0xffff cannot match real flags, forcing a full load first time */
	static uint16_t last_show_flags = 0xffff;
	uint16_t show_flags = 0;

	/* Parse "jobid[.stepid]"; missing parts stay NO_VAL (wildcard) */
	if (job_step_id_str) {
		job_id = (uint32_t) strtol (job_step_id_str, &next_str, 10);
		if (next_str[0] == '.')
			step_id = (uint32_t) strtol (&next_str[1], NULL, 10);
	}

	if (all_flag)
		show_flags |= SHOW_ALL;

	/* Reuse the cache only when querying the same job/step pair */
	if ((old_job_step_info_ptr) &&
	    (last_job_id == job_id) && (last_step_id == step_id)) {
		/* Changed flags invalidate the cached timestamp */
		if (last_show_flags != show_flags)
			old_job_step_info_ptr->last_update = (time_t) 0;
		error_code = slurm_get_job_steps (
			old_job_step_info_ptr->last_update,
			job_id, step_id, &job_step_info_ptr,
			show_flags);
		if (error_code == SLURM_SUCCESS)
			slurm_free_job_step_info_response_msg (
				old_job_step_info_ptr);
		else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) {
			/* Cached copy is still current; reuse it */
			job_step_info_ptr = old_job_step_info_ptr;
			error_code = SLURM_SUCCESS;
			if (quiet_flag == -1)
				printf ("slurm_get_job_steps no change in data\n");
		}
	}
	else {
		/* Different job/step: drop the stale cache and do a
		 * full load */
		if (old_job_step_info_ptr) {
			slurm_free_job_step_info_response_msg (
				old_job_step_info_ptr);
			old_job_step_info_ptr = NULL;
		}
		error_code = slurm_get_job_steps ( (time_t) 0, job_id, step_id,
						   &job_step_info_ptr,
						   show_flags);
	}

	if (error_code) {
		exit_code = 1;
		if (quiet_flag != 1)
			slurm_perror ("slurm_get_job_steps error");
		return;
	}

	/* Remember this response for the next invocation */
	old_job_step_info_ptr = job_step_info_ptr;
	last_show_flags = show_flags;
	last_job_id = job_id;
	last_step_id = step_id;

	if (quiet_flag == -1) {
		char time_str[32];
		slurm_make_time_str ((time_t *)&job_step_info_ptr->last_update,
				     time_str, sizeof(time_str));
		printf ("last_update_time=%s, records=%d\n",
			time_str, job_step_info_ptr->job_step_count);
	}

	job_step_ptr = job_step_info_ptr->job_steps ;
	for (i = 0; i < job_step_info_ptr->job_step_count; i++) {
		slurm_print_job_step_info (stdout, & job_step_ptr[i],
					   one_liner ) ;
	}

	if (job_step_info_ptr->job_step_count == 0) {
		if (job_step_id_str) {
			exit_code = 1;
			if (quiet_flag != 1)
				printf ("Job step %u.%u not found\n",
					job_id, step_id);
		} else if (quiet_flag != 1)
			printf ("No job steps in the system\n");
	}
}
  375. /* Return 1 on success, 0 on failure to find a jobid in the string */
  376. static int _parse_jobid(const char *jobid_str, uint32_t *out_jobid)
  377. {
  378. char *ptr, *job;
  379. long jobid;
  380. job = xstrdup(jobid_str);
  381. ptr = index(job, '.');
  382. if (ptr != NULL) {
  383. *ptr = '\0';
  384. }
  385. jobid = strtol(job, &ptr, 10);
  386. if (!xstring_is_whitespace(ptr)) {
  387. fprintf(stderr, "\"%s\" does not look like a jobid\n", job);
  388. xfree(job);
  389. return 0;
  390. }
  391. *out_jobid = (uint32_t) jobid;
  392. xfree(job);
  393. return 1;
  394. }
  395. /* Return 1 on success, 0 on failure to find a stepid in the string */
  396. static int _parse_stepid(const char *jobid_str, uint32_t *out_stepid)
  397. {
  398. char *ptr, *job, *step;
  399. long stepid;
  400. job = xstrdup(jobid_str);
  401. ptr = index(job, '.');
  402. if (ptr == NULL) {
  403. /* did not find a period, so no step ID in this string */
  404. xfree(job);
  405. return 0;
  406. } else {
  407. step = ptr + 1;
  408. }
  409. stepid = strtol(step, &ptr, 10);
  410. if (!xstring_is_whitespace(ptr)) {
  411. fprintf(stderr, "\"%s\" does not look like a stepid\n", step);
  412. xfree(job);
  413. return 0;
  414. }
  415. *out_stepid = (uint32_t) stepid;
  416. xfree(job);
  417. return 1;
  418. }
  419. static bool
  420. _in_task_array(pid_t pid, slurmstepd_task_info_t *task_array,
  421. uint32_t task_array_count)
  422. {
  423. int i;
  424. for (i = 0; i < task_array_count; i++) {
  425. if (pid == task_array[i].pid)
  426. return true;
  427. }
  428. return false;
  429. }
  430. static void
  431. _list_pids_one_step(const char *node_name, uint32_t jobid, uint32_t stepid)
  432. {
  433. int fd;
  434. slurmstepd_task_info_t *task_info;
  435. uint32_t *pids;
  436. uint32_t count = 0;
  437. uint32_t tcount = 0;
  438. int i;
  439. fd = stepd_connect(NULL, node_name, jobid, stepid);
  440. if (fd == -1) {
  441. exit_code = 1;
  442. if (errno == ENOENT) {
  443. fprintf(stderr,
  444. "Job step %u.%u does not exist on this node.\n",
  445. jobid, stepid);
  446. exit_code = 1;
  447. } else {
  448. perror("Unable to connect to slurmstepd");
  449. }
  450. return;
  451. }
  452. stepd_task_info(fd, &task_info, &tcount);
  453. for (i = 0; i < (int)tcount; i++) {
  454. if (!task_info[i].exited) {
  455. printf("%-8d %-8u %-6u %-7d %-8d\n",
  456. task_info[i].pid,
  457. jobid,
  458. stepid,
  459. task_info[i].id,
  460. task_info[i].gtid);
  461. }
  462. }
  463. stepd_list_pids(fd, &pids, &count);
  464. for (i = 0; i < count; i++) {
  465. if (!_in_task_array((pid_t)pids[i], task_info, tcount)) {
  466. printf("%-8d %-8u %-6u %-7s %-8s\n",
  467. pids[i], jobid, stepid, "-", "-");
  468. }
  469. }
  470. if (count > 0)
  471. xfree(pids);
  472. if (tcount > 0)
  473. xfree(task_info);
  474. close(fd);
  475. }
  476. static void
  477. _list_pids_all_steps(const char *node_name, uint32_t jobid)
  478. {
  479. List steps;
  480. ListIterator itr;
  481. step_loc_t *stepd;
  482. int count = 0;
  483. steps = stepd_available(NULL, node_name);
  484. if (!steps || list_count(steps) == 0) {
  485. fprintf(stderr, "Job %u does not exist on this node.\n", jobid);
  486. if (steps)
  487. list_destroy(steps);
  488. exit_code = 1;
  489. return;
  490. }
  491. itr = list_iterator_create(steps);
  492. while((stepd = list_next(itr))) {
  493. if (jobid == stepd->jobid) {
  494. _list_pids_one_step(stepd->nodename, stepd->jobid,
  495. stepd->stepid);
  496. count++;
  497. }
  498. }
  499. list_iterator_destroy(itr);
  500. list_destroy(steps);
  501. if (count == 0) {
  502. fprintf(stderr, "Job %u does not exist on this node.\n",
  503. jobid);
  504. exit_code = 1;
  505. }
  506. }
  507. static void
  508. _list_pids_all_jobs(const char *node_name)
  509. {
  510. List steps;
  511. ListIterator itr;
  512. step_loc_t *stepd;
  513. steps = stepd_available(NULL, node_name);
  514. if (!steps || list_count(steps) == 0) {
  515. fprintf(stderr, "No job steps exist on this node.\n");
  516. if (steps)
  517. list_destroy(steps);
  518. exit_code = 1;
  519. return;
  520. }
  521. itr = list_iterator_create(steps);
  522. while((stepd = list_next(itr))) {
  523. _list_pids_one_step(stepd->nodename, stepd->jobid,
  524. stepd->stepid);
  525. }
  526. list_iterator_destroy(itr);
  527. list_destroy(steps);
  528. }
  529. /*
  530. * scontrol_list_pids - given a slurmd job ID or job ID + step ID,
  531. * print the process IDs of the processes each job step (or
  532. * just the specified step ID).
  533. * IN jobid_str - string representing a jobid: jobid[.stepid]
  534. * IN node_name - May be NULL, in which case it will attempt to
  535. * determine the NodeName of the local host on its own.
  536. * This is mostly of use when multiple-slurmd support is in use,
  537. * because if NULL is used when there are multiple slurmd on the
  538. * node, one of them will be selected more-or-less at random.
  539. */
  540. extern void
  541. scontrol_list_pids(const char *jobid_str, const char *node_name)
  542. {
  543. uint32_t jobid = 0, stepid = 0;
  544. /* Job ID is optional */
  545. if (jobid_str != NULL
  546. && jobid_str[0] != '*'
  547. && !_parse_jobid(jobid_str, &jobid)) {
  548. exit_code = 1;
  549. return;
  550. }
  551. /* Step ID is optional */
  552. printf("%-8s %-8s %-6s %-7s %-8s\n",
  553. "PID", "JOBID", "STEPID", "LOCALID", "GLOBALID");
  554. if (jobid_str == NULL || jobid_str[0] == '*') {
  555. _list_pids_all_jobs(node_name);
  556. } else if (_parse_stepid(jobid_str, &stepid)) {
  557. _list_pids_one_step(node_name, jobid, stepid);
  558. } else {
  559. _list_pids_all_steps(node_name, jobid);
  560. }
  561. }
  562. /*
  563. * scontrol_print_hosts - given a node list expression, return
  564. * a list of nodes, one per line
  565. */
  566. extern void
  567. scontrol_print_hosts (char * node_list)
  568. {
  569. hostlist_t hl;
  570. char *host;
  571. if (!node_list) {
  572. error("host list is empty");
  573. return;
  574. }
  575. hl = hostlist_create(node_list);
  576. if (!hl) {
  577. fprintf(stderr, "Invalid hostlist: %s\n", node_list);
  578. return;
  579. }
  580. while ((host = hostlist_shift(hl))) {
  581. printf("%s\n", host);
  582. free(host);
  583. }
  584. hostlist_destroy(hl);
  585. }
  586. /* Replace '\n' with ',', remove duplicate comma */
  587. static void
  588. _reformat_hostlist(char *hostlist)
  589. {
  590. int i, o;
  591. for (i=0; (hostlist[i] != '\0'); i++) {
  592. if (hostlist[i] == '\n')
  593. hostlist[i] = ',';
  594. }
  595. o = 0;
  596. for (i=0; (hostlist[i] != '\0'); i++) {
  597. while ((hostlist[i] == ',') && (hostlist[i+1] == ','))
  598. i++;
  599. hostlist[o++] = hostlist[i];
  600. }
  601. hostlist[o] = '\0';
  602. }
  603. /*
  604. * scontrol_encode_hostlist - given a list of hostnames or the pathname
  605. * of a file containing hostnames, translate them into a hostlist
  606. * expression
  607. */
  608. extern int
  609. scontrol_encode_hostlist(char *hostlist)
  610. {
  611. char *io_buf = NULL, *tmp_list, *ranged_string;
  612. int buf_size = 1024 * 1024;
  613. hostlist_t hl;
  614. if (!hostlist) {
  615. fprintf(stderr, "Hostlist is NULL\n");
  616. return SLURM_ERROR;
  617. }
  618. if (hostlist[0] == '/') {
  619. ssize_t buf_read;
  620. int fd = open(hostlist, O_RDONLY);
  621. if (fd < 0) {
  622. fprintf(stderr, "Can not open %s\n", hostlist);
  623. return SLURM_ERROR;
  624. }
  625. io_buf = xmalloc(buf_size);
  626. buf_read = read(fd, io_buf, buf_size);
  627. close(fd);
  628. if (buf_read >= buf_size) {
  629. /* If over 1MB, the file is almost certainly invalid */
  630. fprintf(stderr, "File %s is too large\n", hostlist);
  631. return SLURM_ERROR;
  632. }
  633. io_buf[buf_read] = '\0';
  634. _reformat_hostlist(io_buf);
  635. tmp_list = io_buf;
  636. } else
  637. tmp_list = hostlist;
  638. hl = hostlist_create(tmp_list);
  639. if (hl == NULL) {
  640. fprintf(stderr, "Invalid hostlist: %s\n", tmp_list);
  641. return SLURM_ERROR;
  642. }
  643. ranged_string = hostlist_ranged_string_xmalloc(hl);
  644. printf("%s\n", ranged_string);
  645. hostlist_destroy(hl);
  646. xfree(ranged_string);
  647. xfree(io_buf);
  648. return SLURM_SUCCESS;
  649. }
/*
 * Test if any BG blocks are in deallocating state since they are
 * probably related to this job we will want to sleep longer
 * RET 1: deallocate in progress
 *     0: no deallocate in progress
 *    -1: error occurred
 */
static int _blocks_dealloc(void)
{
	/* Block info is cached across calls so only changed data is
	 * fetched from slurmctld */
	static block_info_msg_t *bg_info_ptr = NULL, *new_bg_ptr = NULL;
	int rc = 0, error_code = 0, i;
	uint16_t show_flags = 0;

	if (all_flag)
		show_flags |= SHOW_ALL;
	if (bg_info_ptr) {
		error_code = slurm_load_block_info(bg_info_ptr->last_update,
						   &new_bg_ptr, show_flags);
		if (error_code == SLURM_SUCCESS)
			slurm_free_block_info_msg(bg_info_ptr);
		else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			/* Nothing changed: keep using the cached copy */
			error_code = SLURM_SUCCESS;
			new_bg_ptr = bg_info_ptr;
		}
	} else {
		error_code = slurm_load_block_info((time_t) NULL,
						   &new_bg_ptr, show_flags);
	}

	if (error_code) {
		/* NOTE(review): message names "slurm_load_partitions" but
		 * the failing call is slurm_load_block_info — confirm
		 * before changing the user-visible text */
		error("slurm_load_partitions: %s",
		      slurm_strerror(slurm_get_errno()));
		return -1;
	}
	/* Any block in TERM state means a deallocation is in progress */
	for (i=0; i<new_bg_ptr->record_count; i++) {
		if(new_bg_ptr->block_array[i].state == BG_BLOCK_TERM) {
			rc = 1;
			break;
		}
	}
	bg_info_ptr = new_bg_ptr;
	return rc;
}
/* Poll until the BlueGene block allocated to the job is booted and
 * ready (or the job fails/is killed or the timeout expires).
 * IN alloc - the job's resource allocation response
 * RET SLURM_SUCCESS when the block is ready, else SLURM_ERROR */
static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = SLURM_ERROR, i, rc = 0;
	char *block_id = NULL;
	int cur_delay = 0;
	/* Timeout scales with node count: freeing a previous block plus
	 * a base boot time plus a per-node increment */
	int max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
			(BG_INCR_BLOCK_BOOT * alloc->node_cnt);

	select_g_select_jobinfo_get(alloc->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID,
				    &block_id);

	for (i=0; (cur_delay < max_delay); i++) {
		if (i) {
			if (i == 1) {
				info("Waiting for block %s to become ready for "
				     "job", block_id);
			} else
				debug("still waiting");
			sleep(POLL_SLEEP);
			rc = _blocks_dealloc();
			/* Only advance the clock when no deallocation is
			 * in progress (or on error); a deallocating block
			 * legitimately extends the wait */
			if ((rc == 0) || (rc == -1))
				cur_delay += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
			continue;			/* retry */
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = SLURM_SUCCESS;
			break;
		}
	}

	if (is_ready == SLURM_SUCCESS)
		info("Block %s is ready for job %u", block_id, alloc->job_id);
	else if ((rc & READY_JOB_STATE) == 0)
		info("Job %u no longer running", alloc->job_id);
	else
		info("Problem running job %u", alloc->job_id);
	xfree(block_id);

	return is_ready;
}
/* Poll until all of the job's nodes are booted and ready (or the job
 * fails/is killed or the timeout expires).  Only relevant when power
 * save mode may have suspended nodes.
 * IN job_id - the job of interest
 * RET SLURM_SUCCESS when nodes are ready, else SLURM_ERROR */
static int _wait_nodes_ready(uint32_t job_id)
{
	int is_ready = SLURM_ERROR, i, rc = 0;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return SLURM_SUCCESS;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	for (i=0; (cur_delay < max_delay); i++) {
		if (i) {
			if (i == 1)
				info("Waiting for nodes to boot");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
			continue;			/* retry */
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = SLURM_SUCCESS;
			break;
		}
	}

	if (is_ready == SLURM_SUCCESS)
		info("Nodes are ready for job %u", job_id);
	else if ((rc & READY_JOB_STATE) == 0)
		info("Job %u no longer running", job_id);
	else
		info("Problem running job %u", job_id);

	return is_ready;
}
  772. /*
  773. * Wait until a job is ready to execute or enters some failed state
  774. * RET 1: job ready to run
  775. * 0: job can't run (cancelled, failure state, timeout, etc.)
  776. */
  777. extern int scontrol_job_ready(char *job_id_str)
  778. {
  779. int rc;
  780. uint32_t job_id;
  781. job_id = atoi(job_id_str);
  782. if (job_id <= 0) {
  783. fprintf(stderr, "Invalid job_id %s", job_id_str);
  784. return SLURM_ERROR;
  785. }
  786. if(cluster_flags & CLUSTER_FLAG_BG) {
  787. resource_allocation_response_msg_t *alloc;
  788. rc = slurm_allocation_lookup_lite(job_id, &alloc);
  789. if (rc == SLURM_SUCCESS) {
  790. rc = _wait_bluegene_block_ready(alloc);
  791. slurm_free_resource_allocation_response_msg(alloc);
  792. } else {
  793. error("slurm_allocation_lookup_lite: %m");
  794. rc = SLURM_ERROR;
  795. }
  796. } else
  797. rc = _wait_nodes_ready(job_id);
  798. return rc;
  799. }