
/src/smap/job_functions.c

https://github.com/cfenoy/slurm
/*****************************************************************************\
 *  job_functions.c - Functions related to job display mode of smap.
 *****************************************************************************
 *  Copyright (C) 2002-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2011 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Danny Auble <da@llnl.gov>
 *
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.schedmd.com/slurmdocs/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "src/common/uid.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/smap/smap.h"

static int  _get_node_cnt(job_info_t * job);
static int  _max_cpus_per_node(void);
static int  _nodes_in_list(char *node_list);
static void _print_header_job(void);
static int  _print_text_job(job_info_t * job_ptr);
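
/* Load the current job list from slurmctld and display every pending or
 * active job, both on the node grid and in the text window (or on stdout
 * when running in command-line mode). */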
extern void get_job(void)
{
	int error_code = -1, i, recs;
	static int printed_jobs = 0;
	static int count = 0;
	static job_info_msg_t *job_info_ptr = NULL, *new_job_ptr = NULL;
	job_info_t *job_ptr = NULL;
	uint16_t show_flags = 0;
	bitstr_t *nodes_req = NULL;
	static uint16_t last_flags = 0;

	if (params.all_flag)
		show_flags |= SHOW_ALL;

	/* Only ask slurmctld for changes since the last poll; if the show
	 * flags changed, force a full reload by clearing last_update. */
	if (job_info_ptr) {
		if (show_flags != last_flags)
			job_info_ptr->last_update = 0;
		error_code = slurm_load_jobs(job_info_ptr->last_update,
					     &new_job_ptr, show_flags);
		if (error_code == SLURM_SUCCESS)
			slurm_free_job_info_msg(job_info_ptr);
		else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			error_code = SLURM_SUCCESS;
			new_job_ptr = job_info_ptr;
		}
	} else
		error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr,
					     show_flags);

	last_flags = show_flags;
	if (error_code) {
		if (quiet_flag != 1) {
			if (!params.commandline) {
				mvwprintw(text_win,
					  main_ycord, 1,
					  "slurm_load_jobs: %s",
					  slurm_strerror(slurm_get_errno()));
				main_ycord++;
			} else {
				printf("slurm_load_jobs: %s\n",
				       slurm_strerror(slurm_get_errno()));
			}
		}
	}

	if (!params.no_header)
		_print_header_job();

	if (new_job_ptr)
		recs = new_job_ptr->record_count;
	else
		recs = 0;

	if (!params.commandline)
		if ((text_line_cnt + printed_jobs) > count)
			text_line_cnt--;
	printed_jobs = 0;
	count = 0;

	if (params.hl)
		nodes_req = get_requested_node_bitmap();

	/* First pass: jobs that have nodes allocated (running, suspended,
	 * or completing). */
	for (i = 0; i < recs; i++) {
		job_ptr = &(new_job_ptr->job_array[i]);
		if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr) &&
		    !IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;	/* job has completed */

		if (nodes_req) {
			int overlap = 0;
			bitstr_t *loc_bitmap = bit_alloc(bit_size(nodes_req));
			inx2bitstr(loc_bitmap, job_ptr->node_inx);
			overlap = bit_overlap(loc_bitmap, nodes_req);
			FREE_NULL_BITMAP(loc_bitmap);
			if (!overlap)
				continue;
		}

		if (job_ptr->node_inx[0] != -1) {
			/* node_inx holds [start, end] index pairs terminated
			 * by -1; sum the ranges to get the node count and
			 * mark those nodes on the grid. */
			int j = 0;
			job_ptr->num_nodes = 0;
			while (job_ptr->node_inx[j] >= 0) {
				job_ptr->num_nodes +=
					(job_ptr->node_inx[j + 1] + 1) -
					 job_ptr->node_inx[j];
				set_grid_inx(job_ptr->node_inx[j],
					     job_ptr->node_inx[j + 1], count);
				j += 2;
			}

			if (!params.commandline) {
				if ((count >= text_line_cnt) &&
				    (printed_jobs < (getmaxy(text_win) - 4))) {
					/* num_cpus is reused to carry the
					 * one-character grid ID for display */
					job_ptr->num_cpus =
						(int)letters[count%62];
					wattron(text_win,
						COLOR_PAIR(colors[count%6]));
					_print_text_job(job_ptr);
					wattroff(text_win,
						 COLOR_PAIR(colors[count%6]));
					printed_jobs++;
				}
			} else {
				job_ptr->num_cpus = (int)letters[count%62];
				_print_text_job(job_ptr);
			}
			count++;
		}
		if (count == 128)
			count = 0;
	}

	/* Second pass: pending jobs, shown with "waiting..." in place of a
	 * node list. */
	for (i = 0; i < recs; i++) {
		job_ptr = &(new_job_ptr->job_array[i]);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* only pending jobs in this pass */

		if (!params.commandline) {
			if ((count >= text_line_cnt) &&
			    (printed_jobs < (getmaxy(text_win) - 4))) {
				xfree(job_ptr->nodes);
				job_ptr->nodes = xstrdup("waiting...");
				job_ptr->num_cpus = (int) letters[count%62];
				wattron(text_win,
					COLOR_PAIR(colors[count%6]));
				_print_text_job(job_ptr);
				wattroff(text_win,
					 COLOR_PAIR(colors[count%6]));
				printed_jobs++;
			}
		} else {
			xfree(job_ptr->nodes);
			job_ptr->nodes = xstrdup("waiting...");
			job_ptr->num_cpus = (int) letters[count%62];
			_print_text_job(job_ptr);
			printed_jobs++;
		}
		count++;
		if (count == 128)
			count = 0;
	}

	if (params.commandline && params.iterate)
		printf("\n");

	if (!params.commandline)
		main_ycord++;

	job_info_ptr = new_job_ptr;
	return;
}
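
/* Print the column headings for the job display, either into the curses
 * text window or to stdout in command-line mode. */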
static void _print_header_job(void)
{
	if (!params.commandline) {
		mvwprintw(text_win, main_ycord,
			  main_xcord, "ID");
		main_xcord += 3;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "JOBID");
		main_xcord += 8;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "PARTITION");
		main_xcord += 10;
		if (params.cluster_flags & CLUSTER_FLAG_BG) {
			mvwprintw(text_win, main_ycord,
				  main_xcord, "BG_BLOCK");
			main_xcord += 18;
		}
		if (params.cluster_flags & CLUSTER_FLAG_CRAYXT) {
			mvwprintw(text_win, main_ycord,
				  main_xcord, "RESV_ID");
			main_xcord += 18;
		}
		mvwprintw(text_win, main_ycord,
			  main_xcord, "USER");
		main_xcord += 9;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "NAME");
		main_xcord += 10;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "ST");
		main_xcord += 8;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "TIME");
		main_xcord += 5;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "NODES");
		main_xcord += 6;
		if (params.cluster_flags & CLUSTER_FLAG_BG)
			mvwprintw(text_win, main_ycord,
				  main_xcord, "MIDPLANELIST");
		else
			mvwprintw(text_win, main_ycord,
				  main_xcord, "NODELIST");
		main_xcord = 1;
		main_ycord++;
	} else {
		printf("   JOBID ");
		printf("PARTITION ");
		if (params.cluster_flags & CLUSTER_FLAG_BG)
			printf("        BG_BLOCK ");
		printf("    USER ");
		printf("  NAME ");
		printf("ST ");
		printf("      TIME ");
		printf("NODES ");
		if (params.cluster_flags & CLUSTER_FLAG_BG)
			printf("MIDPLANELIST\n");
		else
			printf("NODELIST\n");
	}
}
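
/* Return the run time of a job in seconds, accounting for suspended jobs
 * and time accrued before a suspension. */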
static long _job_time_used(job_info_t * job_ptr)
{
	time_t end_time;

	if ((job_ptr->start_time == 0) || IS_JOB_PENDING(job_ptr))
		return 0L;

	if (IS_JOB_SUSPENDED(job_ptr))
		return (long) job_ptr->pre_sus_time;

	if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
		end_time = time(NULL);
	else
		end_time = job_ptr->end_time;

	if (job_ptr->suspend_time)
		return (long) (difftime(end_time, job_ptr->suspend_time)
			       + job_ptr->pre_sus_time);
	return (long) (difftime(end_time, job_ptr->start_time));
}
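
/* Print one job record as a row of the job display, either into the curses
 * text window or to stdout in command-line mode. */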
static int _print_text_job(job_info_t * job_ptr)
{
	time_t time_diff;
	int printed = 0;
	int tempxcord;
	int prefixlen = 0;
	int i = 0;
	int width = 0;
	char time_buf[20];
	char tmp_cnt[8];
	uint32_t node_cnt = 0;
	char *ionodes = NULL, *uname;

	if (params.cluster_flags & CLUSTER_FLAG_BG) {
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &ionodes);
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if (!strcasecmp(job_ptr->nodes, "waiting..."))
			xfree(ionodes);
	} else
		node_cnt = job_ptr->num_nodes;

	if ((node_cnt == 0) || (node_cnt == NO_VAL))
		node_cnt = _get_node_cnt(job_ptr);

	if (params.cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)node_cnt, tmp_cnt,
				 sizeof(tmp_cnt), UNIT_NONE);
	else
		snprintf(tmp_cnt, sizeof(tmp_cnt), "%d", node_cnt);

	if (!params.commandline) {
		/* num_cpus carries the one-character grid ID set by
		 * get_job() */
		mvwprintw(text_win, main_ycord,
			  main_xcord, "%c", job_ptr->num_cpus);
		main_xcord += 3;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "%d", job_ptr->job_id);
		main_xcord += 8;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "%.10s", job_ptr->partition);
		main_xcord += 10;
		if (params.cluster_flags & CLUSTER_FLAG_BG) {
			mvwprintw(text_win, main_ycord,
				  main_xcord, "%.16s",
				  select_g_select_jobinfo_sprint(
					  job_ptr->select_jobinfo,
					  time_buf,
					  sizeof(time_buf),
					  SELECT_PRINT_BG_ID));
			main_xcord += 18;
		}
		if (params.cluster_flags & CLUSTER_FLAG_CRAYXT) {
			mvwprintw(text_win, main_ycord,
				  main_xcord, "%.16s",
				  select_g_select_jobinfo_sprint(
					  job_ptr->select_jobinfo,
					  time_buf, sizeof(time_buf),
					  SELECT_PRINT_DATA));
			main_xcord += 18;
		}
		uname = uid_to_string((uid_t) job_ptr->user_id);
		mvwprintw(text_win, main_ycord,
			  main_xcord, "%.8s", uname);
		xfree(uname);
		main_xcord += 9;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "%.9s", job_ptr->name);
		main_xcord += 10;
		mvwprintw(text_win, main_ycord,
			  main_xcord, "%.2s",
			  job_state_string_compact(job_ptr->job_state));
		main_xcord += 2;
		if (!strcasecmp(job_ptr->nodes, "waiting...")) {
			sprintf(time_buf, "00:00:00");
		} else {
			time_diff = (time_t) _job_time_used(job_ptr);
			secs2time_str(time_diff, time_buf, sizeof(time_buf));
		}
		width = strlen(time_buf);
		mvwprintw(text_win, main_ycord,
			  main_xcord + (10 - width), "%s",
			  time_buf);
		main_xcord += 11;

		mvwprintw(text_win,
			  main_ycord,
			  main_xcord, "%5s", tmp_cnt);
		main_xcord += 6;

		/* Print the node list, wrapping onto the next line (aligned
		 * past the hostlist prefix) when it runs out of room. */
		tempxcord = main_xcord;
		i = 0;
		while (job_ptr->nodes[i] != '\0') {
			if ((printed = mvwaddch(text_win,
						main_ycord,
						main_xcord,
						job_ptr->nodes[i])) < 0) {
				xfree(ionodes);
				return printed;
			}
			main_xcord++;
			width = getmaxx(text_win) - 1 - main_xcord;
			if (job_ptr->nodes[i] == '[')
				prefixlen = i + 1;
			else if (job_ptr->nodes[i] == ','
				 && (width - 9) <= 0) {
				main_ycord++;
				main_xcord = tempxcord + prefixlen;
			}
			i++;
		}
		if (ionodes) {
			mvwprintw(text_win,
				  main_ycord,
				  main_xcord, "[%s]",
				  ionodes);
			main_xcord += strlen(ionodes) + 2;
			xfree(ionodes);
		}

		main_xcord = 1;
		main_ycord++;
	} else {
		printf("%8d ", job_ptr->job_id);
		printf("%9.9s ", job_ptr->partition);
		if (params.cluster_flags & CLUSTER_FLAG_BG)
			printf("%16.16s ",
			       select_g_select_jobinfo_sprint(
				       job_ptr->select_jobinfo,
				       time_buf, sizeof(time_buf),
				       SELECT_PRINT_BG_ID));
		if (params.cluster_flags & CLUSTER_FLAG_CRAYXT)
			printf("%16.16s ",
			       select_g_select_jobinfo_sprint(
				       job_ptr->select_jobinfo,
				       time_buf, sizeof(time_buf),
				       SELECT_PRINT_DATA));
		uname = uid_to_string((uid_t) job_ptr->user_id);
		printf("%8.8s ", uname);
		xfree(uname);
		printf("%6.6s ", job_ptr->name);
		printf("%2.2s ",
		       job_state_string_compact(job_ptr->job_state));
		if (!strcasecmp(job_ptr->nodes, "waiting...")) {
			sprintf(time_buf, "00:00:00");
		} else {
			time_diff = (time_t) _job_time_used(job_ptr);
			secs2time_str(time_diff, time_buf, sizeof(time_buf));
		}
		printf("%10.10s ", time_buf);
		printf("%5s ", tmp_cnt);
		printf("%s", job_ptr->nodes);
		if (ionodes) {
			printf("[%s]", ionodes);
			xfree(ionodes);
		}
		printf("\n");
	}
	return printed;
}
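
/* Estimate the node count for a job: for pending (or completing) jobs use
 * the largest of the requested node list, the requested node count, and the
 * node count implied by the requested CPUs; otherwise count the allocated
 * nodes. */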
static int _get_node_cnt(job_info_t * job)
{
	int node_cnt = 0, round;
	bool completing = job->job_state & JOB_COMPLETING;
	uint16_t base_job_state = job->job_state & (~JOB_COMPLETING);
	static int max_cpus = 0;

	if (base_job_state == JOB_PENDING || completing) {
		if (max_cpus == 0)
			max_cpus = _max_cpus_per_node();
		node_cnt = _nodes_in_list(job->req_nodes);
		node_cnt = MAX(node_cnt, job->num_nodes);
		/* nodes needed for the requested CPUs:
		 * ceil(num_cpus / max_cpus) */
		round  = job->num_cpus + max_cpus - 1;
		round /= max_cpus;	/* round up */
		node_cnt = MAX(node_cnt, round);
	} else
		node_cnt = _nodes_in_list(job->nodes);
	return node_cnt;
}
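
/* Return the number of nodes in a SLURM hostlist expression,
 * e.g. "tux[0-15]" yields 16. */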
static int _nodes_in_list(char *node_list)
{
	hostset_t host_set = hostset_create(node_list);
	int count = hostset_count(host_set);
	hostset_destroy(host_set);
	return count;
}
/* Return the maximum number of processors for any node in the cluster */
static int _max_cpus_per_node(void)
{
	int error_code, max_cpus = 1;
	node_info_msg_t *node_info_ptr = NULL;

	error_code = slurm_load_node((time_t) NULL, &node_info_ptr,
				     params.all_flag ? 1 : 0);
	if (error_code == SLURM_SUCCESS) {
		int i;
		node_info_t *node_ptr = node_info_ptr->node_array;
		for (i = 0; i < node_info_ptr->record_count; i++) {
			max_cpus = MAX(max_cpus, node_ptr[i].cpus);
		}
		slurm_free_node_info_msg(node_info_ptr);
	}
	return max_cpus;
}
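
For reference, a minimal standalone sketch of the incremental slurm_load_jobs() polling pattern that get_job() uses above: the previous job_info_msg_t is cached and its last_update time is passed back to slurmctld, so an unchanged job list costs only a SLURM_NO_CHANGE_IN_DATA round trip. The calls mirror those in this file; the poll_jobs() helper name and the plain printf() reporting are hypothetical, for illustration only, and this snippet is not part of smap.

/* Illustrative sketch only; builds against the public SLURM headers. */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static job_info_msg_t *old_job_ptr = NULL;	/* cached from the last poll */

static int poll_jobs(uint16_t show_flags)
{
	job_info_msg_t *new_job_ptr = NULL;
	uint32_t i;
	int rc;

	if (old_job_ptr) {
		/* Ask only for changes since the cached message was built */
		rc = slurm_load_jobs(old_job_ptr->last_update, &new_job_ptr,
				     show_flags);
		if (rc == SLURM_SUCCESS)
			slurm_free_job_info_msg(old_job_ptr);
		else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			rc = SLURM_SUCCESS;	/* keep the cached copy */
			new_job_ptr = old_job_ptr;
		}
	} else
		rc = slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags);

	if (rc != SLURM_SUCCESS) {
		fprintf(stderr, "slurm_load_jobs: %s\n",
			slurm_strerror(slurm_get_errno()));
		return rc;
	}

	/* Report one line per job record returned by slurmctld */
	for (i = 0; i < new_job_ptr->record_count; i++) {
		job_info_t *job = &new_job_ptr->job_array[i];
		printf("%u %s\n", job->job_id, job->name);
	}

	old_job_ptr = new_job_ptr;
	return SLURM_SUCCESS;
}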