
/src/plugins/select/bluegene/bg_record_functions.c

https://github.com/cfenoy/slurm
C | 1884 lines | 1402 code | 220 blank | 262 comment
Possible License(s): GPL-2.0-or-later (with OpenSSL linking exception; see the file header below)


  1. /*****************************************************************************\
  2. * bg_record_functions.c - header for creating blocks in a static environment.
  3. *
  4. * $Id: bg_record_functions.c 12954 2008-01-04 20:37:49Z da $
  5. *****************************************************************************
  6. * Copyright (C) 2008 Lawrence Livermore National Security.
  7. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  8. * Written by Danny Auble <da@llnl.gov>
  9. *
  10. * This file is part of SLURM, a resource management program.
  11. * For details, see <http://www.schedmd.com/slurmdocs/>.
  12. * Please also read the included file: DISCLAIMER.
  13. *
  14. * SLURM is free software; you can redistribute it and/or modify it under
  15. * the terms of the GNU General Public License as published by the Free
  16. * Software Foundation; either version 2 of the License, or (at your option)
  17. * any later version.
  18. *
  19. * In addition, as a special exception, the copyright holders give permission
  20. * to link the code of portions of this program with the OpenSSL library under
  21. * certain conditions as described in each individual source file, and
  22. * distribute linked combinations including the two. You must obey the GNU
  23. * General Public License in all respects for all of the code used other than
  24. * OpenSSL. If you modify file(s) with this exception, you may extend this
  25. * exception to your version of the file(s), but you are not obligated to do
  26. * so. If you do not wish to do so, delete this exception statement from your
  27. * version. If you delete this exception statement from all source files in
  28. * the program, then also delete it here.
  29. *
  30. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  31. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  32. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  33. * details.
  34. *
  35. * You should have received a copy of the GNU General Public License along
  36. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  37. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  38. \*****************************************************************************/
  39. #include "bg_core.h"
  40. #include "bg_dynamic_block.h"
  41. #include "src/common/uid.h"
  42. #include "src/common/slurm_accounting_storage.h"
  43. #include "src/slurmctld/trigger_mgr.h"
  44. #include "src/slurmctld/locks.h"
  45. /* some local functions */
  46. static int _set_block_nodes_accounting(bg_record_t *bg_record, char *reason);
  47. static void _addto_mp_list(bg_record_t *bg_record,
  48. uint16_t *start, uint16_t *end);
  49. static int _ba_mp_cmpf_inc(ba_mp_t *node_a, ba_mp_t *node_b);
  50. static void _set_block_avail(bg_record_t *bg_record);
  51. extern void print_bg_record(bg_record_t* bg_record)
  52. {
  53. char *conn_type;
  54. if (!bg_record) {
  55. error("print_bg_record, record given is null");
  56. return;
  57. }
  58. conn_type = conn_type_string_full(bg_record->conn_type);
  59. #if _DEBUG
  60. info(" bg_record: ");
  61. if (bg_record->bg_block_id)
  62. info("\tbg_block_id: %s", bg_record->bg_block_id);
  63. info("\tnodes: %s", bg_record->mp_str);
  64. info("\tsize: %d MPs %u Nodes %d cpus",
  65. bg_record->mp_count,
  66. bg_record->cnode_cnt,
  67. bg_record->cpu_cnt);
  68. info("\tgeo: %ux%ux%u", bg_record->geo[X], bg_record->geo[Y],
  69. bg_record->geo[Z]);
  70. info("\tconn_type: %s", conn_type);
  71. #ifdef HAVE_BGL
  72. info("\tnode_use: %s", node_use_string(bg_record->node_use));
  73. #endif
  74. if (bg_record->mp_bitmap) {
  75. char bitstring[BITSIZE];
  76. bit_fmt(bitstring, BITSIZE, bg_record->mp_bitmap);
  77. info("\tbitmap: %s", bitstring);
  78. }
  79. #else
  80. {
  81. char tmp_char[256];
  82. format_node_name(bg_record, tmp_char, sizeof(tmp_char));
  83. info("Record: BlockID:%s Nodes:%s Conn:%s",
  84. bg_record->bg_block_id, tmp_char,
  85. conn_type);
  86. }
  87. #endif
  88. xfree(conn_type);
  89. }
  90. extern void destroy_bg_record(void *object)
  91. {
  92. bg_record_t* bg_record = (bg_record_t*) object;
  93. if (bg_record) {
  94. bg_record->magic = 0;
  95. if (bg_record->ba_mp_list) {
  96. list_destroy(bg_record->ba_mp_list);
  97. bg_record->ba_mp_list = NULL;
  98. }
  99. xfree(bg_record->bg_block_id);
  100. xfree(bg_record->blrtsimage);
  101. xfree(bg_record->ionode_str);
  102. FREE_NULL_BITMAP(bg_record->ionode_bitmap);
  103. if (bg_record->job_list) {
  104. list_destroy(bg_record->job_list);
  105. bg_record->job_list = NULL;
  106. }
  107. xfree(bg_record->linuximage);
  108. xfree(bg_record->mloaderimage);
  109. xfree(bg_record->mp_str);
  110. FREE_NULL_BITMAP(bg_record->mp_bitmap);
  111. xfree(bg_record->ramdiskimage);
  112. xfree(bg_record->reason);
  113. xfree(bg_record);
  114. }
  115. }
  116. extern void process_nodes(bg_record_t *bg_record, bool startup)
  117. {
  118. int j=0;
  119. int diff=0;
  120. int largest_diff=-1;
  121. uint16_t best_start[SYSTEM_DIMENSIONS];
  122. uint16_t start[SYSTEM_DIMENSIONS];
  123. uint16_t end[SYSTEM_DIMENSIONS];
  124. bool start_set=0;
  125. ListIterator itr;
  126. ba_mp_t* ba_mp = NULL;
  127. int dim;
  128. static char tmp_char[SYSTEM_DIMENSIONS+1],
  129. tmp_char2[SYSTEM_DIMENSIONS+1];
  130. static int *cluster_dims = NULL;
  131. if (!cluster_dims) {
  132. /* do some initing that only needs to happen once. */
  133. cluster_dims = select_g_ba_get_dims();
  134. memset(tmp_char, 0, sizeof(tmp_char));
  135. memset(tmp_char2, 0, sizeof(tmp_char2));
  136. }
  137. if (!bg_record->ba_mp_list || !list_count(bg_record->ba_mp_list)) {
  138. char *nodes = bg_record->mp_str;
  139. if (!bg_record->ba_mp_list)
  140. bg_record->ba_mp_list = list_create(destroy_ba_mp);
  141. memset(&best_start, 0, sizeof(best_start));
  142. //bg_record->mp_count = 0;
  143. if ((bg_record->conn_type[0] >= SELECT_SMALL) && (!startup))
  144. error("process_nodes: "
  145. "We shouldn't be here there could be some "
  146. "badness if we use this logic %s",
  147. bg_record->mp_str);
  148. while (nodes[j] != '\0') {
  149. int mid = j + SYSTEM_DIMENSIONS + 1;
  150. int fin = mid + SYSTEM_DIMENSIONS + 1;
  151. if (((nodes[j] == '[') || (nodes[j] == ',')) &&
  152. ((nodes[mid] == 'x') || (nodes[mid] == '-')) &&
  153. ((nodes[fin] == ']') || (nodes[fin] == ','))) {
  154. j++; /* Skip leading '[' or ',' */
  155. for (dim = 0; dim < SYSTEM_DIMENSIONS;
  156. dim++, j++)
  157. start[dim] = select_char2coord(
  158. nodes[j]);
  159. j++; /* Skip middle 'x' or '-' */
  160. for (dim = 0; dim < SYSTEM_DIMENSIONS;
  161. dim++, j++)
  162. end[dim] = select_char2coord(nodes[j]);
  163. diff = end[0]-start[0];
  164. _addto_mp_list(bg_record, start, end);
  165. } else if ((nodes[j] >= '0'&& nodes[j] <= '9')
  166. || (nodes[j] >= 'A' && nodes[j] <= 'Z')) {
  167. for (dim = 0; dim < SYSTEM_DIMENSIONS;
  168. dim++, j++)
  169. start[dim] = select_char2coord(
  170. nodes[j]);
  171. diff = 0;
  172. _addto_mp_list(bg_record, start, start);
  173. } else {
  174. j++;
  175. continue;
  176. }
  177. if (diff > largest_diff) {
  178. largest_diff = diff;
  179. memcpy(best_start, start, sizeof(best_start));
  180. if (bg_conf->slurm_debug_level
  181. >= LOG_LEVEL_DEBUG3) {
  182. for (dim = 0;
  183. dim < SYSTEM_DIMENSIONS;
  184. dim++)
  185. tmp_char[dim] = alpha_num[
  186. best_start[dim]];
  187. debug3("process_nodes: start is now %s",
  188. tmp_char);
  189. }
  190. }
  191. if (bg_record->mp_str[j] != ',')
  192. break;
  193. }
  194. if (largest_diff == -1)
  195. fatal("No hostnames given here");
  196. memcpy(bg_record->start, best_start, sizeof(bg_record->start));
  197. start_set = 1;
  198. if (bg_conf->slurm_debug_level >= LOG_LEVEL_DEBUG3) {
  199. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  200. tmp_char[dim] = alpha_num[best_start[dim]];
  201. tmp_char2[dim] =
  202. alpha_num[bg_record->start[dim]];
  203. }
  204. debug3("process_nodes: start is %s %s",
  205. tmp_char, tmp_char2);
  206. }
  207. }
  208. memset(bg_record->geo, 0, sizeof(bg_record->geo));
  209. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  210. end[dim] = (int16_t)-1;
  211. if (!start_set)
  212. bg_record->start[dim] = HOSTLIST_BASE;
  213. }
  214. list_sort(bg_record->ba_mp_list, (ListCmpF) _ba_mp_cmpf_inc);
  215. FREE_NULL_BITMAP(bg_record->mp_bitmap);
  216. bg_record->mp_bitmap = bit_alloc(node_record_count);
  217. bg_record->mp_count = 0;
  218. itr = list_iterator_create(bg_record->ba_mp_list);
  219. while ((ba_mp = list_next(itr))) {
  220. if (!ba_mp->used)
  221. continue;
  222. bg_record->mp_count++;
  223. debug3("process_nodes: %s is included in this block",
  224. ba_mp->coord_str);
  225. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  226. if (ba_mp->coord[dim] > (int16_t)end[dim]) {
  227. bg_record->geo[dim]++;
  228. end[dim] = ba_mp->coord[dim];
  229. }
  230. if (!start_set && (ba_mp->coord[dim] <
  231. (int16_t)bg_record->start[dim]))
  232. bg_record->start[dim] = ba_mp->coord[dim];
  233. }
  234. bit_set(bg_record->mp_bitmap, ba_mp->index);
  235. }
  236. list_iterator_destroy(itr);
  237. if (bg_conf->slurm_debug_level >= LOG_LEVEL_DEBUG3) {
  238. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  239. tmp_char[dim] = alpha_num[bg_record->geo[dim]];
  240. tmp_char2[dim] = alpha_num[bg_record->start[dim]];
  241. }
  242. debug3("process_nodes: geo = %s mp count is %d start is %s",
  243. tmp_char, bg_record->mp_count, tmp_char2);
  244. }
  245. /* This check is for sub midplane systems to figure out what
  246. the largest block can be.
  247. */
  248. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  249. if (cluster_dims[dim] > 1)
  250. break;
  251. }
  252. if (dim < SYSTEM_DIMENSIONS) {
  253. /* means we have more than 1 base partition */
  254. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  255. if (bg_record->geo[dim] != cluster_dims[dim])
  256. break;
  257. }
  258. if (dim == SYSTEM_DIMENSIONS)
  259. bg_record->full_block = 1;
  260. } else if (bg_record->cnode_cnt == bg_conf->mp_cnode_cnt)
  261. bg_record->full_block = 1;
  262. return;
  263. }
  264. /*
  265. * NOTE: This function does not do a mutex lock so if you are copying the
  266. * main bg_list you need to lock 'block_state_mutex' before calling
  267. */
  268. extern List copy_bg_list(List in_list)
  269. {
  270. bg_record_t *bg_record = NULL;
  271. bg_record_t *new_record = NULL;
  272. List out_list = list_create(destroy_bg_record);
  273. ListIterator itr = list_iterator_create(in_list);
  274. while ((bg_record = (bg_record_t *) list_next(itr))) {
  275. if (bg_record->magic != BLOCK_MAGIC) {
  276. error("trying to copy a bad record");
  277. continue;
  278. }
  279. /* we don't care about blocks being destroyed and the
  280. * job is gone */
  281. if (bg_record->destroy
  282. && (!bg_record->job_ptr
  283. && (!bg_record->job_list
  284. || !list_count(bg_record->job_list))))
  285. continue;
  286. new_record = xmalloc(sizeof(bg_record_t));
  287. new_record->original = bg_record;
  288. copy_bg_record(bg_record, new_record);
  289. list_append(out_list, new_record);
  290. }
  291. list_iterator_destroy(itr);
  292. return out_list;
  293. }
  294. extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record)
  295. {
  296. int i;
  297. ListIterator itr = NULL;
  298. ba_mp_t *ba_mp = NULL, *new_ba_mp = NULL;
  299. if (!fir_record || !sec_record) {
  300. error("copy_bg_record: "
  301. "given a null for either first record or second record");
  302. return;
  303. }
  304. xfree(sec_record->bg_block_id);
  305. sec_record->action = fir_record->action;
  306. sec_record->bg_block_id = xstrdup(fir_record->bg_block_id);
  307. if (sec_record->ba_mp_list)
  308. list_destroy(sec_record->ba_mp_list);
  309. sec_record->ba_mp_list = list_create(destroy_ba_mp);
  310. if (fir_record->ba_mp_list) {
  311. itr = list_iterator_create(fir_record->ba_mp_list);
  312. while ((ba_mp = list_next(itr))) {
  313. new_ba_mp = ba_copy_mp(ba_mp);
  314. if (ba_mp->cnode_bitmap)
  315. new_ba_mp->cnode_bitmap =
  316. bit_copy(ba_mp->cnode_bitmap);
  317. if (ba_mp->cnode_err_bitmap)
  318. new_ba_mp->cnode_err_bitmap =
  319. bit_copy(ba_mp->cnode_err_bitmap);
  320. if (ba_mp->cnode_usable_bitmap)
  321. new_ba_mp->cnode_usable_bitmap =
  322. bit_copy(ba_mp->cnode_usable_bitmap);
  323. list_append(sec_record->ba_mp_list, new_ba_mp);
  324. }
  325. list_iterator_destroy(itr);
  326. }
  327. FREE_NULL_BITMAP(sec_record->mp_bitmap);
  328. if (fir_record->mp_bitmap
  329. && !(sec_record->mp_bitmap = bit_copy(fir_record->mp_bitmap))) {
  330. error("Unable to copy bitmap for %s", fir_record->mp_str);
  331. sec_record->mp_bitmap = NULL;
  332. }
  333. sec_record->boot_state = fir_record->boot_state;
  334. sec_record->boot_count = fir_record->boot_count;
  335. sec_record->cnode_cnt = fir_record->cnode_cnt;
  336. sec_record->cnode_err_cnt = fir_record->cnode_err_cnt;
  337. memcpy(sec_record->conn_type, fir_record->conn_type,
  338. sizeof(sec_record->conn_type));
  339. sec_record->cpu_cnt = fir_record->cpu_cnt;
  340. sec_record->destroy = fir_record->destroy;
  341. sec_record->err_ratio = fir_record->err_ratio;
  342. sec_record->free_cnt = fir_record->free_cnt;
  343. sec_record->full_block = fir_record->full_block;
  344. for (i=0;i<SYSTEM_DIMENSIONS;i++) {
  345. sec_record->geo[i] = fir_record->geo[i];
  346. sec_record->start[i] = fir_record->start[i];
  347. }
  348. for (i=0;i<HIGHEST_DIMENSIONS;i++)
  349. sec_record->start_small[i] = fir_record->start_small[i];
  350. xfree(sec_record->ionode_str);
  351. sec_record->ionode_str = xstrdup(fir_record->ionode_str);
  352. FREE_NULL_BITMAP(sec_record->ionode_bitmap);
  353. if (fir_record->ionode_bitmap
  354. && (sec_record->ionode_bitmap
  355. = bit_copy(fir_record->ionode_bitmap)) == NULL) {
  356. error("Unable to copy ionode_bitmap for %s",
  357. fir_record->mp_str);
  358. sec_record->ionode_bitmap = NULL;
  359. }
  360. if (sec_record->job_list) {
  361. list_destroy(sec_record->job_list);
  362. sec_record->job_list = NULL;
  363. }
  364. if (fir_record->job_list) {
  365. struct job_record *job_ptr;
  366. sec_record->job_list = list_create(NULL);
  367. itr = list_iterator_create(fir_record->job_list);
  368. while ((job_ptr = list_next(itr)))
  369. list_append(sec_record->job_list, job_ptr);
  370. list_iterator_destroy(itr);
  371. }
  372. sec_record->job_ptr = fir_record->job_ptr;
  373. sec_record->job_running = fir_record->job_running;
  374. sec_record->magic = fir_record->magic;
  375. xfree(sec_record->blrtsimage);
  376. sec_record->blrtsimage = xstrdup(fir_record->blrtsimage);
  377. xfree(sec_record->linuximage);
  378. sec_record->linuximage = xstrdup(fir_record->linuximage);
  379. xfree(sec_record->mloaderimage);
  380. sec_record->mloaderimage = xstrdup(fir_record->mloaderimage);
  381. xfree(sec_record->ramdiskimage);
  382. sec_record->ramdiskimage = xstrdup(fir_record->ramdiskimage);
  383. sec_record->modifying = fir_record->modifying;
  384. sec_record->mp_count = fir_record->mp_count;
  385. xfree(sec_record->mp_str);
  386. sec_record->mp_str = xstrdup(fir_record->mp_str);
  387. #ifdef HAVE_BGL
  388. sec_record->node_use = fir_record->node_use;
  389. #endif
  390. /* Don't set the original, only in bg_copy_list does it happen
  391. * for a reason. */
  392. /* sec_record->original = fir_record; */
  393. xfree(sec_record->reason);
  394. sec_record->reason = xstrdup(fir_record->reason);
  395. sec_record->state = fir_record->state;
  396. }
  397. /*
  398. * Comparator used for sorting blocks smallest to largest
  399. *
  400. * returns: -1: rec_a < rec_b 0: rec_a == rec_b 1: rec_a > rec_b
  401. *
  402. */
  403. extern int bg_record_cmpf_inc(bg_record_t* rec_a, bg_record_t* rec_b)
  404. {
  405. int size_a = rec_a->cnode_cnt;
  406. int size_b = rec_b->cnode_cnt;
  407. /* We only look at this if we are ordering blocks larger than
  408. * a midplane, order of ionodes is how we order otherwise. */
  409. if ((size_a >= bg_conf->mp_cnode_cnt)
  410. || (size_b >= bg_conf->mp_cnode_cnt)) {
  411. if (size_a < size_b)
  412. return -1;
  413. else if (size_a > size_b)
  414. return 1;
  415. }
  416. if (rec_a->mp_str && rec_b->mp_str) {
  417. size_a = strcmp(rec_a->mp_str, rec_b->mp_str);
  418. if (size_a < 0)
  419. return -1;
  420. else if (size_a > 0)
  421. return 1;
  422. }
  423. if (!rec_a->ionode_bitmap || !rec_b->ionode_bitmap)
  424. return 0;
  425. if (bit_ffs(rec_a->ionode_bitmap) < bit_ffs(rec_b->ionode_bitmap))
  426. return -1;
  427. else
  428. return 1;
  429. return 0;
  430. }
  431. /*
  432. * Comparator used for sorting blocks from earliest available to latest
  433. *
  434. * returns: -1: rec_a < rec_b 0: rec_a == rec_b 1: rec_a > rec_b
  435. *
  436. */
  437. extern int bg_record_sort_aval_inc(bg_record_t* rec_a, bg_record_t* rec_b)
  438. {
  439. if ((rec_a->job_running == BLOCK_ERROR_STATE)
  440. && (rec_b->job_running != BLOCK_ERROR_STATE))
  441. return 1;
  442. else if ((rec_a->job_running != BLOCK_ERROR_STATE)
  443. && (rec_b->job_running == BLOCK_ERROR_STATE))
  444. return -1;
  445. if (!rec_a->avail_set)
  446. _set_block_avail(rec_a);
  447. if (!rec_b->avail_set)
  448. _set_block_avail(rec_b);
  449. /* Don't use this check below. It will mess up preemption by
  450. sending this smaller block to the back of the list just
  451. because it is fully used.
  452. */
  453. /* if (!rec_a->avail_cnode_cnt && rec_b->avail_cnode_cnt) */
  454. /* return 1; */
  455. /* else if (rec_a->avail_cnode_cnt && !rec_b->avail_cnode_cnt) */
  456. /* return -1; */
  457. if (rec_a->job_list && rec_b->job_list) {
  458. /* we only want to use this sort on 1 midplane blocks
  459. that are used for sharing
  460. */
  461. if (rec_a->avail_cnode_cnt > rec_b->avail_cnode_cnt)
  462. return 1;
  463. else if (rec_a->avail_cnode_cnt < rec_b->avail_cnode_cnt)
  464. return -1;
  465. }
  466. if (rec_a->avail_job_end > rec_b->avail_job_end)
  467. return 1;
  468. else if (rec_a->avail_job_end < rec_b->avail_job_end)
  469. return -1;
  470. /* if (!job_ptr_a && job_ptr_b) */
  471. /* return -1; */
  472. /* else if (job_ptr_a && !job_ptr_b) */
  473. /* return 1; */
  474. /* else if (job_ptr_a && job_ptr_b) { */
  475. /* if (job_ptr_a->end_time > job_ptr_b->end_time) */
  476. /* return 1; */
  477. /* else if (job_ptr_a->end_time < job_ptr_b->end_time) */
  478. /* return -1; */
  479. /* } */
  480. return bg_record_cmpf_inc(rec_a, rec_b);
  481. }
  482. /* Try to requeue job running on block and put block in an error state.
  483. * block_state_mutex must be unlocked before calling this.
  484. */
  485. extern void requeue_and_error(bg_record_t *bg_record, char *reason)
  486. {
  487. int rc;
  488. if (bg_record->magic != BLOCK_MAGIC) {
  489. error("requeue_and_error: magic was bad");
  490. return;
  491. }
  492. if (bg_record->job_running > NO_JOB_RUNNING)
  493. bg_requeue_job(bg_record->job_running, 0, 0);
  494. else if (bg_record->job_list) {
  495. ListIterator itr = list_iterator_create(bg_record->job_list);
  496. struct job_record *job_ptr;
  497. while ((job_ptr = list_next(itr)))
  498. bg_requeue_job(job_ptr->job_id, 0, 0);
  499. list_iterator_destroy(itr);
  500. }
  501. slurm_mutex_lock(&block_state_mutex);
  502. rc = block_ptr_exist_in_list(bg_lists->main, bg_record);
  503. slurm_mutex_unlock(&block_state_mutex);
  504. if (rc)
  505. put_block_in_error_state(bg_record, reason);
  506. else
  507. error("requeue_and_error: block disappeared");
  508. return;
  509. }
  510. /* block_state_mutex must be locked before calling this. */
  511. extern int add_bg_record(List records, List *used_nodes,
  512. select_ba_request_t *blockreq,
  513. bool no_check, bitoff_t io_start)
  514. {
  515. bg_record_t *bg_record = NULL;
  516. ba_mp_t *ba_mp = NULL;
  517. ListIterator itr;
  518. int i, len;
  519. char *conn_type = NULL;
  520. xassert(bg_conf->slurm_user_name);
  521. if (!records) {
  522. fatal("add_bg_record: no records list given");
  523. }
  524. bg_record = (bg_record_t*) xmalloc(sizeof(bg_record_t));
  525. bg_record->magic = BLOCK_MAGIC;
  526. if (used_nodes && *used_nodes) {
  527. #ifdef HAVE_BGQ
  528. bg_record->ba_mp_list = *used_nodes;
  529. *used_nodes = NULL;
  530. #else
  531. bg_record->ba_mp_list = list_create(destroy_ba_mp);
  532. if (copy_node_path(*used_nodes, &bg_record->ba_mp_list)
  533. == SLURM_ERROR)
  534. error("add_bg_record: "
  535. "couldn't copy the path for the allocation");
  536. #endif
  537. } else
  538. bg_record->ba_mp_list = list_create(destroy_ba_mp);
  539. /* bg_record->boot_state = 0; Implicit */
  540. bg_record->state = BG_BLOCK_FREE;
  541. #ifdef HAVE_BGL
  542. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
  543. conn_type = conn_type_string_full(blockreq->conn_type);
  544. info("add_bg_record: asking for %s %d %d %s",
  545. blockreq->save_name, blockreq->small32, blockreq->small128,
  546. conn_type);
  547. xfree(conn_type);
  548. }
  549. #else
  550. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
  551. conn_type = conn_type_string_full(blockreq->conn_type);
  552. info("add_bg_record: asking for %s %d %d %d %d %d %s",
  553. blockreq->save_name, blockreq->small256,
  554. blockreq->small128, blockreq->small64,
  555. blockreq->small32, blockreq->small16,
  556. conn_type);
  557. xfree(conn_type);
  558. }
  559. #endif
  560. /* Set the bitmap blank here if it is a full node we don't
  561. want anything set we also don't want the bg_record->ionode_str set.
  562. */
  563. bg_record->ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
  564. len = strlen(blockreq->save_name);
  565. i=0;
  566. while (i<len
  567. && blockreq->save_name[i] != '['
  568. && (blockreq->save_name[i] < '0' || blockreq->save_name[i] > 'Z'
  569. || (blockreq->save_name[i] > '9'
  570. && blockreq->save_name[i] < 'A')))
  571. i++;
  572. if (i<len) {
  573. len -= i;
  574. len += strlen(bg_conf->slurm_node_prefix)+1;
  575. bg_record->mp_str = xmalloc(len);
  576. snprintf(bg_record->mp_str, len, "%s%s",
  577. bg_conf->slurm_node_prefix, blockreq->save_name+i);
  578. } else
  579. fatal("add_bg_record: MPs=%s is in a weird format",
  580. blockreq->save_name);
  581. process_nodes(bg_record, false);
  582. #ifdef HAVE_BGL
  583. bg_record->node_use = SELECT_COPROCESSOR_MODE;
  584. #endif
  585. memcpy(bg_record->conn_type, blockreq->conn_type,
  586. sizeof(bg_record->conn_type));
  587. bg_record->cpu_cnt = bg_conf->cpus_per_mp * bg_record->mp_count;
  588. bg_record->cnode_cnt = bg_conf->mp_cnode_cnt * bg_record->mp_count;
  589. bg_record->job_running = NO_JOB_RUNNING;
  590. #ifdef HAVE_BGL
  591. if (blockreq->blrtsimage)
  592. bg_record->blrtsimage = xstrdup(blockreq->blrtsimage);
  593. else
  594. bg_record->blrtsimage = xstrdup(bg_conf->default_blrtsimage);
  595. #endif
  596. #ifdef HAVE_BG_L_P
  597. if (blockreq->linuximage)
  598. bg_record->linuximage = xstrdup(blockreq->linuximage);
  599. else
  600. bg_record->linuximage = xstrdup(bg_conf->default_linuximage);
  601. if (blockreq->ramdiskimage)
  602. bg_record->ramdiskimage = xstrdup(blockreq->ramdiskimage);
  603. else
  604. bg_record->ramdiskimage =
  605. xstrdup(bg_conf->default_ramdiskimage);
  606. #endif
  607. if (blockreq->mloaderimage)
  608. bg_record->mloaderimage = xstrdup(blockreq->mloaderimage);
  609. else
  610. bg_record->mloaderimage =
  611. xstrdup(bg_conf->default_mloaderimage);
  612. #ifdef HAVE_BGQ
  613. /* The start is always right, for blocks larger than 1, from
  614. the blockreq so don't take chances. */
  615. if (bg_record->mp_count > 1)
  616. memcpy(bg_record->start, blockreq->start,
  617. sizeof(bg_record->start));
  618. #endif
  619. if (bg_record->conn_type[0] < SELECT_SMALL) {
  620. /* this needs to be an append so we keep things in the
  621. order we got them, they will be sorted later */
  622. list_append(records, bg_record);
  623. /* this isn't a correct list so we need to set it later for
  624. now we just used it to be the mp number */
  625. if (!used_nodes) {
  626. debug4("add_bg_record: "
  627. "we didn't get a request list so we are "
  628. "destroying this mp list");
  629. list_destroy(bg_record->ba_mp_list);
  630. bg_record->ba_mp_list = NULL;
  631. } else if (bg_conf->sub_blocks && bg_record->mp_count == 1) {
  632. ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
  633. xassert(ba_mp);
  634. /* This will be a list containing jobs running on this
  635. block */
  636. bg_record->job_list = list_create(NULL);
  637. /* Create these now so we can deal with error
  638. cnodes if/when they happen. Since this is
  639. the easiest place to figure it out for
  640. blocks that don't use the entire block */
  641. if ((ba_mp->cnode_bitmap =
  642. ba_create_ba_mp_cnode_bitmap(bg_record))) {
  643. ba_mp->cnode_err_bitmap =
  644. bit_alloc(bg_conf->mp_cnode_cnt);
  645. ba_mp->cnode_usable_bitmap =
  646. bit_copy(ba_mp->cnode_bitmap);
  647. }
  648. }
  649. } else {
  650. List ba_mp_list = NULL;
  651. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
  652. info("add_bg_record: adding a small block");
  653. if (no_check)
  654. goto no_check;
  655. /* if the ionode cnt for small32 is 0 then don't
  656. allow a sub quarter allocation
  657. */
  658. if (bg_conf->nodecard_ionode_cnt < 2) {
  659. if (!bg_conf->nodecard_ionode_cnt && blockreq->small32)
  660. fatal("add_bg_record: "
  661. "There is an error in your "
  662. "bluegene.conf file.\n"
  663. "Can't create a 32 node block with "
  664. "IonodesPerMP=%u. (Try setting it "
  665. "to at least 16)",
  666. bg_conf->ionodes_per_mp);
  667. #ifdef HAVE_BGP
  668. if (blockreq->small16)
  669. fatal("add_bg_record: "
  670. "There is an error in your "
  671. "bluegene.conf file.\n"
  672. "Can't create a 16 node block with "
  673. "IonodesPerMP=%u. (Try setting it to "
  674. "at least 32)",
  675. bg_conf->ionodes_per_mp);
  676. #endif
  677. #ifndef HAVE_BGL
  678. if ((bg_conf->io_ratio < 0.5) && blockreq->small64)
  679. fatal("add_bg_record: "
  680. "There is an error in your "
  681. "bluegene.conf file.\n"
  682. "Can't create a 64 node block with "
  683. "IonodesPerMP=%u. (Try setting it "
  684. "to at least 8)",
  685. bg_conf->ionodes_per_mp);
  686. #endif
  687. }
  688. #ifdef HAVE_BGL
  689. if (blockreq->small32==0 && blockreq->small128==0) {
  690. info("add_bg_record: "
  691. "No specs given for this small block, "
  692. "I am spliting this block into 4 128CnBlocks");
  693. blockreq->small128=4;
  694. }
  695. i = (blockreq->small32*bg_conf->nodecard_cnode_cnt) +
  696. (blockreq->small128*bg_conf->quarter_cnode_cnt);
  697. if (i != bg_conf->mp_cnode_cnt)
  698. fatal("add_bg_record: "
  699. "There is an error in your bluegene.conf file.\n"
  700. "I am unable to request %d nodes consisting of "
  701. "%u 32CnBlocks and\n%u 128CnBlocks in one "
  702. "base partition with %u nodes.",
  703. i, blockreq->small32, blockreq->small128,
  704. bg_conf->mp_cnode_cnt);
  705. #else
  706. if (!blockreq->small16 && !blockreq->small32
  707. && !blockreq->small64 && !blockreq->small128
  708. && !blockreq->small256) {
  709. info("add_bg_record: "
  710. "No specs given for this small block, "
  711. "I am spliting this block into 2 256CnBlocks");
  712. blockreq->small256=2;
  713. }
  714. i = (blockreq->small16*16)
  715. + (blockreq->small32*32)
  716. + (blockreq->small64*64)
  717. + (blockreq->small128*128)
  718. + (blockreq->small256*256);
  719. if (i != bg_conf->mp_cnode_cnt)
  720. fatal("add_bg_record: "
  721. "There is an error in your bluegene.conf file.\n"
  722. "I am unable to request %d nodes consisting of "
  723. "%u 16CNBlocks, %u 32CNBlocks,\n"
  724. "%u 64CNBlocks, %u 128CNBlocks, "
  725. "and %u 256CNBlocks\n"
  726. "in one base partition with %u nodes.",
  727. i, blockreq->small16, blockreq->small32,
  728. blockreq->small64, blockreq->small128,
  729. blockreq->small256, bg_conf->mp_cnode_cnt);
  730. #endif
  731. no_check:
  732. /* Automatically create 2-way split if
  733. * conn_type == SELECT_SMALL in bluegene.conf
  734. * Here we go through each node listed and do the same thing
  735. * for each node.
  736. */
  737. ba_mp_list = bg_record->ba_mp_list;
  738. bg_record->ba_mp_list = list_create(NULL);
  739. itr = list_iterator_create(ba_mp_list);
  740. while ((ba_mp = list_next(itr)) != NULL) {
  741. xfree(bg_record->mp_str);
  742. bg_record->mp_str = xstrdup_printf(
  743. "%s%s",
  744. bg_conf->slurm_node_prefix,
  745. ba_mp->coord_str);
  746. list_append(bg_record->ba_mp_list, ba_mp);
  747. handle_small_record_request(records, blockreq,
  748. bg_record, io_start);
  749. list_flush(bg_record->ba_mp_list);
  750. }
  751. list_iterator_destroy(itr);
  752. destroy_bg_record(bg_record);
  753. list_destroy(ba_mp_list);
  754. }
  755. return SLURM_SUCCESS;
  756. }
  757. extern int handle_small_record_request(List records,
  758. select_ba_request_t *blockreq,
  759. bg_record_t *bg_record, bitoff_t start)
  760. {
  761. bitstr_t *ionodes = bit_alloc(bg_conf->ionodes_per_mp);
  762. int i=0, ionode_cnt = 0;
  763. bg_record_t *found_record = NULL;
  764. xassert(records);
  765. xassert(blockreq);
  766. xassert(bg_record);
  767. xassert(start >= 0);
  768. xassert(start < bg_conf->ionodes_per_mp);
  769. #ifndef HAVE_BGL
  770. for(i=0; i<blockreq->small16; i++) {
  771. bit_nset(ionodes, start, start);
  772. found_record = create_small_record(bg_record, ionodes, 16);
  773. /* this needs to be an append so we
  774. keep things in the order we got
  775. them, they will be sorted later */
  776. list_append(records, found_record);
  777. bit_nclear(ionodes, start, start);
  778. start++;
  779. }
  780. #endif
  781. if ((ionode_cnt = bg_conf->nodecard_ionode_cnt))
  782. ionode_cnt--;
  783. for(i=0; i<blockreq->small32; i++) {
  784. bit_nset(ionodes, start, start+ionode_cnt);
  785. found_record = create_small_record(bg_record, ionodes, 32);
  786. /* this needs to be an append so we
  787. keep things in the order we got
  788. them, they will be sorted later */
  789. list_append(records, found_record);
  790. bit_nclear(ionodes, start, start+ionode_cnt);
  791. start+=ionode_cnt+1;
  792. }
  793. #ifndef HAVE_BGL
  794. if ((ionode_cnt = bg_conf->nodecard_ionode_cnt * 2))
  795. ionode_cnt--;
  796. for(i=0; i<blockreq->small64; i++) {
  797. bit_nset(ionodes, start, start+ionode_cnt);
  798. found_record = create_small_record(bg_record, ionodes, 64);
  799. /* this needs to be an append so we
  800. keep things in the order we got
  801. them, they will be sorted later */
  802. list_append(records, found_record);
  803. bit_nclear(ionodes, start, start+ionode_cnt);
  804. start+=ionode_cnt+1;
  805. }
  806. #endif
  807. if ((ionode_cnt = bg_conf->quarter_ionode_cnt))
  808. ionode_cnt--;
  809. for(i=0; i<blockreq->small128; i++) {
  810. bit_nset(ionodes, start, start+ionode_cnt);
  811. found_record = create_small_record(bg_record, ionodes, 128);
  812. /* this needs to be an append so we
  813. keep things in the order we got
  814. them, they will be sorted later */
  815. list_append(records, found_record);
  816. bit_nclear(ionodes, start, start+ionode_cnt);
  817. start+=ionode_cnt+1;
  818. }
  819. #ifndef HAVE_BGL
  820. if ((ionode_cnt = bg_conf->quarter_ionode_cnt * 2))
  821. ionode_cnt--;
  822. for(i=0; i<blockreq->small256; i++) {
  823. bit_nset(ionodes, start, start+ionode_cnt);
  824. found_record = create_small_record(bg_record, ionodes, 256);
  825. /* this needs to be an append so we
  826. keep things in the order we got
  827. them, they will be sorted later */
  828. list_append(records, found_record);
  829. bit_nclear(ionodes, start, start+ionode_cnt);
  830. start+=ionode_cnt+1;
  831. }
  832. #endif
  833. FREE_NULL_BITMAP(ionodes);
  834. return SLURM_SUCCESS;
  835. }
  836. extern int format_node_name(bg_record_t *bg_record, char *buf, int buf_size)
  837. {
  838. if (bg_record->ionode_str) {
  839. snprintf(buf, buf_size, "%s[%s]",
  840. bg_record->mp_str,
  841. bg_record->ionode_str);
  842. } else {
  843. snprintf(buf, buf_size, "%s", bg_record->mp_str);
  844. }
  845. return SLURM_SUCCESS;
  846. }
  847. /*
  848. * This could potentially lock the node lock in the slurmctld with
  849. * slurm_drain_node, or slurm_fail_job so if slurmctld_locked is called we
  850. * will call the functions without locking the locks again.
  851. */
  852. extern int down_nodecard(char *mp_name, bitoff_t io_start,
  853. bool slurmctld_locked, char *reason)
  854. {
  855. List requests = NULL;
  856. List delete_list = NULL, pass_list = NULL;
  857. ListIterator itr = NULL;
  858. bg_record_t *bg_record = NULL, *found_record = NULL,
  859. tmp_record, *error_bg_record = NULL;
  860. bg_record_t *smallest_bg_record = NULL;
  861. struct node_record *node_ptr = NULL;
  862. int mp_bit = 0;
  863. bool has_pass = 0;
  864. static int io_cnt = NO_VAL;
  865. static int create_size = NO_VAL;
  866. static select_ba_request_t blockreq;
  867. int rc = SLURM_SUCCESS;
  868. slurmctld_lock_t job_write_lock = {
  869. NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
  870. xassert(mp_name);
  871. if (!reason)
  872. reason = "select_bluegene: nodecard down";
  873. if (io_cnt == NO_VAL) {
  874. io_cnt = 1;
  875. /* Translate 1 nodecard count to ionode count */
  876. if ((io_cnt *= bg_conf->io_ratio))
  877. io_cnt--;
  878. /* make sure we create something that is able to be
  879. created */
  880. if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
  881. create_size = bg_conf->nodecard_cnode_cnt;
  882. else
  883. create_size = bg_conf->smallest_block;
  884. }
  885. node_ptr = find_node_record(mp_name);
  886. if (!node_ptr) {
  887. error ("down_sub_node_blocks: invalid node specified '%s'",
  888. mp_name);
  889. return EINVAL;
  890. }
  891. /* this is here for sanity check to make sure we don't core on
  892. these bits when we set them below. */
  893. if (io_start >= bg_conf->ionodes_per_mp
  894. || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
  895. debug("io %d-%d not configured on this "
  896. "system, only %d ionodes per midplane",
  897. io_start, io_start+io_cnt, bg_conf->ionodes_per_mp);
  898. return EINVAL;
  899. }
  900. mp_bit = (node_ptr - node_record_table_ptr);
  901. memset(&blockreq, 0, sizeof(select_ba_request_t));
  902. blockreq.conn_type[0] = SELECT_SMALL;
  903. blockreq.save_name = mp_name;
  904. debug3("here setting node %d of %d and ionodes %d-%d of %d",
  905. mp_bit, node_record_count, io_start,
  906. io_start+io_cnt, bg_conf->ionodes_per_mp);
  907. memset(&tmp_record, 0, sizeof(bg_record_t));
  908. tmp_record.mp_count = 1;
  909. tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
  910. tmp_record.mp_bitmap = bit_alloc(node_record_count);
  911. bit_set(tmp_record.mp_bitmap, mp_bit);
  912. tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
  913. bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);
  914. /* To avoid deadlock we always must lock the slurmctld before
  915. the block_state_mutex.
  916. */
  917. if (!slurmctld_locked)
  918. lock_slurmctld(job_write_lock);
  919. slurm_mutex_lock(&block_state_mutex);
  920. itr = list_iterator_create(bg_lists->main);
  921. while ((bg_record = list_next(itr))) {
  922. if (bg_record->destroy)
  923. continue;
  924. if (!bit_test(bg_record->mp_bitmap, mp_bit)
  925. #ifndef HAVE_BG_L_P
  926. /* In BGQ if a nodeboard goes down you can no
  927. longer use any block using that nodeboard in a
  928. passthrough, so we need to remove it.
  929. */
  930. && !(has_pass = block_mp_passthrough(bg_record, mp_bit))
  931. #endif
  932. )
  933. continue;
  934. if (!has_pass && !blocks_overlap(bg_record, &tmp_record))
  935. continue;
  936. if (bg_record->job_running > NO_JOB_RUNNING) {
  937. job_fail(bg_record->job_running);
  938. } else if (bg_record->job_list) {
  939. ListIterator job_itr = list_iterator_create(
  940. bg_record->job_list);
  941. struct job_record *job_ptr;
  942. while ((job_ptr = list_next(job_itr))) {
  943. job_fail(job_ptr->job_id);
  944. }
  945. list_iterator_destroy(job_itr);
  946. }
  947. /* If Running Dynamic mode and the block is
  948. smaller than the create size just continue on.
  949. */
  950. if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
  951. if (bg_record->cnode_cnt < create_size) {
  952. if (!delete_list)
  953. delete_list = list_create(NULL);
  954. list_append(delete_list, bg_record);
  955. continue;
  956. } else if (has_pass) {
  957. /* Set it up so the passthrough blocks
  958. get removed since they are no
  959. longer valid.
  960. */
  961. if (!pass_list)
  962. pass_list = list_create(NULL);
  963. list_append(pass_list, bg_record);
  964. continue;
  965. }
  966. } else if (has_pass) /* on non-dynamic systems this
  967. block doesn't really mean
  968. anything we just needed to
  969. fail the job (which was
  970. probably already failed).
  971. */
  972. continue;
  973. /* keep track of the smallest size that is at least
  974. the size of create_size. */
  975. if (!smallest_bg_record ||
  976. (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt))
  977. smallest_bg_record = bg_record;
  978. }
  979. list_iterator_destroy(itr);
  980. /* We cannot unlock block_state_mutex here until we are done
  981. * with smallest_bg_record.
  982. */
  983. if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
  984. debug3("running non-dynamic mode");
  985. /* This should never happen, but just in case... */
  986. if (delete_list) {
  987. list_destroy(delete_list);
  988. delete_list = NULL;
  989. }
  990. /* If we found a block that is smaller or equal to a
  991. midplane we will just mark it in an error state as
  992. opposed to draining the node.
  993. */
  994. if (smallest_bg_record
  995. && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){
  996. if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
  997. rc = SLURM_NO_CHANGE_IN_DATA;
  998. slurm_mutex_unlock(&block_state_mutex);
  999. goto cleanup;
  1000. }
  1001. slurm_mutex_unlock(&block_state_mutex);
  1002. error_bg_record = smallest_bg_record;
  1003. goto cleanup;
  1004. }
  1005. slurm_mutex_unlock(&block_state_mutex);
  1006. debug("No block under 1 midplane available for this nodecard. "
  1007. "Draining the whole node.");
  1008. /* the slurmctld is always locked here */
  1009. if (!node_already_down(mp_name))
  1010. drain_nodes(mp_name, reason,
  1011. slurm_get_slurm_user_id());
  1012. rc = SLURM_SUCCESS;
  1013. goto cleanup;
  1014. }
  1015. /* below is only for Dynamic mode */
  1016. if (delete_list) {
  1017. int cnt_set = 0;
  1018. bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
  1019. itr = list_iterator_create(delete_list);
  1020. while ((bg_record = list_next(itr))) {
  1021. debug2("combining smaller than nodecard "
  1022. "dynamic block %s",
  1023. bg_record->bg_block_id);
  1024. bit_or(iobitmap, bg_record->ionode_bitmap);
  1025. cnt_set++;
  1026. }
  1027. list_iterator_destroy(itr);
  1028. list_destroy(delete_list);
  1029. delete_list = NULL;
  1030. if (!cnt_set) {
  1031. FREE_NULL_BITMAP(iobitmap);
  1032. rc = SLURM_ERROR;
  1033. slurm_mutex_unlock(&block_state_mutex);
  1034. goto cleanup;
  1035. }
  1036. /* set the start to be the same as the start of the
  1037. ionode_bitmap. If no ionodes set (not a small
  1038. block) set io_start = 0. */
  1039. if ((io_start = bit_ffs(iobitmap)) == -1) {
  1040. io_start = 0;
  1041. if (create_size > bg_conf->nodecard_cnode_cnt)
  1042. blockreq.small128 = 4;
  1043. else
  1044. blockreq.small32 = 16;
  1045. } else if (create_size <= bg_conf->nodecard_cnode_cnt)
  1046. blockreq.small32 = 1;
  1047. else
  1048. /* this should never happen */
  1049. blockreq.small128 = 1;
  1050. FREE_NULL_BITMAP(iobitmap);
  1051. } else if (smallest_bg_record) {
  1052. debug2("smallest dynamic block is %s",
  1053. smallest_bg_record->bg_block_id);
  1054. if (smallest_bg_record->cnode_cnt == create_size) {
  1055. slurm_mutex_unlock(&block_state_mutex);
  1056. error_bg_record = smallest_bg_record;
  1057. goto cleanup;
  1058. }
  1059. /* If the block is bigger than the asked for error we
  1060. need to resume it to keep accounting correct.
  1061. */
  1062. if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG)
  1063. resume_block(smallest_bg_record);
  1064. if (create_size > smallest_bg_record->cnode_cnt) {
  1065. /* we should never get here. This means we
  1066. * have a create_size that is bigger than a
  1067. * block that is already made.
  1068. */
  1069. slurm_mutex_unlock(&block_state_mutex);
  1070. error_bg_record = smallest_bg_record;
  1071. goto cleanup;
  1072. }
  1073. debug3("node count is %d", smallest_bg_record->cnode_cnt);
  1074. switch(smallest_bg_record->cnode_cnt) {
  1075. #ifndef HAVE_BGL
  1076. case 64:
  1077. blockreq.small32 = 2;
  1078. break;
  1079. case 256:
  1080. blockreq.small32 = 8;
  1081. break;
  1082. #endif
  1083. case 128:
  1084. blockreq.small32 = 4;
  1085. break;
  1086. case 512:
  1087. default:
  1088. blockreq.small32 = 16;
  1089. break;
  1090. }
  1091. if (create_size != bg_conf->nodecard_cnode_cnt) {
  1092. blockreq.small128 = blockreq.small32 / 4;
  1093. blockreq.small32 = 0;
  1094. }
  1095. if ((io_start =
  1096. bit_ffs(smallest_bg_record->ionode_bitmap)) == -1)
  1097. /* set the start to be the same as the start of the
  1098. ionode_bitmap. If no ionodes set (not a small
  1099. block) set io_start = 0. */
  1100. io_start = 0;
  1101. } else {
  1102. switch(create_size) {
  1103. #ifndef HAVE_BGL
  1104. case 64:
  1105. blockreq.small64 = 8;
  1106. break;
  1107. case 256:
  1108. blockreq.small256 = 2;
  1109. #endif
  1110. case 32:
  1111. blockreq.small32 = 16;
  1112. break;
  1113. case 128:
  1114. blockreq.small128 = 4;
  1115. break;
  1116. case 512:
  1117. slurm_mutex_unlock(&block_state_mutex);
  1118. /* the slurmctld is always locked here */
  1119. if (!node_already_down(mp_name))
  1120. drain_nodes(mp_name, reason,
  1121. slurm_get_slurm_user_id());
  1122. rc = SLURM_SUCCESS;
  1123. goto cleanup;
  1124. break;
  1125. default:
  1126. error("Unknown create size of %d", create_size);
  1127. break;
  1128. }
  1129. /* since we don't have a block in this midplane
  1130. we need to start at the beginning. */
  1131. io_start = 0;
  1132. /* we also need a bg_block to pretend to be the
  1133. smallest block that takes up the entire midplane. */
  1134. }
  1135. /* Here we need to add blocks that take up nodecards on this
  1136. midplane. Since Slurm only keeps track of midplanes
  1137. natively this is the only way to handle this case.
  1138. */
  1139. requests = list_create(destroy_bg_record);
  1140. add_bg_record(requests, NULL, &blockreq, 1, io_start);
  1141. if (bg_conf->sub_blocks
  1142. && (!smallest_bg_record
  1143. || smallest_bg_record->cnode_cnt == bg_conf->mp_cnode_cnt)) {
  1144. bg_record_t *rem_record = NULL;
  1145. memset(&blockreq, 0, sizeof(select_ba_request_t));
  1146. blockreq.conn_type[0] = SELECT_SMALL;
  1147. blockreq.save_name = mp_name;
  1148. blockreq.small256 = 2;
  1149. add_bg_record(requests, NULL, &blockreq, 1, io_start);
  1150. itr = list_iterator_create(requests);
  1151. while ((bg_record = list_next(itr))) {
  1152. if (bit_overlap(bg_record->ionode_bitmap,
  1153. tmp_record.ionode_bitmap)) {
  1154. if (bg_record->cnode_cnt == 256) {
  1155. print_bg_record(bg_record);
  1156. rem_record = bg_record;
  1157. list_remove(itr);
  1158. break;
  1159. }
  1160. }
  1161. }
  1162. if (!rem_record) {
  1163. /* this should never happen */
  1164. error("down_nodecard: something bad happened "
  1165. "with creation of 256 block");
  1166. } else {
  1167. list_iterator_reset(itr);
  1168. while ((bg_record = list_next(itr))) {
  1169. if (bg_record->cnode_cnt == 256)
  1170. continue;
  1171. if (!bit_overlap(bg_record->ionode_bitmap,
  1172. rem_record->ionode_bitmap)) {
  1173. print_bg_record(bg_record);
  1174. list_delete_item(itr);
  1175. }
  1176. }
  1177. destroy_bg_record(rem_record);
  1178. }
  1179. list_iterator_destroy(itr);
  1180. }
  1181. if (pass_list) {
  1182. delete_list = pass_list;
  1183. pass_list = NULL;
  1184. } else
  1185. delete_list = list_create(NULL);
  1186. while ((bg_record = list_pop(requests))) {
  1187. itr = list_iterator_create(bg_lists->main);
  1188. while ((found_record = list_next(itr))) {
  1189. if (found_record->destroy)
  1190. continue;
  1191. if (!blocks_overlap(bg_record, found_record))
  1192. continue;
  1193. list_push(delete_list, found_record);
  1194. }
  1195. list_iterator_destroy(itr);
  1196. /* we need to add this record since it doesn't exist */
  1197. if (bridge_block_create(bg_record) == SLURM_ERROR) {
  1198. destroy_bg_record(bg_record);
  1199. error("down_sub_node_blocks: "
  1200. "unable to configure block in api");
  1201. continue;
  1202. }
  1203. debug("adding block %s to fill in small blocks "
  1204. "around bad nodecards",
  1205. bg_record->bg_block_id);
  1206. print_bg_record(bg_record);
  1207. list_append(bg_lists->main, bg_record);
  1208. if (bit_overlap(bg_record->ionode_bitmap,
  1209. tmp_record.ionode_bitmap)) {
  1210. /* here we know the error block doesn't exist
  1211. so just set the state here */
  1212. error_bg_record = bg_record;
  1213. }
  1214. }
  1215. list_destroy(requests);
  1216. sort_bg_record_inc_size(bg_lists->main);
  1217. last_bg_update = time(NULL);
  1218. slurm_mutex_unlock(&block_state_mutex);
  1219. cleanup:
  1220. if (!slurmctld_locked)
  1221. unlock_slurmctld(job_write_lock);
  1222. FREE_NULL_BITMAP(tmp_record.mp_bitmap);
  1223. FREE_NULL_BITMAP(tmp_record.ionode_bitmap);
  1224. if (error_bg_record) {
  1225. /* all locks must be released before going into
  1226. * put_block_in_error_state.
  1227. */
  1228. if (slurmctld_locked)
  1229. unlock_slurmctld(job_write_lock);
  1230. rc = put_block_in_error_state(error_bg_record, reason);
  1231. if (slurmctld_locked)
  1232. lock_slurmctld(job_write_lock);
  1233. }
  1234. if (pass_list) {
  1235. delete_list = pass_list;
  1236. pass_list = NULL;
  1237. }
  1238. if (delete_list) {
  1239. bool delete_it = 0;
  1240. if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
  1241. delete_it = 1;
  1242. free_block_list(NO_VAL, delete_list, delete_it, 0);
  1243. list_destroy(delete_list);
  1244. delete_list = NULL;
  1245. }
  1246. return rc;
  1247. }
  1248. extern int up_nodecard(char *mp_name, bitstr_t *ionode_bitmap)
  1249. {
  1250. ListIterator itr = NULL;
  1251. bg_record_t *bg_record = NULL;
  1252. struct node_record *node_ptr = NULL;
  1253. int mp_bit = 0;
  1254. int ret = 0;
  1255. xassert(mp_name);
  1256. xassert(ionode_bitmap);
  1257. node_ptr = find_node_record(mp_name);
  1258. if (!node_ptr) {
  1259. error ("down_sub_node_blocks: invalid node specified %s",
  1260. mp_name);
  1261. return EINVAL;
  1262. }
  1263. mp_bit = (node_ptr - node_record_table_ptr);
  1264. slurm_mutex_lock(&block_state_mutex);
  1265. itr = list_iterator_create(bg_lists->main);
  1266. while ((bg_record = list_next(itr))) {
  1267. if (bg_record->job_running != BLOCK_ERROR_STATE)
  1268. continue;
  1269. if (!bit_test(bg_record->mp_bitmap, mp_bit))
  1270. continue;
  1271. if (!bit_overlap(bg_record->ionode_bitmap, ionode_bitmap)) {
  1272. continue;
  1273. }
  1274. resume_block(bg_record);
  1275. }
  1276. list_iterator_destroy(itr);
  1277. slurm_mutex_unlock(&block_state_mutex);
  1278. /* FIX ME: This needs to call the opposite of
  1279. slurm_drain_nodes which does not yet exist.
  1280. */
  1281. if ((ret = node_already_down(mp_name))) {
  1282. /* means it was drained */
  1283. if (ret == 2) {
  1284. /* debug("node %s put back into service after " */
  1285. /* "being in an error state", */
  1286. /* mp_name); */
  1287. }
  1288. }
  1289. return SLURM_SUCCESS;
  1290. }
  1291. /* block_state_mutex must be unlocked before calling this. */
  1292. extern int put_block_in_error_state(bg_record_t *bg_record, char *reason)
  1293. {
  1294. xassert(bg_record);
  1295. /* Only check this if the blocks are created, meaning this
  1296. isn't at startup.
  1297. */
  1298. if (blocks_are_created) {
  1299. /* Since we are putting this block in an error state we need
  1300. to wait for the job to be removed. We don't really
  1301. need to free the block though since we may just
  1302. want it to be in an error state for some reason. */
  1303. while ((bg_record->magic == BLOCK_MAGIC)
  1304. && ((bg_record->job_running > NO_JOB_RUNNING)
  1305. || (bg_record->job_list
  1306. && list_count(bg_record->job_list)))) {
  1307. if (bg_record->job_running > NO_JOB_RUNNING)
  1308. debug2("block %s is still running job %d",
  1309. bg_record->bg_block_id,
  1310. bg_record->job_running);
  1311. else
  1312. debug2("block %s is still running jobs",
  1313. bg_record->bg_block_id);
  1314. sleep(1);
  1315. }
  1316. }
  1317. slurm_mutex_lock(&block_state_mutex);
  1318. if (!block_ptr_exist_in_list(bg_lists->main, bg_record)) {
  1319. slurm_mutex_unlock(&block_state_mutex);
  1320. error("while trying to put block in "
  1321. "error state it disappeared");
  1322. return SLURM_ERROR;
  1323. }
  1324. /* we add the block to these lists so we don't try to schedule
  1325. on them. */
  1326. if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
  1327. list_push(bg_lists->job_running, bg_record);
  1328. num_unused_cpus -= bg_record->cpu_cnt;
  1329. } else if (!(bg_record->state & BG_BLOCK_ERROR_FLAG)) {
  1330. info("hey I was in the job_running table %d %d %s?",
  1331. list_count(bg_record->job_list), num_unused_cpus,
  1332. bg_block_state_string(bg_record->state));
  1333. xassert(0);
  1334. }
  1335. if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
  1336. list_push(bg_lists->booted, bg_record);
  1337. bg_record->job_running = BLOCK_ERROR_STATE;
  1338. bg_record->state |= BG_BLOCK_ERROR_FLAG;
  1339. /* Only send if reason is set. If it isn't set then
  1340. accounting should already know about this error state */
  1341. if (reason) {
  1342. info("Setting Block %s to ERROR state. (reason: '%s')",
  1343. bg_record->bg_block_id, reason);
  1344. xfree(bg_record->reason);
  1345. bg_record->reason = xstrdup(reason);
  1346. _set_block_nodes_accounting(bg_record, reason);
  1347. }
  1348. last_bg_update = time(NULL);
  1349. slurm_mutex_unlock(&block_state_mutex);
  1350. trigger_block_error();
  1351. return SLURM_SUCCESS;
  1352. }
  1353. /* block_state_mutex should be locked before calling */
  1354. extern int resume_block(bg_record_t *bg_record)
  1355. {
  1356. xassert(bg_record);
  1357. if (bg_record->job_running > NO_JOB_RUNNING
  1358. || (bg_record->job_list && list_count(bg_record->job_list)))
  1359. return SLURM_SUCCESS;
  1360. if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
  1361. ba_mp_t *ba_mp;
  1362. ListIterator itr;
  1363. struct node_record *node_ptr;
  1364. bg_record->state &= (~BG_BLOCK_ERROR_FLAG);
  1365. info("Block %s put back into service after "
  1366. "being in an error state.",
  1367. bg_record->bg_block_id);
  1368. /* Remove the block error message from each slurm node. */
  1369. itr = list_iterator_create(bg_record->ba_mp_list);
  1370. while ((ba_mp = list_next(itr))) {
  1371. node_ptr = &node_record_table_ptr[ba_mp->index];
  1372. if (node_ptr->reason
  1373. && !strncmp(node_ptr->reason, "update_block", 12))
  1374. xfree(node_ptr->reason);
  1375. }
  1376. list_iterator_destroy(itr);
  1377. }
  1378. if (remove_from_bg_list(bg_lists->job_running, bg_record)
  1379. == SLURM_SUCCESS)
  1380. num_unused_cpus += bg_record->cpu_cnt;
  1381. if (bg_record->state != BG_BLOCK_INITED)
  1382. remove_from_bg_list(bg_lists->booted, bg_record);
  1383. else if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
  1384. list_push(bg_lists->booted, bg_record);
  1385. bg_record->job_running = NO_JOB_RUNNING;
  1386. xfree(bg_record->reason);
  1387. last_bg_update = time(NULL);
  1388. _set_block_nodes_accounting(bg_record, NULL);
  1389. return SLURM_SUCCESS;
  1390. }
  1391. /* block_state_mutex should be locked before calling this function */
  1392. extern int bg_reset_block(bg_record_t *bg_record, struct job_record *job_ptr)
  1393. {
  1394. int rc = SLURM_SUCCESS;
  1395. if (!bg_record) {
  1396. error("bg_reset_block: No block given to reset");
  1397. return SLURM_ERROR;
  1398. }
  1399. if (bg_record->job_list)
  1400. ba_remove_job_in_block_job_list(bg_record, job_ptr);
  1401. if ((bg_record->job_running > NO_JOB_RUNNING)
  1402. && (!bg_record->job_list || !list_count(bg_record->job_list))) {
  1403. #ifndef HAVE_BG_L_P
  1404. /* Just in case the slurmctld wasn't up at the
  1405. time a step completion message came through
  1406. we will clear all the cnode_bitmaps of the
  1407. midplanes of this block. So we can use
  1408. those cnodes on the next job that uses this
  1409. block.
  1410. */
  1411. ba_mp_t *ba_mp = NULL;
  1412. ListIterator itr = list_iterator_create(bg_record->ba_mp_list);
  1413. while ((ba_mp = list_next(itr))) {
  1414. if (!ba_mp->used)
  1415. continue;
  1416. if (ba_mp->cnode_usable_bitmap) {
  1417. FREE_NULL_BITMAP(ba_mp->cnode_bitmap);
  1418. ba_mp->cnode_bitmap =
  1419. bit_copy(ba_mp->cnode_usable_bitmap);
  1420. } else if (ba_mp->cnode_bitmap)
  1421. bit_nclear(ba_mp->cnode_bitmap, 0,
  1422. bit_size(ba_mp->cnode_bitmap)-1);
  1423. }
  1424. list_iterator_destroy(itr);
  1425. #endif
  1426. bg_record->job_running = NO_JOB_RUNNING;
  1427. }
  1428. if (bg_record->job_ptr) {
  1429. num_unused_cpus += bg_record->job_ptr->total_cpus;
  1430. bg_record->job_ptr = NULL;
  1431. }
  1432. /* remove user from list */
  1433. bridge_block_sync_users(bg_record);
  1434. /* Don't reset these (boot_(state/count)), they will be
  1435. reset when state changes, and needs to outlast a job
  1436. allocation.
  1437. */
  1438. /* bg_record->boot_state = 0; */
  1439. /* bg_record->boot_count = 0; */
  1440. last_bg_update = time(NULL);
  1441. /* Only remove from the job_running list if
  1442. job_running == NO_JOB_RUNNING, since blocks in
  1443. error state could also be in this list and we don't
  1444. want to remove them.
  1445. */
  1446. if (bg_record->job_running == NO_JOB_RUNNING
  1447. && (!bg_record->job_list || !list_count(bg_record->job_list))) {
  1448. remove_from_bg_list(bg_lists->job_running, bg_record);
  1449. /* At this point, no job is running on the block
  1450. anymore, so if there are any errors on it, free it
  1451. now.
  1452. */
  1453. if (bg_record->cnode_err_cnt) {
  1454. if (bg_conf->slurm_debug_flags
  1455. & DEBUG_FLAG_SELECT_TYPE)
  1456. info("%s has %d in error",
  1457. bg_record->bg_block_id,
  1458. bg_record->cnode_err_cnt);
  1459. bg_free_block(bg_record, 0, 1);
  1460. }
  1461. }
  1462. if (!list_count(bg_lists->job_running)
  1463. && (num_unused_cpus != num_possible_unused_cpus)) {
  1464. /* This should never happen, but if it does reset the
  1465. num_unused_cpus and go on your way.
  1466. */
  1467. error("Hey we are here with no jobs and we have only "
  1468. "%d usuable cpus. We should have %d!",
  1469. num_unused_cpus, num_possible_unused_cpus);
  1470. //xassert(0);
  1471. num_unused_cpus = num_possible_unused_cpus;
  1472. }
  1473. return rc;
  1474. }
  1475. /************************* local functions ***************************/
  1476. /* block_state_mutex should be locked before calling */
  1477. static int _check_all_blocks_error(int node_inx, time_t event_time,
  1478. char *reason)
  1479. {
  1480. bg_record_t *bg_record = NULL;
  1481. ListIterator itr = NULL;
  1482. struct node_record send_node, *node_ptr;
  1483. struct config_record config_rec;
  1484. int total_cpus = 0;
  1485. int rc = SLURM_SUCCESS;
  1486. xassert(node_inx <= node_record_count);
  1487. node_ptr = &node_record_table_ptr[node_inx];
  1488. /* only do this if the node isn't in the DRAINED state.
  1489. DRAINING is ok */
  1490. if (IS_NODE_DRAINED(node_ptr)

(File truncated by the code viewer; the full 1884-line source is available in the repository linked above.)