PageRenderTime 64ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/src/plugins/select/bluegene/ba_bgq/block_allocator.c

https://github.com/cfenoy/slurm
C | 2297 lines | 1631 code | 249 blank | 417 comment | 341 complexity | ccae00b205ada74ac1ef529a0e108eb2 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0

Large files are truncated, but you can click here to view the full file

  1. /*****************************************************************************\
  2. * block_allocator.c - Assorted functions for layout of bgq blocks,
  3. * wiring, mapping for smap, etc.
  4. * $Id$
  5. *****************************************************************************
  6. * Copyright (C) 2004-2007 The Regents of the University of California.
  7. * Copyright (C) 2008-2011 Lawrence Livermore National Security.
  8. * Copyright (C) 2011 SchedMD LLC.
  9. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  10. * Written by Danny Auble <da@schedmd.com>
  11. *
  12. * This file is part of SLURM, a resource management program.
  13. * For details, see <http://www.schedmd.com/slurmdocs/>.
  14. * Please also read the included file: DISCLAIMER.
  15. *
  16. * SLURM is free software; you can redistribute it and/or modify it under
  17. * the terms of the GNU General Public License as published by the Free
  18. * Software Foundation; either version 2 of the License, or (at your option)
  19. * any later version.
  20. *
  21. * In addition, as a special exception, the copyright holders give permission
  22. * to link the code of portions of this program with the OpenSSL library under
  23. * certain conditions as described in each individual source file, and
  24. * distribute linked combinations including the two. You must obey the GNU
  25. * General Public License in all respects for all of the code used other than
  26. * OpenSSL. If you modify file(s) with this exception, you may extend this
  27. * exception to your version of the file(s), but you are not obligated to do
  28. * so. If you do not wish to do so, delete this exception statement from your
  29. * version. If you delete this exception statement from all source files in
  30. * the program, then also delete it here.
  31. *
  32. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  33. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  34. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  35. * details.
  36. *
  37. * You should have received a copy of the GNU General Public License along
  38. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  39. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  40. \*****************************************************************************/
  41. #if HAVE_CONFIG_H
  42. # include "config.h"
  43. #endif
  44. #include <stdio.h>
  45. #include <stdlib.h>
  46. #include <math.h>
  47. #include "block_allocator.h"
  48. #include "src/common/uid.h"
  49. #include "src/common/timers.h"
  50. #include "src/common/slurmdb_defs.h"
  51. #include "../bg_list_functions.h"
  52. #define DEBUG_PA
  53. #define BEST_COUNT_INIT 20
  54. /* in Q there are always 5 dimensions in a nodecard/board */
  55. typedef struct {
  56. int start[5];
  57. int end[5];
  58. } ba_nc_coords_t;
  59. #define mp_strip_unaltered(__mp) (__mp & ~BA_MP_USED_ALTERED_PASS)
  60. /* _ba_system is the "current" system that the structures will work
  61. * on */
  62. ba_mp_t ****ba_main_grid = NULL;
  63. ba_geo_system_t *ba_mp_geo_system = NULL;
  64. static ba_geo_system_t *ba_main_geo_system = NULL;
  65. static uint16_t *deny_pass = NULL;
  66. static ba_nc_coords_t g_nc_coords[16];
  67. static ba_mp_t **ba_main_grid_array = NULL;
  68. /* increment Y -> Z -> A -> X -> E
  69. * used for doing nodecard coords */
  70. static int ba_nc_dim_order[5] = {Y, Z, A, X, E};
  71. /** internal helper functions */
  72. /* */
  73. static char *_copy_from_main(List main_mps, List ret_list);
  74. /* */
  75. static char *_reset_altered_mps(List main_mps, bool get_name);
  76. /* */
  77. static int _check_deny_pass(int dim);
  78. /* */
  79. static int _fill_in_wires(List mps, ba_mp_t *start_mp, int dim,
  80. uint16_t geometry, uint16_t conn_type,
  81. bool full_check);
  82. /* */
  83. static void _setup_next_mps(int level, uint16_t *coords);
  84. /* */
  85. static void _increment_nc_coords(int dim, int *mp_coords, int *dim_size);
  86. /** */
  87. static bool _mp_used(ba_mp_t* ba_mp, int dim);
  88. /** */
  89. static bool _mp_out_used(ba_mp_t* ba_mp, int dim);
  90. /** */
  91. static uint16_t _find_distance(uint16_t start, uint16_t end, int dim);
  92. static int _ba_set_ionode_str_internal(int level, int *coords,
  93. int *start_offset, int *end_offset,
  94. hostlist_t hl);
  95. static bitstr_t *_find_sub_block(ba_geo_table_t **geo_table,
  96. uint16_t *start_loc, bitstr_t *total_bitmap,
  97. uint32_t node_count);
  98. static ba_geo_table_t *_find_geo_table(uint32_t orig_node_count,
  99. uint32_t *node_count,
  100. uint32_t total_count);
  101. extern void ba_create_system()
  102. {
  103. int a,x,y,z, i = 0, dim;
  104. uint16_t coords[SYSTEM_DIMENSIONS];
  105. int mp_coords[5];
  106. if (ba_main_grid)
  107. ba_destroy_system();
  108. slurm_mutex_lock(&ba_system_mutex);
  109. /* build all the possible geos for the mid planes */
  110. ba_main_geo_system = xmalloc(sizeof(ba_geo_system_t));
  111. ba_main_geo_system->dim_count = SYSTEM_DIMENSIONS;
  112. ba_main_geo_system->dim_size =
  113. xmalloc(sizeof(int) * ba_main_geo_system->dim_count);
  114. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++)
  115. ba_main_geo_system->dim_size[dim] = DIM_SIZE[dim];
  116. ba_create_geo_table(ba_main_geo_system, 0);
  117. //ba_print_geo_table(ba_main_geo_system);
  118. /* build all the possible geos for a sub block inside a mid plane */
  119. ba_mp_geo_system = xmalloc(sizeof(ba_geo_system_t));
  120. ba_mp_geo_system->dim_count = 5;
  121. ba_mp_geo_system->dim_size =
  122. xmalloc(sizeof(int) * ba_mp_geo_system->dim_count);
  123. /* These will never change. */
  124. ba_mp_geo_system->dim_size[0] = 4;
  125. ba_mp_geo_system->dim_size[1] = 4;
  126. ba_mp_geo_system->dim_size[2] = 4;
  127. ba_mp_geo_system->dim_size[3] = 4;
  128. ba_mp_geo_system->dim_size[4] = 2;
  129. /* FIXME: We need to not create and geo with a dimension of 3 in it.
  130. * There apparently is a limitation in BGQ where you can't
  131. * make a sub-block with a dimension of 3. If this ever goes
  132. * away just remove remove the extra parameter to the
  133. * ba_create_geo_table.
  134. *
  135. * FROM IBM:
  136. * We have recently encountered a problematic scenario with
  137. * sub-block jobs and how the system (used for I/O) and user
  138. * (used for MPI) torus class routes are configured. The
  139. * network device hardware has cutoff registers to prevent
  140. * packets from flowing outside of the
  141. * sub-block. Unfortunately, when the sub-block has a size 3,
  142. * the job can attempt to send user packets outside of its
  143. * sub-block. This causes it to be terminated by signal 36.
  144. */
  145. ba_create_geo_table(ba_mp_geo_system, 1);
  146. //ba_print_geo_table(ba_mp_geo_system);
  147. /* Now set it up to mark the corners of each nodecard. This
  148. is used if running a sub-block job on a small block later.
  149. */
  150. /* This is the basic idea for each small block size origin 00000
  151. 32 = 2x2x2x2x2
  152. 64 = 2x2x4x2x2
  153. 128 = 2x2x4x4x2
  154. 256 = 4x2x4x4x2
  155. 512 = 4x4x4x4x2
  156. */
  157. /* 32node boundaries (this is what the following code generates)
  158. N00 - 32 = 00000x11111
  159. N01 - 64 = 00200x11311
  160. N02 - 96 = 00020x11131
  161. N03 - 128 = 00220x11331
  162. N04 - 160 = 20000x31111
  163. N05 - 192 = 20200x31311
  164. N06 - 224 = 20020x31131
  165. N07 - 256 = 20220x31331
  166. N08 - 288 = 02000x13111
  167. N09 - 320 = 02200x13311
  168. N10 - 352 = 02020x13131
  169. N11 - 384 = 02220x13331
  170. N12 - 416 = 22000x33111
  171. N13 - 448 = 22200x33311
  172. N14 - 480 = 22020x33131
  173. N15 - 512 = 22220x33331
  174. */
  175. memset(&mp_coords, 0, sizeof(mp_coords));
  176. for (i=0; i<16; i++) {
  177. /*
  178. * increment Y -> Z -> A -> X
  179. * E always goes from 0->1
  180. */
  181. for (dim = 0; dim < 5; dim++) {
  182. g_nc_coords[i].start[dim] =
  183. g_nc_coords[i].end[dim] = mp_coords[dim];
  184. g_nc_coords[i].end[dim]++;
  185. }
  186. /* info("%d\tgot %c%c%c%c%cx%c%c%c%c%c", */
  187. /* i, */
  188. /* alpha_num[g_nc_coords[i].start[A]], */
  189. /* alpha_num[g_nc_coords[i].start[X]], */
  190. /* alpha_num[g_nc_coords[i].start[Y]], */
  191. /* alpha_num[g_nc_coords[i].start[Z]], */
  192. /* alpha_num[g_nc_coords[i].start[E]], */
  193. /* alpha_num[g_nc_coords[i].end[A]], */
  194. /* alpha_num[g_nc_coords[i].end[X]], */
  195. /* alpha_num[g_nc_coords[i].end[Y]], */
  196. /* alpha_num[g_nc_coords[i].end[Z]], */
  197. /* alpha_num[g_nc_coords[i].end[E]]); */
  198. _increment_nc_coords(0, mp_coords, ba_mp_geo_system->dim_size);
  199. }
  200. /* Set up a flat array to be used in conjunction with the
  201. ba_geo system.
  202. */
  203. ba_main_grid_array = xmalloc(sizeof(ba_mp_t *) *
  204. ba_main_geo_system->total_size);
  205. i = 0;
  206. ba_main_grid = (ba_mp_t****)
  207. xmalloc(sizeof(ba_mp_t***) * DIM_SIZE[A]);
  208. for (a = 0; a < DIM_SIZE[A]; a++) {
  209. ba_main_grid[a] = (ba_mp_t***)
  210. xmalloc(sizeof(ba_mp_t**) * DIM_SIZE[X]);
  211. for (x = 0; x < DIM_SIZE[X]; x++) {
  212. ba_main_grid[a][x] = (ba_mp_t**)
  213. xmalloc(sizeof(ba_mp_t*) * DIM_SIZE[Y]);
  214. for (y = 0; y < DIM_SIZE[Y]; y++) {
  215. ba_main_grid[a][x][y] = (ba_mp_t*)
  216. xmalloc(sizeof(ba_mp_t) * DIM_SIZE[Z]);
  217. for (z = 0; z < DIM_SIZE[Z]; z++) {
  218. ba_mp_t *ba_mp = &ba_main_grid
  219. [a][x][y][z];
  220. ba_mp->coord[A] = a;
  221. ba_mp->coord[X] = x;
  222. ba_mp->coord[Y] = y;
  223. ba_mp->coord[Z] = z;
  224. snprintf(ba_mp->coord_str,
  225. sizeof(ba_mp->coord_str),
  226. "%c%c%c%c",
  227. alpha_num[ba_mp->coord[A]],
  228. alpha_num[ba_mp->coord[X]],
  229. alpha_num[ba_mp->coord[Y]],
  230. alpha_num[ba_mp->coord[Z]]);
  231. ba_setup_mp(ba_mp, true, false);
  232. ba_mp->state = NODE_STATE_IDLE;
  233. /* This might get changed
  234. later, but just incase set
  235. it up here.
  236. */
  237. ba_mp->index = i++;
  238. ba_mp->ba_geo_index =
  239. ba_node_xlate_to_1d(
  240. ba_mp->coord,
  241. ba_main_geo_system);
  242. ba_main_grid_array[ba_mp->ba_geo_index]
  243. = ba_mp;
  244. }
  245. }
  246. }
  247. }
  248. _setup_next_mps(A, coords);
  249. slurm_mutex_unlock(&ba_system_mutex);
  250. }
  251. /** */
  252. extern void ba_destroy_system(void)
  253. {
  254. int a, x, y, z;
  255. slurm_mutex_lock(&ba_system_mutex);
  256. xfree(ba_main_grid_array);
  257. if (ba_main_grid) {
  258. for (a=0; a<DIM_SIZE[A]; a++) {
  259. for (x = 0; x < DIM_SIZE[X]; x++) {
  260. for (y = 0; y < DIM_SIZE[Y]; y++) {
  261. for (z=0; z < DIM_SIZE[Z]; z++) {
  262. free_internal_ba_mp(
  263. &ba_main_grid
  264. [a][x][y][z]);
  265. }
  266. xfree(ba_main_grid[a][x][y]);
  267. }
  268. xfree(ba_main_grid[a][x]);
  269. }
  270. xfree(ba_main_grid[a]);
  271. }
  272. xfree(ba_main_grid);
  273. ba_main_grid = NULL;
  274. }
  275. if (ba_main_geo_system) {
  276. ba_free_geo_table(ba_main_geo_system);
  277. xfree(ba_main_geo_system->dim_size);
  278. xfree(ba_main_geo_system);
  279. }
  280. if (ba_mp_geo_system) {
  281. ba_free_geo_table(ba_mp_geo_system);
  282. xfree(ba_mp_geo_system->dim_size);
  283. xfree(ba_mp_geo_system);
  284. }
  285. memset(DIM_SIZE, 0, sizeof(DIM_SIZE));
  286. slurm_mutex_unlock(&ba_system_mutex);
  287. }
  288. /*
  289. * create a block request. Note that if the geometry is given,
  290. * then size is ignored. If elongate is true, the algorithm will try
  291. * to fit that a block of cubic shape and then it will try other
  292. * elongated geometries. (ie, 2x2x2 -> 4x2x1 -> 8x1x1).
  293. *
  294. * IN/OUT - ba_request: structure to allocate and fill in.
  295. *
  296. * ALL below IN's need to be set within the ba_request before the call
  297. * if you want them to be used.
  298. * ALL below OUT's are set and returned within the ba_request.
  299. * IN - avail_mp_bitmap: bitmap of usable midplanes.
  300. * IN - blrtsimage: BlrtsImage for this block if not default
  301. * IN - conn_type: connection type of request (TORUS or MESH or SMALL)
  302. * IN - elongate: if true, will try to fit different geometries of
  303. * same size requests
  304. * IN/OUT - geometry: requested/returned geometry of block
  305. * IN - linuximage: LinuxImage for this block if not default
  306. * IN - mloaderimage: MLoaderImage for this block if not default
  307. * IN - nodecards: Number of nodecards in each block in request only
  308. * used of small block allocations.
  309. * OUT - passthroughs: if there were passthroughs used in the
  310. * generation of the block.
  311. * IN - procs: Number of real processors requested
  312. * IN - quarters: Number of midplane quarters in each block in request only
  313. * used of small block allocations.
  314. * IN - RamDiskimage: RamDiskImage for this block if not default
  315. * IN - rotate: if true, allows rotation of block during fit
  316. * OUT - save_name: hostlist of midplanes used in block
  317. * IN/OUT - size: requested/returned count of midplanes in block
  318. * IN - start: geo location of where to start the allocation
  319. * IN - start_req: if set use the start variable to start at
  320. * return success of allocation/validation of params
  321. */
  322. extern int new_ba_request(select_ba_request_t* ba_request)
  323. {
  324. int i=0;
  325. xfree(ba_request->save_name);
  326. if (ba_request->geometry[0] != (uint16_t)NO_VAL) {
  327. for (i=0; i<cluster_dims; i++){
  328. if ((ba_request->geometry[i] < 1)
  329. || (ba_request->geometry[i] > DIM_SIZE[i])) {
  330. error("new_ba_request Error, "
  331. "request geometry is invalid dim %d "
  332. "can't be %c, largest is %c",
  333. i,
  334. alpha_num[ba_request->geometry[i]],
  335. alpha_num[DIM_SIZE[i]]);
  336. return 0;
  337. }
  338. }
  339. ba_request->size = 1;
  340. for (i=0; i<cluster_dims; i++)
  341. ba_request->size *= ba_request->geometry[i];
  342. }
  343. if (!(cluster_flags & CLUSTER_FLAG_BGQ)) {
  344. if (ba_request->size
  345. && (ba_request->geometry[0] == (uint16_t)NO_VAL)) {
  346. ba_request->geometry[0] = ba_request->size;
  347. } else {
  348. error("new_ba_request: "
  349. "No size or geometry given");
  350. return 0;
  351. }
  352. return 1;
  353. }
  354. if (ba_request->deny_pass == (uint16_t)NO_VAL)
  355. ba_request->deny_pass = ba_deny_pass;
  356. deny_pass = &ba_request->deny_pass;
  357. return 1;
  358. }
  359. /**
  360. * print a block request
  361. */
  362. extern void print_ba_request(select_ba_request_t* ba_request)
  363. {
  364. int i;
  365. if (ba_request == NULL){
  366. error("print_ba_request Error, request is NULL");
  367. return;
  368. }
  369. debug(" ba_request:");
  370. debug(" geometry:\t");
  371. for (i=0; i<cluster_dims; i++){
  372. debug("%d", ba_request->geometry[i]);
  373. }
  374. debug(" conn_type:\t");
  375. for (i=0; i<cluster_dims; i++){
  376. debug("%d", ba_request->conn_type[i]);
  377. }
  378. debug(" size:\t%d", ba_request->size);
  379. debug(" rotate:\t%d", ba_request->rotate);
  380. debug(" elongate:\t%d", ba_request->elongate);
  381. }
  382. /* ba_system_mutex needs to be locked before calling this. */
  383. extern ba_mp_t *coord2ba_mp(const uint16_t *coord)
  384. {
  385. if ((coord[A] >= DIM_SIZE[A]) || (coord[X] >= DIM_SIZE[X]) ||
  386. (coord[Y] >= DIM_SIZE[Y]) || (coord[Z] >= DIM_SIZE[Z])) {
  387. error("Invalid coordinate %d:%d:%d:%d",
  388. coord[A], coord[X], coord[Y], coord[Z]);
  389. return NULL;
  390. }
  391. return &ba_main_grid[coord[A]][coord[X]][coord[Y]][coord[Z]];
  392. }
  393. /*
  394. * Try to allocate a block.
  395. *
  396. * IN - ba_request: allocation request
  397. * OUT - results: List of results of the allocation request. Each
  398. * list entry will be a coordinate. allocate_block will create the
  399. * list, but the caller must destroy it.
  400. *
  401. * return: success or error of request
  402. */
  403. extern int allocate_block(select_ba_request_t* ba_request, List results)
  404. {
  405. if (!ba_initialized){
  406. error("Error, configuration not initialized, "
  407. "calling ba_init(NULL, 1)");
  408. ba_init(NULL, 1);
  409. }
  410. if (!ba_request){
  411. error("allocate_block Error, request not initialized");
  412. return 0;
  413. }
  414. if (!(cluster_flags & CLUSTER_FLAG_BG))
  415. return 0;
  416. if ((ba_request->save_name = set_bg_block(results, ba_request)))
  417. return 1;
  418. debug2("allocate_block: can't allocate");
  419. return 0;
  420. }
  421. /*
  422. * Admin wants to remove a previous allocation.
  423. * will allow Admin to delete a previous allocation retrival by letter code.
  424. */
  425. extern int remove_block(List mps, bool is_small)
  426. {
  427. int dim;
  428. ba_mp_t* curr_ba_mp = NULL;
  429. ba_mp_t* ba_mp = NULL;
  430. ListIterator itr;
  431. slurm_mutex_lock(&ba_system_mutex);
  432. itr = list_iterator_create(mps);
  433. while ((curr_ba_mp = (ba_mp_t*) list_next(itr))) {
  434. /* since the list that comes in might not be pointers
  435. to the main list we need to point to that main list */
  436. ba_mp = coord2ba_mp(curr_ba_mp->coord);
  437. if (curr_ba_mp->used) {
  438. ba_mp->used &= (~BA_MP_USED_TRUE);
  439. if (ba_mp->used == BA_MP_USED_FALSE)
  440. bit_clear(ba_main_mp_bitmap,
  441. ba_mp->ba_geo_index);
  442. }
  443. ba_mp->used &= (~BA_MP_USED_ALTERED_PASS);
  444. /* Small blocks don't use wires, and only have 1 mp,
  445. so just break. */
  446. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO)
  447. info("remove_block: midplane %s used state now %d",
  448. ba_mp->coord_str, ba_mp->used);
  449. for (dim=0; dim<cluster_dims; dim++) {
  450. /* House the altered usage here without any
  451. error so we don't take it from the original.
  452. */
  453. uint16_t altered_usage;
  454. if (curr_ba_mp == ba_mp) {
  455. altered_usage = ba_mp->alter_switch[dim].usage
  456. & (~BG_SWITCH_CABLE_ERROR_FULL);
  457. /* Remove the usage that was altered */
  458. /* info("remove_block: %s(%d) %s removing %s", */
  459. /* ba_mp->coord_str, dim, */
  460. /* ba_switch_usage_str( */
  461. /* ba_mp->axis_switch[dim].usage), */
  462. /* ba_switch_usage_str( */
  463. /* ba_mp->alter_switch[dim].usage)); */
  464. ba_mp->axis_switch[dim].usage &=
  465. (~altered_usage);
  466. /* info("remove_block: %s(%d) is now at %s", */
  467. /* ba_mp->coord_str, dim, */
  468. /* ba_switch_usage_str( */
  469. /* ba_mp->axis_switch[dim].usage)); */
  470. continue;
  471. }
  472. /* Set this after we know curr_ba_mp isn't
  473. the same as ba_mp so we don't mess up the
  474. original.
  475. */
  476. altered_usage = curr_ba_mp->axis_switch[dim].usage
  477. & (~BG_SWITCH_CABLE_ERROR_FULL);
  478. if (altered_usage != BG_SWITCH_NONE) {
  479. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
  480. info("remove_block: 2 %s(%d) %s %s "
  481. "removing %s",
  482. ba_mp->coord_str, dim,
  483. curr_ba_mp->coord_str,
  484. ba_switch_usage_str(
  485. ba_mp->axis_switch
  486. [dim].usage),
  487. ba_switch_usage_str(
  488. altered_usage));
  489. /* Just remove the usage set here */
  490. ba_mp->axis_switch[dim].usage &=
  491. (~altered_usage);
  492. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
  493. info("remove_block: 2 %s(%d) is "
  494. "now at %s",
  495. ba_mp->coord_str, dim,
  496. ba_switch_usage_str(
  497. ba_mp->axis_switch[dim].
  498. usage));
  499. }
  500. //ba_mp->alter_switch[dim].usage = BG_SWITCH_NONE;
  501. }
  502. }
  503. list_iterator_destroy(itr);
  504. slurm_mutex_unlock(&ba_system_mutex);
  505. return 1;
  506. }
  507. /*
  508. * Used to set a block into a virtual system. The system can be
  509. * cleared first and this function sets all the wires and midplanes
  510. * used in the mplist given. The mplist is a list of ba_mp_t's
  511. * that are already set up. This is very handly to test if there are
  512. * any passthroughs used by one block when adding another block that
  513. * also uses those wires, and neither use any overlapping
  514. * midplanes. Doing a simple bitmap & will not reveal this.
  515. *
  516. * Returns SLURM_SUCCESS if mplist fits into system without
  517. * conflict, and SLURM_ERROR if mplist conflicts with something
  518. * already in the system.
  519. */
  520. extern int check_and_set_mp_list(List mps)
  521. {
  522. int rc = SLURM_ERROR;
  523. int i;
  524. ba_switch_t *ba_switch = NULL, *curr_ba_switch = NULL;
  525. ba_mp_t *ba_mp = NULL, *curr_ba_mp = NULL;
  526. ListIterator itr = NULL;
  527. if (!mps)
  528. return rc;
  529. slurm_mutex_lock(&ba_system_mutex);
  530. itr = list_iterator_create(mps);
  531. while ((ba_mp = list_next(itr))) {
  532. /* info("checking %s", ba_mp->coord_str); */
  533. curr_ba_mp = coord2ba_mp(ba_mp->coord);
  534. if (ba_mp->used && curr_ba_mp->used) {
  535. /* Only error if the midplane isn't already
  536. * marked down or in a error state outside of
  537. * the bluegene block.
  538. */
  539. uint16_t base_state, mp_flags;
  540. base_state = curr_ba_mp->state & NODE_STATE_BASE;
  541. mp_flags = curr_ba_mp->state & NODE_STATE_FLAGS;
  542. if (!(mp_flags & (NODE_STATE_DRAIN | NODE_STATE_FAIL))
  543. && (base_state != NODE_STATE_DOWN)) {
  544. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
  545. info("check_and_set_mp_list: "
  546. "I have already been to "
  547. "this mp %s %s %d %d",
  548. ba_mp->coord_str,
  549. node_state_string(
  550. curr_ba_mp->state),
  551. ba_mp->used, curr_ba_mp->used);
  552. rc = SLURM_ERROR;
  553. goto end_it;
  554. }
  555. }
  556. if (ba_mp->used) {
  557. curr_ba_mp->used = ba_mp->used;
  558. xassert(!bit_test(ba_main_mp_bitmap,
  559. ba_mp->ba_geo_index));
  560. bit_set(ba_main_mp_bitmap, ba_mp->ba_geo_index);
  561. }
  562. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
  563. info("check_and_set_mp_list: "
  564. "%s is used ?= %d %d",
  565. curr_ba_mp->coord_str,
  566. curr_ba_mp->used, ba_mp->used);
  567. for(i=0; i<cluster_dims; i++) {
  568. ba_switch = &ba_mp->axis_switch[i];
  569. curr_ba_switch = &curr_ba_mp->axis_switch[i];
  570. //info("checking dim %d", i);
  571. if (ba_switch->usage == BG_SWITCH_NONE)
  572. continue;
  573. else if (ba_switch->usage
  574. & BG_SWITCH_CABLE_ERROR_FULL) {
  575. error("check_and_set_mp_list: Somehow we got "
  576. "a switch with an error set in it. "
  577. "This should never happen except "
  578. "on a system with missing cables such "
  579. "as a half rack system. %u",
  580. ba_switch->usage);
  581. continue;
  582. }
  583. if (ba_switch->usage & curr_ba_switch->usage) {
  584. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
  585. info("check_and_set_mp_list: "
  586. "%s(%d) is already in "
  587. "use the way we want to use it. "
  588. "%s already at %s",
  589. ba_mp->coord_str, i,
  590. ba_switch_usage_str(
  591. ba_switch->usage),
  592. ba_switch_usage_str(
  593. curr_ba_switch->usage));
  594. rc = SLURM_ERROR;
  595. goto end_it;
  596. }
  597. /* Since we are only checking to see if this
  598. block is creatable we don't need to check
  599. hardware issues like bad cables.
  600. */
  601. /* else if ((curr_ba_switch->usage */
  602. /* & BG_SWITCH_CABLE_ERROR_SET) */
  603. /* && (ba_switch->usage & BG_SWITCH_OUT_PASS)) { */
  604. /* if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP) */
  605. /* info("check_and_set_mp_list: " */
  606. /* "%s(%d)'s cable is not available " */
  607. /* "can't really make this block. " */
  608. /* "We need %s and system is %s", */
  609. /* ba_mp->coord_str, i, */
  610. /* ba_switch_usage_str( */
  611. /* ba_switch->usage), */
  612. /* ba_switch_usage_str( */
  613. /* curr_ba_switch->usage)); */
  614. /* } */
  615. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
  616. info("check_and_set_mp_list: "
  617. "setting %s(%d) to from %s to %s",
  618. ba_mp->coord_str, i,
  619. ba_switch_usage_str(curr_ba_switch->usage),
  620. ba_switch_usage_str(curr_ba_switch->usage
  621. | ba_switch->usage));
  622. curr_ba_switch->usage |= ba_switch->usage;
  623. }
  624. }
  625. rc = SLURM_SUCCESS;
  626. end_it:
  627. list_iterator_destroy(itr);
  628. slurm_mutex_unlock(&ba_system_mutex);
  629. return rc;
  630. }
  631. /*
  632. * Used to find, and set up midplanes and the wires in the virtual
  633. * system and return them in List results
  634. *
  635. * IN/OUT results - a list with a NULL destroyer filled in with
  636. * midplanes and wires set to create the block with the api. If
  637. * only interested in the hostlist NULL can be excepted also.
  638. * IN ba_request - request for the block
  639. *
  640. * To be set in the ba_request
  641. * start - where to start the allocation. (optional)
  642. * geometry or size - the requested geometry of the block. (required)
  643. * conn_type - mesh, torus, or small. (required)
  644. *
  645. * RET char * - hostlist of midplanes results represent must be
  646. * xfreed. NULL on failure
  647. */
  648. extern char *set_bg_block(List results, select_ba_request_t* ba_request)
  649. {
  650. List main_mps = NULL;
  651. char *name = NULL;
  652. ba_mp_t* ba_mp = NULL;
  653. int dim;
  654. uint16_t local_deny_pass = ba_deny_pass;
  655. ba_geo_table_t *ba_geo_table = NULL;
  656. bitstr_t *success_bitmap = NULL;
  657. uint16_t orig_conn_type[HIGHEST_DIMENSIONS];
  658. xassert(ba_initialized);
  659. if (!ba_request->size) {
  660. if (ba_request->geometry[0] == (uint16_t)NO_VAL) {
  661. error("set_bg_block: No size or geometry given.");
  662. return NULL;
  663. }
  664. ba_request->size = 1;
  665. for (dim=0; dim<cluster_dims; dim++)
  666. ba_request->size *= ba_request->geometry[dim];
  667. }
  668. /* set up the geo_table */
  669. xassert(ba_request->size);
  670. if (!(ba_geo_table =
  671. ba_main_geo_system->geo_table_ptr[ba_request->size])) {
  672. error("set_bg_block: No geometries for %d midplanes",
  673. ba_request->size);
  674. return NULL;
  675. }
  676. if (!deny_pass)
  677. deny_pass = &local_deny_pass;
  678. memcpy(orig_conn_type, ba_request->conn_type,
  679. sizeof(ba_request->conn_type));
  680. slurm_mutex_lock(&ba_system_mutex);
  681. while (ba_geo_table) {
  682. ListIterator itr;
  683. int scan_offset = 0, cnt = 0, i=0;
  684. uint16_t start_loc[ba_main_geo_system->dim_count];
  685. if (ba_request->geometry[0] != (uint16_t)NO_VAL) {
  686. /* if we are requesting a specific geo, go directly to
  687. that geo_table. */
  688. if (memcmp(ba_request->geometry, ba_geo_table->geometry,
  689. sizeof(uint16_t) * cluster_dims)) {
  690. ba_geo_table = ba_geo_table->next_ptr;
  691. continue;
  692. }
  693. }
  694. try_again:
  695. if (success_bitmap)
  696. FREE_NULL_BITMAP(success_bitmap);
  697. if (main_mps && list_count(main_mps)) {
  698. _reset_altered_mps(main_mps, 0);
  699. list_flush(main_mps);
  700. }
  701. if (ba_geo_test_all(ba_main_mp_bitmap,
  702. &success_bitmap,
  703. ba_geo_table, &cnt,
  704. ba_main_geo_system, deny_pass,
  705. start_loc, &scan_offset, false)
  706. != SLURM_SUCCESS) {
  707. if (ba_request->geometry[0] != (uint16_t)NO_VAL) {
  708. ba_geo_table = NULL;
  709. break;
  710. }
  711. ba_geo_table = ba_geo_table->next_ptr;
  712. continue;
  713. }
  714. if (ba_request->start_req) {
  715. /* if we are requesting a specific start make
  716. sure that is what is returned. Else try
  717. again. Since this only happens with smap
  718. or startup this handling it this way
  719. shouldn't be that big of a deal. */
  720. if (memcmp(ba_request->start, start_loc,
  721. sizeof(uint16_t) * cluster_dims))
  722. goto try_again;
  723. }
  724. main_mps = list_create(NULL);
  725. for (i=0; i<ba_main_geo_system->total_size; i++) {
  726. if (!bit_test(success_bitmap, i))
  727. continue;
  728. ba_mp = ba_main_grid_array[i];
  729. xassert(ba_mp);
  730. for (dim=0; dim<cluster_dims; dim++) {
  731. if (_mp_used(ba_mp, dim))
  732. goto try_again;
  733. if (ba_geo_table->geometry[dim] == 1) {
  734. /* Always check MESH here since we
  735. * only care about the IN/OUT ports.
  736. * all 1 dimensions need a TORUS */
  737. ba_mp->alter_switch[dim].usage
  738. |= BG_SWITCH_WRAPPED;
  739. if (ba_debug_flags
  740. & DEBUG_FLAG_BG_ALGO_DEEP)
  741. info("set_bg_block: "
  742. "using mp %s(%d) "
  743. "in 1 geo %s added %s",
  744. ba_mp->coord_str, dim,
  745. ba_switch_usage_str(
  746. ba_mp->
  747. axis_switch[dim].
  748. usage),
  749. ba_switch_usage_str(
  750. ba_mp->
  751. alter_switch[dim].
  752. usage));
  753. continue;
  754. }
  755. }
  756. ba_mp->used = BA_MP_USED_ALTERED;
  757. list_append(main_mps, ba_mp);
  758. }
  759. /* If we are going to take up the entire dimension
  760. might as well force it to be TORUS. Check against
  761. MESH here instead of !TORUS so we don't mess up
  762. small block allocations.
  763. */
  764. for (dim=0; dim<cluster_dims; dim++) {
  765. if (((ba_request->conn_type[dim] == SELECT_MESH)
  766. || (ba_request->conn_type[dim] == SELECT_NAV))
  767. && ((ba_geo_table->geometry[dim] == 1)
  768. || (ba_geo_table->geometry[dim]
  769. == DIM_SIZE[dim]))) {
  770. /* On a Q all single midplane blocks
  771. * must be a TORUS.
  772. *
  773. * Also if we are using all midplanes
  774. * in a dimension might as well make
  775. * it a torus.
  776. */
  777. ba_request->conn_type[dim] = SELECT_TORUS;
  778. } else if (ba_request->conn_type[dim] == SELECT_NAV) {
  779. /* Set everything else to the default */
  780. ba_request->conn_type[dim] =
  781. bg_conf->default_conn_type[dim];
  782. }
  783. }
  784. itr = list_iterator_create(main_mps);
  785. while ((ba_mp = list_next(itr))) {
  786. if (ba_mp->used & BA_MP_USED_PASS_BIT)
  787. continue;
  788. for (dim=0; dim<cluster_dims; dim++) {
  789. if ((ba_geo_table->geometry[dim] == 1)
  790. || (ba_mp->coord[dim] != start_loc[dim]))
  791. continue;
  792. if (!_fill_in_wires(
  793. main_mps, ba_mp, dim,
  794. ba_geo_table->geometry[dim],
  795. ba_request->conn_type[dim],
  796. ba_request->full_check)) {
  797. list_iterator_destroy(itr);
  798. memcpy(ba_request->conn_type,
  799. orig_conn_type,
  800. sizeof(ba_request->conn_type));
  801. goto try_again;
  802. }
  803. }
  804. }
  805. list_iterator_destroy(itr);
  806. /* fill in the start with the actual start of the
  807. * block since it isn't always easy to figure out and
  808. * is easily */
  809. memcpy(ba_request->start, start_loc, sizeof(ba_request->start));
  810. break;
  811. }
  812. if (success_bitmap)
  813. FREE_NULL_BITMAP(success_bitmap);
  814. if (ba_geo_table) {
  815. /* Success */
  816. if (results)
  817. name = _copy_from_main(main_mps, results);
  818. else
  819. name = _reset_altered_mps(main_mps, 1);
  820. }
  821. if (main_mps) {
  822. /* handle failure */
  823. if (!name)
  824. _reset_altered_mps(main_mps, 0);
  825. list_destroy(main_mps);
  826. main_mps = NULL;
  827. }
  828. slurm_mutex_unlock(&ba_system_mutex);
  829. if (name)
  830. debug2("name = %s", name);
  831. else
  832. debug2("can't allocate");
  833. if (deny_pass == &local_deny_pass)
  834. deny_pass = NULL;
  835. return name;
  836. }
  837. extern void ba_rotate_geo(uint16_t *req_geo, int rot_cnt)
  838. {
  839. uint16_t tmp;
  840. switch (rot_cnt) {
  841. case 0: /* ABCD -> ABDC */
  842. case 3: /* DABC -> DACB */
  843. case 6: /* CDAB -> CDBA */
  844. case 9: /* CADB -> CABD */
  845. case 14: /* DBAC -> DBCA */
  846. case 17: /* ACBD -> ACDB */
  847. case 20: /* BDCA -> BCDA */
  848. case 21: /* BCDA -> BCAD */
  849. SWAP(req_geo[Y], req_geo[Z], tmp);
  850. break;
  851. case 1: /* ABDC -> ADBC */
  852. case 4: /* DACB -> DCAB */
  853. case 7: /* CDBA -> CBDA */
  854. case 10: /* CABD -> CBAD */
  855. case 12: /* BADC -> BDAC */
  856. case 15: /* DBCA -> DCBA */
  857. case 18: /* ACDB -> ADCB */
  858. case 22: /* BCAD -> BACD */
  859. SWAP(req_geo[X], req_geo[Y], tmp);
  860. break;
  861. case 2: /* ADBC -> DABC */
  862. case 5: /* DCAB -> CDAB */
  863. case 13: /* BDAC -> DBAC */
  864. case 23: /* BACD -> ABCD */
  865. SWAP(req_geo[A], req_geo[X], tmp);
  866. break;
  867. case 16: /* DCBA -> ACBD */
  868. case 19: /* ADCB -> BDCA */
  869. SWAP(req_geo[A], req_geo[Z], tmp);
  870. break;
  871. case 8: /* CBDA -> CADB */
  872. SWAP(req_geo[X], req_geo[Z], tmp);
  873. break;
  874. case 11: /* CBAD -> BCAD -> BACD -> BADC */
  875. SWAP(req_geo[A], req_geo[X], tmp);
  876. SWAP(req_geo[X], req_geo[Y], tmp);
  877. SWAP(req_geo[Y], req_geo[Z], tmp);
  878. break;
  879. }
  880. }
  881. extern bool ba_sub_block_in_bitmap(select_jobinfo_t *jobinfo,
  882. bitstr_t *usable_bitmap, bool step)
  883. {
  884. bitstr_t *found_bits = NULL;
  885. uint32_t node_count;
  886. ba_geo_table_t *geo_table = NULL;
  887. int clr_cnt, dim;
  888. uint16_t start_loc[ba_mp_geo_system->dim_count];
  889. xassert(jobinfo);
  890. xassert(usable_bitmap);
  891. node_count = jobinfo->cnode_cnt;
  892. clr_cnt = bit_clear_count(usable_bitmap);
  893. if (clr_cnt < node_count)
  894. return false;
  895. jobinfo->dim_cnt = ba_mp_geo_system->dim_count;
  896. try_again:
  897. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO) {
  898. bit_not(usable_bitmap);
  899. char *tmp_char = ba_node_map_ranged_hostlist(
  900. usable_bitmap, ba_mp_geo_system);
  901. bit_not(usable_bitmap);
  902. info("ba_sub_block_in_bitmap: "
  903. "looking for %u in a field of %u (%s).",
  904. node_count, clr_cnt, tmp_char);
  905. xfree(tmp_char);
  906. }
  907. if (!(geo_table = _find_geo_table(node_count, &node_count, clr_cnt)))
  908. return false;
  909. if (!(found_bits = _find_sub_block(
  910. &geo_table, start_loc, usable_bitmap, node_count))) {
  911. /* This is to vet we have a good geo on this request. So if a
  912. person asks for 12 and the only reason they can't get it is
  913. because they can't get that geo and if they would of asked
  914. for 16 then they could run we do that for them.
  915. */
  916. node_count++;
  917. if (clr_cnt > node_count) {
  918. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO)
  919. info("trying with a larger size");
  920. goto try_again;
  921. }
  922. return false;
  923. }
  924. if (jobinfo->units_avail)
  925. FREE_NULL_BITMAP(jobinfo->units_avail);
  926. if (jobinfo->units_used)
  927. FREE_NULL_BITMAP(jobinfo->units_used);
  928. jobinfo->units_avail = found_bits;
  929. found_bits = NULL;
  930. jobinfo->units_used = bit_copy(jobinfo->units_avail);
  931. /* ba_sub_block_in_bitmap works for both job and step
  932. allocations. It sets the units_used to the
  933. opposite of units_available by default. If used for a step
  934. we want all units used to be that of the avail for easy
  935. clearing.
  936. */
  937. if (!step)
  938. bit_not(jobinfo->units_used);
  939. xfree(jobinfo->ionode_str);
  940. jobinfo->cnode_cnt = node_count;
  941. for (dim = 0; dim < jobinfo->dim_cnt; dim++) {
  942. jobinfo->geometry[dim] = geo_table->geometry[dim];
  943. jobinfo->start_loc[dim] = start_loc[dim];
  944. }
  945. if (node_count < bg_conf->mp_cnode_cnt) {
  946. jobinfo->ionode_str = ba_node_map_ranged_hostlist(
  947. jobinfo->units_avail, ba_mp_geo_system);
  948. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO) {
  949. char *tmp_char;
  950. bitstr_t *total_bitmap = bit_copy(usable_bitmap);
  951. bit_or(total_bitmap, jobinfo->units_avail);
  952. bit_not(total_bitmap);
  953. tmp_char = ba_node_map_ranged_hostlist(
  954. total_bitmap, ba_mp_geo_system);
  955. FREE_NULL_BITMAP(total_bitmap);
  956. info("ba_sub_block_in_bitmap: "
  957. "can use cnodes %s leaving '%s' usable.",
  958. jobinfo->ionode_str, tmp_char);
  959. xfree(tmp_char);
  960. }
  961. } else if (ba_debug_flags & DEBUG_FLAG_BG_ALGO) {
  962. info("ba_sub_block_in_bitmap: "
  963. "can use all cnodes leaving none usable.");
  964. }
  965. return true;
  966. }
  967. extern int ba_sub_block_in_bitmap_clear(
  968. select_jobinfo_t *jobinfo, bitstr_t *usable_bitmap)
  969. {
  970. char *tmp_char = NULL, *tmp_char2 = NULL;
  971. if (!jobinfo->units_avail) {
  972. error("ba_sub_block_in_bitmap_clear: "
  973. "no units avail bitmap on the jobinfo");
  974. return SLURM_ERROR;
  975. }
  976. /* use units_avail here instead of units_used so it works for
  977. both jobs and steps with no other code.
  978. */
  979. bit_not(jobinfo->units_avail);
  980. bit_and(usable_bitmap, jobinfo->units_avail);
  981. bit_not(jobinfo->units_avail);
  982. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_ALGO) {
  983. tmp_char = ba_node_map_ranged_hostlist(
  984. jobinfo->units_avail, ba_mp_geo_system);
  985. bit_not(usable_bitmap);
  986. tmp_char2 = ba_node_map_ranged_hostlist(
  987. usable_bitmap, ba_mp_geo_system);
  988. bit_not(usable_bitmap);
  989. info("ba_sub_block_in_bitmap_clear: "
  990. "cleared cnodes %s making '%s' available.",
  991. tmp_char, tmp_char2);
  992. xfree(tmp_char);
  993. xfree(tmp_char2);
  994. }
  995. return SLURM_SUCCESS;
  996. }
  997. extern ba_mp_t *ba_sub_block_in_record(
  998. bg_record_t *bg_record, uint32_t *node_count, select_jobinfo_t *jobinfo)
  999. {
  1000. ListIterator itr = NULL;
  1001. ba_mp_t *ba_mp = NULL;
  1002. ba_geo_table_t *geo_table = NULL;
  1003. char *tmp_char = NULL;
  1004. uint32_t orig_node_count = *node_count;
  1005. int dim;
  1006. uint32_t max_clear_cnt = 0, clear_cnt;
  1007. bitstr_t *total_bitmap = NULL;
  1008. uint16_t start_loc[ba_mp_geo_system->dim_count];
  1009. xassert(ba_mp_geo_system);
  1010. xassert(bg_record->ba_mp_list);
  1011. xassert(jobinfo);
  1012. xassert(!jobinfo->units_used);
  1013. jobinfo->dim_cnt = ba_mp_geo_system->dim_count;
  1014. try_again:
  1015. if (!(geo_table = _find_geo_table(
  1016. orig_node_count, node_count, bg_record->cnode_cnt)))
  1017. return NULL;
  1018. itr = list_iterator_create(bg_record->ba_mp_list);
  1019. while ((ba_mp = list_next(itr))) {
  1020. if (!ba_mp->used)
  1021. continue;
  1022. /* Create the bitmap if it doesn't exist. Since this
  1023. * is a copy of the original and the cnode_bitmap is
  1024. * only used for sub-block jobs we only create it
  1025. * when needed. */
  1026. if (!ba_mp->cnode_bitmap)
  1027. ba_mp->cnode_bitmap =
  1028. ba_create_ba_mp_cnode_bitmap(bg_record);
  1029. if (!ba_mp->cnode_err_bitmap)
  1030. ba_mp->cnode_err_bitmap =
  1031. bit_alloc(bg_conf->mp_cnode_cnt);
  1032. total_bitmap = bit_copy(ba_mp->cnode_bitmap);
  1033. bit_or(total_bitmap, ba_mp->cnode_err_bitmap);
  1034. if ((jobinfo->units_used = _find_sub_block(
  1035. &geo_table, start_loc, total_bitmap, *node_count)))
  1036. break;
  1037. clear_cnt = bit_clear_count(total_bitmap);
  1038. FREE_NULL_BITMAP(total_bitmap);
  1039. /* Grab the most empty midplane to be used later if we
  1040. can't find a spot.
  1041. */
  1042. if (max_clear_cnt < clear_cnt)
  1043. max_clear_cnt = clear_cnt;
  1044. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO)
  1045. info("couldn't place it on %s", ba_mp->coord_str);
  1046. geo_table = ba_mp_geo_system->geo_table_ptr[*node_count];
  1047. }
  1048. list_iterator_destroy(itr);
  1049. /* This is to vet we have a good geo on this request. So if a
  1050. person asks for 12 and the only reason they can't get it is
  1051. because they can't get that geo and if they would of asked
  1052. for 16 then they could run we do that for them.
  1053. */
  1054. if (!ba_mp) {
  1055. if (max_clear_cnt > (*node_count)+1) {
  1056. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO)
  1057. info("trying with a larger size");
  1058. (*node_count)++;
  1059. goto try_again;
  1060. }
  1061. return NULL;
  1062. }
  1063. /* SUCCESS! */
  1064. /* Since we use conn_type as the relative start point, if the
  1065. block uses more than 1 midplane we need to give the
  1066. relative start point a boost when we go to a different midplane.
  1067. */
  1068. memset(jobinfo->conn_type, 0, sizeof(jobinfo->conn_type));
  1069. for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
  1070. jobinfo->conn_type[dim] = _find_distance(
  1071. bg_record->start[dim], ba_mp->coord[dim], dim);
  1072. bit_or(ba_mp->cnode_bitmap, jobinfo->units_used);
  1073. jobinfo->ionode_str = ba_node_map_ranged_hostlist(
  1074. jobinfo->units_used, ba_mp_geo_system);
  1075. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO) {
  1076. bit_or(total_bitmap, jobinfo->units_used);
  1077. bit_not(total_bitmap);
  1078. tmp_char = ba_node_map_ranged_hostlist(
  1079. total_bitmap, ba_mp_geo_system);
  1080. info("ba_sub_block_in_record: "
  1081. "using cnodes %s on mp %s "
  1082. "leaving '%s' on this midplane "
  1083. "usable in this block (%s)",
  1084. jobinfo->ionode_str,
  1085. ba_mp->coord_str, tmp_char,
  1086. bg_record->bg_block_id);
  1087. xfree(tmp_char);
  1088. }
  1089. for (dim = 0; dim < jobinfo->dim_cnt; dim++) {
  1090. jobinfo->geometry[dim] =
  1091. geo_table->geometry[dim];
  1092. jobinfo->start_loc[dim] = start_loc[dim];
  1093. }
  1094. FREE_NULL_BITMAP(total_bitmap);
  1095. return ba_mp;
  1096. }
  1097. extern int ba_sub_block_in_record_clear(
  1098. bg_record_t *bg_record, struct step_record *step_ptr)
  1099. {
  1100. bitoff_t bit;
  1101. ListIterator itr = NULL;
  1102. ba_mp_t *ba_mp = NULL;
  1103. select_jobinfo_t *jobinfo = NULL;
  1104. char *tmp_char = NULL, *tmp_char2 = NULL, *tmp_char3 = NULL;
  1105. xassert(bg_record);
  1106. xassert(step_ptr);
  1107. jobinfo = step_ptr->select_jobinfo->data;
  1108. xassert(jobinfo);
  1109. /* If we are using the entire block and the block is larger
  1110. * than 1 midplane we don't need to do anything. */
  1111. if ((jobinfo->cnode_cnt == bg_record->cnode_cnt)
  1112. && (bg_record->mp_count != 1))
  1113. return SLURM_SUCCESS;
  1114. if ((bit = bit_ffs(step_ptr->step_node_bitmap)) == -1) {
  1115. error("ba_sub_block_in_record_clear: "
  1116. "we couldn't find any bits set");
  1117. return SLURM_ERROR;
  1118. }
  1119. itr = list_iterator_create(bg_record->ba_mp_list);
  1120. while ((ba_mp = list_next(itr))) {
  1121. if (ba_mp->index != bit)
  1122. continue;
  1123. if (!jobinfo->units_used) {
  1124. /* from older version of slurm */
  1125. error("ba_sub_block_in_record_clear: "
  1126. "didn't have the units_used bitmap "
  1127. "for some reason?");
  1128. break;
  1129. } else if (!ba_mp->cnode_bitmap) {
  1130. /* If the job allocation has already finished
  1131. before processing the job step completion
  1132. this could happen, but it should already be
  1133. checked before it gets here so this should
  1134. never happen, this is just for safely sake.
  1135. */
  1136. error("ba_sub_block_in_record_clear: no cnode_bitmap? "
  1137. "job %u(%p) is in state %s on block %s %u(%p). "
  1138. "This should never happen.",
  1139. step_ptr->job_ptr->job_id, step_ptr->job_ptr,
  1140. job_state_string(step_ptr->job_ptr->job_state
  1141. & (~JOB_CONFIGURING)),
  1142. bg_record->bg_block_id, bg_record->job_running,
  1143. bg_record->job_ptr);
  1144. break;
  1145. }
  1146. bit_not(jobinfo->units_used);
  1147. bit_and(ba_mp->cnode_bitmap, jobinfo->units_used);
  1148. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_ALGO) {
  1149. bitstr_t *total_bitmap = bit_copy(ba_mp->cnode_bitmap);
  1150. if (ba_mp->cnode_err_bitmap) {
  1151. bit_or(total_bitmap, ba_mp->cnode_err_bitmap);
  1152. tmp_char3 = ba_node_map_ranged_hostlist(
  1153. ba_mp->cnode_err_bitmap,
  1154. ba_mp_geo_system);
  1155. }
  1156. bit_not(jobinfo->units_used);
  1157. tmp_char = ba_node_map_ranged_hostlist(
  1158. jobinfo->units_used, ba_mp_geo_system);
  1159. bit_not(total_bitmap);
  1160. tmp_char2 = ba_node_map_ranged_hostlist(
  1161. total_bitmap, ba_mp_geo_system);
  1162. info("ba_sub_block_in_record_clear: "
  1163. "cleared cnodes %s on mp %s, making '%s' "
  1164. "on this midplane usable in this block (%s), "
  1165. "%s are in Software Failure",
  1166. tmp_char, ba_mp->coord_str, tmp_char2,
  1167. bg_record->bg_block_id, tmp_char3);
  1168. xfree(tmp_char);
  1169. xfree(tmp_char2);
  1170. xfree(tmp_char3);
  1171. FREE_NULL_BITMAP(total_bitmap);
  1172. }
  1173. break;
  1174. }
  1175. list_iterator_destroy(itr);
  1176. return SLURM_SUCCESS;
  1177. }
  1178. extern void ba_sync_job_to_block(bg_record_t *bg_record,
  1179. struct job_record *job_ptr)
  1180. {
  1181. struct step_record *step_ptr;
  1182. ListIterator itr;
  1183. ba_mp_t *ba_mp;
  1184. select_jobinfo_t *jobinfo, *step_jobinfo;
  1185. xassert(bg_record);
  1186. xassert(job_ptr);
  1187. if (bg_record->job_list) {
  1188. if (!find_job_in_bg_record(bg_record, job_ptr->job_id)) {
  1189. ba_mp = list_peek(bg_record->ba_mp_list);
  1190. list_append(bg_record->job_list, job_ptr);
  1191. jobinfo = job_ptr->select_jobinfo->data;
  1192. /* If you were switching from no sub-block
  1193. allocations to allowing it, the units_avail
  1194. wouldn't be around for any jobs, but no
  1195. problem since they were always the size of
  1196. the block.
  1197. */
  1198. if (!jobinfo->units_avail) {
  1199. jobinfo->units_avail =
  1200. bit_copy(ba_mp->cnode_bitmap);
  1201. bit_not(jobinfo->units_avail);
  1202. }
  1203. /* Since we are syncing this information lets
  1204. clear out the old stuff. (You need to use
  1205. the jobinfo->units_avail here instead of
  1206. ba_mp->cnode_bitmap because the above trick
  1207. only works when coming from a system where
  1208. no sub-block allocation was allowed.)
  1209. */
  1210. FREE_NULL_BITMAP(jobinfo->units_used);
  1211. jobinfo->units_used = bit_copy(jobinfo->units_avail);
  1212. bit_not(jobinfo->units_used);
  1213. if (bit_overlap(ba_mp->cnode_bitmap,
  1214. jobinfo->units_avail)) {
  1215. error("we have an overlapping job allocation "
  1216. "(%u) mp %s", job_ptr->job_id,
  1217. ba_mp->coord_str);
  1218. }
  1219. bit_or(ba_mp->cnode_bitmap, jobinfo->units_avail);
  1220. /* info("%s now has %d left", ba_mp->coord_str, */
  1221. /* bit_clear_count(ba_mp->cnode_bitmap)); */
  1222. itr = list_iterator_create(job_ptr->step_list);
  1223. while ((step_ptr = list_next(itr))) {
  1224. step_jobinfo = step_ptr->select_jobinfo->data;
  1225. if (bit_overlap(jobinfo->units_used,
  1226. step_jobinfo->units_avail)) {
  1227. error("we have an overlapping step "
  1228. "(%u.%u) mp %s", job_ptr->job_id,
  1229. step_ptr->step_id,
  1230. ba_mp->coord_str);
  1231. }
  1232. bit_or(jobinfo->units_used,
  1233. step_jobinfo->units_avail);
  1234. /* info("allocation %u now has %d left", */
  1235. /* job_ptr->job_id, */
  1236. /* bit_clear_count(jobinfo->units_used));*/
  1237. }
  1238. list_iterator_destroy(itr);
  1239. }
  1240. } else {
  1241. ListIterator ba_itr = NULL;
  1242. bg_record->job_running = job_ptr->job_id;
  1243. bg_record->job_ptr = job_ptr;
  1244. itr = list_iterator_create(job_ptr->step_list);
  1245. while ((step_ptr = list_next(itr))) {
  1246. struct node_record *node_ptr;
  1247. int node_inx;
  1248. jobinfo = step_ptr->select_jobinfo->data;
  1249. if (jobinfo->cnode_cnt == bg_record->cnode_cnt)
  1250. continue;
  1251. if (!ba_itr)
  1252. ba_itr = list_iterator_create(
  1253. bg_record->ba_mp_list);
  1254. else
  1255. list_iterator_reset(ba_itr);
  1256. if (!(node_ptr = find_node_record(
  1257. step_ptr->step_layout->node_list))) {
  1258. error("can't find midplane %s",
  1259. step_ptr->step_layout->node_list);
  1260. continue;
  1261. }
  1262. node_inx = node_ptr - node_record_table_ptr;
  1263. while ((ba_mp = list_next(ba_itr))) {
  1264. if (node_inx != ba_mp->index)
  1265. continue;
  1266. if (!ba_mp->cnode_bitmap)
  1267. ba_mp->cnode_bitmap =
  1268. ba_create_ba_mp_cnode_bitmap(
  1269. bg_record);
  1270. if (!ba_mp->cnode_err_bitmap)
  1271. ba_mp->cnode_err_bitmap = bit_alloc(
  1272. bg_conf->mp_cnode_cnt);
  1273. if (bit_overlap(ba_mp->cnode_bitmap,
  1274. jobinfo->units_used)) {
  1275. error("we have an overlapping step "
  1276. "(%u.%u) mp %s", job_ptr->job_id,
  1277. step_ptr->step_id,
  1278. ba_mp->coord_str);
  1279. }
  1280. bit_or(ba_mp->cnode_bitmap,
  1281. jobinfo->units_used);
  1282. break;
  1283. }
  1284. }
  1285. list_iterator_destroy(itr);
  1286. if (ba_itr)
  1287. list_iterator_destroy(ba_itr);
  1288. }
  1289. }
  1290. extern bitstr_t *ba_create_ba_mp_cnode_bitmap(bg_record_t *bg_record)
  1291. {
  1292. int start, end, ionode_num;
  1293. char *tmp_char, *tmp_char2;
  1294. bitstr_t *cnode_bitmap = bit_alloc(bg_conf->mp_cnode_cnt);
  1295. if (!bg_record->ionode_bitmap
  1296. || ((start = bit_ffs(bg_record->ionode_bitmap)) == -1))
  1297. return cnode_bitmap;
  1298. end = bit_fls(bg_record->ionode_bitmap);
  1299. for (ionode_num = start; ionode_num <= end; ionode_num++) {
  1300. int nc_num, nc_start, nc_end;
  1301. if (!bit_test(bg_record->ionode_bitmap, ionode_num))
  1302. continue;
  1303. nc_start = ionode_num * (int)bg_conf->nc_ratio;
  1304. nc_end = nc_start + (int)bg_conf->nc_ratio;
  1305. for (nc_num = nc_start; nc_num < nc_end; nc_num++)
  1306. ba_node_map_set_range(cnode_bitmap,
  1307. g_nc_coords[nc_num].start,
  1308. g_nc_coords[nc_num].end,
  1309. ba_mp_geo_system);
  1310. }
  1311. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
  1312. tmp_char = ba_node_map_ranged_hostlist(cnode_bitmap,
  1313. ba_mp_geo_system);
  1314. bit_not(cnode_bitmap);
  1315. if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP) {
  1316. tmp_char2 = ba_node_map_ranged_hostlist(cnode_bitmap,
  1317. ba_mp_geo_system);
  1318. info("ba_create_ba_mp_cnode_bitmap: can only use %s cnodes of "
  1319. "this midplane leaving %s unusable", tmp_char, tmp_char2);
  1320. xfree(tmp_char);
  1321. xfree(tmp_char2);
  1322. }
  1323. return cnode_bitmap;
  1324. }
  1325. extern void ba_set_ionode_str(bg_record_t *bg_record)
  1326. {
  1327. int ionode_num, coords[5];
  1328. hostlist_t hl;
  1329. bool set_small = 0;
  1330. if (!bg_record->ionode_bitmap
  1331. || bit_ffs(bg_record->ionode_bitmap) == -1)
  1332. return;
  1333. hl = hostlist_create_dims("", 5);
  1334. for (ionode_num = bit_ffs(bg_record->ionode_bitmap);
  1335. ionode_num <= bit_fls(bg_record->ionode_bitmap);
  1336. ionode_num++) {
  1337. int nc_num, nc_start, nc_end;
  1338. if (!bit_test(bg_record->ionode_bitmap, ionode_num))
  1339. continue;
  1340. nc_start = ionode_num * (int)bg_conf->nc_ratio;
  1341. if (!set_small) {
  1342. int dim;
  1343. set_small = 1;
  1344. for (dim = 0; dim<5; dim++)
  1345. bg_record->start_small[dim] =
  1346. g_nc_coords[nc_start].start[dim];
  1347. }
  1348. nc_end = nc_start + (int)bg_conf->nc_ratio;
  1349. for (nc_num = nc_start; nc_num < nc_end; nc_num++) {
  1350. if (_ba_set_ionode_str_internal(
  1351. 0, coords,
  1352. g_nc_coords[nc_num].start,
  1353. g_nc_coords[nc_num].end,
  1354. hl)
  1355. == -1) {
  1356. hostlist_destroy(hl);
  1357. hl = NULL;
  1358. return;
  1359. }
  1360. }
  1361. }
  1362. bg_record->ionode_str = hostlist_ranged_string_xmalloc_dims(hl, 5, 0);
  1363. //info("iostring is %s", bg_record->ionode_str);
  1364. hostlist_destroy(hl);
  1365. hl = NULL;
  1366. }
  1367. /* Check to see if a job has been added to the bg_record NO_VAL
  1368. * returns the first one on the list. */
  1369. extern struct job_record *ba_remove_job_in_block_job_list(
  1370. bg_record_t *bg_record, struct job_record *in_job_ptr)
  1371. {
  1372. ListIterator itr;
  1373. struct job_record *job_ptr = NULL;
  1374. select_jobinfo_t *jobinfo;
  1375. ba_mp_t *ba_mp;
  1376. char *tmp_char = NULL, *tmp_char2 = NULL, *tmp_char3 = NULL;
  1377. bool bad_magic = 0;
  1378. bitstr_t *used_cnodes = NULL;
  1379. xassert(bg_record);
  1380. if (!bg_record->job_list)
  1381. return NULL;
  1382. ba_mp = list_peek(bg_record->ba_mp_list);
  1383. xassert(ba_mp);
  1384. if (in_job_ptr && in_job_ptr->magic != JOB_MAGIC) {
  1385. /* This can happen if the mmcs job hangs out in the system
  1386. * forever, or at least gets cleared a after the SLURM
  1387. * job is out of the controller.
  1388. */
  1389. bad_magic = 1;
  1390. used_cnodes = bit_copy(ba_mp->cnode_bitmap);
  1391. /* Take out the part (if any) of the midplane that
  1392. isn't part of the block.
  1393. */
  1394. bit_not(ba_mp->cnode_usable_bitmap);
  1395. bit_and(used_cnodes, ba_mp->cnode_usable_bitmap);
  1396. bit_not(ba_mp->cnode_usable_bitmap);
  1397. }
  1398. again:
  1399. itr = list_iterator_create(bg_record->job_list);
  1400. while ((job_ptr = list_next(itr))) {
  1401. if (job_ptr->magic != JOB_MAGIC) {
  1402. error("on block %s we found a job with bad magic",
  1403. bg_record->bg_block_id);
  1404. list_delete_item(itr);
  1405. continue;
  1406. } else if (bad_magic) {
  1407. jobinfo = job_ptr->select_jobinfo->data;
  1408. if (!jobinfo->units_avail) {
  1409. error("ba_remove_job_in_block_job_list: "
  1410. "no units avail bitmap on the jobinfo, "
  1411. "continuing");
  1412. continue;
  1413. }
  1414. bit_not(jobinfo->units_avail);
  1415. bit_and(used_cnodes, jobinfo->units_avail);
  1416. bit_not(jobinfo->units_avail);
  1417. continue;
  1418. }
  1419. if (!in_job_ptr) {
  1420. /* if there is not an in_job_ptr it is because
  1421. the jobs finished while the slurmctld
  1422. wasn't running and somehow the state was
  1423. messed up. So the cpus were never added to
  1424. the mix, so don't remove them. This should
  1425. probably never happen.
  1426. */
  1427. //num_unused_cpus += job_ptr->total_cpus;
  1428. list_delete_item(itr);
  1429. continue;
  1430. }
  1431. if (job_ptr == in_job_ptr) {
  1432. num_unused_cpus += job_ptr->total_cpus;
  1433. list_delete_item(itr);
  1434. break;
  1435. }
  1436. }
  1437. list_iterator_destroy(itr);
  1438. if (!in_job_ptr) {
  1439. if (ba_mp->cnode_usable_bitmap) {
  1440. FREE_NULL_BITMAP(ba_mp->cnode_bitmap);
  1441. ba_mp->cnode_bitmap =
  1442. bit_copy(ba_mp->cnode_usable_bitmap);
  1443. } else if (ba_mp->cnode_bitmap)
  1444. bit_nclear(ba_mp->cnode_bitmap, 0,
  1445. bit_size(ba_mp->cnode_bitmap)-1);
  1446. return NULL;
  1447. } else if (!job_ptr && !bad_magic) {
  1448. /* If the job was not found reset the block with the
  1449. running jobs and go from there.
  1450. */
  1451. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
  1452. error("ba_remove_job_in_block_job_list: "
  1453. "Couldn't remove sub-block job %u from "
  1454. "block %s",
  1455. in_job_ptr->job_id, bg_record->bg_block_id);
  1456. }
  1457. bad_magic = 1;
  1458. used_cnodes = bit_copy(ba_mp->cnode_bitmap);
  1459. goto again;
  1460. }
  1461. if (bad_magic) {
  1462. uint32_t current_cnode_cnt = bit_set_count(used_cnodes);
  1463. num_unused_cpus += current_cnode_cnt * bg_conf->cpu_ratio;
  1464. bit_not(used_cnodes);
  1465. bit_and(ba_mp->cnode_bitmap, used_cnodes);
  1466. bit_not(used_cnodes);
  1467. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
  1468. debug("ba_remove_job_in_block_job_list: "
  1469. "Removing old sub-block job using %d cnodes "
  1470. "from block %s",
  1471. current_cnode_cnt, bg_record->bg_block_id);
  1472. }
  1473. } else {
  1474. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
  1475. debug("ba_remove_job_in_block_job_list: "
  1476. "Removing sub-block job %u from block %s",
  1477. job_ptr->job_id, bg_record->bg_block_id);
  1478. }
  1479. jobinfo = job_ptr->select_jobinfo->data;
  1480. if (!jobinfo->units_avail) {
  1481. error("ba_remove_job_in_block_job_list: "
  1482. "no units avail bitmap on the jobinfo");
  1483. return job_ptr;
  1484. }
  1485. used_cnodes = jobinfo->units_avail;
  1486. }
  1487. bit_not(used_cnodes);
  1488. bit_and(ba_mp->cnode_bitmap, used_cnodes);

Large files are truncated, but you can click here to view the full file