
/src/plugins/select/bluegene/select_bluegene.c

https://github.com/cfenoy/slurm
C | 3388 lines | 2622 code | 355 blank | 411 comment
  1. /*****************************************************************************\
  2. * select_bluegene.c - node selection plugin for Blue Gene system.
  3. *****************************************************************************
  4. * Copyright (C) 2004-2007 The Regents of the University of California.
  5. * Copyright (C) 2008-2011 Lawrence Livermore National Security.
  6. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  7. * Written by Dan Phung <phung4@llnl.gov> Danny Auble <da@llnl.gov>
  8. * CODE-OCEC-09-009. All rights reserved.
  9. *
  10. * This file is part of SLURM, a resource management program.
  11. * For details, see <http://www.schedmd.com/slurmdocs/>.
  12. * Please also read the included file: DISCLAIMER.
  13. *
  14. * SLURM is free software; you can redistribute it and/or modify it under
  15. * the terms of the GNU General Public License as published by the Free
  16. * Software Foundation; either version 2 of the License, or (at your option)
  17. * any later version.
  18. *
  19. * In addition, as a special exception, the copyright holders give permission
  20. * to link the code of portions of this program with the OpenSSL library under
  21. * certain conditions as described in each individual source file, and
  22. * distribute linked combinations including the two. You must obey the GNU
  23. * General Public License in all respects for all of the code used other than
  24. * OpenSSL. If you modify file(s) with this exception, you may extend this
  25. * exception to your version of the file(s), but you are not obligated to do
  26. * so. If you do not wish to do so, delete this exception statement from your
  27. * version. If you delete this exception statement from all source files in
  28. * the program, then also delete it here.
  29. *
  30. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  31. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  32. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  33. * details.
  34. *
  35. * You should have received a copy of the GNU General Public License along
  36. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  37. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  38. \*****************************************************************************/
  39. #include "src/common/slurm_xlator.h"
  40. #include "bg_core.h"
  41. #include "bg_read_config.h"
  42. #include "bg_defined_block.h"
  43. #ifndef HAVE_BG_L_P
  44. # include "ba_bgq/block_allocator.h"
  45. #else
  46. # include "ba/block_allocator.h"
  47. #endif
  48. #include "src/slurmctld/trigger_mgr.h"
  49. #include <fcntl.h>
  50. #define HUGE_BUF_SIZE (1024*16)
  51. /* These are defined here so when we link with something other than
  52. * the slurmctld we will have these symbols defined. They will get
  53. * overwritten when linking with the slurmctld.
  54. */
  55. #if defined (__APPLE__)
  56. slurmctld_config_t slurmctld_config __attribute__((weak_import));
  57. slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
  58. struct node_record *node_record_table_ptr __attribute__((weak_import)) = NULL;
  59. int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER;
  60. List part_list __attribute__((weak_import)) = NULL;
  61. int node_record_count __attribute__((weak_import));
  62. time_t last_node_update __attribute__((weak_import));
  63. time_t last_job_update __attribute__((weak_import));
  64. char *alpha_num __attribute__((weak_import)) =
  65. "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  66. void *acct_db_conn __attribute__((weak_import)) = NULL;
  67. char *slurmctld_cluster_name __attribute__((weak_import)) = NULL;
  68. slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL;
  69. #else
  70. slurmctld_config_t slurmctld_config;
  71. slurm_ctl_conf_t slurmctld_conf;
  72. struct node_record *node_record_table_ptr = NULL;
  73. int bg_recover = NOT_FROM_CONTROLLER;
  74. List part_list = NULL;
  75. int node_record_count;
  76. time_t last_node_update;
  77. time_t last_job_update;
  78. char *alpha_num = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  79. void *acct_db_conn = NULL;
  80. char *slurmctld_cluster_name = NULL;
  81. slurmdb_cluster_rec_t *working_cluster_rec = NULL;
  82. #endif
  83. /*
  84. * These variables are required by the generic plugin interface. If they
  85. * are not found in the plugin, the plugin loader will ignore it.
  86. *
  87. * plugin_name - a string giving a human-readable description of the
  88. * plugin. There is no maximum length, but the symbol must refer to
  89. * a valid string.
  90. *
  91. * plugin_type - a string suggesting the type of the plugin or its
  92. * applicability to a particular form of data or method of data handling.
  93. * If the low-level plugin API is used, the contents of this string are
  94. * unimportant and may be anything. SLURM uses the higher-level plugin
  95. * interface which requires this string to be of the form
  96. *
  97. * <application>/<method>
  98. *
  99. * where <application> is a description of the intended application of
  100. * the plugin (e.g., "select" for SLURM node selection) and <method>
  101. * is a description of how this plugin satisfies that application. SLURM will
  102. * only load select plugins if the plugin_type string has a
  103. * prefix of "select/".
  104. *
  105. * plugin_version - an unsigned 32-bit integer giving the version number
  106. * of the plugin. If major and minor revisions are desired, the major
  107. * version number may be multiplied by a suitable magnitude constant such
  108. * as 100 or 1000. Various SLURM versions will likely require a certain
  109. * minimum version for their plugins as the node selection API matures.
  110. */
  111. const char plugin_name[] = "BlueGene node selection plugin";
  112. const char plugin_type[] = "select/bluegene";
  113. const uint32_t plugin_id = 100;
  114. const uint32_t plugin_version = 200;
  115. /* Global variables */
  116. bg_config_t *bg_conf = NULL;
  117. bg_lists_t *bg_lists = NULL;
  118. time_t last_bg_update;
  119. pthread_mutex_t block_state_mutex = PTHREAD_MUTEX_INITIALIZER;
  120. int blocks_are_created = 0;
  121. int num_unused_cpus = 0;
  122. int num_possible_unused_cpus = 0;
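/* slurmctld lock set, ordered {config, job, node, partition}: take a read lock on jobs only */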
  123. slurmctld_lock_t job_read_lock = {
  124. NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
  125. extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data);
  126. static void _destroy_bg_config(bg_config_t *bg_conf)
  127. {
  128. if (bg_conf) {
  129. if (bg_conf->blrts_list) {
  130. list_destroy(bg_conf->blrts_list);
  131. bg_conf->blrts_list = NULL;
  132. }
  133. xfree(bg_conf->bridge_api_file);
  134. xfree(bg_conf->default_blrtsimage);
  135. xfree(bg_conf->default_linuximage);
  136. xfree(bg_conf->default_mloaderimage);
  137. xfree(bg_conf->default_ramdiskimage);
  138. if (bg_conf->linux_list) {
  139. list_destroy(bg_conf->linux_list);
  140. bg_conf->linux_list = NULL;
  141. }
  142. if (bg_conf->mloader_list) {
  143. list_destroy(bg_conf->mloader_list);
  144. bg_conf->mloader_list = NULL;
  145. }
  146. if (bg_conf->ramdisk_list) {
  147. list_destroy(bg_conf->ramdisk_list);
  148. bg_conf->ramdisk_list = NULL;
  149. }
  150. xfree(bg_conf->slurm_user_name);
  151. xfree(bg_conf->slurm_node_prefix);
  152. xfree(bg_conf);
  153. }
  154. }
  155. static void _destroy_bg_lists(bg_lists_t *bg_lists)
  156. {
  157. if (bg_lists) {
  158. if (bg_lists->booted) {
  159. list_destroy(bg_lists->booted);
  160. bg_lists->booted = NULL;
  161. }
  162. if (bg_lists->job_running) {
  163. list_destroy(bg_lists->job_running);
  164. bg_lists->job_running = NULL;
  165. num_unused_cpus = 0;
  166. }
  167. if (bg_lists->main) {
  168. list_destroy(bg_lists->main);
  169. bg_lists->main = NULL;
  170. }
  171. if (bg_lists->valid_small32) {
  172. list_destroy(bg_lists->valid_small32);
  173. bg_lists->valid_small32 = NULL;
  174. }
  175. if (bg_lists->valid_small64) {
  176. list_destroy(bg_lists->valid_small64);
  177. bg_lists->valid_small64 = NULL;
  178. }
  179. if (bg_lists->valid_small128) {
  180. list_destroy(bg_lists->valid_small128);
  181. bg_lists->valid_small128 = NULL;
  182. }
  183. if (bg_lists->valid_small256) {
  184. list_destroy(bg_lists->valid_small256);
  185. bg_lists->valid_small256 = NULL;
  186. }
  187. xfree(bg_lists);
  188. }
  189. }
  190. #ifdef HAVE_BG
  191. static int _delete_old_blocks(List curr_block_list, List found_block_list)
  192. {
  193. ListIterator itr_curr, itr_found;
  194. bg_record_t *found_record = NULL, *init_record = NULL;
  195. List destroy_list = list_create(NULL);
  196. xassert(curr_block_list);
  197. xassert(found_block_list);
  198. slurm_mutex_lock(&block_state_mutex);
  199. if (!bg_recover) {
  200. info("removing all current blocks (clean start)");
  201. itr_curr = list_iterator_create(curr_block_list);
  202. while ((init_record = list_next(itr_curr))) {
  203. list_remove(itr_curr);
  204. init_record->modifying = 0;
  205. /* The block needs to exist in the main list
  206. * just to make sure we query the state. */
  207. if (!(found_record = find_bg_record_in_list(
  208. bg_lists->main,
  209. init_record->bg_block_id)))
  210. list_push(bg_lists->main, init_record);
  211. else {
  212. destroy_bg_record(init_record);
  213. init_record = found_record;
  214. }
  215. /* Make sure this block isn't in an
  216. error state since if it is it won't
  217. disappear. */
  218. if (init_record->state & BG_BLOCK_ERROR_FLAG)
  219. resume_block(init_record);
  220. list_push(destroy_list, init_record);
  221. }
  222. list_iterator_destroy(itr_curr);
  223. } else {
  224. info("removing unspecified blocks");
  225. itr_curr = list_iterator_create(curr_block_list);
  226. while ((init_record = list_next(itr_curr))) {
  227. itr_found = list_iterator_create(found_block_list);
  228. while ((found_record = list_next(itr_found))) {
  229. if (!strcmp(init_record->bg_block_id,
  230. found_record->bg_block_id)) {
  231. /* don't delete this one */
  232. break;
  233. }
  234. }
  235. list_iterator_destroy(itr_found);
  236. if (found_record == NULL) {
  237. list_remove(itr_curr);
  238. init_record->modifying = 0;
  239. /* The block needs to exist in the main list
  240. * just to make sure we query the state. */
  241. if (!(found_record = find_bg_record_in_list(
  242. bg_lists->main,
  243. init_record->bg_block_id)))
  244. list_push(bg_lists->main, init_record);
  245. else {
  246. destroy_bg_record(init_record);
  247. init_record = found_record;
  248. }
  249. /* Make sure this block isn't in an
  250. error state since if it is it won't
  251. disappear. */
  252. if (init_record->state & BG_BLOCK_ERROR_FLAG)
  253. resume_block(init_record);
  254. /* Since we can't requeue a running
  255. job in the free block function (not
  256. thread safe here) we must do it
  257. now.
  258. */
  259. if ((init_record->job_running > NO_JOB_RUNNING)
  260. || init_record->job_ptr) {
  261. /* Don't worry about dealing
  262. with this job here. Trying
  263. to requeue/cancel now will
  264. cause a race condition
  265. locking up the slurmctld.
  266. It will be handled when the
  267. blocks are synced. This
  268. should only happen if the
  269. bluegene.conf gets changed
  270. and jobs are running on
  271. blocks that don't exist in
  272. the new config (hopefully
  273. rarely).
  274. */
  275. init_record->job_running =
  276. NO_JOB_RUNNING;
  277. init_record->job_ptr = NULL;
  278. } else if (init_record->job_list &&
  279. list_count(init_record->job_list))
  280. list_flush(init_record->job_list);
  281. list_push(destroy_list, init_record);
  282. }
  283. }
  284. list_iterator_destroy(itr_curr);
  285. }
  286. slurm_mutex_unlock(&block_state_mutex);
  287. free_block_list(NO_VAL, destroy_list, 1, 0);
  288. list_destroy(destroy_list);
  289. return SLURM_SUCCESS;
  290. }
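/* (Re)create the global bg_lists under block_state_mutex. Only bg_lists->main
owns its bg_record_t entries (it is created with the destroy_bg_record
destructor); the booted and job_running lists hold bare pointers. */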
  291. static void _set_bg_lists()
  292. {
  293. if (!bg_lists)
  294. bg_lists = xmalloc(sizeof(bg_lists_t));
  295. slurm_mutex_lock(&block_state_mutex);
  296. if (bg_lists->booted)
  297. list_destroy(bg_lists->booted);
  298. bg_lists->booted = list_create(NULL);
  299. if (bg_lists->job_running)
  300. list_destroy(bg_lists->job_running);
  301. bg_lists->job_running = list_create(NULL);
  302. if (bg_lists->main)
  303. list_destroy(bg_lists->main);
  304. bg_lists->main = list_create(destroy_bg_record);
  305. slurm_mutex_unlock(&block_state_mutex);
  306. }
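/* Build a bg_record_t from an unpacked block_info_t. String members are
moved rather than copied (the source pointers are set to NULL) so the
final slurm_free_block_info_members() call cannot free them twice. */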
  307. static bg_record_t *_translate_info_2_record(block_info_t *block_info)
  308. {
  309. bg_record_t *bg_record = NULL;
  310. bitstr_t *mp_bitmap = NULL, *ionode_bitmap = NULL;
  311. mp_bitmap = bit_alloc(node_record_count);
  312. ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
  313. if (block_info->mp_inx
  314. && inx2bitstr(mp_bitmap, block_info->mp_inx) == -1)
  315. error("Job state recovered incompatible with "
  316. "bluegene.conf. mp=%u",
  317. node_record_count);
  318. if (block_info->ionode_inx
  319. && inx2bitstr(ionode_bitmap, block_info->ionode_inx) == -1)
  320. error("Job state recovered incompatible with "
  321. "bluegene.conf. ionodes=%u",
  322. bg_conf->ionodes_per_mp);
  323. bg_record = xmalloc(sizeof(bg_record_t));
  324. bg_record->magic = BLOCK_MAGIC;
  325. bg_record->bg_block_id = block_info->bg_block_id;
  326. block_info->bg_block_id = NULL;
  327. bg_record->mp_str = block_info->mp_str;
  328. block_info->mp_str = NULL;
  329. bg_record->ionode_bitmap = ionode_bitmap;
  330. ionode_bitmap = NULL;
  331. if (block_info->ionode_str) {
  332. ba_set_ionode_str(bg_record);
  333. if (!bg_record->ionode_str
  334. || strcmp(block_info->ionode_str, bg_record->ionode_str)) {
  335. error("block %s didn't compute with the correct "
  336. "ionode_str. Stored as '%s' and "
  337. "came back as '%s'",
  338. bg_record->bg_block_id,
  339. block_info->ionode_str, bg_record->ionode_str);
  340. }
  341. }
  342. bg_record->mp_bitmap = mp_bitmap;
  343. mp_bitmap = NULL;
  344. /* put_block_in_error_state should be
  345. called after the bg_lists->main has been
  346. made. We can't call it here since
  347. this record isn't the record kept
  348. around in bg_lists->main.
  349. */
  350. bg_record->state = block_info->state;
  351. bg_record->cnode_cnt = block_info->cnode_cnt;
  352. bg_record->mp_count = bit_set_count(bg_record->mp_bitmap);
  353. /* Don't copy the job_list from the block_info, we will fill
  354. it in later in the job sync.
  355. */
  356. bg_record->job_running = NO_JOB_RUNNING;
  357. if (bg_conf->sub_blocks && (bg_record->mp_count == 1))
  358. bg_record->job_list = list_create(NULL);
  359. #ifdef HAVE_BGL
  360. bg_record->node_use = block_info->node_use;
  361. #endif
  362. memcpy(bg_record->conn_type, block_info->conn_type,
  363. sizeof(bg_record->conn_type));
  364. bg_record->blrtsimage = block_info->blrtsimage;
  365. block_info->blrtsimage = NULL;
  366. bg_record->linuximage = block_info->linuximage;
  367. block_info->linuximage = NULL;
  368. bg_record->mloaderimage = block_info->mloaderimage;
  369. block_info->mloaderimage = NULL;
  370. bg_record->ramdiskimage = block_info->ramdiskimage;
  371. block_info->ramdiskimage = NULL;
  372. bg_record->reason = block_info->reason;
  373. block_info->reason = NULL;
  374. slurm_free_block_info_members(block_info);
  375. return bg_record;
  376. }
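/* Pack one job running on a block as a block_job_info_t, pulling the user
name and cnode string from the job's select_jobinfo; cnode_inx is
deliberately left unset. */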
  377. static void _local_pack_block_job_info(struct job_record *job_ptr, Buf buffer,
  378. uint16_t protocol_version)
  379. {
  380. block_job_info_t block_job;
  381. select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data;
  382. memset(&block_job, 0, sizeof(block_job_info_t));
  383. block_job.job_id = job_ptr->job_id;
  384. block_job.user_id = job_ptr->user_id;
  385. if (jobinfo) {
  386. block_job.user_name = jobinfo->user_name;
  387. block_job.cnodes = jobinfo->ionode_str;
  388. } else
  389. error("NO JOBINFO for job %u magic %u!!!!!!!!!!!!!!",
  390. job_ptr->job_id, job_ptr->magic);
  391. /* block_job.cnode_inx -- try not to set */
  392. slurm_pack_block_job_info(&block_job, buffer, protocol_version);
  393. }
394. /* Pack all relevant information about a block */
  395. /* NOTE: There is a matching pack function in
  396. * common/slurm_protocol_pack.c dealing with the block_info_t
  397. * structure there. If anything changes here please update that as well.
  398. * The unpack for this is in common/slurm_protocol_pack.c
  399. */
  400. static void _pack_block(bg_record_t *bg_record, Buf buffer,
  401. uint16_t protocol_version)
  402. {
  403. #ifdef HAVE_BGQ
  404. int dim;
  405. #endif
  406. uint32_t count = NO_VAL, running_job = 0;
  407. struct job_record *job_ptr;
  408. ListIterator itr;
  409. if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) {
  410. packstr(bg_record->bg_block_id, buffer);
  411. packstr(bg_record->blrtsimage, buffer);
  412. pack_bit_fmt(bg_record->mp_bitmap, buffer);
  413. #ifdef HAVE_BGQ
  414. pack32(SYSTEM_DIMENSIONS, buffer);
  415. for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
  416. pack16(bg_record->conn_type[dim], buffer);
  417. #else
  418. pack32(1, buffer); /* for dimensions of conn_type */
  419. pack16(bg_record->conn_type[0], buffer);
  420. #endif
  421. packstr(bg_record->ionode_str, buffer);
  422. pack_bit_fmt(bg_record->ionode_bitmap, buffer);
  423. if (bg_record->job_list)
  424. count = list_count(bg_record->job_list);
  425. if (count && count != NO_VAL) {
  426. pack32(count, buffer);
  427. itr = list_iterator_create(bg_record->job_list);
  428. while ((job_ptr = list_next(itr))) {
  429. if (job_ptr->magic != JOB_MAGIC) {
  430. error("_pack_block: "
  431. "bad magic found when "
  432. "packing block %s",
  433. bg_record->bg_block_id);
  434. list_delete_item(itr);
  435. slurm_pack_block_job_info(
  436. NULL, buffer,
  437. protocol_version);
  438. continue;
  439. }
  440. _local_pack_block_job_info(
  441. job_ptr, buffer, protocol_version);
  442. }
  443. list_iterator_destroy(itr);
  444. } else if (bg_record->job_ptr
  445. && (bg_record->job_ptr->magic == JOB_MAGIC)) {
  446. pack32(1, buffer);
  447. _local_pack_block_job_info(
  448. bg_record->job_ptr, buffer, protocol_version);
  449. } else
  450. pack32(count, buffer);
  451. count = NO_VAL;
  452. packstr(bg_record->linuximage, buffer);
  453. packstr(bg_record->mloaderimage, buffer);
  454. packstr(bg_record->mp_str, buffer);
  455. pack32(bg_record->cnode_cnt, buffer);
  456. pack32(bg_record->cnode_err_cnt, buffer);
  457. pack16((uint16_t)bg_record->node_use, buffer);
  458. packstr(bg_record->ramdiskimage, buffer);
  459. packstr(bg_record->reason, buffer);
  460. pack16((uint16_t)bg_record->state, buffer);
  461. } else if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
  462. packstr(bg_record->bg_block_id, buffer);
  463. packstr(bg_record->blrtsimage, buffer);
  464. pack_bit_fmt(bg_record->mp_bitmap, buffer);
  465. #ifdef HAVE_BGQ
  466. pack32(SYSTEM_DIMENSIONS, buffer);
  467. for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
  468. pack16(bg_record->conn_type[dim], buffer);
  469. #else
  470. pack32(1, buffer); /* for dimensions of conn_type */
  471. pack16(bg_record->conn_type[0], buffer);
  472. #endif
  473. packstr(bg_record->ionode_str, buffer);
  474. pack_bit_fmt(bg_record->ionode_bitmap, buffer);
  475. if (bg_record->job_list)
  476. count = list_count(bg_record->job_list);
  477. pack32(count, buffer);
  478. if (count && count != NO_VAL) {
  479. itr = list_iterator_create(bg_record->job_list);
  480. while ((job_ptr = list_next(itr))) {
  481. if (job_ptr->magic != JOB_MAGIC) {
  482. error("_pack_block 2.3: "
  483. "bad magic found when "
  484. "packing block %s",
  485. bg_record->bg_block_id);
  486. list_delete_item(itr);
  487. continue;
  488. }
  489. _local_pack_block_job_info(
  490. job_ptr, buffer, protocol_version);
  491. }
  492. list_iterator_destroy(itr);
  493. }
  494. if ((count == 1) && running_job)
  495. pack32((uint32_t)running_job, buffer);
  496. else
  497. pack32((uint32_t)bg_record->job_running, buffer);
  498. count = NO_VAL;
  499. packstr(bg_record->linuximage, buffer);
  500. packstr(bg_record->mloaderimage, buffer);
  501. packstr(bg_record->mp_str, buffer);
  502. packnull(buffer); /* for mp_used_str */
  503. pack32((uint32_t)bg_record->cnode_cnt, buffer);
  504. pack16((uint16_t)bg_record->node_use, buffer);
  505. packnull(buffer); /* for user_name */
  506. packstr(bg_record->ramdiskimage, buffer);
  507. packstr(bg_record->reason, buffer);
  508. pack16((uint16_t)bg_record->state, buffer);
  509. packnull(buffer); /* for mp_used_inx */
  510. }
  511. }
  512. /* Pack all extra information about a block (Only needed for saving state.) */
  513. static void _pack_block_ext(bg_record_t *bg_record, Buf buffer,
  514. uint16_t protocol_version)
  515. {
  516. ListIterator itr;
  517. ba_mp_t *ba_mp;
  518. uint32_t count = NO_VAL;
  519. int i;
  520. xassert(bg_record);
  521. if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
  522. if (bg_record->ba_mp_list)
  523. count = list_count(bg_record->ba_mp_list);
  524. pack32(count, buffer);
  525. if (count && count != NO_VAL) {
  526. itr = list_iterator_create(bg_record->ba_mp_list);
  527. while ((ba_mp = list_next(itr)))
  528. pack_ba_mp(ba_mp, buffer, protocol_version);
  529. list_iterator_destroy(itr);
  530. }
  531. pack32(bg_record->cpu_cnt, buffer);
  532. for (i=0; i<SYSTEM_DIMENSIONS; i++) {
  533. pack16(bg_record->geo[i], buffer);
  534. pack16(bg_record->start[i], buffer);
  535. }
  536. pack16(bg_record->full_block, buffer);
  537. pack32(bg_record->switch_count, buffer);
  538. } else {
  539. /* didn't exist before 2.3 */
  540. }
  541. }
  542. /* UNPack all extra information about a block */
  543. static int _unpack_block_ext(bg_record_t *bg_record, Buf buffer,
  544. uint16_t protocol_version)
  545. {
  546. ba_mp_t *ba_mp;
  547. uint32_t count = NO_VAL;
  548. int i;
  549. uint16_t temp16;
  550. xassert(bg_record);
  551. if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
  552. safe_unpack32(&count, buffer);
  553. if (count == NO_VAL) {
  554. error("_unpack_block_ext: bg_record record has no "
  555. "mp_list");
  556. goto unpack_error;
  557. }
  558. bg_record->ba_mp_list = list_create(destroy_ba_mp);
  559. for (i=0; i<count; i++) {
  560. if (unpack_ba_mp(&ba_mp, buffer, protocol_version)
  561. == SLURM_ERROR)
  562. goto unpack_error;
  563. list_append(bg_record->ba_mp_list, ba_mp);
  564. }
  565. safe_unpack32(&bg_record->cpu_cnt, buffer);
  566. for (i=0; i<SYSTEM_DIMENSIONS; i++) {
  567. safe_unpack16(&bg_record->geo[i], buffer);
  568. safe_unpack16(&bg_record->start[i], buffer);
  569. }
  570. safe_unpack16(&temp16, buffer);
  571. bg_record->full_block = temp16;
572. safe_unpack32(&bg_record->switch_count, buffer);
  573. } else {
  574. /* packing didn't exist before 2.3, so set things up
  575. * to go forward */
  576. if (bg_conf->mp_cnode_cnt > bg_record->cnode_cnt) {
  577. bg_record->cpu_cnt = bg_conf->cpus_per_mp /
  578. (bg_conf->mp_cnode_cnt / bg_record->cnode_cnt);
  579. } else {
  580. bg_record->cpu_cnt = bg_conf->cpus_per_mp
  581. * bg_record->mp_count;
  582. }
  583. process_nodes(bg_record, true);
  584. }
  585. return SLURM_SUCCESS;
  586. unpack_error:
  587. error("Problem unpacking extended block info for %s, "
  588. "removing from list",
  589. bg_record->bg_block_id);
  590. return SLURM_ERROR;
  591. }
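/* Recover blocks from the block_state file under dir_name (written earlier
by select_p_state_save). The version header must match BLOCK_STATE_VERSION;
each saved block is rebuilt as a bg_record_t, its midplanes are re-allocated
in the block allocator, and the record is pushed onto curr_block_list. */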
  592. static int _load_state_file(List curr_block_list, char *dir_name)
  593. {
  594. int state_fd, i;
  595. char *state_file = NULL;
  596. Buf buffer = NULL;
  597. char *data = NULL;
  598. int data_size = 0;
  599. block_info_msg_t *block_ptr = NULL;
  600. bg_record_t *bg_record = NULL;
  601. char temp[256];
  602. List results = NULL;
  603. int data_allocated, data_read = 0;
  604. char *ver_str = NULL;
  605. uint32_t ver_str_len;
  606. char *name = NULL;
  607. struct part_record *part_ptr = NULL;
  608. bitstr_t *usable_mp_bitmap = NULL;
  609. ListIterator itr = NULL;
  610. uint16_t protocol_version = (uint16_t)NO_VAL;
  611. uint32_t record_count;
  612. xassert(curr_block_list);
  613. xassert(dir_name);
  614. state_file = xstrdup(dir_name);
  615. xstrcat(state_file, "/block_state");
  616. state_fd = open(state_file, O_RDONLY);
  617. if (state_fd < 0) {
  618. error("No block state file (%s) to recover", state_file);
  619. xfree(state_file);
  620. return SLURM_SUCCESS;
  621. } else {
  622. data_allocated = BUF_SIZE;
  623. data = xmalloc(data_allocated);
  624. while (1) {
  625. data_read = read(state_fd, &data[data_size],
  626. BUF_SIZE);
  627. if (data_read < 0) {
  628. if (errno == EINTR)
  629. continue;
  630. else {
  631. error("Read error on %s: %m",
  632. state_file);
  633. break;
  634. }
  635. } else if (data_read == 0) /* eof */
  636. break;
  637. data_size += data_read;
  638. data_allocated += data_read;
  639. xrealloc(data, data_allocated);
  640. }
  641. close(state_fd);
  642. }
  643. xfree(state_file);
  644. buffer = create_buf(data, data_size);
  645. safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
  646. debug3("Version string in block_state header is %s", ver_str);
  647. if (ver_str) {
  648. if (!strcmp(ver_str, BLOCK_STATE_VERSION)) {
  649. protocol_version = SLURM_PROTOCOL_VERSION;
  650. }
  651. }
  652. if (protocol_version == (uint16_t)NO_VAL) {
  653. error("***********************************************");
  654. error("Can not recover block state, "
  655. "data version incompatible");
  656. error("***********************************************");
  657. xfree(ver_str);
  658. free_buf(buffer);
  659. return EFAULT;
  660. }
  661. xfree(ver_str);
  662. safe_unpack32(&record_count, buffer);
  663. slurm_mutex_lock(&block_state_mutex);
  664. reset_ba_system(true);
  665. /* Locks are already in place to protect part_list here */
  666. usable_mp_bitmap = bit_alloc(node_record_count);
  667. itr = list_iterator_create(part_list);
  668. while ((part_ptr = list_next(itr))) {
  669. /* we only want to use mps that are in partitions */
  670. if (!part_ptr->node_bitmap) {
  671. debug4("Partition %s doesn't have any nodes in it.",
  672. part_ptr->name);
  673. continue;
  674. }
  675. bit_or(usable_mp_bitmap, part_ptr->node_bitmap);
  676. }
  677. list_iterator_destroy(itr);
  678. if (bit_ffs(usable_mp_bitmap) == -1) {
  679. fatal("We don't have any nodes in any partitions. "
  680. "Can't create blocks. "
  681. "Please check your slurm.conf.");
  682. }
  683. for (i=0; i<record_count; i++) {
  684. block_info_t block_info;
  685. if (slurm_unpack_block_info_members(
  686. &block_info, buffer, protocol_version))
  687. goto unpack_error;
  688. if (!(bg_record = _translate_info_2_record(&block_info)))
  689. continue;
  690. if (_unpack_block_ext(bg_record, buffer, protocol_version)
  691. != SLURM_SUCCESS) {
  692. destroy_bg_record(bg_record);
  693. goto unpack_error;
  694. }
  695. /* This means the block here wasn't able to be
  696. processed correctly, so don't add.
  697. */
  698. if (!bg_record->mp_count) {
  699. error("block %s(%s) can't be made in the current "
  700. "system, but was around in the previous one.",
  701. bg_record->bg_block_id, bg_record->mp_str);
  702. list_destroy(results);
  703. destroy_bg_record(bg_record);
  704. continue;
  705. }
  706. if ((bg_conf->layout_mode == LAYOUT_OVERLAP)
  707. || bg_record->full_block)
  708. reset_ba_system(false);
  709. if (bg_record->ba_mp_list) {
  710. /* only do this for blocks bigger than 1
  711. midplane */
  712. if (bg_record->cpu_cnt >= bg_conf->cpus_per_mp)
  713. if (check_and_set_mp_list(bg_record->ba_mp_list)
  714. == SLURM_ERROR)
  715. error("something happened in the "
  716. "load of %s, keeping it "
  717. "around though",
  718. bg_record->bg_block_id);
  719. } else {
  720. select_ba_request_t ba_request;
  721. ba_set_removable_mps(usable_mp_bitmap, 1);
  722. /* we want the mps that aren't
  723. * in this record to mark them as used
  724. */
  725. if (ba_set_removable_mps(bg_record->mp_bitmap, 1)
  726. != SLURM_SUCCESS)
  727. fatal("1 It doesn't seem we have a bitmap "
  728. "for %s",
  729. bg_record->bg_block_id);
  730. #ifdef HAVE_BGQ
  731. results = list_create(destroy_ba_mp);
  732. #else
  733. results = list_create(NULL);
  734. #endif
  735. /* info("adding back %s %s", bg_record->bg_block_id, */
  736. /* bg_record->mp_str); */
  737. memset(&ba_request, 0, sizeof(ba_request));
  738. memcpy(ba_request.start, bg_record->start,
  739. sizeof(bg_record->start));
  740. memcpy(ba_request.geometry, bg_record->geo,
  741. sizeof(bg_record->geo));
  742. memcpy(ba_request.conn_type, bg_record->conn_type,
  743. sizeof(bg_record->conn_type));
  744. ba_request.start_req = 1;
  745. name = set_bg_block(results, &ba_request);
  746. ba_reset_all_removed_mps();
  747. if (!name) {
  748. error("I was unable to make the "
  749. "requested block.");
  750. list_destroy(results);
  751. destroy_bg_record(bg_record);
  752. bg_record = NULL;
  753. continue;
  754. }
  755. snprintf(temp, sizeof(temp), "%s%s",
  756. bg_conf->slurm_node_prefix,
  757. name);
  758. xfree(name);
  759. if (strcmp(temp, bg_record->mp_str)) {
  760. fatal("bad wiring in preserved state "
  761. "(found %s, but allocated %s) "
  762. "YOU MUST COLDSTART",
  763. bg_record->mp_str, temp);
  764. }
  765. if (bg_record->ba_mp_list)
  766. list_destroy(bg_record->ba_mp_list);
  767. #ifdef HAVE_BGQ
  768. bg_record->ba_mp_list = results;
  769. results = NULL;
  770. #else
  771. bg_record->ba_mp_list = list_create(destroy_ba_mp);
  772. copy_node_path(results, &bg_record->ba_mp_list);
  773. list_destroy(results);
  774. #endif
  775. }
  776. // bridge_block_create(bg_record);
  777. list_push(curr_block_list, bg_record);
  778. }
  779. FREE_NULL_BITMAP(usable_mp_bitmap);
  780. sort_bg_record_inc_size(curr_block_list);
  781. slurm_mutex_unlock(&block_state_mutex);
  782. info("Recovered %d blocks", list_count(curr_block_list));
  783. slurm_free_block_info_msg(block_ptr);
  784. free_buf(buffer);
  785. return SLURM_SUCCESS;
  786. unpack_error:
  787. FREE_NULL_BITMAP(usable_mp_bitmap);
  788. slurm_mutex_unlock(&block_state_mutex);
  789. error("Incomplete block data checkpoint file");
  790. free_buf(buffer);
  791. return SLURM_FAILURE;
  792. }
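/* Log a block recovered from saved state, rebuild any missing cnode bitmaps
for single-midplane blocks when sub-block allocations are enabled, and put
the block back in the error or booted lists according to its saved state. */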
  793. static void _handle_existing_block(bg_record_t *bg_record)
  794. {
  795. char *conn_type;
  796. char node_str[256];
  797. xassert(bg_record);
  798. format_node_name(bg_record, node_str, sizeof(node_str));
  799. conn_type = conn_type_string_full(bg_record->conn_type);
  800. info("Existing: BlockID:%s Nodes:%s Conn:%s",
  801. bg_record->bg_block_id, node_str, conn_type);
  802. xfree(conn_type);
  803. /* Sanity check to make sure we have the correct setup from
  804. the save.
  805. */
  806. if (bg_conf->sub_blocks && bg_record->mp_count == 1) {
  807. ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
  808. xassert(ba_mp);
  809. if (!ba_mp->cnode_bitmap) {
  810. error("_handle_existing_block: No cnode_bitmap "
  811. "for block %s, creating it",
  812. bg_record->bg_block_id);
  813. if ((ba_mp->cnode_bitmap =
  814. ba_create_ba_mp_cnode_bitmap(bg_record))) {
  815. if (!ba_mp->cnode_err_bitmap)
  816. ba_mp->cnode_err_bitmap =
  817. bit_alloc(bg_conf->
  818. mp_cnode_cnt);
  819. FREE_NULL_BITMAP(ba_mp->cnode_usable_bitmap);
  820. ba_mp->cnode_usable_bitmap =
  821. bit_copy(ba_mp->cnode_bitmap);
  822. }
  823. }
  824. }
  825. if (bg_record->state & BG_BLOCK_ERROR_FLAG)
  826. put_block_in_error_state(bg_record, NULL);
  827. else if (((bg_record->state == BG_BLOCK_INITED)
  828. || (bg_record->state == BG_BLOCK_BOOTING))
  829. && !block_ptr_exist_in_list(bg_lists->booted, bg_record))
  830. list_push(bg_lists->booted, bg_record);
  831. }
  832. /*
  833. * _validate_config_blocks - Match slurm configuration information with
  834. * current BG block configuration.
  835. * IN/OUT curr_block_list - List of blocks already existing on the system.
  836. * IN/OUT found_block_list - List of blocks found on the system
  837. * that are listed in the bluegene.conf.
  838. * NOTE: Both of the lists above should be created with list_create(NULL)
  839. * since the bg_lists->main will contain the complete list of pointers
  840. * and be destroyed with it.
  841. *
  842. * RET - SLURM_SUCCESS if no blocks need to be deleted, else an error
  843. * code. Writes bg_block_id into bg_lists->main records.
  844. */
  845. static int _validate_config_blocks(List curr_block_list,
  846. List found_block_list, char *dir)
  847. {
  848. int rc = SLURM_ERROR;
  849. bg_record_t* bg_record = NULL;
  850. bg_record_t* init_bg_record = NULL;
  851. int full_created = 0;
  852. ListIterator itr_conf;
  853. ListIterator itr_curr;
  854. char tmp_char[256];
  855. int dim;
  856. xassert(curr_block_list);
  857. xassert(found_block_list);
  858. /* read in state from last run. */
  859. if (bg_recover)
  860. rc = _load_state_file(curr_block_list, dir);
  861. #ifndef HAVE_BG_FILES
  862. if (rc != SLURM_SUCCESS)
  863. return rc;
  864. #endif
865. /* read current bg block info into curr_block_list. This
866. * happens in the state load before this in emulation mode. */
  867. if (bridge_blocks_load_curr(curr_block_list) == SLURM_ERROR)
  868. return SLURM_ERROR;
  869. if (!bg_recover)
  870. return SLURM_ERROR;
  871. #ifdef HAVE_BG_FILES
  872. /* Since we just checked all the blocks from state against that
  873. in the database we can now check to see if there were once
  874. blocks that are now gone from the database and remove them
  875. from the list.
  876. */
  877. itr_curr = list_iterator_create(curr_block_list);
  878. while ((bg_record = list_next(itr_curr))) {
  879. if (bg_record->modifying) {
  880. bg_record->modifying = 0;
  881. continue;
  882. }
  883. error("Found state for block %s, but that "
  884. "block isn't in the system anymore, removing",
  885. bg_record->bg_block_id);
  886. list_delete_item(itr_curr);
  887. }
  888. list_iterator_destroy(itr_curr);
  889. #endif
  890. if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
  891. /* Since we don't read the blocks in a Dynamic system
  892. we can just transfer the list here and return.
  893. */
  894. list_transfer(bg_lists->main, curr_block_list);
  895. itr_conf = list_iterator_create(bg_lists->main);
  896. while ((bg_record = list_next(itr_conf)))
  897. _handle_existing_block(bg_record);
  898. list_iterator_destroy(itr_conf);
  899. return SLURM_SUCCESS;
  900. }
  901. /* Only when we are looking at a non-dynamic system do we need
902. to go through the following logic to make sure things are in sync.
  903. */
  904. itr_curr = list_iterator_create(curr_block_list);
  905. itr_conf = list_iterator_create(bg_lists->main);
  906. while ((bg_record = list_next(itr_conf))) {
  907. list_iterator_reset(itr_curr);
  908. while ((init_bg_record = list_next(itr_curr))) {
  909. if (!bit_equal(bg_record->mp_bitmap,
  910. init_bg_record->mp_bitmap))
  911. continue; /* wrong nodes */
  912. if (!bit_equal(bg_record->ionode_bitmap,
  913. init_bg_record->ionode_bitmap))
  914. continue;
  915. if ((bg_record->conn_type[0] < SELECT_SMALL)
  916. && (init_bg_record->conn_type[0] < SELECT_SMALL)) {
  917. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  918. /* Only look at how far we
  919. have set. The bg_record
920. should have been set up
  921. correctly in the
  922. parse_blockreq() function.
  923. */
  924. if (bg_record->conn_type[dim] ==
  925. (uint16_t)NO_VAL) {
  926. dim = SYSTEM_DIMENSIONS;
  927. break;
  928. }
  929. if (bg_record->conn_type[dim] !=
  930. init_bg_record->conn_type[dim])
  931. break; /* wrong conn_type */
  932. }
  933. if (dim < SYSTEM_DIMENSIONS)
  934. continue;
  935. }
  936. copy_bg_record(init_bg_record, bg_record);
  937. /* remove from the curr list since we just
938. matched it; no reason to keep it around
  939. anymore */
  940. list_delete_item(itr_curr);
  941. break;
  942. }
  943. if (!bg_record->bg_block_id) {
  944. format_node_name(bg_record, tmp_char,
  945. sizeof(tmp_char));
  946. info("Block found in bluegene.conf to be "
  947. "created: Nodes:%s",
  948. tmp_char);
  949. } else {
  950. if (bg_record->full_block)
  951. full_created = 1;
  952. list_push(found_block_list, bg_record);
  953. _handle_existing_block(bg_record);
  954. }
  955. }
  956. if (!full_created) {
  957. list_iterator_reset(itr_curr);
  958. while ((init_bg_record = list_next(itr_curr))) {
  959. if (init_bg_record->full_block) {
  960. list_remove(itr_curr);
  961. bg_record = init_bg_record;
  962. list_append(bg_lists->main, bg_record);
  963. list_push(found_block_list, bg_record);
  964. _handle_existing_block(bg_record);
  965. break;
  966. }
  967. }
  968. }
  969. list_iterator_destroy(itr_conf);
  970. list_iterator_destroy(itr_curr);
  971. if (!list_count(curr_block_list))
  972. rc = SLURM_SUCCESS;
  973. else
  974. rc = SLURM_ERROR;
  975. return rc;
  976. }
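/* Build a sorted list of config_key_pair_t entries describing the loaded
bluegene.conf settings; the caller owns the returned list. */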
  977. static List _get_config(void)
  978. {
  979. config_key_pair_t *key_pair;
  980. List my_list = list_create(destroy_config_key_pair);
  981. if (!my_list)
  982. fatal("malloc failure on list_create");
  983. key_pair = xmalloc(sizeof(config_key_pair_t));
  984. key_pair->name = xstrdup("DefaultConnType");
  985. key_pair->value = conn_type_string_full(bg_conf->default_conn_type);
  986. list_append(my_list, key_pair);
  987. #ifndef HAVE_BG_FILES
  988. key_pair = xmalloc(sizeof(config_key_pair_t));
  989. key_pair->name = xstrdup("Emulated");
  990. key_pair->value = xstrdup("yes");
  991. list_append(my_list, key_pair);
  992. #endif
  993. key_pair = xmalloc(sizeof(config_key_pair_t));
  994. key_pair->name = xstrdup("MaxBlockInError");
  995. key_pair->value = xstrdup_printf("%u", bg_conf->max_block_err);
  996. list_append(my_list, key_pair);
  997. key_pair = xmalloc(sizeof(config_key_pair_t));
  998. key_pair->name = xstrdup("MidPlaneNodeCnt");
  999. key_pair->value = xstrdup_printf("%u", bg_conf->mp_cnode_cnt);
  1000. list_append(my_list, key_pair);
  1001. key_pair = xmalloc(sizeof(config_key_pair_t));
  1002. key_pair->name = xstrdup("NodeCPUCnt");
  1003. key_pair->value = xstrdup_printf("%u", bg_conf->cpu_ratio);
  1004. list_append(my_list, key_pair);
  1005. #ifdef HAVE_BGL
  1006. key_pair = xmalloc(sizeof(config_key_pair_t));
  1007. key_pair->name = xstrdup("BlrtsImage");
  1008. key_pair->value = xstrdup(bg_conf->default_blrtsimage);
  1009. list_append(my_list, key_pair);
  1010. key_pair = xmalloc(sizeof(config_key_pair_t));
  1011. key_pair->name = xstrdup("LinuxImage");
  1012. key_pair->value = xstrdup(bg_conf->default_linuximage);
  1013. list_append(my_list, key_pair);
  1014. key_pair = xmalloc(sizeof(config_key_pair_t));
  1015. key_pair->name = xstrdup("RamDiskImage");
  1016. key_pair->value = xstrdup(bg_conf->default_ramdiskimage);
  1017. list_append(my_list, key_pair);
  1018. #elif defined HAVE_BGP
  1019. key_pair = xmalloc(sizeof(config_key_pair_t));
  1020. key_pair->name = xstrdup("CnloadImage");
  1021. key_pair->value = xstrdup(bg_conf->default_linuximage);
  1022. list_append(my_list, key_pair);
  1023. key_pair = xmalloc(sizeof(config_key_pair_t));
  1024. key_pair->name = xstrdup("IoloadImage");
  1025. key_pair->value = xstrdup(bg_conf->default_ramdiskimage);
  1026. list_append(my_list, key_pair);
  1027. #endif
  1028. key_pair = xmalloc(sizeof(config_key_pair_t));
  1029. key_pair->name = xstrdup("BridgeAPILogFile");
  1030. key_pair->value = xstrdup(bg_conf->bridge_api_file);
  1031. list_append(my_list, key_pair);
  1032. key_pair = xmalloc(sizeof(config_key_pair_t));
  1033. key_pair->name = xstrdup("BridgeAPIVerbose");
  1034. key_pair->value = xstrdup_printf("%u", bg_conf->bridge_api_verb);
  1035. list_append(my_list, key_pair);
  1036. if (bg_conf->deny_pass) {
  1037. key_pair = xmalloc(sizeof(config_key_pair_t));
  1038. key_pair->name = xstrdup("DenyPassThrough");
  1039. if (bg_conf->deny_pass & PASS_DENY_A)
  1040. xstrcat(key_pair->value, "A,");
  1041. if (bg_conf->deny_pass & PASS_DENY_X)
  1042. xstrcat(key_pair->value, "X,");
  1043. if (bg_conf->deny_pass & PASS_DENY_Y)
  1044. xstrcat(key_pair->value, "Y,");
  1045. if (bg_conf->deny_pass & PASS_DENY_Z)
  1046. xstrcat(key_pair->value, "Z,");
  1047. if (key_pair->value)
  1048. key_pair->value[strlen(key_pair->value)-1] = '\0';
  1049. list_append(my_list, key_pair);
  1050. }
  1051. key_pair = xmalloc(sizeof(config_key_pair_t));
  1052. key_pair->name = xstrdup("IONodesPerMP");
  1053. key_pair->value = xstrdup_printf("%u", bg_conf->ionodes_per_mp);
  1054. list_append(my_list, key_pair);
  1055. key_pair = xmalloc(sizeof(config_key_pair_t));
  1056. key_pair->name = xstrdup("LayoutMode");
  1057. switch(bg_conf->layout_mode) {
  1058. case LAYOUT_STATIC:
  1059. key_pair->value = xstrdup("Static");
  1060. break;
  1061. case LAYOUT_OVERLAP:
  1062. key_pair->value = xstrdup("Overlap");
  1063. break;
  1064. case LAYOUT_DYNAMIC:
  1065. key_pair->value = xstrdup("Dynamic");
  1066. break;
  1067. default:
  1068. key_pair->value = xstrdup("Unknown");
  1069. break;
  1070. }
  1071. list_append(my_list, key_pair);
  1072. key_pair = xmalloc(sizeof(config_key_pair_t));
  1073. key_pair->name = xstrdup("MloaderImage");
  1074. key_pair->value = xstrdup(bg_conf->default_mloaderimage);
  1075. list_append(my_list, key_pair);
  1076. key_pair = xmalloc(sizeof(config_key_pair_t));
  1077. key_pair->name = xstrdup("NodeCardNodeCnt");
  1078. key_pair->value = xstrdup_printf("%u", bg_conf->nodecard_cnode_cnt);
  1079. list_append(my_list, key_pair);
  1080. if (bg_conf->sub_blocks) {
  1081. key_pair = xmalloc(sizeof(config_key_pair_t));
  1082. key_pair->name = xstrdup("AllowSubBlockAllocations");
  1083. key_pair->value = xstrdup("Yes");
  1084. list_append(my_list, key_pair);
  1085. }
  1086. if (bg_conf->sub_mp_sys) {
  1087. key_pair = xmalloc(sizeof(config_key_pair_t));
  1088. key_pair->name = xstrdup("SubMidplaneSystem");
  1089. key_pair->value = xstrdup("Yes");
  1090. list_append(my_list, key_pair);
  1091. }
  1092. list_sort(my_list, (ListCmpF) sort_key_pairs);
  1093. return my_list;
  1094. }
  1095. #endif
  1096. /*
  1097. * init() is called when the plugin is loaded, before any other functions
  1098. * are called. Put global initialization here.
  1099. */
  1100. extern int init(void)
  1101. {
  1102. #ifdef HAVE_BG
  1103. if (!bg_conf) {
  1104. /* This is needed on all systems where srun wraps the
  1105. bluegene calling program (i.e. runjob).
  1106. */
  1107. bg_conf = xmalloc(sizeof(bg_config_t));
  1108. /* set some defaults for most systems */
  1109. bg_conf->actual_cnodes_per_mp = bg_conf->mp_cnode_cnt = 512;
  1110. bg_conf->quarter_cnode_cnt = 128;
  1111. bg_conf->nodecard_cnode_cnt = 32;
  1112. bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt
  1113. / bg_conf->nodecard_cnode_cnt;
  1114. }
  1115. if (bg_recover != NOT_FROM_CONTROLLER) {
  1116. #if defined HAVE_BG_L_P && (SYSTEM_DIMENSIONS != 3)
  1117. fatal("SYSTEM_DIMENSIONS value (%d) invalid for BlueGene",
  1118. SYSTEM_DIMENSIONS);
  1119. #elif defined HAVE_BGQ && (SYSTEM_DIMENSIONS != 4)
  1120. fatal("SYSTEM_DIMENSIONS value (%d) invalid for BGQ",
  1121. SYSTEM_DIMENSIONS);
  1122. #endif
  1123. #if defined HAVE_BG_FILES && defined HAVE_BG_L_P
  1124. #ifdef HAVE_BGL
  1125. if (!getenv("CLASSPATH") || !getenv("DB2INSTANCE")
  1126. || !getenv("VWSPATH"))
  1127. fatal("db2profile has not been "
  1128. "run to setup DB2 environment");
  1129. if ((SELECT_COPROCESSOR_MODE != RM_PARTITION_COPROCESSOR_MODE)
  1130. || (SELECT_VIRTUAL_NODE_MODE
  1131. != RM_PARTITION_VIRTUAL_NODE_MODE))
  1132. fatal("enum node_use_type out of sync with rm_api.h");
  1133. #endif
  1134. if ((SELECT_MESH != RM_MESH)
  1135. || (SELECT_TORUS != RM_TORUS)
  1136. || (SELECT_NAV != RM_NAV))
  1137. fatal("enum conn_type out of sync with rm_api.h");
  1138. #endif
  1139. verbose("%s loading...", plugin_name);
  1140. /* if this is coming from something other than the controller
  1141. we don't want to read the config or anything like that. */
  1142. _set_bg_lists();
  1143. xfree(bg_conf->slurm_user_name);
  1144. xfree(bg_conf->slurm_node_prefix);
  1145. slurm_conf_lock();
  1146. xassert(slurmctld_conf.slurm_user_name);
  1147. xassert(slurmctld_conf.node_prefix);
  1148. bg_conf->slurm_user_name =
  1149. xstrdup(slurmctld_conf.slurm_user_name);
  1150. bg_conf->slurm_node_prefix =
  1151. xstrdup(slurmctld_conf.node_prefix);
  1152. bg_conf->slurm_debug_flags = slurmctld_conf.debug_flags;
  1153. bg_conf->slurm_debug_level = slurmctld_conf.slurmctld_debug;
  1154. slurm_conf_unlock();
  1155. if (bg_conf->blrts_list)
  1156. list_destroy(bg_conf->blrts_list);
  1157. bg_conf->blrts_list = list_create(destroy_image);
  1158. if (bg_conf->linux_list)
  1159. list_destroy(bg_conf->linux_list);
  1160. bg_conf->linux_list = list_create(destroy_image);
  1161. if (bg_conf->mloader_list)
  1162. list_destroy(bg_conf->mloader_list);
  1163. bg_conf->mloader_list = list_create(destroy_image);
  1164. if (bg_conf->ramdisk_list)
  1165. list_destroy(bg_conf->ramdisk_list);
  1166. bg_conf->ramdisk_list = list_create(destroy_image);
  1167. ba_init(NULL, 1);
  1168. verbose("BlueGene plugin loaded successfully");
  1169. }
  1170. verbose("%s loaded", plugin_name);
  1171. #else
  1172. if (bg_recover != NOT_FROM_CONTROLLER)
  1173. fatal("select/bluegene is incompatible with a "
  1174. "non BlueGene system");
  1175. #endif
  1176. return SLURM_SUCCESS;
  1177. }
  1178. extern int fini ( void )
  1179. {
  1180. int rc = SLURM_SUCCESS;
  1181. ba_fini();
  1182. _destroy_bg_config(bg_conf);
  1183. _destroy_bg_lists(bg_lists);
  1184. return rc;
  1185. }
  1186. /*
  1187. * The remainder of this file implements the standard SLURM
  1188. * node selection API.
  1189. */
  1190. /* We rely upon DB2 to save and restore BlueGene state */
  1191. extern int select_p_state_save(char *dir_name)
  1192. {
  1193. #ifdef HAVE_BG
  1194. ListIterator itr;
  1195. bg_record_t *bg_record = NULL;
  1196. int error_code = 0, log_fd;
  1197. char *old_file, *new_file, *reg_file;
  1198. uint32_t blocks_packed = 0, tmp_offset, block_offset;
  1199. Buf buffer = init_buf(BUF_SIZE);
  1200. slurmctld_lock_t job_read_lock =
  1201. { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
  1202. DEF_TIMERS;
  1203. debug("bluegene: select_p_state_save");
  1204. START_TIMER;
1205. /* write header: version */
  1206. packstr(BLOCK_STATE_VERSION, buffer);
  1207. block_offset = get_buf_offset(buffer);
  1208. pack32(blocks_packed, buffer);
1209. /* Lock job read before block to avoid deadlock; the job lock is
1210. * needed because we look at the job_ptr's to send job info. */
  1211. lock_slurmctld(job_read_lock);
  1212. /* write block records to buffer */
  1213. slurm_mutex_lock(&block_state_mutex);
  1214. itr = list_iterator_create(bg_lists->main);
  1215. while ((bg_record = list_next(itr))) {
  1216. if (bg_record->magic != BLOCK_MAGIC)
  1217. continue;
  1218. xassert(bg_record->bg_block_id != NULL);
  1219. _pack_block(bg_record, buffer, SLURM_PROTOCOL_VERSION);
  1220. _pack_block_ext(bg_record, buffer, SLURM_PROTOCOL_VERSION);
  1221. blocks_packed++;
  1222. }
  1223. list_iterator_destroy(itr);
  1224. slurm_mutex_unlock(&block_state_mutex);
  1225. unlock_slurmctld(job_read_lock);
  1226. tmp_offset = get_buf_offset(buffer);
  1227. set_buf_offset(buffer, block_offset);
  1228. pack32(blocks_packed, buffer);
  1229. set_buf_offset(buffer, tmp_offset);
  1230. /* Maintain config read lock until we copy state_save_location *\
  1231. \* unlock_slurmctld(part_read_lock); - see below */
  1232. /* write the buffer to file */
  1233. slurm_conf_lock();
  1234. old_file = xstrdup(slurmctld_conf.state_save_location);
  1235. xstrcat(old_file, "/block_state.old");
  1236. reg_file = xstrdup(slurmctld_conf.state_save_location);
  1237. xstrcat(reg_file, "/block_state");
  1238. new_file = xstrdup(slurmctld_conf.state_save_location);
  1239. xstrcat(new_file, "/block_state.new");
  1240. slurm_conf_unlock();
  1241. log_fd = creat(new_file, 0600);
  1242. if (log_fd < 0) {
  1243. error("Can't save state, error creating file %s, %m",
  1244. new_file);
  1245. error_code = errno;
  1246. } else {
  1247. int pos = 0, nwrite = get_buf_offset(buffer), amount;
  1248. char *data = (char *)get_buf_data(buffer);
  1249. while (nwrite > 0) {
  1250. amount = write(log_fd, &data[pos], nwrite);
  1251. if ((amount < 0) && (errno != EINTR)) {
  1252. error("Error writing file %s, %m", new_file);
  1253. error_code = errno;
  1254. break;
  1255. }
  1256. nwrite -= amount;
  1257. pos += amount;
  1258. }
  1259. fsync(log_fd);
  1260. close(log_fd);
  1261. }
  1262. if (error_code)
  1263. (void) unlink(new_file);
  1264. else { /* file shuffle */
  1265. (void) unlink(old_file);
  1266. if (link(reg_file, old_file))
  1267. debug4("unable to create link for %s -> %s: %m",
  1268. reg_file, old_file);
  1269. (void) unlink(reg_file);
  1270. if (link(new_file, reg_file))
  1271. debug4("unable to create link for %s -> %s: %m",
  1272. new_file, reg_file);
  1273. (void) unlink(new_file);
  1274. }
  1275. xfree(old_file);
  1276. xfree(reg_file);
  1277. xfree(new_file);
  1278. free_buf(buffer);
  1279. END_TIMER2("select_p_state_save");
  1280. return SLURM_SUCCESS;
  1281. #else
  1282. return SLURM_ERROR;
  1283. #endif
  1284. }
  1285. extern int select_p_state_restore(char *dir_name)
  1286. {
  1287. #ifdef HAVE_BG
  1288. debug("bluegene: select_p_state_restore");
  1289. /* found bg blocks already on system */
  1290. List curr_block_list = NULL;
  1291. List found_block_list = NULL;
  1292. static time_t last_config_update = (time_t) 0;
  1293. /* only run on startup */
  1294. if (last_config_update)
  1295. return SLURM_SUCCESS;
  1296. last_config_update = time(NULL);
  1297. curr_block_list = list_create(destroy_bg_record);
  1298. found_block_list = list_create(NULL);
  1299. //#if 0
  1300. /* Check to see if the configs we have are correct */
  1301. if (_validate_config_blocks(curr_block_list, found_block_list, dir_name)
  1302. == SLURM_ERROR) {
  1303. _delete_old_blocks(curr_block_list, found_block_list);
  1304. }
  1305. //#endif
  1306. /* looking for blocks only I created */
  1307. if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
  1308. info("No blocks created until jobs are submitted");
  1309. } else {
  1310. if (create_defined_blocks(bg_conf->layout_mode,
  1311. found_block_list)
  1312. == SLURM_ERROR) {
  1313. /* error in creating the static blocks, so
  1314. * blocks referenced by submitted jobs won't
  1315. * correspond to actual slurm blocks.
  1316. */
  1317. fatal("Error, could not create the static blocks");
  1318. return SLURM_ERROR;
  1319. }
  1320. }
  1321. list_destroy(curr_block_list);
  1322. curr_block_list = NULL;
  1323. list_destroy(found_block_list);
  1324. found_block_list = NULL;
  1325. slurm_mutex_lock(&block_state_mutex);
  1326. last_bg_update = time(NULL);
  1327. sort_bg_record_inc_size(bg_lists->main);
  1328. slurm_mutex_unlock(&block_state_mutex);
  1329. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
  1330. info("Blocks have finished being created.");
  1331. return SLURM_SUCCESS;
  1332. #else
  1333. return SLURM_ERROR;
  1334. #endif
  1335. }
  1336. /* Sync BG blocks to currently active jobs */
  1337. extern int select_p_job_init(List job_list)
  1338. {
  1339. #ifdef HAVE_BG
  1340. int rc = sync_jobs(job_list);
  1341. /* after we have synced the blocks then we say they are
  1342. created. */
  1343. blocks_are_created = 1;
  1344. return rc;
  1345. #else
  1346. return SLURM_ERROR;
  1347. #endif
  1348. }
  1349. extern bool select_p_node_ranking(struct node_record *node_ptr, int node_cnt)
  1350. {
  1351. return false;
  1352. }
  1353. /* All initialization is performed by init() */
  1354. extern int select_p_node_init(struct node_record *node_ptr_array, int node_cnt)
  1355. {
  1356. #ifdef HAVE_BG
  1357. int i = 0;
  1358. uint32_t real_memory, threads, cores;
  1359. if (!node_ptr_array)
  1360. return SLURM_SUCCESS;
  1361. xassert(bg_conf);
  1362. /* we need the amount of memory for a midplane */
  1363. real_memory = bg_conf->mp_cnode_cnt;
1364. /* Set up some known values that may not be fully specified
  1365. in the slurm.conf.
  1366. */
  1367. #ifdef HAVE_BGL
  1368. threads = 1;
  1369. cores = 2;
  1370. real_memory *= 512;
  1371. #elif defined HAVE_BGP
  1372. threads = 1;
  1373. cores = 4;
  1374. real_memory *= 2048;
  1375. #else
  1376. /* BGQ */
  1377. threads = 4;
  1378. cores = 16;
  1379. real_memory *= 16384;
  1380. #endif
  1381. bg_conf->cpus_per_mp = bg_conf->mp_cnode_cnt * cores;
  1382. for (i = 0; i < node_cnt; i++) {
  1383. struct node_record *node_ptr = &node_ptr_array[i];
  1384. select_nodeinfo_t *nodeinfo = NULL;
  1385. if (!node_ptr->name)
  1386. continue;
  1387. node_ptr->threads = threads;
  1388. node_ptr->cores = cores;
  1389. node_ptr->sockets = bg_conf->mp_cnode_cnt;
  1390. node_ptr->config_ptr->cpus = node_ptr->cpus =
  1391. bg_conf->cpus_per_mp;
  1392. node_ptr->real_memory = real_memory;
  1393. xassert(node_ptr->select_nodeinfo);
  1394. nodeinfo = node_ptr->select_nodeinfo->data;
  1395. xassert(nodeinfo);
  1396. slurm_mutex_lock(&ba_system_mutex);
  1397. if (!(nodeinfo->ba_mp = str2ba_mp(node_ptr->name))) {
  1398. slurm_mutex_unlock(&ba_system_mutex);
  1399. continue;
  1400. }
  1401. nodeinfo->ba_mp->index = i;
  1402. if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr))
  1403. ba_update_mp_state(
  1404. nodeinfo->ba_mp, node_ptr->node_state);
  1405. nodeinfo->ba_mp->state = node_ptr->node_state;
  1406. slurm_mutex_unlock(&ba_system_mutex);
  1407. }
  1408. return SLURM_SUCCESS;
  1409. #else
  1410. return SLURM_ERROR;
  1411. #endif
  1412. }
  1413. /*
  1414. * Called by slurmctld when a new configuration file is loaded
  1415. * or scontrol is used to change block configuration
  1416. */
  1417. extern int select_p_block_init(List part_list)
  1418. {
  1419. #ifdef HAVE_BG
  1420. /* select_p_node_init needs to be called before this to set
  1421. this up correctly
  1422. */
  1423. if (read_bg_conf() == SLURM_ERROR) {
  1424. fatal("Error, could not read the file");
  1425. return SLURM_ERROR;
  1426. }
  1427. if (part_list) {
  1428. struct part_record *part_ptr = NULL;
  1429. ListIterator itr = list_iterator_create(part_list);
  1430. while ((part_ptr = list_next(itr))) {
  1431. char *this_node_name;
  1432. hostlist_t host_list;
  1433. part_ptr->total_cpus = 0;
  1434. if (!part_ptr->nodes) /* no nodes in partition */
  1435. continue;
  1436. if (!(host_list = hostlist_create(part_ptr->nodes))) {
  1437. error("hostlist_create error on %s, %m",
  1438. part_ptr->nodes);
  1439. continue;
  1440. }
  1441. while ((this_node_name = hostlist_shift(host_list))) {
  1442. struct node_record *node_ptr =
  1443. find_node_record(this_node_name);
  1444. if (node_ptr == NULL) {
  1445. error("select_p_block_init: "
  1446. "invalid node name %s",
  1447. this_node_name);
  1448. free(this_node_name);
  1449. hostlist_destroy(host_list);
  1450. continue;
  1451. }
  1452. free(this_node_name);
  1453. part_ptr->total_cpus += node_ptr->cpus;
  1454. }
  1455. hostlist_destroy(host_list);
  1456. part_ptr->max_nodes = part_ptr->max_nodes_orig;
  1457. part_ptr->min_nodes = part_ptr->min_nodes_orig;
  1458. select_p_alter_node_cnt(SELECT_SET_MP_CNT,
  1459. &part_ptr->max_nodes);
  1460. select_p_alter_node_cnt(SELECT_SET_MP_CNT,
  1461. &part_ptr->min_nodes);
  1462. }
  1463. list_iterator_destroy(itr);
  1464. }
  1465. return SLURM_SUCCESS;
  1466. #else
  1467. return SLURM_ERROR;
  1468. #endif
  1469. }
  1470. /*
  1471. * select_p_job_test - Given a specification of scheduling requirements,
1472. * identify the nodes which "best" satisfy the request. The specified
1473. * nodes may be DOWN or BUSY at the time of this test, as this call may
1474. * be used to determine if a job could ever run.
  1475. * IN/OUT job_ptr - pointer to job being scheduled start_time is set
  1476. * when we can possibly start job.
  1477. * IN/OUT bitmap - usable nodes are set on input, nodes not required to
1478. * satisfy the request are cleared, others are left set
  1479. * IN min_nodes - minimum count of nodes
  1480. * IN max_nodes - maximum count of nodes (0==don't care)
  1481. * IN req_nodes - requested (or desired) count of nodes
  1482. * IN mode - SELECT_MODE_RUN_NOW: try to schedule job now
  1483. * SELECT_MODE_TEST_ONLY: test if job can ever run
  1484. * SELECT_MODE_WILL_RUN: determine when and where job can run
  1485. * IN preemptee_candidates - List of pointers to jobs which can be preempted.
  1486. * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the
  1487. * jobs to be preempted to initiate the pending job. Not set
  1488. * if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL.
  1489. * IN exc_core_bitmap - bitmap of cores being reserved.
  1490. * RET zero on success, EINVAL otherwise
  1491. * NOTE: bitmap must be a superset of req_nodes at the time that
  1492. * select_p_job_test is called
  1493. */
  1494. extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
  1495. uint32_t min_nodes, uint32_t max_nodes,
  1496. uint32_t req_nodes, uint16_t mode,
  1497. List preemptee_candidates,
  1498. List *preemptee_job_list,
  1499. bitstr_t *exc_core_bitmap)
  1500. {
  1501. #ifdef HAVE_BG
  1502. /* submit_job - is there a block where we have:
  1503. * 1) geometry requested
  1504. * 2) min/max nodes (MPs) requested
  1505. * 3) type: TORUS or MESH or NAV (torus else mesh)
  1506. *
  1507. * note: we don't have to worry about security at this level
  1508. * as the SLURM block logic will handle access rights.
  1509. */
  1510. return submit_job(job_ptr, bitmap, min_nodes, max_nodes,
  1511. req_nodes, mode, preemptee_candidates,
  1512. preemptee_job_list);
  1513. #else
  1514. return SLURM_ERROR;
  1515. #endif
  1516. }
  1517. extern int select_p_job_begin(struct job_record *job_ptr)
  1518. {
  1519. #ifdef HAVE_BG
  1520. return start_job(job_ptr);
  1521. #else
  1522. return SLURM_ERROR;
  1523. #endif
  1524. }
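/* Report whether the block associated with a job is ready for use:
 * returns 1 when the block is booted (BG_BLOCK_INITED) and owned by
 * this job, 0 when another job or user owns it, READY_JOB_ERROR when
 * the answer may change shortly (try again), and READY_JOB_FATAL when
 * the block no longer exists. */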
  1525. extern int select_p_job_ready(struct job_record *job_ptr)
  1526. {
  1527. #ifdef HAVE_BG
  1528. int rc = 1;
  1529. char *block_id = NULL;
  1530. bg_record_t *bg_record = NULL;
  1531. rc = get_select_jobinfo(job_ptr->select_jobinfo->data,
  1532. SELECT_JOBDATA_BLOCK_ID, &block_id);
  1533. if (rc == SLURM_SUCCESS) {
  1534. slurm_mutex_lock(&block_state_mutex);
  1535. bg_record = find_bg_record_in_list(bg_lists->main, block_id);
  1536. if (bg_record) {
  1537. uint32_t job_id = NO_JOB_RUNNING, uid = NO_VAL;
  1538. struct job_record *found_job_ptr = NULL;
  1539. if (bg_record->job_list
  1540. && list_count(bg_record->job_list)) {
  1541. ListIterator itr = list_iterator_create(
  1542. bg_record->job_list);
  1543. xassert(itr);
  1544. while ((found_job_ptr = list_next(itr))) {
  1545. if (found_job_ptr->magic != JOB_MAGIC) {
  1546. error("select_p_job_ready: "
  1547. "bad magic found when "
  1548. "looking at job %u",
  1549. job_ptr->job_id);
  1550. list_delete_item(itr);
  1551. continue;
  1552. }
  1553. if (found_job_ptr->job_id
  1554. == job_ptr->job_id)
  1555. break;
  1556. }
  1557. list_iterator_destroy(itr);
  1558. } else if (bg_record->job_ptr)
  1559. found_job_ptr = bg_record->job_ptr;
  1560. if (found_job_ptr) {
  1561. job_id = found_job_ptr->job_id;
  1562. uid = found_job_ptr->user_id;
  1563. }
  1564. if (job_id != job_ptr->job_id) {
  1565. rc = 0;
  1566. } else if (!bg_record->free_cnt
  1567. && (uid == job_ptr->user_id)
  1568. && (bg_record->state == BG_BLOCK_INITED)) {
1569. /* Clear the state just in case we
  1570. * missed it somehow. */
  1571. job_ptr->job_state &= (~JOB_CONFIGURING);
  1572. last_job_update = time(NULL);
  1573. rc = 1;
  1574. } else if (uid != job_ptr->user_id)
  1575. rc = 0;
  1576. else
  1577. rc = READY_JOB_ERROR; /* try again */
  1578. } else {
  1579. /* This means the block has been removed and
  1580. is no longer valid. This could happen
  1581. often during an epilog on a busy system.
  1582. */
  1583. debug2("block_ready: block %s not in bg_lists->main.",
  1584. block_id);
  1585. rc = READY_JOB_FATAL; /* fatal error */
  1586. }
  1587. slurm_mutex_unlock(&block_state_mutex);
  1588. } else
  1589. rc = READY_JOB_ERROR;
  1590. /* info("returning %d for job %u block %s %d %d", */
  1591. /* rc, job_ptr->job_id, block_id, */
  1592. /* READY_JOB_ERROR, READY_JOB_FATAL); */
  1593. xfree(block_id);
  1594. return rc;
  1595. #else
  1596. return SLURM_ERROR;
  1597. #endif
  1598. }
  1599. extern int select_p_job_resized(struct job_record *job_ptr,
  1600. struct node_record *node_ptr)
  1601. {
  1602. return ESLURM_NOT_SUPPORTED;
  1603. }
  1604. extern bool select_p_job_expand_allow(void)
  1605. {
  1606. return false;
  1607. }
  1608. extern int select_p_job_expand(struct job_record *from_job_ptr,
  1609. struct job_record *to_job_ptr)
  1610. {
  1611. return ESLURM_NOT_SUPPORTED;
  1612. }
  1613. extern int select_p_job_signal(struct job_record *job_ptr, int signal)
  1614. {
  1615. return SLURM_SUCCESS;
  1616. }
  1617. extern int select_p_job_fini(struct job_record *job_ptr)
  1618. {
  1619. #ifdef HAVE_BG
  1620. return term_job(job_ptr);
  1621. #else
  1622. return SLURM_ERROR;
  1623. #endif
  1624. }
  1625. extern int select_p_job_suspend(struct job_record *job_ptr, bool indf_susp)
  1626. {
  1627. return ESLURM_NOT_SUPPORTED;
  1628. }
  1629. extern int select_p_job_resume(struct job_record *job_ptr, bool indf_susp)
  1630. {
  1631. return ESLURM_NOT_SUPPORTED;
  1632. }
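/* Pick the c-nodes a new step will run on. Three cases are handled
 * below: (1) the step spans the whole block or more than one midplane
 * (always the case on BGL/BGP), (2) the job itself is a sub-block
 * allocation (jobinfo->units_avail set) and the step is carved out of
 * its c-node bitmap, and (3) the step is a sub-block carved out of a
 * full-midplane allocation via ba_sub_block_in_record(). Returns a
 * bitmap of the midplanes used, or NULL if the step cannot start yet. */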
  1633. extern bitstr_t *select_p_step_pick_nodes(struct job_record *job_ptr,
  1634. select_jobinfo_t *step_jobinfo,
  1635. uint32_t node_count)
  1636. {
  1637. bitstr_t *picked_mps = NULL;
  1638. bg_record_t *bg_record = NULL;
  1639. char *tmp_char = NULL;
  1640. ba_mp_t *ba_mp = NULL;
  1641. select_jobinfo_t *jobinfo = NULL;
  1642. int dim;
  1643. xassert(job_ptr);
  1644. slurm_mutex_lock(&block_state_mutex);
  1645. jobinfo = job_ptr->select_jobinfo->data;
  1646. bg_record = jobinfo->bg_record;
  1647. if (!bg_record)
  1648. fatal("This job %u does not have a bg block "
  1649. "assigned to it, but for some reason we are "
  1650. "trying to start a step on it?",
  1651. job_ptr->job_id);
  1652. else if (bg_record->magic != BLOCK_MAGIC) {
  1653. bg_record = find_bg_record_in_list(
  1654. bg_lists->main, jobinfo->bg_block_id);
  1655. if (!bg_record || (bg_record->magic != BLOCK_MAGIC)) {
  1656. int rc;
  1657. error("select_p_step_pick_nodes: "
  1658. "Whoa, some how we got a bad block for job %u, "
  1659. "it should be %s but we couldn't find "
  1660. "it on the system, no step for you, "
  1661. "and ending job.",
  1662. job_ptr->job_id, jobinfo->bg_block_id);
  1663. slurm_mutex_unlock(&block_state_mutex);
  1664. if ((rc = job_requeue(0, job_ptr->job_id,
  1665. -1, (uint16_t)NO_VAL, false))) {
  1666. error("Couldn't requeue job %u, failing it: %s",
  1667. job_ptr->job_id, slurm_strerror(rc));
  1668. job_fail(job_ptr->job_id);
  1669. }
  1670. return NULL;
  1671. }
  1672. error("select_p_step_pick_nodes: Whoa, some how we got a "
  1673. "bad block for job %u, it should be %s "
  1674. "(we found it so no big deal, but strange)",
  1675. job_ptr->job_id, jobinfo->bg_block_id);
  1676. jobinfo->bg_record = bg_record;
  1677. }
  1678. xassert(!step_jobinfo->units_used);
  1679. xfree(step_jobinfo->bg_block_id);
  1680. step_jobinfo->bg_block_id = xstrdup(bg_record->bg_block_id);
  1681. step_jobinfo->block_cnode_cnt = bg_record->cnode_cnt;
  1682. if (((cluster_flags & CLUSTER_FLAG_BGL)
  1683. || (cluster_flags & CLUSTER_FLAG_BGP))
  1684. || ((node_count == bg_record->cnode_cnt)
  1685. || (node_count > bg_conf->mp_cnode_cnt))) {
  1686. /* If we are using the whole block (or more than 1
1687. midplane of it) we need to verify whether
1688. anything else is in use. If anything else is used,
1689. return NULL, else return that we can use the entire
1690. thing.
1691. On BGL/P this is always the default, no matter how
1692. big the step is, since you can only run 1 step per block.
  1693. */
  1694. step_jobinfo->dim_cnt = jobinfo->dim_cnt;
  1695. if (list_count(job_ptr->step_list)) {
  1696. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
  1697. info("select_p_step_pick_nodes: Looking "
  1698. "for more than one midplane of "
  1699. "block %s for job %u, "
  1700. "but some of it is used.",
  1701. bg_record->bg_block_id, job_ptr->job_id);
  1702. goto end_it;
  1703. }
  1704. if (!(picked_mps = bit_copy(job_ptr->node_bitmap)))
  1705. fatal("bit_copy malloc failure");
  1706. if (cluster_flags & CLUSTER_FLAG_BGQ) {
  1707. bitstr_t *used_bitmap;
  1708. if (node_count > bg_conf->mp_cnode_cnt) {
  1709. /* Here we have to make sure nothing
  1710. else is able to run on this block
  1711. since we are using more than 1
  1712. midplane but potentially not the
  1713. entire allocation.
  1714. */
  1715. FREE_NULL_BITMAP(jobinfo->units_avail);
  1716. FREE_NULL_BITMAP(jobinfo->units_used);
  1717. jobinfo->units_avail =
  1718. ba_create_ba_mp_cnode_bitmap(bg_record);
  1719. jobinfo->units_used =
  1720. bit_copy(jobinfo->units_avail);
  1721. }
  1722. if (jobinfo->units_avail)
  1723. used_bitmap = jobinfo->units_used;
  1724. else {
  1725. ba_mp = list_peek(bg_record->ba_mp_list);
  1726. xassert(ba_mp);
  1727. if (!ba_mp->cnode_bitmap)
  1728. ba_mp->cnode_bitmap =
  1729. ba_create_ba_mp_cnode_bitmap(
  1730. bg_record);
  1731. used_bitmap = ba_mp->cnode_bitmap;
  1732. }
  1733. /* units_used and units_avail will be the
  1734. same, the exact opposite of used_bitmap.
  1735. */
  1736. step_jobinfo->units_used = bit_copy(used_bitmap);
  1737. bit_not(step_jobinfo->units_used);
  1738. step_jobinfo->units_avail =
  1739. bit_copy(step_jobinfo->units_used);
  1740. bit_or(used_bitmap, step_jobinfo->units_used);
  1741. }
  1742. step_jobinfo->ionode_str = xstrdup(jobinfo->ionode_str);
  1743. } else if (jobinfo->units_avail) {
  1744. bitstr_t *total_bitmap = jobinfo->units_used;
  1745. ba_mp = list_peek(bg_record->ba_mp_list);
  1746. xassert(ba_mp);
  1747. if (ba_mp->cnode_err_bitmap) {
  1748. total_bitmap = bit_copy(jobinfo->units_used);
  1749. bit_or(total_bitmap, ba_mp->cnode_err_bitmap);
  1750. }
  1751. /* handle a sub-block allocation where the allocation
1752. itself is a small block.
  1753. */
  1754. step_jobinfo->cnode_cnt = node_count;
  1755. if (!(ba_sub_block_in_bitmap(step_jobinfo, total_bitmap, 1))) {
  1756. if (total_bitmap != jobinfo->units_used)
  1757. FREE_NULL_BITMAP(total_bitmap);
  1758. goto end_it;
  1759. }
  1760. if (total_bitmap != jobinfo->units_used)
  1761. FREE_NULL_BITMAP(total_bitmap);
  1762. node_count = step_jobinfo->cnode_cnt;
  1763. if (!(picked_mps = bit_copy(job_ptr->node_bitmap)))
  1764. fatal("bit_copy malloc failure");
  1765. bit_or(jobinfo->units_used, step_jobinfo->units_used);
  1766. for (dim = 0; dim < step_jobinfo->dim_cnt; dim++) {
  1767. /* The IBM software works off a relative
  1768. position in the block instead of the
  1769. absolute position used in SLURM.
1770. Since conn_type doesn't mean anything for a
1771. step we can just overload it, since it is already
1772. being sent and we don't need to bloat
1773. anything if we don't have to.
1774. By setting it here we can have both the
1775. absolute and the relative position.
  1776. We don't need to add here since we are
  1777. always only dealing with a block that is 1
  1778. midplane or less.
  1779. */
  1780. step_jobinfo->conn_type[dim] =
  1781. step_jobinfo->start_loc[dim]
  1782. - bg_record->start_small[dim];
  1783. }
  1784. } else if ((ba_mp = ba_sub_block_in_record(
  1785. bg_record, &node_count, step_jobinfo))) {
  1786. if (!(picked_mps = bit_alloc(bit_size(job_ptr->node_bitmap))))
  1787. fatal("bit_copy malloc failure");
  1788. bit_set(picked_mps, ba_mp->index);
  1789. for (dim = 0; dim < step_jobinfo->dim_cnt; dim++) {
  1790. /* The IBM software works off a relative
  1791. position in the block instead of the
  1792. absolute position used in SLURM.
1793. Since conn_type doesn't mean anything for a
1794. step we can just overload it, since it is already
1795. being sent and we don't need to bloat
1796. anything if we don't have to.
1797. By setting it here we can have both the
1798. absolute and the relative position.
1799. We add here since, if we are not using the first
1800. midplane, we have already set up the
1801. conn_type to point to the starting point of
  1802. the relative position in the block.
  1803. */
  1804. step_jobinfo->conn_type[dim] +=
  1805. step_jobinfo->start_loc[dim]
  1806. - bg_record->start_small[dim];
  1807. }
  1808. }
  1809. if (picked_mps) {
  1810. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
  1811. tmp_char = bitmap2node_name(picked_mps);
  1812. info("select_p_step_pick_nodes: new step for job %u "
  1813. "will be running on %s(%s)",
  1814. job_ptr->job_id, bg_record->bg_block_id, tmp_char);
  1815. xfree(tmp_char);
  1816. }
  1817. step_jobinfo->cnode_cnt = node_count;
  1818. }
  1819. end_it:
  1820. slurm_mutex_unlock(&block_state_mutex);
  1821. return picked_mps;
  1822. }
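/* Release the c-nodes a finished step was using, either by clearing
 * the job's sub-block bitmaps or by clearing the step's region of the
 * block itself. Skipped when the whole job allocation is already
 * completing, since the job cleanup will handle it. */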
  1823. extern int select_p_step_finish(struct step_record *step_ptr)
  1824. {
  1825. bg_record_t *bg_record = NULL;
  1826. select_jobinfo_t *jobinfo = NULL, *step_jobinfo = NULL;
  1827. int rc = SLURM_SUCCESS;
  1828. char *tmp_char = NULL;
  1829. xassert(step_ptr);
  1830. if (IS_JOB_COMPLETING(step_ptr->job_ptr)) {
  1831. debug("step completion %u.%u was received after job "
  1832. "allocation is already completing, no cleanup needed",
  1833. step_ptr->job_ptr->job_id, step_ptr->step_id);
  1834. return SLURM_SUCCESS;
  1835. }
  1836. jobinfo = step_ptr->job_ptr->select_jobinfo->data;
  1837. step_jobinfo = step_ptr->select_jobinfo->data;
  1838. if (step_jobinfo->cnode_cnt > bg_conf->mp_cnode_cnt) {
  1839. /* This means we were using units_avail and units_used
  1840. as midplanes not cnodes for either the whole job
  1841. allocation or a portion of it.
  1842. */
  1843. FREE_NULL_BITMAP(jobinfo->units_avail);
  1844. FREE_NULL_BITMAP(jobinfo->units_used);
  1845. } else if (jobinfo->units_avail)
  1846. rc = ba_sub_block_in_bitmap_clear(
  1847. step_jobinfo, jobinfo->units_used);
  1848. else {
  1849. slurm_mutex_lock(&block_state_mutex);
  1850. bg_record = jobinfo->bg_record;
  1851. if (!bg_record)
  1852. fatal("This step %u.%u does not have a bg block "
  1853. "assigned to it, but for some reason we are "
  1854. "trying to end the step?",
  1855. step_ptr->job_ptr->job_id, step_ptr->step_id);
  1856. else if (bg_record->magic != BLOCK_MAGIC) {
  1857. bg_record = find_bg_record_in_list(
  1858. bg_lists->main, jobinfo->bg_block_id);
  1859. if (!bg_record || (bg_record->magic != BLOCK_MAGIC)) {
  1860. error("select_p_step_finish: "
  1861. "Whoa, some how we got a bad block "
  1862. "for job %u, it should be %s but "
  1863. "we couldn't find it on the system, "
  1864. "so no real need to clear it up.",
  1865. step_ptr->job_ptr->job_id,
  1866. jobinfo->bg_block_id);
  1867. slurm_mutex_unlock(&block_state_mutex);
  1868. return SLURM_ERROR;
  1869. }
  1870. error("select_p_step_finish: Whoa, some how we "
  1871. "got a bad block for job %u, it should be %s "
  1872. "(we found it so no big deal, but strange)",
  1873. step_ptr->job_ptr->job_id, jobinfo->bg_block_id);
  1874. jobinfo->bg_record = bg_record;
  1875. }
  1876. rc = ba_sub_block_in_record_clear(bg_record, step_ptr);
  1877. slurm_mutex_unlock(&block_state_mutex);
  1878. }
  1879. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
  1880. tmp_char = bitmap2node_name(step_ptr->step_node_bitmap);
  1881. info("select_p_step_finish: step %u.%u cleared from %s",
  1882. step_ptr->job_ptr->job_id, step_ptr->step_id, tmp_char);
  1883. xfree(tmp_char);
  1884. }
  1885. return rc;
  1886. }
  1887. /* The unpack for this is in common/slurm_protocol_pack.c */
  1888. extern int select_p_pack_select_info(time_t last_query_time,
  1889. uint16_t show_flags, Buf *buffer_ptr,
  1890. uint16_t protocol_version)
  1891. {
  1892. #ifdef HAVE_BG
  1893. ListIterator itr;
  1894. bg_record_t *bg_record = NULL;
  1895. uint32_t blocks_packed = 0, tmp_offset;
  1896. Buf buffer;
  1897. /* check to see if data has changed */
  1898. if (last_query_time >= last_bg_update) {
  1899. debug2("Node select info hasn't changed since %ld",
  1900. last_bg_update);
  1901. return SLURM_NO_CHANGE_IN_DATA;
  1902. } else if (blocks_are_created) {
  1903. *buffer_ptr = NULL;
  1904. buffer = init_buf(HUGE_BUF_SIZE);
  1905. pack32(blocks_packed, buffer);
  1906. pack_time(last_bg_update, buffer);
  1907. if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
  1908. if (bg_lists->main) {
  1909. slurmctld_lock_t job_read_lock =
  1910. { NO_LOCK, READ_LOCK,
  1911. NO_LOCK, NO_LOCK };
  1912. /* Lock job read before block to avoid
1913. * deadlock. The job lock is needed because
  1914. * we look at the job_ptr's to send
  1915. * job info. */
  1916. lock_slurmctld(job_read_lock);
  1917. slurm_mutex_lock(&block_state_mutex);
  1918. itr = list_iterator_create(bg_lists->main);
  1919. while ((bg_record = list_next(itr))) {
  1920. if (bg_record->magic != BLOCK_MAGIC)
  1921. continue;
  1922. _pack_block(bg_record, buffer,
  1923. protocol_version);
  1924. blocks_packed++;
  1925. }
  1926. list_iterator_destroy(itr);
  1927. slurm_mutex_unlock(&block_state_mutex);
  1928. unlock_slurmctld(job_read_lock);
  1929. } else {
  1930. error("select_p_pack_select_info: "
  1931. "no bg_lists->main");
  1932. return SLURM_ERROR;
  1933. }
  1934. }
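/* blocks_packed was packed as a placeholder above; rewind the buffer
 * and overwrite it now that the real count is known. */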
  1935. tmp_offset = get_buf_offset(buffer);
  1936. set_buf_offset(buffer, 0);
  1937. pack32(blocks_packed, buffer);
  1938. set_buf_offset(buffer, tmp_offset);
  1939. *buffer_ptr = buffer;
  1940. } else {
  1941. error("select_p_pack_select_info: bg_lists->main not created "
  1942. "yet");
  1943. return SLURM_ERROR;
  1944. }
  1945. return SLURM_SUCCESS;
  1946. #else
  1947. return SLURM_ERROR;
  1948. #endif
  1949. }
  1950. extern int select_p_select_nodeinfo_pack(select_nodeinfo_t *nodeinfo,
  1951. Buf buffer,
  1952. uint16_t protocol_version)
  1953. {
  1954. return select_nodeinfo_pack(nodeinfo, buffer, protocol_version);
  1955. }
  1956. extern int select_p_select_nodeinfo_unpack(select_nodeinfo_t **nodeinfo,
  1957. Buf buffer,
  1958. uint16_t protocol_version)
  1959. {
  1960. return select_nodeinfo_unpack(nodeinfo, buffer, protocol_version);
  1961. }
  1962. extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(void)
  1963. {
  1964. return select_nodeinfo_alloc(0);
  1965. }
  1966. extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo)
  1967. {
  1968. return select_nodeinfo_free(nodeinfo);
  1969. }
  1970. extern int select_p_select_nodeinfo_set_all(void)
  1971. {
  1972. if (bg_recover != NOT_FROM_CONTROLLER)
  1973. bridge_status_init();
  1974. return select_nodeinfo_set_all();
  1975. }
  1976. extern int select_p_select_nodeinfo_set(struct job_record *job_ptr)
  1977. {
  1978. return SLURM_SUCCESS;
  1979. }
  1980. extern int select_p_select_nodeinfo_get(select_nodeinfo_t *nodeinfo,
  1981. enum select_nodedata_type dinfo,
  1982. enum node_states state,
  1983. void *data)
  1984. {
  1985. return select_nodeinfo_get(nodeinfo, dinfo, state, data);
  1986. }
  1987. extern select_jobinfo_t *select_p_select_jobinfo_alloc(void)
  1988. {
  1989. return alloc_select_jobinfo();
  1990. }
  1991. extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo,
  1992. enum select_jobdata_type data_type,
  1993. void *data)
  1994. {
  1995. return set_select_jobinfo(jobinfo, data_type, data);
  1996. }
  1997. extern int select_p_select_jobinfo_get(select_jobinfo_t *jobinfo,
  1998. enum select_jobdata_type data_type,
  1999. void *data)
  2000. {
  2001. return get_select_jobinfo(jobinfo, data_type, data);
  2002. }
  2003. extern select_jobinfo_t *select_p_select_jobinfo_copy(select_jobinfo_t *jobinfo)
  2004. {
  2005. return copy_select_jobinfo(jobinfo);
  2006. }
  2007. extern int select_p_select_jobinfo_free(select_jobinfo_t *jobinfo)
  2008. {
  2009. return free_select_jobinfo(jobinfo);
  2010. }
  2011. extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer,
  2012. uint16_t protocol_version)
  2013. {
  2014. return pack_select_jobinfo(jobinfo, buffer, protocol_version);
  2015. }
  2016. extern int select_p_select_jobinfo_unpack(select_jobinfo_t **jobinfo,
  2017. Buf buffer,
  2018. uint16_t protocol_version)
  2019. {
  2020. return unpack_select_jobinfo(jobinfo, buffer, protocol_version);
  2021. }
  2022. extern char *select_p_select_jobinfo_sprint(select_jobinfo_t *jobinfo,
  2023. char *buf, size_t size, int mode)
  2024. {
  2025. return sprint_select_jobinfo(jobinfo, buf, size, mode);
  2026. }
  2027. extern char *select_p_select_jobinfo_xstrdup(select_jobinfo_t *jobinfo,
  2028. int mode)
  2029. {
  2030. return xstrdup_select_jobinfo(jobinfo, mode);
  2031. }
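/* Handle an administrative block update (typically driven by
 * "scontrol update BlockName=..."): kill any jobs on the block, then
 * put it in an error state, free it, resume it, remove it (dynamic
 * layout only), or free and recreate it, depending on the requested
 * state. */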
  2032. extern int select_p_update_block(update_block_msg_t *block_desc_ptr)
  2033. {
  2034. #ifdef HAVE_BG
  2035. int rc = SLURM_SUCCESS;
  2036. bg_record_t *bg_record = NULL;
  2037. char reason[200];
  2038. List kill_job_list = NULL;
  2039. kill_job_struct_t *freeit;
  2040. ListIterator itr;
  2041. if (!block_desc_ptr->bg_block_id) {
  2042. error("update_block: No name specified");
  2043. return ESLURM_INVALID_BLOCK_NAME;
  2044. }
  2045. slurm_mutex_lock(&block_state_mutex);
  2046. bg_record = find_bg_record_in_list(bg_lists->main,
  2047. block_desc_ptr->bg_block_id);
  2048. if (!bg_record) {
  2049. error("update_block: block %s not found",
  2050. block_desc_ptr->bg_block_id);
  2051. slurm_mutex_unlock(&block_state_mutex);
  2052. return ESLURM_INVALID_BLOCK_NAME;
  2053. }
  2054. if (block_desc_ptr->reason)
  2055. snprintf(reason, sizeof(reason), "%s", block_desc_ptr->reason);
  2056. else if (block_desc_ptr->state == BG_BLOCK_BOOTING)
  2057. snprintf(reason, sizeof(reason),
  2058. "update_block: "
  2059. "Admin recreated %s.", bg_record->bg_block_id);
  2060. else if (block_desc_ptr->state == BG_BLOCK_NAV) {
  2061. if (bg_record->conn_type[0] < SELECT_SMALL)
  2062. snprintf(reason, sizeof(reason),
  2063. "update_block: "
  2064. "Admin removed block %s",
  2065. bg_record->bg_block_id);
  2066. else
  2067. snprintf(reason, sizeof(reason),
  2068. "update_block: "
  2069. "Removed all blocks on midplane %s",
  2070. bg_record->mp_str);
  2071. } else {
  2072. uint16_t state = bg_record->state;
  2073. if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG)
  2074. state |= BG_BLOCK_ERROR_FLAG;
  2075. else if (state & BG_BLOCK_ERROR_FLAG)
  2076. state &= (~BG_BLOCK_ERROR_FLAG);
  2077. else
  2078. state = block_desc_ptr->state;
  2079. snprintf(reason, sizeof(reason),
  2080. "update_block: "
  2081. "Admin set block %s state to %s",
  2082. bg_record->bg_block_id,
  2083. bg_block_state_string(state));
  2084. }
  2085. /* First fail any job running on this block (Not for resume though) */
  2086. if (block_desc_ptr->state != BG_BLOCK_TERM) {
  2087. if (bg_record->job_running > NO_JOB_RUNNING) {
  2088. if (!kill_job_list)
  2089. kill_job_list =
  2090. bg_status_create_kill_job_list();
  2091. freeit = xmalloc(sizeof(kill_job_struct_t));
  2092. freeit->jobid = bg_record->job_running;
  2093. list_push(kill_job_list, freeit);
  2094. } else if (bg_record->job_list
  2095. && list_count(bg_record->job_list)) {
  2096. struct job_record *job_ptr;
  2097. if (!kill_job_list)
  2098. kill_job_list =
  2099. bg_status_create_kill_job_list();
  2100. itr = list_iterator_create(bg_record->job_list);
  2101. while ((job_ptr = list_next(itr))) {
  2102. if (job_ptr->magic != JOB_MAGIC)
  2103. continue;
  2104. freeit = xmalloc(sizeof(kill_job_struct_t));
  2105. freeit->jobid = job_ptr->job_id;
  2106. list_push(kill_job_list, freeit);
  2107. }
  2108. list_iterator_destroy(itr);
  2109. }
  2110. }
  2111. if (kill_job_list) {
  2112. slurm_mutex_unlock(&block_state_mutex);
  2113. bg_status_process_kill_job_list(kill_job_list, 0);
  2114. list_destroy(kill_job_list);
  2115. kill_job_list = NULL;
  2116. slurm_mutex_lock(&block_state_mutex);
  2117. if (!block_ptr_exist_in_list(bg_lists->main, bg_record)) {
  2118. slurm_mutex_unlock(&block_state_mutex);
  2119. error("while trying to put block in "
  2120. "error state it disappeared");
  2121. return SLURM_ERROR;
  2122. }
  2123. }
  2124. if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) {
  2125. bg_record_t *found_record = NULL;
  2126. List delete_list = list_create(NULL);
  2127. bool delete_it = 0;
  2128. /* This loop shouldn't do much in regular Dynamic mode
2129. since there shouldn't be overlapping blocks. But if
2130. there is a trouble block that isn't going away and
2131. we need to mark it in an error state, there could be
2132. overlapping blocks whose jobs we need to requeue.
  2133. */
  2134. itr = list_iterator_create(bg_lists->main);
  2135. while ((found_record = list_next(itr))) {
  2136. if (bg_record == found_record)
  2137. continue;
  2138. if (!blocks_overlap(bg_record, found_record)) {
  2139. debug2("block %s isn't part of errored %s",
  2140. found_record->bg_block_id,
  2141. bg_record->bg_block_id);
  2142. continue;
  2143. }
  2144. if (found_record->job_running > NO_JOB_RUNNING) {
  2145. if (found_record->job_ptr
  2146. && IS_JOB_CONFIGURING(
  2147. found_record->job_ptr))
  2148. info("Pending job %u on block %s "
  2149. "will try to be requeued "
  2150. "because overlapping block %s "
  2151. "is in an error state.",
  2152. found_record->job_running,
  2153. found_record->bg_block_id,
  2154. bg_record->bg_block_id);
  2155. else
  2156. info("Failing job %u on block %s "
  2157. "because overlapping block %s "
  2158. "is in an error state.",
  2159. found_record->job_running,
  2160. found_record->bg_block_id,
  2161. bg_record->bg_block_id);
  2162. /* This job will be requeued in the
  2163. free_block_list code below, just
  2164. make note of it here.
  2165. */
  2166. } else {
  2167. debug2("block %s is part of errored %s "
  2168. "but no running job",
  2169. found_record->bg_block_id,
  2170. bg_record->bg_block_id);
  2171. }
  2172. resume_block(found_record);
  2173. list_push(delete_list, found_record);
  2174. }
  2175. list_iterator_destroy(itr);
  2176. slurm_mutex_unlock(&block_state_mutex);
  2177. if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
  2178. delete_it = 1;
  2179. free_block_list(NO_VAL, delete_list, delete_it, 0);
  2180. list_destroy(delete_list);
  2181. put_block_in_error_state(bg_record, reason);
  2182. } else if (block_desc_ptr->state == BG_BLOCK_FREE) {
  2183. /* Resume the block first and then free the block */
  2184. resume_block(bg_record);
2185. /* Increment free_cnt to make sure we don't lose this
  2186. * block since bg_free_block will unlock block_state_mutex.
  2187. */
  2188. bg_record->free_cnt++;
  2189. bg_free_block(bg_record, 0, 1);
  2190. bg_record->free_cnt--;
  2191. slurm_mutex_unlock(&block_state_mutex);
  2192. } else if (block_desc_ptr->state == BG_BLOCK_TERM) {
  2193. /* This can't be RM_PARTITION_READY since the enum
2194. changed from BGL to BGP, and if we are running
2195. cross-cluster it just doesn't work.
  2196. */
  2197. resume_block(bg_record);
  2198. slurm_mutex_unlock(&block_state_mutex);
  2199. } else if (bg_conf->layout_mode == LAYOUT_DYNAMIC
  2200. && (block_desc_ptr->state == BG_BLOCK_NAV)) {
  2201. /* This means remove the block from the system. If
  2202. the block is a small block we need to remove all the
  2203. blocks on that midplane.
  2204. */
  2205. bg_record_t *found_record = NULL;
  2206. ListIterator itr;
  2207. List delete_list = list_create(NULL);
  2208. list_push(delete_list, bg_record);
  2209. /* only do the while loop if we are dealing with a
  2210. small block */
  2211. if (bg_record->conn_type[0] < SELECT_SMALL)
  2212. goto large_block;
  2213. itr = list_iterator_create(bg_lists->main);
  2214. while ((found_record = list_next(itr))) {
  2215. if (bg_record == found_record)
  2216. continue;
  2217. if (!bit_equal(bg_record->mp_bitmap,
  2218. found_record->mp_bitmap)) {
  2219. debug2("block %s isn't part of to be freed %s",
  2220. found_record->bg_block_id,
  2221. bg_record->bg_block_id);
  2222. continue;
  2223. }
  2224. if (found_record->job_running > NO_JOB_RUNNING) {
  2225. if (found_record->job_ptr
  2226. && IS_JOB_CONFIGURING(
  2227. found_record->job_ptr))
  2228. info("Pending job %u on block %s "
  2229. "will try to be requeued "
  2230. "because overlapping block %s "
  2231. "is being removed.",
  2232. found_record->job_running,
  2233. found_record->bg_block_id,
  2234. bg_record->bg_block_id);
  2235. else
  2236. info("Running job %u on block %s "
  2237. "will try to be requeued "
  2238. "because overlapping block %s "
  2239. "is being removed.",
  2240. found_record->job_running,
  2241. found_record->bg_block_id,
  2242. bg_record->bg_block_id);
  2243. /* This job will be requeued in the
  2244. free_block_list code below, just
  2245. make note of it here.
  2246. */
  2247. } else if (found_record->job_list &&
  2248. list_count(found_record->job_list)) {
  2249. struct job_record *job_ptr = NULL;
  2250. ListIterator job_itr = list_iterator_create(
  2251. found_record->job_list);
  2252. while ((job_ptr = list_next(job_itr))) {
  2253. if (job_ptr->magic != JOB_MAGIC) {
  2254. error("select_p_update_block: "
  2255. "bad magic found when "
  2256. "looking at block %s",
  2257. found_record->
  2258. bg_block_id);
2259. list_delete_item(job_itr);
  2260. continue;
  2261. }
  2262. if (IS_JOB_CONFIGURING(job_ptr))
  2263. info("Pending job %u on "
  2264. "block %s "
  2265. "will try to be requeued "
  2266. "because related block %s "
  2267. "is in an error state.",
  2268. job_ptr->job_id,
  2269. found_record->bg_block_id,
  2270. bg_record->bg_block_id);
  2271. else
  2272. info("Running job %u on "
  2273. "block %s "
  2274. "will try to be requeued "
  2275. "because related block %s "
  2276. "is being removed.",
  2277. job_ptr->job_id,
  2278. found_record->bg_block_id,
  2279. bg_record->bg_block_id);
  2280. /* This job will be requeued in the
  2281. free_block_list code below, just
  2282. make note of it here.
  2283. */
  2284. }
  2285. list_iterator_destroy(job_itr);
  2286. } else {
  2287. debug2("block %s is part of to be freed %s "
  2288. "but no running job",
  2289. found_record->bg_block_id,
  2290. bg_record->bg_block_id);
  2291. }
  2292. list_push(delete_list, found_record);
  2293. }
  2294. list_iterator_destroy(itr);
  2295. large_block:
2296. /* if we are removing a block, make sure we first put it
2297. back to a normal state in accounting */
  2298. itr = list_iterator_create(delete_list);
  2299. while ((found_record = list_next(itr))) {
  2300. if (found_record->state & BG_BLOCK_ERROR_FLAG)
  2301. resume_block(found_record);
  2302. }
  2303. list_iterator_destroy(itr);
  2304. slurm_mutex_unlock(&block_state_mutex);
  2305. free_block_list(NO_VAL, delete_list, 1, 0);
  2306. list_destroy(delete_list);
  2307. } else if (block_desc_ptr->state == BG_BLOCK_BOOTING) {
2308. /* This means recreate the block: remove it and then
2309. create it again.
  2310. */
2311. /* if we are removing a block, make sure we first put it
2312. back to a normal state in accounting */
  2313. if (bg_record->state & BG_BLOCK_ERROR_FLAG)
  2314. resume_block(bg_record);
  2315. term_jobs_on_block(bg_record->bg_block_id);
  2316. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
  2317. info("select_p_update_block: "
  2318. "freeing the block %s.", bg_record->bg_block_id);
2319. /* Increment free_cnt to make sure we don't lose this
  2320. * block since bg_free_block will unlock block_state_mutex.
  2321. */
  2322. bg_record->free_cnt++;
  2323. bg_free_block(bg_record, 1, 1);
  2324. bg_record->free_cnt--;
  2325. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
  2326. info("select_p_update_block: done");
  2327. /* Now remove it from the main list since we are
  2328. looking for a state change and it won't be caught
  2329. unless it is in the main list until now.
  2330. */
  2331. remove_from_bg_list(bg_lists->main, bg_record);
  2332. #if defined HAVE_BG_FILES
  2333. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
  2334. info("select_p_update_block: "
  2335. "removing %s from database",
  2336. bg_record->bg_block_id);
  2337. rc = bridge_block_remove(bg_record);
  2338. if (rc != SLURM_SUCCESS) {
  2339. if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
  2340. debug("select_p_update_block: "
  2341. "block %s is not found",
  2342. bg_record->bg_block_id);
  2343. } else {
  2344. error("select_p_update_block: "
  2345. "rm_remove_partition(%s): %s",
  2346. bg_record->bg_block_id,
  2347. bg_err_str(rc));
  2348. }
  2349. } else
  2350. if (bg_conf->slurm_debug_flags
  2351. & DEBUG_FLAG_SELECT_TYPE)
  2352. info("select_p_update_block: done %s",
  2353. (char *)bg_record->bg_block_id);
  2354. #endif
  2355. xfree(bg_record->bg_block_id);
  2356. if (bridge_block_create(bg_record) == SLURM_ERROR) {
  2357. destroy_bg_record(bg_record);
  2358. error("select_p_update_block: "
  2359. "unable to configure block in api");
  2360. } else {
  2361. print_bg_record(bg_record);
  2362. list_append(bg_lists->main, bg_record);
  2363. sort_bg_record_inc_size(bg_lists->main);
  2364. }
  2365. slurm_mutex_unlock(&block_state_mutex);
  2366. } else {
  2367. slurm_mutex_unlock(&block_state_mutex);
  2368. error("state is ? %s",
  2369. bg_block_state_string(block_desc_ptr->state));
  2370. return ESLURM_INVALID_NODE_STATE;
  2371. }
  2372. /* info("%s", reason); */
  2373. last_bg_update = time(NULL);
  2374. return rc;
  2375. #else
  2376. return SLURM_ERROR;
  2377. #endif
  2378. }
  2379. extern int select_p_update_sub_node (update_block_msg_t *block_desc_ptr)
  2380. {
  2381. #ifdef HAVE_BG
  2382. int rc = SLURM_SUCCESS;
  2383. int i = 0, j = 0;
  2384. char coord[SYSTEM_DIMENSIONS+1], *node_name = NULL;
  2385. char ionodes[128];
  2386. int set = 0;
  2387. double nc_pos = 0, last_pos = -1;
  2388. bitstr_t *ionode_bitmap = NULL;
  2389. char *name = NULL;
  2390. if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
  2391. info("You can't use this call unless you are on a Dynamically "
  2392. "allocated system. Please use update BlockName instead");
  2393. rc = ESLURM_INVALID_BLOCK_LAYOUT;
  2394. goto end_it;
  2395. }
  2396. memset(coord, 0, sizeof(coord));
  2397. memset(ionodes, 0, 128);
  2398. if (!block_desc_ptr->mp_str) {
  2399. error("update_sub_node: No name specified");
  2400. rc = ESLURM_INVALID_BLOCK_NAME;
  2401. goto end_it;
  2402. }
  2403. name = block_desc_ptr->mp_str;
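/* Parse mp_str, which is expected to contain the midplane coordinates
 * followed by a bracketed ionode range (for example, something of the
 * form 000[0-3]; the exact prefix depends on the configured node name
 * prefix). The coordinates go into coord[] and the ionode range into
 * ionodes[]. */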
  2404. while (name[j] != '\0') {
  2405. if (name[j] == '[') {
  2406. if (set<1) {
  2407. rc = SLURM_ERROR;
  2408. goto end_it;
  2409. }
  2410. i = j++;
  2411. if ((name[j] < '0'
  2412. || name[j] > 'Z'
  2413. || (name[j] > '9'
  2414. && name[j] < 'A'))) {
  2415. error("update_sub_node: sub block is empty");
  2416. rc = SLURM_ERROR;
  2417. goto end_it;
  2418. }
  2419. while (name[i] != '\0') {
  2420. if (name[i] == ']')
  2421. break;
  2422. i++;
  2423. }
  2424. if (name[i] != ']') {
  2425. error("update_sub_node: "
  2426. "No close (']') on sub block");
  2427. rc = SLURM_ERROR;
  2428. goto end_it;
  2429. }
  2430. strncpy(ionodes, name+j, i-j);
  2431. set++;
  2432. break;
  2433. } else if ((name[j] >= '0'
  2434. && name[j] <= '9')
  2435. || (name[j] >= 'A'
  2436. && name[j] <= 'Z')) {
  2437. if (set) {
  2438. rc = SLURM_ERROR;
  2439. goto end_it;
  2440. }
  2441. /* make sure we are asking for a correct name */
  2442. for(i = 0; i < SYSTEM_DIMENSIONS; i++) {
  2443. if ((name[j+i] >= '0'
  2444. && name[j+i] <= '9')
  2445. || (name[j+i] >= 'A'
  2446. && name[j+i] <= 'Z'))
  2447. continue;
  2448. error("update_sub_node: "
  2449. "misformatted name given %s",
  2450. name);
  2451. rc = SLURM_ERROR;
  2452. goto end_it;
  2453. }
  2454. strncpy(coord, name+j,
  2455. SYSTEM_DIMENSIONS);
  2456. j += SYSTEM_DIMENSIONS-1;
  2457. set++;
  2458. }
  2459. j++;
  2460. }
  2461. if (set != 2) {
  2462. error("update_sub_node: "
  2463. "I didn't get the base partition and the sub part.");
  2464. rc = SLURM_ERROR;
  2465. goto end_it;
  2466. }
  2467. ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
  2468. bit_unfmt(ionode_bitmap, ionodes);
  2469. if (bit_ffs(ionode_bitmap) == -1) {
  2470. error("update_sub_node: Invalid ionode '%s' given.", ionodes);
  2471. rc = SLURM_ERROR;
  2472. FREE_NULL_BITMAP(ionode_bitmap);
  2473. goto end_it;
  2474. }
  2475. node_name = xstrdup_printf("%s%s", bg_conf->slurm_node_prefix, coord);
  2476. /* find out how many nodecards to get for each ionode */
  2477. if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) {
  2478. info("Admin setting %s[%s] in an error state",
  2479. node_name, ionodes);
  2480. for(i = 0; i<bg_conf->ionodes_per_mp; i++) {
  2481. if (bit_test(ionode_bitmap, i)) {
  2482. if ((int)nc_pos != (int)last_pos) {
  2483. /* find first bit in nc */
  2484. int start_io =
  2485. (int)nc_pos * bg_conf->io_ratio;
  2486. down_nodecard(node_name, start_io,
  2487. 0, NULL);
  2488. last_pos = nc_pos;
  2489. }
  2490. }
  2491. nc_pos += bg_conf->nc_ratio;
  2492. }
  2493. } else if (block_desc_ptr->state == BG_BLOCK_FREE) {
  2494. info("Admin setting %s[%s] in an free state",
  2495. node_name, ionodes);
  2496. up_nodecard(node_name, ionode_bitmap);
  2497. } else {
  2498. error("update_sub_node: Unknown state %s",
  2499. bg_block_state_string(block_desc_ptr->state));
  2500. rc = ESLURM_INVALID_BLOCK_STATE;
  2501. }
  2502. FREE_NULL_BITMAP(ionode_bitmap);
  2503. xfree(node_name);
  2504. last_bg_update = time(NULL);
  2505. end_it:
  2506. return rc;
  2507. #else
  2508. return SLURM_ERROR;
  2509. #endif
  2510. }
2511. /* While the realtime server should get all the cnode state changes,
2512. * on older versions of the IBM driver this doesn't always happen
2513. * when a job has a timeout. So the runjob_mux will now send a
2514. * nice cancel to the slurmctld to make sure it gets marked.
  2515. */
  2516. extern int select_p_fail_cnode(struct step_record *step_ptr)
  2517. {
  2518. #if defined HAVE_BG && !defined HAVE_BG_L_P
  2519. bg_record_t *bg_record;
  2520. select_nodeinfo_t *nodeinfo;
  2521. select_jobinfo_t *jobinfo;
  2522. select_jobinfo_t *step_jobinfo;
  2523. struct node_record *node_ptr = NULL;
  2524. ListIterator itr, itr2;
  2525. ba_mp_t *ba_mp = NULL, *found_ba_mp;
  2526. int i;
  2527. xassert(step_ptr);
  2528. jobinfo = step_ptr->job_ptr->select_jobinfo->data;
  2529. step_jobinfo = step_ptr->select_jobinfo->data;
  2530. /* block_state must be locked before ba_system */
  2531. slurm_mutex_lock(&block_state_mutex);
  2532. slurm_mutex_lock(&ba_system_mutex);
  2533. for (i=0; i<bit_size(step_ptr->step_node_bitmap); i++) {
  2534. if (!bit_test(step_ptr->step_node_bitmap, i))
  2535. continue;
  2536. ba_mp = ba_inx2ba_mp(i);
  2537. xassert(ba_mp);
  2538. if (!ba_mp->cnode_err_bitmap)
  2539. ba_mp->cnode_err_bitmap =
  2540. bit_alloc(bg_conf->mp_cnode_cnt);
  2541. if (jobinfo->units_avail) {
  2542. bit_or(ba_mp->cnode_err_bitmap,
  2543. step_jobinfo->units_used);
  2544. } else {
  2545. bit_nset(ba_mp->cnode_err_bitmap, 0,
  2546. bit_size(ba_mp->cnode_err_bitmap)-1);
  2547. }
  2548. node_ptr = &(node_record_table_ptr[ba_mp->index]);
  2549. xassert(node_ptr->select_nodeinfo);
  2550. nodeinfo = (select_nodeinfo_t *)node_ptr->select_nodeinfo->data;
  2551. xassert(nodeinfo);
  2552. xfree(nodeinfo->failed_cnodes);
  2553. nodeinfo->failed_cnodes = ba_node_map_ranged_hostlist(
  2554. ba_mp->cnode_err_bitmap, ba_mp_geo_system);
  2555. }
  2556. if (!ba_mp) {
  2557. error("select_p_fail_cnode: no ba_mp? "
  2558. "This should never happen");
  2559. slurm_mutex_unlock(&ba_system_mutex);
  2560. slurm_mutex_unlock(&block_state_mutex);
  2561. return SLURM_ERROR;
  2562. }
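/* Now update every block that overlaps the failed midplanes:
 * accumulate the failed c-nodes into each block's cnode_err_bitmap
 * and recompute its error ratio. */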
  2563. itr = list_iterator_create(bg_lists->main);
  2564. while ((bg_record = (bg_record_t *)list_next(itr))) {
  2565. float err_ratio;
  2566. if (!bit_overlap(step_ptr->step_node_bitmap,
  2567. bg_record->mp_bitmap))
  2568. continue;
  2569. bg_record->cnode_err_cnt = 0;
  2570. itr2 = list_iterator_create(bg_record->ba_mp_list);
  2571. while ((found_ba_mp = (ba_mp_t *)list_next(itr2))) {
  2572. if (!found_ba_mp->used
  2573. || !bit_test(step_ptr->step_node_bitmap,
  2574. found_ba_mp->index))
  2575. continue;
  2576. /* perhaps this block isn't involved in this
  2577. error */
  2578. if (jobinfo->units_avail
  2579. && found_ba_mp->cnode_usable_bitmap
  2580. && bit_overlap(found_ba_mp->cnode_usable_bitmap,
  2581. ba_mp->cnode_err_bitmap))
  2582. continue;
  2583. if (!found_ba_mp->cnode_err_bitmap)
  2584. found_ba_mp->cnode_err_bitmap =
  2585. bit_alloc(bg_conf->mp_cnode_cnt);
  2586. bit_or(found_ba_mp->cnode_err_bitmap,
  2587. ba_mp->cnode_err_bitmap);
  2588. bg_record->cnode_err_cnt +=
  2589. bit_set_count(found_ba_mp->cnode_err_bitmap);
  2590. }
  2591. list_iterator_destroy(itr2);
  2592. err_ratio = (float)bg_record->cnode_err_cnt
  2593. / (float)bg_record->cnode_cnt;
  2594. bg_record->err_ratio = err_ratio * 100;
  2595. /* handle really small ratios */
  2596. if (!bg_record->err_ratio && bg_record->cnode_err_cnt)
  2597. bg_record->err_ratio = 1;
  2598. debug("select_p_fail_cnode: "
  2599. "count in error for %s is %u with ratio at %u",
  2600. bg_record->bg_block_id,
  2601. bg_record->cnode_err_cnt,
  2602. bg_record->err_ratio);
  2603. }
  2604. list_iterator_destroy(itr);
  2605. slurm_mutex_unlock(&ba_system_mutex);
  2606. slurm_mutex_unlock(&block_state_mutex);
  2607. #endif
  2608. return SLURM_SUCCESS;
  2609. }
  2610. extern int select_p_get_info_from_plugin (enum select_plugindata_info dinfo,
  2611. struct job_record *job_ptr,
  2612. void *data)
  2613. {
  2614. #ifdef HAVE_BG
  2615. uint16_t *tmp16 = (uint16_t *) data;
  2616. uint32_t *tmp32 = (uint32_t *) data;
  2617. List *tmp_list = (List *) data;
  2618. int rc = SLURM_SUCCESS;
  2619. switch(dinfo) {
  2620. case SELECT_CR_PLUGIN:
  2621. *tmp32 = 0;
  2622. break;
  2623. case SELECT_STATIC_PART:
  2624. if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
  2625. *tmp16 = 0;
  2626. else /* LAYOUT_STATIC || LAYOUT_OVERLAP */
  2627. *tmp16 = 1;
  2628. break;
  2629. case SELECT_CONFIG_INFO:
  2630. *tmp_list = _get_config();
  2631. break;
  2632. default:
  2633. error("select_p_get_info_from_plugin info %d invalid",
  2634. dinfo);
  2635. rc = SLURM_ERROR;
  2636. break;
  2637. }
  2638. return rc;
  2639. #else
  2640. return SLURM_ERROR;
  2641. #endif
  2642. }
  2643. extern int select_p_update_node_config (int index)
  2644. {
  2645. #ifdef HAVE_BG
  2646. return SLURM_SUCCESS;
  2647. #else
  2648. return SLURM_ERROR;
  2649. #endif
  2650. }
  2651. extern int select_p_update_node_state(struct node_record *node_ptr)
  2652. {
  2653. #ifdef HAVE_BG
  2654. ba_mp_t *curr_mp;
  2655. int rc = SLURM_SUCCESS;
  2656. xassert(node_ptr);
  2657. slurm_mutex_lock(&ba_system_mutex);
  2658. if ((curr_mp = str2ba_mp(node_ptr->name)))
  2659. ba_update_mp_state(curr_mp, node_ptr->node_state);
  2660. else
  2661. rc = SLURM_ERROR;
  2662. slurm_mutex_unlock(&ba_system_mutex);
  2663. return rc;
  2664. #else
  2665. return SLURM_ERROR;
  2666. #endif
  2667. }
  2668. extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data)
  2669. {
  2670. #ifdef HAVE_BG
  2671. job_desc_msg_t *job_desc = (job_desc_msg_t *)data;
  2672. uint16_t *cpus = (uint16_t *)data;
  2673. uint32_t *nodes = (uint32_t *)data, tmp = 0;
  2674. int i;
  2675. uint16_t req_geometry[SYSTEM_DIMENSIONS];
  2676. if (!bg_conf->mp_cnode_cnt) {
  2677. fatal("select_p_alter_node_cnt: This can't be called "
  2678. "before init");
  2679. }
  2680. switch (type) {
  2681. case SELECT_GET_NODE_SCALING:
  2682. if ((*nodes) != INFINITE) {
  2683. if (bg_conf->sub_mp_sys)
  2684. (*nodes) = bg_conf->actual_cnodes_per_mp;
  2685. else
  2686. (*nodes) = bg_conf->mp_cnode_cnt;
  2687. }
  2688. break;
  2689. case SELECT_GET_NODE_CPU_CNT:
  2690. if ((*cpus) != (uint16_t)INFINITE)
  2691. (*cpus) = bg_conf->cpu_ratio;
  2692. break;
  2693. case SELECT_GET_MP_CPU_CNT:
  2694. if ((*nodes) != INFINITE)
  2695. (*nodes) = bg_conf->cpus_per_mp;
  2696. break;
  2697. case SELECT_SET_MP_CNT:
  2698. if (((*nodes) == INFINITE) || ((*nodes) == NO_VAL))
  2699. tmp = (*nodes);
  2700. else if ((*nodes) > bg_conf->mp_cnode_cnt) {
  2701. tmp = (*nodes);
  2702. tmp /= bg_conf->mp_cnode_cnt;
  2703. if (tmp < 1)
  2704. tmp = 1;
  2705. } else
  2706. tmp = 1;
  2707. (*nodes) = tmp;
  2708. break;
  2709. case SELECT_APPLY_NODE_MIN_OFFSET:
  2710. if ((*nodes) == 1) {
  2711. /* Job will actually get more than one c-node,
2712. * but we can't be sure exactly how many, so we
  2713. * don't scale up this value. */
  2714. break;
  2715. }
  2716. if (bg_conf->sub_mp_sys)
  2717. (*nodes) = bg_conf->actual_cnodes_per_mp;
  2718. else
  2719. (*nodes) *= bg_conf->mp_cnode_cnt;
  2720. break;
  2721. case SELECT_APPLY_NODE_MAX_OFFSET:
  2722. if ((*nodes) != INFINITE) {
  2723. if (bg_conf->sub_mp_sys)
  2724. (*nodes) = bg_conf->actual_cnodes_per_mp;
  2725. else
  2726. (*nodes) *= bg_conf->mp_cnode_cnt;
  2727. }
  2728. break;
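/* SELECT_SET_NODE_CNT translates a job request expressed in c-nodes
 * into midplane units: min/max node counts are rounded up to whole
 * midplanes (or to an allowed small-block size below one midplane),
 * the original c-node count is stored via SELECT_JOBDATA_NODE_CNT, and
 * min/max cpus are derived from cpu_ratio / cpus_per_mp. As a rough
 * illustration (the numbers depend on the configured midplane size):
 * with 512 c-nodes per midplane, a request for 1024 c-nodes becomes
 * min_nodes = 2 midplanes and min_cpus = 2 * cpus_per_mp. */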
  2729. case SELECT_SET_NODE_CNT:
  2730. get_select_jobinfo(job_desc->select_jobinfo->data,
  2731. SELECT_JOBDATA_ALTERED, &tmp);
  2732. if (tmp == 1) {
  2733. return SLURM_SUCCESS;
  2734. }
  2735. tmp = 1;
  2736. set_select_jobinfo(job_desc->select_jobinfo->data,
  2737. SELECT_JOBDATA_ALTERED, &tmp);
  2738. if (job_desc->min_nodes == (uint32_t) NO_VAL)
  2739. return SLURM_SUCCESS;
  2740. else if ((job_desc->min_nodes == 1)
  2741. && (job_desc->min_cpus != NO_VAL)) {
  2742. job_desc->min_nodes = job_desc->min_cpus;
  2743. if (job_desc->ntasks_per_node
  2744. && job_desc->ntasks_per_node != NO_VAL)
  2745. job_desc->min_nodes /=
  2746. job_desc->ntasks_per_node;
  2747. }
  2748. get_select_jobinfo(job_desc->select_jobinfo->data,
  2749. SELECT_JOBDATA_GEOMETRY, &req_geometry);
  2750. if (req_geometry[0] != 0
  2751. && req_geometry[0] != (uint16_t)NO_VAL) {
  2752. job_desc->min_nodes = 1;
  2753. for (i=0; i<SYSTEM_DIMENSIONS; i++)
  2754. job_desc->min_nodes *=
  2755. (uint16_t)req_geometry[i];
  2756. job_desc->min_nodes *= bg_conf->mp_cnode_cnt;
  2757. job_desc->max_nodes = job_desc->min_nodes;
  2758. }
2759. /* if the user only specified min_cpus, make sure
2760. min_nodes is set correctly
  2761. */
  2762. if ((job_desc->min_cpus != NO_VAL)
  2763. && (job_desc->min_cpus > job_desc->min_nodes)) {
  2764. float tmp_float = (float)job_desc->min_cpus
  2765. / (float)bg_conf->cpu_ratio;
  2766. tmp = (uint32_t)tmp_float;
  2767. if (tmp_float != (float)tmp)
  2768. tmp++;
  2769. if (tmp > job_desc->min_nodes) {
  2770. /* This means they actually asked for
  2771. nodes and tasks.
  2772. */
  2773. if ((job_desc->max_nodes != NO_VAL)
  2774. && (tmp > job_desc->max_nodes)) {
  2775. #ifndef HAVE_BG_L_P
  2776. float divisor = 0;
  2777. /* ntasks_per_node should be
  2778. * validated beforehand. */
  2779. if (job_desc->ntasks_per_node
  2780. && (job_desc->ntasks_per_node
  2781. != NO_VAL))
  2782. divisor = (float)job_desc->
  2783. ntasks_per_node
  2784. / bg_conf->cpu_ratio;
  2785. /* On Q systems you can have 2
  2786. processes per thread */
  2787. if (!divisor || divisor > 2) {
  2788. error("Asking for more "
  2789. "resources than "
  2790. "possible. Denied.");
  2791. return SLURM_ERROR;
  2792. } else
  2793. tmp /= divisor;
  2794. #else
  2795. error("Asking for more resources than "
  2796. "possible. Requested %u nodes "
  2797. "and %u "
  2798. "tasks, giving them %u nodes.",
  2799. job_desc->min_nodes,
  2800. job_desc->min_cpus, tmp);
  2801. #endif
  2802. }
  2803. job_desc->min_nodes = tmp;
  2804. }
  2805. }
  2806. /* initialize min_cpus to the min_nodes */
  2807. job_desc->min_cpus = job_desc->min_nodes * bg_conf->cpu_ratio;
  2808. if ((job_desc->max_nodes == (uint32_t) NO_VAL)
  2809. || (job_desc->max_nodes < job_desc->min_nodes))
  2810. job_desc->max_nodes = job_desc->min_nodes;
  2811. /* See if min_nodes is greater than one base partition */
  2812. if (job_desc->min_nodes > bg_conf->mp_cnode_cnt) {
  2813. /*
2814. * if it is, make sure it is a multiple of
2815. * bg_conf->mp_cnode_cnt; if it isn't, round it
2816. * up to the next multiple
  2817. */
  2818. tmp = job_desc->min_nodes % bg_conf->mp_cnode_cnt;
  2819. if (tmp > 0)
  2820. job_desc->min_nodes +=
  2821. (bg_conf->mp_cnode_cnt-tmp);
  2822. }
  2823. tmp = job_desc->min_nodes / bg_conf->mp_cnode_cnt;
2824. /* this means it is greater than or equal to one mp */
  2825. if (tmp > 0) {
  2826. set_select_jobinfo(job_desc->select_jobinfo->data,
  2827. SELECT_JOBDATA_NODE_CNT,
  2828. &job_desc->min_nodes);
  2829. job_desc->min_nodes = tmp;
  2830. job_desc->min_cpus = bg_conf->cpus_per_mp * tmp;
  2831. } else {
  2832. #ifdef HAVE_BGL
  2833. if (job_desc->min_nodes <= bg_conf->nodecard_cnode_cnt
  2834. && bg_conf->nodecard_ionode_cnt)
  2835. job_desc->min_nodes =
  2836. bg_conf->nodecard_cnode_cnt;
  2837. else if (job_desc->min_nodes
  2838. <= bg_conf->quarter_cnode_cnt)
  2839. job_desc->min_nodes =
  2840. bg_conf->quarter_cnode_cnt;
  2841. else
  2842. job_desc->min_nodes =
  2843. bg_conf->mp_cnode_cnt;
  2844. set_select_jobinfo(job_desc->select_jobinfo->data,
  2845. SELECT_JOBDATA_NODE_CNT,
  2846. &job_desc->min_nodes);
  2847. tmp = bg_conf->mp_cnode_cnt/job_desc->min_nodes;
  2848. job_desc->min_cpus = bg_conf->cpus_per_mp/tmp;
  2849. job_desc->min_nodes = 1;
  2850. #else
2851. /* If sub-block allocations are allowed, then
2852. an allocation can be any size. If it doesn't line
  2853. up with a geometry it will be massaged later.
  2854. */
  2855. if (!bg_conf->sub_blocks) {
  2856. i = bg_conf->smallest_block;
  2857. while (i <= bg_conf->mp_cnode_cnt) {
  2858. if (job_desc->min_nodes <= i) {
  2859. job_desc->min_nodes = i;
  2860. break;
  2861. }
  2862. i *= 2;
  2863. }
  2864. }
  2865. set_select_jobinfo(job_desc->select_jobinfo->data,
  2866. SELECT_JOBDATA_NODE_CNT,
  2867. &job_desc->min_nodes);
  2868. job_desc->min_cpus = job_desc->min_nodes
  2869. * bg_conf->cpu_ratio;
  2870. job_desc->min_nodes = 1;
  2871. #endif
  2872. }
  2873. if (job_desc->max_nodes > bg_conf->mp_cnode_cnt) {
  2874. tmp = job_desc->max_nodes % bg_conf->mp_cnode_cnt;
  2875. if (tmp > 0)
  2876. job_desc->max_nodes +=
  2877. (bg_conf->mp_cnode_cnt-tmp);
  2878. }
  2879. tmp = job_desc->max_nodes / bg_conf->mp_cnode_cnt;
  2880. if (tmp > 0) {
  2881. job_desc->max_nodes = tmp;
  2882. job_desc->max_cpus =
  2883. job_desc->max_nodes * bg_conf->cpus_per_mp;
  2884. tmp = NO_VAL;
  2885. } else {
  2886. #ifdef HAVE_BGL
  2887. if (job_desc->max_nodes <= bg_conf->nodecard_cnode_cnt
  2888. && bg_conf->nodecard_ionode_cnt)
  2889. job_desc->max_nodes =
  2890. bg_conf->nodecard_cnode_cnt;
  2891. else if (job_desc->max_nodes
  2892. <= bg_conf->quarter_cnode_cnt)
  2893. job_desc->max_nodes =
  2894. bg_conf->quarter_cnode_cnt;
  2895. else
  2896. job_desc->max_nodes =
  2897. bg_conf->mp_cnode_cnt;
  2898. tmp = bg_conf->mp_cnode_cnt/job_desc->max_nodes;
  2899. job_desc->max_cpus = bg_conf->cpus_per_mp/tmp;
  2900. job_desc->max_nodes = 1;
  2901. #else
  2902. if (!bg_conf->sub_blocks) {
  2903. i = bg_conf->smallest_block;
  2904. while (i <= bg_conf->mp_cnode_cnt) {
  2905. if (job_desc->max_nodes <= i) {
  2906. job_desc->max_nodes = i;
  2907. break;
  2908. }
  2909. i *= 2;
  2910. }
  2911. }
  2912. job_desc->max_cpus =
  2913. job_desc->max_nodes * bg_conf->cpu_ratio;
  2914. job_desc->max_nodes = 1;
  2915. #endif
  2916. }
  2917. tmp = NO_VAL;
  2918. break;
  2919. default:
  2920. error("unknown option %d for alter_node_cnt", type);
  2921. }
  2922. return SLURM_SUCCESS;
  2923. #else
  2924. return SLURM_ERROR;
  2925. #endif
  2926. }
  2927. extern int select_p_reconfigure(void)
  2928. {
  2929. #ifdef HAVE_BG
  2930. slurm_conf_lock();
  2931. if (!slurmctld_conf.slurm_user_name
  2932. || strcmp(bg_conf->slurm_user_name, slurmctld_conf.slurm_user_name))
  2933. error("The slurm user has changed from '%s' to '%s'. "
  2934. "If this is really what you "
  2935. "want you will need to restart slurm for this "
  2936. "change to be enforced in the bluegene plugin.",
  2937. bg_conf->slurm_user_name, slurmctld_conf.slurm_user_name);
  2938. if (!slurmctld_conf.node_prefix
  2939. || strcmp(bg_conf->slurm_node_prefix, slurmctld_conf.node_prefix))
  2940. error("Node Prefix has changed from '%s' to '%s'. "
  2941. "If this is really what you "
  2942. "want you will need to restart slurm for this "
  2943. "change to be enforced in the bluegene plugin.",
  2944. bg_conf->slurm_node_prefix, slurmctld_conf.node_prefix);
  2945. bg_conf->slurm_debug_flags = slurmctld_conf.debug_flags;
  2946. bg_conf->slurm_debug_level = slurmctld_conf.slurmctld_debug;
  2947. set_ba_debug_flags(bg_conf->slurm_debug_flags);
  2948. slurm_conf_unlock();
  2949. return SLURM_SUCCESS;
  2950. #else
  2951. return SLURM_ERROR;
  2952. #endif
  2953. }
  2954. extern bitstr_t *select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt,
  2955. bitstr_t **core_bitmap)
  2956. {
  2957. #ifdef HAVE_BG
  2958. /* Reserve a block of appropriate geometry by issuing a fake job
  2959. * WILL_RUN call */
  2960. int i, rc;
  2961. uint32_t tmp_u32;
  2962. uint16_t conn_type[SYSTEM_DIMENSIONS];
  2963. uint16_t geo[SYSTEM_DIMENSIONS];
  2964. uint16_t reboot = 0;
  2965. uint16_t rotate = 1;
  2966. List preemptee_candidates, preemptee_job_list;
  2967. struct job_record job_rec;
  2968. bitstr_t *tmp_bitmap;
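/* Build a throw-away job record requesting node_cnt nodes with NAV
 * connection type and no fixed geometry, then ask submit_job() in
 * WILL_RUN mode whether the reservation fits within avail_bitmap. */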
  2969. memset(&job_rec, 0, sizeof(struct job_record));
  2970. job_rec.details = xmalloc(sizeof(struct job_details));
  2971. job_rec.select_jobinfo = select_g_select_jobinfo_alloc();
  2972. tmp_u32 = 1;
  2973. set_select_jobinfo(job_rec.select_jobinfo->data,
  2974. SELECT_JOBDATA_ALTERED, &tmp_u32);
  2975. set_select_jobinfo(job_rec.select_jobinfo->data,
  2976. SELECT_JOBDATA_NODE_CNT, &node_cnt);
  2977. for (i = 0; i < SYSTEM_DIMENSIONS; i++) {
  2978. conn_type[i] = SELECT_NAV;
  2979. geo[i] = 0;
  2980. }
  2981. select_g_select_jobinfo_set(job_rec.select_jobinfo,
  2982. SELECT_JOBDATA_GEOMETRY, &geo);
  2983. select_g_select_jobinfo_set(job_rec.select_jobinfo,
  2984. SELECT_JOBDATA_CONN_TYPE, &conn_type);
  2985. select_g_select_jobinfo_set(job_rec.select_jobinfo,
  2986. SELECT_JOBDATA_REBOOT, &reboot);
  2987. select_g_select_jobinfo_set(job_rec.select_jobinfo,
  2988. SELECT_JOBDATA_ROTATE, &rotate);
  2989. job_rec.details->min_cpus = node_cnt * bg_conf->cpus_per_mp;
  2990. job_rec.details->max_cpus = job_rec.details->min_cpus;
  2991. tmp_bitmap = bit_copy(avail_bitmap);
  2992. preemptee_candidates = list_create(NULL);
  2993. if (preemptee_candidates == NULL)
  2994. fatal("list_create: malloc failure");
  2995. rc = submit_job(&job_rec, tmp_bitmap, node_cnt, node_cnt, node_cnt,
  2996. SELECT_MODE_WILL_RUN, preemptee_candidates,
  2997. &preemptee_job_list);
  2998. list_destroy(preemptee_candidates);
  2999. xfree(job_rec.details);
  3000. select_g_select_jobinfo_free(job_rec.select_jobinfo);
  3001. if (rc == SLURM_SUCCESS) {
  3002. char *resv_nodes = bitmap2node_name(tmp_bitmap);
  3003. info("Reservation request for %u nodes satisfied with %s",
  3004. node_cnt, resv_nodes);
  3005. xfree(resv_nodes);
  3006. return tmp_bitmap;
  3007. } else {
  3008. info("Reservation request for %u nodes failed", node_cnt);
  3009. FREE_NULL_BITMAP(tmp_bitmap);
  3010. }
  3011. #endif
  3012. return NULL;
  3013. }
  3014. extern void select_p_ba_init(node_info_msg_t *node_info_ptr, bool sanity_check)
  3015. {
  3016. ba_init(node_info_ptr, sanity_check);
  3017. }
  3018. extern void select_p_ba_fini(void)
  3019. {
  3020. ba_fini();
  3021. }
  3022. extern int *select_p_ba_get_dims(void)
  3023. {
  3024. #ifdef HAVE_BG
  3025. return DIM_SIZE;
  3026. #else
  3027. return NULL;
  3028. #endif
  3029. }