PageRenderTime 66ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/src/plugins/select/bluegene/select_bluegene.c

https://github.com/cfenoy/slurm
C | 3388 lines | 2622 code | 355 blank | 411 comment | 569 complexity | 81364133d6bb41bb0dd94aaffd03c13c MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0

Large files are truncated, but you can click here to view the full file

  1. /*****************************************************************************\
  2. * select_bluegene.c - node selection plugin for Blue Gene system.
  3. *****************************************************************************
  4. * Copyright (C) 2004-2007 The Regents of the University of California.
  5. * Copyright (C) 2008-2011 Lawrence Livermore National Security.
  6. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  7. * Written by Dan Phung <phung4@llnl.gov> Danny Auble <da@llnl.gov>
  8. * CODE-OCEC-09-009. All rights reserved.
  9. *
  10. * This file is part of SLURM, a resource management program.
  11. * For details, see <http://www.schedmd.com/slurmdocs/>.
  12. * Please also read the included file: DISCLAIMER.
  13. *
  14. * SLURM is free software; you can redistribute it and/or modify it under
  15. * the terms of the GNU General Public License as published by the Free
  16. * Software Foundation; either version 2 of the License, or (at your option)
  17. * any later version.
  18. *
  19. * In addition, as a special exception, the copyright holders give permission
  20. * to link the code of portions of this program with the OpenSSL library under
  21. * certain conditions as described in each individual source file, and
  22. * distribute linked combinations including the two. You must obey the GNU
  23. * General Public License in all respects for all of the code used other than
  24. * OpenSSL. If you modify file(s) with this exception, you may extend this
  25. * exception to your version of the file(s), but you are not obligated to do
  26. * so. If you do not wish to do so, delete this exception statement from your
  27. * version. If you delete this exception statement from all source files in
  28. * the program, then also delete it here.
  29. *
  30. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  31. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  32. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  33. * details.
  34. *
  35. * You should have received a copy of the GNU General Public License along
  36. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  37. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  38. \*****************************************************************************/
  39. #include "src/common/slurm_xlator.h"
  40. #include "bg_core.h"
  41. #include "bg_read_config.h"
  42. #include "bg_defined_block.h"
  43. #ifndef HAVE_BG_L_P
  44. # include "ba_bgq/block_allocator.h"
  45. #else
  46. # include "ba/block_allocator.h"
  47. #endif
  48. #include "src/slurmctld/trigger_mgr.h"
  49. #include <fcntl.h>
  50. #define HUGE_BUF_SIZE (1024*16)
  51. /* These are defined here so when we link with something other than
  52. * the slurmctld we will have these symbols defined. They will get
  53. * overwritten when linking with the slurmctld.
  54. */
  55. #if defined (__APPLE__)
  56. slurmctld_config_t slurmctld_config __attribute__((weak_import));
  57. slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
  58. struct node_record *node_record_table_ptr __attribute__((weak_import)) = NULL;
  59. int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER;
  60. List part_list __attribute__((weak_import)) = NULL;
  61. int node_record_count __attribute__((weak_import));
  62. time_t last_node_update __attribute__((weak_import));
  63. time_t last_job_update __attribute__((weak_import));
  64. char *alpha_num __attribute__((weak_import)) =
  65. "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  66. void *acct_db_conn __attribute__((weak_import)) = NULL;
  67. char *slurmctld_cluster_name __attribute__((weak_import)) = NULL;
  68. slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL;
  69. #else
  70. slurmctld_config_t slurmctld_config;
  71. slurm_ctl_conf_t slurmctld_conf;
  72. struct node_record *node_record_table_ptr = NULL;
  73. int bg_recover = NOT_FROM_CONTROLLER;
  74. List part_list = NULL;
  75. int node_record_count;
  76. time_t last_node_update;
  77. time_t last_job_update;
  78. char *alpha_num = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  79. void *acct_db_conn = NULL;
  80. char *slurmctld_cluster_name = NULL;
  81. slurmdb_cluster_rec_t *working_cluster_rec = NULL;
  82. #endif
  83. /*
  84. * These variables are required by the generic plugin interface. If they
  85. * are not found in the plugin, the plugin loader will ignore it.
  86. *
  87. * plugin_name - a string giving a human-readable description of the
  88. * plugin. There is no maximum length, but the symbol must refer to
  89. * a valid string.
  90. *
  91. * plugin_type - a string suggesting the type of the plugin or its
  92. * applicability to a particular form of data or method of data handling.
  93. * If the low-level plugin API is used, the contents of this string are
  94. * unimportant and may be anything. SLURM uses the higher-level plugin
  95. * interface which requires this string to be of the form
  96. *
  97. * <application>/<method>
  98. *
  99. * where <application> is a description of the intended application of
  100. * the plugin (e.g., "select" for SLURM node selection) and <method>
  101. * is a description of how this plugin satisfies that application. SLURM will
  102. * only load select plugins if the plugin_type string has a
  103. * prefix of "select/".
  104. *
  105. * plugin_version - an unsigned 32-bit integer giving the version number
  106. * of the plugin. If major and minor revisions are desired, the major
  107. * version number may be multiplied by a suitable magnitude constant such
  108. * as 100 or 1000. Various SLURM versions will likely require a certain
  109. * minimum version for their plugins as the node selection API matures.
  110. */
  111. const char plugin_name[] = "BlueGene node selection plugin";
  112. const char plugin_type[] = "select/bluegene";
  113. const uint32_t plugin_id = 100;
  114. const uint32_t plugin_version = 200;
  115. /* Global variables */
  116. bg_config_t *bg_conf = NULL;
  117. bg_lists_t *bg_lists = NULL;
  118. time_t last_bg_update;
  119. pthread_mutex_t block_state_mutex = PTHREAD_MUTEX_INITIALIZER;
  120. int blocks_are_created = 0;
  121. int num_unused_cpus = 0;
  122. int num_possible_unused_cpus = 0;
  123. slurmctld_lock_t job_read_lock = {
  124. NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
  125. extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data);
  126. static void _destroy_bg_config(bg_config_t *bg_conf)
  127. {
  128. if (bg_conf) {
  129. if (bg_conf->blrts_list) {
  130. list_destroy(bg_conf->blrts_list);
  131. bg_conf->blrts_list = NULL;
  132. }
  133. xfree(bg_conf->bridge_api_file);
  134. xfree(bg_conf->default_blrtsimage);
  135. xfree(bg_conf->default_linuximage);
  136. xfree(bg_conf->default_mloaderimage);
  137. xfree(bg_conf->default_ramdiskimage);
  138. if (bg_conf->linux_list) {
  139. list_destroy(bg_conf->linux_list);
  140. bg_conf->linux_list = NULL;
  141. }
  142. if (bg_conf->mloader_list) {
  143. list_destroy(bg_conf->mloader_list);
  144. bg_conf->mloader_list = NULL;
  145. }
  146. if (bg_conf->ramdisk_list) {
  147. list_destroy(bg_conf->ramdisk_list);
  148. bg_conf->ramdisk_list = NULL;
  149. }
  150. xfree(bg_conf->slurm_user_name);
  151. xfree(bg_conf->slurm_node_prefix);
  152. xfree(bg_conf);
  153. }
  154. }
  155. static void _destroy_bg_lists(bg_lists_t *bg_lists)
  156. {
  157. if (bg_lists) {
  158. if (bg_lists->booted) {
  159. list_destroy(bg_lists->booted);
  160. bg_lists->booted = NULL;
  161. }
  162. if (bg_lists->job_running) {
  163. list_destroy(bg_lists->job_running);
  164. bg_lists->job_running = NULL;
  165. num_unused_cpus = 0;
  166. }
  167. if (bg_lists->main) {
  168. list_destroy(bg_lists->main);
  169. bg_lists->main = NULL;
  170. }
  171. if (bg_lists->valid_small32) {
  172. list_destroy(bg_lists->valid_small32);
  173. bg_lists->valid_small32 = NULL;
  174. }
  175. if (bg_lists->valid_small64) {
  176. list_destroy(bg_lists->valid_small64);
  177. bg_lists->valid_small64 = NULL;
  178. }
  179. if (bg_lists->valid_small128) {
  180. list_destroy(bg_lists->valid_small128);
  181. bg_lists->valid_small128 = NULL;
  182. }
  183. if (bg_lists->valid_small256) {
  184. list_destroy(bg_lists->valid_small256);
  185. bg_lists->valid_small256 = NULL;
  186. }
  187. xfree(bg_lists);
  188. }
  189. }
  190. #ifdef HAVE_BG
  191. static int _delete_old_blocks(List curr_block_list, List found_block_list)
  192. {
  193. ListIterator itr_curr, itr_found;
  194. bg_record_t *found_record = NULL, *init_record = NULL;
  195. List destroy_list = list_create(NULL);
  196. xassert(curr_block_list);
  197. xassert(found_block_list);
  198. slurm_mutex_lock(&block_state_mutex);
  199. if (!bg_recover) {
  200. info("removing all current blocks (clean start)");
  201. itr_curr = list_iterator_create(curr_block_list);
  202. while ((init_record = list_next(itr_curr))) {
  203. list_remove(itr_curr);
  204. init_record->modifying = 0;
  205. /* The block needs to exist in the main list
  206. * just to make sure we query the state. */
  207. if (!(found_record = find_bg_record_in_list(
  208. bg_lists->main,
  209. init_record->bg_block_id)))
  210. list_push(bg_lists->main, init_record);
  211. else {
  212. destroy_bg_record(init_record);
  213. init_record = found_record;
  214. }
  215. /* Make sure this block isn't in an
  216. error state since if it is it won't
  217. disappear. */
  218. if (init_record->state & BG_BLOCK_ERROR_FLAG)
  219. resume_block(init_record);
  220. list_push(destroy_list, init_record);
  221. }
  222. list_iterator_destroy(itr_curr);
  223. } else {
  224. info("removing unspecified blocks");
  225. itr_curr = list_iterator_create(curr_block_list);
  226. while ((init_record = list_next(itr_curr))) {
  227. itr_found = list_iterator_create(found_block_list);
  228. while ((found_record = list_next(itr_found))) {
  229. if (!strcmp(init_record->bg_block_id,
  230. found_record->bg_block_id)) {
  231. /* don't delete this one */
  232. break;
  233. }
  234. }
  235. list_iterator_destroy(itr_found);
  236. if (found_record == NULL) {
  237. list_remove(itr_curr);
  238. init_record->modifying = 0;
  239. /* The block needs to exist in the main list
  240. * just to make sure we query the state. */
  241. if (!(found_record = find_bg_record_in_list(
  242. bg_lists->main,
  243. init_record->bg_block_id)))
  244. list_push(bg_lists->main, init_record);
  245. else {
  246. destroy_bg_record(init_record);
  247. init_record = found_record;
  248. }
  249. /* Make sure this block isn't in an
  250. error state since if it is it won't
  251. disappear. */
  252. if (init_record->state & BG_BLOCK_ERROR_FLAG)
  253. resume_block(init_record);
  254. /* Since we can't requeue a running
  255. job in the free block function (not
  256. thread safe here) we must do it
  257. now.
  258. */
  259. if ((init_record->job_running > NO_JOB_RUNNING)
  260. || init_record->job_ptr) {
  261. /* Don't worry about dealing
  262. with this job here. Trying
  263. to requeue/cancel now will
  264. cause a race condition
  265. locking up the slurmctld.
  266. It will be handled when the
  267. blocks are synced. This
  268. should only happen if the
  269. bluegene.conf gets changed
  270. and jobs are running on
  271. blocks that don't exist in
  272. the new config (hopefully
  273. rarely).
  274. */
  275. init_record->job_running =
  276. NO_JOB_RUNNING;
  277. init_record->job_ptr = NULL;
  278. } else if (init_record->job_list &&
  279. list_count(init_record->job_list))
  280. list_flush(init_record->job_list);
  281. list_push(destroy_list, init_record);
  282. }
  283. }
  284. list_iterator_destroy(itr_curr);
  285. }
  286. slurm_mutex_unlock(&block_state_mutex);
  287. free_block_list(NO_VAL, destroy_list, 1, 0);
  288. list_destroy(destroy_list);
  289. return SLURM_SUCCESS;
  290. }
  291. static void _set_bg_lists()
  292. {
  293. if (!bg_lists)
  294. bg_lists = xmalloc(sizeof(bg_lists_t));
  295. slurm_mutex_lock(&block_state_mutex);
  296. if (bg_lists->booted)
  297. list_destroy(bg_lists->booted);
  298. bg_lists->booted = list_create(NULL);
  299. if (bg_lists->job_running)
  300. list_destroy(bg_lists->job_running);
  301. bg_lists->job_running = list_create(NULL);
  302. if (bg_lists->main)
  303. list_destroy(bg_lists->main);
  304. bg_lists->main = list_create(destroy_bg_record);
  305. slurm_mutex_unlock(&block_state_mutex);
  306. }
  307. static bg_record_t *_translate_info_2_record(block_info_t *block_info)
  308. {
  309. bg_record_t *bg_record = NULL;
  310. bitstr_t *mp_bitmap = NULL, *ionode_bitmap = NULL;
  311. mp_bitmap = bit_alloc(node_record_count);
  312. ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
  313. if (block_info->mp_inx
  314. && inx2bitstr(mp_bitmap, block_info->mp_inx) == -1)
  315. error("Job state recovered incompatible with "
  316. "bluegene.conf. mp=%u",
  317. node_record_count);
  318. if (block_info->ionode_inx
  319. && inx2bitstr(ionode_bitmap, block_info->ionode_inx) == -1)
  320. error("Job state recovered incompatible with "
  321. "bluegene.conf. ionodes=%u",
  322. bg_conf->ionodes_per_mp);
  323. bg_record = xmalloc(sizeof(bg_record_t));
  324. bg_record->magic = BLOCK_MAGIC;
  325. bg_record->bg_block_id = block_info->bg_block_id;
  326. block_info->bg_block_id = NULL;
  327. bg_record->mp_str = block_info->mp_str;
  328. block_info->mp_str = NULL;
  329. bg_record->ionode_bitmap = ionode_bitmap;
  330. ionode_bitmap = NULL;
  331. if (block_info->ionode_str) {
  332. ba_set_ionode_str(bg_record);
  333. if (!bg_record->ionode_str
  334. || strcmp(block_info->ionode_str, bg_record->ionode_str)) {
  335. error("block %s didn't compute with the correct "
  336. "ionode_str. Stored as '%s' and "
  337. "came back as '%s'",
  338. bg_record->bg_block_id,
  339. block_info->ionode_str, bg_record->ionode_str);
  340. }
  341. }
  342. bg_record->mp_bitmap = mp_bitmap;
  343. mp_bitmap = NULL;
  344. /* put_block_in_error_state should be
  345. called after the bg_lists->main has been
  346. made. We can't call it here since
  347. this record isn't the record kept
  348. around in bg_lists->main.
  349. */
  350. bg_record->state = block_info->state;
  351. bg_record->cnode_cnt = block_info->cnode_cnt;
  352. bg_record->mp_count = bit_set_count(bg_record->mp_bitmap);
  353. /* Don't copy the job_list from the block_info, we will fill
  354. it in later in the job sync.
  355. */
  356. bg_record->job_running = NO_JOB_RUNNING;
  357. if (bg_conf->sub_blocks && (bg_record->mp_count == 1))
  358. bg_record->job_list = list_create(NULL);
  359. #ifdef HAVE_BGL
  360. bg_record->node_use = block_info->node_use;
  361. #endif
  362. memcpy(bg_record->conn_type, block_info->conn_type,
  363. sizeof(bg_record->conn_type));
  364. bg_record->blrtsimage = block_info->blrtsimage;
  365. block_info->blrtsimage = NULL;
  366. bg_record->linuximage = block_info->linuximage;
  367. block_info->linuximage = NULL;
  368. bg_record->mloaderimage = block_info->mloaderimage;
  369. block_info->mloaderimage = NULL;
  370. bg_record->ramdiskimage = block_info->ramdiskimage;
  371. block_info->ramdiskimage = NULL;
  372. bg_record->reason = block_info->reason;
  373. block_info->reason = NULL;
  374. slurm_free_block_info_members(block_info);
  375. return bg_record;
  376. }
  377. static void _local_pack_block_job_info(struct job_record *job_ptr, Buf buffer,
  378. uint16_t protocol_version)
  379. {
  380. block_job_info_t block_job;
  381. select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data;
  382. memset(&block_job, 0, sizeof(block_job_info_t));
  383. block_job.job_id = job_ptr->job_id;
  384. block_job.user_id = job_ptr->user_id;
  385. if (jobinfo) {
  386. block_job.user_name = jobinfo->user_name;
  387. block_job.cnodes = jobinfo->ionode_str;
  388. } else
  389. error("NO JOBINFO for job %u magic %u!!!!!!!!!!!!!!",
  390. job_ptr->job_id, job_ptr->magic);
  391. /* block_job.cnode_inx -- try not to set */
  392. slurm_pack_block_job_info(&block_job, buffer, protocol_version);
  393. }
  394. /* Pack all relevent information about a block */
  395. /* NOTE: There is a matching pack function in
  396. * common/slurm_protocol_pack.c dealing with the block_info_t
  397. * structure there. If anything changes here please update that as well.
  398. * The unpack for this is in common/slurm_protocol_pack.c
  399. */
  400. static void _pack_block(bg_record_t *bg_record, Buf buffer,
  401. uint16_t protocol_version)
  402. {
  403. #ifdef HAVE_BGQ
  404. int dim;
  405. #endif
  406. uint32_t count = NO_VAL, running_job = 0;
  407. struct job_record *job_ptr;
  408. ListIterator itr;
  409. if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) {
  410. packstr(bg_record->bg_block_id, buffer);
  411. packstr(bg_record->blrtsimage, buffer);
  412. pack_bit_fmt(bg_record->mp_bitmap, buffer);
  413. #ifdef HAVE_BGQ
  414. pack32(SYSTEM_DIMENSIONS, buffer);
  415. for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
  416. pack16(bg_record->conn_type[dim], buffer);
  417. #else
  418. pack32(1, buffer); /* for dimensions of conn_type */
  419. pack16(bg_record->conn_type[0], buffer);
  420. #endif
  421. packstr(bg_record->ionode_str, buffer);
  422. pack_bit_fmt(bg_record->ionode_bitmap, buffer);
  423. if (bg_record->job_list)
  424. count = list_count(bg_record->job_list);
  425. if (count && count != NO_VAL) {
  426. pack32(count, buffer);
  427. itr = list_iterator_create(bg_record->job_list);
  428. while ((job_ptr = list_next(itr))) {
  429. if (job_ptr->magic != JOB_MAGIC) {
  430. error("_pack_block: "
  431. "bad magic found when "
  432. "packing block %s",
  433. bg_record->bg_block_id);
  434. list_delete_item(itr);
  435. slurm_pack_block_job_info(
  436. NULL, buffer,
  437. protocol_version);
  438. continue;
  439. }
  440. _local_pack_block_job_info(
  441. job_ptr, buffer, protocol_version);
  442. }
  443. list_iterator_destroy(itr);
  444. } else if (bg_record->job_ptr
  445. && (bg_record->job_ptr->magic == JOB_MAGIC)) {
  446. pack32(1, buffer);
  447. _local_pack_block_job_info(
  448. bg_record->job_ptr, buffer, protocol_version);
  449. } else
  450. pack32(count, buffer);
  451. count = NO_VAL;
  452. packstr(bg_record->linuximage, buffer);
  453. packstr(bg_record->mloaderimage, buffer);
  454. packstr(bg_record->mp_str, buffer);
  455. pack32(bg_record->cnode_cnt, buffer);
  456. pack32(bg_record->cnode_err_cnt, buffer);
  457. pack16((uint16_t)bg_record->node_use, buffer);
  458. packstr(bg_record->ramdiskimage, buffer);
  459. packstr(bg_record->reason, buffer);
  460. pack16((uint16_t)bg_record->state, buffer);
  461. } else if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
  462. packstr(bg_record->bg_block_id, buffer);
  463. packstr(bg_record->blrtsimage, buffer);
  464. pack_bit_fmt(bg_record->mp_bitmap, buffer);
  465. #ifdef HAVE_BGQ
  466. pack32(SYSTEM_DIMENSIONS, buffer);
  467. for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
  468. pack16(bg_record->conn_type[dim], buffer);
  469. #else
  470. pack32(1, buffer); /* for dimensions of conn_type */
  471. pack16(bg_record->conn_type[0], buffer);
  472. #endif
  473. packstr(bg_record->ionode_str, buffer);
  474. pack_bit_fmt(bg_record->ionode_bitmap, buffer);
  475. if (bg_record->job_list)
  476. count = list_count(bg_record->job_list);
  477. pack32(count, buffer);
  478. if (count && count != NO_VAL) {
  479. itr = list_iterator_create(bg_record->job_list);
  480. while ((job_ptr = list_next(itr))) {
  481. if (job_ptr->magic != JOB_MAGIC) {
  482. error("_pack_block 2.3: "
  483. "bad magic found when "
  484. "packing block %s",
  485. bg_record->bg_block_id);
  486. list_delete_item(itr);
  487. continue;
  488. }
  489. _local_pack_block_job_info(
  490. job_ptr, buffer, protocol_version);
  491. }
  492. list_iterator_destroy(itr);
  493. }
  494. if ((count == 1) && running_job)
  495. pack32((uint32_t)running_job, buffer);
  496. else
  497. pack32((uint32_t)bg_record->job_running, buffer);
  498. count = NO_VAL;
  499. packstr(bg_record->linuximage, buffer);
  500. packstr(bg_record->mloaderimage, buffer);
  501. packstr(bg_record->mp_str, buffer);
  502. packnull(buffer); /* for mp_used_str */
  503. pack32((uint32_t)bg_record->cnode_cnt, buffer);
  504. pack16((uint16_t)bg_record->node_use, buffer);
  505. packnull(buffer); /* for user_name */
  506. packstr(bg_record->ramdiskimage, buffer);
  507. packstr(bg_record->reason, buffer);
  508. pack16((uint16_t)bg_record->state, buffer);
  509. packnull(buffer); /* for mp_used_inx */
  510. }
  511. }
  512. /* Pack all extra information about a block (Only needed for saving state.) */
  513. static void _pack_block_ext(bg_record_t *bg_record, Buf buffer,
  514. uint16_t protocol_version)
  515. {
  516. ListIterator itr;
  517. ba_mp_t *ba_mp;
  518. uint32_t count = NO_VAL;
  519. int i;
  520. xassert(bg_record);
  521. if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
  522. if (bg_record->ba_mp_list)
  523. count = list_count(bg_record->ba_mp_list);
  524. pack32(count, buffer);
  525. if (count && count != NO_VAL) {
  526. itr = list_iterator_create(bg_record->ba_mp_list);
  527. while ((ba_mp = list_next(itr)))
  528. pack_ba_mp(ba_mp, buffer, protocol_version);
  529. list_iterator_destroy(itr);
  530. }
  531. pack32(bg_record->cpu_cnt, buffer);
  532. for (i=0; i<SYSTEM_DIMENSIONS; i++) {
  533. pack16(bg_record->geo[i], buffer);
  534. pack16(bg_record->start[i], buffer);
  535. }
  536. pack16(bg_record->full_block, buffer);
  537. pack32(bg_record->switch_count, buffer);
  538. } else {
  539. /* didn't exist before 2.3 */
  540. }
  541. }
  542. /* UNPack all extra information about a block */
  543. static int _unpack_block_ext(bg_record_t *bg_record, Buf buffer,
  544. uint16_t protocol_version)
  545. {
  546. ba_mp_t *ba_mp;
  547. uint32_t count = NO_VAL;
  548. int i;
  549. uint16_t temp16;
  550. xassert(bg_record);
  551. if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
  552. safe_unpack32(&count, buffer);
  553. if (count == NO_VAL) {
  554. error("_unpack_block_ext: bg_record record has no "
  555. "mp_list");
  556. goto unpack_error;
  557. }
  558. bg_record->ba_mp_list = list_create(destroy_ba_mp);
  559. for (i=0; i<count; i++) {
  560. if (unpack_ba_mp(&ba_mp, buffer, protocol_version)
  561. == SLURM_ERROR)
  562. goto unpack_error;
  563. list_append(bg_record->ba_mp_list, ba_mp);
  564. }
  565. safe_unpack32(&bg_record->cpu_cnt, buffer);
  566. for (i=0; i<SYSTEM_DIMENSIONS; i++) {
  567. safe_unpack16(&bg_record->geo[i], buffer);
  568. safe_unpack16(&bg_record->start[i], buffer);
  569. }
  570. safe_unpack16(&temp16, buffer);
  571. bg_record->full_block = temp16;
  572. safe_pack32(bg_record->switch_count, buffer);
  573. } else {
  574. /* packing didn't exist before 2.3, so set things up
  575. * to go forward */
  576. if (bg_conf->mp_cnode_cnt > bg_record->cnode_cnt) {
  577. bg_record->cpu_cnt = bg_conf->cpus_per_mp /
  578. (bg_conf->mp_cnode_cnt / bg_record->cnode_cnt);
  579. } else {
  580. bg_record->cpu_cnt = bg_conf->cpus_per_mp
  581. * bg_record->mp_count;
  582. }
  583. process_nodes(bg_record, true);
  584. }
  585. return SLURM_SUCCESS;
  586. unpack_error:
  587. error("Problem unpacking extended block info for %s, "
  588. "removing from list",
  589. bg_record->bg_block_id);
  590. return SLURM_ERROR;
  591. }
  592. static int _load_state_file(List curr_block_list, char *dir_name)
  593. {
  594. int state_fd, i;
  595. char *state_file = NULL;
  596. Buf buffer = NULL;
  597. char *data = NULL;
  598. int data_size = 0;
  599. block_info_msg_t *block_ptr = NULL;
  600. bg_record_t *bg_record = NULL;
  601. char temp[256];
  602. List results = NULL;
  603. int data_allocated, data_read = 0;
  604. char *ver_str = NULL;
  605. uint32_t ver_str_len;
  606. char *name = NULL;
  607. struct part_record *part_ptr = NULL;
  608. bitstr_t *usable_mp_bitmap = NULL;
  609. ListIterator itr = NULL;
  610. uint16_t protocol_version = (uint16_t)NO_VAL;
  611. uint32_t record_count;
  612. xassert(curr_block_list);
  613. xassert(dir_name);
  614. state_file = xstrdup(dir_name);
  615. xstrcat(state_file, "/block_state");
  616. state_fd = open(state_file, O_RDONLY);
  617. if (state_fd < 0) {
  618. error("No block state file (%s) to recover", state_file);
  619. xfree(state_file);
  620. return SLURM_SUCCESS;
  621. } else {
  622. data_allocated = BUF_SIZE;
  623. data = xmalloc(data_allocated);
  624. while (1) {
  625. data_read = read(state_fd, &data[data_size],
  626. BUF_SIZE);
  627. if (data_read < 0) {
  628. if (errno == EINTR)
  629. continue;
  630. else {
  631. error("Read error on %s: %m",
  632. state_file);
  633. break;
  634. }
  635. } else if (data_read == 0) /* eof */
  636. break;
  637. data_size += data_read;
  638. data_allocated += data_read;
  639. xrealloc(data, data_allocated);
  640. }
  641. close(state_fd);
  642. }
  643. xfree(state_file);
  644. buffer = create_buf(data, data_size);
  645. safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
  646. debug3("Version string in block_state header is %s", ver_str);
  647. if (ver_str) {
  648. if (!strcmp(ver_str, BLOCK_STATE_VERSION)) {
  649. protocol_version = SLURM_PROTOCOL_VERSION;
  650. }
  651. }
  652. if (protocol_version == (uint16_t)NO_VAL) {
  653. error("***********************************************");
  654. error("Can not recover block state, "
  655. "data version incompatible");
  656. error("***********************************************");
  657. xfree(ver_str);
  658. free_buf(buffer);
  659. return EFAULT;
  660. }
  661. xfree(ver_str);
  662. safe_unpack32(&record_count, buffer);
  663. slurm_mutex_lock(&block_state_mutex);
  664. reset_ba_system(true);
  665. /* Locks are already in place to protect part_list here */
  666. usable_mp_bitmap = bit_alloc(node_record_count);
  667. itr = list_iterator_create(part_list);
  668. while ((part_ptr = list_next(itr))) {
  669. /* we only want to use mps that are in partitions */
  670. if (!part_ptr->node_bitmap) {
  671. debug4("Partition %s doesn't have any nodes in it.",
  672. part_ptr->name);
  673. continue;
  674. }
  675. bit_or(usable_mp_bitmap, part_ptr->node_bitmap);
  676. }
  677. list_iterator_destroy(itr);
  678. if (bit_ffs(usable_mp_bitmap) == -1) {
  679. fatal("We don't have any nodes in any partitions. "
  680. "Can't create blocks. "
  681. "Please check your slurm.conf.");
  682. }
  683. for (i=0; i<record_count; i++) {
  684. block_info_t block_info;
  685. if (slurm_unpack_block_info_members(
  686. &block_info, buffer, protocol_version))
  687. goto unpack_error;
  688. if (!(bg_record = _translate_info_2_record(&block_info)))
  689. continue;
  690. if (_unpack_block_ext(bg_record, buffer, protocol_version)
  691. != SLURM_SUCCESS) {
  692. destroy_bg_record(bg_record);
  693. goto unpack_error;
  694. }
  695. /* This means the block here wasn't able to be
  696. processed correctly, so don't add.
  697. */
  698. if (!bg_record->mp_count) {
  699. error("block %s(%s) can't be made in the current "
  700. "system, but was around in the previous one.",
  701. bg_record->bg_block_id, bg_record->mp_str);
  702. list_destroy(results);
  703. destroy_bg_record(bg_record);
  704. continue;
  705. }
  706. if ((bg_conf->layout_mode == LAYOUT_OVERLAP)
  707. || bg_record->full_block)
  708. reset_ba_system(false);
  709. if (bg_record->ba_mp_list) {
  710. /* only do this for blocks bigger than 1
  711. midplane */
  712. if (bg_record->cpu_cnt >= bg_conf->cpus_per_mp)
  713. if (check_and_set_mp_list(bg_record->ba_mp_list)
  714. == SLURM_ERROR)
  715. error("something happened in the "
  716. "load of %s, keeping it "
  717. "around though",
  718. bg_record->bg_block_id);
  719. } else {
  720. select_ba_request_t ba_request;
  721. ba_set_removable_mps(usable_mp_bitmap, 1);
  722. /* we want the mps that aren't
  723. * in this record to mark them as used
  724. */
  725. if (ba_set_removable_mps(bg_record->mp_bitmap, 1)
  726. != SLURM_SUCCESS)
  727. fatal("1 It doesn't seem we have a bitmap "
  728. "for %s",
  729. bg_record->bg_block_id);
  730. #ifdef HAVE_BGQ
  731. results = list_create(destroy_ba_mp);
  732. #else
  733. results = list_create(NULL);
  734. #endif
  735. /* info("adding back %s %s", bg_record->bg_block_id, */
  736. /* bg_record->mp_str); */
  737. memset(&ba_request, 0, sizeof(ba_request));
  738. memcpy(ba_request.start, bg_record->start,
  739. sizeof(bg_record->start));
  740. memcpy(ba_request.geometry, bg_record->geo,
  741. sizeof(bg_record->geo));
  742. memcpy(ba_request.conn_type, bg_record->conn_type,
  743. sizeof(bg_record->conn_type));
  744. ba_request.start_req = 1;
  745. name = set_bg_block(results, &ba_request);
  746. ba_reset_all_removed_mps();
  747. if (!name) {
  748. error("I was unable to make the "
  749. "requested block.");
  750. list_destroy(results);
  751. destroy_bg_record(bg_record);
  752. bg_record = NULL;
  753. continue;
  754. }
  755. snprintf(temp, sizeof(temp), "%s%s",
  756. bg_conf->slurm_node_prefix,
  757. name);
  758. xfree(name);
  759. if (strcmp(temp, bg_record->mp_str)) {
  760. fatal("bad wiring in preserved state "
  761. "(found %s, but allocated %s) "
  762. "YOU MUST COLDSTART",
  763. bg_record->mp_str, temp);
  764. }
  765. if (bg_record->ba_mp_list)
  766. list_destroy(bg_record->ba_mp_list);
  767. #ifdef HAVE_BGQ
  768. bg_record->ba_mp_list = results;
  769. results = NULL;
  770. #else
  771. bg_record->ba_mp_list = list_create(destroy_ba_mp);
  772. copy_node_path(results, &bg_record->ba_mp_list);
  773. list_destroy(results);
  774. #endif
  775. }
  776. // bridge_block_create(bg_record);
  777. list_push(curr_block_list, bg_record);
  778. }
  779. FREE_NULL_BITMAP(usable_mp_bitmap);
  780. sort_bg_record_inc_size(curr_block_list);
  781. slurm_mutex_unlock(&block_state_mutex);
  782. info("Recovered %d blocks", list_count(curr_block_list));
  783. slurm_free_block_info_msg(block_ptr);
  784. free_buf(buffer);
  785. return SLURM_SUCCESS;
  786. unpack_error:
  787. FREE_NULL_BITMAP(usable_mp_bitmap);
  788. slurm_mutex_unlock(&block_state_mutex);
  789. error("Incomplete block data checkpoint file");
  790. free_buf(buffer);
  791. return SLURM_FAILURE;
  792. }
  793. static void _handle_existing_block(bg_record_t *bg_record)
  794. {
  795. char *conn_type;
  796. char node_str[256];
  797. xassert(bg_record);
  798. format_node_name(bg_record, node_str, sizeof(node_str));
  799. conn_type = conn_type_string_full(bg_record->conn_type);
  800. info("Existing: BlockID:%s Nodes:%s Conn:%s",
  801. bg_record->bg_block_id, node_str, conn_type);
  802. xfree(conn_type);
  803. /* Sanity check to make sure we have the correct setup from
  804. the save.
  805. */
  806. if (bg_conf->sub_blocks && bg_record->mp_count == 1) {
  807. ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
  808. xassert(ba_mp);
  809. if (!ba_mp->cnode_bitmap) {
  810. error("_handle_existing_block: No cnode_bitmap "
  811. "for block %s, creating it",
  812. bg_record->bg_block_id);
  813. if ((ba_mp->cnode_bitmap =
  814. ba_create_ba_mp_cnode_bitmap(bg_record))) {
  815. if (!ba_mp->cnode_err_bitmap)
  816. ba_mp->cnode_err_bitmap =
  817. bit_alloc(bg_conf->
  818. mp_cnode_cnt);
  819. FREE_NULL_BITMAP(ba_mp->cnode_usable_bitmap);
  820. ba_mp->cnode_usable_bitmap =
  821. bit_copy(ba_mp->cnode_bitmap);
  822. }
  823. }
  824. }
  825. if (bg_record->state & BG_BLOCK_ERROR_FLAG)
  826. put_block_in_error_state(bg_record, NULL);
  827. else if (((bg_record->state == BG_BLOCK_INITED)
  828. || (bg_record->state == BG_BLOCK_BOOTING))
  829. && !block_ptr_exist_in_list(bg_lists->booted, bg_record))
  830. list_push(bg_lists->booted, bg_record);
  831. }
  832. /*
  833. * _validate_config_blocks - Match slurm configuration information with
  834. * current BG block configuration.
  835. * IN/OUT curr_block_list - List of blocks already existing on the system.
  836. * IN/OUT found_block_list - List of blocks found on the system
  837. * that are listed in the bluegene.conf.
  838. * NOTE: Both of the lists above should be created with list_create(NULL)
  839. * since the bg_lists->main will contain the complete list of pointers
  840. * and be destroyed with it.
  841. *
  842. * RET - SLURM_SUCCESS if no blocks need to be deleted, else an error
  843. * code. Writes bg_block_id into bg_lists->main records.
  844. */
  845. static int _validate_config_blocks(List curr_block_list,
  846. List found_block_list, char *dir)
  847. {
  848. int rc = SLURM_ERROR;
  849. bg_record_t* bg_record = NULL;
  850. bg_record_t* init_bg_record = NULL;
  851. int full_created = 0;
  852. ListIterator itr_conf;
  853. ListIterator itr_curr;
  854. char tmp_char[256];
  855. int dim;
  856. xassert(curr_block_list);
  857. xassert(found_block_list);
  858. /* read in state from last run. */
  859. if (bg_recover)
  860. rc = _load_state_file(curr_block_list, dir);
  861. #ifndef HAVE_BG_FILES
  862. if (rc != SLURM_SUCCESS)
  863. return rc;
  864. #endif
  865. /* read current bg block info into curr_block_list This
  866. * happens in the state load before this in emulation mode */
  867. if (bridge_blocks_load_curr(curr_block_list) == SLURM_ERROR)
  868. return SLURM_ERROR;
  869. if (!bg_recover)
  870. return SLURM_ERROR;
  871. #ifdef HAVE_BG_FILES
  872. /* Since we just checked all the blocks from state against that
  873. in the database we can now check to see if there were once
  874. blocks that are now gone from the database and remove them
  875. from the list.
  876. */
  877. itr_curr = list_iterator_create(curr_block_list);
  878. while ((bg_record = list_next(itr_curr))) {
  879. if (bg_record->modifying) {
  880. bg_record->modifying = 0;
  881. continue;
  882. }
  883. error("Found state for block %s, but that "
  884. "block isn't in the system anymore, removing",
  885. bg_record->bg_block_id);
  886. list_delete_item(itr_curr);
  887. }
  888. list_iterator_destroy(itr_curr);
  889. #endif
  890. if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
  891. /* Since we don't read the blocks in a Dynamic system
  892. we can just transfer the list here and return.
  893. */
  894. list_transfer(bg_lists->main, curr_block_list);
  895. itr_conf = list_iterator_create(bg_lists->main);
  896. while ((bg_record = list_next(itr_conf)))
  897. _handle_existing_block(bg_record);
  898. list_iterator_destroy(itr_conf);
  899. return SLURM_SUCCESS;
  900. }
  901. /* Only when we are looking at a non-dynamic system do we need
  902. to go through the following logic to make sure things are insync.
  903. */
  904. itr_curr = list_iterator_create(curr_block_list);
  905. itr_conf = list_iterator_create(bg_lists->main);
  906. while ((bg_record = list_next(itr_conf))) {
  907. list_iterator_reset(itr_curr);
  908. while ((init_bg_record = list_next(itr_curr))) {
  909. if (!bit_equal(bg_record->mp_bitmap,
  910. init_bg_record->mp_bitmap))
  911. continue; /* wrong nodes */
  912. if (!bit_equal(bg_record->ionode_bitmap,
  913. init_bg_record->ionode_bitmap))
  914. continue;
  915. if ((bg_record->conn_type[0] < SELECT_SMALL)
  916. && (init_bg_record->conn_type[0] < SELECT_SMALL)) {
  917. for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
  918. /* Only look at how far we
  919. have set. The bg_record
  920. should of been set up
  921. correctly in the
  922. parse_blockreq() function.
  923. */
  924. if (bg_record->conn_type[dim] ==
  925. (uint16_t)NO_VAL) {
  926. dim = SYSTEM_DIMENSIONS;
  927. break;
  928. }
  929. if (bg_record->conn_type[dim] !=
  930. init_bg_record->conn_type[dim])
  931. break; /* wrong conn_type */
  932. }
  933. if (dim < SYSTEM_DIMENSIONS)
  934. continue;
  935. }
  936. copy_bg_record(init_bg_record, bg_record);
  937. /* remove from the curr list since we just
  938. matched it no reason to keep it around
  939. anymore */
  940. list_delete_item(itr_curr);
  941. break;
  942. }
  943. if (!bg_record->bg_block_id) {
  944. format_node_name(bg_record, tmp_char,
  945. sizeof(tmp_char));
  946. info("Block found in bluegene.conf to be "
  947. "created: Nodes:%s",
  948. tmp_char);
  949. } else {
  950. if (bg_record->full_block)
  951. full_created = 1;
  952. list_push(found_block_list, bg_record);
  953. _handle_existing_block(bg_record);
  954. }
  955. }
  956. if (!full_created) {
  957. list_iterator_reset(itr_curr);
  958. while ((init_bg_record = list_next(itr_curr))) {
  959. if (init_bg_record->full_block) {
  960. list_remove(itr_curr);
  961. bg_record = init_bg_record;
  962. list_append(bg_lists->main, bg_record);
  963. list_push(found_block_list, bg_record);
  964. _handle_existing_block(bg_record);
  965. break;
  966. }
  967. }
  968. }
  969. list_iterator_destroy(itr_conf);
  970. list_iterator_destroy(itr_curr);
  971. if (!list_count(curr_block_list))
  972. rc = SLURM_SUCCESS;
  973. else
  974. rc = SLURM_ERROR;
  975. return rc;
  976. }
  977. static List _get_config(void)
  978. {
  979. config_key_pair_t *key_pair;
  980. List my_list = list_create(destroy_config_key_pair);
  981. if (!my_list)
  982. fatal("malloc failure on list_create");
  983. key_pair = xmalloc(sizeof(config_key_pair_t));
  984. key_pair->name = xstrdup("DefaultConnType");
  985. key_pair->value = conn_type_string_full(bg_conf->default_conn_type);
  986. list_append(my_list, key_pair);
  987. #ifndef HAVE_BG_FILES
  988. key_pair = xmalloc(sizeof(config_key_pair_t));
  989. key_pair->name = xstrdup("Emulated");
  990. key_pair->value = xstrdup("yes");
  991. list_append(my_list, key_pair);
  992. #endif
  993. key_pair = xmalloc(sizeof(config_key_pair_t));
  994. key_pair->name = xstrdup("MaxBlockInError");
  995. key_pair->value = xstrdup_printf("%u", bg_conf->max_block_err);
  996. list_append(my_list, key_pair);
  997. key_pair = xmalloc(sizeof(config_key_pair_t));
  998. key_pair->name = xstrdup("MidPlaneNodeCnt");
  999. key_pair->value = xstrdup_printf("%u", bg_conf->mp_cnode_cnt);
  1000. list_append(my_list, key_pair);
  1001. key_pair = xmalloc(sizeof(config_key_pair_t));
  1002. key_pair->name = xstrdup("NodeCPUCnt");
  1003. key_pair->value = xstrdup_printf("%u", bg_conf->cpu_ratio);
  1004. list_append(my_list, key_pair);
  1005. #ifdef HAVE_BGL
  1006. key_pair = xmalloc(sizeof(config_key_pair_t));
  1007. key_pair->name = xstrdup("BlrtsImage");
  1008. key_pair->value = xstrdup(bg_conf->default_blrtsimage);
  1009. list_append(my_list, key_pair);
  1010. key_pair = xmalloc(sizeof(config_key_pair_t));
  1011. key_pair->name = xstrdup("LinuxImage");
  1012. key_pair->value = xstrdup(bg_conf->default_linuximage);
  1013. list_append(my_list, key_pair);
  1014. key_pair = xmalloc(sizeof(config_key_pair_t));
  1015. key_pair->name = xstrdup("RamDiskImage");
  1016. key_pair->value = xstrdup(bg_conf->default_ramdiskimage);
  1017. list_append(my_list, key_pair);
  1018. #elif defined HAVE_BGP
  1019. key_pair = xmalloc(sizeof(config_key_pair_t));
  1020. key_pair->name = xstrdup("CnloadImage");
  1021. key_pair->value = xstrdup(bg_conf->default_linuximage);
  1022. list_append(my_list, key_pair);
  1023. key_pair = xmalloc(sizeof(config_key_pair_t));
  1024. key_pair->name = xstrdup("IoloadImage");
  1025. key_pair->value = xstrdup(bg_conf->default_ramdiskimage);
  1026. list_append(my_list, key_pair);
  1027. #endif
  1028. key_pair = xmalloc(sizeof(config_key_pair_t));
  1029. key_pair->name = xstrdup("BridgeAPILogFile");
  1030. key_pair->value = xstrdup(bg_conf->bridge_api_file);
  1031. list_append(my_list, key_pair);
  1032. key_pair = xmalloc(sizeof(config_key_pair_t));
  1033. key_pair->name = xstrdup("BridgeAPIVerbose");
  1034. key_pair->value = xstrdup_printf("%u", bg_conf->bridge_api_verb);
  1035. list_append(my_list, key_pair);
  1036. if (bg_conf->deny_pass) {
  1037. key_pair = xmalloc(sizeof(config_key_pair_t));
  1038. key_pair->name = xstrdup("DenyPassThrough");
  1039. if (bg_conf->deny_pass & PASS_DENY_A)
  1040. xstrcat(key_pair->value, "A,");
  1041. if (bg_conf->deny_pass & PASS_DENY_X)
  1042. xstrcat(key_pair->value, "X,");
  1043. if (bg_conf->deny_pass & PASS_DENY_Y)
  1044. xstrcat(key_pair->value, "Y,");
  1045. if (bg_conf->deny_pass & PASS_DENY_Z)
  1046. xstrcat(key_pair->value, "Z,");
  1047. if (key_pair->value)
  1048. key_pair->value[strlen(key_pair->value)-1] = '\0';
  1049. list_append(my_list, key_pair);
  1050. }
  1051. key_pair = xmalloc(sizeof(config_key_pair_t));
  1052. key_pair->name = xstrdup("IONodesPerMP");
  1053. key_pair->value = xstrdup_printf("%u", bg_conf->ionodes_per_mp);
  1054. list_append(my_list, key_pair);
  1055. key_pair = xmalloc(sizeof(config_key_pair_t));
  1056. key_pair->name = xstrdup("LayoutMode");
  1057. switch(bg_conf->layout_mode) {
  1058. case LAYOUT_STATIC:
  1059. key_pair->value = xstrdup("Static");
  1060. break;
  1061. case LAYOUT_OVERLAP:
  1062. key_pair->value = xstrdup("Overlap");
  1063. break;
  1064. case LAYOUT_DYNAMIC:
  1065. key_pair->value = xstrdup("Dynamic");
  1066. break;
  1067. default:
  1068. key_pair->value = xstrdup("Unknown");
  1069. break;
  1070. }
  1071. list_append(my_list, key_pair);
  1072. key_pair = xmalloc(sizeof(config_key_pair_t));
  1073. key_pair->name = xstrdup("MloaderImage");
  1074. key_pair->value = xstrdup(bg_conf->default_mloaderimage);
  1075. list_append(my_list, key_pair);
  1076. key_pair = xmalloc(sizeof(config_key_pair_t));
  1077. key_pair->name = xstrdup("NodeCardNodeCnt");
  1078. key_pair->value = xstrdup_printf("%u", bg_conf->nodecard_cnode_cnt);
  1079. list_append(my_list, key_pair);
  1080. if (bg_conf->sub_blocks) {
  1081. key_pair = xmalloc(sizeof(config_key_pair_t));
  1082. key_pair->name = xstrdup("AllowSubBlockAllocations");
  1083. key_pair->value = xstrdup("Yes");
  1084. list_append(my_list, key_pair);
  1085. }
  1086. if (bg_conf->sub_mp_sys) {
  1087. key_pair = xmalloc(sizeof(config_key_pair_t));
  1088. key_pair->name = xstrdup("SubMidplaneSystem");
  1089. key_pair->value = xstrdup("Yes");
  1090. list_append(my_list, key_pair);
  1091. }
  1092. list_sort(my_list, (ListCmpF) sort_key_pairs);
  1093. return my_list;
  1094. }
  1095. #endif
  1096. /*
  1097. * init() is called when the plugin is loaded, before any other functions
  1098. * are called. Put global initialization here.
  1099. */
  1100. extern int init(void)
  1101. {
  1102. #ifdef HAVE_BG
  1103. if (!bg_conf) {
  1104. /* This is needed on all systems where srun wraps the
  1105. bluegene calling program (i.e. runjob).
  1106. */
  1107. bg_conf = xmalloc(sizeof(bg_config_t));
  1108. /* set some defaults for most systems */
  1109. bg_conf->actual_cnodes_per_mp = bg_conf->mp_cnode_cnt = 512;
  1110. bg_conf->quarter_cnode_cnt = 128;
  1111. bg_conf->nodecard_cnode_cnt = 32;
  1112. bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt
  1113. / bg_conf->nodecard_cnode_cnt;
  1114. }
  1115. if (bg_recover != NOT_FROM_CONTROLLER) {
  1116. #if defined HAVE_BG_L_P && (SYSTEM_DIMENSIONS != 3)
  1117. fatal("SYSTEM_DIMENSIONS value (%d) invalid for BlueGene",
  1118. SYSTEM_DIMENSIONS);
  1119. #elif defined HAVE_BGQ && (SYSTEM_DIMENSIONS != 4)
  1120. fatal("SYSTEM_DIMENSIONS value (%d) invalid for BGQ",
  1121. SYSTEM_DIMENSIONS);
  1122. #endif
  1123. #if defined HAVE_BG_FILES && defined HAVE_BG_L_P
  1124. #ifdef HAVE_BGL
  1125. if (!getenv("CLASSPATH") || !getenv("DB2INSTANCE")
  1126. || !getenv("VWSPATH"))
  1127. fatal("db2profile has not been "
  1128. "run to setup DB2 environment");
  1129. if ((SELECT_COPROCESSOR_MODE != RM_PARTITION_COPROCESSOR_MODE)
  1130. || (SELECT_VIRTUAL_NODE_MODE
  1131. != RM_PARTITION_VIRTUAL_NODE_MODE))
  1132. fatal("enum node_use_type out of sync with rm_api.h");
  1133. #endif
  1134. if ((SELECT_MESH != RM_MESH)
  1135. || (SELECT_TORUS != RM_TORUS)
  1136. || (SELECT_NAV != RM_NAV))
  1137. fatal("enum conn_type out of sync with rm_api.h");
  1138. #endif
  1139. verbose("%s loading...", plugin_name);
  1140. /* if this is coming from something other than the controller
  1141. we don't want to read the config or anything like that. */
  1142. _set_bg_lists();
  1143. xfree(bg_conf->slurm_user_name);
  1144. xfree(bg_conf->slurm_node_prefix);
  1145. slurm_conf_lock();
  1146. xassert(slurmctld_conf.slurm_user_name);
  1147. xassert(slurmctld_conf.node_prefix);
  1148. bg_conf->slurm_user_name =
  1149. xstrdup(slurmctld_conf.slurm_user_name);
  1150. bg_conf->slurm_node_prefix =
  1151. xstrdup(slurmctld_conf.node_prefix);
  1152. bg_conf->slurm_debug_flags = slurmctld_conf.debug_flags;
  1153. bg_conf->slurm_debug_level = slurmctld_conf.slurmctld_debug;
  1154. slurm_conf_unlock();
  1155. if (bg_conf->blrts_list)
  1156. list_destroy(bg_conf->blrts_list);
  1157. bg_conf->blrts_list = list_create(destroy_image);
  1158. if (bg_conf->linux_list)
  1159. list_destroy(bg_conf->linux_list);
  1160. bg_conf->linux_list = list_create(destroy_image);
  1161. if (bg_conf->mloader_list)
  1162. list_destroy(bg_conf->mloader_list);
  1163. bg_conf->mloader_list = list_create(destroy_image);
  1164. if (bg_conf->ramdisk_list)
  1165. list_destroy(bg_conf->ramdisk_list);
  1166. bg_conf->ramdisk_list = list_create(destroy_image);
  1167. ba_init(NULL, 1);
  1168. verbose("BlueGene plugin loaded successfully");
  1169. }
  1170. verbose("%s loaded", plugin_name);
  1171. #else
  1172. if (bg_recover != NOT_FROM_CONTROLLER)
  1173. fatal("select/bluegene is incompatible with a "
  1174. "non BlueGene system");
  1175. #endif
  1176. return SLURM_SUCCESS;
  1177. }
  1178. extern int fini ( void )
  1179. {
  1180. int rc = SLURM_SUCCESS;
  1181. ba_fini();
  1182. _destroy_bg_config(bg_conf);
  1183. _destroy_bg_lists(bg_lists);
  1184. return rc;
  1185. }
  1186. /*
  1187. * The remainder of this file implements the standard SLURM
  1188. * node selection API.
  1189. */
  1190. /* We rely upon DB2 to save and restore BlueGene state */
  1191. extern int select_p_state_save(char *dir_name)
  1192. {
  1193. #ifdef HAVE_BG
  1194. ListIterator itr;
  1195. bg_record_t *bg_record = NULL;
  1196. int error_code = 0, log_fd;
  1197. char *old_file, *new_file, *reg_file;
  1198. uint32_t blocks_packed = 0, tmp_offset, block_offset;
  1199. Buf buffer = init_buf(BUF_SIZE);
  1200. slurmctld_lock_t job_read_lock =
  1201. { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
  1202. DEF_TIMERS;
  1203. debug("bluegene: select_p_state_save");
  1204. START_TIMER;
  1205. /* write header: time */
  1206. packstr(BLOCK_STATE_VERSION, buffer);
  1207. block_offset = get_buf_offset(buffer);
  1208. pack32(blocks_packed, buffer);
  1209. /* Lock job read before block to avoid deadlock job lock is
  1210. * needed because we look at the job_ptr's to send job info. */
  1211. lock_slurmctld(job_read_lock);
  1212. /* write block records to buffer */
  1213. slurm_mutex_lock(&block_state_mutex);
  1214. itr = list_iterator_create(bg_lists->main);
  1215. while ((bg_record = list_next(itr))) {
  1216. if (bg_record->magic != BLOCK_MAGIC)
  1217. continue;
  1218. xassert(bg_record->bg_block_id != NULL);
  1219. _pack_block(bg_record, buffer, SLURM_PROTOCOL_VERSION);
  1220. _pack_block_ext(bg_record, buffer, SLURM_PROTOCOL_VERSION);
  1221. blocks_packed++;
  1222. }
  1223. list_iterator_destroy(itr);
  1224. slurm_mutex_unlock(&block_state_mutex);
  1225. unlock_slurmctld(job_read_lock);
  1226. tmp_offset = get_buf_offset(buffer);
  1227. set_buf_offset(buffer, block_offset);
  1228. pack32(blocks_packed, buffer);
  1229. set_buf_offset(buffer, tmp_offset);
  1230. /* Maintain config read lock until we copy state_save_location *\
  1231. \* unlock_slurmctld(part_read_lock); - see below */
  1232. /* write the buffer to file */
  1233. slurm_conf_lock();
  1234. old_file = xstrdup(slurmctld_conf.state_save_location);
  1235. xstrcat(old_file, "/block_state.old");
  1236. reg_file = xstrdup(slurmctld_conf.state_save_location);
  1237. xstrcat(reg_file, "/block_state");
  1238. new_file = xstrdup(slurmctld_conf.state_save_location);
  1239. xstrcat(new_file, "/block_state.new");
  1240. slurm_conf_unlock();
  1241. log_fd = creat(new_file, 0600);
  1242. if (log_fd < 0) {
  1243. error("Can't save state, error creating file %s, %m",
  1244. new_file);
  1245. error_code = errno;
  1246. } else {
  1247. int pos = 0, nwrite = get_buf_offset(buffer), amount;
  1248. char *data = (char *)get_buf_data(buffer);
  1249. while (nwrite > 0) {
  1250. amount = write(log_fd, &data[pos], nwrite);
  1251. if ((amount < 0) && (errno != EINTR)) {
  1252. error("Error writing file %s, %m", new_file);
  1253. error_code = errno;
  1254. break;
  1255. }
  1256. nwrite -= amount;
  1257. pos += amount;
  1258. }
  1259. fsync(log_fd);
  1260. close(log_fd);
  1261. }
  1262. if (error_code)
  1263. (void) unlink(new_file);
  1264. else { /* file shuffle */
  1265. (void) unlink(old_file);
  1266. if (link(reg_file, old_file))
  1267. debug4("unable to create link for %s -> %s: %m",
  1268. reg_file, old_file);
  1269. (void) unlink(reg_file);
  1270. if (link(new_file, reg_file))
  1271. debug4("unable to create link for %s -> %s: %m",
  1272. new_file, reg_file);
  1273. (void) unlink(new_file);
  1274. }
  1275. xfree(old_file);
  1276. xfree(reg_file);
  1277. xfree(new_file);
  1278. free_buf(buffer);
  1279. END_TIMER2("select_p_state_save");
  1280. return SLURM_SUCCESS;
  1281. #else
  1282. return SLURM_ERROR;
  1283. #endif
  1284. }
  1285. extern int select_p_state_restore(char *dir_name)
  1286. {
  1287. #ifdef HAVE_BG
  1288. debug("bluegene: select_p_state_restore");
  1289. /* found bg blocks already on system */
  1290. List curr_block_list = NULL;
  1291. List found_block_list = NULL;
  1292. static time_t last_config_update = (time_t) 0;
  1293. /* only run on startup */
  1294. if (last_config_update)
  1295. return SLURM_SUCCESS;
  1296. last_config_update = time(NULL);
  1297. curr_block_list = list_create(destroy_bg_record);
  1298. found_block_list = list_create(NULL);
  1299. //#if 0
  1300. /* Check to see if the configs we have are correct */
  1301. if (_validate_config_blocks(curr_block_list, found_block_list, dir_name)
  1302. == SLURM_ERROR) {
  1303. _delete_old_blocks(curr_block_list, found_block_list);
  1304. }
  1305. //#endif
  1306. /* looking for blocks only I created */
  1307. if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
  1308. info("No blocks created until jobs are submitted");
  1309. } else {
  1310. if (create_defined_blocks(bg_conf->layout_mode,
  1311. found_block_list)
  1312. == SLURM_ERROR) {
  1313. /* error in creating the static blocks, so
  1314. * blocks referenced by submitted jobs won't
  1315. * correspond to actual slurm blocks.
  1316. */
  1317. fatal("Error, could not create the static blocks");
  1318. return SLURM_ERROR;
  1319. }
  1320. }
  1321. list_destroy(curr_block_list);
  1322. curr_block_list = NULL;
  1323. list_destroy(found_block_list);
  1324. found_block_list = NULL;
  1325. slurm_mutex_lock(&block_state_mutex);
  1326. last_bg_update = time(NULL);
  1327. sort_bg_record_inc_size(bg_lists->main);
  1328. slurm_mutex_unlock(&block_state_mutex);
  1329. if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
  1330. info("Blocks have finished being created.");
  1331. return SLURM_SUCCESS;
  1332. #else
  1333. return SLURM_ERROR;
  1334. #endif
  1335. }
  1336. /* Sync BG blocks to currently active jobs */
  1337. extern int select_p_job_init(List job_list)
  1338. {
  1339. #ifdef HAVE_BG
  1340. int rc = sync_jobs(job_list);
  1341. /* after we have synced the blocks then we say they are
  1342. created. */
  1343. blocks_are_created = 1;
  1344. return rc;
  1345. #else
  1346. return SLURM_ERROR;
  1347. #endif
  1348. }
  1349. extern bool select_p_node_ranking(struct node_record *node_ptr, int node_cnt)
  1350. {
  1351. return false;
  1352. }
  1353. /* All initialization is performed by init() */
  1354. extern int select_p_node_init(struct node_record *node_ptr_array, int node_cnt)
  1355. {
  1356. #ifdef HAVE_BG
  1357. int i = 0;
  1358. uint32_t real_memory, threads, cores;
  1359. if (!node_ptr_array)
  1360. return SLURM_SUCCESS;
  1361. xassert(bg_conf);
  1362. /* we need the amount of memory for a midplane */
  1363. real_memory = bg_conf->mp_cnode_cnt;
  1364. /* Set up some knowns that perhaps aren't all the way
  1365. in the slurm.conf.
  1366. */
  1367. #ifdef HAVE_BGL
  1368. threads = 1;
  1369. cores = 2;
  1370. real_memory *= 512;
  1371. #elif defined HAVE_BGP
  1372. threads = 1;
  1373. cores = 4;
  1374. real_memory *= 2048;
  1375. #else
  1376. /* BGQ */
  1377. threads = 4;
  1378. cores = 16;
  1379. real_memory *= 16384;
  1380. #endif
  1381. bg_conf->cpus_per_mp = bg_conf->mp_cnode_cnt * cores;
  1382. for (i = 0; i < node_cnt; i++) {
  1383. struct node_record *node_ptr = &node_ptr_array[i];
  1384. select_nodeinfo_t *nodeinfo = NULL;
  1385. if (!node_ptr->name)
  1386. continue;
  1387. node_ptr->threads = threads;
  1388. node_ptr->cores = cores;
  1389. node_ptr->sockets = bg_conf->mp_cnode_cnt;
  1390. node_ptr->config_ptr->cpus = node_ptr->cpus =
  1391. bg_conf->cpus_per_mp;
  1392. node_ptr->real_memory = real_memory;
  1393. xassert(node_ptr->select_nodeinfo);
  1394. nodeinfo = node_ptr->select_nodeinfo->data;
  1395. xassert(nodeinfo);
  1396. slurm_mutex_lock(&ba_system_mutex);
  1397. if (!(nodeinfo->ba_mp = str2ba_mp(node_ptr->name))) {
  1398. slurm_mutex_unlock(&ba_system_mutex);
  1399. continue;
  1400. }
  1401. nodeinfo->ba_mp->index = i;
  1402. if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr))
  1403. ba_update_mp_state(
  1404. nodeinfo->ba_mp, node_ptr->node_state);
  1405. nodeinfo->ba_mp->state = node_ptr->node_state;
  1406. slurm_mutex_unlock(&ba_system_mutex);
  1407. }
  1408. return SLURM_SUCCESS;
  1409. #else
  1410. return SLURM_ERROR;
  1411. #endif
  1412. }
  1413. /*
  1414. * Called by slurmctld when a new configuration file is loaded
  1415. * or scontrol is used to change block configuration
  1416. */
  1417. extern int select_p_block_init(List part_list)
  1418. {
  1419. #ifdef HAVE_BG
  1420. /* select_p_node_init needs to be called before this to set
  1421. this up correctly
  1422. */
  1423. if (read_bg_conf() == SLURM_ERROR) {
  1424. fatal("Error, could not read the file");
  1425. return SLURM_ERROR;
  1426. }
  1427. if (part_list) {
  1428. struct part_record *part_ptr = NULL;
  1429. ListIterator itr = list_iterator_create(part_list);
  1430. while ((part_ptr = list_next(itr))) {
  1431. char *this_node_name;
  1432. hostlist_t host_list;
  1433. part_ptr->total_cpus = 0;
  1434. if (!part_ptr->nodes) /* no nodes in partition */
  1435. continue;
  1436. if (!(host_list = hostlist_create(part_ptr->nodes))) {
  1437. error("hostlist_create error on %s, %m",
  1438. part_ptr->nodes);
  1439. continue;
  1440. }
  1441. while ((this_node_name = hostlist_shift(host_list))) {
  1442. struct node_record *node_ptr =
  1443. find_node_record(this_node_name);
  1444. if (node_ptr == NULL) {
  1445. error("select_p_block_init: "
  1446. "invalid node name %s",
  1447. this_node_name);
  1448. free(this_node_name);
  1449. hostlist_destroy(host_list);
  1450. continue;
  1451. }
  1452. free(this_node_name);
  1453. part_ptr->total_cpus += node_ptr->cpus;
  1454. }
  1455. hostlist_destroy(host_list);
  1456. part_ptr->max_nodes = part_ptr->max_nodes_orig;
  1457. part_ptr->min_nodes = part_ptr->min_nodes_orig;
  1458. select_p_alter_node_cnt(SELECT_SET_MP_CNT,
  1459. &part_ptr->max_nodes);
  1460. select_p_alter_node_cnt(SELECT_SET_MP_CNT,
  1461. &part_ptr->min_nodes);
  1462. }
  1463. list_iterator_destroy(itr);
  1464. }
  1465. return SLURM_SUCCESS;
  1466. #else
  1467. return SLURM_ERROR;
  1468. #endif
  1469. }
  1470. /*
  1471. * select_p_job_test - Given a specification of scheduling requirements,
  1472. * identify the nodes which "best" satisfy the request. The specified
  1473. * nodes may be DOWN or BUSY at the time of this test as may be used
  1474. * to determine if a job could ever run.
  1475. * IN/OUT job_ptr - pointer to job being scheduled start_time is set
  1476. * when we can possibly start job.
  1477. * IN/OUT bitmap - usable nodes are set on input, nodes not required to
  1478. * satisfy the request are cleared, other left set
  1479. * IN min_nodes - minimum count of nodes
  1480. * IN max_nodes - maximum count of nodes (0==don't care)
  1481. * IN req_nodes - requested (or desired) count of nodes
  1482. * IN mode …

Large files files are truncated, but you can click here to view the full file