/src/plugins/select/bluegene/select_bluegene.c
C | 3388 lines | 2622 code | 355 blank | 411 comment | 569 complexity | 81364133d6bb41bb0dd94aaffd03c13c MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
Large files are truncated, but you can click here to view the full file
- /*****************************************************************************\
- * select_bluegene.c - node selection plugin for Blue Gene system.
- *****************************************************************************
- * Copyright (C) 2004-2007 The Regents of the University of California.
- * Copyright (C) 2008-2011 Lawrence Livermore National Security.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Dan Phung <phung4@llnl.gov> Danny Auble <da@llnl.gov>
- * CODE-OCEC-09-009. All rights reserved.
- *
- * This file is part of SLURM, a resource management program.
- * For details, see <http://www.schedmd.com/slurmdocs/>.
- * Please also read the included file: DISCLAIMER.
- *
- * SLURM is free software; you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * In addition, as a special exception, the copyright holders give permission
- * to link the code of portions of this program with the OpenSSL library under
- * certain conditions as described in each individual source file, and
- * distribute linked combinations including the two. You must obey the GNU
- * General Public License in all respects for all of the code used other than
- * OpenSSL. If you modify file(s) with this exception, you may extend this
- * exception to your version of the file(s), but you are not obligated to do
- * so. If you do not wish to do so, delete this exception statement from your
- * version. If you delete this exception statement from all source files in
- * the program, then also delete it here.
- *
- * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License along
- * with SLURM; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- \*****************************************************************************/
- #include "src/common/slurm_xlator.h"
- #include "bg_core.h"
- #include "bg_read_config.h"
- #include "bg_defined_block.h"
- #ifndef HAVE_BG_L_P
- # include "ba_bgq/block_allocator.h"
- #else
- # include "ba/block_allocator.h"
- #endif
- #include "src/slurmctld/trigger_mgr.h"
- #include <fcntl.h>
- #define HUGE_BUF_SIZE (1024*16)
- /* These are defined here so when we link with something other than
- * the slurmctld we will have these symbols defined. They will get
- * overwritten when linking with the slurmctld.
- */
- #if defined (__APPLE__)
- slurmctld_config_t slurmctld_config __attribute__((weak_import));
- slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
- struct node_record *node_record_table_ptr __attribute__((weak_import)) = NULL;
- int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER;
- List part_list __attribute__((weak_import)) = NULL;
- int node_record_count __attribute__((weak_import));
- time_t last_node_update __attribute__((weak_import));
- time_t last_job_update __attribute__((weak_import));
- char *alpha_num __attribute__((weak_import)) =
- "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
- void *acct_db_conn __attribute__((weak_import)) = NULL;
- char *slurmctld_cluster_name __attribute__((weak_import)) = NULL;
- slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL;
- #else
- slurmctld_config_t slurmctld_config;
- slurm_ctl_conf_t slurmctld_conf;
- struct node_record *node_record_table_ptr = NULL;
- int bg_recover = NOT_FROM_CONTROLLER;
- List part_list = NULL;
- int node_record_count;
- time_t last_node_update;
- time_t last_job_update;
- char *alpha_num = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
- void *acct_db_conn = NULL;
- char *slurmctld_cluster_name = NULL;
- slurmdb_cluster_rec_t *working_cluster_rec = NULL;
- #endif
- /*
- * These variables are required by the generic plugin interface. If they
- * are not found in the plugin, the plugin loader will ignore it.
- *
- * plugin_name - a string giving a human-readable description of the
- * plugin. There is no maximum length, but the symbol must refer to
- * a valid string.
- *
- * plugin_type - a string suggesting the type of the plugin or its
- * applicability to a particular form of data or method of data handling.
- * If the low-level plugin API is used, the contents of this string are
- * unimportant and may be anything. SLURM uses the higher-level plugin
- * interface which requires this string to be of the form
- *
- * <application>/<method>
- *
- * where <application> is a description of the intended application of
- * the plugin (e.g., "select" for SLURM node selection) and <method>
- * is a description of how this plugin satisfies that application. SLURM will
- * only load select plugins if the plugin_type string has a
- * prefix of "select/".
- *
- * plugin_version - an unsigned 32-bit integer giving the version number
- * of the plugin. If major and minor revisions are desired, the major
- * version number may be multiplied by a suitable magnitude constant such
- * as 100 or 1000. Various SLURM versions will likely require a certain
- * minimum version for their plugins as the node selection API matures.
- */
/* "<application>/<method>" identifier; must carry the "select/" prefix. */
const char plugin_type[] = "select/bluegene";

/* Human-readable description of this plugin. */
const char plugin_name[] = "BlueGene node selection plugin";

/* Numeric id and version number of this plugin. */
const uint32_t plugin_id      = 100;
const uint32_t plugin_version = 200;
- /* Global variables */
- bg_config_t *bg_conf = NULL;
- bg_lists_t *bg_lists = NULL;
- time_t last_bg_update;
- pthread_mutex_t block_state_mutex = PTHREAD_MUTEX_INITIALIZER;
- int blocks_are_created = 0;
- int num_unused_cpus = 0;
- int num_possible_unused_cpus = 0;
- slurmctld_lock_t job_read_lock = {
- NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
- extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data);
- static void _destroy_bg_config(bg_config_t *bg_conf)
- {
- if (bg_conf) {
- if (bg_conf->blrts_list) {
- list_destroy(bg_conf->blrts_list);
- bg_conf->blrts_list = NULL;
- }
- xfree(bg_conf->bridge_api_file);
- xfree(bg_conf->default_blrtsimage);
- xfree(bg_conf->default_linuximage);
- xfree(bg_conf->default_mloaderimage);
- xfree(bg_conf->default_ramdiskimage);
- if (bg_conf->linux_list) {
- list_destroy(bg_conf->linux_list);
- bg_conf->linux_list = NULL;
- }
- if (bg_conf->mloader_list) {
- list_destroy(bg_conf->mloader_list);
- bg_conf->mloader_list = NULL;
- }
- if (bg_conf->ramdisk_list) {
- list_destroy(bg_conf->ramdisk_list);
- bg_conf->ramdisk_list = NULL;
- }
- xfree(bg_conf->slurm_user_name);
- xfree(bg_conf->slurm_node_prefix);
- xfree(bg_conf);
- }
- }
- static void _destroy_bg_lists(bg_lists_t *bg_lists)
- {
- if (bg_lists) {
- if (bg_lists->booted) {
- list_destroy(bg_lists->booted);
- bg_lists->booted = NULL;
- }
- if (bg_lists->job_running) {
- list_destroy(bg_lists->job_running);
- bg_lists->job_running = NULL;
- num_unused_cpus = 0;
- }
- if (bg_lists->main) {
- list_destroy(bg_lists->main);
- bg_lists->main = NULL;
- }
- if (bg_lists->valid_small32) {
- list_destroy(bg_lists->valid_small32);
- bg_lists->valid_small32 = NULL;
- }
- if (bg_lists->valid_small64) {
- list_destroy(bg_lists->valid_small64);
- bg_lists->valid_small64 = NULL;
- }
- if (bg_lists->valid_small128) {
- list_destroy(bg_lists->valid_small128);
- bg_lists->valid_small128 = NULL;
- }
- if (bg_lists->valid_small256) {
- list_destroy(bg_lists->valid_small256);
- bg_lists->valid_small256 = NULL;
- }
- xfree(bg_lists);
- }
- }
- #ifdef HAVE_BG
- static int _delete_old_blocks(List curr_block_list, List found_block_list)
- {
- ListIterator itr_curr, itr_found;
- bg_record_t *found_record = NULL, *init_record = NULL;
- List destroy_list = list_create(NULL);
- xassert(curr_block_list);
- xassert(found_block_list);
- slurm_mutex_lock(&block_state_mutex);
- if (!bg_recover) {
- info("removing all current blocks (clean start)");
- itr_curr = list_iterator_create(curr_block_list);
- while ((init_record = list_next(itr_curr))) {
- list_remove(itr_curr);
- init_record->modifying = 0;
- /* The block needs to exist in the main list
- * just to make sure we query the state. */
- if (!(found_record = find_bg_record_in_list(
- bg_lists->main,
- init_record->bg_block_id)))
- list_push(bg_lists->main, init_record);
- else {
- destroy_bg_record(init_record);
- init_record = found_record;
- }
- /* Make sure this block isn't in an
- error state since if it is it won't
- disappear. */
- if (init_record->state & BG_BLOCK_ERROR_FLAG)
- resume_block(init_record);
- list_push(destroy_list, init_record);
- }
- list_iterator_destroy(itr_curr);
- } else {
- info("removing unspecified blocks");
- itr_curr = list_iterator_create(curr_block_list);
- while ((init_record = list_next(itr_curr))) {
- itr_found = list_iterator_create(found_block_list);
- while ((found_record = list_next(itr_found))) {
- if (!strcmp(init_record->bg_block_id,
- found_record->bg_block_id)) {
- /* don't delete this one */
- break;
- }
- }
- list_iterator_destroy(itr_found);
- if (found_record == NULL) {
- list_remove(itr_curr);
- init_record->modifying = 0;
- /* The block needs to exist in the main list
- * just to make sure we query the state. */
- if (!(found_record = find_bg_record_in_list(
- bg_lists->main,
- init_record->bg_block_id)))
- list_push(bg_lists->main, init_record);
- else {
- destroy_bg_record(init_record);
- init_record = found_record;
- }
- /* Make sure this block isn't in an
- error state since if it is it won't
- disappear. */
- if (init_record->state & BG_BLOCK_ERROR_FLAG)
- resume_block(init_record);
- /* Since we can't requeue a running
- job in the free block function (not
- thread safe here) we must do it
- now.
- */
- if ((init_record->job_running > NO_JOB_RUNNING)
- || init_record->job_ptr) {
- /* Don't worry about dealing
- with this job here. Trying
- to requeue/cancel now will
- cause a race condition
- locking up the slurmctld.
- It will be handled when the
- blocks are synced. This
- should only happen if the
- bluegene.conf gets changed
- and jobs are running on
- blocks that don't exist in
- the new config (hopefully
- rarely).
- */
- init_record->job_running =
- NO_JOB_RUNNING;
- init_record->job_ptr = NULL;
- } else if (init_record->job_list &&
- list_count(init_record->job_list))
- list_flush(init_record->job_list);
- list_push(destroy_list, init_record);
- }
- }
- list_iterator_destroy(itr_curr);
- }
- slurm_mutex_unlock(&block_state_mutex);
- free_block_list(NO_VAL, destroy_list, 1, 0);
- list_destroy(destroy_list);
- return SLURM_SUCCESS;
- }
- static void _set_bg_lists()
- {
- if (!bg_lists)
- bg_lists = xmalloc(sizeof(bg_lists_t));
- slurm_mutex_lock(&block_state_mutex);
- if (bg_lists->booted)
- list_destroy(bg_lists->booted);
- bg_lists->booted = list_create(NULL);
- if (bg_lists->job_running)
- list_destroy(bg_lists->job_running);
- bg_lists->job_running = list_create(NULL);
- if (bg_lists->main)
- list_destroy(bg_lists->main);
- bg_lists->main = list_create(destroy_bg_record);
- slurm_mutex_unlock(&block_state_mutex);
- }
- static bg_record_t *_translate_info_2_record(block_info_t *block_info)
- {
- bg_record_t *bg_record = NULL;
- bitstr_t *mp_bitmap = NULL, *ionode_bitmap = NULL;
- mp_bitmap = bit_alloc(node_record_count);
- ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
- if (block_info->mp_inx
- && inx2bitstr(mp_bitmap, block_info->mp_inx) == -1)
- error("Job state recovered incompatible with "
- "bluegene.conf. mp=%u",
- node_record_count);
- if (block_info->ionode_inx
- && inx2bitstr(ionode_bitmap, block_info->ionode_inx) == -1)
- error("Job state recovered incompatible with "
- "bluegene.conf. ionodes=%u",
- bg_conf->ionodes_per_mp);
- bg_record = xmalloc(sizeof(bg_record_t));
- bg_record->magic = BLOCK_MAGIC;
- bg_record->bg_block_id = block_info->bg_block_id;
- block_info->bg_block_id = NULL;
- bg_record->mp_str = block_info->mp_str;
- block_info->mp_str = NULL;
- bg_record->ionode_bitmap = ionode_bitmap;
- ionode_bitmap = NULL;
- if (block_info->ionode_str) {
- ba_set_ionode_str(bg_record);
- if (!bg_record->ionode_str
- || strcmp(block_info->ionode_str, bg_record->ionode_str)) {
- error("block %s didn't compute with the correct "
- "ionode_str. Stored as '%s' and "
- "came back as '%s'",
- bg_record->bg_block_id,
- block_info->ionode_str, bg_record->ionode_str);
- }
- }
- bg_record->mp_bitmap = mp_bitmap;
- mp_bitmap = NULL;
- /* put_block_in_error_state should be
- called after the bg_lists->main has been
- made. We can't call it here since
- this record isn't the record kept
- around in bg_lists->main.
- */
- bg_record->state = block_info->state;
- bg_record->cnode_cnt = block_info->cnode_cnt;
- bg_record->mp_count = bit_set_count(bg_record->mp_bitmap);
- /* Don't copy the job_list from the block_info, we will fill
- it in later in the job sync.
- */
- bg_record->job_running = NO_JOB_RUNNING;
- if (bg_conf->sub_blocks && (bg_record->mp_count == 1))
- bg_record->job_list = list_create(NULL);
- #ifdef HAVE_BGL
- bg_record->node_use = block_info->node_use;
- #endif
- memcpy(bg_record->conn_type, block_info->conn_type,
- sizeof(bg_record->conn_type));
- bg_record->blrtsimage = block_info->blrtsimage;
- block_info->blrtsimage = NULL;
- bg_record->linuximage = block_info->linuximage;
- block_info->linuximage = NULL;
- bg_record->mloaderimage = block_info->mloaderimage;
- block_info->mloaderimage = NULL;
- bg_record->ramdiskimage = block_info->ramdiskimage;
- block_info->ramdiskimage = NULL;
- bg_record->reason = block_info->reason;
- block_info->reason = NULL;
- slurm_free_block_info_members(block_info);
- return bg_record;
- }
- static void _local_pack_block_job_info(struct job_record *job_ptr, Buf buffer,
- uint16_t protocol_version)
- {
- block_job_info_t block_job;
- select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data;
- memset(&block_job, 0, sizeof(block_job_info_t));
- block_job.job_id = job_ptr->job_id;
- block_job.user_id = job_ptr->user_id;
- if (jobinfo) {
- block_job.user_name = jobinfo->user_name;
- block_job.cnodes = jobinfo->ionode_str;
- } else
- error("NO JOBINFO for job %u magic %u!!!!!!!!!!!!!!",
- job_ptr->job_id, job_ptr->magic);
- /* block_job.cnode_inx -- try not to set */
- slurm_pack_block_job_info(&block_job, buffer, protocol_version);
- }
- /* Pack all relevent information about a block */
- /* NOTE: There is a matching pack function in
- * common/slurm_protocol_pack.c dealing with the block_info_t
- * structure there. If anything changes here please update that as well.
- * The unpack for this is in common/slurm_protocol_pack.c
- */
- static void _pack_block(bg_record_t *bg_record, Buf buffer,
- uint16_t protocol_version)
- {
- #ifdef HAVE_BGQ
- int dim;
- #endif
- uint32_t count = NO_VAL, running_job = 0;
- struct job_record *job_ptr;
- ListIterator itr;
- if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) {
- packstr(bg_record->bg_block_id, buffer);
- packstr(bg_record->blrtsimage, buffer);
- pack_bit_fmt(bg_record->mp_bitmap, buffer);
- #ifdef HAVE_BGQ
- pack32(SYSTEM_DIMENSIONS, buffer);
- for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
- pack16(bg_record->conn_type[dim], buffer);
- #else
- pack32(1, buffer); /* for dimensions of conn_type */
- pack16(bg_record->conn_type[0], buffer);
- #endif
- packstr(bg_record->ionode_str, buffer);
- pack_bit_fmt(bg_record->ionode_bitmap, buffer);
- if (bg_record->job_list)
- count = list_count(bg_record->job_list);
- if (count && count != NO_VAL) {
- pack32(count, buffer);
- itr = list_iterator_create(bg_record->job_list);
- while ((job_ptr = list_next(itr))) {
- if (job_ptr->magic != JOB_MAGIC) {
- error("_pack_block: "
- "bad magic found when "
- "packing block %s",
- bg_record->bg_block_id);
- list_delete_item(itr);
- slurm_pack_block_job_info(
- NULL, buffer,
- protocol_version);
- continue;
- }
- _local_pack_block_job_info(
- job_ptr, buffer, protocol_version);
- }
- list_iterator_destroy(itr);
- } else if (bg_record->job_ptr
- && (bg_record->job_ptr->magic == JOB_MAGIC)) {
- pack32(1, buffer);
- _local_pack_block_job_info(
- bg_record->job_ptr, buffer, protocol_version);
- } else
- pack32(count, buffer);
- count = NO_VAL;
- packstr(bg_record->linuximage, buffer);
- packstr(bg_record->mloaderimage, buffer);
- packstr(bg_record->mp_str, buffer);
- pack32(bg_record->cnode_cnt, buffer);
- pack32(bg_record->cnode_err_cnt, buffer);
- pack16((uint16_t)bg_record->node_use, buffer);
- packstr(bg_record->ramdiskimage, buffer);
- packstr(bg_record->reason, buffer);
- pack16((uint16_t)bg_record->state, buffer);
- } else if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
- packstr(bg_record->bg_block_id, buffer);
- packstr(bg_record->blrtsimage, buffer);
- pack_bit_fmt(bg_record->mp_bitmap, buffer);
- #ifdef HAVE_BGQ
- pack32(SYSTEM_DIMENSIONS, buffer);
- for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
- pack16(bg_record->conn_type[dim], buffer);
- #else
- pack32(1, buffer); /* for dimensions of conn_type */
- pack16(bg_record->conn_type[0], buffer);
- #endif
- packstr(bg_record->ionode_str, buffer);
- pack_bit_fmt(bg_record->ionode_bitmap, buffer);
- if (bg_record->job_list)
- count = list_count(bg_record->job_list);
- pack32(count, buffer);
- if (count && count != NO_VAL) {
- itr = list_iterator_create(bg_record->job_list);
- while ((job_ptr = list_next(itr))) {
- if (job_ptr->magic != JOB_MAGIC) {
- error("_pack_block 2.3: "
- "bad magic found when "
- "packing block %s",
- bg_record->bg_block_id);
- list_delete_item(itr);
- continue;
- }
- _local_pack_block_job_info(
- job_ptr, buffer, protocol_version);
- }
- list_iterator_destroy(itr);
- }
- if ((count == 1) && running_job)
- pack32((uint32_t)running_job, buffer);
- else
- pack32((uint32_t)bg_record->job_running, buffer);
- count = NO_VAL;
- packstr(bg_record->linuximage, buffer);
- packstr(bg_record->mloaderimage, buffer);
- packstr(bg_record->mp_str, buffer);
- packnull(buffer); /* for mp_used_str */
- pack32((uint32_t)bg_record->cnode_cnt, buffer);
- pack16((uint16_t)bg_record->node_use, buffer);
- packnull(buffer); /* for user_name */
- packstr(bg_record->ramdiskimage, buffer);
- packstr(bg_record->reason, buffer);
- pack16((uint16_t)bg_record->state, buffer);
- packnull(buffer); /* for mp_used_inx */
- }
- }
- /* Pack all extra information about a block (Only needed for saving state.) */
- static void _pack_block_ext(bg_record_t *bg_record, Buf buffer,
- uint16_t protocol_version)
- {
- ListIterator itr;
- ba_mp_t *ba_mp;
- uint32_t count = NO_VAL;
- int i;
- xassert(bg_record);
- if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
- if (bg_record->ba_mp_list)
- count = list_count(bg_record->ba_mp_list);
- pack32(count, buffer);
- if (count && count != NO_VAL) {
- itr = list_iterator_create(bg_record->ba_mp_list);
- while ((ba_mp = list_next(itr)))
- pack_ba_mp(ba_mp, buffer, protocol_version);
- list_iterator_destroy(itr);
- }
- pack32(bg_record->cpu_cnt, buffer);
- for (i=0; i<SYSTEM_DIMENSIONS; i++) {
- pack16(bg_record->geo[i], buffer);
- pack16(bg_record->start[i], buffer);
- }
- pack16(bg_record->full_block, buffer);
- pack32(bg_record->switch_count, buffer);
- } else {
- /* didn't exist before 2.3 */
- }
- }
- /* UNPack all extra information about a block */
- static int _unpack_block_ext(bg_record_t *bg_record, Buf buffer,
- uint16_t protocol_version)
- {
- ba_mp_t *ba_mp;
- uint32_t count = NO_VAL;
- int i;
- uint16_t temp16;
- xassert(bg_record);
- if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
- safe_unpack32(&count, buffer);
- if (count == NO_VAL) {
- error("_unpack_block_ext: bg_record record has no "
- "mp_list");
- goto unpack_error;
- }
- bg_record->ba_mp_list = list_create(destroy_ba_mp);
- for (i=0; i<count; i++) {
- if (unpack_ba_mp(&ba_mp, buffer, protocol_version)
- == SLURM_ERROR)
- goto unpack_error;
- list_append(bg_record->ba_mp_list, ba_mp);
- }
- safe_unpack32(&bg_record->cpu_cnt, buffer);
- for (i=0; i<SYSTEM_DIMENSIONS; i++) {
- safe_unpack16(&bg_record->geo[i], buffer);
- safe_unpack16(&bg_record->start[i], buffer);
- }
- safe_unpack16(&temp16, buffer);
- bg_record->full_block = temp16;
- safe_pack32(bg_record->switch_count, buffer);
- } else {
- /* packing didn't exist before 2.3, so set things up
- * to go forward */
- if (bg_conf->mp_cnode_cnt > bg_record->cnode_cnt) {
- bg_record->cpu_cnt = bg_conf->cpus_per_mp /
- (bg_conf->mp_cnode_cnt / bg_record->cnode_cnt);
- } else {
- bg_record->cpu_cnt = bg_conf->cpus_per_mp
- * bg_record->mp_count;
- }
- process_nodes(bg_record, true);
- }
- return SLURM_SUCCESS;
- unpack_error:
- error("Problem unpacking extended block info for %s, "
- "removing from list",
- bg_record->bg_block_id);
- return SLURM_ERROR;
- }
- static int _load_state_file(List curr_block_list, char *dir_name)
- {
- int state_fd, i;
- char *state_file = NULL;
- Buf buffer = NULL;
- char *data = NULL;
- int data_size = 0;
- block_info_msg_t *block_ptr = NULL;
- bg_record_t *bg_record = NULL;
- char temp[256];
- List results = NULL;
- int data_allocated, data_read = 0;
- char *ver_str = NULL;
- uint32_t ver_str_len;
- char *name = NULL;
- struct part_record *part_ptr = NULL;
- bitstr_t *usable_mp_bitmap = NULL;
- ListIterator itr = NULL;
- uint16_t protocol_version = (uint16_t)NO_VAL;
- uint32_t record_count;
- xassert(curr_block_list);
- xassert(dir_name);
- state_file = xstrdup(dir_name);
- xstrcat(state_file, "/block_state");
- state_fd = open(state_file, O_RDONLY);
- if (state_fd < 0) {
- error("No block state file (%s) to recover", state_file);
- xfree(state_file);
- return SLURM_SUCCESS;
- } else {
- data_allocated = BUF_SIZE;
- data = xmalloc(data_allocated);
- while (1) {
- data_read = read(state_fd, &data[data_size],
- BUF_SIZE);
- if (data_read < 0) {
- if (errno == EINTR)
- continue;
- else {
- error("Read error on %s: %m",
- state_file);
- break;
- }
- } else if (data_read == 0) /* eof */
- break;
- data_size += data_read;
- data_allocated += data_read;
- xrealloc(data, data_allocated);
- }
- close(state_fd);
- }
- xfree(state_file);
- buffer = create_buf(data, data_size);
- safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
- debug3("Version string in block_state header is %s", ver_str);
- if (ver_str) {
- if (!strcmp(ver_str, BLOCK_STATE_VERSION)) {
- protocol_version = SLURM_PROTOCOL_VERSION;
- }
- }
- if (protocol_version == (uint16_t)NO_VAL) {
- error("***********************************************");
- error("Can not recover block state, "
- "data version incompatible");
- error("***********************************************");
- xfree(ver_str);
- free_buf(buffer);
- return EFAULT;
- }
- xfree(ver_str);
- safe_unpack32(&record_count, buffer);
- slurm_mutex_lock(&block_state_mutex);
- reset_ba_system(true);
- /* Locks are already in place to protect part_list here */
- usable_mp_bitmap = bit_alloc(node_record_count);
- itr = list_iterator_create(part_list);
- while ((part_ptr = list_next(itr))) {
- /* we only want to use mps that are in partitions */
- if (!part_ptr->node_bitmap) {
- debug4("Partition %s doesn't have any nodes in it.",
- part_ptr->name);
- continue;
- }
- bit_or(usable_mp_bitmap, part_ptr->node_bitmap);
- }
- list_iterator_destroy(itr);
- if (bit_ffs(usable_mp_bitmap) == -1) {
- fatal("We don't have any nodes in any partitions. "
- "Can't create blocks. "
- "Please check your slurm.conf.");
- }
- for (i=0; i<record_count; i++) {
- block_info_t block_info;
- if (slurm_unpack_block_info_members(
- &block_info, buffer, protocol_version))
- goto unpack_error;
- if (!(bg_record = _translate_info_2_record(&block_info)))
- continue;
- if (_unpack_block_ext(bg_record, buffer, protocol_version)
- != SLURM_SUCCESS) {
- destroy_bg_record(bg_record);
- goto unpack_error;
- }
- /* This means the block here wasn't able to be
- processed correctly, so don't add.
- */
- if (!bg_record->mp_count) {
- error("block %s(%s) can't be made in the current "
- "system, but was around in the previous one.",
- bg_record->bg_block_id, bg_record->mp_str);
- list_destroy(results);
- destroy_bg_record(bg_record);
- continue;
- }
- if ((bg_conf->layout_mode == LAYOUT_OVERLAP)
- || bg_record->full_block)
- reset_ba_system(false);
- if (bg_record->ba_mp_list) {
- /* only do this for blocks bigger than 1
- midplane */
- if (bg_record->cpu_cnt >= bg_conf->cpus_per_mp)
- if (check_and_set_mp_list(bg_record->ba_mp_list)
- == SLURM_ERROR)
- error("something happened in the "
- "load of %s, keeping it "
- "around though",
- bg_record->bg_block_id);
- } else {
- select_ba_request_t ba_request;
- ba_set_removable_mps(usable_mp_bitmap, 1);
- /* we want the mps that aren't
- * in this record to mark them as used
- */
- if (ba_set_removable_mps(bg_record->mp_bitmap, 1)
- != SLURM_SUCCESS)
- fatal("1 It doesn't seem we have a bitmap "
- "for %s",
- bg_record->bg_block_id);
- #ifdef HAVE_BGQ
- results = list_create(destroy_ba_mp);
- #else
- results = list_create(NULL);
- #endif
- /* info("adding back %s %s", bg_record->bg_block_id, */
- /* bg_record->mp_str); */
- memset(&ba_request, 0, sizeof(ba_request));
- memcpy(ba_request.start, bg_record->start,
- sizeof(bg_record->start));
- memcpy(ba_request.geometry, bg_record->geo,
- sizeof(bg_record->geo));
- memcpy(ba_request.conn_type, bg_record->conn_type,
- sizeof(bg_record->conn_type));
- ba_request.start_req = 1;
- name = set_bg_block(results, &ba_request);
- ba_reset_all_removed_mps();
- if (!name) {
- error("I was unable to make the "
- "requested block.");
- list_destroy(results);
- destroy_bg_record(bg_record);
- bg_record = NULL;
- continue;
- }
- snprintf(temp, sizeof(temp), "%s%s",
- bg_conf->slurm_node_prefix,
- name);
- xfree(name);
- if (strcmp(temp, bg_record->mp_str)) {
- fatal("bad wiring in preserved state "
- "(found %s, but allocated %s) "
- "YOU MUST COLDSTART",
- bg_record->mp_str, temp);
- }
- if (bg_record->ba_mp_list)
- list_destroy(bg_record->ba_mp_list);
- #ifdef HAVE_BGQ
- bg_record->ba_mp_list = results;
- results = NULL;
- #else
- bg_record->ba_mp_list = list_create(destroy_ba_mp);
- copy_node_path(results, &bg_record->ba_mp_list);
- list_destroy(results);
- #endif
- }
- // bridge_block_create(bg_record);
- list_push(curr_block_list, bg_record);
- }
- FREE_NULL_BITMAP(usable_mp_bitmap);
- sort_bg_record_inc_size(curr_block_list);
- slurm_mutex_unlock(&block_state_mutex);
- info("Recovered %d blocks", list_count(curr_block_list));
- slurm_free_block_info_msg(block_ptr);
- free_buf(buffer);
- return SLURM_SUCCESS;
- unpack_error:
- FREE_NULL_BITMAP(usable_mp_bitmap);
- slurm_mutex_unlock(&block_state_mutex);
- error("Incomplete block data checkpoint file");
- free_buf(buffer);
- return SLURM_FAILURE;
- }
- static void _handle_existing_block(bg_record_t *bg_record)
- {
- char *conn_type;
- char node_str[256];
- xassert(bg_record);
- format_node_name(bg_record, node_str, sizeof(node_str));
- conn_type = conn_type_string_full(bg_record->conn_type);
- info("Existing: BlockID:%s Nodes:%s Conn:%s",
- bg_record->bg_block_id, node_str, conn_type);
- xfree(conn_type);
- /* Sanity check to make sure we have the correct setup from
- the save.
- */
- if (bg_conf->sub_blocks && bg_record->mp_count == 1) {
- ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
- xassert(ba_mp);
- if (!ba_mp->cnode_bitmap) {
- error("_handle_existing_block: No cnode_bitmap "
- "for block %s, creating it",
- bg_record->bg_block_id);
- if ((ba_mp->cnode_bitmap =
- ba_create_ba_mp_cnode_bitmap(bg_record))) {
- if (!ba_mp->cnode_err_bitmap)
- ba_mp->cnode_err_bitmap =
- bit_alloc(bg_conf->
- mp_cnode_cnt);
- FREE_NULL_BITMAP(ba_mp->cnode_usable_bitmap);
- ba_mp->cnode_usable_bitmap =
- bit_copy(ba_mp->cnode_bitmap);
- }
- }
- }
- if (bg_record->state & BG_BLOCK_ERROR_FLAG)
- put_block_in_error_state(bg_record, NULL);
- else if (((bg_record->state == BG_BLOCK_INITED)
- || (bg_record->state == BG_BLOCK_BOOTING))
- && !block_ptr_exist_in_list(bg_lists->booted, bg_record))
- list_push(bg_lists->booted, bg_record);
- }
- /*
- * _validate_config_blocks - Match slurm configuration information with
- * current BG block configuration.
- * IN/OUT curr_block_list - List of blocks already existing on the system.
- * IN/OUT found_block_list - List of blocks found on the system
- * that are listed in the bluegene.conf.
- * NOTE: Both of the lists above should be created with list_create(NULL)
- * since the bg_lists->main will contain the complete list of pointers
- * and be destroyed with it.
- *
- * RET - SLURM_SUCCESS if no blocks need to be deleted, else an error
- * code. Writes bg_block_id into bg_lists->main records.
- */
- static int _validate_config_blocks(List curr_block_list,
- List found_block_list, char *dir)
- {
- int rc = SLURM_ERROR;
- bg_record_t* bg_record = NULL;
- bg_record_t* init_bg_record = NULL;
- int full_created = 0;
- ListIterator itr_conf;
- ListIterator itr_curr;
- char tmp_char[256];
- int dim;
- xassert(curr_block_list);
- xassert(found_block_list);
- /* read in state from last run. */
- if (bg_recover)
- rc = _load_state_file(curr_block_list, dir);
- #ifndef HAVE_BG_FILES
- if (rc != SLURM_SUCCESS)
- return rc;
- #endif
- /* read current bg block info into curr_block_list This
- * happens in the state load before this in emulation mode */
- if (bridge_blocks_load_curr(curr_block_list) == SLURM_ERROR)
- return SLURM_ERROR;
- if (!bg_recover)
- return SLURM_ERROR;
- #ifdef HAVE_BG_FILES
- /* Since we just checked all the blocks from state against that
- in the database we can now check to see if there were once
- blocks that are now gone from the database and remove them
- from the list.
- */
- itr_curr = list_iterator_create(curr_block_list);
- while ((bg_record = list_next(itr_curr))) {
- if (bg_record->modifying) {
- bg_record->modifying = 0;
- continue;
- }
- error("Found state for block %s, but that "
- "block isn't in the system anymore, removing",
- bg_record->bg_block_id);
- list_delete_item(itr_curr);
- }
- list_iterator_destroy(itr_curr);
- #endif
- if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
- /* Since we don't read the blocks in a Dynamic system
- we can just transfer the list here and return.
- */
- list_transfer(bg_lists->main, curr_block_list);
- itr_conf = list_iterator_create(bg_lists->main);
- while ((bg_record = list_next(itr_conf)))
- _handle_existing_block(bg_record);
- list_iterator_destroy(itr_conf);
- return SLURM_SUCCESS;
- }
- /* Only when we are looking at a non-dynamic system do we need
- to go through the following logic to make sure things are insync.
- */
- itr_curr = list_iterator_create(curr_block_list);
- itr_conf = list_iterator_create(bg_lists->main);
- while ((bg_record = list_next(itr_conf))) {
- list_iterator_reset(itr_curr);
- while ((init_bg_record = list_next(itr_curr))) {
- if (!bit_equal(bg_record->mp_bitmap,
- init_bg_record->mp_bitmap))
- continue; /* wrong nodes */
- if (!bit_equal(bg_record->ionode_bitmap,
- init_bg_record->ionode_bitmap))
- continue;
- if ((bg_record->conn_type[0] < SELECT_SMALL)
- && (init_bg_record->conn_type[0] < SELECT_SMALL)) {
- for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
- /* Only look at how far we
- have set. The bg_record
- should of been set up
- correctly in the
- parse_blockreq() function.
- */
- if (bg_record->conn_type[dim] ==
- (uint16_t)NO_VAL) {
- dim = SYSTEM_DIMENSIONS;
- break;
- }
- if (bg_record->conn_type[dim] !=
- init_bg_record->conn_type[dim])
- break; /* wrong conn_type */
- }
- if (dim < SYSTEM_DIMENSIONS)
- continue;
- }
- copy_bg_record(init_bg_record, bg_record);
- /* remove from the curr list since we just
- matched it no reason to keep it around
- anymore */
- list_delete_item(itr_curr);
- break;
- }
- if (!bg_record->bg_block_id) {
- format_node_name(bg_record, tmp_char,
- sizeof(tmp_char));
- info("Block found in bluegene.conf to be "
- "created: Nodes:%s",
- tmp_char);
- } else {
- if (bg_record->full_block)
- full_created = 1;
- list_push(found_block_list, bg_record);
- _handle_existing_block(bg_record);
- }
- }
- if (!full_created) {
- list_iterator_reset(itr_curr);
- while ((init_bg_record = list_next(itr_curr))) {
- if (init_bg_record->full_block) {
- list_remove(itr_curr);
- bg_record = init_bg_record;
- list_append(bg_lists->main, bg_record);
- list_push(found_block_list, bg_record);
- _handle_existing_block(bg_record);
- break;
- }
- }
- }
- list_iterator_destroy(itr_conf);
- list_iterator_destroy(itr_curr);
- if (!list_count(curr_block_list))
- rc = SLURM_SUCCESS;
- else
- rc = SLURM_ERROR;
- return rc;
- }
- static List _get_config(void)
- {
- config_key_pair_t *key_pair;
- List my_list = list_create(destroy_config_key_pair);
- if (!my_list)
- fatal("malloc failure on list_create");
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("DefaultConnType");
- key_pair->value = conn_type_string_full(bg_conf->default_conn_type);
- list_append(my_list, key_pair);
- #ifndef HAVE_BG_FILES
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("Emulated");
- key_pair->value = xstrdup("yes");
- list_append(my_list, key_pair);
- #endif
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("MaxBlockInError");
- key_pair->value = xstrdup_printf("%u", bg_conf->max_block_err);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("MidPlaneNodeCnt");
- key_pair->value = xstrdup_printf("%u", bg_conf->mp_cnode_cnt);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("NodeCPUCnt");
- key_pair->value = xstrdup_printf("%u", bg_conf->cpu_ratio);
- list_append(my_list, key_pair);
- #ifdef HAVE_BGL
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("BlrtsImage");
- key_pair->value = xstrdup(bg_conf->default_blrtsimage);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("LinuxImage");
- key_pair->value = xstrdup(bg_conf->default_linuximage);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("RamDiskImage");
- key_pair->value = xstrdup(bg_conf->default_ramdiskimage);
- list_append(my_list, key_pair);
- #elif defined HAVE_BGP
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("CnloadImage");
- key_pair->value = xstrdup(bg_conf->default_linuximage);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("IoloadImage");
- key_pair->value = xstrdup(bg_conf->default_ramdiskimage);
- list_append(my_list, key_pair);
- #endif
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("BridgeAPILogFile");
- key_pair->value = xstrdup(bg_conf->bridge_api_file);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("BridgeAPIVerbose");
- key_pair->value = xstrdup_printf("%u", bg_conf->bridge_api_verb);
- list_append(my_list, key_pair);
- if (bg_conf->deny_pass) {
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("DenyPassThrough");
- if (bg_conf->deny_pass & PASS_DENY_A)
- xstrcat(key_pair->value, "A,");
- if (bg_conf->deny_pass & PASS_DENY_X)
- xstrcat(key_pair->value, "X,");
- if (bg_conf->deny_pass & PASS_DENY_Y)
- xstrcat(key_pair->value, "Y,");
- if (bg_conf->deny_pass & PASS_DENY_Z)
- xstrcat(key_pair->value, "Z,");
- if (key_pair->value)
- key_pair->value[strlen(key_pair->value)-1] = '\0';
- list_append(my_list, key_pair);
- }
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("IONodesPerMP");
- key_pair->value = xstrdup_printf("%u", bg_conf->ionodes_per_mp);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("LayoutMode");
- switch(bg_conf->layout_mode) {
- case LAYOUT_STATIC:
- key_pair->value = xstrdup("Static");
- break;
- case LAYOUT_OVERLAP:
- key_pair->value = xstrdup("Overlap");
- break;
- case LAYOUT_DYNAMIC:
- key_pair->value = xstrdup("Dynamic");
- break;
- default:
- key_pair->value = xstrdup("Unknown");
- break;
- }
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("MloaderImage");
- key_pair->value = xstrdup(bg_conf->default_mloaderimage);
- list_append(my_list, key_pair);
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("NodeCardNodeCnt");
- key_pair->value = xstrdup_printf("%u", bg_conf->nodecard_cnode_cnt);
- list_append(my_list, key_pair);
- if (bg_conf->sub_blocks) {
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("AllowSubBlockAllocations");
- key_pair->value = xstrdup("Yes");
- list_append(my_list, key_pair);
- }
- if (bg_conf->sub_mp_sys) {
- key_pair = xmalloc(sizeof(config_key_pair_t));
- key_pair->name = xstrdup("SubMidplaneSystem");
- key_pair->value = xstrdup("Yes");
- list_append(my_list, key_pair);
- }
- list_sort(my_list, (ListCmpF) sort_key_pairs);
- return my_list;
- }
- #endif
- /*
- * init() is called when the plugin is loaded, before any other functions
- * are called. Put global initialization here.
- */
- extern int init(void)
- {
- #ifdef HAVE_BG
- if (!bg_conf) {
- /* This is needed on all systems where srun wraps the
- bluegene calling program (i.e. runjob).
- */
- bg_conf = xmalloc(sizeof(bg_config_t));
- /* set some defaults for most systems */
- bg_conf->actual_cnodes_per_mp = bg_conf->mp_cnode_cnt = 512;
- bg_conf->quarter_cnode_cnt = 128;
- bg_conf->nodecard_cnode_cnt = 32;
- bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt
- / bg_conf->nodecard_cnode_cnt;
- }
- if (bg_recover != NOT_FROM_CONTROLLER) {
- #if defined HAVE_BG_L_P && (SYSTEM_DIMENSIONS != 3)
- fatal("SYSTEM_DIMENSIONS value (%d) invalid for BlueGene",
- SYSTEM_DIMENSIONS);
- #elif defined HAVE_BGQ && (SYSTEM_DIMENSIONS != 4)
- fatal("SYSTEM_DIMENSIONS value (%d) invalid for BGQ",
- SYSTEM_DIMENSIONS);
- #endif
- #if defined HAVE_BG_FILES && defined HAVE_BG_L_P
- #ifdef HAVE_BGL
- if (!getenv("CLASSPATH") || !getenv("DB2INSTANCE")
- || !getenv("VWSPATH"))
- fatal("db2profile has not been "
- "run to setup DB2 environment");
- if ((SELECT_COPROCESSOR_MODE != RM_PARTITION_COPROCESSOR_MODE)
- || (SELECT_VIRTUAL_NODE_MODE
- != RM_PARTITION_VIRTUAL_NODE_MODE))
- fatal("enum node_use_type out of sync with rm_api.h");
- #endif
- if ((SELECT_MESH != RM_MESH)
- || (SELECT_TORUS != RM_TORUS)
- || (SELECT_NAV != RM_NAV))
- fatal("enum conn_type out of sync with rm_api.h");
- #endif
- verbose("%s loading...", plugin_name);
- /* if this is coming from something other than the controller
- we don't want to read the config or anything like that. */
- _set_bg_lists();
- xfree(bg_conf->slurm_user_name);
- xfree(bg_conf->slurm_node_prefix);
- slurm_conf_lock();
- xassert(slurmctld_conf.slurm_user_name);
- xassert(slurmctld_conf.node_prefix);
- bg_conf->slurm_user_name =
- xstrdup(slurmctld_conf.slurm_user_name);
- bg_conf->slurm_node_prefix =
- xstrdup(slurmctld_conf.node_prefix);
- bg_conf->slurm_debug_flags = slurmctld_conf.debug_flags;
- bg_conf->slurm_debug_level = slurmctld_conf.slurmctld_debug;
- slurm_conf_unlock();
- if (bg_conf->blrts_list)
- list_destroy(bg_conf->blrts_list);
- bg_conf->blrts_list = list_create(destroy_image);
- if (bg_conf->linux_list)
- list_destroy(bg_conf->linux_list);
- bg_conf->linux_list = list_create(destroy_image);
- if (bg_conf->mloader_list)
- list_destroy(bg_conf->mloader_list);
- bg_conf->mloader_list = list_create(destroy_image);
- if (bg_conf->ramdisk_list)
- list_destroy(bg_conf->ramdisk_list);
- bg_conf->ramdisk_list = list_create(destroy_image);
- ba_init(NULL, 1);
- verbose("BlueGene plugin loaded successfully");
- }
- verbose("%s loaded", plugin_name);
- #else
- if (bg_recover != NOT_FROM_CONTROLLER)
- fatal("select/bluegene is incompatible with a "
- "non BlueGene system");
- #endif
- return SLURM_SUCCESS;
- }
- extern int fini ( void )
- {
- int rc = SLURM_SUCCESS;
- ba_fini();
- _destroy_bg_config(bg_conf);
- _destroy_bg_lists(bg_lists);
- return rc;
- }
- /*
- * The remainder of this file implements the standard SLURM
- * node selection API.
- */
- /* We rely upon DB2 to save and restore BlueGene state */
- extern int select_p_state_save(char *dir_name)
- {
- #ifdef HAVE_BG
- ListIterator itr;
- bg_record_t *bg_record = NULL;
- int error_code = 0, log_fd;
- char *old_file, *new_file, *reg_file;
- uint32_t blocks_packed = 0, tmp_offset, block_offset;
- Buf buffer = init_buf(BUF_SIZE);
- slurmctld_lock_t job_read_lock =
- { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
- DEF_TIMERS;
- debug("bluegene: select_p_state_save");
- START_TIMER;
- /* write header: time */
- packstr(BLOCK_STATE_VERSION, buffer);
- block_offset = get_buf_offset(buffer);
- pack32(blocks_packed, buffer);
- /* Lock job read before block to avoid deadlock job lock is
- * needed because we look at the job_ptr's to send job info. */
- lock_slurmctld(job_read_lock);
- /* write block records to buffer */
- slurm_mutex_lock(&block_state_mutex);
- itr = list_iterator_create(bg_lists->main);
- while ((bg_record = list_next(itr))) {
- if (bg_record->magic != BLOCK_MAGIC)
- continue;
- xassert(bg_record->bg_block_id != NULL);
- _pack_block(bg_record, buffer, SLURM_PROTOCOL_VERSION);
- _pack_block_ext(bg_record, buffer, SLURM_PROTOCOL_VERSION);
- blocks_packed++;
- }
- list_iterator_destroy(itr);
- slurm_mutex_unlock(&block_state_mutex);
- unlock_slurmctld(job_read_lock);
- tmp_offset = get_buf_offset(buffer);
- set_buf_offset(buffer, block_offset);
- pack32(blocks_packed, buffer);
- set_buf_offset(buffer, tmp_offset);
- /* Maintain config read lock until we copy state_save_location *\
- \* unlock_slurmctld(part_read_lock); - see below */
- /* write the buffer to file */
- slurm_conf_lock();
- old_file = xstrdup(slurmctld_conf.state_save_location);
- xstrcat(old_file, "/block_state.old");
- reg_file = xstrdup(slurmctld_conf.state_save_location);
- xstrcat(reg_file, "/block_state");
- new_file = xstrdup(slurmctld_conf.state_save_location);
- xstrcat(new_file, "/block_state.new");
- slurm_conf_unlock();
- log_fd = creat(new_file, 0600);
- if (log_fd < 0) {
- error("Can't save state, error creating file %s, %m",
- new_file);
- error_code = errno;
- } else {
- int pos = 0, nwrite = get_buf_offset(buffer), amount;
- char *data = (char *)get_buf_data(buffer);
- while (nwrite > 0) {
- amount = write(log_fd, &data[pos], nwrite);
- if ((amount < 0) && (errno != EINTR)) {
- error("Error writing file %s, %m", new_file);
- error_code = errno;
- break;
- }
- nwrite -= amount;
- pos += amount;
- }
- fsync(log_fd);
- close(log_fd);
- }
- if (error_code)
- (void) unlink(new_file);
- else { /* file shuffle */
- (void) unlink(old_file);
- if (link(reg_file, old_file))
- debug4("unable to create link for %s -> %s: %m",
- reg_file, old_file);
- (void) unlink(reg_file);
- if (link(new_file, reg_file))
- debug4("unable to create link for %s -> %s: %m",
- new_file, reg_file);
- (void) unlink(new_file);
- }
- xfree(old_file);
- xfree(reg_file);
- xfree(new_file);
- free_buf(buffer);
- END_TIMER2("select_p_state_save");
- return SLURM_SUCCESS;
- #else
- return SLURM_ERROR;
- #endif
- }
- extern int select_p_state_restore(char *dir_name)
- {
- #ifdef HAVE_BG
- debug("bluegene: select_p_state_restore");
- /* found bg blocks already on system */
- List curr_block_list = NULL;
- List found_block_list = NULL;
- static time_t last_config_update = (time_t) 0;
- /* only run on startup */
- if (last_config_update)
- return SLURM_SUCCESS;
- last_config_update = time(NULL);
- curr_block_list = list_create(destroy_bg_record);
- found_block_list = list_create(NULL);
- //#if 0
- /* Check to see if the configs we have are correct */
- if (_validate_config_blocks(curr_block_list, found_block_list, dir_name)
- == SLURM_ERROR) {
- _delete_old_blocks(curr_block_list, found_block_list);
- }
- //#endif
- /* looking for blocks only I created */
- if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
- info("No blocks created until jobs are submitted");
- } else {
- if (create_defined_blocks(bg_conf->layout_mode,
- found_block_list)
- == SLURM_ERROR) {
- /* error in creating the static blocks, so
- * blocks referenced by submitted jobs won't
- * correspond to actual slurm blocks.
- */
- fatal("Error, could not create the static blocks");
- return SLURM_ERROR;
- }
- }
- list_destroy(curr_block_list);
- curr_block_list = NULL;
- list_destroy(found_block_list);
- found_block_list = NULL;
- slurm_mutex_lock(&block_state_mutex);
- last_bg_update = time(NULL);
- sort_bg_record_inc_size(bg_lists->main);
- slurm_mutex_unlock(&block_state_mutex);
- if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
- info("Blocks have finished being created.");
- return SLURM_SUCCESS;
- #else
- return SLURM_ERROR;
- #endif
- }
- /* Sync BG blocks to currently active jobs */
- extern int select_p_job_init(List job_list)
- {
- #ifdef HAVE_BG
- int rc = sync_jobs(job_list);
- /* after we have synced the blocks then we say they are
- created. */
- blocks_are_created = 1;
- return rc;
- #else
- return SLURM_ERROR;
- #endif
- }
/*
 * select_p_node_ranking - node ranking hook.
 * This plugin ignores both arguments and always returns false.
 */
extern bool select_p_node_ranking(struct node_record *node_ptr, int node_cnt)
{
	(void) node_ptr;
	(void) node_cnt;

	return false;
}
- /* All initialization is performed by init() */
- extern int select_p_node_init(struct node_record *node_ptr_array, int node_cnt)
- {
- #ifdef HAVE_BG
- int i = 0;
- uint32_t real_memory, threads, cores;
- if (!node_ptr_array)
- return SLURM_SUCCESS;
- xassert(bg_conf);
- /* we need the amount of memory for a midplane */
- real_memory = bg_conf->mp_cnode_cnt;
- /* Set up some knowns that perhaps aren't all the way
- in the slurm.conf.
- */
- #ifdef HAVE_BGL
- threads = 1;
- cores = 2;
- real_memory *= 512;
- #elif defined HAVE_BGP
- threads = 1;
- cores = 4;
- real_memory *= 2048;
- #else
- /* BGQ */
- threads = 4;
- cores = 16;
- real_memory *= 16384;
- #endif
- bg_conf->cpus_per_mp = bg_conf->mp_cnode_cnt * cores;
- for (i = 0; i < node_cnt; i++) {
- struct node_record *node_ptr = &node_ptr_array[i];
- select_nodeinfo_t *nodeinfo = NULL;
- if (!node_ptr->name)
- continue;
- node_ptr->threads = threads;
- node_ptr->cores = cores;
- node_ptr->sockets = bg_conf->mp_cnode_cnt;
- node_ptr->config_ptr->cpus = node_ptr->cpus =
- bg_conf->cpus_per_mp;
- node_ptr->real_memory = real_memory;
- xassert(node_ptr->select_nodeinfo);
- nodeinfo = node_ptr->select_nodeinfo->data;
- xassert(nodeinfo);
- slurm_mutex_lock(&ba_system_mutex);
- if (!(nodeinfo->ba_mp = str2ba_mp(node_ptr->name))) {
- slurm_mutex_unlock(&ba_system_mutex);
- continue;
- }
- nodeinfo->ba_mp->index = i;
- if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr))
- ba_update_mp_state(
- nodeinfo->ba_mp, node_ptr->node_state);
- nodeinfo->ba_mp->state = node_ptr->node_state;
- slurm_mutex_unlock(&ba_system_mutex);
- }
- return SLURM_SUCCESS;
- #else
- return SLURM_ERROR;
- #endif
- }
- /*
- * Called by slurmctld when a new configuration file is loaded
- * or scontrol is used to change block configuration
- */
- extern int select_p_block_init(List part_list)
- {
- #ifdef HAVE_BG
- /* select_p_node_init needs to be called before this to set
- this up correctly
- */
- if (read_bg_conf() == SLURM_ERROR) {
- fatal("Error, could not read the file");
- return SLURM_ERROR;
- }
- if (part_list) {
- struct part_record *part_ptr = NULL;
- ListIterator itr = list_iterator_create(part_list);
- while ((part_ptr = list_next(itr))) {
- char *this_node_name;
- hostlist_t host_list;
- part_ptr->total_cpus = 0;
- if (!part_ptr->nodes) /* no nodes in partition */
- continue;
- if (!(host_list = hostlist_create(part_ptr->nodes))) {
- error("hostlist_create error on %s, %m",
- part_ptr->nodes);
- continue;
- }
- while ((this_node_name = hostlist_shift(host_list))) {
- struct node_record *node_ptr =
- find_node_record(this_node_name);
- if (node_ptr == NULL) {
- error("select_p_block_init: "
- "invalid node name %s",
- this_node_name);
- free(this_node_name);
- hostlist_destroy(host_list);
- continue;
- }
- free(this_node_name);
- part_ptr->total_cpus += node_ptr->cpus;
- }
- hostlist_destroy(host_list);
- part_ptr->max_nodes = part_ptr->max_nodes_orig;
- part_ptr->min_nodes = part_ptr->min_nodes_orig;
- select_p_alter_node_cnt(SELECT_SET_MP_CNT,
- &part_ptr->max_nodes);
- select_p_alter_node_cnt(SELECT_SET_MP_CNT,
- &part_ptr->min_nodes);
- }
- list_iterator_destroy(itr);
- }
- return SLURM_SUCCESS;
- #else
- return SLURM_ERROR;
- #endif
- }
- /*
- * select_p_job_test - Given a specification of scheduling requirements,
- * identify the nodes which "best" satisfy the request. The specified
- * nodes may be DOWN or BUSY at the time of this test as may be used
- * to determine if a job could ever run.
- * IN/OUT job_ptr - pointer to job being scheduled start_time is set
- * when we can possibly start job.
- * IN/OUT bitmap - usable nodes are set on input, nodes not required to
- * satisfy the request are cleared, other left set
- * IN min_nodes - minimum count of nodes
- * IN max_nodes - maximum count of nodes (0==don't care)
- * IN req_nodes - requested (or desired) count of nodes
- * IN mode …
Large files files are truncated, but you can click here to view the full file