PageRenderTime 80ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/src/common/gres.c

https://github.com/cfenoy/slurm
C | 4721 lines | 3817 code | 442 blank | 462 comment | 1109 complexity | 5c6779d3369a8bd29341bd3b1700a882 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0

Large files are truncated, but you can click here to view the full file

  1. /*****************************************************************************\
  2. * gres.c - driver for gres plugin
  3. *****************************************************************************
  4. * Copyright (C) 2010 Lawrence Livermore National Security.
  5. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  6. * Written by Morris Jette <jette1@llnl.gov>
  7. * CODE-OCEC-09-009. All rights reserved.
  8. *
  9. * This file is part of SLURM, a resource management program.
  10. * For details, see <http://www.schedmd.com/slurmdocs/>.
  11. * Please also read the included file: DISCLAIMER.
  12. *
  13. * SLURM is free software; you can redistribute it and/or modify it under
  14. * the terms of the GNU General Public License as published by the Free
  15. * Software Foundation; either version 2 of the License, or (at your option)
  16. * any later version.
  17. *
  18. * In addition, as a special exception, the copyright holders give permission
  19. * to link the code of portions of this program with the OpenSSL library under
  20. * certain conditions as described in each individual source file, and
  21. * distribute linked combinations including the two. You must obey the GNU
  22. * General Public License in all respects for all of the code used other than
  23. * OpenSSL. If you modify file(s) with this exception, you may extend this
  24. * exception to your version of the file(s), but you are not obligated to do
  25. * so. If you do not wish to do so, delete this exception statement from your
  26. * version. If you delete this exception statement from all source files in
  27. * the program, then also delete it here.
  28. *
  29. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  30. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  31. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  32. * details.
  33. *
  34. * You should have received a copy of the GNU General Public License along
  35. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  36. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  37. \*****************************************************************************/
  38. #if HAVE_CONFIG_H
  39. # include "config.h"
  40. # if STDC_HEADERS
  41. # include <string.h>
  42. # endif
  43. # if HAVE_SYS_TYPES_H
  44. # include <sys/types.h>
  45. # endif /* HAVE_SYS_TYPES_H */
  46. # if HAVE_UNISTD_H
  47. # include <unistd.h>
  48. # endif
  49. # if HAVE_INTTYPES_H
  50. # include <inttypes.h>
  51. # else /* ! HAVE_INTTYPES_H */
  52. # if HAVE_STDINT_H
  53. # include <stdint.h>
  54. # endif
  55. # endif /* HAVE_INTTYPES_H */
  56. # ifdef HAVE_LIMITS_H
  57. # include <limits.h>
  58. # endif
  59. #else /* ! HAVE_CONFIG_H */
  60. # include <limits.h>
  61. # include <sys/types.h>
  62. # include <stdint.h>
  63. # include <stdlib.h>
  64. # include <string.h>
  65. #endif /* HAVE_CONFIG_H */
  66. #include <stdio.h>
  67. #include <stdlib.h>
  68. #include <sys/stat.h>
  69. #include "slurm/slurm.h"
  70. #include "slurm/slurm_errno.h"
  71. #include "src/common/gres.h"
  72. #include "src/common/list.h"
  73. #include "src/common/macros.h"
  74. #include "src/common/pack.h"
  75. #include "src/common/parse_config.h"
  76. #include "src/common/plugin.h"
  77. #include "src/common/plugrack.h"
  78. #include "src/common/slurm_protocol_api.h"
  79. #include "src/common/xmalloc.h"
  80. #include "src/common/xstring.h"
  81. #define GRES_MAGIC 0x438a34d4
  82. /* Gres symbols provided by the plugin */
  83. typedef struct slurm_gres_ops {
  84. int (*node_config_load) ( List gres_conf_list );
  85. void (*job_set_env) ( char ***job_env_ptr,
  86. void *gres_ptr );
  87. void (*step_set_env) ( char ***job_env_ptr,
  88. void *gres_ptr );
  89. void (*send_stepd) ( int fd );
  90. void (*recv_stepd) ( int fd );
  91. } slurm_gres_ops_t;
  92. /* Gres plugin context, one for each gres type */
  93. typedef struct slurm_gres_context {
  94. plugin_handle_t cur_plugin;
  95. char * gres_name; /* name (e.g. "gpu") */
  96. char * gres_name_colon; /* name + colon (e.g. "gpu:") */
  97. int gres_name_colon_len; /* size of gres_name_colon */
  98. char * gres_type; /* plugin name (e.g. "gres/gpu") */
  99. bool has_file; /* found "File=" in slurm.conf */
  100. slurm_gres_ops_t ops; /* pointers to plugin symbols */
  101. uint32_t plugin_id; /* key for searches */
  102. plugrack_t plugin_list; /* plugrack info */
  103. } slurm_gres_context_t;
  /*
   * Generic gres data structure for adding to a list.
   * Depending upon the context, gres_data points to a gres_node_state_t,
   * gres_job_state_t or gres_step_state_t.
   */
  typedef struct gres_state {
      uint32_t  plugin_id;    /* identifies the owning gres plugin */
      void     *gres_data;    /* context-dependent state, see above */
  } gres_state_t;
  111. /* Local variables */
  112. static int gres_context_cnt = -1;
  113. static uint32_t gres_cpu_cnt = 0;
  114. static bool gres_debug = false;
  115. static slurm_gres_context_t *gres_context = NULL;
  116. static char *gres_plugin_list = NULL;
  117. static pthread_mutex_t gres_context_lock = PTHREAD_MUTEX_INITIALIZER;
  118. static List gres_conf_list = NULL;
  119. static bool init_run = false;
  120. /* Local functions */
  121. static gres_node_state_t *
  122. _build_gres_node_state(void);
  123. static uint32_t _build_id(char *gres_name);
  124. static bitstr_t *_cpu_bitmap_rebuild(bitstr_t *old_cpu_bitmap, int new_size);
  125. static void _destroy_gres_slurmd_conf(void *x);
  126. static uint32_t _get_gres_cnt(char *orig_config, char *gres_name,
  127. char *gres_name_colon, int gres_name_colon_len);
  128. static char * _get_gres_conf(void);
  129. static uint32_t _get_tot_gres_cnt(uint32_t plugin_id, uint32_t *set_cnt);
  130. static int _gres_find_id(void *x, void *key);
  131. static void _gres_job_list_delete(void *list_element);
  132. extern int _job_alloc(void *job_gres_data, void *node_gres_data,
  133. int node_cnt, int node_offset, uint32_t cpu_cnt,
  134. char *gres_name, uint32_t job_id, char *node_name,
  135. bitstr_t *core_bitmap);
  136. static int _job_config_validate(char *config, uint32_t *gres_cnt,
  137. slurm_gres_context_t *context_ptr);
  138. static int _job_dealloc(void *job_gres_data, void *node_gres_data,
  139. int node_offset, char *gres_name, uint32_t job_id,
  140. char *node_name);
  141. static void _job_state_delete(void *gres_data);
  142. static void * _job_state_dup(void *gres_data);
  143. static void * _job_state_dup2(void *gres_data, int node_index);
  144. static int _job_state_validate(char *config, void **gres_data,
  145. slurm_gres_context_t *gres_name);
  146. extern uint32_t _job_test(void *job_gres_data, void *node_gres_data,
  147. bool use_total_gres, bitstr_t *cpu_bitmap,
  148. int cpu_start_bit, int cpu_end_bit, bool *topo_set,
  149. uint32_t job_id, char *node_name, char *gres_name);
  150. static int _load_gres_plugin(char *plugin_name,
  151. slurm_gres_context_t *plugin_context);
  152. static int _log_gres_slurmd_conf(void *x, void *arg);
  153. static void _my_stat(char *file_name);
  154. static int _node_config_init(char *node_name, char *orig_config,
  155. slurm_gres_context_t *context_ptr,
  156. gres_state_t *gres_ptr);
  157. static int _node_reconfig(char *node_name, char *orig_config,
  158. char **new_config, gres_state_t *gres_ptr,
  159. uint16_t fast_schedule,
  160. slurm_gres_context_t *context_ptr);
  161. static void _node_state_dealloc(gres_state_t *gres_ptr);
  162. static void * _node_state_dup(void *gres_data);
  163. static void _node_state_log(void *gres_data, char *node_name,
  164. char *gres_name);
  165. static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
  166. const char *key, const char *value,
  167. const char *line, char **leftover);
  168. static void _set_gres_cnt(char *orig_config, char **new_config,
  169. uint32_t new_cnt, char *gres_name,
  170. char *gres_name_colon, int gres_name_colon_len);
  171. static int _step_alloc(void *step_gres_data, void *job_gres_data,
  172. int node_offset, int cpu_cnt, char *gres_name,
  173. uint32_t job_id, uint32_t step_id);
  174. static int _step_dealloc(void *step_gres_data, void *job_gres_data,
  175. char *gres_name, uint32_t job_id,
  176. uint32_t step_id);
  177. static void * _step_state_dup(void *gres_data);
  178. static void * _step_state_dup2(void *gres_data, int node_index);
  179. static int _step_state_validate(char *config, void **gres_data,
  180. slurm_gres_context_t *context_ptr);
  181. static uint32_t _step_test(void *step_gres_data, void *job_gres_data,
  182. int node_offset, bool ignore_alloc, char *gres_name,
  183. uint32_t job_id, uint32_t step_id);
  184. static int _strcmp(const char *s1, const char *s2);
  185. static int _unload_gres_plugin(slurm_gres_context_t *plugin_context);
  186. static void _validate_config(slurm_gres_context_t *context_ptr);
  187. static int _validate_file(char *path_name, char *gres_name);
  188. static void _validate_gres_node_cpus(gres_node_state_t *node_gres_ptr,
  189. int cpus_ctld, char *node_name);
  /*
   * Convert a gres_name into a number for faster comparison operations.
   * Every character is added into the result after being shifted left by
   * 0, 8, 16 or 24 bits; the shift cycles back to 0 after the fourth
   * character.
   * IN gres_name - NUL terminated name, NULL yields an id of zero
   * RET the computed id
   */
  static uint32_t _build_id(char *gres_name)
  {
      uint32_t id = 0;
      unsigned int shift = 0;
      const char *cp;

      if (gres_name == NULL)
          return id;

      for (cp = gres_name; *cp != '\0'; cp++) {
          /* Convert to unsigned before shifting: left-shifting a promoted
           * (possibly negative) char as a signed int is undefined. */
          id += ((uint32_t) *cp) << shift;
          shift = (shift + 8) % 32;
      }
      return id;
  }
  201. static int _gres_find_id(void *x, void *key)
  202. {
  203. uint32_t *plugin_id = (uint32_t *)key;
  204. gres_state_t *state_ptr = (gres_state_t *) x;
  205. if (state_ptr->plugin_id == *plugin_id)
  206. return 1;
  207. return 0;
  208. }
  /*
   * Variant of strcmp that will accept NULL string pointers.
   * A NULL pointer sorts before any string and equals another NULL.
   */
  static int _strcmp(const char *s1, const char *s2)
  {
      if (s1 == NULL)
          return (s2 == NULL) ? 0 : -1;
      if (s2 == NULL)
          return 1;
      return strcmp(s1, s2);
  }
  220. static int _load_gres_plugin(char *plugin_name,
  221. slurm_gres_context_t *plugin_context)
  222. {
  223. /*
  224. * Must be synchronized with slurm_gres_ops_t above.
  225. */
  226. static const char *syms[] = {
  227. "node_config_load",
  228. "job_set_env",
  229. "step_set_env",
  230. "send_stepd",
  231. "recv_stepd",
  232. };
  233. int n_syms = sizeof(syms) / sizeof(char *);
  234. /* Find the correct plugin */
  235. plugin_context->gres_type = xstrdup("gres/");
  236. xstrcat(plugin_context->gres_type, plugin_name);
  237. plugin_context->plugin_list = NULL;
  238. plugin_context->cur_plugin = PLUGIN_INVALID_HANDLE;
  239. plugin_context->cur_plugin = plugin_load_and_link(
  240. plugin_context->gres_type,
  241. n_syms, syms,
  242. (void **) &plugin_context->ops);
  243. if (plugin_context->cur_plugin != PLUGIN_INVALID_HANDLE)
  244. return SLURM_SUCCESS;
  245. if (errno != EPLUGIN_NOTFOUND) {
  246. error("Couldn't load specified plugin name for %s: %s",
  247. plugin_context->gres_type, plugin_strerror(errno));
  248. return SLURM_ERROR;
  249. }
  250. debug("gres: Couldn't find the specified plugin name for %s looking "
  251. "at all files", plugin_context->gres_type);
  252. /* Get plugin list */
  253. if (plugin_context->plugin_list == NULL) {
  254. char *plugin_dir;
  255. plugin_context->plugin_list = plugrack_create();
  256. if (plugin_context->plugin_list == NULL) {
  257. error("gres: cannot create plugin manager");
  258. return SLURM_ERROR;
  259. }
  260. plugrack_set_major_type(plugin_context->plugin_list,
  261. "gres");
  262. plugrack_set_paranoia(plugin_context->plugin_list,
  263. PLUGRACK_PARANOIA_NONE, 0);
  264. plugin_dir = slurm_get_plugin_dir();
  265. plugrack_read_dir(plugin_context->plugin_list, plugin_dir);
  266. xfree(plugin_dir);
  267. }
  268. plugin_context->cur_plugin = plugrack_use_by_type(
  269. plugin_context->plugin_list,
  270. plugin_context->gres_type );
  271. if (plugin_context->cur_plugin == PLUGIN_INVALID_HANDLE) {
  272. debug("Cannot find plugin of type %s, just track gres counts",
  273. plugin_context->gres_type);
  274. return SLURM_ERROR;
  275. }
  276. /* Dereference the API. */
  277. if (plugin_get_syms(plugin_context->cur_plugin,
  278. n_syms, syms,
  279. (void **) &plugin_context->ops ) < n_syms ) {
  280. error("Incomplete %s plugin detected",
  281. plugin_context->gres_type);
  282. return SLURM_ERROR;
  283. }
  284. return SLURM_SUCCESS;
  285. }
  286. static int _unload_gres_plugin(slurm_gres_context_t *plugin_context)
  287. {
  288. int rc;
  289. /*
  290. * Must check return code here because plugins might still
  291. * be loaded and active.
  292. */
  293. if (plugin_context->plugin_list)
  294. rc = plugrack_destroy(plugin_context->plugin_list);
  295. else {
  296. rc = SLURM_SUCCESS;
  297. plugin_unload(plugin_context->cur_plugin);
  298. }
  299. xfree(plugin_context->gres_name);
  300. xfree(plugin_context->gres_name_colon);
  301. xfree(plugin_context->gres_type);
  302. return rc;
  303. }
  304. /*
  305. * Initialize the gres plugin.
  306. *
  307. * Returns a SLURM errno.
  308. */
  309. extern int gres_plugin_init(void)
  310. {
  311. int i, j, rc = SLURM_SUCCESS;
  312. char *last = NULL, *names, *one_name, *full_name;
  313. if (init_run && (gres_context_cnt >= 0))
  314. return rc;
  315. slurm_mutex_lock(&gres_context_lock);
  316. if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
  317. gres_debug = true;
  318. else
  319. gres_debug = false;
  320. if (gres_context_cnt >= 0)
  321. goto fini;
  322. gres_plugin_list = slurm_get_gres_plugins();
  323. gres_context_cnt = 0;
  324. if ((gres_plugin_list == NULL) || (gres_plugin_list[0] == '\0'))
  325. goto fini;
  326. gres_context_cnt = 0;
  327. names = xstrdup(gres_plugin_list);
  328. one_name = strtok_r(names, ",", &last);
  329. while (one_name) {
  330. full_name = xstrdup("gres/");
  331. xstrcat(full_name, one_name);
  332. for (i=0; i<gres_context_cnt; i++) {
  333. if (!strcmp(full_name, gres_context[i].gres_type))
  334. break;
  335. }
  336. xfree(full_name);
  337. if (i<gres_context_cnt) {
  338. error("Duplicate plugin %s ignored",
  339. gres_context[i].gres_type);
  340. } else {
  341. xrealloc(gres_context, (sizeof(slurm_gres_context_t) *
  342. (gres_context_cnt + 1)));
  343. (void) _load_gres_plugin(one_name,
  344. gres_context +
  345. gres_context_cnt);
  346. /* Ignore return code.
  347. * Proceed to support gres even without the plugin */
  348. gres_context[gres_context_cnt].gres_name =
  349. xstrdup(one_name);
  350. gres_context[gres_context_cnt].plugin_id =
  351. _build_id(one_name);
  352. gres_context_cnt++;
  353. }
  354. one_name = strtok_r(NULL, ",", &last);
  355. }
  356. xfree(names);
  357. /* Insure that plugin_id is valid and unique */
  358. for (i=0; i<gres_context_cnt; i++) {
  359. for (j=i+1; j<gres_context_cnt; j++) {
  360. if (gres_context[i].plugin_id !=
  361. gres_context[j].plugin_id)
  362. continue;
  363. fatal("Gres: Duplicate plugin_id %u for %s and %s, "
  364. "change gres name for one of them",
  365. gres_context[i].plugin_id,
  366. gres_context[i].gres_type,
  367. gres_context[j].gres_type);
  368. }
  369. xassert(gres_context[i].gres_name);
  370. gres_context[i].gres_name_colon =
  371. xstrdup_printf("%s:", gres_context[i].gres_name);
  372. gres_context[i].gres_name_colon_len =
  373. strlen(gres_context[i].gres_name_colon);
  374. }
  375. init_run = true;
  376. fini: slurm_mutex_unlock(&gres_context_lock);
  377. return rc;
  378. }
  379. /*
  380. * Terminate the gres plugin. Free memory.
  381. *
  382. * Returns a SLURM errno.
  383. */
  384. extern int gres_plugin_fini(void)
  385. {
  386. int i, j, rc = SLURM_SUCCESS;
  387. slurm_mutex_lock(&gres_context_lock);
  388. if (gres_context_cnt < 0)
  389. goto fini;
  390. init_run = false;
  391. for (i=0; i<gres_context_cnt; i++) {
  392. j = _unload_gres_plugin(gres_context + i);
  393. if (j != SLURM_SUCCESS)
  394. rc = j;
  395. }
  396. xfree(gres_context);
  397. xfree(gres_plugin_list);
  398. FREE_NULL_LIST(gres_conf_list);
  399. gres_context_cnt = -1;
  400. fini: slurm_mutex_unlock(&gres_context_lock);
  401. return rc;
  402. }
  403. /*
  404. **************************************************************************
  405. * P L U G I N C A L L S *
  406. **************************************************************************
  407. */
  408. /*
  409. * Provide a plugin-specific help message for salloc, sbatch and srun
  410. * IN/OUT msg - buffer provided by caller and filled in by plugin
  411. * IN msg_size - size of msg buffer in bytes
  412. */
  413. extern int gres_plugin_help_msg(char *msg, int msg_size)
  414. {
  415. int i, rc;
  416. char *header = "Valid gres options are:\n";
  417. if (msg_size < 1)
  418. return EINVAL;
  419. msg[0] = '\0';
  420. rc = gres_plugin_init();
  421. if ((strlen(header) + 2) <= msg_size)
  422. strcat(msg, header);
  423. slurm_mutex_lock(&gres_context_lock);
  424. for (i=0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
  425. if ((strlen(msg) + strlen(gres_context[i].gres_name) + 9) >
  426. msg_size)
  427. break;
  428. strcat(msg, gres_context[i].gres_name);
  429. strcat(msg, "[:count]\n");
  430. }
  431. slurm_mutex_unlock(&gres_context_lock);
  432. return rc;
  433. }
  434. /*
  435. * Perform reconfig, re-read any configuration files
  436. * OUT did_change - set if gres configuration changed
  437. */
  438. extern int gres_plugin_reconfig(bool *did_change)
  439. {
  440. int rc = SLURM_SUCCESS;
  441. char *plugin_names = slurm_get_gres_plugins();
  442. bool plugin_change;
  443. if (did_change)
  444. *did_change = false;
  445. slurm_mutex_lock(&gres_context_lock);
  446. if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
  447. gres_debug = true;
  448. else
  449. gres_debug = false;
  450. if (_strcmp(plugin_names, gres_plugin_list))
  451. plugin_change = true;
  452. else
  453. plugin_change = false;
  454. slurm_mutex_unlock(&gres_context_lock);
  455. if (plugin_change) {
  456. error("GresPlugins changed from %s to %s ignored",
  457. gres_plugin_list, plugin_names);
  458. error("Restart the slurmctld daemon to change GresPlugins");
  459. if (did_change)
  460. *did_change = true;
  461. #if 0
  462. /* This logic would load new plugins, but we need the old
  463. * plugins to persist in order to process old state
  464. * information. */
  465. rc = gres_plugin_fini();
  466. if (rc == SLURM_SUCCESS)
  467. rc = gres_plugin_init();
  468. #endif
  469. }
  470. xfree(plugin_names);
  471. return rc;
  472. }
  473. /*
  474. * Return the pathname of the gres.conf file
  475. */
  476. static char *_get_gres_conf(void)
  477. {
  478. char *val = getenv("SLURM_CONF");
  479. char *rc = NULL;
  480. int i;
  481. if (!val)
  482. return xstrdup(GRES_CONFIG_FILE);
  483. /* Replace file name on end of path */
  484. i = strlen(val) - strlen("slurm.conf") + strlen("gres.conf") + 1;
  485. rc = xmalloc(i);
  486. strcpy(rc, val);
  487. val = strrchr(rc, (int)'/');
  488. if (val) /* absolute path */
  489. val++;
  490. else /* not absolute path */
  491. val = rc;
  492. strcpy(val, "gres.conf");
  493. return rc;
  494. }
  495. /*
  496. * Destroy a gres_slurmd_conf_t record, free it's memory
  497. */
  498. static void _destroy_gres_slurmd_conf(void *x)
  499. {
  500. gres_slurmd_conf_t *p = (gres_slurmd_conf_t *) x;
  501. xassert(p);
  502. xfree(p->cpus);
  503. xfree(p->file); /* Only used by slurmd */
  504. xfree(p->name);
  505. xfree(p);
  506. }
  507. /*
  508. * Log the contents of a gres_slurmd_conf_t record
  509. */
  510. static int _log_gres_slurmd_conf(void *x, void *arg)
  511. {
  512. gres_slurmd_conf_t *p;
  513. p = (gres_slurmd_conf_t *) x;
  514. xassert(p);
  515. if (!gres_debug) {
  516. verbose("Gres Name=%s Count=%u", p->name, p->count);
  517. return 0;
  518. }
  519. if (p->cpus) {
  520. info("Gres Name=%s Count=%u ID=%u File=%s CPUs=%s CpuCnt=%u",
  521. p->name, p->count, p->plugin_id, p->file, p->cpus,
  522. p->cpu_cnt);
  523. } else if (p->file) {
  524. info("Gres Name=%s Count=%u ID=%u File=%s",
  525. p->name, p->count, p->plugin_id, p->file);
  526. } else {
  527. info("Gres Name=%s Count=%u ID=%u", p->name, p->count,
  528. p->plugin_id);
  529. }
  530. return 0;
  531. }
  /*
   * Confirm that a file named in gres.conf can be stat'ed, fatal() if not.
   * IN file_name - path to test
   */
  static void _my_stat(char *file_name)
  {
      struct stat stat_buf;
      int stat_rc = stat(file_name, &stat_buf);

      if (stat_rc < 0)
          fatal("can't stat gres.conf file %s: %m", file_name);
  }
  538. static int _validate_file(char *path_name, char *gres_name)
  539. {
  540. char *file_name, *slash, *one_name, *root_path;
  541. char *formatted_path = NULL;
  542. hostlist_t hl;
  543. int i, file_count = 0;
  544. i = strlen(path_name);
  545. if ((i < 3) || (path_name[i-1] != ']')) {
  546. _my_stat(path_name);
  547. return 1;
  548. }
  549. slash = strrchr(path_name, '/');
  550. if (slash) {
  551. i = strlen(path_name);
  552. formatted_path = xmalloc(i+1);
  553. slash[0] = '\0';
  554. root_path = xstrdup(path_name);
  555. xstrcat(root_path, "/");
  556. slash[0] = '/';
  557. file_name = slash + 1;
  558. } else {
  559. file_name = path_name;
  560. root_path = NULL;
  561. }
  562. hl = hostlist_create(file_name);
  563. if (hl == NULL)
  564. fatal("can't parse File=%s", path_name);
  565. while ((one_name = hostlist_shift(hl))) {
  566. if (slash) {
  567. sprintf(formatted_path, "%s/%s", root_path, one_name);
  568. _my_stat(formatted_path);
  569. } else {
  570. _my_stat(one_name);
  571. }
  572. file_count++;
  573. free(one_name);
  574. }
  575. hostlist_destroy(hl);
  576. xfree(formatted_path);
  577. xfree(root_path);
  578. return file_count;
  579. }
  580. /*
  581. * Build gres_slurmd_conf_t record based upon a line from the gres.conf file
  582. */
  583. static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
  584. const char *key, const char *value,
  585. const char *line, char **leftover)
  586. {
  587. static s_p_options_t _gres_options[] = {
  588. {"Count", S_P_STRING}, /* Number of Gres available */
  589. {"CPUs" , S_P_STRING}, /* CPUs to bind to Gres resource */
  590. {"File", S_P_STRING}, /* Path to Gres device */
  591. {NULL}
  592. };
  593. int i;
  594. s_p_hashtbl_t *tbl;
  595. gres_slurmd_conf_t *p;
  596. long tmp_long;
  597. char *tmp_str, *last;
  598. tbl = s_p_hashtbl_create(_gres_options);
  599. s_p_parse_line(tbl, *leftover, leftover);
  600. p = xmalloc(sizeof(gres_slurmd_conf_t));
  601. p->name = xstrdup(value);
  602. p->cpu_cnt = gres_cpu_cnt;
  603. if (s_p_get_string(&p->cpus, "CPUs", tbl)) {
  604. bitstr_t *cpu_bitmap; /* Just use to validate config */
  605. cpu_bitmap = bit_alloc(gres_cpu_cnt);
  606. if (cpu_bitmap == NULL)
  607. fatal("bit_alloc: malloc failure");
  608. i = bit_unfmt(cpu_bitmap, p->cpus);
  609. if (i != 0) {
  610. fatal("Invalid gres data for %s, CPUs=%s (only %u CPUs"
  611. " are available)",
  612. p->name, p->cpus, gres_cpu_cnt);
  613. }
  614. FREE_NULL_BITMAP(cpu_bitmap);
  615. }
  616. if (s_p_get_string(&p->file, "File", tbl)) {
  617. p->count = _validate_file(p->file, p->name);
  618. p->has_file = 1;
  619. }
  620. if (s_p_get_string(&tmp_str, "Count", tbl)) {
  621. tmp_long = strtol(tmp_str, &last, 10);
  622. if ((tmp_long == LONG_MIN) || (tmp_long == LONG_MAX)) {
  623. fatal("Invalid gres data for %s, Count=%s", p->name,
  624. tmp_str);
  625. }
  626. if ((last[0] == 'k') || (last[0] == 'K'))
  627. tmp_long *= 1024;
  628. else if ((last[0] == 'm') || (last[0] == 'M'))
  629. tmp_long *= (1024 * 1024);
  630. else if ((last[0] == 'g') || (last[0] == 'G'))
  631. tmp_long *= (1024 * 1024 * 1024);
  632. else if (last[0] != '\0') {
  633. fatal("Invalid gres data for %s, Count=%s", p->name,
  634. tmp_str);
  635. }
  636. if (p->count && (p->count != tmp_long)) {
  637. fatal("Invalid gres data for %s, Count does not match "
  638. "File value", p->name);
  639. }
  640. if ((tmp_long < 0) || (tmp_long >= NO_VAL)) {
  641. fatal("Gres %s has invalid count value %ld",
  642. p->name, tmp_long);
  643. }
  644. p->count = tmp_long;
  645. xfree(tmp_str);
  646. } else if (p->count == 0)
  647. p->count = 1;
  648. s_p_hashtbl_destroy(tbl);
  649. for (i=0; i<gres_context_cnt; i++) {
  650. if (strcasecmp(value, gres_context[i].gres_name) == 0)
  651. break;
  652. }
  653. if (i >= gres_context_cnt) {
  654. error("Ignoring gres.conf Name=%s", value);
  655. _destroy_gres_slurmd_conf(p);
  656. return 0;
  657. }
  658. p->plugin_id = gres_context[i].plugin_id;
  659. *dest = (void *)p;
  660. return 1;
  661. }
  662. static void _validate_config(slurm_gres_context_t *context_ptr)
  663. {
  664. ListIterator iter;
  665. gres_slurmd_conf_t *gres_slurmd_conf;
  666. int has_file = -1, rec_count = 0;
  667. iter = list_iterator_create(gres_conf_list);
  668. if (iter == NULL)
  669. fatal("list_iterator_create: malloc failure");
  670. while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
  671. if (gres_slurmd_conf->plugin_id != context_ptr->plugin_id)
  672. continue;
  673. rec_count++;
  674. if (has_file == -1)
  675. has_file = (int) gres_slurmd_conf->has_file;
  676. else if (( has_file && !gres_slurmd_conf->has_file) ||
  677. (!has_file && gres_slurmd_conf->has_file)) {
  678. fatal("gres.conf for %s, some records have File "
  679. "specification while others do not",
  680. context_ptr->gres_name);
  681. }
  682. if ((has_file == 0) && (rec_count > 1)) {
  683. fatal("gres.conf duplicate records for %s",
  684. context_ptr->gres_name);
  685. }
  686. }
  687. list_iterator_destroy(iter);
  688. }
  689. extern int gres_plugin_node_config_devices_path(char **dev_path,
  690. char **gres_name,
  691. int array_len)
  692. {
  693. static s_p_options_t _gres_options[] = {
  694. {"Name", S_P_ARRAY, _parse_gres_config, NULL},
  695. {NULL}
  696. };
  697. int count, i;
  698. struct stat config_stat;
  699. s_p_hashtbl_t *tbl;
  700. gres_slurmd_conf_t **gres_array;
  701. char *gres_conf_file;
  702. gres_plugin_init();
  703. gres_conf_file = _get_gres_conf();
  704. if (stat(gres_conf_file, &config_stat) < 0) {
  705. error("can't stat gres.conf file %s: %m", gres_conf_file);
  706. xfree(gres_conf_file);
  707. return 0;
  708. }
  709. slurm_mutex_lock(&gres_context_lock);
  710. tbl = s_p_hashtbl_create(_gres_options);
  711. if (s_p_parse_file(tbl, NULL, gres_conf_file, false) == SLURM_ERROR)
  712. fatal("error opening/reading %s", gres_conf_file);
  713. FREE_NULL_LIST(gres_conf_list);
  714. gres_conf_list = list_create(_destroy_gres_slurmd_conf);
  715. if (gres_conf_list == NULL)
  716. fatal("list_create: malloc failure");
  717. if (s_p_get_array((void ***) &gres_array, &count, "Name", tbl)) {
  718. if (count > array_len) {
  719. error("GRES device count exceeds array size (%d > %d)",
  720. count, array_len);
  721. count = array_len;
  722. }
  723. for (i = 0; i < count; i++) {
  724. if ((gres_array[i]) && (gres_array[i]->file)) {
  725. dev_path[i] = gres_array[i]->file;
  726. gres_name[i] = gres_array[i]->name;
  727. gres_array[i] = NULL;
  728. }
  729. }
  730. }
  731. s_p_hashtbl_destroy(tbl);
  732. slurm_mutex_unlock(&gres_context_lock);
  733. xfree(gres_conf_file);
  734. return count;
  735. }
  736. /* No gres.conf file found.
  737. * Initialize gres table with zero counts of all resources.
  738. * Counts can be altered by node_config_load() in the gres plugin. */
  739. static int _no_gres_conf(uint32_t cpu_cnt)
  740. {
  741. int i, rc = SLURM_SUCCESS;
  742. gres_slurmd_conf_t *p;
  743. slurm_mutex_lock(&gres_context_lock);
  744. FREE_NULL_LIST(gres_conf_list);
  745. gres_conf_list = list_create(_destroy_gres_slurmd_conf);
  746. if (gres_conf_list == NULL)
  747. fatal("list_create: malloc failure");
  748. p = xmalloc(sizeof(gres_slurmd_conf_t *) * gres_context_cnt);
  749. for (i = 0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
  750. p = xmalloc(sizeof(gres_slurmd_conf_t));
  751. p->cpu_cnt = cpu_cnt;
  752. p->name = xstrdup(gres_context[i].gres_name);
  753. p->plugin_id = gres_context[i].plugin_id;
  754. list_append(gres_conf_list, p);
  755. rc = (*(gres_context[i].ops.node_config_load))(gres_conf_list);
  756. }
  757. slurm_mutex_unlock(&gres_context_lock);
  758. return rc;
  759. }
  760. /*
  761. * Load this node's configuration (how many resources it has, topology, etc.)
  762. * IN cpu_cnt - Number of CPUs on configured on this node
  763. */
  764. extern int gres_plugin_node_config_load(uint32_t cpu_cnt)
  765. {
  766. static s_p_options_t _gres_options[] = {
  767. {"Name", S_P_ARRAY, _parse_gres_config, NULL},
  768. {NULL}
  769. };
  770. int count, i, rc;
  771. struct stat config_stat;
  772. s_p_hashtbl_t *tbl;
  773. gres_slurmd_conf_t **gres_array;
  774. char *gres_conf_file;
  775. rc = gres_plugin_init();
  776. if (gres_context_cnt == 0)
  777. return SLURM_SUCCESS;
  778. gres_conf_file = _get_gres_conf();
  779. if (stat(gres_conf_file, &config_stat) < 0) {
  780. error("can't stat gres.conf file %s, assuming zero resource "
  781. "counts", gres_conf_file);
  782. xfree(gres_conf_file);
  783. return _no_gres_conf(cpu_cnt);
  784. }
  785. slurm_mutex_lock(&gres_context_lock);
  786. gres_cpu_cnt = cpu_cnt;
  787. tbl = s_p_hashtbl_create(_gres_options);
  788. if (s_p_parse_file(tbl, NULL, gres_conf_file, false) == SLURM_ERROR)
  789. fatal("error opening/reading %s", gres_conf_file);
  790. FREE_NULL_LIST(gres_conf_list);
  791. gres_conf_list = list_create(_destroy_gres_slurmd_conf);
  792. if (gres_conf_list == NULL)
  793. fatal("list_create: malloc failure");
  794. if (s_p_get_array((void ***) &gres_array, &count, "Name", tbl)) {
  795. for (i = 0; i < count; i++) {
  796. list_append(gres_conf_list, gres_array[i]);
  797. gres_array[i] = NULL;
  798. }
  799. }
  800. s_p_hashtbl_destroy(tbl);
  801. list_for_each(gres_conf_list, _log_gres_slurmd_conf, NULL);
  802. for (i=0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
  803. _validate_config(&gres_context[i]);
  804. if (gres_context[i].ops.node_config_load == NULL)
  805. continue; /* No plugin */
  806. rc = (*(gres_context[i].ops.node_config_load))(gres_conf_list);
  807. }
  808. slurm_mutex_unlock(&gres_context_lock);
  809. xfree(gres_conf_file);
  810. return rc;
  811. }
  812. /*
  813. * Pack this node's gres configuration into a buffer
  814. * IN/OUT buffer - message buffer to pack
  815. */
  816. extern int gres_plugin_node_config_pack(Buf buffer)
  817. {
  818. int rc;
  819. uint32_t magic = GRES_MAGIC;
  820. uint16_t rec_cnt = 0, version= SLURM_PROTOCOL_VERSION;
  821. ListIterator iter;
  822. gres_slurmd_conf_t *gres_slurmd_conf;
  823. rc = gres_plugin_init();
  824. slurm_mutex_lock(&gres_context_lock);
  825. pack16(version, buffer);
  826. if (gres_conf_list)
  827. rec_cnt = list_count(gres_conf_list);
  828. pack16(rec_cnt, buffer);
  829. if (rec_cnt) {
  830. iter = list_iterator_create(gres_conf_list);
  831. if (iter == NULL)
  832. fatal("list_iterator_create: malloc failure");
  833. while ((gres_slurmd_conf =
  834. (gres_slurmd_conf_t *) list_next(iter))) {
  835. pack32(magic, buffer);
  836. pack32(gres_slurmd_conf->count, buffer);
  837. pack32(gres_slurmd_conf->cpu_cnt, buffer);
  838. pack8(gres_slurmd_conf->has_file, buffer);
  839. pack32(gres_slurmd_conf->plugin_id, buffer);
  840. packstr(gres_slurmd_conf->cpus, buffer);
  841. packstr(gres_slurmd_conf->name, buffer);
  842. }
  843. list_iterator_destroy(iter);
  844. }
  845. slurm_mutex_unlock(&gres_context_lock);
  846. return rc;
  847. }
  848. /*
  849. * Unpack this node's configuration from a buffer (built/packed by slurmd)
  850. * IN/OUT buffer - message buffer to unpack
  851. * IN node_name - name of node whose data is being unpacked
  852. */
  853. extern int gres_plugin_node_config_unpack(Buf buffer, char* node_name)
  854. {
  855. int i, j, rc;
  856. uint32_t count, cpu_cnt, magic, plugin_id, utmp32;
  857. uint16_t rec_cnt, version;
  858. uint8_t has_file;
  859. char *tmp_cpus, *tmp_name;
  860. gres_slurmd_conf_t *p;
  861. rc = gres_plugin_init();
  862. FREE_NULL_LIST(gres_conf_list);
  863. gres_conf_list = list_create(_destroy_gres_slurmd_conf);
  864. if (gres_conf_list == NULL)
  865. fatal("list_create: malloc failure");
  866. safe_unpack16(&version, buffer);
  867. if (version != SLURM_PROTOCOL_VERSION)
  868. return SLURM_ERROR;
  869. safe_unpack16(&rec_cnt, buffer);
  870. if (rec_cnt == 0)
  871. return SLURM_SUCCESS;
  872. slurm_mutex_lock(&gres_context_lock);
  873. for (i=0; i<rec_cnt; i++) {
  874. safe_unpack32(&magic, buffer);
  875. if (magic != GRES_MAGIC)
  876. goto unpack_error;
  877. safe_unpack32(&count, buffer);
  878. safe_unpack32(&cpu_cnt, buffer);
  879. safe_unpack8(&has_file, buffer);
  880. safe_unpack32(&plugin_id, buffer);
  881. safe_unpackstr_xmalloc(&tmp_cpus, &utmp32, buffer);
  882. safe_unpackstr_xmalloc(&tmp_name, &utmp32, buffer);
  883. for (j=0; j<gres_context_cnt; j++) {
  884. if (gres_context[j].plugin_id != plugin_id)
  885. continue;
  886. if (strcmp(gres_context[j].gres_name, tmp_name)) {
  887. /* Should be caught in gres_plugin_init() */
  888. error("gres_plugin_node_config_unpack: gres/%s"
  889. " duplicate plugin ID with %s, unable "
  890. "to process",
  891. tmp_name, gres_context[j].gres_name);
  892. continue;
  893. }
  894. if (gres_context[j].has_file && !has_file && count) {
  895. error("gres_plugin_node_config_unpack: gres/%s"
  896. " lacks File parameter for node %s",
  897. tmp_name, node_name);
  898. has_file = 1;
  899. }
  900. if (has_file && (count > 1024)) {
  901. /* Avoid over-subscribing memory with huge
  902. * bitmaps */
  903. error("gres_plugin_node_config_unpack: gres/%s"
  904. " has File plus very large Count (%u) "
  905. "for node %s, resetting value to 1024",
  906. tmp_name, count, node_name);
  907. count = 1024;
  908. }
  909. gres_context[j].has_file = has_file;
  910. break;
  911. }
  912. if (j >= gres_context_cnt) {
  913. /* A sign that GresPlugins is inconsistently
  914. * configured. Not a fatal error. Skip this data. */
  915. error("gres_plugin_node_config_unpack: no plugin "
  916. "configured to unpack data type %s from node %s",
  917. tmp_name, node_name);
  918. xfree(tmp_cpus);
  919. xfree(tmp_name);
  920. continue;
  921. }
  922. p = xmalloc(sizeof(gres_slurmd_conf_t));
  923. p->count = count;
  924. p->cpu_cnt = cpu_cnt;
  925. p->has_file = has_file;
  926. p->cpus = tmp_cpus;
  927. tmp_cpus = NULL; /* Nothing left to xfree */
  928. p->name = tmp_name; /* We need to preserve for accounting! */
  929. p->plugin_id = plugin_id;
  930. list_append(gres_conf_list, p);
  931. }
  932. slurm_mutex_unlock(&gres_context_lock);
  933. return rc;
  934. unpack_error:
  935. error("gres_plugin_node_config_unpack: unpack error from node %s",
  936. node_name);
  937. xfree(tmp_cpus);
  938. xfree(tmp_name);
  939. slurm_mutex_unlock(&gres_context_lock);
  940. return SLURM_ERROR;
  941. }
  942. /*
  943. * Delete an element placed on gres_list by _node_config_validate()
  944. * free associated memory
  945. */
  946. static void _gres_node_list_delete(void *list_element)
  947. {
  948. int i;
  949. gres_state_t *gres_ptr;
  950. gres_node_state_t *gres_node_ptr;
  951. gres_ptr = (gres_state_t *) list_element;
  952. gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
  953. FREE_NULL_BITMAP(gres_node_ptr->gres_bit_alloc);
  954. for (i=0; i<gres_node_ptr->topo_cnt; i++) {
  955. FREE_NULL_BITMAP(gres_node_ptr->topo_cpus_bitmap[i]);
  956. FREE_NULL_BITMAP(gres_node_ptr->topo_gres_bitmap[i]);
  957. }
  958. xfree(gres_node_ptr->topo_cpus_bitmap);
  959. xfree(gres_node_ptr->topo_gres_bitmap);
  960. xfree(gres_node_ptr->topo_gres_cnt_alloc);
  961. xfree(gres_node_ptr->topo_gres_cnt_avail);
  962. xfree(gres_node_ptr);
  963. xfree(gres_ptr);
  964. }
  965. static uint32_t _get_gres_cnt(char *orig_config, char *gres_name,
  966. char *gres_name_colon, int gres_name_colon_len)
  967. {
  968. char *node_gres_config, *tok, *last_num = NULL, *last_tok = NULL;
  969. uint32_t gres_config_cnt = 0;
  970. if (orig_config == NULL)
  971. return gres_config_cnt;
  972. node_gres_config = xstrdup(orig_config);
  973. tok = strtok_r(node_gres_config, ",", &last_tok);
  974. while (tok) {
  975. if (!strcmp(tok, gres_name)) {
  976. gres_config_cnt = 1;
  977. break;
  978. }
  979. if (!strncmp(tok, gres_name_colon, gres_name_colon_len)) {
  980. tok += gres_name_colon_len;
  981. gres_config_cnt = strtol(tok, &last_num, 10);
  982. if (last_num[0] == '\0')
  983. ;
  984. else if ((last_num[0] == 'k') || (last_num[0] == 'K'))
  985. gres_config_cnt *= 1024;
  986. else if ((last_num[0] == 'm') || (last_num[0] == 'M'))
  987. gres_config_cnt *= (1024 * 1024);
  988. else if ((last_num[0] == 'g') || (last_num[0] == 'G'))
  989. gres_config_cnt *= (1024 * 1024 * 1024);
  990. break;
  991. }
  992. tok = strtok_r(NULL, ",", &last_tok);
  993. }
  994. xfree(node_gres_config);
  995. return gres_config_cnt;
  996. }
  997. static void _set_gres_cnt(char *orig_config, char **new_config,
  998. uint32_t new_cnt, char *gres_name,
  999. char *gres_name_colon, int gres_name_colon_len)
  1000. {
  1001. char *new_configured_res = NULL, *node_gres_config;
  1002. char *last_tok = NULL, *tok;
  1003. if (*new_config)
  1004. node_gres_config = xstrdup(*new_config);
  1005. else if (orig_config)
  1006. node_gres_config = xstrdup(orig_config);
  1007. else
  1008. return;
  1009. tok = strtok_r(node_gres_config, ",", &last_tok);
  1010. while (tok) {
  1011. if (new_configured_res)
  1012. xstrcat(new_configured_res, ",");
  1013. if (strcmp(tok, gres_name) &&
  1014. strncmp(tok, gres_name_colon, gres_name_colon_len)) {
  1015. xstrcat(new_configured_res, tok);
  1016. } else if ((new_cnt % (1024 * 1024 * 1024)) == 0) {
  1017. new_cnt /= (1024 * 1024 * 1024);
  1018. xstrfmtcat(new_configured_res, "%s:%uG",
  1019. gres_name, new_cnt);
  1020. } else if ((new_cnt % (1024 * 1024)) == 0) {
  1021. new_cnt /= (1024 * 1024);
  1022. xstrfmtcat(new_configured_res, "%s:%uM",
  1023. gres_name, new_cnt);
  1024. } else if ((new_cnt % 1024) == 0) {
  1025. new_cnt /= 1024;
  1026. xstrfmtcat(new_configured_res, "%s:%uK",
  1027. gres_name, new_cnt);
  1028. } else {
  1029. xstrfmtcat(new_configured_res, "%s:%u",
  1030. gres_name, new_cnt);
  1031. }
  1032. tok = strtok_r(NULL, ",", &last_tok);
  1033. }
  1034. xfree(node_gres_config);
  1035. xfree(*new_config);
  1036. *new_config = new_configured_res;
  1037. }
  1038. static gres_node_state_t *_build_gres_node_state(void)
  1039. {
  1040. gres_node_state_t *gres_data;
  1041. gres_data = xmalloc(sizeof(gres_node_state_t));
  1042. gres_data->gres_cnt_config = NO_VAL;
  1043. gres_data->gres_cnt_found = NO_VAL;
  1044. return gres_data;
  1045. }
  1046. /*
  1047. * Build a node's gres record based only upon the slurm.conf contents
  1048. */
  1049. static int _node_config_init(char *node_name, char *orig_config,
  1050. slurm_gres_context_t *context_ptr,
  1051. gres_state_t *gres_ptr)
  1052. {
  1053. int rc = SLURM_SUCCESS;
  1054. uint32_t gres_config_cnt = 0;
  1055. bool updated_config = false;
  1056. gres_node_state_t *gres_data;
  1057. if (gres_ptr->gres_data == NULL) {
  1058. gres_ptr->gres_data = _build_gres_node_state();
  1059. updated_config = true;
  1060. }
  1061. gres_data = (gres_node_state_t *) gres_ptr->gres_data;
  1062. /* If the resource isn't configured for use with this node*/
  1063. if ((orig_config == NULL) || (orig_config[0] == '\0') ||
  1064. (updated_config == false)) {
  1065. gres_data->gres_cnt_config = 0;
  1066. return rc;
  1067. }
  1068. gres_config_cnt = _get_gres_cnt(orig_config,
  1069. context_ptr->gres_name,
  1070. context_ptr->gres_name_colon,
  1071. context_ptr->gres_name_colon_len);
  1072. gres_data->gres_cnt_config = gres_config_cnt;
  1073. /* Use count from recovered state, if higher */
  1074. gres_data->gres_cnt_avail = MAX(gres_data->gres_cnt_avail,
  1075. gres_config_cnt);
  1076. if ((gres_data->gres_bit_alloc != NULL) &&
  1077. (gres_data->gres_cnt_avail >
  1078. bit_size(gres_data->gres_bit_alloc))) {
  1079. gres_data->gres_bit_alloc =
  1080. bit_realloc(gres_data->gres_bit_alloc,
  1081. gres_data->gres_cnt_avail);
  1082. if (gres_data->gres_bit_alloc == NULL)
  1083. fatal("bit_alloc: malloc failure");
  1084. }
  1085. return rc;
  1086. }
  1087. /*
  1088. * Build a node's gres record based only upon the slurm.conf contents
  1089. * IN node_name - name of the node for which the gres information applies
  1090. * IN orig_config - Gres information supplied from slurm.conf
  1091. * IN/OUT gres_list - List of Gres records for this node to track usage
  1092. */
  1093. extern int gres_plugin_init_node_config(char *node_name, char *orig_config,
  1094. List *gres_list)
  1095. {
  1096. int i, rc;
  1097. ListIterator gres_iter;
  1098. gres_state_t *gres_ptr;
  1099. rc = gres_plugin_init();
  1100. slurm_mutex_lock(&gres_context_lock);
  1101. if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
  1102. *gres_list = list_create(_gres_node_list_delete);
  1103. if (*gres_list == NULL)
  1104. fatal("list_create malloc failure");
  1105. }
  1106. for (i=0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
  1107. /* Find or create gres_state entry on the list */
  1108. gres_iter = list_iterator_create(*gres_list);
  1109. while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
  1110. if (gres_ptr->plugin_id == gres_context[i].plugin_id)
  1111. break;
  1112. }
  1113. list_iterator_destroy(gres_iter);
  1114. if (gres_ptr == NULL) {
  1115. gres_ptr = xmalloc(sizeof(gres_state_t));
  1116. gres_ptr->plugin_id = gres_context[i].plugin_id;
  1117. list_append(*gres_list, gres_ptr);
  1118. }
  1119. rc = _node_config_init(node_name, orig_config,
  1120. &gres_context[i], gres_ptr);
  1121. }
  1122. slurm_mutex_unlock(&gres_context_lock);
  1123. return rc;
  1124. }
  1125. /*
  1126. * Determine gres availability on some node
  1127. * plugin_id IN - plugin number to search for
  1128. * set_cnt OUT - count of gres.conf records of this id found by slurmd
  1129. * (each can have different topology)
  1130. * RET - total number of gres available of this ID on this node in (sum
  1131. * across all records of this ID)
  1132. */
  1133. static uint32_t _get_tot_gres_cnt(uint32_t plugin_id, uint32_t *set_cnt)
  1134. {
  1135. ListIterator iter;
  1136. gres_slurmd_conf_t *gres_slurmd_conf;
  1137. uint32_t gres_cnt = 0, cpu_set_cnt = 0, rec_cnt = 0;
  1138. xassert(set_cnt);
  1139. *set_cnt = 0;
  1140. if (gres_conf_list == NULL)
  1141. return gres_cnt;
  1142. iter = list_iterator_create(gres_conf_list);
  1143. if (iter == NULL)
  1144. fatal("list_iterator_create: malloc failure");
  1145. while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
  1146. if (gres_slurmd_conf->plugin_id != plugin_id)
  1147. continue;
  1148. gres_cnt += gres_slurmd_conf->count;
  1149. rec_cnt++;
  1150. if (gres_slurmd_conf->cpus)
  1151. cpu_set_cnt++;
  1152. }
  1153. list_iterator_destroy(iter);
  1154. if (cpu_set_cnt)
  1155. *set_cnt = rec_cnt;
  1156. return gres_cnt;
  1157. }
  1158. /*
  1159. * Map a given GRES type ID back to a GRES type name.
  1160. * gres_id IN - GRES type ID to search for.
  1161. * gres_name IN - Pre-allocated string in which to store the GRES type name.
  1162. * gres_name_len - Size of gres_name in bytes
  1163. * RET - error code (currently not used--always return SLURM_SUCCESS)
  1164. */
  1165. extern int gres_gresid_to_gresname(uint32_t gres_id, char* gres_name,
  1166. int gres_name_len)
  1167. {
  1168. ListIterator iter;
  1169. gres_slurmd_conf_t *gres_slurmd_conf;
  1170. int rc = SLURM_SUCCESS;
  1171. int found = 0;
  1172. if (gres_conf_list == NULL) {
  1173. /* Should not reach this as if there are GRES id's then there
  1174. * must have been a gres_conf_list.
  1175. */
  1176. info("%s--The gres_conf_list is NULL!!!\n", __FUNCTION__);
  1177. snprintf(gres_name, gres_name_len, "%u", gres_id);
  1178. return rc;
  1179. }
  1180. iter = list_iterator_create(gres_conf_list);
  1181. if (iter == NULL)
  1182. fatal("list_iterator_create: malloc failure");
  1183. while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
  1184. if (gres_slurmd_conf->plugin_id != gres_id)
  1185. continue;
  1186. strncpy(gres_name, gres_slurmd_conf->name, gres_name_len);
  1187. found = 1;
  1188. break;
  1189. }
  1190. list_iterator_destroy(iter);
  1191. if (!found) /* Could not find GRES type name, use id */
  1192. snprintf(gres_name, gres_name_len, "%u", gres_id);
  1193. return rc;
  1194. }
  1195. extern int _node_config_validate(char *node_name, char *orig_config,
  1196. char **new_config, gres_state_t *gres_ptr,
  1197. uint16_t fast_schedule, char **reason_down,
  1198. slurm_gres_context_t *context_ptr)
  1199. {
  1200. int i, j, gres_inx, rc = SLURM_SUCCESS;
  1201. uint32_t gres_cnt, set_cnt = 0;
  1202. bool updated_config = false;
  1203. gres_node_state_t *gres_data;
  1204. ListIterator iter;
  1205. gres_slurmd_conf_t *gres_slurmd_conf;
  1206. if (gres_ptr->gres_data == NULL)
  1207. gres_ptr->gres_data = _build_gres_node_state();
  1208. gres_data = (gres_node_state_t *) gres_ptr->gres_data;
  1209. gres_cnt = _get_tot_gres_cnt(context_ptr->plugin_id, &set_cnt);
  1210. if (gres_data->gres_cnt_found != gres_cnt) {
  1211. if (gres_data->gres_cnt_found != NO_VAL) {
  1212. info("%s: count changed for node %s from %u to %u",
  1213. context_ptr->gres_type, node_name,
  1214. gres_data->gres_cnt_found, gres_cnt);
  1215. }
  1216. gres_data->gres_cnt_found = gres_cnt;
  1217. updated_config = true;
  1218. }
  1219. if (updated_config == false)
  1220. return SLURM_SUCCESS;
  1221. if ((set_cnt == 0) && (set_cnt != gres_data->topo_cnt)) {
  1222. /* Need to clear topology info */
  1223. xfree(gres_data->topo_gres_cnt_alloc);
  1224. xfree(gres_data->topo_gres_cnt_avail);
  1225. for (i=0; i<gres_data->topo_cnt; i++) {
  1226. FREE_NULL_BITMAP(gres_data->topo_gres_bitmap[i]);
  1227. FREE_NULL_BITMAP(gres_data->topo_cpus_bitmap[i]);
  1228. }
  1229. xfree(gres_data->topo_gres_bitmap);
  1230. xfree(gres_data->topo_cpus_bitmap);
  1231. gres_data->topo_cnt = set_cnt;
  1232. }
  1233. if (context_ptr->has_file && (set_cnt != gres_data->topo_cnt)) {
  1234. /* Need to rebuild topology info */
  1235. /* Resize the data structures here */
  1236. gres_data->topo_gres_cnt_alloc =
  1237. xrealloc(gres_data->topo_gres_cnt_alloc,
  1238. set_cnt * sizeof(uint32_t));
  1239. if (gres_data->topo_gres_cnt_alloc == NULL)
  1240. fatal("xrealloc: malloc failure");
  1241. gres_data->topo_gres_cnt_avail =
  1242. xrealloc(gres_data->topo_gres_cnt_avail,
  1243. set_cnt * sizeof(uint32_t));
  1244. if (gres_data->topo_gres_cnt_alloc == NULL)
  1245. fatal("xrealloc: malloc failure");
  1246. for (i=0; i<gres_data->topo_cnt; i++)
  1247. FREE_NULL_BITMAP(gres_data->topo_gres_bitmap[i]);
  1248. gres_data->topo_gres_bitmap =
  1249. xrealloc(gres_data->topo_gres_bitmap,
  1250. set_cnt * sizeof(bitstr_t *));
  1251. if (gres_data->topo_gres_bitmap == NULL)
  1252. fatal("xrealloc: malloc failure");
  1253. for (i=0; i<gres_data->topo_cnt; i++)
  1254. FREE_NULL_BITMAP(gres_data->topo_cpus_bitmap[i]);
  1255. gres_data->topo_cpus_bitmap =
  1256. xrealloc(gres_data->topo_cpus_bitmap,
  1257. set_cnt * sizeof(bitstr_t *));
  1258. if (gres_data->topo_cpus_bitmap == NULL)
  1259. fatal("xrealloc: malloc failure");
  1260. gres_data->topo_cnt = set_cnt;
  1261. iter = list_iterator_create(gres_conf_list);
  1262. if (iter == NULL)
  1263. fatal("list_iterator_create: malloc failure");
  1264. gres_inx = i = 0;
  1265. while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
  1266. list_next(iter))) {
  1267. if (gres_slurmd_conf->plugin_id !=
  1268. context_ptr->plugin_id)
  1269. continue;
  1270. gres_data->topo_gres_cnt_avail[i] =
  1271. gres_slurmd_conf->count;
  1272. gres_data->topo_cpus_bitmap[i] =
  1273. bit_alloc(gres_slurmd_conf->cpu_cnt);
  1274. if (gres_data->topo_cpus_bitmap[i] == NULL)
  1275. fatal("bit_alloc: malloc failure");
  1276. if (gres_slurmd_conf->cpus) {
  1277. bit_unfmt(gres_data->topo_cpus_bitmap[i],
  1278. gres_slurmd_conf->cpus);
  1279. } else {
  1280. error("%s: has CPUs configured for only some "
  1281. "of the records on node %s",
  1282. context_ptr->gres_type, node_name);
  1283. bit_nset(gres_data->topo_cpus_bitmap[i], 0,
  1284. (gres_slurmd_conf->cpu_cnt - 1));
  1285. }
  1286. gres_data->topo_gres_bitmap[i] = bit_alloc(gres_cnt);
  1287. if (gres_data->topo_gres_bitmap[i] == NULL)
  1288. fatal("bit_alloc: malloc failure");
  1289. for (j=0; j<gres_slurmd_conf->count; j++) {
  1290. bit_set(gres_data->topo_gres_bitmap[i],
  1291. gres_inx++);
  1292. }
  1293. i++;
  1294. }
  1295. list_iterator_destroy(iter);
  1296. }
  1297. if ((orig_config == NULL) || (orig_config[0] == '\0'))
  1298. gres_data->gres_cnt_config = 0;
  1299. else if (gres_data->gres_cnt_config == NO_VAL) {
  1300. /* This should have been filled in by _node_config_init() */
  1301. gres_data->gres_cnt_config =
  1302. _get_gres_cnt(orig_config, context_ptr->gres_name,
  1303. context_ptr->gres_name_colon,
  1304. context_ptr->gres_name_colon_len);
  1305. }
  1306. if ((gres_data->gres_cnt_config == 0) || (fast_schedule > 0))
  1307. gres_data->gres_cnt_avail = gres_data->gres_cnt_config;
  1308. else if (gres_data->gres_cnt_found != NO_VAL)
  1309. gres_data->gres_cnt_avail = gres_data->gres_cnt_found;
  1310. else if (gres_data->gres_cnt_avail == NO_VAL)
  1311. gres_data->gres_cnt_avail = 0;
  1312. if (context_ptr->has_file) {
  1313. if (gres_data->gres_bit_alloc == NULL) {
  1314. gres_data->gres_bit_alloc =
  1315. bit_alloc(gres_data->gres_cnt_avail);
  1316. } else if (gres_data->gres_cnt_avail !=
  1317. bit_size(gres_data->gres_bit_alloc)) {
  1318. gres_data->gres_bit_alloc =
  1319. bit_realloc(gres_data->gres_bit_alloc,
  1320. gres_data->gres_cnt_avail);
  1321. }
  1322. if (gres_data->gres_bit_alloc == NULL)
  1323. fatal("bit_alloc: malloc failure");
  1324. }
  1325. if ((fast_schedule < 2) &&
  1326. (gres_data->gres_cnt_found < gres_data->gres_cnt_config)) {
  1327. if (reason_down && (*reason_down == NULL)) {
  1328. xstrfmtcat(*reason_down, "%s count too low (%u < %u)",
  1329. context_ptr->gres_type,
  1330. gres_data->gres_cnt_found,
  1331. gres_data->gres_cnt_config);
  1332. }
  1333. rc = EINVAL;
  1334. } else if ((fast_schedule == 2) && gres_data->topo_cnt &&
  1335. (gres_data->gres_cnt_found != gres_data->gres_cnt_config)) {
  1336. error("%s on node %s configured for %u resources but %u found,"
  1337. " ignoring topology support",
  1338. context_ptr->gres_type, node_name,
  1339. gres_data->gres_cnt_config, gres_data->gres_cnt_found);
  1340. if (gres_data->topo_cpus_bitmap) {
  1341. for (i=0; i<gres_data->topo_cnt; i++) {
  1342. FREE_NULL_BITMAP(gres_data->topo_cpus_bitmap[i]);
  1343. FREE_NULL_BITMAP(gres_data->topo_gres_bitmap[i]);
  1344. }
  1345. xfree(gres_data->topo_cpus_bitmap);
  1346. xfree(gres_data->topo_gres_bitmap);
  1347. xfree(gres_data->topo_gres_cnt_alloc);
  1348. xfree(gres_data->topo_gres_cnt_avail);
  1349. }
  1350. gres_data->topo_cnt = 0;
  1351. } else if ((fast_schedule == 0) &&
  1352. (gres_data->gres_cnt_found > gres_data->gres_cnt_config)) {
  1353. /* need to rebuild new_config */
  1354. _set_gres_cnt(orig_config, new_config,
  1355. gres_data->gres_cnt_found,
  1356. context_ptr->gres_name,
  1357. context_ptr->gres_name_colon,
  1358. context_ptr->gres_name_colon_len);
  1359. }
  1360. return rc;
  1361. }
  1362. /*
  1363. * Validate a node's configuration and put a gres record onto a list
  1364. * Called immediately after gres_plugin_node_config_unpack().
  1365. * IN node_name - name of the node for which the gres information applies
  1366. * IN orig_config - Gres information supplied from slurm.conf
  1367. * IN/OUT new_config - Updated gres info from slurm.conf if FastSchedule=0
  1368. * IN/OUT gres_list - List of Gres records for this node to track usage
  1369. * IN fast_schedule - 0: Validate and use actual hardware configuration
  1370. * 1: Validate hardware config, but use slurm.conf config
  1371. * 2: Don't validate hardware, use slurm.conf configuration
  1372. * OUT reason_down - set to an explanation of failure, if any, don't set if NULL
  1373. */
  1374. extern int gres_plugin_node_config_validate(char *node_name,
  1375. char *orig_config,
  1376. char **new_config,
  1377. List *gres_list,
  1378. uint16_t fast_schedule,
  1379. char **reason_down)
  1380. {
  1381. int i, rc, rc2;
  1382. ListIterator gres_iter;
  1383. gres_state_t *gres_ptr;
  1384. rc = gres_plugin_init();
  1385. slurm_mutex_lock(&gres_context_lock);
  1386. if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
  1387. *gres_list = list_create(_gres_node_list_delete);
  1388. if (*gres_list == NULL)
  1389. fatal("list_create malloc failure");
  1390. }
  1391. for (i=0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
  1392. /* Find or create gres_state entry on the list */
  1393. gres_iter = list_iterator_create(*gres_list);
  1394. while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
  1395. if (gres_ptr->plugin_id == gres_context[i].plugin_id)
  1396. break;
  1397. }
  1398. list_iterator_destroy(gres_iter);
  1399. if (gres_ptr == NULL) {
  1400. gres_ptr = xmalloc(sizeof(gres_state_t));
  1401. gres_ptr->plugin_id = gres_context[i].plugin_id;
  1402. list_append(*gres_list, gres_ptr);
  1403. }
  1404. rc2 = _node_config_validate(node_name, orig_config, new_config,
  1405. gres_ptr, fast_schedule,
  1406. reason_down, &gres_context[i]);
  1407. rc = MAX(rc, rc2);
  1408. }
  1409. slurm_mutex_unlock(&gres_context_lock);
  1410. return rc;
  1411. }
  1412. static int _node_reconfig(char *node_name, char *orig_config, char **new_config,
  1413. gres_state_t *gres_ptr, uint16_t fast_schedule,
  1414. slurm_gres_context_t *context_ptr)
  1415. {
  1416. int rc = SLURM_SUCCESS;
  1417. gres_node_state_t *gres_data;
  1418. xassert(gres_ptr);
  1419. if (gres_ptr->gres_data == NULL)
  1420. gres_ptr->gres_data = _build_gres_node_state();
  1421. gres_data = gres_ptr->gres_data;
  1422. gres_data->gres_cnt_config = _get_gres_cnt(orig_config,
  1423. context_ptr->gres_name,
  1424. context_ptr->gres_name_colon,
  1425. context_ptr->
  1426. gres_name_colon_len);
  1427. if ((gres_data->gres_cnt_config == 0) || (fast_schedule > 0))
  1428. gres_data->gres_cnt_avail = gres_data->gres_cnt_config;
  1429. else if (gres_data->gres_cnt_found != NO_VAL)
  1430. gres_data->gres_cnt_avail = gres_data->gres_cnt_found;
  1431. else if (gres_data->gres_cnt_avail == NO_VAL)
  1432. gres_data->gres_cnt_avail = 0;
  1433. if (context_ptr->has_file) {
  1434. if (gres_data->gres_bit_alloc == NULL) {
  1435. gres_data->gres_bit_alloc =
  1436. bit_alloc(gres_data->gres_cnt_avail);
  1437. } else if (gres_data->gres_cnt_avail !=
  1438. bit_size(gres_data->gres_bit_alloc)) {
  1439. gres_data->gres_bit_alloc =
  1440. bit_realloc(gres_data->gres_bit_alloc,
  1441. gres_data->gres_cnt_avail);
  1442. }
  1443. if (gres_data->gres_bit_alloc == NULL)
  1444. fatal("bit_alloc: malloc failure");
  1445. }
  1446. if ((fast_schedule < 2) &&
  1447. (gres_data->gres_cnt_found != NO_VAL) &&
  1448. (gres_data->gres_cnt_found < gres_data->gres_cnt_config)) {
  1449. /* Do not set node DOWN, but give the node
  1450. * a chance to register with more resources */
  1451. gres_data->gres_cnt_found = NO_VAL;
  1452. } else if ((fast_schedule == 0) &&
  1453. (gres_data->gres_cnt_found != NO_VAL) &&
  1454. (gres_data->gres_cnt_found > gres_data->gres_cnt_config)) {
  1455. _set_gres_cnt(orig_config, new_config,
  1456. gres_data->gres_cnt_found,
  1457. context_ptr->gres_name,
  1458. context_ptr->gres_name_colon,
  1459. context_ptr->gres_name_colon_len);
  1460. }
  1461. return rc;
  1462. }
  1463. /*
  1464. * Note that a node's configuration has been modified (e.g. "scontrol update ..")
  1465. * IN node_name - name of the node for which the gres information applies
  1466. * IN orig_config - Gres information supplied from slurm.conf
  1467. * IN/OUT new_config - Updated gres info from slurm.conf if FastSchedule=0
  1468. * IN/OUT gres_list - List of Gres records for this node to track usage
  1469. * IN fast_schedule - 0: Validate and use actual hardware configuration
  1470. * 1: Validate hardware config, but use slurm.conf config
  1471. * 2: Don't validate hardware, use slurm.conf configuration
  1472. */
  1473. extern int gres_plugin_node_reconfig(char *node_name,
  1474. char *orig_config,
  1475. char

Large files files are truncated, but you can click here to view the full file