/src/modules/slurm.c

/*****************************************************************************\
 * $Id$
 *****************************************************************************
 * Copyright (C) 2001-2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Mark Grondona <mgrondona@llnl.gov>.
 * UCRL-CODE-2003-005.
 *
 * This file is part of Pdsh, a parallel remote shell program.
 * For details, see <http://www.llnl.gov/linux/pdsh/>.
 *
 * Pdsh is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Pdsh; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
\*****************************************************************************/
#if HAVE_CONFIG_H
# include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>      /* errno, used with slurm_strerror() below */
#include <assert.h>

#include "src/common/hostlist.h"
#include "src/common/split.h"
#include "src/common/err.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/pdsh/xpopen.h"
#include "src/pdsh/ltdl.h"
#include "src/pdsh/mod.h"
#include "src/pdsh/opt.h"

/*
 * SLURM headers need to be included after pdsh header files to
 * avoid possible conflicts with the definition of "bool".
 */
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>
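
/*
 * When all pdsh modules are compiled in statically, every module is
 * linked into a single binary, so the generic pdsh_module_* symbols
 * below are renamed to slurm-specific names to avoid collisions with
 * the other modules' copies.
 */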
#if STATIC_MODULES
# define pdsh_module_info     slurm_module_info
# define pdsh_module_priority slurm_module_priority
#endif

/*
 * Give this module low priority
 */
int pdsh_module_priority = 10;

/*
 * Call this module after all option processing. The module will only
 * try to read the SLURM_JOBID if opt->wcoll is not already set.
 * Calling the module in postop allows us to be sure that all other
 * modules had a chance to update the wcoll.
 */
static int mod_slurm_init(void);
static int mod_slurm_wcoll(opt_t *opt);
static int mod_slurm_exit(void);
static hostlist_t _slurm_wcoll(List jobids);
static hostlist_t _slurm_wcoll_partition(List partitions);
static int slurm_process_opt(opt_t *, int opt, char *arg);

static List job_list = NULL;
static List partition_list = NULL;

/*
 * Export generic pdsh module options
 */
struct pdsh_module_operations slurm_module_ops = {
    (ModInitF)      mod_slurm_init,
    (ModExitF)      mod_slurm_exit,
    (ModReadWcollF) mod_slurm_wcoll,
    (ModPostOpF)    NULL
};

/*
 * Export module options
 */
struct pdsh_module_option slurm_module_options[] =
{
    { 'j', "jobid,...",
      "Run on nodes allocated to SLURM job(s) (\"all\" = all jobs)",
      DSH | PCP, (optFunc) slurm_process_opt
    },
    { 'P', "partition,...",
      "Run on nodes contained in SLURM partition",
      DSH | PCP, (optFunc) slurm_process_opt
    },
    PDSH_OPT_TABLE_END
};

/*
 * SLURM module info
 */
struct pdsh_module pdsh_module_info = {
    "misc",
    "slurm",
    "Mark Grondona <mgrondona@llnl.gov>",
    "Target nodes contained in SLURM jobs or partitions, read SLURM_JOBID by default",
    DSH | PCP,
    &slurm_module_ops,
    NULL,
    &slurm_module_options[0],
};
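
/*
 * Illustrative usage of the options exported above (shell examples,
 * not part of the module itself; job ids and partition name are made
 * up for the example):
 *
 *   pdsh -j 1234,5678 uptime   # nodes allocated to SLURM jobs 1234, 5678
 *   pdsh -j all hostname       # nodes of every currently running job
 *   pdsh -P debug uptime       # nodes in the SLURM partition "debug"
 */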

static int mod_slurm_init (void)
{
    return (0);
}
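
/*
 * Convert a jobid string to an int32_t. Returns -1 if str is NULL;
 * exits with an error via errx() if str contains any non-numeric
 * characters (e.g. "1234" parses to 1234, while "1234x" is fatal).
 */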
static int32_t str2jobid (char *str)
{
    char *p = NULL;
    long int jid;

    if (str == NULL)
        return (-1);
    jid = strtoul (str, &p, 10);
    if (*p != '\0')
        errx ("%p: invalid setting \"%s\" for -j or SLURM_JOBID\n", str);
    return ((int32_t) jid);
}

static int
slurm_process_opt(opt_t *pdsh_opts, int opt, char *arg)
{
    switch (opt) {
    case 'j':
        job_list = list_split_append (job_list, ",", arg);
        break;
    case 'P':
        partition_list = list_split_append (partition_list, ",", arg);
        break;
    default:
        break;
    }
    return (0);
}

static int
mod_slurm_exit(void)
{
    if (job_list)
        list_destroy (job_list);
    if (partition_list)
        list_destroy (partition_list);
    return (0);
}

/*
 * If no wcoll has been established by this time, look for the
 * SLURM_JOBID env var, and set wcoll to the list of nodes allocated
 * to that job.
 */
static int mod_slurm_wcoll(opt_t *opt)
{
    if (job_list && opt->wcoll)
        errx("%p: do not specify -j with any other node selection option.\n");

    if (partition_list && opt->wcoll)
        errx("%p: do not specify -P with any other node selection option.\n");

    if (partition_list && job_list)
        errx("%p: do not specify -j and -P together.\n");

    if (partition_list)
        opt->wcoll = _slurm_wcoll_partition (partition_list);

    if (!opt->wcoll)
        opt->wcoll = _slurm_wcoll (job_list);

    return 0;
}
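
/*
 * The net effect: inside a SLURM allocation, a plain "pdsh <cmd>" with
 * no explicit target list runs on the nodes of the current job, e.g.
 * (hypothetical session):
 *
 *   $ salloc -N4
 *   $ pdsh hostname      # SLURM_JOBID is set; targets the 4 nodes
 */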

static int32_t _slurm_jobid (void)
{
    return (str2jobid (getenv ("SLURM_JOBID")));
}

static int _find_id (char *jobid, uint32_t *id)
{
    return (*id == str2jobid (jobid));
}

static int _find_str (char *jobid, char *str)
{
    return (strcmp (jobid, str) == 0);
}
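
/*
 * Note: the _*_requested() helpers below use list_delete_all(), which
 * both tests for a match and removes the matching entries. Each
 * requested jobid/partition is therefore consumed as it is found: the
 * scan loops can stop early once the request list is empty, and
 * anything still left in the list afterwards was never matched.
 */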

/*
 * Return non-zero if jobid is in list of ids requested by user
 */
static int _jobid_requested (List l, uint32_t jobid)
{
    if (l == NULL)
        return (0);
    return (list_delete_all (l, (ListFindF)_find_id, &jobid));
}

static int _partition_requested (List l, char *partition)
{
    if (l == NULL)
        return (0);
    return (list_delete_all (l, (ListFindF)_find_str, partition));
}

static int _alljobids_requested (List l)
{
    char *all = "all";
    if (l == NULL)
        return (0);
    return (list_delete_all (l, (ListFindF)_find_str, all));
}

static hostlist_t _hl_append (hostlist_t hl, char *nodes)
{
    if (hl == NULL)
        return (hostlist_create (nodes));
    else
        hostlist_push (hl, nodes);
    return (hl);
}
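
/*
 * Note: "nodes" strings from SLURM are in compressed hostlist form
 * (e.g. "host[0-3]"), which hostlist_create()/hostlist_push() expand,
 * so appending "host[0-3]" and "host[2-5]" and later calling
 * hostlist_uniq() yields host0 through host5 with no duplicates.
 */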

static hostlist_t _slurm_wcoll (List joblist)
{
    int i;
    hostlist_t hl = NULL;
    job_info_msg_t * msg;
    int32_t envjobid = 0;
    int alljobids = 0;

    if ((joblist == NULL) && (envjobid = _slurm_jobid()) < 0)
        return (NULL);

    if (slurm_load_jobs((time_t) NULL, &msg, 1) < 0)
        errx ("Unable to contact slurm controller: %s\n",
              slurm_strerror (errno));

    /*
     * Check for "all" in joblist
     */
    alljobids = _alljobids_requested (joblist);

    for (i = 0; i < msg->record_count; i++) {
        job_info_t *j = &msg->job_array[i];

        if (alljobids && j->job_state == JOB_RUNNING)
            hl = _hl_append (hl, j->nodes);
        else if (!joblist && (j->job_id == envjobid)) {
            /*
             * Only use SLURM_JOBID environment variable if user
             * didn't override with -j option
             */
            hl = hostlist_create (j->nodes);
            break;
        }
        else if (_jobid_requested (joblist, j->job_id)) {
            hl = _hl_append (hl, j->nodes);
            /*
             * Exit when there are no more jobids to search
             */
            if (list_count (joblist) == 0)
                break;
        }
    }

    slurm_free_job_info_msg (msg);

    if (hl)
        hostlist_uniq (hl);

    return (hl);
}

static hostlist_t _slurm_wcoll_partition (List partitionlist)
{
    int i;
    char * str;
    hostlist_t hl = NULL;
    partition_info_msg_t * msg;
    partition_info_t * p;
    ListIterator li;

    if (slurm_load_partitions((time_t) NULL, &msg, 1) < 0)
        errx ("Unable to contact slurm controller: %s\n",
              slurm_strerror (errno));

    for (i = 0; i < msg->record_count; i++) {
        p = &msg->partition_array[i];

        if (_partition_requested (partitionlist, p->name)) {
            hl = _hl_append (hl, p->nodes);
            /*
             * Exit when there are no more partitions to search
             */
            if (list_count (partitionlist) == 0)
                break;
        }
    }

    /*
     * Anything left in partitionlist wasn't found, emit a warning
     */
    li = list_iterator_create (partitionlist);
    while ((str = list_next (li)))
        err ("%p: Warning - partition %s not found\n", str);
    list_iterator_destroy (li);    /* release the iterator */

    slurm_free_partition_info_msg (msg);

    if (hl)
        hostlist_uniq (hl);

    return (hl);
}

/*
 * vi: tabstop=4 shiftwidth=4 expandtab
 */