
/src/modules/slurm.c

/*****************************************************************************\
 *  $Id$
 *****************************************************************************
 *  Copyright (C) 2001-2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Mark Grondona <mgrondona@llnl.gov>.
 *  UCRL-CODE-2003-005.
 *
 *  This file is part of Pdsh, a parallel remote shell program.
 *  For details, see <http://www.llnl.gov/linux/pdsh/>.
 *
 *  Pdsh is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Pdsh; if not, write to the Free Software Foundation, Inc.,
 *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
\*****************************************************************************/

#if HAVE_CONFIG_H
#  include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "src/common/hostlist.h"
#include "src/common/split.h"
#include "src/common/err.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/pdsh/xpopen.h"
#include "src/pdsh/ltdl.h"
#include "src/pdsh/mod.h"
#include "src/pdsh/opt.h"

/*
 * SLURM headers need to be included after pdsh header files to
 *  avoid possible conflicts with the definition of "bool"
 */
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>
#if STATIC_MODULES
#  define pdsh_module_info slurm_module_info
#  define pdsh_module_priority slurm_module_priority
#endif
/*
 *  Give this module low priority
 */
int pdsh_module_priority = 10;


/*
 *  Call this module after all option processing. The module will only
 *    try to read the SLURM_JOBID if opt->wcoll is not already set.
 *    Calling the module in postop allows us to be sure that all other
 *    modules had a chance to update the wcoll.
 */
static int mod_slurm_init(void);
static int mod_slurm_wcoll(opt_t *opt);
static int mod_slurm_exit(void);
static hostlist_t _slurm_wcoll(List jobids);
static hostlist_t _slurm_wcoll_partition(List partitions);
static int slurm_process_opt(opt_t *, int opt, char *arg);

static List job_list = NULL;
static List partition_list = NULL;

/*
 *  Export generic pdsh module options
 */
struct pdsh_module_operations slurm_module_ops = {
    (ModInitF)       mod_slurm_init,
    (ModExitF)       mod_slurm_exit,
    (ModReadWcollF)  mod_slurm_wcoll,
    (ModPostOpF)     NULL
};


/*
 * Export module options
 */
struct pdsh_module_option slurm_module_options[] =
 {
   { 'j', "jobid,...",
     "Run on nodes allocated to SLURM job(s) (\"all\" = all jobs)",
     DSH | PCP, (optFunc) slurm_process_opt
   },
   { 'P', "partition,...",
     "Run on nodes contained in SLURM partition",
     DSH | PCP, (optFunc) slurm_process_opt
   },
   PDSH_OPT_TABLE_END
 };
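
/*
 *  Illustrative invocations of the options above (examples added for
 *  clarity, not part of the original source; job ids and partition
 *  names are made up):
 *
 *    pdsh -j 1234,5678 uptime   # nodes allocated to jobs 1234 and 5678
 *    pdsh -j all uptime         # nodes of every currently running job
 *    pdsh -P debug,batch uptime # nodes in partitions "debug" and "batch"
 */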

/*
 * SLURM module info
 */
struct pdsh_module pdsh_module_info = {
  "misc",
  "slurm",
  "Mark Grondona <mgrondona@llnl.gov>",
  "Target nodes contained in SLURM jobs or partitions, read SLURM_JOBID by default",
  DSH | PCP,

  &slurm_module_ops,
  NULL,
  &slurm_module_options[0],
};


static int mod_slurm_init (void)
{
    return (0);
}


static int32_t str2jobid (char *str)
{
    char *p = NULL;
    long int jid;

    if (str == NULL)
        return (-1);

    jid = strtoul (str, &p, 10);

    if (*p != '\0')
        errx ("%p: invalid setting \"%s\" for -j or SLURM_JOBID\n", str);

    return ((int32_t) jid);
}
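
/*
 *  Illustrative behavior (comment added for clarity, not in the original):
 *    str2jobid ("1234")  returns 1234
 *    str2jobid (NULL)    returns -1
 *    str2jobid ("12x4")  calls errx() and terminates pdsh
 */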


static int
slurm_process_opt(opt_t *pdsh_opts, int opt, char *arg)
{
    switch (opt) {
    case 'j':
        job_list = list_split_append (job_list, ",", arg);
        break;
    case 'P':
        partition_list = list_split_append (partition_list, ",", arg);
        break;
    default:
        break;
    }

    return (0);
}

static int
mod_slurm_exit(void)
{
    if (job_list)
        list_destroy (job_list);

    if (partition_list)
        list_destroy (partition_list);

    return (0);
}

/*
 *  If no wcoll has been established by this time, look for the
 *    SLURM_JOBID env var, and set wcoll to the list of nodes allocated
 *    to that job.
 */
static int mod_slurm_wcoll(opt_t *opt)
{
    if (job_list && opt->wcoll)
        errx("%p: do not specify -j with any other node selection option.\n");

    if (partition_list && opt->wcoll)
        errx("%p: do not specify -P with any other node selection option.\n");

    if (partition_list && job_list)
        errx("%p: do not specify -j and -P together.\n");

    if (partition_list)
        opt->wcoll = _slurm_wcoll_partition (partition_list);

    if (!opt->wcoll)
        opt->wcoll = _slurm_wcoll (job_list);

    return 0;
}
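
/*
 *  Illustrative use of the SLURM_JOBID fallback (example added for
 *  clarity, not in the original). Inside a SLURM allocation, which sets
 *  SLURM_JOBID in the environment:
 *
 *    salloc -N4
 *    pdsh uptime        # no targets given; runs on the 4 allocated nodes
 */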

static int32_t _slurm_jobid (void)
{
    return (str2jobid (getenv ("SLURM_JOBID")));
}

static int _find_id (char *jobid, uint32_t *id)
{
    return (*id == str2jobid (jobid));
}

static int _find_str (char *jobid, char *str)
{
    return (strcmp (jobid, str) == 0);
}

/*
 * Return non-zero if jobid is in list of ids requested by user
 */
static int _jobid_requested (List l, uint32_t jobid)
{
    if (l == NULL)
        return (0);
    return (list_delete_all (l, (ListFindF)_find_id, &jobid));
}

static int _partition_requested (List l, char *partition)
{
    if (l == NULL)
        return (0);
    return (list_delete_all (l, (ListFindF)_find_str, partition));
}

static int _alljobids_requested (List l)
{
    char *all = "all";
    if (l == NULL)
        return (0);
    return (list_delete_all (l, (ListFindF)_find_str, all));
}
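
/*
 *  Note (comment added for clarity, not in the original): the
 *  _*_requested() helpers use list_delete_all(), which removes matching
 *  entries from the list as a side effect. This lets the scan loops
 *  below stop early once list_count() drops to zero, and leaves any
 *  unmatched names behind so they can be reported as warnings.
 */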

static hostlist_t _hl_append (hostlist_t hl, char *nodes)
{
    if (hl == NULL)
        return (hostlist_create (nodes));
    else
        hostlist_push (hl, nodes);
    return (hl);
}
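
/*
 *  Illustrative usage (example added for clarity, not in the original):
 *
 *    hostlist_t hl = NULL;
 *    hl = _hl_append (hl, "host[1-2]");  (first call creates the list)
 *    hl = _hl_append (hl, "host[2-4]");  (later calls push onto it)
 *    hostlist_uniq (hl);                 (callers dedupe afterwards)
 */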

static hostlist_t _slurm_wcoll (List joblist)
{
    int i;
    hostlist_t hl = NULL;
    job_info_msg_t * msg;
    int32_t envjobid = 0;
    int alljobids = 0;

    if ((joblist == NULL) && (envjobid = _slurm_jobid()) < 0)
        return (NULL);

    if (slurm_load_jobs((time_t) NULL, &msg, 1) < 0)
        errx ("Unable to contact slurm controller: %s\n",
              slurm_strerror (errno));

    /*
     *  Check for "all" in joblist
     */
    alljobids = _alljobids_requested (joblist);

    for (i = 0; i < msg->record_count; i++) {
        job_info_t *j = &msg->job_array[i];

        if (alljobids && j->job_state == JOB_RUNNING)
            hl = _hl_append (hl, j->nodes);
        else if (!joblist && (j->job_id == envjobid)) {
            /*
             *  Only use SLURM_JOBID environment variable if user
             *   didn't override with -j option
             */
            hl = hostlist_create (j->nodes);
            break;
        }
        else if (_jobid_requested (joblist, j->job_id)) {
            hl = _hl_append (hl, j->nodes);
            /*
             * Exit when there are no more jobids to search
             */
            if (list_count (joblist) == 0)
                break;
        }
    }

    slurm_free_job_info_msg (msg);

    if (hl)
        hostlist_uniq (hl);

    return (hl);
}

static hostlist_t _slurm_wcoll_partition (List partitionlist)
{
    int i;
    char * str;
    hostlist_t hl = NULL;
    partition_info_msg_t * msg;
    partition_info_t * p;
    ListIterator li;

    if (slurm_load_partitions((time_t) NULL, &msg, 1) < 0)
        errx ("Unable to contact slurm controller: %s\n",
              slurm_strerror (errno));

    for (i = 0; i < msg->record_count; i++) {
        p = &msg->partition_array[i];

        if (_partition_requested (partitionlist, p->name)) {
            hl = _hl_append (hl, p->nodes);
            /*
             * Exit when there are no more partitions to search
             */
            if (list_count (partitionlist) == 0)
                break;
        }
    }

    /*
     *  Anything left in partitionlist wasn't found, emit a warning
     */
    li = list_iterator_create(partitionlist);
    while ((str = list_next(li))) {
        err("%p: Warning - partition %s not found\n", str);
    }
    list_iterator_destroy(li);

    slurm_free_partition_info_msg (msg);

    if (hl)
        hostlist_uniq (hl);

    return (hl);
}
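
/*
 *  Example of the warning path above (illustrative, not in the original):
 *  `pdsh -P debug,nosuch uptime` targets the nodes of "debug" and prints
 *
 *    pdsh: Warning - partition nosuch not found
 *
 *  since "nosuch" is never deleted from partitionlist by the scan loop.
 */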

/*
 * vi: tabstop=4 shiftwidth=4 expandtab
 */