PageRenderTime 97ms CodeModel.GetById 14ms app.highlight 74ms RepoModel.GetById 1ms app.codeStats 0ms

/src/qsnet/qswutil.c

https://code.google.com/
C | 1255 lines | 777 code | 193 blank | 285 comment | 140 complexity | 7f29dfa25e83ff0e29c0fa2809e4130d MD5 | raw file
   1/*****************************************************************************\
   2 *  $Id$
   3 *****************************************************************************
   4 *  Copyright (C) 2001-2006 The Regents of the University of California.
   5 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   6 *  Written by Jim Garlick <garlick@llnl.gov>.
   7 *  UCRL-CODE-2003-005.
   8 *  
   9 *  This file is part of Pdsh, a parallel remote shell program.
  10 *  For details, see <http://www.llnl.gov/linux/pdsh/>.
  11 *  
  12 *  Pdsh is free software; you can redistribute it and/or modify it under
  13 *  the terms of the GNU General Public License as published by the Free
  14 *  Software Foundation; either version 2 of the License, or (at your option)
  15 *  any later version.
  16 *  
  17 *  Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
  18 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  19 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
  20 *  details.
  21 *  
  22 *  You should have received a copy of the GNU General Public License along
  23 *  with Pdsh; if not, write to the Free Software Foundation, Inc.,
  24 *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
  25\*****************************************************************************/
  26
  27#if     HAVE_CONFIG_H
  28#include "config.h"
  29#endif
  30
  31#include <stdio.h>
  32#include <sys/param.h>
  33#include <sys/types.h>
  34#include <sys/stat.h>
  35#include <sys/wait.h>
  36#include <syslog.h>
  37#include <errno.h>
  38#include <string.h>
  39#include <paths.h>
  40#include <stdarg.h>
  41#include <ctype.h>
  42#include <assert.h>
  43#include <stdlib.h>
  44#include <unistd.h>
  45#include <limits.h>             /* INT_MAX */
  46#include <pthread.h>
  47
  48#if HAVE_LIBELANCTRL
  49#  include <elan/elanctrl.h>
  50#  include <elan/capability.h>
  51
  52#  define HighNode    cap_highnode
  53#  define LowNode     cap_lownode
  54#  define HighContext cap_highcontext
  55#  define LowContext  cap_lowcontext
  56#  define Bitmap      cap_bitmap
  57#  define Type        cap_type
  58#  define UserKey     cap_userkey
  59#  define RailMask    cap_railmask
  60#  define Values      key_values
  61
  62/* We need these using the old libelan3 library calls
  63 *  so we redefine them to old values here.
  64 *  XXX: What is the equivalent for libelanctrl?
  65 */
  66#  define ELAN_USER_BASE_CONTEXT_NUM 0x020
  67#  define ELAN_USER_TOP_CONTEXT_NUM  0x7ff
  68
  69#include <sys/stat.h>
  70
  71#elif HAVE_LIBELAN3
  72#  include <elan3/elan3.h>
  73#  include <elan3/elanvp.h>
  74#else
  75#  error "Need either libelan3 or libelanctrl to compile this module."
  76#endif
  77
  78#include <rms/rmscall.h>
  79
  80#include <dlfcn.h>
  81
  82#include <elanhosts.h>
  83
  84#include "src/common/xmalloc.h"
  85#include "src/common/xstring.h"
  86#include "src/common/hostlist.h"
  87#include "src/common/list.h"
  88#include "src/common/err.h"
  89#include "qswutil.h"
  90
  91/* we will allocate program descriptions in this range */
  92/* XXX note: do not start at zero as libelan shifts to get unique shm id */
  93#define QSW_PRG_START  1
  94#define QSW_PRG_END    INT_MAX
  95
  96static int debug_syslog = 1;    /* syslog program setup at LOG_DEBUG level */
  97
  98/*
  99 *  Static "Elan Host" configuration
 100 */
 101static elanhost_config_t elanconf = NULL;
 102
 103
 104/* 
 105 *  Static function prototypes:
 106 */
 107static int _set_elan_ids(elanhost_config_t ec);
 108static void *neterr_thr(void *arg);
 109
 110
 111int qsw_init(void)
 112{
 113    assert(elanconf == NULL);
 114
 115    elanconf = elanhost_config_create();
 116
 117    if (elanhost_config_read(elanconf, NULL) < 0) {
 118        err("%p: error: %s\n", elanhost_config_err(elanconf));
 119        return -1;
 120    }
 121
 122    return 0;
 123}
 124
 125void qsw_fini(void)
 126{
 127    elanhost_config_destroy(elanconf);
 128}
 129
 130static int qsw_have_elan3(void)
 131{
 132#if HAVE_LIBELAN3
 133    return (1);
 134#else
 135    struct stat st;
 136
 137    if (stat("/proc/qsnet/elan3/device0", &st) < 0)
 138        return (0);
 139
 140    return (1);
 141#endif /* HAVE_LIBELAN3 */
 142    return (0);
 143}
 144
 145struct neterr_args {
 146    pthread_mutex_t *mutex;
 147    pthread_cond_t  *cond;
 148    int             neterr_rc;
 149};
 150
 151int qsw_spawn_neterr_thr(void)
 152{
 153    struct neterr_args args;
 154    pthread_attr_t attr;
 155    pthread_t neterr_tid;
 156    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 157    pthread_cond_t  cond  = PTHREAD_COND_INITIALIZER;
 158
 159    /* 
 160     * Only need to run neterr thread on Elan3 HW.
 161     */
 162    if (!qsw_have_elan3()) 
 163        return (0);
 164
 165    args.mutex = &mutex;
 166    args.cond  = &cond;
 167
 168    if ((errno = pthread_attr_init(&attr)))
 169        errx("%p: pthread_attr_init: %m\n");
 170
 171    errno = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
 172    if (errno)
 173        err("%p: pthread_attr_setdetachstate: %m");
 174
 175    pthread_mutex_lock(&mutex);
 176
 177    if ((errno = pthread_create(&neterr_tid, &attr, neterr_thr, &args)))
 178        return -1;
 179
 180    /*
 181     *  Wait for successful startup of neterr resolver thread before 
 182     *    returning control to main thread.
 183     */
 184    pthread_cond_wait(&cond, &mutex);
 185    pthread_mutex_unlock(&mutex);
 186
 187    return args.neterr_rc;
 188
 189    return 0;
 190}
 191
 192/*
 193 * Use dlopen () for libelan3.so (when needed)
 194 *   This allows us to build a single version of the qsnet modules
 195 *   for Elan3 and Elan4 QsNetII systems.
 196 */
 197
 198/* 
 199 * libelan3.so handle:
 200 */
 201static void * elan3h = NULL; 
 202
 203/*
 204 * Wrapper functions for needed libelan3 functions
 205 */
 206static int _elan3_init_neterr_svc (int dbglvl)
 207{
 208    static int (*init_svc) (int);
 209
 210    if (!(init_svc = dlsym (elan3h, "elan3_init_neterr_svc"))) 
 211        return (0);
 212
 213    return (init_svc (dbglvl));
 214}
 215
 216
 217static int _elan3_register_neterr_svc (void)
 218{
 219    static int (*reg_svc) (void);
 220
 221    if (!(reg_svc = dlsym (elan3h, "elan3_register_neterr_svc"))) 
 222        return (0);
 223
 224    return (reg_svc ());
 225}
 226
 227static int _elan3_run_neterr_svc (void)
 228{
 229    static int (*run_svc) ();
 230
 231    if (!(run_svc = dlsym (elan3h, "elan3_run_neterr_svc"))) 
 232        return (0);
 233
 234    return (run_svc ());
 235}
 236
 237
 238static int _elan3_load_neterr_svc (int i, char *host)
 239{
 240    static int (*load_svc) (int, char *);
 241
 242    if (!(load_svc = dlsym (elan3h, "elan3_load_neterr_svc"))) 
 243        return (0);
 244
 245    return (load_svc (i, host));
 246}
 247
 248
 249static int
 250_set_elan_ids(elanhost_config_t ec)
 251{
 252    int i;
 253    for (i = 0; i <= elanhost_config_maxid(ec); i++) {
 254        char *host = elanhost_elanid2host(ec, ELANHOST_EIP, i);
 255        if (!host)
 256            continue;
 257        
 258		if (_elan3_load_neterr_svc(i, host) < 0)
 259			err("%p: elan3_load_neterr_svc(%d, %s): %m", i, host);
 260	}
 261
 262    return 0;
 263}
 264
 265static void *neterr_thr(void *arg)
 266{	
 267    struct neterr_args *args = arg;
 268
 269    if (!(elan3h = dlopen ("libelan3.so", RTLD_LAZY))) {
 270        syslog(LOG_ERR, "unable to open libelan3.so: %s", dlerror());
 271        goto fail;
 272    }
 273
 274	if (!_elan3_init_neterr_svc(0)) {
 275		syslog(LOG_ERR, "elan3_init_neterr_svc: %m");
 276		goto fail;
 277	}
 278
 279	/* 
 280	 *  Attempt to register the neterr svc thread. If the address 
 281	 *   cannot be bound, then there is already a thread running, and
 282	 *   we should just exit with success.
 283	 */
 284	if (!_elan3_register_neterr_svc()) {
 285		if (errno != EADDRINUSE) {
 286			syslog(LOG_ERR, "elan3_register_neterr_svc: %m");
 287			goto fail;
 288		}
 289        /* error resolver already running, just return */
 290        goto done;
 291	}
 292
 293    /* 
 294     * Attempt to register elan ids with kernel if we successfully 
 295     *  registered the error resolver service.
 296     */
 297    _set_elan_ids(elanconf);
 298
 299   done:
 300	/* 
 301	 *  Signal main thread that we've successfully initialized
 302	 */
 303	pthread_mutex_lock(args->mutex);
 304	args->neterr_rc = 0;
 305	pthread_cond_signal(args->cond);
 306	pthread_mutex_unlock(args->mutex);
 307
 308	/*
 309	 *  Run the network error resolver thread. This should
 310	 *   never return. If it does, there's not much we can do
 311	 *   about it.
 312	 */
 313	_elan3_run_neterr_svc();
 314
 315    return NULL;
 316
 317   fail:
 318	pthread_mutex_lock(args->mutex);
 319	args->neterr_rc = -1;
 320	pthread_cond_signal(args->cond);
 321	pthread_mutex_unlock(args->mutex);
 322
 323	return NULL;
 324}
 325
 326static void
 327_free_it (void *item)
 328{
 329    Free((void **) &item);
 330}
 331
 332static List
 333_hostlist_to_elanids (hostlist_t nodelist)
 334{
 335    char *host = NULL;
 336    List l = list_create ((ListDelF) _free_it);
 337    hostlist_iterator_t i = hostlist_iterator_create (nodelist);
 338
 339    if (l == NULL)
 340        errx ("%p: list_create: %m");
 341
 342    if (i == NULL)
 343        errx ("%p: hostlist_iterator_create: %m");
 344
 345    while ((host = hostlist_next (i))) {
 346        int *id = Malloc (sizeof(int));
 347        
 348        if ((*id = elanhost_host2elanid (elanconf, host)) < 0) {
 349            err ("%p: Unable to get ElanId for \"%s\": %s\n", 
 350                 host, elanhost_config_err (elanconf));
 351            goto fail;
 352        }
 353
 354        list_append (l, id);
 355        free (host);
 356    }
 357    hostlist_iterator_destroy (i);
 358
 359    return (l);
 360
 361  fail: 
 362    if (host != NULL)
 363        free (host);
 364    if (i != NULL)
 365        hostlist_iterator_destroy (i);
 366    if (l != NULL)
 367        list_destroy (l);
 368
 369    return (NULL);
 370}
 371
 372static int
 373_elanid_min (List el)
 374{
 375    int *id;
 376    int  min = -1;
 377    ListIterator i = list_iterator_create (el);
 378
 379    while ((id = list_next (i))) {
 380        if ((*id < min) || (min == -1))
 381            min = *id;
 382    }
 383
 384    list_iterator_destroy (i);
 385
 386    return (min);
 387}
 388
 389static int
 390_elanid_max (List el)
 391{
 392    int *id;
 393    int  max = -1;
 394    ListIterator i = list_iterator_create (el);
 395
 396    while ((id = list_next (i))) {
 397        if ((*id > max) || (max == -1))
 398            max = *id;
 399    }
 400
 401    list_iterator_destroy (i);
 402
 403    return (max);
 404}
 405
 406
 407/*
 408 * Given a list of hostnames and the number of processes per node, 
 409 * set the correct bits in the capability's bitmap and set high and
 410 * low node id's.
 411 */
 412static int
 413_setbitmap(hostlist_t nodelist, int procs_per_node, int cyclic, 
 414           ELAN_CAPABILITY * cap)
 415{
 416    int *id;
 417    int nodes_in_bitmap;
 418    int rc = 0;
 419    List el;
 420    ListIterator itr;
 421
 422    if (!(el = _hostlist_to_elanids (nodelist)))
 423        return (-1);
 424
 425    cap->HighNode = _elanid_max (el);
 426    cap->LowNode  = _elanid_min (el);
 427
 428    if (cap->HighNode == -1 || cap->LowNode == -1)
 429        return -1;
 430
 431    nodes_in_bitmap = cap->HighNode - cap->LowNode + 1;
 432
 433    /*
 434     * There are (procs_per_node * nnodes) significant bits in the mask, 
 435     * each representing a process slot.  Bits are off where for holes 
 436     * corresponding to process slots for unallocated nodes.
 437     * For example, if nodes 4 and 6 are running two processes per node,
 438     * bits 0,1 (corresponding to the two processes on node 4) and bits 4,5
 439     * (corresponding to the two processes running no node 6) are set.
 440     *
 441     * Note that for QsNet, the bits have a different meaning depending
 442     * on whether the capability distribution type is cyclic or block.
 443     * For block distribution, the bits are laid out in node-major
 444     * format, while for cyclic distribution, a procid (or context) major
 445     * format is used. 
 446     * 
 447     * Example: 2 processes per node on nodes 0,2:
 448     *
 449     *        block                       cyclic
 450     *                                      
 451     *    2  |  1  |  0     NodeId     2 1 0 | 2 1 0
 452     *       |     |                         |
 453     *   1 0 | 1 0 | 1 0   ContextId     1   |   0         
 454     *       |     |                         |       
 455     *   5 4 | 3 2 | 1 0  Bit Numbers  5 4 3 | 2 1 0
 456     *       |     |                         |
 457     *  ---- +-----+-----             -------+-------
 458     *   1 1 | 0 0 | 1 1   Bit Value   1 0 1 | 1 0 1
 459     */
 460
 461    itr = list_iterator_create (el);
 462
 463    while ((id = list_next (itr))) {
 464        int node = (*id) - cap->LowNode; /* relative id w/in bitmap */
 465        int i;
 466
 467        for (i = 0; i < procs_per_node; i++) {
 468            int bit;
 469            if (cyclic) 
 470                bit = (i * nodes_in_bitmap) + node;
 471            else
 472                bit = (node * (procs_per_node)) + i;
 473
 474            if (bit >= (sizeof (cap->Bitmap) * 8)) {
 475                err ("%p: _setbitmap: bit %d out of range\n", bit);
 476                rc = -1;
 477                break;
 478            }
 479
 480            BT_SET(cap->Bitmap, bit);
 481        }
 482    }
 483    list_destroy (el);
 484
 485    return (rc);
 486}
 487
 488/*
 489 * Set a variable in the callers environment.  Args are printf style.
 490 * XXX Space is allocated on the heap and will never be reclaimed.
 491 * Example: setenvf("RMS_RANK=%d", rank);
 492 */
 493static int _setenvf(const char *fmt, ...)
 494{
 495    va_list ap;
 496    char buf[BUFSIZ];
 497    char *bufcpy;
 498
 499    va_start(ap, fmt);
 500    vsnprintf(buf, sizeof(buf), fmt, ap);
 501    va_end(ap);
 502
 503    bufcpy = strdup(buf);
 504    if (bufcpy == NULL)
 505        return -1;
 506    return putenv(bufcpy);
 507}
 508
 509static int _rms_setenv(qsw_info_t * qi)
 510{
 511    /* MPI wants some of these ... 
 512     *  (It doesn't anymore, but they are helpful when running
 513     *   parallel scripts - ashley@quadrics.com )
 514     */
 515    if (_setenvf("RMS_RANK=%d", qi->rank) < 0)
 516        return -1;
 517    if (_setenvf("RMS_NODEID=%d", qi->nodeid) < 0)
 518        return -1;
 519    if (_setenvf("RMS_PROCID=%d", qi->procid) < 0)
 520        return -1;
 521    if (_setenvf("RMS_NNODES=%d", qi->nnodes) < 0)
 522        return -1;
 523    if (_setenvf("RMS_NPROCS=%d", qi->nprocs) < 0)
 524        return -1;
 525
 526    if (_setenvf("ELAN_AUTO=pdsh") < 0)
 527        return -1;
 528    if (_setenvf("ELAN_JOBID=%d", qi->prgnum) < 0)
 529        return -1;
 530
 531#if 0
 532    /* I'm not sure what this should be set to yet,
 533     * libelan will do the right thing if it's not
 534     * set though. (ashley@quadrics.com) */
 535    if (_setenvf("LIBELAN_SHMKEY=%d", qi->prgnum) < 0)
 536        return -1;
 537#endif 
 538
 539    return 0;
 540}
 541
 542/*
 543 * Return the number of times qsw_encode_cap_bitamp/qsw_decode_cap_bitmap
 544 * must be called.
 545 */
 546int qsw_cap_bitmap_count(void)
 547{
 548    ELAN_CAPABILITY cap;
 549    int count = sizeof(cap.Bitmap) / sizeof(cap.Bitmap[0]);
 550
 551    assert(count % 16 == 0);
 552    return count;
 553}
 554
 555/*
 556 * Convert capability (all but cap->Bitmap) to string.
 557 */
 558int qsw_encode_cap(char *s, int len, ELAN_CAPABILITY * cap)
 559{
 560    int n;
 561
 562    if (sizeof(cap->UserKey.Values[0]) != 4) {
 563        err("%p: qsw_encode_cap: UserKey is unexpected size\n");
 564        return -1;
 565    }
 566    if (sizeof(cap->UserKey) / 4 != 4) {
 567        err("%p: qsw_encode_cap: UserKey array is unexpected size\n");
 568        return -1;
 569    }
 570#if HAVE_LIBELANCTRL
 571    cap->cap_spare = ELAN_CAP_UNINITIALISED ;
 572    n = snprintf(s, len, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x",
 573                           cap->UserKey.Values[0],
 574                           cap->UserKey.Values[1], 
 575                           cap->UserKey.Values[2], 
 576                           cap->UserKey.Values[3], 
 577                           cap->Type, /* short */
 578#ifdef ELAN_CAP_ELAN3
 579                           cap->cap_elan_type, /* char */
 580#else
 581                           cap->cap_spare,
 582#endif
 583                           cap->LowContext,
 584                           cap->HighContext,
 585                           cap->cap_mycontext,
 586                           cap->LowNode,
 587                           cap->HighNode,
 588                           cap->RailMask);
 589#elif HAVE_LIBELAN3
 590    n = snprintf(s, len, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x",
 591                           cap->UserKey.Values[0],
 592                           cap->UserKey.Values[1],
 593                           cap->UserKey.Values[2],
 594                           cap->UserKey.Values[3],
 595                           cap->Type,      /* short */
 596                           cap->LowContext,
 597                           cap->HighContext,
 598                           cap->MyContext,
 599                           cap->LowNode,
 600                           cap->HighNode,
 601                           cap->Entries,
 602                           cap->RailMask);
 603
 604#else
 605#error "Neither LIBELAN3 nor LIBELANCTRL defined!"
 606#endif
 607
 608    if (n < 0 || n > strlen(s)) {
 609        err("%p: qsw_encode_cap: string overflow\n");
 610        return -1;
 611    }
 612    return 0;
 613}
 614
 615/*
 616 * Convert cap->Bitmap to string.
 617 */
 618int qsw_encode_cap_bitmap(char *s, int len, ELAN_CAPABILITY * cap, int i)
 619{
 620    int n;
 621
 622    if (sizeof(cap->Bitmap[0]) != sizeof(unsigned int)) {
 623        err("%p: qsw_encode_cap_bitmap: Bitmap is unexpected size\n");
 624        return -1;
 625    }
 626    if ((sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0])) % 16 != 0) {
 627        err("%p: qsw_encode_cap_bitmap: Bitmap is not mult of 16\n");
 628        return -1;
 629    }
 630    if (i < 0 || i >= (sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0]))) {
 631        err("%p: qsw_encode_cap_bitmap: Bitmap index out of range\n");
 632        return -1;
 633    }
 634    n = snprintf(s, len, "%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x",
 635                 cap->Bitmap[i + 0], cap->Bitmap[i + 1],
 636                 cap->Bitmap[i + 2], cap->Bitmap[i + 3],
 637                 cap->Bitmap[i + 4], cap->Bitmap[i + 5],
 638                 cap->Bitmap[i + 6], cap->Bitmap[i + 7],
 639                 cap->Bitmap[i + 8], cap->Bitmap[i + 9],
 640                 cap->Bitmap[i + 10], cap->Bitmap[i + 11],
 641                 cap->Bitmap[i + 12], cap->Bitmap[i + 13],
 642                 cap->Bitmap[i + 14], cap->Bitmap[i + 15]);
 643    if (n == -1 || n > strlen(s)) {
 644        err("%p: qsw_encode_cap_bitmap: string overflow\n");
 645        return -1;
 646    }
 647    return 0;
 648}
 649
 650/*
 651 * Convert string to capability (all but cap->Bitmap).
 652 */
 653int qsw_decode_cap(char *s, ELAN_CAPABILITY * cap)
 654{
 655    int n;
 656
 657#if HAVE_LIBELANCTRL
 658    /* initialize capability */
 659    elan_nullcap(cap);
 660
 661    n =  sscanf(s, "%x.%x.%x.%x.%hx.%hx.%x.%x.%x.%x.%x.%x",
 662                     &cap->UserKey.Values[0],
 663                     &cap->UserKey.Values[1],
 664                     &cap->UserKey.Values[2],
 665                     &cap->UserKey.Values[3],
 666                     &cap->cap_type,      /* short */
 667#  ifdef ELAN_CAP_ELAN3
 668                     &cap->cap_elan_type, /* char */
 669#  else
 670                     &cap->cap_spare,     /* unsigned short */
 671#  endif
 672                     &cap->LowContext,
 673                     &cap->HighContext,
 674                     &cap->cap_mycontext,
 675                     &cap->LowNode,
 676                     &cap->HighNode,
 677                     &cap->RailMask);
 678
 679#elif HAVE_LIBELAN3
 680
 681    /* initialize capability */
 682    elan3_nullcap(cap);
 683
 684    /* fill in values sent from remote */
 685    n = sscanf(s, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x", 
 686                    &cap->UserKey.Values[0], 
 687                    &cap->UserKey.Values[1],
 688                    &cap->UserKey.Values[2],
 689                    &cap->UserKey.Values[3],
 690                    &cap->Type, /* short */
 691                    &cap->LowContext,
 692                    &cap->HighContext,
 693                    &cap->MyContext,
 694                    &cap->LowNode,
 695                    &cap->HighNode, 
 696                    &cap->Entries, 
 697                    &cap->RailMask);
 698#else
 699#  error "Neither LIBELANCTRL nor LIBELAN3 set!"
 700#endif
 701
 702    if (n != 12) {
 703        err("%p: qsw_decode_cap: scan error (%d of %d)\n", n, 12);
 704        return -1;
 705    }
 706    return 0;
 707}
 708
 709/*
 710 * Convert string to cap->Bitmap.
 711 */
 712int qsw_decode_cap_bitmap(char *s, ELAN_CAPABILITY * cap, int i)
 713{
 714    int n;
 715
 716    if (i < 0 || i >= sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0])) {
 717        err("%p: qsw_decode_cap_bitmap: BitMap index out of range\n");
 718        return -1;
 719    }
 720    n = sscanf(s, "%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x",
 721               &cap->Bitmap[i + 0], &cap->Bitmap[i + 1],
 722               &cap->Bitmap[i + 2], &cap->Bitmap[i + 3],
 723               &cap->Bitmap[i + 4], &cap->Bitmap[i + 5],
 724               &cap->Bitmap[i + 6], &cap->Bitmap[i + 7],
 725               &cap->Bitmap[i + 8], &cap->Bitmap[i + 9],
 726               &cap->Bitmap[i + 10], &cap->Bitmap[i + 11],
 727               &cap->Bitmap[i + 12], &cap->Bitmap[i + 13],
 728               &cap->Bitmap[i + 14], &cap->Bitmap[i + 15]);
 729    if (n != 16) {
 730        err("%p: qsw_decode_cap_bitmap(%d): scan error\n", i);
 731        return -1;
 732    }
 733    return 0;
 734}
 735
 736/*
 737 * string -> info
 738 */
 739int qsw_decode_info(char *s, qsw_info_t * qi)
 740{
 741    int n;
 742
 743    n = sscanf(s, "%x.%x.%x.%x.%x.%x",
 744               &qi->prgnum,
 745               &qi->rank,
 746               &qi->nodeid, &qi->procid, &qi->nnodes, &qi->nprocs);
 747    if (n != 6) {
 748        err("%p: qsw_decode_info: scan error\n");
 749        return -1;
 750    }
 751    return 0;
 752}
 753
 754/*
 755 * info -> string
 756 */
 757int qsw_encode_info(char *s, int len, qsw_info_t * qi)
 758{
 759    int n;
 760
 761    n = snprintf(s, len, "%x.%x.%x.%x.%x.%x",
 762                 qi->prgnum,
 763                 qi->rank, qi->nodeid, qi->procid, qi->nnodes, qi->nprocs);
 764    if (n == -1 || n > strlen(s)) {
 765        err("%p: qsw_encode_info: string overflow\n");
 766        return -1;
 767    }
 768    return 0;
 769}
 770
 771/*
 772 * Generate a random program number.  Normally these would be allocated,
 773 * but since we have no persistant daemon, we settle for random.
 774 * Must be called after qsw_init_capability (we seed lrand48 there).
 775 */
 776int qsw_get_prgnum(void)
 777{
 778    int prgnum;
 779
 780    prgnum = lrand48() % (QSW_PRG_END - QSW_PRG_START + 1);
 781    prgnum += QSW_PRG_START;
 782
 783    return prgnum;
 784}
 785
 786/*
 787 * Prepare a capability that will be passed to all the processes in a 
 788 * parallel program.
 789 * Function returns a 0 on success, -1 = fail.
 790 */
 791int
 792qsw_init_capability(ELAN_CAPABILITY * cap, int nprocs, hostlist_t nodelist,
 793                    int cyclic_alloc, unsigned int railmask)
 794{
 795    int i;
 796    int num_nodes = hostlist_count(nodelist);
 797    int procs_per_node = nprocs / num_nodes;
 798
 799    assert (railmask < QSW_RAILMASK_MAX);
 800
 801    srand48(getpid());
 802
 803    /*
 804     * Initialize for multi rail and either block or cyclic allocation.  
 805     * Set ELAN_CAP_TYPE_BROADCASTABLE later if appropriate.
 806     */
 807#if HAVE_LIBELANCTRL
 808    elan_nullcap(cap);
 809#elif HAVE_LIBELAN3
 810    elan3_nullcap(cap);
 811#else
 812#  error
 813#endif
 814
 815    if (cyclic_alloc)
 816        cap->Type = ELAN_CAP_TYPE_CYCLIC;
 817    else
 818        cap->Type = ELAN_CAP_TYPE_BLOCK;
 819    cap->Type |= ELAN_CAP_TYPE_MULTI_RAIL;
 820
 821    cap->RailMask = railmask;
 822
 823#if HAVE_LIBELANCTRL
 824#  ifdef ELAN_CAP_ELAN3
 825    cap->cap_elan_type = ELAN_CAP_ELAN3;
 826#  else
 827    cap->cap_spare = ELAN_CAP_UNINITIALISED;
 828#  endif
 829#endif
 830
 831    /*
 832     * UserKey is 128 bits of randomness which should be kept private.
 833     */
 834    for (i = 0; i < 4; i++)
 835        cap->UserKey.Values[i] = lrand48();
 836
 837    /*
 838     * Elan hardware context numbers must be unique per node.
 839     * One is allocated to each parallel process.  In order for processes 
 840     * on the same node to communicate, they must use contexts in the 
 841     *  hi-lo range of a common capability.  With pdsh we have no 
 842     * persistant daemon to allocate these, so we settle for a random one.  
 843     */
 844    cap->LowContext = lrand48() %
 845        (ELAN_USER_TOP_CONTEXT_NUM -
 846         (ELAN_USER_BASE_CONTEXT_NUM + procs_per_node - 1) - 1);
 847    cap->LowContext += ELAN_USER_BASE_CONTEXT_NUM;
 848    cap->HighContext = cap->LowContext + procs_per_node - 1;
 849    /* not necessary to initialize cap->MyContext */
 850
 851    /*
 852     * Describe the mapping of processes to nodes.
 853     * This sets cap->HighNode, cap->LowNode, and cap->Bitmap.
 854     */
 855    if (_setbitmap(nodelist, procs_per_node, cyclic_alloc, cap) < 0) {
 856        err("%p: do all target nodes have an Elan adapter?\n");
 857        return -1;
 858    }
 859
 860#if HAVE_LIBELAN3
 861    /* 
 862     * Set cap->Entries and add broadcast bit to cap->type based on 
 863     * cap->HighNode and cap->LowNode values set above.
 864     */
 865    cap->Entries = nprocs;
 866    if (cap->Entries > ELAN_MAX_VPS) {
 867        err("%p: program would have too many processes (max %d)\n",
 868                ELAN_MAX_VPS);
 869        return -1;
 870    }
 871#endif
 872
 873    /* 
 874     * As we now support segmented broadcast, always flag the capability
 875     * as broadcastable. 
 876     */
 877    /*if (abs(cap->HighNode - cap->LowNode) == num_nodes - 1) */
 878    cap->Type |= ELAN_CAP_TYPE_BROADCASTABLE;
 879
 880    return 0;
 881}
 882
 883static int 
 884_qsw_elan_nrails(ELAN_CAPABILITY * cap)
 885{
 886#if HAVE_LIBELANCTRL
 887    return elan_nrails (cap);
 888#elif HAVE_LIBELAN3
 889    return elan3_nrails (cap);
 890#endif
 891}
 892
 893
 894static int 
 895_qsw_cap_create(ELAN_CAPABILITY * cap, int nrails)
 896{
 897#if HAVE_LIBELANCTRL
 898    ELANCTRL_HANDLE handle;
 899
 900    /* 
 901     * Open up the Elan control device so we can create 
 902     * a new capability.  
 903     */
 904    if (elanctrl_open(&handle) != 0)
 905        errx("%p: elanctrl_open(): %m\n");
 906
 907    /* Push capability into device driver */
 908    if (elanctrl_create_cap(handle, cap) < 0)
 909        errx("%p: elanctrl_create_cap failed: %m\n");
 910
 911    /* 
 912     * Do not close elanctrl handle here, this can cause
 913     *  MPI initialization to fail somehow.
 914     *
 915     * elanctrl_close(handle); 
 916     */
 917
 918#elif HAVE_LIBELAN3
 919    int i, n = 0;
 920
 921    /* MULTI-RAIL: Create the capability in all rails */
 922    for (i = 0; (i < ELAN_MAX_RAILS) && (n < nrails); i++) {
 923        void *handle;
 924
 925        if (!(cap->RailMask & (1 << i)))
 926            continue;
 927
 928        /* 
 929         * Open up the control device so we can create a new 
 930         * capability.  This will fail if we don't have rw 
 931         * access to /dev/elan3/control[i]
 932         */
 933        if ((handle = elan3_control_open(i)) == NULL) 
 934            errx("%p: elan3_control_open(%d): %m\n", i);
 935
 936        /* Push capability into device driver */
 937        if (elan3_create(handle, cap) < 0)
 938            errx("%p: elan3_create failed: %m\n");
 939
 940        /* 
 941         * Do not close handle, for some reason this causes
 942         *  elan3_attach to return EINVAL...
 943         *  
 944         * elan3_control_close(handle);
 945         */
 946
 947        n++;
 948
 949    }
 950
 951#endif /* HAVE_LIBELANCTRL */
 952    return (0);
 953}
 954
 955/*
 956 * Take necessary steps to set up to run an Elan MPI "program" 
 957 * (set of processes) on a node.  
 958 *
 959 * Process 1        Process 2      |        Process 3
 960 * read args                       |
 961 * fork  -------  rms_prgcreate    |
 962 * waitpid        elan3_create     |
 963 *                rms_prgaddcap    |
 964 *                fork N procs  ---+------  rms_setcap
 965 *                wait all         |        setup RMS_ env        
 966 *                                 |         setuid, etc.
 967 *                                 |         exec mpi process
 968 *                exit             |
 969 * rms_prgdestroy                  |
 970 * exit                            |     (one pair of processes per mpi proc!)
 971 *
 972 * Explanation of the two fork(2) calls:
 973 * - The first fork is required because rms_prgdestroy can't occur in the 
 974 *   process that calls rms_prgcreate (since it is a member, ECHILD).
 975 * - The second fork is required when running multiple processes per node 
 976 *   because each process must announce its use of one of the hw contexts 
 977 *   in the range allocated in the capability.
 978 *
 979 * One process:
 980 *    init-xinetd-+-in.qshd---in.qshd---in.qshd---sleep
 981 * Two processes:
 982 *    init-xinetd-+-in.qshd---in.qshd---2*[in.qshd---sleep]
 983 * (if stderr backchannel is active, add one in.qshd)
 984 *   
 985 * Any errors result in a message on stderr and program exit.
 986 */
 987void qsw_setup_program(ELAN_CAPABILITY * cap, qsw_info_t * qi, uid_t uid)
 988{
 989    int pid;
 990    int i;
 991    int nrails;
 992    int cpid[ELAN_MAX_VPS];
 993    int procs_per_node;
 994    int proc_index;
 995
 996    if (qi->nprocs > ELAN_MAX_VPS)      /* should catch this in client */
 997        errx("%p: too many processes requested\n");
 998
 999    /* 
1000     * First fork.  Parent waits for child to terminate, then cleans up.
1001     */
1002    pid = fork();
1003    switch (pid) {
1004    case -1:                   /* error */
1005        errx("%p: fork: %m\n");
1006    case 0:                    /* child falls thru */
1007        break;
1008    default:                   /* parent */
1009        if (waitpid(pid, NULL, 0) < 0)
1010            errx("%p: waitpid: %m\n");
1011        while (rms_prgdestroy(qi->prgnum) < 0) {
1012            if (errno != ECHILD)
1013                errx("%p: rms_prgdestroy: %m\n");
1014            sleep(1);           /* waitprg would be nice! */
1015        }
1016        exit(0);
1017    }
1018    /* child continues here */
1019
1020    nrails = _qsw_elan_nrails(cap);
1021
1022    /* associate this process and its children with prgnum */
1023    if (rms_prgcreate(qi->prgnum, uid, 1) < 0)  /* 1 cpu (bogus!) */
1024        errx("%p: rms_prgcreate %d failed: %m\n", qi->prgnum);
1025
1026    /* 
1027     * Set up capability 
1028     */
1029    if (_qsw_cap_create(cap, nrails) < 0)
1030        errx("%p: unable to set up Elan capability\n");
1031
1032    /* 
1033     * Make cap known via rms_getcap/rms_ncaps 
1034     *  to members of this prgnum 
1035     */
1036    for (i = 0; i < nrails; i++) {
1037        if (rms_prgaddcap(qi->prgnum, i, cap) < 0)
1038            errx("%p: rms_prgaddcap failed: %m\n");
1039    }
1040
1041    if (debug_syslog) {
1042        char tmpstr[1024];
1043
1044        syslog(LOG_DEBUG, "prg %d cap %s bitmap 0x%.8x", qi->prgnum,
1045#if HAVE_LIBELANCTRL
1046                elan_capability_string(cap, tmpstr),
1047#elif HAVE_LIBELAN3
1048                elan3_capability_string(cap, tmpstr), 
1049#endif
1050                cap->Bitmap[0]);
1051    }
1052
1053    /* 
1054     * Second fork - once for each process.
1055     * Parent waits for all children to exit the it exits.
1056     * Child assigns hardware context to each process, then forks again...
1057     */
1058    procs_per_node = qi->nprocs / qi->nnodes;
1059    for (proc_index = 0; proc_index < procs_per_node; proc_index++) {
1060        cpid[proc_index] = fork();
1061        if (cpid[proc_index] < 0)
1062            errx("%p: fork (%d): %m\n", proc_index);
1063        else if (cpid[proc_index] == 0)
1064            break;
1065    }
1066    /* parent */
1067    if (proc_index == procs_per_node) {
1068        int waiting = procs_per_node;
1069        int i;
1070
1071        while (waiting > 0) {
1072            pid = waitpid(0, NULL, 0);  /* any in pgrp */
1073            if (pid < 0)
1074                errx("%p: waitpid: %m\n");
1075            for (i = 0; i < procs_per_node; i++) {
1076                if (cpid[i] == pid)
1077                    waiting--;
1078            }
1079        }
1080        exit(0);
1081    }
1082    /* child falls through here */
1083    /* proc_index will be set to the child's index */
1084
1085    /*
1086     * Assign elan hardware context to current process.
1087     * - arg1 is an index into the kernel's list of caps for this 
1088     *   program desc (added by rms_prgaddcap).  There will be
1089     *   one per rail.
1090     * - arg2 indexes the hw ctxt range in the capability
1091     *   [cap->LowContext, cap->HighContext]
1092     */
1093    for (i = 0; i < nrails; i++) {
1094        if (rms_setcap(i, proc_index) < 0)
1095            errx("%p: rms_setcap (%d): %m\n", proc_index);
1096    }
1097
1098    /* set RMS_ environment vars */
1099    switch (cap->Type & ELAN_CAP_TYPE_MASK) {
1100        case ELAN_CAP_TYPE_BLOCK:
1101            qi->procid = (qi->nodeid * procs_per_node) + proc_index;
1102            break;
1103        case ELAN_CAP_TYPE_CYCLIC:
1104            qi->procid = qi->nodeid + (proc_index * qi->nnodes);
1105            break;
1106        default:
1107            errx("%p: unsupported Elan capability type\n");
1108    }
1109    qi->rank = qi->procid;
1110    if (_rms_setenv(qi) < 0)
1111        errx("%p: failed to set environment variables: %m\n");
1112    /* Exec the process... */
1113}
1114
/*
 * Thin wrapper: forwards prgid and signo to rms_prgsignal() and hands
 * its result back to the caller unchanged.
 */
int qsw_prgsignal(int prgid, int signo)
{
    int rc = rms_prgsignal(prgid, signo);

    return rc;
}
1119
1120#ifdef TEST_MAIN
1121/* encode info, then decode and check that the result is what we started with */
1122static void _verify_info_encoding(qsw_info_t * qi)
1123{
1124    int err;
1125    char tmpstr[1024];
1126    qsw_info_t qicpy;
1127
1128    err = qsw_encode_info(tmpstr, sizeof(tmpstr), qi);
1129    assert(err >= 0);
1130    err = qsw_decode_info(tmpstr, &qicpy);
1131    assert(memcmp(qi, &qicpy, sizeof(qicpy)) == 0);
1132}
1133
1134/* encode cap, then decode and check that the result is what we started with */
1135static void _verify_cap_encoding(ELAN_CAPABILITY * cap)
1136{
1137    ELAN_CAPABILITY capcpy;
1138    char tmpstr[1024];
1139    int err;
1140
1141    err = qsw_encode_cap(tmpstr, sizeof(tmpstr), cap);
1142    assert(err >= 0);
1143    err = qsw_decode_cap(tmpstr, &capcpy);
1144    assert(err >= 0);
1145/*assert(ELAN_CAP_MATCH(&cap, &cap2)); *//* broken - see GNATS #3875 */
1146    assert(memcmp(cap, &capcpy, sizeof(capcpy)) == 0);
1147}
1148
/*
 * Concatenate args into a single space-separated string.
 *
 *   buf  - destination buffer
 *   len  - total size of buf in bytes, including room for the terminator
 *   argc - number of entries in argv to join
 *   argv - strings to join
 *
 * The result is always NUL-terminated when len > 0 and is silently
 * truncated if the joined arguments do not fit.  Nothing is written
 * when len <= 0.
 */
static void _strncatargs(char *buf, int len, int argc, char *argv[])
{
    size_t size;
    size_t used = 0;            /* bytes already written, excluding NUL */
    int i;

    if (len <= 0)               /* no room even for the terminator */
        return;
    size = (size_t) len;
    buf[0] = '\0';

    for (i = 0; i < argc && used + 1 < size; i++) {
        /* bound each write by the space that is actually left */
        int n = snprintf(buf + used, size - used, "%s%s",
                         (i > 0) ? " " : "", argv[i]);

        if (n < 0) {            /* output error: keep what we have so far */
            buf[used] = '\0';
            break;
        }
        if ((size_t) n >= size - used)
            break;              /* truncated; snprintf terminated buf */
        used += (size_t) n;
    }
}
1164
/*
 * Report the command-line synopsis of the test program through errx().
 */
static void _usage(void)
{
    errx("Usage %p [ -n procs ] [ -u uid ] command args...\n");
}
1169
1170/* 
1171 * Test program for qsw runtime routines.  Run one or more processes locally, 
1172 * e.g. for MPI ping test across shared memory:
1173 *    qrun -n 2 -u 5588 mping 1 32768
1174 */
1175int main(int argc, char *argv[])
1176{
1177    extern char *optarg;
1178    extern int optind;
1179
1180    char cmdbuf[1024];
1181    ELAN_CAPABILITY cap;
1182    int c;
1183    char *p;
1184    uid_t uid = 0;
1185    hostlist_t wcoll = hostlist_create("");
1186    char hostname[MAXHOSTNAMELEN];
1187    qsw_info_t qinfo = {
1188        nnodes:1,
1189        nprocs:1,
1190    };
1191
1192    err_init(xbasename(argv[0]));       /* init err package */
1193
1194    while ((c = getopt(argc, argv, "u:n:")) != EOF) {
1195        switch (c) {
1196        case 'u':
1197            uid = atoi(optarg);
1198            break;
1199        case 'n':
1200            qinfo.nprocs = atoi(optarg);
1201            break;
1202        default:
1203            _usage();
1204        }
1205    }
1206
1207    argc -= optind;
1208    argv += optind;
1209
1210    if (argc == 0)
1211        _usage();
1212
1213    /* prep arg for the shell */
1214    _strncatargs(cmdbuf, sizeof(cmdbuf), argc, argv);
1215
1216    /* create working collective containing only this host */
1217    if (gethostname(hostname, sizeof(hostname)) < 0)
1218        errx("%p: gethostname: %m\n");
1219    if ((p = strchr(hostname, '.')))
1220        *p = '\0';
1221    hostlist_push(wcoll, hostname);
1222
1223    qsw_init();
1224
1225    /* initialize capability for this "program" */
1226    if (qsw_init_capability(&cap, qinfo.nprocs / qinfo.nnodes, wcoll, 0) < 0)
1227        errx("%p: failed to initialize Elan capability\n");
1228
1229    /* assert encode/decode routines work (we don't use them here) */
1230    _verify_info_encoding(&qinfo);
1231    _verify_cap_encoding(&cap);
1232
1233    /* generate random program number */
1234    qinfo.prgnum = qsw_get_prgnum();
1235
1236    /* set up capabilities, environment, fork, etc.. */
1237    qsw_setup_program(&cap, &qinfo, uid);
1238    /* multiple threads continue on here (one per processes) */
1239
1240    if (seteuid(uid) < 0)
1241        errx("%p: seteuid: %m\n");
1242    err("%p: %d:%d executing /bin/bash -c %s\n",
1243        qinfo.prgnum, qinfo.procid, cmdbuf);
1244    execl("/bin/bash", "bash", "-c", cmdbuf, 0);
1245    errx("%p: exec of shell failed: %m\n");
1246
1247    qsw_fini();
1248
1249    exit(0);
1250}
1251#endif                          /* TEST_MAIN */
1252
1253/*
1254 * vi:tabstop=4 shiftwidth=4 expandtab
1255 */