/src/qsnet/qswutil.c
C | 1255 lines | 777 code | 193 blank | 285 comment | 140 complexity | 7f29dfa25e83ff0e29c0fa2809e4130d MD5 | raw file
/*****************************************************************************\
 *  $Id$
 *****************************************************************************
 *  Copyright (C) 2001-2006 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Jim Garlick <garlick@llnl.gov>.
 *  UCRL-CODE-2003-005.
 *
 *  This file is part of Pdsh, a parallel remote shell program.
 *  For details, see <http://www.llnl.gov/linux/pdsh/>.
 *
 *  Pdsh is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Pdsh; if not, write to the Free Software Foundation, Inc.,
 *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
25\*****************************************************************************/ 26 27#if HAVE_CONFIG_H 28#include "config.h" 29#endif 30 31#include <stdio.h> 32#include <sys/param.h> 33#include <sys/types.h> 34#include <sys/stat.h> 35#include <sys/wait.h> 36#include <syslog.h> 37#include <errno.h> 38#include <string.h> 39#include <paths.h> 40#include <stdarg.h> 41#include <ctype.h> 42#include <assert.h> 43#include <stdlib.h> 44#include <unistd.h> 45#include <limits.h> /* INT_MAX */ 46#include <pthread.h> 47 48#if HAVE_LIBELANCTRL 49# include <elan/elanctrl.h> 50# include <elan/capability.h> 51 52# define HighNode cap_highnode 53# define LowNode cap_lownode 54# define HighContext cap_highcontext 55# define LowContext cap_lowcontext 56# define Bitmap cap_bitmap 57# define Type cap_type 58# define UserKey cap_userkey 59# define RailMask cap_railmask 60# define Values key_values 61 62/* We need these using the old libelan3 library calls 63 * so we redefine them to old values here. 64 * XXX: What is the equivalent for libelanctrl? 65 */ 66# define ELAN_USER_BASE_CONTEXT_NUM 0x020 67# define ELAN_USER_TOP_CONTEXT_NUM 0x7ff 68 69#include <sys/stat.h> 70 71#elif HAVE_LIBELAN3 72# include <elan3/elan3.h> 73# include <elan3/elanvp.h> 74#else 75# error "Need either libelan3 or libelanctrl to compile this module." 
76#endif 77 78#include <rms/rmscall.h> 79 80#include <dlfcn.h> 81 82#include <elanhosts.h> 83 84#include "src/common/xmalloc.h" 85#include "src/common/xstring.h" 86#include "src/common/hostlist.h" 87#include "src/common/list.h" 88#include "src/common/err.h" 89#include "qswutil.h" 90 91/* we will allocate program descriptions in this range */ 92/* XXX note: do not start at zero as libelan shifts to get unique shm id */ 93#define QSW_PRG_START 1 94#define QSW_PRG_END INT_MAX 95 96static int debug_syslog = 1; /* syslog program setup at LOG_DEBUG level */ 97 98/* 99 * Static "Elan Host" configuration 100 */ 101static elanhost_config_t elanconf = NULL; 102 103 104/* 105 * Static function prototypes: 106 */ 107static int _set_elan_ids(elanhost_config_t ec); 108static void *neterr_thr(void *arg); 109 110 111int qsw_init(void) 112{ 113 assert(elanconf == NULL); 114 115 elanconf = elanhost_config_create(); 116 117 if (elanhost_config_read(elanconf, NULL) < 0) { 118 err("%p: error: %s\n", elanhost_config_err(elanconf)); 119 return -1; 120 } 121 122 return 0; 123} 124 125void qsw_fini(void) 126{ 127 elanhost_config_destroy(elanconf); 128} 129 130static int qsw_have_elan3(void) 131{ 132#if HAVE_LIBELAN3 133 return (1); 134#else 135 struct stat st; 136 137 if (stat("/proc/qsnet/elan3/device0", &st) < 0) 138 return (0); 139 140 return (1); 141#endif /* HAVE_LIBELAN3 */ 142 return (0); 143} 144 145struct neterr_args { 146 pthread_mutex_t *mutex; 147 pthread_cond_t *cond; 148 int neterr_rc; 149}; 150 151int qsw_spawn_neterr_thr(void) 152{ 153 struct neterr_args args; 154 pthread_attr_t attr; 155 pthread_t neterr_tid; 156 pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 157 pthread_cond_t cond = PTHREAD_COND_INITIALIZER; 158 159 /* 160 * Only need to run neterr thread on Elan3 HW. 
161 */ 162 if (!qsw_have_elan3()) 163 return (0); 164 165 args.mutex = &mutex; 166 args.cond = &cond; 167 168 if ((errno = pthread_attr_init(&attr))) 169 errx("%p: pthread_attr_init: %m\n"); 170 171 errno = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); 172 if (errno) 173 err("%p: pthread_attr_setdetachstate: %m"); 174 175 pthread_mutex_lock(&mutex); 176 177 if ((errno = pthread_create(&neterr_tid, &attr, neterr_thr, &args))) 178 return -1; 179 180 /* 181 * Wait for successful startup of neterr resolver thread before 182 * returning control to main thread. 183 */ 184 pthread_cond_wait(&cond, &mutex); 185 pthread_mutex_unlock(&mutex); 186 187 return args.neterr_rc; 188 189 return 0; 190} 191 192/* 193 * Use dlopen () for libelan3.so (when needed) 194 * This allows us to build a single version of the qsnet modules 195 * for Elan3 and Elan4 QsNetII systems. 196 */ 197 198/* 199 * libelan3.so handle: 200 */ 201static void * elan3h = NULL; 202 203/* 204 * Wrapper functions for needed libelan3 functions 205 */ 206static int _elan3_init_neterr_svc (int dbglvl) 207{ 208 static int (*init_svc) (int); 209 210 if (!(init_svc = dlsym (elan3h, "elan3_init_neterr_svc"))) 211 return (0); 212 213 return (init_svc (dbglvl)); 214} 215 216 217static int _elan3_register_neterr_svc (void) 218{ 219 static int (*reg_svc) (void); 220 221 if (!(reg_svc = dlsym (elan3h, "elan3_register_neterr_svc"))) 222 return (0); 223 224 return (reg_svc ()); 225} 226 227static int _elan3_run_neterr_svc (void) 228{ 229 static int (*run_svc) (); 230 231 if (!(run_svc = dlsym (elan3h, "elan3_run_neterr_svc"))) 232 return (0); 233 234 return (run_svc ()); 235} 236 237 238static int _elan3_load_neterr_svc (int i, char *host) 239{ 240 static int (*load_svc) (int, char *); 241 242 if (!(load_svc = dlsym (elan3h, "elan3_load_neterr_svc"))) 243 return (0); 244 245 return (load_svc (i, host)); 246} 247 248 249static int 250_set_elan_ids(elanhost_config_t ec) 251{ 252 int i; 253 for (i = 0; i <= 
elanhost_config_maxid(ec); i++) { 254 char *host = elanhost_elanid2host(ec, ELANHOST_EIP, i); 255 if (!host) 256 continue; 257 258 if (_elan3_load_neterr_svc(i, host) < 0) 259 err("%p: elan3_load_neterr_svc(%d, %s): %m", i, host); 260 } 261 262 return 0; 263} 264 265static void *neterr_thr(void *arg) 266{ 267 struct neterr_args *args = arg; 268 269 if (!(elan3h = dlopen ("libelan3.so", RTLD_LAZY))) { 270 syslog(LOG_ERR, "unable to open libelan3.so: %s", dlerror()); 271 goto fail; 272 } 273 274 if (!_elan3_init_neterr_svc(0)) { 275 syslog(LOG_ERR, "elan3_init_neterr_svc: %m"); 276 goto fail; 277 } 278 279 /* 280 * Attempt to register the neterr svc thread. If the address 281 * cannot be bound, then there is already a thread running, and 282 * we should just exit with success. 283 */ 284 if (!_elan3_register_neterr_svc()) { 285 if (errno != EADDRINUSE) { 286 syslog(LOG_ERR, "elan3_register_neterr_svc: %m"); 287 goto fail; 288 } 289 /* error resolver already running, just return */ 290 goto done; 291 } 292 293 /* 294 * Attempt to register elan ids with kernel if we successfully 295 * registered the error resolver service. 296 */ 297 _set_elan_ids(elanconf); 298 299 done: 300 /* 301 * Signal main thread that we've successfully initialized 302 */ 303 pthread_mutex_lock(args->mutex); 304 args->neterr_rc = 0; 305 pthread_cond_signal(args->cond); 306 pthread_mutex_unlock(args->mutex); 307 308 /* 309 * Run the network error resolver thread. This should 310 * never return. If it does, there's not much we can do 311 * about it. 
312 */ 313 _elan3_run_neterr_svc(); 314 315 return NULL; 316 317 fail: 318 pthread_mutex_lock(args->mutex); 319 args->neterr_rc = -1; 320 pthread_cond_signal(args->cond); 321 pthread_mutex_unlock(args->mutex); 322 323 return NULL; 324} 325 326static void 327_free_it (void *item) 328{ 329 Free((void **) &item); 330} 331 332static List 333_hostlist_to_elanids (hostlist_t nodelist) 334{ 335 char *host = NULL; 336 List l = list_create ((ListDelF) _free_it); 337 hostlist_iterator_t i = hostlist_iterator_create (nodelist); 338 339 if (l == NULL) 340 errx ("%p: list_create: %m"); 341 342 if (i == NULL) 343 errx ("%p: hostlist_iterator_create: %m"); 344 345 while ((host = hostlist_next (i))) { 346 int *id = Malloc (sizeof(int)); 347 348 if ((*id = elanhost_host2elanid (elanconf, host)) < 0) { 349 err ("%p: Unable to get ElanId for \"%s\": %s\n", 350 host, elanhost_config_err (elanconf)); 351 goto fail; 352 } 353 354 list_append (l, id); 355 free (host); 356 } 357 hostlist_iterator_destroy (i); 358 359 return (l); 360 361 fail: 362 if (host != NULL) 363 free (host); 364 if (i != NULL) 365 hostlist_iterator_destroy (i); 366 if (l != NULL) 367 list_destroy (l); 368 369 return (NULL); 370} 371 372static int 373_elanid_min (List el) 374{ 375 int *id; 376 int min = -1; 377 ListIterator i = list_iterator_create (el); 378 379 while ((id = list_next (i))) { 380 if ((*id < min) || (min == -1)) 381 min = *id; 382 } 383 384 list_iterator_destroy (i); 385 386 return (min); 387} 388 389static int 390_elanid_max (List el) 391{ 392 int *id; 393 int max = -1; 394 ListIterator i = list_iterator_create (el); 395 396 while ((id = list_next (i))) { 397 if ((*id > max) || (max == -1)) 398 max = *id; 399 } 400 401 list_iterator_destroy (i); 402 403 return (max); 404} 405 406 407/* 408 * Given a list of hostnames and the number of processes per node, 409 * set the correct bits in the capability's bitmap and set high and 410 * low node id's. 
411 */ 412static int 413_setbitmap(hostlist_t nodelist, int procs_per_node, int cyclic, 414 ELAN_CAPABILITY * cap) 415{ 416 int *id; 417 int nodes_in_bitmap; 418 int rc = 0; 419 List el; 420 ListIterator itr; 421 422 if (!(el = _hostlist_to_elanids (nodelist))) 423 return (-1); 424 425 cap->HighNode = _elanid_max (el); 426 cap->LowNode = _elanid_min (el); 427 428 if (cap->HighNode == -1 || cap->LowNode == -1) 429 return -1; 430 431 nodes_in_bitmap = cap->HighNode - cap->LowNode + 1; 432 433 /* 434 * There are (procs_per_node * nnodes) significant bits in the mask, 435 * each representing a process slot. Bits are off where for holes 436 * corresponding to process slots for unallocated nodes. 437 * For example, if nodes 4 and 6 are running two processes per node, 438 * bits 0,1 (corresponding to the two processes on node 4) and bits 4,5 439 * (corresponding to the two processes running no node 6) are set. 440 * 441 * Note that for QsNet, the bits have a different meaning depending 442 * on whether the capability distribution type is cyclic or block. 443 * For block distribution, the bits are laid out in node-major 444 * format, while for cyclic distribution, a procid (or context) major 445 * format is used. 
446 * 447 * Example: 2 processes per node on nodes 0,2: 448 * 449 * block cyclic 450 * 451 * 2 | 1 | 0 NodeId 2 1 0 | 2 1 0 452 * | | | 453 * 1 0 | 1 0 | 1 0 ContextId 1 | 0 454 * | | | 455 * 5 4 | 3 2 | 1 0 Bit Numbers 5 4 3 | 2 1 0 456 * | | | 457 * ---- +-----+----- -------+------- 458 * 1 1 | 0 0 | 1 1 Bit Value 1 0 1 | 1 0 1 459 */ 460 461 itr = list_iterator_create (el); 462 463 while ((id = list_next (itr))) { 464 int node = (*id) - cap->LowNode; /* relative id w/in bitmap */ 465 int i; 466 467 for (i = 0; i < procs_per_node; i++) { 468 int bit; 469 if (cyclic) 470 bit = (i * nodes_in_bitmap) + node; 471 else 472 bit = (node * (procs_per_node)) + i; 473 474 if (bit >= (sizeof (cap->Bitmap) * 8)) { 475 err ("%p: _setbitmap: bit %d out of range\n", bit); 476 rc = -1; 477 break; 478 } 479 480 BT_SET(cap->Bitmap, bit); 481 } 482 } 483 list_destroy (el); 484 485 return (rc); 486} 487 488/* 489 * Set a variable in the callers environment. Args are printf style. 490 * XXX Space is allocated on the heap and will never be reclaimed. 491 * Example: setenvf("RMS_RANK=%d", rank); 492 */ 493static int _setenvf(const char *fmt, ...) 494{ 495 va_list ap; 496 char buf[BUFSIZ]; 497 char *bufcpy; 498 499 va_start(ap, fmt); 500 vsnprintf(buf, sizeof(buf), fmt, ap); 501 va_end(ap); 502 503 bufcpy = strdup(buf); 504 if (bufcpy == NULL) 505 return -1; 506 return putenv(bufcpy); 507} 508 509static int _rms_setenv(qsw_info_t * qi) 510{ 511 /* MPI wants some of these ... 
512 * (It doesn't anymore, but they are helpful when running 513 * parallel scripts - ashley@quadrics.com ) 514 */ 515 if (_setenvf("RMS_RANK=%d", qi->rank) < 0) 516 return -1; 517 if (_setenvf("RMS_NODEID=%d", qi->nodeid) < 0) 518 return -1; 519 if (_setenvf("RMS_PROCID=%d", qi->procid) < 0) 520 return -1; 521 if (_setenvf("RMS_NNODES=%d", qi->nnodes) < 0) 522 return -1; 523 if (_setenvf("RMS_NPROCS=%d", qi->nprocs) < 0) 524 return -1; 525 526 if (_setenvf("ELAN_AUTO=pdsh") < 0) 527 return -1; 528 if (_setenvf("ELAN_JOBID=%d", qi->prgnum) < 0) 529 return -1; 530 531#if 0 532 /* I'm not sure what this should be set to yet, 533 * libelan will do the right thing if it's not 534 * set though. (ashley@quadrics.com) */ 535 if (_setenvf("LIBELAN_SHMKEY=%d", qi->prgnum) < 0) 536 return -1; 537#endif 538 539 return 0; 540} 541 542/* 543 * Return the number of times qsw_encode_cap_bitamp/qsw_decode_cap_bitmap 544 * must be called. 545 */ 546int qsw_cap_bitmap_count(void) 547{ 548 ELAN_CAPABILITY cap; 549 int count = sizeof(cap.Bitmap) / sizeof(cap.Bitmap[0]); 550 551 assert(count % 16 == 0); 552 return count; 553} 554 555/* 556 * Convert capability (all but cap->Bitmap) to string. 
557 */ 558int qsw_encode_cap(char *s, int len, ELAN_CAPABILITY * cap) 559{ 560 int n; 561 562 if (sizeof(cap->UserKey.Values[0]) != 4) { 563 err("%p: qsw_encode_cap: UserKey is unexpected size\n"); 564 return -1; 565 } 566 if (sizeof(cap->UserKey) / 4 != 4) { 567 err("%p: qsw_encode_cap: UserKey array is unexpected size\n"); 568 return -1; 569 } 570#if HAVE_LIBELANCTRL 571 cap->cap_spare = ELAN_CAP_UNINITIALISED ; 572 n = snprintf(s, len, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x", 573 cap->UserKey.Values[0], 574 cap->UserKey.Values[1], 575 cap->UserKey.Values[2], 576 cap->UserKey.Values[3], 577 cap->Type, /* short */ 578#ifdef ELAN_CAP_ELAN3 579 cap->cap_elan_type, /* char */ 580#else 581 cap->cap_spare, 582#endif 583 cap->LowContext, 584 cap->HighContext, 585 cap->cap_mycontext, 586 cap->LowNode, 587 cap->HighNode, 588 cap->RailMask); 589#elif HAVE_LIBELAN3 590 n = snprintf(s, len, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x", 591 cap->UserKey.Values[0], 592 cap->UserKey.Values[1], 593 cap->UserKey.Values[2], 594 cap->UserKey.Values[3], 595 cap->Type, /* short */ 596 cap->LowContext, 597 cap->HighContext, 598 cap->MyContext, 599 cap->LowNode, 600 cap->HighNode, 601 cap->Entries, 602 cap->RailMask); 603 604#else 605#error "Neither LIBELAN3 nor LIBELANCTRL defined!" 606#endif 607 608 if (n < 0 || n > strlen(s)) { 609 err("%p: qsw_encode_cap: string overflow\n"); 610 return -1; 611 } 612 return 0; 613} 614 615/* 616 * Convert cap->Bitmap to string. 
617 */ 618int qsw_encode_cap_bitmap(char *s, int len, ELAN_CAPABILITY * cap, int i) 619{ 620 int n; 621 622 if (sizeof(cap->Bitmap[0]) != sizeof(unsigned int)) { 623 err("%p: qsw_encode_cap_bitmap: Bitmap is unexpected size\n"); 624 return -1; 625 } 626 if ((sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0])) % 16 != 0) { 627 err("%p: qsw_encode_cap_bitmap: Bitmap is not mult of 16\n"); 628 return -1; 629 } 630 if (i < 0 || i >= (sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0]))) { 631 err("%p: qsw_encode_cap_bitmap: Bitmap index out of range\n"); 632 return -1; 633 } 634 n = snprintf(s, len, "%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x", 635 cap->Bitmap[i + 0], cap->Bitmap[i + 1], 636 cap->Bitmap[i + 2], cap->Bitmap[i + 3], 637 cap->Bitmap[i + 4], cap->Bitmap[i + 5], 638 cap->Bitmap[i + 6], cap->Bitmap[i + 7], 639 cap->Bitmap[i + 8], cap->Bitmap[i + 9], 640 cap->Bitmap[i + 10], cap->Bitmap[i + 11], 641 cap->Bitmap[i + 12], cap->Bitmap[i + 13], 642 cap->Bitmap[i + 14], cap->Bitmap[i + 15]); 643 if (n == -1 || n > strlen(s)) { 644 err("%p: qsw_encode_cap_bitmap: string overflow\n"); 645 return -1; 646 } 647 return 0; 648} 649 650/* 651 * Convert string to capability (all but cap->Bitmap). 
652 */ 653int qsw_decode_cap(char *s, ELAN_CAPABILITY * cap) 654{ 655 int n; 656 657#if HAVE_LIBELANCTRL 658 /* initialize capability */ 659 elan_nullcap(cap); 660 661 n = sscanf(s, "%x.%x.%x.%x.%hx.%hx.%x.%x.%x.%x.%x.%x", 662 &cap->UserKey.Values[0], 663 &cap->UserKey.Values[1], 664 &cap->UserKey.Values[2], 665 &cap->UserKey.Values[3], 666 &cap->cap_type, /* short */ 667# ifdef ELAN_CAP_ELAN3 668 &cap->cap_elan_type, /* char */ 669# else 670 &cap->cap_spare, /* unsigned short */ 671# endif 672 &cap->LowContext, 673 &cap->HighContext, 674 &cap->cap_mycontext, 675 &cap->LowNode, 676 &cap->HighNode, 677 &cap->RailMask); 678 679#elif HAVE_LIBELAN3 680 681 /* initialize capability */ 682 elan3_nullcap(cap); 683 684 /* fill in values sent from remote */ 685 n = sscanf(s, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x", 686 &cap->UserKey.Values[0], 687 &cap->UserKey.Values[1], 688 &cap->UserKey.Values[2], 689 &cap->UserKey.Values[3], 690 &cap->Type, /* short */ 691 &cap->LowContext, 692 &cap->HighContext, 693 &cap->MyContext, 694 &cap->LowNode, 695 &cap->HighNode, 696 &cap->Entries, 697 &cap->RailMask); 698#else 699# error "Neither LIBELANCTRL nor LIBELAN3 set!" 700#endif 701 702 if (n != 12) { 703 err("%p: qsw_decode_cap: scan error (%d of %d)\n", n, 12); 704 return -1; 705 } 706 return 0; 707} 708 709/* 710 * Convert string to cap->Bitmap. 
711 */ 712int qsw_decode_cap_bitmap(char *s, ELAN_CAPABILITY * cap, int i) 713{ 714 int n; 715 716 if (i < 0 || i >= sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0])) { 717 err("%p: qsw_decode_cap_bitmap: BitMap index out of range\n"); 718 return -1; 719 } 720 n = sscanf(s, "%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x", 721 &cap->Bitmap[i + 0], &cap->Bitmap[i + 1], 722 &cap->Bitmap[i + 2], &cap->Bitmap[i + 3], 723 &cap->Bitmap[i + 4], &cap->Bitmap[i + 5], 724 &cap->Bitmap[i + 6], &cap->Bitmap[i + 7], 725 &cap->Bitmap[i + 8], &cap->Bitmap[i + 9], 726 &cap->Bitmap[i + 10], &cap->Bitmap[i + 11], 727 &cap->Bitmap[i + 12], &cap->Bitmap[i + 13], 728 &cap->Bitmap[i + 14], &cap->Bitmap[i + 15]); 729 if (n != 16) { 730 err("%p: qsw_decode_cap_bitmap(%d): scan error\n", i); 731 return -1; 732 } 733 return 0; 734} 735 736/* 737 * string -> info 738 */ 739int qsw_decode_info(char *s, qsw_info_t * qi) 740{ 741 int n; 742 743 n = sscanf(s, "%x.%x.%x.%x.%x.%x", 744 &qi->prgnum, 745 &qi->rank, 746 &qi->nodeid, &qi->procid, &qi->nnodes, &qi->nprocs); 747 if (n != 6) { 748 err("%p: qsw_decode_info: scan error\n"); 749 return -1; 750 } 751 return 0; 752} 753 754/* 755 * info -> string 756 */ 757int qsw_encode_info(char *s, int len, qsw_info_t * qi) 758{ 759 int n; 760 761 n = snprintf(s, len, "%x.%x.%x.%x.%x.%x", 762 qi->prgnum, 763 qi->rank, qi->nodeid, qi->procid, qi->nnodes, qi->nprocs); 764 if (n == -1 || n > strlen(s)) { 765 err("%p: qsw_encode_info: string overflow\n"); 766 return -1; 767 } 768 return 0; 769} 770 771/* 772 * Generate a random program number. Normally these would be allocated, 773 * but since we have no persistant daemon, we settle for random. 774 * Must be called after qsw_init_capability (we seed lrand48 there). 
775 */ 776int qsw_get_prgnum(void) 777{ 778 int prgnum; 779 780 prgnum = lrand48() % (QSW_PRG_END - QSW_PRG_START + 1); 781 prgnum += QSW_PRG_START; 782 783 return prgnum; 784} 785 786/* 787 * Prepare a capability that will be passed to all the processes in a 788 * parallel program. 789 * Function returns a 0 on success, -1 = fail. 790 */ 791int 792qsw_init_capability(ELAN_CAPABILITY * cap, int nprocs, hostlist_t nodelist, 793 int cyclic_alloc, unsigned int railmask) 794{ 795 int i; 796 int num_nodes = hostlist_count(nodelist); 797 int procs_per_node = nprocs / num_nodes; 798 799 assert (railmask < QSW_RAILMASK_MAX); 800 801 srand48(getpid()); 802 803 /* 804 * Initialize for multi rail and either block or cyclic allocation. 805 * Set ELAN_CAP_TYPE_BROADCASTABLE later if appropriate. 806 */ 807#if HAVE_LIBELANCTRL 808 elan_nullcap(cap); 809#elif HAVE_LIBELAN3 810 elan3_nullcap(cap); 811#else 812# error 813#endif 814 815 if (cyclic_alloc) 816 cap->Type = ELAN_CAP_TYPE_CYCLIC; 817 else 818 cap->Type = ELAN_CAP_TYPE_BLOCK; 819 cap->Type |= ELAN_CAP_TYPE_MULTI_RAIL; 820 821 cap->RailMask = railmask; 822 823#if HAVE_LIBELANCTRL 824# ifdef ELAN_CAP_ELAN3 825 cap->cap_elan_type = ELAN_CAP_ELAN3; 826# else 827 cap->cap_spare = ELAN_CAP_UNINITIALISED; 828# endif 829#endif 830 831 /* 832 * UserKey is 128 bits of randomness which should be kept private. 833 */ 834 for (i = 0; i < 4; i++) 835 cap->UserKey.Values[i] = lrand48(); 836 837 /* 838 * Elan hardware context numbers must be unique per node. 839 * One is allocated to each parallel process. In order for processes 840 * on the same node to communicate, they must use contexts in the 841 * hi-lo range of a common capability. With pdsh we have no 842 * persistant daemon to allocate these, so we settle for a random one. 
843 */ 844 cap->LowContext = lrand48() % 845 (ELAN_USER_TOP_CONTEXT_NUM - 846 (ELAN_USER_BASE_CONTEXT_NUM + procs_per_node - 1) - 1); 847 cap->LowContext += ELAN_USER_BASE_CONTEXT_NUM; 848 cap->HighContext = cap->LowContext + procs_per_node - 1; 849 /* not necessary to initialize cap->MyContext */ 850 851 /* 852 * Describe the mapping of processes to nodes. 853 * This sets cap->HighNode, cap->LowNode, and cap->Bitmap. 854 */ 855 if (_setbitmap(nodelist, procs_per_node, cyclic_alloc, cap) < 0) { 856 err("%p: do all target nodes have an Elan adapter?\n"); 857 return -1; 858 } 859 860#if HAVE_LIBELAN3 861 /* 862 * Set cap->Entries and add broadcast bit to cap->type based on 863 * cap->HighNode and cap->LowNode values set above. 864 */ 865 cap->Entries = nprocs; 866 if (cap->Entries > ELAN_MAX_VPS) { 867 err("%p: program would have too many processes (max %d)\n", 868 ELAN_MAX_VPS); 869 return -1; 870 } 871#endif 872 873 /* 874 * As we now support segmented broadcast, always flag the capability 875 * as broadcastable. 876 */ 877 /*if (abs(cap->HighNode - cap->LowNode) == num_nodes - 1) */ 878 cap->Type |= ELAN_CAP_TYPE_BROADCASTABLE; 879 880 return 0; 881} 882 883static int 884_qsw_elan_nrails(ELAN_CAPABILITY * cap) 885{ 886#if HAVE_LIBELANCTRL 887 return elan_nrails (cap); 888#elif HAVE_LIBELAN3 889 return elan3_nrails (cap); 890#endif 891} 892 893 894static int 895_qsw_cap_create(ELAN_CAPABILITY * cap, int nrails) 896{ 897#if HAVE_LIBELANCTRL 898 ELANCTRL_HANDLE handle; 899 900 /* 901 * Open up the Elan control device so we can create 902 * a new capability. 903 */ 904 if (elanctrl_open(&handle) != 0) 905 errx("%p: elanctrl_open(): %m\n"); 906 907 /* Push capability into device driver */ 908 if (elanctrl_create_cap(handle, cap) < 0) 909 errx("%p: elanctrl_create_cap failed: %m\n"); 910 911 /* 912 * Do not close elanctrl handle here, this can cause 913 * MPI initialization to fail somehow. 
914 * 915 * elanctrl_close(handle); 916 */ 917 918#elif HAVE_LIBELAN3 919 int i, n = 0; 920 921 /* MULTI-RAIL: Create the capability in all rails */ 922 for (i = 0; (i < ELAN_MAX_RAILS) && (n < nrails); i++) { 923 void *handle; 924 925 if (!(cap->RailMask & (1 << i))) 926 continue; 927 928 /* 929 * Open up the control device so we can create a new 930 * capability. This will fail if we don't have rw 931 * access to /dev/elan3/control[i] 932 */ 933 if ((handle = elan3_control_open(i)) == NULL) 934 errx("%p: elan3_control_open(%d): %m\n", i); 935 936 /* Push capability into device driver */ 937 if (elan3_create(handle, cap) < 0) 938 errx("%p: elan3_create failed: %m\n"); 939 940 /* 941 * Do not close handle, for some reason this causes 942 * elan3_attach to return EINVAL... 943 * 944 * elan3_control_close(handle); 945 */ 946 947 n++; 948 949 } 950 951#endif /* HAVE_LIBELANCTRL */ 952 return (0); 953} 954 955/* 956 * Take necessary steps to set up to run an Elan MPI "program" 957 * (set of processes) on a node. 958 * 959 * Process 1 Process 2 | Process 3 960 * read args | 961 * fork ------- rms_prgcreate | 962 * waitpid elan3_create | 963 * rms_prgaddcap | 964 * fork N procs ---+------ rms_setcap 965 * wait all | setup RMS_ env 966 * | setuid, etc. 967 * | exec mpi process 968 * exit | 969 * rms_prgdestroy | 970 * exit | (one pair of processes per mpi proc!) 971 * 972 * Explanation of the two fork(2) calls: 973 * - The first fork is required because rms_prgdestroy can't occur in the 974 * process that calls rms_prgcreate (since it is a member, ECHILD). 975 * - The second fork is required when running multiple processes per node 976 * because each process must announce its use of one of the hw contexts 977 * in the range allocated in the capability. 
978 * 979 * One process: 980 * init-xinetd-+-in.qshd---in.qshd---in.qshd---sleep 981 * Two processes: 982 * init-xinetd-+-in.qshd---in.qshd---2*[in.qshd---sleep] 983 * (if stderr backchannel is active, add one in.qshd) 984 * 985 * Any errors result in a message on stderr and program exit. 986 */ 987void qsw_setup_program(ELAN_CAPABILITY * cap, qsw_info_t * qi, uid_t uid) 988{ 989 int pid; 990 int i; 991 int nrails; 992 int cpid[ELAN_MAX_VPS]; 993 int procs_per_node; 994 int proc_index; 995 996 if (qi->nprocs > ELAN_MAX_VPS) /* should catch this in client */ 997 errx("%p: too many processes requested\n"); 998 999 /* 1000 * First fork. Parent waits for child to terminate, then cleans up. 1001 */ 1002 pid = fork(); 1003 switch (pid) { 1004 case -1: /* error */ 1005 errx("%p: fork: %m\n"); 1006 case 0: /* child falls thru */ 1007 break; 1008 default: /* parent */ 1009 if (waitpid(pid, NULL, 0) < 0) 1010 errx("%p: waitpid: %m\n"); 1011 while (rms_prgdestroy(qi->prgnum) < 0) { 1012 if (errno != ECHILD) 1013 errx("%p: rms_prgdestroy: %m\n"); 1014 sleep(1); /* waitprg would be nice! */ 1015 } 1016 exit(0); 1017 } 1018 /* child continues here */ 1019 1020 nrails = _qsw_elan_nrails(cap); 1021 1022 /* associate this process and its children with prgnum */ 1023 if (rms_prgcreate(qi->prgnum, uid, 1) < 0) /* 1 cpu (bogus!) 
*/ 1024 errx("%p: rms_prgcreate %d failed: %m\n", qi->prgnum); 1025 1026 /* 1027 * Set up capability 1028 */ 1029 if (_qsw_cap_create(cap, nrails) < 0) 1030 errx("%p: unable to set up Elan capability\n"); 1031 1032 /* 1033 * Make cap known via rms_getcap/rms_ncaps 1034 * to members of this prgnum 1035 */ 1036 for (i = 0; i < nrails; i++) { 1037 if (rms_prgaddcap(qi->prgnum, i, cap) < 0) 1038 errx("%p: rms_prgaddcap failed: %m\n"); 1039 } 1040 1041 if (debug_syslog) { 1042 char tmpstr[1024]; 1043 1044 syslog(LOG_DEBUG, "prg %d cap %s bitmap 0x%.8x", qi->prgnum, 1045#if HAVE_LIBELANCTRL 1046 elan_capability_string(cap, tmpstr), 1047#elif HAVE_LIBELAN3 1048 elan3_capability_string(cap, tmpstr), 1049#endif 1050 cap->Bitmap[0]); 1051 } 1052 1053 /* 1054 * Second fork - once for each process. 1055 * Parent waits for all children to exit the it exits. 1056 * Child assigns hardware context to each process, then forks again... 1057 */ 1058 procs_per_node = qi->nprocs / qi->nnodes; 1059 for (proc_index = 0; proc_index < procs_per_node; proc_index++) { 1060 cpid[proc_index] = fork(); 1061 if (cpid[proc_index] < 0) 1062 errx("%p: fork (%d): %m\n", proc_index); 1063 else if (cpid[proc_index] == 0) 1064 break; 1065 } 1066 /* parent */ 1067 if (proc_index == procs_per_node) { 1068 int waiting = procs_per_node; 1069 int i; 1070 1071 while (waiting > 0) { 1072 pid = waitpid(0, NULL, 0); /* any in pgrp */ 1073 if (pid < 0) 1074 errx("%p: waitpid: %m\n"); 1075 for (i = 0; i < procs_per_node; i++) { 1076 if (cpid[i] == pid) 1077 waiting--; 1078 } 1079 } 1080 exit(0); 1081 } 1082 /* child falls through here */ 1083 /* proc_index will be set to the child's index */ 1084 1085 /* 1086 * Assign elan hardware context to current process. 1087 * - arg1 is an index into the kernel's list of caps for this 1088 * program desc (added by rms_prgaddcap). There will be 1089 * one per rail. 
1090 * - arg2 indexes the hw ctxt range in the capability 1091 * [cap->LowContext, cap->HighContext] 1092 */ 1093 for (i = 0; i < nrails; i++) { 1094 if (rms_setcap(i, proc_index) < 0) 1095 errx("%p: rms_setcap (%d): %m\n", proc_index); 1096 } 1097 1098 /* set RMS_ environment vars */ 1099 switch (cap->Type & ELAN_CAP_TYPE_MASK) { 1100 case ELAN_CAP_TYPE_BLOCK: 1101 qi->procid = (qi->nodeid * procs_per_node) + proc_index; 1102 break; 1103 case ELAN_CAP_TYPE_CYCLIC: 1104 qi->procid = qi->nodeid + (proc_index * qi->nnodes); 1105 break; 1106 default: 1107 errx("%p: unsupported Elan capability type\n"); 1108 } 1109 qi->rank = qi->procid; 1110 if (_rms_setenv(qi) < 0) 1111 errx("%p: failed to set environment variables: %m\n"); 1112 /* Exec the process... */ 1113} 1114 1115int qsw_prgsignal(int prgid, int signo) 1116{ 1117 return rms_prgsignal(prgid, signo); 1118} 1119 1120#ifdef TEST_MAIN 1121/* encode info, then decode and check that the result is what we started with */ 1122static void _verify_info_encoding(qsw_info_t * qi) 1123{ 1124 int err; 1125 char tmpstr[1024]; 1126 qsw_info_t qicpy; 1127 1128 err = qsw_encode_info(tmpstr, sizeof(tmpstr), qi); 1129 assert(err >= 0); 1130 err = qsw_decode_info(tmpstr, &qicpy); 1131 assert(memcmp(qi, &qicpy, sizeof(qicpy)) == 0); 1132} 1133 1134/* encode cap, then decode and check that the result is what we started with */ 1135static void _verify_cap_encoding(ELAN_CAPABILITY * cap) 1136{ 1137 ELAN_CAPABILITY capcpy; 1138 char tmpstr[1024]; 1139 int err; 1140 1141 err = qsw_encode_cap(tmpstr, sizeof(tmpstr), cap); 1142 assert(err >= 0); 1143 err = qsw_decode_cap(tmpstr, &capcpy); 1144 assert(err >= 0); 1145/*assert(ELAN_CAP_MATCH(&cap, &cap2)); *//* broken - see GNATS #3875 */ 1146 assert(memcmp(cap, &capcpy, sizeof(capcpy)) == 0); 1147} 1148 1149/* concatenate args into a single string */ 1150static void _strncatargs(char *buf, int len, int argc, char *argv[]) 1151{ 1152 if (len > 0) { 1153 buf[0] = '\0'; 1154 } 1155 while (len > 
1 && argc > 0) { 1156 strncat(buf, argv[0], len); 1157 argv++; 1158 argc--; 1159 if (argc > 0) 1160 strncat(buf, " ", len); 1161 } 1162 buf[len - 1] = '\0'; 1163} 1164 1165static void _usage(void) 1166{ 1167 errx("Usage %p [ -n procs ] [ -u uid ] command args...\n"); 1168} 1169 1170/* 1171 * Test program for qsw runtime routines. Run one or more processes locally, 1172 * e.g. for MPI ping test across shared memory: 1173 * qrun -n 2 -u 5588 mping 1 32768 1174 */ 1175int main(int argc, char *argv[]) 1176{ 1177 extern char *optarg; 1178 extern int optind; 1179 1180 char cmdbuf[1024]; 1181 ELAN_CAPABILITY cap; 1182 int c; 1183 char *p; 1184 uid_t uid = 0; 1185 hostlist_t wcoll = hostlist_create(""); 1186 char hostname[MAXHOSTNAMELEN]; 1187 qsw_info_t qinfo = { 1188 nnodes:1, 1189 nprocs:1, 1190 }; 1191 1192 err_init(xbasename(argv[0])); /* init err package */ 1193 1194 while ((c = getopt(argc, argv, "u:n:")) != EOF) { 1195 switch (c) { 1196 case 'u': 1197 uid = atoi(optarg); 1198 break; 1199 case 'n': 1200 qinfo.nprocs = atoi(optarg); 1201 break; 1202 default: 1203 _usage(); 1204 } 1205 } 1206 1207 argc -= optind; 1208 argv += optind; 1209 1210 if (argc == 0) 1211 _usage(); 1212 1213 /* prep arg for the shell */ 1214 _strncatargs(cmdbuf, sizeof(cmdbuf), argc, argv); 1215 1216 /* create working collective containing only this host */ 1217 if (gethostname(hostname, sizeof(hostname)) < 0) 1218 errx("%p: gethostname: %m\n"); 1219 if ((p = strchr(hostname, '.'))) 1220 *p = '\0'; 1221 hostlist_push(wcoll, hostname); 1222 1223 qsw_init(); 1224 1225 /* initialize capability for this "program" */ 1226 if (qsw_init_capability(&cap, qinfo.nprocs / qinfo.nnodes, wcoll, 0) < 0) 1227 errx("%p: failed to initialize Elan capability\n"); 1228 1229 /* assert encode/decode routines work (we don't use them here) */ 1230 _verify_info_encoding(&qinfo); 1231 _verify_cap_encoding(&cap); 1232 1233 /* generate random program number */ 1234 qinfo.prgnum = qsw_get_prgnum(); 1235 1236 /* set up 
capabilities, environment, fork, etc.. */ 1237 qsw_setup_program(&cap, &qinfo, uid); 1238 /* multiple threads continue on here (one per processes) */ 1239 1240 if (seteuid(uid) < 0) 1241 errx("%p: seteuid: %m\n"); 1242 err("%p: %d:%d executing /bin/bash -c %s\n", 1243 qinfo.prgnum, qinfo.procid, cmdbuf); 1244 execl("/bin/bash", "bash", "-c", cmdbuf, 0); 1245 errx("%p: exec of shell failed: %m\n"); 1246 1247 qsw_fini(); 1248 1249 exit(0); 1250} 1251#endif /* TEST_MAIN */ 1252 1253/* 1254 * vi:tabstop=4 shiftwidth=4 expandtab 1255 */