/src/modules/sdr.c
C | 555 lines | 365 code | 111 blank | 79 comment | 82 complexity | ab9f72e0815c7521e1b5cbdf893d0a3c MD5 | raw file
1/*****************************************************************************\ 2 * $Id$ 3 ***************************************************************************** 4 * Copyright (C) 2001-2006 The Regents of the University of California. 5 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 6 * Written by Jim Garlick <garlick@llnl.gov>. 7 * UCRL-CODE-2003-005. 8 * 9 * This file is part of Pdsh, a parallel remote shell program. 10 * For details, see <http://www.llnl.gov/linux/pdsh/>. 11 * 12 * Pdsh is free software; you can redistribute it and/or modify it under 13 * the terms of the GNU General Public License as published by the Free 14 * Software Foundation; either version 2 of the License, or (at your option) 15 * any later version. 16 * 17 * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY 18 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 19 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 20 * details. 21 * 22 * You should have received a copy of the GNU General Public License along 23 * with Pdsh; if not, write to the Free Software Foundation, Inc., 24 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 25\*****************************************************************************/ 26#if HAVE_CONFIG_H 27# include "config.h" 28#endif 29 30#include <assert.h> 31#include <string.h> 32 33#include "src/pdsh/wcoll.h" 34#include "src/pdsh/mod.h" 35#include "src/pdsh/xpopen.h" 36#include "src/common/xmalloc.h" 37#include "src/common/err.h" 38#include "src/common/xstring.h" 39 40#define SPACES "\t\n " 41 42#define LINEBUFSIZE 2048 43/* some handy SP constants */ 44/* NOTE: degenerate case of one node per frame, nodes would be 1, 17, 33,... */ 45#define MAX_SP_NODES 512 46#define MAX_SP_NODES_PER_FRAME 16 47#define MAX_SP_NODE_NUMBER (MAX_SP_NODES * MAX_SP_NODES_PER_FRAME - 1) 48 49#if STATIC_MODULES 50# define pdsh_module_info sdr_module_info 51# define pdsh_module_priority sdr_module_priority 52#endif 53 54int pdsh_module_priority = DEFAULT_MODULE_PRIORITY; 55 56static int sdr_init (void); 57static int sdr_exit (void); 58static hostlist_t read_sdr (opt_t *opt); 59static int sdr_postop (opt_t *); 60 61static int sdr_process_opt(opt_t *, int, char *); 62 63/* 64 * Export generic module functions 65 */ 66struct pdsh_module_operations sdr_module_ops = { 67 (ModInitF) sdr_init, 68 (ModExitF) sdr_exit, 69 (ModReadWcollF) read_sdr, 70 (ModPostOpF) sdr_postop, 71}; 72 73/* 74 * Export module options 75 */ 76struct pdsh_module_option sdr_module_options[] = 77 { { 'a', NULL, "target all nodes", 78 DSH | PCP, (optFunc) sdr_process_opt 79 }, 80 { 'v', NULL, "verify nodes are up using host/switch_responds", 81 DSH | PCP, (optFunc) sdr_process_opt 82 }, 83 { 'i', NULL, "translate to alternate/initial hostnames from SDR (if applicable)", 84 DSH | PCP, (optFunc) sdr_process_opt 85 }, 86 { 'G', NULL, "with -a, run on all SP partitions", 87 DSH | PCP, (optFunc) sdr_process_opt 88 }, 89 PDSH_OPT_TABLE_END 90 }; 91 92/* 93 * Sdr module info 94 */ 95struct pdsh_module pdsh_module_info = { 96 "misc", 97 "sdr", 98 "Jim Garlick <garlick@llnl.gov>", 99 "Support for SDR on IBM SP", 100 DSH | PCP, 101 102 &sdr_module_ops, 103 NULL, 104 &sdr_module_options[0], 105}; 106 107 108 109/* 110 * Data cache for SDR information. 111 * XXX: Hash by node number instead of leaving room for 112 * all possible nodes 113 */ 114struct sdr_info { 115 char *hostname; 116 char *reliable_hostname; 117 bool switch_responds; 118 bool host_responds; 119}; 120 121static bool sdr_initialized = false; 122static struct sdr_info * sdrcache[MAX_SP_NODE_NUMBER]; 123 124/* 125 * Global options 126 */ 127static bool allnodes = false; 128static bool altnames = false; 129static bool verify = false; 130static bool global = false; 131 132 133/* 134 * Required static forward declarations 135 */ 136static struct sdr_info * sdr_info_create (char *host, char *rhost); 137static void sdr_info_destroy (struct sdr_info *s); 138 139static hostlist_t _sdr_filter (hostlist_t hl, bool iopt, bool verify); 140static hostlist_t _sdr_wcoll(bool Gopt); 141static struct sdr_info * _find_node (const char *name, int *rhost); 142static hostlist_t _sdr_reliable_names (void); 143static void _sdr_getnames(bool Gopt); 144static void _sdr_getresp (bool Gopt); 145 146static char *_list_nth(List l, int n); 147 148/* 149 * module interface functions 150 */ 151 152static int sdr_init (void) 153{ 154 int i; 155 for (i = 0; i < MAX_SP_NODE_NUMBER; i++) 156 sdrcache[i] = NULL; 157 return (0); 158} 159 160static int sdr_exit (void) 161{ 162 int i; 163 for (i = 0; i < MAX_SP_NODE_NUMBER; i++) 164 sdr_info_destroy (sdrcache[i]); 165 return (0); 166} 167 168static int sdr_process_opt(opt_t *pdsh_opt, int opt, char *arg) 169{ 170 switch (opt) { 171 case 'a': 172 allnodes = true; 173 break; 174 case 'i': 175 altnames = true; 176 break; 177 case 'v': 178 verify = true; 179 break; 180 case 'G': 181 global = true; 182 break; 183 default: 184 errx("%p: badness factor high in sdr module\n"); 185 break; 186 } 187 return 0; 188} 189 190 191static hostlist_t read_sdr(opt_t *opt) 192{ 193 if (!allnodes) 194 return (NULL); 195 196 if (allnodes && opt->wcoll) 197 errx("%p: Do not specify -a with other node selection options\n"); 198 199 return _sdr_wcoll (global); 200} 201 202static int sdr_postop (opt_t *opt) 203{ 204 hostlist_t hl; 205 206 if (!verify && !altnames) 207 return (0); 208 209 if (!opt->wcoll || (hostlist_count (opt->wcoll) == 0)) 210 return (0); 211 212 if (!sdr_initialized) 213 _sdr_getnames (global); 214 215 if (verify) 216 _sdr_getresp (global); 217 218 hl = _sdr_filter (opt->wcoll, altnames, verify); 219 hostlist_destroy (opt->wcoll); 220 opt->wcoll = hl; 221 222 return (0); 223} 224 225/* 226 * Other functions 227 */ 228 229 230/* 231 * Get the wcoll from the SDR. 232 * Gopt (IN) pass -G to SDRGetObjects 233 * RETURN new list containing hostnames (reliable by default) 234 */ 235static hostlist_t _sdr_wcoll (bool Gopt) 236{ 237 /* 238 * Cache SDR reliable and initial hostnames 239 */ 240 _sdr_getnames (Gopt); 241 242 return _sdr_reliable_names (); 243} 244 245 246/* 247 * Filter hostlist `hl' using SDR attributes. 248 * iopt convert reliable hostnames to initial and vice versa. 249 * verify remove hosts that are not responding on the corresponding 250 * interface (i.e. switch for initial hostnames, eth otherwise) 251 * RETURN new list containing filtered hosts. 252 */ 253static hostlist_t _sdr_filter (hostlist_t hl, bool iopt, bool verify) 254{ 255 char *host = NULL; 256 hostlist_t new = hostlist_create (NULL); 257 hostlist_iterator_t i = hostlist_iterator_create (hl); 258 struct sdr_info *s = NULL; 259 260 while ((host = hostlist_next (i))) { 261 int r = 0; 262 263 if ((s = _find_node (host, &r)) == NULL) { 264 hostlist_push_host (new, host); 265 continue; 266 } 267 268 if (iopt) 269 r = !r; 270 271 if (!verify || (r ? s->host_responds : s->switch_responds)) 272 hostlist_push_host (new, r ? s->reliable_hostname : s->hostname); 273 274 free (host); 275 } 276 277 hostlist_iterator_destroy (i); 278 279 return (new); 280} 281 282static void _sdr_getswitchname(char *switchName, int len) 283{ 284 FILE *f; 285 List words; 286 char cmd[LINEBUFSIZE]; 287 char buf[LINEBUFSIZE]; 288 289 snprintf(cmd, sizeof(cmd), "%s -x Switch switch_number==1 switch_name", 290 _PATH_SDRGETOBJECTS); 291 f = xpopen(cmd, "r"); 292 if (f == NULL) 293 errx("%p: error running %s\n", _PATH_SDRGETOBJECTS); 294 while (fgets(buf, LINEBUFSIZE, f) != NULL) { 295 words = list_split(NULL, buf); 296 assert(list_count(words) == 1); 297 snprintf(switchName, len, _list_nth(words, 0)); 298 list_destroy(words); 299 } 300 xpclose(f); 301} 302 303static char * _sdr_switch_attr (int *numswitchplanes) 304{ 305 FILE *f; 306 List words; 307 char cmd[LINEBUFSIZE]; 308 char buf[LINEBUFSIZE]; 309 int n; 310 311 static char * attr[] = { 312 "switch_responds", 313 "switch_responds0", 314 "switch_responds0 switch_responds1" 315 }; 316 317 _sdr_getswitchname(buf, sizeof(buf)); 318 if (strcmp(buf, "SP_Switch2") != 0) { 319 *numswitchplanes = 1; 320 return (attr[0]); 321 } 322 323 snprintf(cmd, sizeof(cmd), "%s -x SP number_switch_planes", 324 _PATH_SDRGETOBJECTS); 325 326 f = xpopen(cmd, "r"); 327 328 if (f == NULL) 329 errx("%p: error running %s\n", _PATH_SDRGETOBJECTS); 330 while (fgets(buf, LINEBUFSIZE, f) != NULL) { 331 words = list_split(NULL, buf); 332 assert(list_count(words) == 1); 333 n = atoi(_list_nth(words, 0)); 334 list_destroy(words); 335 } 336 if (xpclose(f) != 0) 337 err("%p: nonzero return code from %s\n", _PATH_SDRGETOBJECTS); 338 339 *numswitchplanes = n; 340 return (attr[n]); 341} 342 343static void _sdr_cache_hresp_line (char *buf) 344{ 345 List words = NULL; 346 int nn = -1; 347 348 words = list_split (NULL, buf); 349 assert(list_count (words) == 2); 350 351 nn = atoi (_list_nth (words, 0)); 352 assert (nn >= 0 && nn <= MAX_SP_NODE_NUMBER); 353 /* 354 * Ignore host_responds for hosts without node information 355 */ 356 if (sdrcache[nn] != NULL) 357 sdrcache[nn]->host_responds = (atoi (_list_nth (words, 1)) == 1); 358 359 return; 360} 361 362static void _sdr_cache_sresp_line (char *buf, int switchplanes) 363{ 364 List words = NULL; 365 int nn = -1; 366 struct sdr_info *s; 367 368 words = list_split (NULL, buf); 369 assert(list_count (words) == (1 + switchplanes)); 370 371 nn = atoi (_list_nth (words, 0)); 372 assert (nn >= 0 && nn <= MAX_SP_NODE_NUMBER); 373 assert (sdrcache[nn] != NULL); 374 375 s = sdrcache[nn]; 376 377 s->switch_responds = (atoi(_list_nth(words, 1)) == 1); 378 if (switchplanes == 2) 379 s->switch_responds = s->switch_responds || (atoi(_list_nth (words, 1))); 380 381 return; 382} 383 384static void _sdr_cache_name_line (char *buf) 385{ 386 char *name = NULL; 387 char *rname = NULL; 388 List words = NULL; 389 int nn = -1; 390 char *p; 391 392 words = list_split (NULL, buf); 393 assert (list_count(words) == 3); 394 395 nn = atoi (_list_nth (words, 0)); 396 assert (nn >= 0 && nn <= MAX_SP_NODE_NUMBER); 397 398 name = _list_nth (words, 1); 399 rname = _list_nth (words, 2); 400 401 if ((p = strchr (name, '.'))) 402 *p = '\0'; 403 404 if ((p = strchr (rname, '.'))) 405 *p = '\0'; 406 407 sdrcache[nn] = sdr_info_create (name, rname); 408 409 list_destroy (words); 410 return; 411} 412 413static void _sdr_getresp (bool Gopt) 414{ 415 FILE *f; 416 char cmd[LINEBUFSIZE]; 417 char buf[LINEBUFSIZE]; 418 int nswitchplanes; 419 420 snprintf (cmd, sizeof(cmd), 421 "%s %s -x host_responds node_number host_responds", 422 _PATH_SDRGETOBJECTS, Gopt ? "-G" : ""); 423 424 if ((f = xpopen (cmd, "r")) == NULL) 425 errx("%p: error running %s\n", _PATH_SDRGETOBJECTS); 426 427 while (fgets (buf, LINEBUFSIZE, f) != NULL) 428 _sdr_cache_hresp_line (buf); 429 430 snprintf (cmd, sizeof(cmd), 431 "%s %s -x switch_responds node_number %s", 432 _PATH_SDRGETOBJECTS, Gopt ? "-G" : "", 433 _sdr_switch_attr (&nswitchplanes)); 434 435 if ((f = xpopen (cmd, "r")) == NULL) 436 errx("%p: error running %s\n", _PATH_SDRGETOBJECTS); 437 438 while (fgets (buf, LINEBUFSIZE, f) != NULL) 439 _sdr_cache_sresp_line (buf, nswitchplanes); 440 441 xpclose (f); 442 443 return; 444} 445 446/* 447 * Query the SDR for hostnames of all nodes and return the results in an 448 * array indexed by node number. 449 * Gopt (IN) pass -G to SDRGetObjects 450 */ 451static void _sdr_getnames(bool Gopt) 452{ 453 FILE *f; 454 char cmd[LINEBUFSIZE]; 455 char buf[LINEBUFSIZE]; 456 457 snprintf (cmd, sizeof(cmd), 458 "%s %s -x Node node_number initial_hostname reliable_hostname", 459 _PATH_SDRGETOBJECTS, Gopt ? "-G" : ""); 460 461 if ((f = xpopen (cmd, "r")) == NULL) 462 errx("%p: error running %s\n", _PATH_SDRGETOBJECTS); 463 464 while (fgets (buf, LINEBUFSIZE, f) != NULL) 465 _sdr_cache_name_line (buf); 466 467 xpclose(f); 468 469 sdr_initialized = true; 470} 471 472static hostlist_t _sdr_reliable_names () 473{ 474 hostlist_t hl = hostlist_create (NULL); 475 int i; 476 477 for (i = 0; i < MAX_SP_NODE_NUMBER; i++) { 478 if (sdrcache[i] != NULL) 479 hostlist_push_host (hl, sdrcache[i]->reliable_hostname); 480 } 481 482 return (hl); 483} 484 485static char *_list_nth(List l, int n) 486{ 487 int i = 0; 488 char *name = NULL; 489 ListIterator itr = list_iterator_create(l); 490 491 while ((name = list_next(itr))) { 492 if (i++ == n) break; 493 } 494 list_iterator_destroy(itr); 495 return name; 496} 497 498static struct sdr_info * sdr_info_create (char *host, char *rhost) 499{ 500 struct sdr_info *s = Malloc (sizeof (*s)); 501 502 s->hostname = Strdup (host); 503 s->reliable_hostname = Strdup (rhost); 504 505 s->host_responds = false; 506 s->switch_responds = false; 507 508 return (s); 509} 510 511static void sdr_info_destroy (struct sdr_info *s) 512{ 513 if (s == NULL) 514 return; 515 516 if (s->hostname) 517 Free ((void **) &s->hostname); 518 if (s->reliable_hostname) 519 Free ((void **) &s->reliable_hostname); 520 521 Free ((void **) &s); 522 523 return; 524} 525 526static struct sdr_info * _find_node (const char *name, int *rhost) 527{ 528 int i; 529 530 for (i = 0; i < MAX_SP_NODE_NUMBER; i++) { 531 struct sdr_info *s = sdrcache[i]; 532 533 if (s == NULL) 534 continue; 535 536 if (strncmp (name, s->reliable_hostname, 537 strlen (s->reliable_hostname)) == 0) { 538 if (rhost != NULL) 539 *rhost = 1; 540 return (s); 541 } 542 543 if (strncmp (name, s->hostname, strlen (s->hostname)) == 0) { 544 if (rhost != NULL) 545 *rhost = 0; 546 return (s); 547 } 548 } 549 550 return (NULL); 551} 552 553/* 554 * vi: tabstop=4 shiftwidth=4 expandtab 555 */