/src/qsnet/qswutil.c

https://code.google.com/ · C · 1255 lines · 777 code · 193 blank · 285 comment · 140 complexity · 7f29dfa25e83ff0e29c0fa2809e4130d MD5 · raw file

  1. /*****************************************************************************\
  2. * $Id$
  3. *****************************************************************************
  4. * Copyright (C) 2001-2006 The Regents of the University of California.
  5. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  6. * Written by Jim Garlick <garlick@llnl.gov>.
  7. * UCRL-CODE-2003-005.
  8. *
  9. * This file is part of Pdsh, a parallel remote shell program.
  10. * For details, see <http://www.llnl.gov/linux/pdsh/>.
  11. *
  12. * Pdsh is free software; you can redistribute it and/or modify it under
  13. * the terms of the GNU General Public License as published by the Free
  14. * Software Foundation; either version 2 of the License, or (at your option)
  15. * any later version.
  16. *
  17. * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
  18. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  19. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  20. * details.
  21. *
  22. * You should have received a copy of the GNU General Public License along
  23. * with Pdsh; if not, write to the Free Software Foundation, Inc.,
  24. * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
  25. \*****************************************************************************/
  26. #if HAVE_CONFIG_H
  27. #include "config.h"
  28. #endif
  29. #include <stdio.h>
  30. #include <sys/param.h>
  31. #include <sys/types.h>
  32. #include <sys/stat.h>
  33. #include <sys/wait.h>
  34. #include <syslog.h>
  35. #include <errno.h>
  36. #include <string.h>
  37. #include <paths.h>
  38. #include <stdarg.h>
  39. #include <ctype.h>
  40. #include <assert.h>
  41. #include <stdlib.h>
  42. #include <unistd.h>
  43. #include <limits.h> /* INT_MAX */
  44. #include <pthread.h>
  45. #if HAVE_LIBELANCTRL
  46. # include <elan/elanctrl.h>
  47. # include <elan/capability.h>
  48. # define HighNode cap_highnode
  49. # define LowNode cap_lownode
  50. # define HighContext cap_highcontext
  51. # define LowContext cap_lowcontext
  52. # define Bitmap cap_bitmap
  53. # define Type cap_type
  54. # define UserKey cap_userkey
  55. # define RailMask cap_railmask
  56. # define Values key_values
  57. /* We need these using the old libelan3 library calls
  58. * so we redefine them to old values here.
  59. * XXX: What is the equivalent for libelanctrl?
  60. */
  61. # define ELAN_USER_BASE_CONTEXT_NUM 0x020
  62. # define ELAN_USER_TOP_CONTEXT_NUM 0x7ff
  63. #include <sys/stat.h>
  64. #elif HAVE_LIBELAN3
  65. # include <elan3/elan3.h>
  66. # include <elan3/elanvp.h>
  67. #else
  68. # error "Need either libelan3 or libelanctrl to compile this module."
  69. #endif
  70. #include <rms/rmscall.h>
  71. #include <dlfcn.h>
  72. #include <elanhosts.h>
  73. #include "src/common/xmalloc.h"
  74. #include "src/common/xstring.h"
  75. #include "src/common/hostlist.h"
  76. #include "src/common/list.h"
  77. #include "src/common/err.h"
  78. #include "qswutil.h"
  79. /* we will allocate program descriptions in this range */
  80. /* XXX note: do not start at zero as libelan shifts to get unique shm id */
  81. #define QSW_PRG_START 1
  82. #define QSW_PRG_END INT_MAX
  83. static int debug_syslog = 1; /* syslog program setup at LOG_DEBUG level */
  84. /*
  85. * Static "Elan Host" configuration
  86. */
  87. static elanhost_config_t elanconf = NULL;
  88. /*
  89. * Static function prototypes:
  90. */
  91. static int _set_elan_ids(elanhost_config_t ec);
  92. static void *neterr_thr(void *arg);
  93. int qsw_init(void)
  94. {
  95. assert(elanconf == NULL);
  96. elanconf = elanhost_config_create();
  97. if (elanhost_config_read(elanconf, NULL) < 0) {
  98. err("%p: error: %s\n", elanhost_config_err(elanconf));
  99. return -1;
  100. }
  101. return 0;
  102. }
  103. void qsw_fini(void)
  104. {
  105. elanhost_config_destroy(elanconf);
  106. }
  107. static int qsw_have_elan3(void)
  108. {
  109. #if HAVE_LIBELAN3
  110. return (1);
  111. #else
  112. struct stat st;
  113. if (stat("/proc/qsnet/elan3/device0", &st) < 0)
  114. return (0);
  115. return (1);
  116. #endif /* HAVE_LIBELAN3 */
  117. return (0);
  118. }
  119. struct neterr_args {
  120. pthread_mutex_t *mutex;
  121. pthread_cond_t *cond;
  122. int neterr_rc;
  123. };
  124. int qsw_spawn_neterr_thr(void)
  125. {
  126. struct neterr_args args;
  127. pthread_attr_t attr;
  128. pthread_t neterr_tid;
  129. pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
  130. pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  131. /*
  132. * Only need to run neterr thread on Elan3 HW.
  133. */
  134. if (!qsw_have_elan3())
  135. return (0);
  136. args.mutex = &mutex;
  137. args.cond = &cond;
  138. if ((errno = pthread_attr_init(&attr)))
  139. errx("%p: pthread_attr_init: %m\n");
  140. errno = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
  141. if (errno)
  142. err("%p: pthread_attr_setdetachstate: %m");
  143. pthread_mutex_lock(&mutex);
  144. if ((errno = pthread_create(&neterr_tid, &attr, neterr_thr, &args)))
  145. return -1;
  146. /*
  147. * Wait for successful startup of neterr resolver thread before
  148. * returning control to main thread.
  149. */
  150. pthread_cond_wait(&cond, &mutex);
  151. pthread_mutex_unlock(&mutex);
  152. return args.neterr_rc;
  153. return 0;
  154. }
  155. /*
  156. * Use dlopen () for libelan3.so (when needed)
  157. * This allows us to build a single version of the qsnet modules
  158. * for Elan3 and Elan4 QsNetII systems.
  159. */
  160. /*
  161. * libelan3.so handle:
  162. */
  163. static void * elan3h = NULL;
  164. /*
  165. * Wrapper functions for needed libelan3 functions
  166. */
  167. static int _elan3_init_neterr_svc (int dbglvl)
  168. {
  169. static int (*init_svc) (int);
  170. if (!(init_svc = dlsym (elan3h, "elan3_init_neterr_svc")))
  171. return (0);
  172. return (init_svc (dbglvl));
  173. }
  174. static int _elan3_register_neterr_svc (void)
  175. {
  176. static int (*reg_svc) (void);
  177. if (!(reg_svc = dlsym (elan3h, "elan3_register_neterr_svc")))
  178. return (0);
  179. return (reg_svc ());
  180. }
  181. static int _elan3_run_neterr_svc (void)
  182. {
  183. static int (*run_svc) ();
  184. if (!(run_svc = dlsym (elan3h, "elan3_run_neterr_svc")))
  185. return (0);
  186. return (run_svc ());
  187. }
  188. static int _elan3_load_neterr_svc (int i, char *host)
  189. {
  190. static int (*load_svc) (int, char *);
  191. if (!(load_svc = dlsym (elan3h, "elan3_load_neterr_svc")))
  192. return (0);
  193. return (load_svc (i, host));
  194. }
  195. static int
  196. _set_elan_ids(elanhost_config_t ec)
  197. {
  198. int i;
  199. for (i = 0; i <= elanhost_config_maxid(ec); i++) {
  200. char *host = elanhost_elanid2host(ec, ELANHOST_EIP, i);
  201. if (!host)
  202. continue;
  203. if (_elan3_load_neterr_svc(i, host) < 0)
  204. err("%p: elan3_load_neterr_svc(%d, %s): %m", i, host);
  205. }
  206. return 0;
  207. }
  208. static void *neterr_thr(void *arg)
  209. {
  210. struct neterr_args *args = arg;
  211. if (!(elan3h = dlopen ("libelan3.so", RTLD_LAZY))) {
  212. syslog(LOG_ERR, "unable to open libelan3.so: %s", dlerror());
  213. goto fail;
  214. }
  215. if (!_elan3_init_neterr_svc(0)) {
  216. syslog(LOG_ERR, "elan3_init_neterr_svc: %m");
  217. goto fail;
  218. }
  219. /*
  220. * Attempt to register the neterr svc thread. If the address
  221. * cannot be bound, then there is already a thread running, and
  222. * we should just exit with success.
  223. */
  224. if (!_elan3_register_neterr_svc()) {
  225. if (errno != EADDRINUSE) {
  226. syslog(LOG_ERR, "elan3_register_neterr_svc: %m");
  227. goto fail;
  228. }
  229. /* error resolver already running, just return */
  230. goto done;
  231. }
  232. /*
  233. * Attempt to register elan ids with kernel if we successfully
  234. * registered the error resolver service.
  235. */
  236. _set_elan_ids(elanconf);
  237. done:
  238. /*
  239. * Signal main thread that we've successfully initialized
  240. */
  241. pthread_mutex_lock(args->mutex);
  242. args->neterr_rc = 0;
  243. pthread_cond_signal(args->cond);
  244. pthread_mutex_unlock(args->mutex);
  245. /*
  246. * Run the network error resolver thread. This should
  247. * never return. If it does, there's not much we can do
  248. * about it.
  249. */
  250. _elan3_run_neterr_svc();
  251. return NULL;
  252. fail:
  253. pthread_mutex_lock(args->mutex);
  254. args->neterr_rc = -1;
  255. pthread_cond_signal(args->cond);
  256. pthread_mutex_unlock(args->mutex);
  257. return NULL;
  258. }
  259. static void
  260. _free_it (void *item)
  261. {
  262. Free((void **) &item);
  263. }
  264. static List
  265. _hostlist_to_elanids (hostlist_t nodelist)
  266. {
  267. char *host = NULL;
  268. List l = list_create ((ListDelF) _free_it);
  269. hostlist_iterator_t i = hostlist_iterator_create (nodelist);
  270. if (l == NULL)
  271. errx ("%p: list_create: %m");
  272. if (i == NULL)
  273. errx ("%p: hostlist_iterator_create: %m");
  274. while ((host = hostlist_next (i))) {
  275. int *id = Malloc (sizeof(int));
  276. if ((*id = elanhost_host2elanid (elanconf, host)) < 0) {
  277. err ("%p: Unable to get ElanId for \"%s\": %s\n",
  278. host, elanhost_config_err (elanconf));
  279. goto fail;
  280. }
  281. list_append (l, id);
  282. free (host);
  283. }
  284. hostlist_iterator_destroy (i);
  285. return (l);
  286. fail:
  287. if (host != NULL)
  288. free (host);
  289. if (i != NULL)
  290. hostlist_iterator_destroy (i);
  291. if (l != NULL)
  292. list_destroy (l);
  293. return (NULL);
  294. }
  295. static int
  296. _elanid_min (List el)
  297. {
  298. int *id;
  299. int min = -1;
  300. ListIterator i = list_iterator_create (el);
  301. while ((id = list_next (i))) {
  302. if ((*id < min) || (min == -1))
  303. min = *id;
  304. }
  305. list_iterator_destroy (i);
  306. return (min);
  307. }
  308. static int
  309. _elanid_max (List el)
  310. {
  311. int *id;
  312. int max = -1;
  313. ListIterator i = list_iterator_create (el);
  314. while ((id = list_next (i))) {
  315. if ((*id > max) || (max == -1))
  316. max = *id;
  317. }
  318. list_iterator_destroy (i);
  319. return (max);
  320. }
  321. /*
  322. * Given a list of hostnames and the number of processes per node,
  323. * set the correct bits in the capability's bitmap and set high and
  324. * low node id's.
  325. */
  326. static int
  327. _setbitmap(hostlist_t nodelist, int procs_per_node, int cyclic,
  328. ELAN_CAPABILITY * cap)
  329. {
  330. int *id;
  331. int nodes_in_bitmap;
  332. int rc = 0;
  333. List el;
  334. ListIterator itr;
  335. if (!(el = _hostlist_to_elanids (nodelist)))
  336. return (-1);
  337. cap->HighNode = _elanid_max (el);
  338. cap->LowNode = _elanid_min (el);
  339. if (cap->HighNode == -1 || cap->LowNode == -1)
  340. return -1;
  341. nodes_in_bitmap = cap->HighNode - cap->LowNode + 1;
  342. /*
  343. * There are (procs_per_node * nnodes) significant bits in the mask,
  344. * each representing a process slot. Bits are off where for holes
  345. * corresponding to process slots for unallocated nodes.
  346. * For example, if nodes 4 and 6 are running two processes per node,
  347. * bits 0,1 (corresponding to the two processes on node 4) and bits 4,5
  348. * (corresponding to the two processes running no node 6) are set.
  349. *
  350. * Note that for QsNet, the bits have a different meaning depending
  351. * on whether the capability distribution type is cyclic or block.
  352. * For block distribution, the bits are laid out in node-major
  353. * format, while for cyclic distribution, a procid (or context) major
  354. * format is used.
  355. *
  356. * Example: 2 processes per node on nodes 0,2:
  357. *
  358. * block cyclic
  359. *
  360. * 2 | 1 | 0 NodeId 2 1 0 | 2 1 0
  361. * | | |
  362. * 1 0 | 1 0 | 1 0 ContextId 1 | 0
  363. * | | |
  364. * 5 4 | 3 2 | 1 0 Bit Numbers 5 4 3 | 2 1 0
  365. * | | |
  366. * ---- +-----+----- -------+-------
  367. * 1 1 | 0 0 | 1 1 Bit Value 1 0 1 | 1 0 1
  368. */
  369. itr = list_iterator_create (el);
  370. while ((id = list_next (itr))) {
  371. int node = (*id) - cap->LowNode; /* relative id w/in bitmap */
  372. int i;
  373. for (i = 0; i < procs_per_node; i++) {
  374. int bit;
  375. if (cyclic)
  376. bit = (i * nodes_in_bitmap) + node;
  377. else
  378. bit = (node * (procs_per_node)) + i;
  379. if (bit >= (sizeof (cap->Bitmap) * 8)) {
  380. err ("%p: _setbitmap: bit %d out of range\n", bit);
  381. rc = -1;
  382. break;
  383. }
  384. BT_SET(cap->Bitmap, bit);
  385. }
  386. }
  387. list_destroy (el);
  388. return (rc);
  389. }
  390. /*
  391. * Set a variable in the callers environment. Args are printf style.
  392. * XXX Space is allocated on the heap and will never be reclaimed.
  393. * Example: setenvf("RMS_RANK=%d", rank);
  394. */
  395. static int _setenvf(const char *fmt, ...)
  396. {
  397. va_list ap;
  398. char buf[BUFSIZ];
  399. char *bufcpy;
  400. va_start(ap, fmt);
  401. vsnprintf(buf, sizeof(buf), fmt, ap);
  402. va_end(ap);
  403. bufcpy = strdup(buf);
  404. if (bufcpy == NULL)
  405. return -1;
  406. return putenv(bufcpy);
  407. }
  408. static int _rms_setenv(qsw_info_t * qi)
  409. {
  410. /* MPI wants some of these ...
  411. * (It doesn't anymore, but they are helpful when running
  412. * parallel scripts - ashley@quadrics.com )
  413. */
  414. if (_setenvf("RMS_RANK=%d", qi->rank) < 0)
  415. return -1;
  416. if (_setenvf("RMS_NODEID=%d", qi->nodeid) < 0)
  417. return -1;
  418. if (_setenvf("RMS_PROCID=%d", qi->procid) < 0)
  419. return -1;
  420. if (_setenvf("RMS_NNODES=%d", qi->nnodes) < 0)
  421. return -1;
  422. if (_setenvf("RMS_NPROCS=%d", qi->nprocs) < 0)
  423. return -1;
  424. if (_setenvf("ELAN_AUTO=pdsh") < 0)
  425. return -1;
  426. if (_setenvf("ELAN_JOBID=%d", qi->prgnum) < 0)
  427. return -1;
  428. #if 0
  429. /* I'm not sure what this should be set to yet,
  430. * libelan will do the right thing if it's not
  431. * set though. (ashley@quadrics.com) */
  432. if (_setenvf("LIBELAN_SHMKEY=%d", qi->prgnum) < 0)
  433. return -1;
  434. #endif
  435. return 0;
  436. }
  437. /*
  438. * Return the number of times qsw_encode_cap_bitamp/qsw_decode_cap_bitmap
  439. * must be called.
  440. */
  441. int qsw_cap_bitmap_count(void)
  442. {
  443. ELAN_CAPABILITY cap;
  444. int count = sizeof(cap.Bitmap) / sizeof(cap.Bitmap[0]);
  445. assert(count % 16 == 0);
  446. return count;
  447. }
  448. /*
  449. * Convert capability (all but cap->Bitmap) to string.
  450. */
  451. int qsw_encode_cap(char *s, int len, ELAN_CAPABILITY * cap)
  452. {
  453. int n;
  454. if (sizeof(cap->UserKey.Values[0]) != 4) {
  455. err("%p: qsw_encode_cap: UserKey is unexpected size\n");
  456. return -1;
  457. }
  458. if (sizeof(cap->UserKey) / 4 != 4) {
  459. err("%p: qsw_encode_cap: UserKey array is unexpected size\n");
  460. return -1;
  461. }
  462. #if HAVE_LIBELANCTRL
  463. cap->cap_spare = ELAN_CAP_UNINITIALISED ;
  464. n = snprintf(s, len, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x",
  465. cap->UserKey.Values[0],
  466. cap->UserKey.Values[1],
  467. cap->UserKey.Values[2],
  468. cap->UserKey.Values[3],
  469. cap->Type, /* short */
  470. #ifdef ELAN_CAP_ELAN3
  471. cap->cap_elan_type, /* char */
  472. #else
  473. cap->cap_spare,
  474. #endif
  475. cap->LowContext,
  476. cap->HighContext,
  477. cap->cap_mycontext,
  478. cap->LowNode,
  479. cap->HighNode,
  480. cap->RailMask);
  481. #elif HAVE_LIBELAN3
  482. n = snprintf(s, len, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x",
  483. cap->UserKey.Values[0],
  484. cap->UserKey.Values[1],
  485. cap->UserKey.Values[2],
  486. cap->UserKey.Values[3],
  487. cap->Type, /* short */
  488. cap->LowContext,
  489. cap->HighContext,
  490. cap->MyContext,
  491. cap->LowNode,
  492. cap->HighNode,
  493. cap->Entries,
  494. cap->RailMask);
  495. #else
  496. #error "Neither LIBELAN3 nor LIBELANCTRL defined!"
  497. #endif
  498. if (n < 0 || n > strlen(s)) {
  499. err("%p: qsw_encode_cap: string overflow\n");
  500. return -1;
  501. }
  502. return 0;
  503. }
  504. /*
  505. * Convert cap->Bitmap to string.
  506. */
  507. int qsw_encode_cap_bitmap(char *s, int len, ELAN_CAPABILITY * cap, int i)
  508. {
  509. int n;
  510. if (sizeof(cap->Bitmap[0]) != sizeof(unsigned int)) {
  511. err("%p: qsw_encode_cap_bitmap: Bitmap is unexpected size\n");
  512. return -1;
  513. }
  514. if ((sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0])) % 16 != 0) {
  515. err("%p: qsw_encode_cap_bitmap: Bitmap is not mult of 16\n");
  516. return -1;
  517. }
  518. if (i < 0 || i >= (sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0]))) {
  519. err("%p: qsw_encode_cap_bitmap: Bitmap index out of range\n");
  520. return -1;
  521. }
  522. n = snprintf(s, len, "%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x",
  523. cap->Bitmap[i + 0], cap->Bitmap[i + 1],
  524. cap->Bitmap[i + 2], cap->Bitmap[i + 3],
  525. cap->Bitmap[i + 4], cap->Bitmap[i + 5],
  526. cap->Bitmap[i + 6], cap->Bitmap[i + 7],
  527. cap->Bitmap[i + 8], cap->Bitmap[i + 9],
  528. cap->Bitmap[i + 10], cap->Bitmap[i + 11],
  529. cap->Bitmap[i + 12], cap->Bitmap[i + 13],
  530. cap->Bitmap[i + 14], cap->Bitmap[i + 15]);
  531. if (n == -1 || n > strlen(s)) {
  532. err("%p: qsw_encode_cap_bitmap: string overflow\n");
  533. return -1;
  534. }
  535. return 0;
  536. }
  537. /*
  538. * Convert string to capability (all but cap->Bitmap).
  539. */
  540. int qsw_decode_cap(char *s, ELAN_CAPABILITY * cap)
  541. {
  542. int n;
  543. #if HAVE_LIBELANCTRL
  544. /* initialize capability */
  545. elan_nullcap(cap);
  546. n = sscanf(s, "%x.%x.%x.%x.%hx.%hx.%x.%x.%x.%x.%x.%x",
  547. &cap->UserKey.Values[0],
  548. &cap->UserKey.Values[1],
  549. &cap->UserKey.Values[2],
  550. &cap->UserKey.Values[3],
  551. &cap->cap_type, /* short */
  552. # ifdef ELAN_CAP_ELAN3
  553. &cap->cap_elan_type, /* char */
  554. # else
  555. &cap->cap_spare, /* unsigned short */
  556. # endif
  557. &cap->LowContext,
  558. &cap->HighContext,
  559. &cap->cap_mycontext,
  560. &cap->LowNode,
  561. &cap->HighNode,
  562. &cap->RailMask);
  563. #elif HAVE_LIBELAN3
  564. /* initialize capability */
  565. elan3_nullcap(cap);
  566. /* fill in values sent from remote */
  567. n = sscanf(s, "%x.%x.%x.%x.%hx.%x.%x.%x.%x.%x.%x.%x",
  568. &cap->UserKey.Values[0],
  569. &cap->UserKey.Values[1],
  570. &cap->UserKey.Values[2],
  571. &cap->UserKey.Values[3],
  572. &cap->Type, /* short */
  573. &cap->LowContext,
  574. &cap->HighContext,
  575. &cap->MyContext,
  576. &cap->LowNode,
  577. &cap->HighNode,
  578. &cap->Entries,
  579. &cap->RailMask);
  580. #else
  581. # error "Neither LIBELANCTRL nor LIBELAN3 set!"
  582. #endif
  583. if (n != 12) {
  584. err("%p: qsw_decode_cap: scan error (%d of %d)\n", n, 12);
  585. return -1;
  586. }
  587. return 0;
  588. }
  589. /*
  590. * Convert string to cap->Bitmap.
  591. */
  592. int qsw_decode_cap_bitmap(char *s, ELAN_CAPABILITY * cap, int i)
  593. {
  594. int n;
  595. if (i < 0 || i >= sizeof(cap->Bitmap) / sizeof(cap->Bitmap[0])) {
  596. err("%p: qsw_decode_cap_bitmap: BitMap index out of range\n");
  597. return -1;
  598. }
  599. n = sscanf(s, "%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x.%x",
  600. &cap->Bitmap[i + 0], &cap->Bitmap[i + 1],
  601. &cap->Bitmap[i + 2], &cap->Bitmap[i + 3],
  602. &cap->Bitmap[i + 4], &cap->Bitmap[i + 5],
  603. &cap->Bitmap[i + 6], &cap->Bitmap[i + 7],
  604. &cap->Bitmap[i + 8], &cap->Bitmap[i + 9],
  605. &cap->Bitmap[i + 10], &cap->Bitmap[i + 11],
  606. &cap->Bitmap[i + 12], &cap->Bitmap[i + 13],
  607. &cap->Bitmap[i + 14], &cap->Bitmap[i + 15]);
  608. if (n != 16) {
  609. err("%p: qsw_decode_cap_bitmap(%d): scan error\n", i);
  610. return -1;
  611. }
  612. return 0;
  613. }
  614. /*
  615. * string -> info
  616. */
  617. int qsw_decode_info(char *s, qsw_info_t * qi)
  618. {
  619. int n;
  620. n = sscanf(s, "%x.%x.%x.%x.%x.%x",
  621. &qi->prgnum,
  622. &qi->rank,
  623. &qi->nodeid, &qi->procid, &qi->nnodes, &qi->nprocs);
  624. if (n != 6) {
  625. err("%p: qsw_decode_info: scan error\n");
  626. return -1;
  627. }
  628. return 0;
  629. }
  630. /*
  631. * info -> string
  632. */
  633. int qsw_encode_info(char *s, int len, qsw_info_t * qi)
  634. {
  635. int n;
  636. n = snprintf(s, len, "%x.%x.%x.%x.%x.%x",
  637. qi->prgnum,
  638. qi->rank, qi->nodeid, qi->procid, qi->nnodes, qi->nprocs);
  639. if (n == -1 || n > strlen(s)) {
  640. err("%p: qsw_encode_info: string overflow\n");
  641. return -1;
  642. }
  643. return 0;
  644. }
  645. /*
  646. * Generate a random program number. Normally these would be allocated,
  647. * but since we have no persistant daemon, we settle for random.
  648. * Must be called after qsw_init_capability (we seed lrand48 there).
  649. */
  650. int qsw_get_prgnum(void)
  651. {
  652. int prgnum;
  653. prgnum = lrand48() % (QSW_PRG_END - QSW_PRG_START + 1);
  654. prgnum += QSW_PRG_START;
  655. return prgnum;
  656. }
  657. /*
  658. * Prepare a capability that will be passed to all the processes in a
  659. * parallel program.
  660. * Function returns a 0 on success, -1 = fail.
  661. */
  662. int
  663. qsw_init_capability(ELAN_CAPABILITY * cap, int nprocs, hostlist_t nodelist,
  664. int cyclic_alloc, unsigned int railmask)
  665. {
  666. int i;
  667. int num_nodes = hostlist_count(nodelist);
  668. int procs_per_node = nprocs / num_nodes;
  669. assert (railmask < QSW_RAILMASK_MAX);
  670. srand48(getpid());
  671. /*
  672. * Initialize for multi rail and either block or cyclic allocation.
  673. * Set ELAN_CAP_TYPE_BROADCASTABLE later if appropriate.
  674. */
  675. #if HAVE_LIBELANCTRL
  676. elan_nullcap(cap);
  677. #elif HAVE_LIBELAN3
  678. elan3_nullcap(cap);
  679. #else
  680. # error
  681. #endif
  682. if (cyclic_alloc)
  683. cap->Type = ELAN_CAP_TYPE_CYCLIC;
  684. else
  685. cap->Type = ELAN_CAP_TYPE_BLOCK;
  686. cap->Type |= ELAN_CAP_TYPE_MULTI_RAIL;
  687. cap->RailMask = railmask;
  688. #if HAVE_LIBELANCTRL
  689. # ifdef ELAN_CAP_ELAN3
  690. cap->cap_elan_type = ELAN_CAP_ELAN3;
  691. # else
  692. cap->cap_spare = ELAN_CAP_UNINITIALISED;
  693. # endif
  694. #endif
  695. /*
  696. * UserKey is 128 bits of randomness which should be kept private.
  697. */
  698. for (i = 0; i < 4; i++)
  699. cap->UserKey.Values[i] = lrand48();
  700. /*
  701. * Elan hardware context numbers must be unique per node.
  702. * One is allocated to each parallel process. In order for processes
  703. * on the same node to communicate, they must use contexts in the
  704. * hi-lo range of a common capability. With pdsh we have no
  705. * persistant daemon to allocate these, so we settle for a random one.
  706. */
  707. cap->LowContext = lrand48() %
  708. (ELAN_USER_TOP_CONTEXT_NUM -
  709. (ELAN_USER_BASE_CONTEXT_NUM + procs_per_node - 1) - 1);
  710. cap->LowContext += ELAN_USER_BASE_CONTEXT_NUM;
  711. cap->HighContext = cap->LowContext + procs_per_node - 1;
  712. /* not necessary to initialize cap->MyContext */
  713. /*
  714. * Describe the mapping of processes to nodes.
  715. * This sets cap->HighNode, cap->LowNode, and cap->Bitmap.
  716. */
  717. if (_setbitmap(nodelist, procs_per_node, cyclic_alloc, cap) < 0) {
  718. err("%p: do all target nodes have an Elan adapter?\n");
  719. return -1;
  720. }
  721. #if HAVE_LIBELAN3
  722. /*
  723. * Set cap->Entries and add broadcast bit to cap->type based on
  724. * cap->HighNode and cap->LowNode values set above.
  725. */
  726. cap->Entries = nprocs;
  727. if (cap->Entries > ELAN_MAX_VPS) {
  728. err("%p: program would have too many processes (max %d)\n",
  729. ELAN_MAX_VPS);
  730. return -1;
  731. }
  732. #endif
  733. /*
  734. * As we now support segmented broadcast, always flag the capability
  735. * as broadcastable.
  736. */
  737. /*if (abs(cap->HighNode - cap->LowNode) == num_nodes - 1) */
  738. cap->Type |= ELAN_CAP_TYPE_BROADCASTABLE;
  739. return 0;
  740. }
  741. static int
  742. _qsw_elan_nrails(ELAN_CAPABILITY * cap)
  743. {
  744. #if HAVE_LIBELANCTRL
  745. return elan_nrails (cap);
  746. #elif HAVE_LIBELAN3
  747. return elan3_nrails (cap);
  748. #endif
  749. }
  750. static int
  751. _qsw_cap_create(ELAN_CAPABILITY * cap, int nrails)
  752. {
  753. #if HAVE_LIBELANCTRL
  754. ELANCTRL_HANDLE handle;
  755. /*
  756. * Open up the Elan control device so we can create
  757. * a new capability.
  758. */
  759. if (elanctrl_open(&handle) != 0)
  760. errx("%p: elanctrl_open(): %m\n");
  761. /* Push capability into device driver */
  762. if (elanctrl_create_cap(handle, cap) < 0)
  763. errx("%p: elanctrl_create_cap failed: %m\n");
  764. /*
  765. * Do not close elanctrl handle here, this can cause
  766. * MPI initialization to fail somehow.
  767. *
  768. * elanctrl_close(handle);
  769. */
  770. #elif HAVE_LIBELAN3
  771. int i, n = 0;
  772. /* MULTI-RAIL: Create the capability in all rails */
  773. for (i = 0; (i < ELAN_MAX_RAILS) && (n < nrails); i++) {
  774. void *handle;
  775. if (!(cap->RailMask & (1 << i)))
  776. continue;
  777. /*
  778. * Open up the control device so we can create a new
  779. * capability. This will fail if we don't have rw
  780. * access to /dev/elan3/control[i]
  781. */
  782. if ((handle = elan3_control_open(i)) == NULL)
  783. errx("%p: elan3_control_open(%d): %m\n", i);
  784. /* Push capability into device driver */
  785. if (elan3_create(handle, cap) < 0)
  786. errx("%p: elan3_create failed: %m\n");
  787. /*
  788. * Do not close handle, for some reason this causes
  789. * elan3_attach to return EINVAL...
  790. *
  791. * elan3_control_close(handle);
  792. */
  793. n++;
  794. }
  795. #endif /* HAVE_LIBELANCTRL */
  796. return (0);
  797. }
  798. /*
  799. * Take necessary steps to set up to run an Elan MPI "program"
  800. * (set of processes) on a node.
  801. *
  802. * Process 1 Process 2 | Process 3
  803. * read args |
  804. * fork ------- rms_prgcreate |
  805. * waitpid elan3_create |
  806. * rms_prgaddcap |
  807. * fork N procs ---+------ rms_setcap
  808. * wait all | setup RMS_ env
  809. * | setuid, etc.
  810. * | exec mpi process
  811. * exit |
  812. * rms_prgdestroy |
  813. * exit | (one pair of processes per mpi proc!)
  814. *
  815. * Explanation of the two fork(2) calls:
  816. * - The first fork is required because rms_prgdestroy can't occur in the
  817. * process that calls rms_prgcreate (since it is a member, ECHILD).
  818. * - The second fork is required when running multiple processes per node
  819. * because each process must announce its use of one of the hw contexts
  820. * in the range allocated in the capability.
  821. *
  822. * One process:
  823. * init-xinetd-+-in.qshd---in.qshd---in.qshd---sleep
  824. * Two processes:
  825. * init-xinetd-+-in.qshd---in.qshd---2*[in.qshd---sleep]
  826. * (if stderr backchannel is active, add one in.qshd)
  827. *
  828. * Any errors result in a message on stderr and program exit.
  829. */
  830. void qsw_setup_program(ELAN_CAPABILITY * cap, qsw_info_t * qi, uid_t uid)
  831. {
  832. int pid;
  833. int i;
  834. int nrails;
  835. int cpid[ELAN_MAX_VPS];
  836. int procs_per_node;
  837. int proc_index;
  838. if (qi->nprocs > ELAN_MAX_VPS) /* should catch this in client */
  839. errx("%p: too many processes requested\n");
  840. /*
  841. * First fork. Parent waits for child to terminate, then cleans up.
  842. */
  843. pid = fork();
  844. switch (pid) {
  845. case -1: /* error */
  846. errx("%p: fork: %m\n");
  847. case 0: /* child falls thru */
  848. break;
  849. default: /* parent */
  850. if (waitpid(pid, NULL, 0) < 0)
  851. errx("%p: waitpid: %m\n");
  852. while (rms_prgdestroy(qi->prgnum) < 0) {
  853. if (errno != ECHILD)
  854. errx("%p: rms_prgdestroy: %m\n");
  855. sleep(1); /* waitprg would be nice! */
  856. }
  857. exit(0);
  858. }
  859. /* child continues here */
  860. nrails = _qsw_elan_nrails(cap);
  861. /* associate this process and its children with prgnum */
  862. if (rms_prgcreate(qi->prgnum, uid, 1) < 0) /* 1 cpu (bogus!) */
  863. errx("%p: rms_prgcreate %d failed: %m\n", qi->prgnum);
  864. /*
  865. * Set up capability
  866. */
  867. if (_qsw_cap_create(cap, nrails) < 0)
  868. errx("%p: unable to set up Elan capability\n");
  869. /*
  870. * Make cap known via rms_getcap/rms_ncaps
  871. * to members of this prgnum
  872. */
  873. for (i = 0; i < nrails; i++) {
  874. if (rms_prgaddcap(qi->prgnum, i, cap) < 0)
  875. errx("%p: rms_prgaddcap failed: %m\n");
  876. }
  877. if (debug_syslog) {
  878. char tmpstr[1024];
  879. syslog(LOG_DEBUG, "prg %d cap %s bitmap 0x%.8x", qi->prgnum,
  880. #if HAVE_LIBELANCTRL
  881. elan_capability_string(cap, tmpstr),
  882. #elif HAVE_LIBELAN3
  883. elan3_capability_string(cap, tmpstr),
  884. #endif
  885. cap->Bitmap[0]);
  886. }
  887. /*
  888. * Second fork - once for each process.
  889. * Parent waits for all children to exit the it exits.
  890. * Child assigns hardware context to each process, then forks again...
  891. */
  892. procs_per_node = qi->nprocs / qi->nnodes;
  893. for (proc_index = 0; proc_index < procs_per_node; proc_index++) {
  894. cpid[proc_index] = fork();
  895. if (cpid[proc_index] < 0)
  896. errx("%p: fork (%d): %m\n", proc_index);
  897. else if (cpid[proc_index] == 0)
  898. break;
  899. }
  900. /* parent */
  901. if (proc_index == procs_per_node) {
  902. int waiting = procs_per_node;
  903. int i;
  904. while (waiting > 0) {
  905. pid = waitpid(0, NULL, 0); /* any in pgrp */
  906. if (pid < 0)
  907. errx("%p: waitpid: %m\n");
  908. for (i = 0; i < procs_per_node; i++) {
  909. if (cpid[i] == pid)
  910. waiting--;
  911. }
  912. }
  913. exit(0);
  914. }
  915. /* child falls through here */
  916. /* proc_index will be set to the child's index */
  917. /*
  918. * Assign elan hardware context to current process.
  919. * - arg1 is an index into the kernel's list of caps for this
  920. * program desc (added by rms_prgaddcap). There will be
  921. * one per rail.
  922. * - arg2 indexes the hw ctxt range in the capability
  923. * [cap->LowContext, cap->HighContext]
  924. */
  925. for (i = 0; i < nrails; i++) {
  926. if (rms_setcap(i, proc_index) < 0)
  927. errx("%p: rms_setcap (%d): %m\n", proc_index);
  928. }
  929. /* set RMS_ environment vars */
  930. switch (cap->Type & ELAN_CAP_TYPE_MASK) {
  931. case ELAN_CAP_TYPE_BLOCK:
  932. qi->procid = (qi->nodeid * procs_per_node) + proc_index;
  933. break;
  934. case ELAN_CAP_TYPE_CYCLIC:
  935. qi->procid = qi->nodeid + (proc_index * qi->nnodes);
  936. break;
  937. default:
  938. errx("%p: unsupported Elan capability type\n");
  939. }
  940. qi->rank = qi->procid;
  941. if (_rms_setenv(qi) < 0)
  942. errx("%p: failed to set environment variables: %m\n");
  943. /* Exec the process... */
  944. }
  945. int qsw_prgsignal(int prgid, int signo)
  946. {
  947. return rms_prgsignal(prgid, signo);
  948. }
  949. #ifdef TEST_MAIN
  950. /* encode info, then decode and check that the result is what we started with */
  951. static void _verify_info_encoding(qsw_info_t * qi)
  952. {
  953. int err;
  954. char tmpstr[1024];
  955. qsw_info_t qicpy;
  956. err = qsw_encode_info(tmpstr, sizeof(tmpstr), qi);
  957. assert(err >= 0);
  958. err = qsw_decode_info(tmpstr, &qicpy);
  959. assert(memcmp(qi, &qicpy, sizeof(qicpy)) == 0);
  960. }
  961. /* encode cap, then decode and check that the result is what we started with */
  962. static void _verify_cap_encoding(ELAN_CAPABILITY * cap)
  963. {
  964. ELAN_CAPABILITY capcpy;
  965. char tmpstr[1024];
  966. int err;
  967. err = qsw_encode_cap(tmpstr, sizeof(tmpstr), cap);
  968. assert(err >= 0);
  969. err = qsw_decode_cap(tmpstr, &capcpy);
  970. assert(err >= 0);
  971. /*assert(ELAN_CAP_MATCH(&cap, &cap2)); *//* broken - see GNATS #3875 */
  972. assert(memcmp(cap, &capcpy, sizeof(capcpy)) == 0);
  973. }
  974. /* concatenate args into a single string */
  975. static void _strncatargs(char *buf, int len, int argc, char *argv[])
  976. {
  977. if (len > 0) {
  978. buf[0] = '\0';
  979. }
  980. while (len > 1 && argc > 0) {
  981. strncat(buf, argv[0], len);
  982. argv++;
  983. argc--;
  984. if (argc > 0)
  985. strncat(buf, " ", len);
  986. }
  987. buf[len - 1] = '\0';
  988. }
  989. static void _usage(void)
  990. {
  991. errx("Usage %p [ -n procs ] [ -u uid ] command args...\n");
  992. }
  993. /*
  994. * Test program for qsw runtime routines. Run one or more processes locally,
  995. * e.g. for MPI ping test across shared memory:
  996. * qrun -n 2 -u 5588 mping 1 32768
  997. */
  998. int main(int argc, char *argv[])
  999. {
  1000. extern char *optarg;
  1001. extern int optind;
  1002. char cmdbuf[1024];
  1003. ELAN_CAPABILITY cap;
  1004. int c;
  1005. char *p;
  1006. uid_t uid = 0;
  1007. hostlist_t wcoll = hostlist_create("");
  1008. char hostname[MAXHOSTNAMELEN];
  1009. qsw_info_t qinfo = {
  1010. nnodes:1,
  1011. nprocs:1,
  1012. };
  1013. err_init(xbasename(argv[0])); /* init err package */
  1014. while ((c = getopt(argc, argv, "u:n:")) != EOF) {
  1015. switch (c) {
  1016. case 'u':
  1017. uid = atoi(optarg);
  1018. break;
  1019. case 'n':
  1020. qinfo.nprocs = atoi(optarg);
  1021. break;
  1022. default:
  1023. _usage();
  1024. }
  1025. }
  1026. argc -= optind;
  1027. argv += optind;
  1028. if (argc == 0)
  1029. _usage();
  1030. /* prep arg for the shell */
  1031. _strncatargs(cmdbuf, sizeof(cmdbuf), argc, argv);
  1032. /* create working collective containing only this host */
  1033. if (gethostname(hostname, sizeof(hostname)) < 0)
  1034. errx("%p: gethostname: %m\n");
  1035. if ((p = strchr(hostname, '.')))
  1036. *p = '\0';
  1037. hostlist_push(wcoll, hostname);
  1038. qsw_init();
  1039. /* initialize capability for this "program" */
  1040. if (qsw_init_capability(&cap, qinfo.nprocs / qinfo.nnodes, wcoll, 0) < 0)
  1041. errx("%p: failed to initialize Elan capability\n");
  1042. /* assert encode/decode routines work (we don't use them here) */
  1043. _verify_info_encoding(&qinfo);
  1044. _verify_cap_encoding(&cap);
  1045. /* generate random program number */
  1046. qinfo.prgnum = qsw_get_prgnum();
  1047. /* set up capabilities, environment, fork, etc.. */
  1048. qsw_setup_program(&cap, &qinfo, uid);
  1049. /* multiple threads continue on here (one per processes) */
  1050. if (seteuid(uid) < 0)
  1051. errx("%p: seteuid: %m\n");
  1052. err("%p: %d:%d executing /bin/bash -c %s\n",
  1053. qinfo.prgnum, qinfo.procid, cmdbuf);
  1054. execl("/bin/bash", "bash", "-c", cmdbuf, 0);
  1055. errx("%p: exec of shell failed: %m\n");
  1056. qsw_fini();
  1057. exit(0);
  1058. }
  1059. #endif /* TEST_MAIN */
  1060. /*
  1061. * vi:tabstop=4 shiftwidth=4 expandtab
  1062. */