/src/modules/sdr.c

https://code.google.com/ · C · 555 lines · 365 code · 111 blank · 79 comment · 82 complexity · ab9f72e0815c7521e1b5cbdf893d0a3c MD5 · raw file

  1. /*****************************************************************************\
  2. * $Id$
  3. *****************************************************************************
  4. * Copyright (C) 2001-2006 The Regents of the University of California.
  5. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  6. * Written by Jim Garlick <garlick@llnl.gov>.
  7. * UCRL-CODE-2003-005.
  8. *
  9. * This file is part of Pdsh, a parallel remote shell program.
  10. * For details, see <http://www.llnl.gov/linux/pdsh/>.
  11. *
  12. * Pdsh is free software; you can redistribute it and/or modify it under
  13. * the terms of the GNU General Public License as published by the Free
  14. * Software Foundation; either version 2 of the License, or (at your option)
  15. * any later version.
  16. *
  17. * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
  18. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  19. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  20. * details.
  21. *
  22. * You should have received a copy of the GNU General Public License along
  23. * with Pdsh; if not, write to the Free Software Foundation, Inc.,
  24. * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
  25. \*****************************************************************************/
  26. #if HAVE_CONFIG_H
  27. # include "config.h"
  28. #endif
  29. #include <assert.h>
  30. #include <string.h>
  31. #include "src/pdsh/wcoll.h"
  32. #include "src/pdsh/mod.h"
  33. #include "src/pdsh/xpopen.h"
  34. #include "src/common/xmalloc.h"
  35. #include "src/common/err.h"
  36. #include "src/common/xstring.h"
  37. #define SPACES "\t\n "
  38. #define LINEBUFSIZE 2048
  39. /* some handy SP constants */
  40. /* NOTE: degenerate case of one node per frame, nodes would be 1, 17, 33,... */
  41. #define MAX_SP_NODES 512 /* upper bound on the number of nodes */
  42. #define MAX_SP_NODES_PER_FRAME 16 /* node-number stride between frames (see note above: 1, 17, 33, ...) */
  43. #define MAX_SP_NODE_NUMBER (MAX_SP_NODES * MAX_SP_NODES_PER_FRAME - 1) /* 512 * 16 - 1 = 8191; also used as the length of sdrcache[], so valid cache indices are 0 .. MAX_SP_NODE_NUMBER - 1 */
  44. #if STATIC_MODULES /* when set, give the generic module symbols sdr-specific names (presumably to avoid clashes between statically linked modules -- TODO confirm) */
  45. # define pdsh_module_info sdr_module_info
  46. # define pdsh_module_priority sdr_module_priority
  47. #endif
  48. int pdsh_module_priority = DEFAULT_MODULE_PRIORITY; /* this module does not request a special priority */
  49. static int sdr_init (void); /* clears the SDR cache */
  50. static int sdr_exit (void); /* frees the SDR cache entries */
  51. static hostlist_t read_sdr (opt_t *opt); /* returns all SDR nodes when -a was given, else NULL */
  52. static int sdr_postop (opt_t *); /* applies -i / -v processing to opt->wcoll */
  53. static int sdr_process_opt(opt_t *, int, char *); /* records -a, -i, -v and -G in the file-level flags */
  54. /*
  55. * Export generic module functions
  56. */
  57. struct pdsh_module_operations sdr_module_ops = {
  58. (ModInitF) sdr_init, /* init hook: resets every sdrcache[] slot to NULL */
  59. (ModExitF) sdr_exit, /* exit hook: destroys every cached sdr_info */
  60. (ModReadWcollF) read_sdr, /* wcoll reader: builds the host list for -a */
  61. (ModPostOpF) sdr_postop, /* post-option hook: filters/translates the wcoll for -v / -i */
  62. };
  63. /*
  64. * Export module options
  65. */
  66. struct pdsh_module_option sdr_module_options[] = /* every entry is routed to sdr_process_opt(), which only sets a flag */
  67. { { 'a', NULL, "target all nodes", /* sets `allnodes'; second field is NULL for every option here (presumably "no option argument" -- TODO confirm against mod.h) */
  68. DSH | PCP, (optFunc) sdr_process_opt
  69. },
  70. { 'v', NULL, "verify nodes are up using host/switch_responds", /* sets `verify' */
  71. DSH | PCP, (optFunc) sdr_process_opt
  72. },
  73. { 'i', NULL, "translate to alternate/initial hostnames from SDR (if applicable)", /* sets `altnames' */
  74. DSH | PCP, (optFunc) sdr_process_opt
  75. },
  76. { 'G', NULL, "with -a, run on all SP partitions", /* sets `global'; passed on as -G to the SDR query command */
  77. DSH | PCP, (optFunc) sdr_process_opt
  78. },
  79. PDSH_OPT_TABLE_END /* table terminator */
  80. };
  81. /*
  82. * Sdr module info
  83. */
  84. struct pdsh_module pdsh_module_info = { /* field meanings below are inferred from the values; confirm against struct pdsh_module in mod.h */
  85. "misc", /* presumably the module type/category */
  86. "sdr", /* presumably the module name */
  87. "Jim Garlick <garlick@llnl.gov>", /* author */
  88. "Support for SDR on IBM SP", /* one-line description */
  89. DSH | PCP, /* same DSH | PCP mask as used on every option in sdr_module_options[] */
  90. &sdr_module_ops, /* generic operations table defined above */
  91. NULL, /* unused slot for this module (TODO confirm which operations table this is) */
  92. &sdr_module_options[0], /* option table defined above */
  93. };
  94. /*
  95. * Data cache for SDR information.
  96. * XXX: Hash by node number instead of leaving room for
  97. * all possible nodes
  98. */
  99. struct sdr_info { /* one cached SDR record; created by sdr_info_create(), freed by sdr_info_destroy() */
  100. char *hostname; /* initial_hostname from the SDR, truncated at the first '.' */
  101. char *reliable_hostname; /* reliable_hostname from the SDR, truncated at the first '.' */
  102. bool switch_responds; /* false until _sdr_getresp() has read the switch_responds data */
  103. bool host_responds; /* false until _sdr_getresp() has read the host_responds data */
  104. };
  105. static bool sdr_initialized = false; /* set to true at the end of _sdr_getnames() */
  106. static struct sdr_info * sdrcache[MAX_SP_NODE_NUMBER]; /* indexed by SDR node_number; NULL means no record; valid indices are 0 .. MAX_SP_NODE_NUMBER - 1 */
  107. /*
  108. * Global options
  109. */
  110. static bool allnodes = false; /* -a: read_sdr() returns every node known to the SDR */
  111. static bool altnames = false; /* -i: sdr_postop() swaps reliable <-> initial hostnames */
  112. static bool verify = false; /* -v: sdr_postop() drops hosts that are not responding */
  113. static bool global = false; /* -G: passed as -G to the SDR query command */
  114. /*
  115. * Required static forward declarations
  116. */
  117. static struct sdr_info * sdr_info_create (char *host, char *rhost); /* allocate a record holding copies of both names */
  118. static void sdr_info_destroy (struct sdr_info *s); /* free a record; NULL is accepted */
  119. static hostlist_t _sdr_filter (hostlist_t hl, bool iopt, bool verify); /* new list: hl after -i translation and -v filtering */
  120. static hostlist_t _sdr_wcoll(bool Gopt); /* load names into the cache, then list the reliable hostnames */
  121. static struct sdr_info * _find_node (const char *name, int *rhost); /* cache lookup by name; *rhost = 1 if the reliable name matched, 0 if the initial name matched */
  122. static hostlist_t _sdr_reliable_names (void); /* new list of every cached reliable hostname */
  123. static void _sdr_getnames(bool Gopt); /* query the SDR for node names and fill sdrcache[] */
  124. static void _sdr_getresp (bool Gopt); /* query the SDR for host/switch_responds and update sdrcache[] */
  125. static char *_list_nth(List l, int n); /* n-th (0-based) item of l, or NULL if l is shorter */
  126. /*
  127. * module interface functions
  128. */
  129. static int sdr_init (void)
  130. {
  131. int i;
  132. for (i = 0; i < MAX_SP_NODE_NUMBER; i++)
  133. sdrcache[i] = NULL;
  134. return (0);
  135. }
  136. static int sdr_exit (void)
  137. {
  138. int i;
  139. for (i = 0; i < MAX_SP_NODE_NUMBER; i++)
  140. sdr_info_destroy (sdrcache[i]);
  141. return (0);
  142. }
  143. static int sdr_process_opt(opt_t *pdsh_opt, int opt, char *arg)
  144. {
  145. switch (opt) {
  146. case 'a':
  147. allnodes = true;
  148. break;
  149. case 'i':
  150. altnames = true;
  151. break;
  152. case 'v':
  153. verify = true;
  154. break;
  155. case 'G':
  156. global = true;
  157. break;
  158. default:
  159. errx("%p: badness factor high in sdr module\n");
  160. break;
  161. }
  162. return 0;
  163. }
  164. static hostlist_t read_sdr(opt_t *opt)
  165. {
  166. if (!allnodes)
  167. return (NULL);
  168. if (allnodes && opt->wcoll)
  169. errx("%p: Do not specify -a with other node selection options\n");
  170. return _sdr_wcoll (global);
  171. }
  172. static int sdr_postop (opt_t *opt)
  173. {
  174. hostlist_t hl;
  175. if (!verify && !altnames)
  176. return (0);
  177. if (!opt->wcoll || (hostlist_count (opt->wcoll) == 0))
  178. return (0);
  179. if (!sdr_initialized)
  180. _sdr_getnames (global);
  181. if (verify)
  182. _sdr_getresp (global);
  183. hl = _sdr_filter (opt->wcoll, altnames, verify);
  184. hostlist_destroy (opt->wcoll);
  185. opt->wcoll = hl;
  186. return (0);
  187. }
  188. /*
  189. * Other functions
  190. */
  191. /*
  192. * Get the wcoll from the SDR.
  193. * Gopt (IN) pass -G to SDRGetObjects
  194. * RETURN new list containing hostnames (reliable by default)
  195. */
  196. static hostlist_t _sdr_wcoll (bool Gopt)
  197. {
  198. /*
  199. * Cache SDR reliable and initial hostnames
  200. */
  201. _sdr_getnames (Gopt);
  202. return _sdr_reliable_names ();
  203. }
  204. /*
  205. * Filter hostlist `hl' using SDR attributes.
  206. * iopt convert reliable hostnames to initial and vice versa.
  207. * verify remove hosts that are not responding on the corresponding
  208. * interface (i.e. switch for initial hostnames, eth otherwise)
  209. * RETURN new list containing filtered hosts.
  210. */
  211. static hostlist_t _sdr_filter (hostlist_t hl, bool iopt, bool verify)
  212. {
  213. char *host = NULL;
  214. hostlist_t new = hostlist_create (NULL);
  215. hostlist_iterator_t i = hostlist_iterator_create (hl);
  216. struct sdr_info *s = NULL;
  217. while ((host = hostlist_next (i))) {
  218. int r = 0;
  219. if ((s = _find_node (host, &r)) == NULL) {
  220. hostlist_push_host (new, host);
  221. continue;
  222. }
  223. if (iopt)
  224. r = !r;
  225. if (!verify || (r ? s->host_responds : s->switch_responds))
  226. hostlist_push_host (new, r ? s->reliable_hostname : s->hostname);
  227. free (host);
  228. }
  229. hostlist_iterator_destroy (i);
  230. return (new);
  231. }
  232. static void _sdr_getswitchname(char *switchName, int len)
  233. {
  234. FILE *f;
  235. List words;
  236. char cmd[LINEBUFSIZE];
  237. char buf[LINEBUFSIZE];
  238. snprintf(cmd, sizeof(cmd), "%s -x Switch switch_number==1 switch_name",
  239. _PATH_SDRGETOBJECTS);
  240. f = xpopen(cmd, "r");
  241. if (f == NULL)
  242. errx("%p: error running %s\n", _PATH_SDRGETOBJECTS);
  243. while (fgets(buf, LINEBUFSIZE, f) != NULL) {
  244. words = list_split(NULL, buf);
  245. assert(list_count(words) == 1);
  246. snprintf(switchName, len, _list_nth(words, 0));
  247. list_destroy(words);
  248. }
  249. xpclose(f);
  250. }
  251. static char * _sdr_switch_attr (int *numswitchplanes)
  252. {
  253. FILE *f;
  254. List words;
  255. char cmd[LINEBUFSIZE];
  256. char buf[LINEBUFSIZE];
  257. int n;
  258. static char * attr[] = {
  259. "switch_responds",
  260. "switch_responds0",
  261. "switch_responds0 switch_responds1"
  262. };
  263. _sdr_getswitchname(buf, sizeof(buf));
  264. if (strcmp(buf, "SP_Switch2") != 0) {
  265. *numswitchplanes = 1;
  266. return (attr[0]);
  267. }
  268. snprintf(cmd, sizeof(cmd), "%s -x SP number_switch_planes",
  269. _PATH_SDRGETOBJECTS);
  270. f = xpopen(cmd, "r");
  271. if (f == NULL)
  272. errx("%p: error running %s\n", _PATH_SDRGETOBJECTS);
  273. while (fgets(buf, LINEBUFSIZE, f) != NULL) {
  274. words = list_split(NULL, buf);
  275. assert(list_count(words) == 1);
  276. n = atoi(_list_nth(words, 0));
  277. list_destroy(words);
  278. }
  279. if (xpclose(f) != 0)
  280. err("%p: nonzero return code from %s\n", _PATH_SDRGETOBJECTS);
  281. *numswitchplanes = n;
  282. return (attr[n]);
  283. }
  284. static void _sdr_cache_hresp_line (char *buf)
  285. {
  286. List words = NULL;
  287. int nn = -1;
  288. words = list_split (NULL, buf);
  289. assert(list_count (words) == 2);
  290. nn = atoi (_list_nth (words, 0));
  291. assert (nn >= 0 && nn <= MAX_SP_NODE_NUMBER);
  292. /*
  293. * Ignore host_responds for hosts without node information
  294. */
  295. if (sdrcache[nn] != NULL)
  296. sdrcache[nn]->host_responds = (atoi (_list_nth (words, 1)) == 1);
  297. return;
  298. }
  299. static void _sdr_cache_sresp_line (char *buf, int switchplanes)
  300. {
  301. List words = NULL;
  302. int nn = -1;
  303. struct sdr_info *s;
  304. words = list_split (NULL, buf);
  305. assert(list_count (words) == (1 + switchplanes));
  306. nn = atoi (_list_nth (words, 0));
  307. assert (nn >= 0 && nn <= MAX_SP_NODE_NUMBER);
  308. assert (sdrcache[nn] != NULL);
  309. s = sdrcache[nn];
  310. s->switch_responds = (atoi(_list_nth(words, 1)) == 1);
  311. if (switchplanes == 2)
  312. s->switch_responds = s->switch_responds || (atoi(_list_nth (words, 1)));
  313. return;
  314. }
  315. static void _sdr_cache_name_line (char *buf)
  316. {
  317. char *name = NULL;
  318. char *rname = NULL;
  319. List words = NULL;
  320. int nn = -1;
  321. char *p;
  322. words = list_split (NULL, buf);
  323. assert (list_count(words) == 3);
  324. nn = atoi (_list_nth (words, 0));
  325. assert (nn >= 0 && nn <= MAX_SP_NODE_NUMBER);
  326. name = _list_nth (words, 1);
  327. rname = _list_nth (words, 2);
  328. if ((p = strchr (name, '.')))
  329. *p = '\0';
  330. if ((p = strchr (rname, '.')))
  331. *p = '\0';
  332. sdrcache[nn] = sdr_info_create (name, rname);
  333. list_destroy (words);
  334. return;
  335. }
  336. static void _sdr_getresp (bool Gopt)
  337. {
  338. FILE *f;
  339. char cmd[LINEBUFSIZE];
  340. char buf[LINEBUFSIZE];
  341. int nswitchplanes;
  342. snprintf (cmd, sizeof(cmd),
  343. "%s %s -x host_responds node_number host_responds",
  344. _PATH_SDRGETOBJECTS, Gopt ? "-G" : "");
  345. if ((f = xpopen (cmd, "r")) == NULL)
  346. errx("%p: error running %s\n", _PATH_SDRGETOBJECTS);
  347. while (fgets (buf, LINEBUFSIZE, f) != NULL)
  348. _sdr_cache_hresp_line (buf);
  349. snprintf (cmd, sizeof(cmd),
  350. "%s %s -x switch_responds node_number %s",
  351. _PATH_SDRGETOBJECTS, Gopt ? "-G" : "",
  352. _sdr_switch_attr (&nswitchplanes));
  353. if ((f = xpopen (cmd, "r")) == NULL)
  354. errx("%p: error running %s\n", _PATH_SDRGETOBJECTS);
  355. while (fgets (buf, LINEBUFSIZE, f) != NULL)
  356. _sdr_cache_sresp_line (buf, nswitchplanes);
  357. xpclose (f);
  358. return;
  359. }
  360. /*
  361. * Query the SDR for hostnames of all nodes and return the results in an
  362. * array indexed by node number.
  363. * Gopt (IN) pass -G to SDRGetObjects
  364. */
  365. static void _sdr_getnames(bool Gopt)
  366. {
  367. FILE *f;
  368. char cmd[LINEBUFSIZE];
  369. char buf[LINEBUFSIZE];
  370. snprintf (cmd, sizeof(cmd),
  371. "%s %s -x Node node_number initial_hostname reliable_hostname",
  372. _PATH_SDRGETOBJECTS, Gopt ? "-G" : "");
  373. if ((f = xpopen (cmd, "r")) == NULL)
  374. errx("%p: error running %s\n", _PATH_SDRGETOBJECTS);
  375. while (fgets (buf, LINEBUFSIZE, f) != NULL)
  376. _sdr_cache_name_line (buf);
  377. xpclose(f);
  378. sdr_initialized = true;
  379. }
  380. static hostlist_t _sdr_reliable_names ()
  381. {
  382. hostlist_t hl = hostlist_create (NULL);
  383. int i;
  384. for (i = 0; i < MAX_SP_NODE_NUMBER; i++) {
  385. if (sdrcache[i] != NULL)
  386. hostlist_push_host (hl, sdrcache[i]->reliable_hostname);
  387. }
  388. return (hl);
  389. }
  390. static char *_list_nth(List l, int n)
  391. {
  392. int i = 0;
  393. char *name = NULL;
  394. ListIterator itr = list_iterator_create(l);
  395. while ((name = list_next(itr))) {
  396. if (i++ == n) break;
  397. }
  398. list_iterator_destroy(itr);
  399. return name;
  400. }
  401. static struct sdr_info * sdr_info_create (char *host, char *rhost)
  402. {
  403. struct sdr_info *s = Malloc (sizeof (*s));
  404. s->hostname = Strdup (host);
  405. s->reliable_hostname = Strdup (rhost);
  406. s->host_responds = false;
  407. s->switch_responds = false;
  408. return (s);
  409. }
  410. static void sdr_info_destroy (struct sdr_info *s)
  411. {
  412. if (s == NULL)
  413. return;
  414. if (s->hostname)
  415. Free ((void **) &s->hostname);
  416. if (s->reliable_hostname)
  417. Free ((void **) &s->reliable_hostname);
  418. Free ((void **) &s);
  419. return;
  420. }
  421. static struct sdr_info * _find_node (const char *name, int *rhost)
  422. {
  423. int i;
  424. for (i = 0; i < MAX_SP_NODE_NUMBER; i++) {
  425. struct sdr_info *s = sdrcache[i];
  426. if (s == NULL)
  427. continue;
  428. if (strncmp (name, s->reliable_hostname,
  429. strlen (s->reliable_hostname)) == 0) {
  430. if (rhost != NULL)
  431. *rhost = 1;
  432. return (s);
  433. }
  434. if (strncmp (name, s->hostname, strlen (s->hostname)) == 0) {
  435. if (rhost != NULL)
  436. *rhost = 0;
  437. return (s);
  438. }
  439. }
  440. return (NULL);
  441. }
  442. /*
  443. * vi: tabstop=4 shiftwidth=4 expandtab
  444. */