PageRenderTime 54ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/mpich2-1.4.1p1/src/pm/mpd/mpdboot.py

#
Python | 482 lines | 447 code | 3 blank | 32 comment | 14 complexity | 669b077e156c212c1c76a20eac04ed92 MD5 | raw file
  1. #!/usr/bin/env python
  2. #
  3. # (C) 2001 by Argonne National Laboratory.
  4. # See COPYRIGHT in top-level directory.
  5. #
  6. """
  7. usage: mpdboot --totalnum=<n_to_start> [--file=<hostsfile>] [--help] \
  8. [--rsh=<rshcmd>] [--user=<user>] [--mpd=<mpdcmd>] \
  9. [--loccons] [--remcons] [--shell] [--verbose] [-1] \
  10. [--ncpus=<ncpus>] [--ifhn=<ifhn>] [--chkup] [--chkuponly] \
  11. [--maxbranch=<maxbranch>]
  12. or, in short form,
  13. mpdboot -n n_to_start [-f <hostsfile>] [-h] [-r <rshcmd>] [-u <user>] \
  14. [-m <mpdcmd>] -s -v [-1] [-c]
  15. --totalnum specifies the total number of mpds to start; at least
  16. one mpd will be started locally, and others on the machines specified
  17. by the file argument; by default, only one mpd per host will be
  18. started even if the hostname occurs multiple times in the hosts file
  19. -1 means remove the restriction of starting only one mpd per machine;
  20. in this case, at most the first mpd on a host will have a console
  21. --file specifies the file of machines to start the rest of the mpds on;
  22. it defaults to mpd.hosts
  23. --mpd specifies the full path name of mpd on the remote hosts if it is
  24. not in your path
  25. --rsh specifies the name of the command used to start remote mpds; it
  26. defaults to ssh; an alternative is rsh
  27. --shell says that the Bourne shell is your default for rsh
  28. --verbose shows the ssh attempts as they occur; it does not provide
  29. confirmation that the sshs were successful
  30. --loccons says you do not want a console available on local mpd(s)
  31. --remcons says you do not want consoles available on remote mpd(s)
  32. --ncpus indicates how many cpus you want to show for the local machine;
  33. others are listed in the hosts file
  34. --ifhn indicates the interface hostname to use for the local mpd; others
  35. may be specified in the hostsfile
  36. --chkup requests that mpdboot try to verify that the hosts in the host file
  37. are up before attempting to start mpds on any of them; it just checks the number
  38. of hosts specified by -n
  39. --chkuponly requests that mpdboot try to verify that the hosts in the host file
  40. are up; it then terminates; it just checks the number of hosts specified by -n
  41. --maxbranch indicates the maximum number of mpds to enter the ring under another;
  42. the default is 4
  43. """
# workaround to suppress deprecated module warnings in python2.6
# see https://trac.mcs.anl.gov/projects/mpich2/ticket/362 for tracking
# NOTE: the filter is installed before popen2 is imported further down, so
# the import itself does not emit the deprecation warning.
import warnings
warnings.filterwarnings('ignore', '.*the popen2 module is deprecated.*', DeprecationWarning)

from time import ctime
__author__ = "Ralph Butler and Rusty Lusk"
# Evaluated when the module is loaded, so this is the load time, not a
# release date.
__date__ = ctime()
__version__ = "$Revision: 1.49 $"
__credits__ = ""

import re

from os import environ, path, kill, access, X_OK
from sys import argv, exit, stdout
from popen2 import Popen4, Popen3, popen2
from socket import gethostname, gethostbyname_ex
from select import select, error
from signal import SIGKILL
from commands import getoutput, getstatusoutput
from mpdlib import mpd_set_my_id, mpd_get_my_username, mpd_same_ips, \
     mpd_get_ranks_in_binary_tree, mpd_print, MPDSock, MPDParmDB

# A 'global' statement at module scope has no effect; it only lists the
# module-level settings that mpdboot() assigns and the helper functions read.
global myHost, fullDirName, rshCmd, user, mpdCmd, debug, verbose
  64. def mpdboot():
  65. global myHost, fullDirName, rshCmd, user, mpdCmd, debug, verbose
  66. myHost = gethostname()
  67. mpd_set_my_id('mpdboot_%s' % (myHost) )
  68. fullDirName = path.abspath(path.split(argv[0])[0])
  69. rshCmd = 'ssh'
  70. user = mpd_get_my_username()
  71. mpdCmd = path.join(fullDirName,'mpd.py')
  72. hostsFilename = 'mpd.hosts'
  73. totalnumToStart = 1 # may get chgd below
  74. debug = 0
  75. verbose = 0
  76. localConArg = ''
  77. remoteConArg = ''
  78. oneMPDPerHost = 1
  79. myNcpus = 1
  80. myIfhn = ''
  81. chkupIndicator = 0 # 1 -> chk and start ; 2 -> just chk
  82. maxUnderOneRoot = 4
  83. try:
  84. shell = path.split(environ['SHELL'])[-1]
  85. except:
  86. shell = 'csh'
  87. argidx = 1 # skip arg 0
  88. while argidx < len(argv):
  89. if argv[argidx] == '-h' or argv[argidx] == '--help':
  90. usage()
  91. elif argv[argidx] == '-r': # or --rsh=
  92. rshCmd = argv[argidx+1]
  93. argidx += 2
  94. elif argv[argidx].startswith('--rsh'):
  95. splitArg = argv[argidx].split('=')
  96. try:
  97. rshCmd = splitArg[1]
  98. except:
  99. print 'mpdboot: invalid argument:', argv[argidx]
  100. usage()
  101. argidx += 1
  102. elif argv[argidx] == '-u': # or --user=
  103. user = argv[argidx+1]
  104. argidx += 2
  105. elif argv[argidx].startswith('--user'):
  106. splitArg = argv[argidx].split('=')
  107. try:
  108. user = splitArg[1]
  109. except:
  110. print 'mpdboot: invalid argument:', argv[argidx]
  111. usage()
  112. argidx += 1
  113. elif argv[argidx] == '-m': # or --mpd=
  114. mpdCmd = argv[argidx+1]
  115. argidx += 2
  116. elif argv[argidx].startswith('--mpd'):
  117. splitArg = argv[argidx].split('=')
  118. try:
  119. mpdCmd = splitArg[1]
  120. except:
  121. print 'mpdboot: invalid argument:', argv[argidx]
  122. usage()
  123. argidx += 1
  124. elif argv[argidx] == '-f': # or --file=
  125. hostsFilename = argv[argidx+1]
  126. argidx += 2
  127. elif argv[argidx].startswith('--file'):
  128. splitArg = argv[argidx].split('=')
  129. try:
  130. hostsFilename = splitArg[1]
  131. except:
  132. print 'mpdboot: invalid argument:', argv[argidx]
  133. usage()
  134. argidx += 1
  135. elif argv[argidx].startswith('--ncpus'):
  136. splitArg = argv[argidx].split('=')
  137. try:
  138. myNcpus = int(splitArg[1])
  139. except:
  140. print 'mpdboot: invalid argument:', argv[argidx]
  141. usage()
  142. argidx += 1
  143. elif argv[argidx].startswith('--ifhn'):
  144. splitArg = argv[argidx].split('=')
  145. myIfhn = splitArg[1]
  146. myHost = splitArg[1]
  147. argidx += 1
  148. elif argv[argidx] == '-n': # or --totalnum=
  149. totalnumToStart = int(argv[argidx+1])
  150. argidx += 2
  151. elif argv[argidx].startswith('--totalnum'):
  152. splitArg = argv[argidx].split('=')
  153. try:
  154. totalnumToStart = int(splitArg[1])
  155. except:
  156. print 'mpdboot: invalid argument:', argv[argidx]
  157. usage()
  158. argidx += 1
  159. elif argv[argidx].startswith('--maxbranch'):
  160. splitArg = argv[argidx].split('=')
  161. try:
  162. maxUnderOneRoot = int(splitArg[1])
  163. except:
  164. print 'mpdboot: invalid argument:', argv[argidx]
  165. usage()
  166. argidx += 1
  167. elif argv[argidx] == '-d' or argv[argidx] == '--debug':
  168. debug = 1
  169. argidx += 1
  170. elif argv[argidx] == '-s' or argv[argidx] == '--shell':
  171. shell = 'bourne'
  172. argidx += 1
  173. elif argv[argidx] == '-v' or argv[argidx] == '--verbose':
  174. verbose = 1
  175. argidx += 1
  176. elif argv[argidx] == '-c' or argv[argidx] == '--chkup':
  177. chkupIndicator = 1
  178. argidx += 1
  179. elif argv[argidx] == '--chkuponly':
  180. chkupIndicator = 2
  181. argidx += 1
  182. elif argv[argidx] == '-1':
  183. oneMPDPerHost = 0
  184. argidx += 1
  185. elif argv[argidx] == '--loccons':
  186. localConArg = '-n'
  187. argidx += 1
  188. elif argv[argidx] == '--remcons':
  189. remoteConArg = '-n'
  190. argidx += 1
  191. else:
  192. print 'mpdboot: unrecognized argument:', argv[argidx]
  193. usage()
  194. # Fix for tt#662, make sure the config file is available to avoid some very
  195. # confusing error messages. We don't actually need these values here.
  196. parmdb = MPDParmDB()
  197. parmdb.get_parms_from_rcfile(parmsToOverride={}, errIfMissingFile=1)
  198. if debug:
  199. print 'debug: starting'
  200. lines = []
  201. if totalnumToStart > 1:
  202. try:
  203. f = open(hostsFilename,'r')
  204. for line in f:
  205. lines.append(line)
  206. except:
  207. print 'unable to open (or read) hostsfile %s' % (hostsFilename)
  208. exit(-1)
  209. hostsAndInfo = [ {'host' : myHost, 'ncpus' : myNcpus, 'ifhn' : myIfhn} ]
  210. for line in lines:
  211. line = line.strip()
  212. if not line or line[0] == '#':
  213. continue
  214. splitLine = re.split(r'\s+',line)
  215. host = splitLine[0]
  216. ncpus = 1 # default
  217. if ':' in host:
  218. (host,ncpus) = host.split(':',1)
  219. ncpus = int(ncpus)
  220. ifhn = '' # default
  221. for kv in splitLine[1:]:
  222. (k,v) = kv.split('=',1)
  223. if k == 'ifhn':
  224. ifhn = v
  225. hostsAndInfo.append( {'host' : host, 'ncpus' : ncpus, 'ifhn' : ifhn} )
  226. cachedIPs = {}
  227. if oneMPDPerHost and totalnumToStart > 1:
  228. oldHostsAndInfo = hostsAndInfo[:]
  229. hostsAndInfo = []
  230. for hostAndInfo in oldHostsAndInfo:
  231. oldhost = hostAndInfo['host']
  232. try:
  233. ips = gethostbyname_ex(oldhost)[2] # may fail if invalid host
  234. except:
  235. print 'unable to obtain IP for host:', oldhost
  236. continue
  237. uips = {} # unique ips
  238. for ip in ips:
  239. uips[ip] = 1
  240. keep = 1
  241. for ip in uips.keys():
  242. if cachedIPs.has_key(ip):
  243. keep = 0
  244. break
  245. if keep:
  246. hostsAndInfo.append(hostAndInfo)
  247. cachedIPs.update(uips)
  248. if len(hostsAndInfo) < totalnumToStart: # one is local
  249. print 'totalnum=%d numhosts=%d' % (totalnumToStart,len(hostsAndInfo))
  250. print 'there are not enough hosts on which to start all processes'
  251. exit(-1)
  252. if chkupIndicator:
  253. hostsToCheck = [ hai['host'] for hai in hostsAndInfo[1:totalnumToStart] ]
  254. (upList,dnList) = chkupdn(hostsToCheck)
  255. if dnList:
  256. print "these hosts are down; exiting"
  257. print dnList
  258. exit(-1)
  259. print "there are %d hosts up (counting local)" % (len(upList)+1)
  260. if chkupIndicator == 2: # do the chkup and quit
  261. exit(0)
  262. try:
  263. # stop current (if any) mpds; ignore the output
  264. getoutput('%s/mpdallexit.py' % (fullDirName))
  265. if verbose or debug:
  266. print 'running mpdallexit on %s' % (myHost)
  267. except:
  268. pass
  269. if environ.has_key('MPD_TMPDIR'):
  270. tmpdir = environ['MPD_TMPDIR']
  271. else:
  272. tmpdir = ''
  273. if myIfhn:
  274. ifhn = '--ifhn=%s' % (myIfhn)
  275. else:
  276. ifhn = ''
  277. hostsAndInfo[0]['entry_host'] = ''
  278. hostsAndInfo[0]['entry_port'] = ''
  279. mpdArgs = '%s %s --ncpus=%d' % (localConArg,ifhn,myNcpus)
  280. if tmpdir:
  281. mpdArgs += ' --tmpdir=%s' % (tmpdir)
  282. (mpdPID,mpdFD) = launch_one_mpd(0,0,mpdArgs,hostsAndInfo)
  283. fd2idx = {mpdFD : 0}
  284. handle_mpd_output(mpdFD,fd2idx,hostsAndInfo)
  285. try:
  286. from os import sysconf
  287. maxfds = sysconf('SC_OPEN_MAX')
  288. except:
  289. maxfds = 1024
  290. maxAtOnce = min(128,maxfds-8) # -8 for stdeout, etc. + a few more for padding
  291. hostsSeen = { myHost : 1 }
  292. fdsToSelect = []
  293. numStarted = 1 # local already going
  294. numStarting = 0
  295. numUnderCurrRoot = 0
  296. possRoots = []
  297. currRoot = 0
  298. idxToStart = 1 # local mpd already going
  299. while numStarted < totalnumToStart:
  300. if numStarting < maxAtOnce and idxToStart < totalnumToStart:
  301. if numUnderCurrRoot < maxUnderOneRoot:
  302. entryHost = hostsAndInfo[currRoot]['host']
  303. entryPort = hostsAndInfo[currRoot]['list_port']
  304. hostsAndInfo[idxToStart]['entry_host'] = entryHost
  305. hostsAndInfo[idxToStart]['entry_port'] = entryPort
  306. if hostsSeen.has_key(hostsAndInfo[idxToStart]['host']):
  307. remoteConArg = '-n'
  308. myNcpus = hostsAndInfo[idxToStart]['ncpus']
  309. ifhn = hostsAndInfo[idxToStart]['ifhn']
  310. if ifhn:
  311. ifhn = '--ifhn=%s' % (ifhn)
  312. mpdArgs = '%s -h %s -p %s %s --ncpus=%d' % (remoteConArg,entryHost,entryPort,ifhn,myNcpus)
  313. if tmpdir:
  314. mpdArgs += ' --tmpdir=%s' % (tmpdir)
  315. (mpdPID,mpdFD) = launch_one_mpd(idxToStart,currRoot,mpdArgs,hostsAndInfo)
  316. numStarting += 1
  317. numUnderCurrRoot += 1
  318. hostsAndInfo[idxToStart]['pid'] = mpdPID
  319. hostsSeen[hostsAndInfo[idxToStart]['host']] = 1
  320. fd2idx[mpdFD] = idxToStart
  321. fdsToSelect.append(mpdFD)
  322. idxToStart += 1
  323. else:
  324. if possRoots:
  325. currRoot = possRoots.pop()
  326. numUnderCurrRoot = 0
  327. selectTime = 0.01
  328. else:
  329. selectTime = 0.1
  330. try:
  331. (readyFDs,unused1,unused2) = select(fdsToSelect,[],[],selectTime)
  332. except error, errmsg:
  333. mpd_print(1,'mpdboot: select failed: errmsg=:%s:' % (errmsg) )
  334. exit(-1)
  335. for fd in readyFDs:
  336. handle_mpd_output(fd,fd2idx,hostsAndInfo)
  337. numStarted += 1
  338. numStarting -= 1
  339. possRoots.append(fd2idx[fd])
  340. fdsToSelect.remove(fd)
  341. fd.close()
  342. def launch_one_mpd(idxToStart,currRoot,mpdArgs,hostsAndInfo):
  343. global myHost, fullDirName, rshCmd, user, mpdCmd, debug, verbose
  344. mpdHost = hostsAndInfo[idxToStart]['host']
  345. if idxToStart == 0:
  346. cmd = '%s %s -e -d' % (mpdCmd,mpdArgs)
  347. else:
  348. if rshCmd == 'ssh':
  349. rshArgs = '-x -n -q'
  350. else:
  351. rshArgs = '-n'
  352. mpdHost = hostsAndInfo[idxToStart]['host']
  353. cmd = "%s %s %s '%s %s -e -d' " % \
  354. (rshCmd,rshArgs,mpdHost,mpdCmd,mpdArgs)
  355. if verbose:
  356. entryHost = hostsAndInfo[idxToStart]['entry_host']
  357. entryPort = hostsAndInfo[idxToStart]['entry_port']
  358. # print "LAUNCHED mpd on %s via %s %s" % (mpdHost,entryHost,str(entryPort))
  359. print "LAUNCHED mpd on %s via %s" % (mpdHost,entryHost)
  360. if debug:
  361. print "debug: launch cmd=", cmd
  362. mpd = Popen4(cmd,0)
  363. mpdFD = mpd.fromchild
  364. mpdPID = mpd.pid
  365. return (mpdPID,mpdFD)
  366. def handle_mpd_output(fd,fd2idx,hostsAndInfo):
  367. global myHost, fullDirName, rshCmd, user, mpdCmd, debug, verbose
  368. idx = fd2idx[fd]
  369. host = hostsAndInfo[idx]['host']
  370. # port = fd.readline().strip()
  371. port = 'no_port'
  372. for line in fd.readlines(): # handle output from shells that echo stuff
  373. line = line.strip()
  374. splitLine = line.split('=')
  375. if splitLine[0] == 'mpd_port':
  376. port = splitLine[1]
  377. break
  378. if debug:
  379. print "debug: mpd on %s on port %s" % (host,port)
  380. if port.isdigit():
  381. hostsAndInfo[idx]['list_port'] = int(port)
  382. tempSock = MPDSock(name='temp_to_mpd')
  383. try:
  384. tempSock.connect((host,int(port)))
  385. except:
  386. tempSock.close()
  387. tempSock = 0
  388. if tempSock:
  389. msgToSend = { 'cmd' : 'ping', 'ifhn' : 'dummy', 'port' : 0}
  390. tempSock.send_dict_msg(msgToSend)
  391. msg = tempSock.recv_dict_msg() # RMB: WITH TIMEOUT ??
  392. if not msg or not msg.has_key('cmd') or msg['cmd'] != 'challenge':
  393. mpd_print(1,'failed to handshake with mpd on %s; recvd output=%s' % \
  394. (host,msg) )
  395. tempOut = tempSock.recv(1000)
  396. print tempOut
  397. try: getoutput('%s/mpdallexit.py' % (fullDirName))
  398. except: pass
  399. exit(-1)
  400. tempSock.close()
  401. else:
  402. mpd_print(1,'failed to connect to mpd on %s' % (host) )
  403. try: getoutput('%s/mpdallexit.py' % (fullDirName))
  404. except: pass
  405. exit(-1)
  406. else:
  407. mpd_print(1,'from mpd on %s, invalid port info:' % (host) )
  408. print port
  409. print fd.read()
  410. try: getoutput('%s/mpdallexit.py' % (fullDirName))
  411. except: pass
  412. exit(-1)
  413. if verbose:
  414. print "RUNNING: mpd on", hostsAndInfo[fd2idx[fd]]['host']
  415. if debug:
  416. print "debug: info for running mpd:", hostsAndInfo[fd2idx[fd]]
  417. def chkupdn(hostList):
  418. upList = []
  419. dnList = []
  420. for hostname in hostList:
  421. print 'checking', hostname
  422. if rshCmd == 'ssh':
  423. rshArgs = '-x -n'
  424. else:
  425. rshArgs = '-n'
  426. cmd = "%s %s %s /bin/echo hello" % (rshCmd,rshArgs,hostname)
  427. runner = Popen3(cmd,1,0)
  428. runout = runner.fromchild
  429. runerr = runner.childerr
  430. runin = runner.tochild
  431. runpid = runner.pid
  432. up = 0
  433. try:
  434. # (readyFDs,unused1,unused2) = select([runout,runerr],[],[],9)
  435. (readyFDs,unused1,unused2) = select([runout],[],[],9)
  436. except:
  437. print 'select failed'
  438. readyFDs = []
  439. for fd in readyFDs: # may have runout and runerr sometimes
  440. line = fd.readline()
  441. if line and line.startswith('hello'):
  442. up = 1
  443. else:
  444. pass
  445. if up:
  446. upList.append(hostname)
  447. else:
  448. dnList.append(hostname)
  449. try:
  450. kill(runpid,SIGKILL)
  451. except:
  452. pass
  453. return(upList,dnList)
  454. def usage():
  455. print __doc__
  456. stdout.flush()
  457. exit(-1)
# Run the boot sequence only when this file is executed as a script.
if __name__ == '__main__':
    mpdboot()