/util/batch/send.py
Python | 306 lines | 270 code | 8 blank | 28 comment | 17 complexity | 3eaa39db036c5aa4d89d73c07cc0d424 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1
- #!/usr/bin/env python
- # Copyright (c) 2006 The Regents of The University of Michigan
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are
- # met: redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer;
- # redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution;
- # neither the name of the copyright holders nor the names of its
- # contributors may be used to endorse or promote products derived from
- # this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # Authors: Kevin Lim
- import os, os.path, re, socket, sys
- from os import environ as env, listdir
- from os.path import basename, isdir, isfile, islink, join as joinpath, normpath
- from filecmp import cmp as filecmp
- from shutil import copy
def nfspath(dir):
    """Return ``dir`` rewritten as a path under the ``/n/`` tree.

    * ``/.automount/<rest>`` becomes ``/n/<rest>``.
    * A path that already starts with ``/n/`` is returned unchanged.
    * Any other path is prefixed with ``/n/<host>``, where ``<host>`` is
      the first dot-separated component of this machine's hostname.
    """
    automount = '/.automount/'
    if dir.startswith(automount):
        return '/n/%s' % dir[len(automount):]
    if dir.startswith('/n/'):
        return dir
    shorthost = socket.gethostname().split('.')[0]
    return '/n/%s%s' % (shorthost, dir)
def syncdir(srcdir, destdir):
    """Copy new or changed files from ``srcdir`` into an existing ``destdir``.

    Walks ``srcdir`` top-down.  For every directory visited, missing
    sub-directories are created under ``destdir`` and each file is copied
    when it is absent from the destination or ``filecmp.cmp`` reports that
    it differs from the source.  Directories whose name starts with '.'
    and directories named 'SCCS' are skipped together with everything
    below them.  Nothing is ever removed from ``destdir``.

    Every directory created and every file copied is reported on stdout
    as ``mkdir <dir>`` / ``copy <src> <dest>``.

    Exits through ``sys.exit`` with a message when ``destdir`` is not an
    existing directory.
    """
    srcdir = normpath(srcdir)
    destdir = normpath(destdir)
    if not isdir(destdir):
        sys.exit('destination directory "%s" does not exist' % destdir)

    for root, dirs, files in os.walk(srcdir):
        # Reduce root to a path relative to srcdir ('' for srcdir itself).
        root = normpath(root)
        prefix = os.path.commonprefix([root, srcdir])
        root = root[len(prefix):]
        if root.startswith('/'):
            root = root[1:]

        # Prune in place so os.walk does not descend into skipped dirs.
        dirs[:] = [d for d in dirs
                   if not (d.startswith('.') or d == 'SCCS')]

        for entry in dirs:
            newdir = joinpath(destdir, root, entry)
            if not isdir(newdir):
                os.mkdir(newdir)
                print('mkdir %s' % newdir)

        # os.walk does not descend into symlinked directories by default.
        # Renaming the entry to '<name>/.' makes the walk's own link check
        # see a plain directory, so the link target is traversed as well.
        for i, d in enumerate(dirs):
            if islink(joinpath(srcdir, root, d)):
                dirs[i] = joinpath(d, '.')

        for entry in files:
            dest = normpath(joinpath(destdir, root, entry))
            src = normpath(joinpath(srcdir, root, entry))
            if not isfile(dest) or not filecmp(src, dest):
                # Report in the same order as the copy itself: src, dest.
                print('copy %s %s' % (src, dest))
                copy(src, dest)
# Directory holding this script, rewritten by nfspath() into /n/... form.
progpath = nfspath(sys.path[0])
progname = basename(sys.argv[0])

# Help text printed for -h and on any option-parsing error.  It lists every
# option letter accepted by the getopt call below, including -n, -R and -t.
usage = """\
Usage:
    %(progname)s [-c] [-C] [-d] [-e] [-f] [-j <jobfile>] [-n] [-q <queue>]
        [-R] [-t <type>] [-v] <regexp>
    -c           clean directory if job can be run
    -C           submit the checkpointing runs
    -d           Make jobs be dependent on the completion of the checkpoint runs
    -e           only echo pbs command info, don't actually send the job
    -f           force the job to run regardless of state
    -n           don't sync the link directory into the base directory first
    -q <queue>   submit job to the named queue
    -j <jobfile> specify the jobfile (default is <rootdir>/Test.py)
    -R           with -C, also submit the regular runs
    -t <type>    set the node type (default is FAST)
    -v           be verbose

    %(progname)s [-j <jobfile>] -l [-v] <regexp>
    -j <jobfile> specify the jobfile (default is <rootdir>/Test.py)
    -l           list job names, don't submit
    -v           be verbose (list job parameters)

    %(progname)s -h
    -h           display this help
""" % {'progname': progname}
# Parse the command line.  An unrecognised option prints the usage text
# and aborts.
import getopt
try:
    opts, args = getopt.getopt(sys.argv[1:], '-Ccdefhj:lnq:Rt:v')
except getopt.GetoptError:
    sys.exit(usage)

# Defaults for everything an option can change.
clean = False         # -c
docpts = False        # -C
depend = False        # -d
onlyecho = False      # -e
force = False         # -f
jfile = 'Test.py'     # -j <jobfile>
listonly = False      # -l
update = True         # cleared by -n
queue = ''            # -q <queue>
runflag = False       # -R
node_type = 'FAST'    # -t <type>
verbose = False       # -v

for opt, arg in opts:
    if opt == '-h':
        print(usage)
        sys.exit(0)
    elif opt == '-C':
        docpts = True
    elif opt == '-c':
        clean = True
    elif opt == '-d':
        depend = True
    elif opt == '-e':
        onlyecho = True
    elif opt == '-f':
        force = True
    elif opt == '-j':
        jfile = arg
    elif opt == '-l':
        listonly = True
    elif opt == '-n':
        update = False
    elif opt == '-q':
        queue = arg
    elif opt == '-R':
        runflag = True
    elif opt == '-t':
        node_type = arg
    elif opt == '-v':
        verbose = True

# Regular runs are on by default; once -C is given they need -R as well.
doruns = runflag if docpts else True

# Every remaining argument is a regular expression matched against job names.
exprs = [re.compile(pattern) for pattern in args]
# Project modules are imported only after option parsing has succeeded.
import jobfile
import batch
from job import JobDir, date

conf = jobfile.JobFile(jfile)

# Bring the base directory up to date with the link directory, unless
# disabled (-n) or this run only lists (-l) or echoes (-e).
if update and not (listonly or onlyecho) and isdir(conf.linkdir):
    if verbose:
        print('Checking for outdated files in Link directory')
    if not isdir(conf.basedir):
        os.mkdir(conf.basedir)
    syncdir(conf.linkdir, conf.basedir)
# Build joblist: the jobs from the job file that this run will act on.
jobnames = {}   # job name -> first job seen with that name
joblist = []

# Pick the job source: checkpoint runs, regular runs, or both.
if docpts and doruns:
    gen = conf.alljobs()
elif docpts:
    gen = conf.checkpoints()
elif doruns:
    gen = conf.jobs()

for job in gen:
    # Each job name maps to one job directory, so act on a name only once.
    # The name is recorded here; without that the check could never fire.
    if job.name in jobnames:
        continue
    jobnames[job.name] = job

    # With no regexps every job is selected; otherwise the name must match
    # at least one of them (re.match, i.e. anchored at the start).
    if not exprs or any(expr.match(job.name) for expr in exprs):
        joblist.append(job)
# -l: show the selected jobs and stop without submitting anything.
if listonly:
    for job in joblist:
        if verbose:
            job.printinfo()
        else:
            print(job.name)
    sys.exit(0)
# Unless only echoing, drop jobs that should not be resubmitted and make
# sure the job directory of every remaining job is clean.
if not onlyecho:
    newlist = []
    for job in joblist:
        jobdir = JobDir(joinpath(conf.rootdir, job.name))
        if jobdir.exists():
            if not force:
                # Without -f, skip jobs that are queued, running or done.
                status = jobdir.getstatus()
                if status in ('queued', 'running', 'success'):
                    continue

            # An existing directory is only reused when -c allows cleaning.
            if not clean:
                sys.exit('job directory %s not clean!' % jobdir)

            jobdir.clean()
        newlist.append(job)
    joblist = newlist
class NameHack(object):
    """Send "<jobid> <jobname>" pairs for long job names to a TCP service.

    The connection to ``host``:``port`` is opened on first use and then
    reused for every later call.
    """

    def __init__(self, host='pbs.pool', port=24465):
        self.host = host
        self.port = port
        # Connected lazily by setname(); None until a long name is sent.
        self.socket = None

    def setname(self, jobid, jobname):
        """Report ``jobname`` for ``jobid`` if the name exceeds 15 characters.

        ``jobid`` may be an integer or a string such as ``'1234.server'``;
        only the leading number is used.  Names of 15 characters or fewer
        (after stripping whitespace) are ignored and nothing is sent.

        Raises ValueError if no integer id can be extracted, and whatever
        socket error occurs while connecting or sending.
        """
        try:
            jobid = int(jobid)
        except ValueError:
            # Ids of the form '<number>.<server>': keep the number only.
            jobid = int(jobid.strip().split('.')[0])

        jobname = jobname.strip()
        # since pbs can handle jobnames of 15 characters or less,
        # don't use the raj hack.
        if len(jobname) <= 15:
            return

        if self.socket is None:
            import socket
            # Connect to pbs.pool and send the jobid/jobname pair to port
            # 24465 (Raj didn't realize that there are only 64k ports and
            # setup inetd to point to port 90001, which wraps to 24465)
            conn = socket.socket()
            try:
                conn.connect((self.host, self.port))
            except Exception:
                # Never cache a socket that failed to connect; otherwise
                # later calls would skip the connect and fail on send.
                conn.close()
                raise
            self.socket = conn

        message = "%s %s\n" % (jobid, jobname)
        if not isinstance(message, bytes):
            # Sockets carry bytes; text strings must be encoded first.
            message = message.encode('utf-8')
        # sendall() keeps writing until the whole message is out, whereas
        # send() may stop after a partial write.
        self.socket.sendall(message)
# Submit (or, with -e, only echo) every job that is left in joblist.
namehack = NameHack()

rootdir = conf.rootdir
script = joinpath(rootdir, 'Base', 'job.py')

for job in joblist:
    jobdir = JobDir(joinpath(rootdir, job.name))

    if depend:
        # -d: only handle jobs whose checkpoint run directory exists and
        # contains a '.success' marker file.
        cptdir = JobDir(joinpath(rootdir, job.checkpoint.name))
        path = str(cptdir)
        if not (isdir(path) and isfile(joinpath(path, '.success'))):
            continue

        # Only consumed by the disabled 'afterok' setting further down.
        cptjob = cptdir.readval('.batch_jobid')

    if not onlyecho:
        # Create the job directory and make it the working directory.
        jobdir.create()
        os.chdir(str(jobdir))
        os.environ['PWD'] = str(jobdir)

    print('Job name: %s' % job.name)
    print('Job directory: %s' % jobdir)

    qsub = batch.oarsub()
    qsub.oarhost = 'poolfs.eecs.umich.edu'
    #qsub.stdout = jobdir.file('jobout')
    qsub.name = job.name
    qsub.walltime = '50'
    #qsub.join = True
    #qsub.node_type = node_type
    #qsub.env['ROOTDIR'] = conf.rootdir
    #qsub.env['JOBNAME'] = job.name
    #if depend:
    #    qsub.afterok = cptjob
    #if queue:
    #    qsub.queue = queue
    qsub.properties = "64bit = 'Yes' or 64bit = 'No'"
    qsub.build(script)

    if verbose:
        # NOTE(review): the 'cwd' line prints qsub.command, the same value
        # as the line after it -- confirm what was meant to be shown.
        print('cwd: %s' % qsub.command)
        print('PBS Command: %s' % qsub.command)

    if not onlyecho:
        ec = qsub.do()
        if ec != 0:
            print('OAR Failed')
        else:
            jobid = qsub.result
            print('OAR Jobid: %s' % jobid)
            #namehack.setname(jobid, job.name)
            # Record the submission in the job directory.
            queued = date()
            jobdir.echofile('.batch_jobid', jobid)
            jobdir.echofile('.batch_jobname', job.name)
            jobdir.echofile('.queued', queued)
            jobdir.setstatus('queued on %s' % queued)

    # Two blank lines separate the output of consecutive jobs.
    print('')
    print('')