PageRenderTime 34ms CodeModel.GetById 37ms RepoModel.GetById 0ms app.codeStats 0ms

/src/python/WMCore/BossAir/Plugins/LsfPlugin.py

https://github.com/PerilousApricot/WMCore
Python | 319 lines | 176 code | 71 blank | 72 comment | 34 complexity | 0b8f354f7a220a96e9e35c7854a7da20 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. _LsfPlugin_
  4. Plugin to support use of LSF queues (at CERN)
Does not support rerunnable jobs, i.e. jobs that are
automatically requeued by LSF with a different job id
  7. """
  8. import os
  9. import re
  10. import errno
  11. import time
  12. import datetime
  13. import socket
  14. import logging
  15. import subprocess
  16. from WMCore.WMInit import getWMBASE
  17. from WMCore.BossAir.Plugins.BasePlugin import BasePlugin, BossAirPluginException
  18. from WMCore.FwkJobReport.Report import Report
  19. class LsfPlugin(BasePlugin):
  20. """
  21. _LsfPlugin_
  22. """
  23. @staticmethod
  24. def stateMap():
  25. """
  26. For a given name, return a global state
  27. """
  28. stateDict = {'New': 'Pending',
  29. 'PEND': 'Pending',
  30. 'PSUSP': 'Pending',
  31. 'WAIT': 'Pending',
  32. 'RUN': 'Running',
  33. 'USUSP': 'Running',
  34. 'SSUSP': 'Running',
  35. 'DONE': 'Complete',
  36. 'EXIT': 'Error',
  37. 'UNKWN': 'Error',
  38. 'ZOMBI': 'Error',
  39. 'Timeout' : 'Error'}
  40. return stateDict
  41. def __init__(self, config):
  42. self.config = config
  43. BasePlugin.__init__(self, config)
  44. self.packageDir = None
  45. self.unpacker = os.path.join(getWMBASE(),
  46. 'WMCore/WMRuntime/Unpacker.py')
  47. self.agent = config.Agent.agentName
  48. self.sandbox = None
  49. self.scriptFile = None
  50. self.queue = None
  51. self.resourceReq = None
  52. self.jobGroup = None
  53. return
  54. def submit(self, jobs, info = None):
  55. """
  56. _submit_
  57. Submit jobs for one subscription
  58. """
  59. # If we're here, then we have submitter components
  60. self.scriptFile = self.config.JobSubmitter.submitScript
  61. self.queue = self.config.JobSubmitter.LsfPluginQueue
  62. self.resourceReq = getattr(self.config.JobSubmitter, 'LsfPluginResourceReq', None)
  63. self.jobGroup = self.config.JobSubmitter.LsfPluginJobGroup
  64. self.batchOutput = getattr(self.config.JobSubmitter, 'LsfPluginBatchOutput', None)
  65. successfulJobs = []
  66. failedJobs = []
  67. if len(jobs) == 0:
  68. # Then we have nothing to do
  69. return successfulJobs, failedJobs
  70. # Now assume that what we get is the following; a mostly
  71. # unordered list of jobs with random sandboxes.
  72. # We intend to sort them by sandbox.
  73. submitDict = {}
  74. for job in jobs:
  75. sandbox = job['sandbox']
  76. if not sandbox in submitDict.keys():
  77. submitDict[sandbox] = []
  78. submitDict[sandbox].append(job)
  79. # Now submit the bastards
  80. for sandbox in submitDict.keys():
  81. jobList = submitDict.get(sandbox, [])
  82. while len(jobList) > 0:
  83. jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
  84. jobList = jobList[self.config.JobSubmitter.jobsPerWorker:]
  85. for job in jobsReady:
  86. if job == {}:
  87. # Then I don't know how we got here either
  88. logging.error("Was passed a nonexistant job. Ignoring")
  89. continue
  90. submitScript = self.makeSubmit(job)
  91. if not submitScript:
  92. # Then we got nothing
  93. logging.error("No submit script made!")
  94. return {'NoResult': [0]}
  95. submitScriptFile = os.path.join(job['cache_dir'], "submit.sh")
  96. handle = open(submitScriptFile, 'w')
  97. handle.writelines(submitScript)
  98. handle.close()
  99. # make reasonable job name
  100. jobName = "WMAgentJob"
  101. regExpParser = re.compile('.*/JobCreator/JobCache/([^/]+)/[^/]+/.*')
  102. match = regExpParser.match(job['cache_dir'])
  103. if ( match != None ):
  104. jobName = "%s-%s" % (match.group(1), job['id'])
  105. # //
  106. # // Submit LSF job
  107. # //
  108. command = 'bsub'
  109. command += ' -q %s' % self.queue
  110. if self.resourceReq != None:
  111. command += ' -R "%s"' % self.resourceReq
  112. command += ' -g %s' % self.jobGroup
  113. command += ' -J %s' % jobName
  114. lsfLogDir = self.batchOutput
  115. if lsfLogDir != None:
  116. now = datetime.datetime.today()
  117. lsfLogDir += '/%s' % now.strftime("%Y%m%d%H")
  118. try:
  119. os.mkdir(lsfLogDir)
  120. logging.debug("Created directory %s" % lsfLogDir)
  121. except OSError, err:
  122. # suppress LSF log unless it's about an already exisiting directory
  123. if err.errno != errno.EEXIST or not os.path.isdir(lsfLogDir):
  124. logging.error("Can't create directory %s, turning off LSF log" % lsfLogDir)
  125. lsfLogDir = None
  126. if lsfLogDir == None:
  127. command += ' -oo /dev/null'
  128. else:
  129. command += ' -oo %s/%s.%%J.out' % (lsfLogDir, jobName)
  130. command += ' < %s' % submitScriptFile
  131. logging.info("Submitting LSF job: %s" % command)
  132. p = subprocess.Popen(command, shell = True,
  133. stdout = subprocess.PIPE,
  134. stderr = subprocess.STDOUT)
  135. stdout = p.communicate()[0]
  136. returncode = p.returncode
  137. if returncode == 0:
  138. # check for correct naming convention in PFN
  139. regExpParser = re.compile('Job <([0-9]+)> is submitted to queue')
  140. match = regExpParser.match(stdout)
  141. if match != None:
  142. job['gridid'] = match.group(1)
  143. successfulJobs.append(job)
  144. logging.info("LSF Job ID : %s" % job['gridid'] )
  145. continue
  146. else:
  147. logging.error("bsub didn't return a valid Job ID. Job is not submitted")
  148. logging.error(stdout)
  149. lsfErrorReport = Report()
  150. lsfErrorReport.addError("JobSubmit", 61202, "LsfError", stdout)
  151. job['fwjr'] = lsfErrorReport
  152. failedJobs.append(job)
  153. # We must return a list of jobs successfully submitted,
  154. # and a list of jobs failed
  155. return successfulJobs, failedJobs
  156. def track(self, jobs, info = None):
  157. """
  158. _track_
  159. Track the jobs while in condor
  160. This returns a three-way ntuple
  161. First, the total number of jobs still running
  162. Second, the jobs that need to be changed
  163. Third, the jobs that need to be completed
  164. """
  165. # If we're here, then we have submitter components
  166. self.jobGroup = self.config.JobSubmitter.LsfPluginJobGroup
  167. changeList = []
  168. completeList = []
  169. runningList = []
  170. # get info about all active and recent jobs
  171. command = 'bjobs -a -w'
  172. command += ' -g %s' % self.jobGroup
  173. p = subprocess.Popen(command, shell = True,
  174. stdout = subprocess.PIPE,
  175. stderr = subprocess.PIPE)
  176. stdout = p.communicate()[0]
  177. returncode = p.returncode
  178. if returncode == 0:
  179. jobInfo = {}
  180. for line in stdout.splitlines(False)[1:]:
  181. # take line apart into elements
  182. linelist = line.rstrip().split()
  183. # dict with LSF jobid as key and LSF jobs status as value
  184. jobInfo[linelist[0]] = linelist[2]
  185. # now go over the jobs and see what we have
  186. for job in jobs:
  187. # if LSF doesn't know anything about the job, mark it complete
  188. if not jobInfo.has_key(job['gridid']):
  189. completeList.append(job)
  190. # otherwise act on LSF job status
  191. else:
  192. newStatus = jobInfo[job['gridid']]
  193. # track status changes
  194. if newStatus != job['status']:
  195. job['status'] = newStatus
  196. job['status_time'] = int(time.time())
  197. changeList.append(job)
  198. job['globalState'] = LsfPlugin.stateMap()[newStatus]
  199. # stop tracking finished jobs
  200. if job['globalState'] in [ 'Complete', 'Error' ]:
  201. completeList.append(job)
  202. else:
  203. runningList.append(job)
  204. return runningList, changeList, completeList
  205. def kill(self, jobs, info = None):
  206. """
  207. Kill a list of jobs based on their LSF jobid
  208. """
  209. for job in jobs:
  210. command = "bkill %s\n" % job['gridid']
  211. p = subprocess.Popen(command, shell = True,
  212. stdout = subprocess.PIPE,
  213. stderr = subprocess.STDOUT)
  214. p.communicate()
  215. return
  216. def makeSubmit(self, job):
  217. """
  218. _makeSubmit_
  219. For a given job make a shell script to submit the job
  220. """
  221. script = ["#!/bin/sh\n"]
  222. # needed to construct rfio URL to access head node
  223. hostname = socket.getfqdn()
  224. # files needed to copied from head node to WN
  225. jobInputFiles = [ job['sandbox'],
  226. "%s/JobPackage.pkl" % job['packageDir'],
  227. self.unpacker,
  228. self.scriptFile ]
  229. for filename in jobInputFiles:
  230. script.append("rfcp %s:%s .\n" % (hostname, filename))
  231. script.append("bash %s %s %s\n" % (os.path.basename(self.scriptFile),
  232. os.path.basename(job['sandbox']),
  233. job['id']))
  234. script.append("rfcp Report.%i.pkl %s:%s/\n" % (job["retry_count"], hostname, job['cache_dir']))
  235. # get back a lot of debug information to the head node
  236. #script.append("find . -type f -name '*.log' -exec rfcp {} %s:%s/ \;\n" % (hostname, job['cache_dir']))
  237. return script