/lib/galaxy/jobs/runners/drmaa.py

Source: https://bitbucket.org/cistrome/cistrome-harvard/

  1. """
  2. Job control via the DRMAA API.
  3. """
  4. import json
  5. import logging
  6. import os
  7. import string
  8. import subprocess
  9. import sys
  10. import time
  11. from galaxy import eggs
  12. from galaxy import model
  13. from galaxy.jobs import JobDestination
  14. from galaxy.jobs.runners import AsynchronousJobState, AsynchronousJobRunner
  15. eggs.require( "drmaa" )
  16. log = logging.getLogger( __name__ )
  17. __all__ = [ 'DRMAAJobRunner' ]
  18. drmaa = None
  19. DRMAA_jobTemplate_attributes = [ 'args', 'remoteCommand', 'outputPath', 'errorPath', 'nativeSpecification',
  20. 'jobName', 'email', 'project' ]
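# A sketch of a job_conf.xml stanza that loads this runner (the IDs and
# parameter values below are illustrative, not defaults):
#
#   <plugin id="drmaa" type="runner" load="galaxy.jobs.runners.drmaa:DRMAAJobRunner">
#       <param id="invalidjobexception_retries">5</param>
#   </plugin>
#   ...
#   <destination id="cluster_default" runner="drmaa">
#       <param id="nativeSpecification">-q all.q -l h_vmem=4G</param>
#   </destination>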


class DRMAAJobRunner( AsynchronousJobRunner ):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling.
    """
    runner_name = "DRMAARunner"

    def __init__( self, app, nworkers, **kwargs ):
        """Start the job runner"""
        global drmaa
        # retry counts must parse as non-negative integers
        runner_param_specs = dict(
            drmaa_library_path=dict( map=str, default=os.environ.get( 'DRMAA_LIBRARY_PATH', None ) ),
            invalidjobexception_state=dict( map=str, valid=lambda x: x in ( model.Job.states.OK, model.Job.states.ERROR ), default=model.Job.states.OK ),
            invalidjobexception_retries=dict( map=int, valid=lambda x: int( x ) >= 0, default=0 ),
            internalexception_state=dict( map=str, valid=lambda x: x in ( model.Job.states.OK, model.Job.states.ERROR ), default=model.Job.states.OK ),
            internalexception_retries=dict( map=int, valid=lambda x: int( x ) >= 0, default=0 ) )
        if 'runner_param_specs' not in kwargs:
            kwargs[ 'runner_param_specs' ] = dict()
        kwargs[ 'runner_param_specs' ].update( runner_param_specs )
        super( DRMAAJobRunner, self ).__init__( app, nworkers, **kwargs )
        # This allows multiple drmaa runners (although only one per handler) in the same job config file
        if 'drmaa_library_path' in kwargs:
            log.info( 'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path )
            os.environ[ 'DRMAA_LIBRARY_PATH' ] = self.runner_params.drmaa_library_path
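            # e.g. (cluster-specific, illustrative value):
            #   DRMAA_LIBRARY_PATH=/usr/lib/gridengine-drmaa/lib/libdrmaa.so.1.0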
        # We foolishly named this file the same as the name exported by the drmaa
        # library... 'import drmaa' imports itself.
        drmaa = __import__( "drmaa" )
        # Subclasses may need access to state constants
        self.drmaa_job_states = drmaa.JobState
        # Descriptive state strings pulled from the drmaa lib itself
        self.drmaa_job_state_strings = {
            drmaa.JobState.UNDETERMINED: 'process status cannot be determined',
            drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active',
            drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold',
            drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold',
            drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
            drmaa.JobState.RUNNING: 'job is running',
            drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended',
            drmaa.JobState.USER_SUSPENDED: 'job is user suspended',
            drmaa.JobState.DONE: 'job finished normally',
            drmaa.JobState.FAILED: 'job finished, but failed',
        }
        self.ds = drmaa.Session()
        self.ds.initialize()
        # external_runJob_script can be None, in which case it's not used.
        self.external_runJob_script = app.config.drmaa_external_runjob_script
        self.external_killJob_script = app.config.drmaa_external_killjob_script
        self.userid = None
        self._init_monitor_thread()
        self._init_worker_threads()

    def url_to_destination( self, url ):
        """Convert a legacy URL to a job destination"""
        if not url:
            return
        native_spec = url.split('/')[2]
        if native_spec:
            params = dict( nativeSpecification=native_spec )
            log.debug( "Converted URL '%s' to destination runner=drmaa, params=%s" % ( url, params ) )
            return JobDestination( runner='drmaa', params=params )
        else:
            log.debug( "Converted URL '%s' to destination runner=drmaa" % url )
            return JobDestination( runner='drmaa' )
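        # For example (hypothetical legacy URL): 'drmaa://-q long.q -l h_rt=01:00:00/'
        # becomes JobDestination( runner='drmaa',
        #                         params={ 'nativeSpecification': '-q long.q -l h_rt=01:00:00' } ),
        # since url.split('/')[2] is the segment between the second and third slashes.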

    def get_native_spec( self, url ):
        """Get any native DRM arguments specified by the site configuration"""
        try:
            return url.split('/')[2] or None
        except Exception:
            return None

    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=True ):
            return

        # command line has been added to the wrapper by prepare_job()
        command_line = job_wrapper.runner_command_line

        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if self.external_runJob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join( map( lambda x: x if x in ( string.letters + string.digits + '_' ) else '_', job_name ) )
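        # e.g. (illustrative) 'g123_cat1_user@example.org' is sanitized to
        # 'g123_cat1_user_example_org', since DRMs typically restrict job names
        # to word characters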
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name )

        # set up the drmaa job template
        jt = self.ds.createJobTemplate()
        jt.remoteCommand = ajs.job_file
        jt.jobName = ajs.job_name
        jt.outputPath = ":%s" % ajs.output_file
        jt.errorPath = ":%s" % ajs.error_file
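        # DRMAA output/error paths take the '[hostname]:file_path' form; the
        # leading ':' with an empty hostname means the path is resolved on the
        # execution host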
        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get( 'nativeSpecification', None )
        if native_spec is not None:
            jt.nativeSpecification = native_spec

        # fill in the DRM's job run template
        script = self.get_job_file( job_wrapper, exit_code_path=ajs.exit_code_file )
        try:
            fh = open( ajs.job_file, "w" )
            fh.write( script )
            fh.close()
            os.chmod( ajs.job_file, 0755 )
        except Exception:
            job_wrapper.fail( "failure preparing job script", exception=True )
            log.exception( "(%s) failure writing job script" % galaxy_id_tag )
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if self.app.config.cleanup_job in ( "always", "onsuccess" ):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file %s", galaxy_id_tag, ajs.job_file )
        log.debug( "(%s) command is: %s", galaxy_id_tag, command_line )
        if native_spec:
            log.debug( "(%s) native specification is: %s", galaxy_id_tag, native_spec )

        # runJob will raise if there's a submit problem
        if self.external_runJob_script is None:
            # TODO: create a queue for retrying submission indefinitely
            # TODO: configurable max tries and sleep
            trynum = 0
            external_job_id = None
            while external_job_id is None and trynum < 5:
                try:
                    external_job_id = self.ds.runJob( jt )
                except drmaa.InternalException as e:
                    trynum += 1
                    log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e )
                    time.sleep( 5 )
        else:
            job_wrapper.change_ownership_for_run()
            log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, job_wrapper.user_system_pwent[0], job_wrapper.user_system_pwent[2] ) )
            filename = self.store_jobtemplate( job_wrapper, jt )
            self.userid = job_wrapper.user_system_pwent[2]
            external_job_id = self.external_runjob( filename, job_wrapper.user_system_pwent[2] ).strip()
        log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # delete the job template
        self.ds.deleteJobTemplate( jt )

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )

    def _complete_terminal_job( self, ajs, drmaa_state, **kwargs ):
        """
        Handle a job upon its termination in the DRM. This method is meant to
        be overridden by subclasses to improve post-mortem and reporting of
        failures.
        """
        if drmaa_state == drmaa.JobState.FAILED:
            if ajs.job_wrapper.get_state() != model.Job.states.DELETED:
                ajs.stop_job = False
                ajs.fail_message = "The cluster DRM system terminated this job"
                self.work_queue.put( ( self.fail_job, ajs ) )
        elif drmaa_state == drmaa.JobState.DONE:
            super( DRMAAJobRunner, self )._complete_terminal_job( ajs )

    def check_watched_items( self ):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
        new_watched = []
        for ajs in self.watched:
            external_job_id = ajs.job_id
            galaxy_id_tag = ajs.job_wrapper.get_id_tag()
            old_state = ajs.old_state
            try:
                assert external_job_id not in ( None, 'None' ), '(%s/%s) Invalid job id' % ( galaxy_id_tag, external_job_id )
                state = self.ds.jobStatus( external_job_id )
            except ( drmaa.InternalException, drmaa.InvalidJobException ) as e:
                # each exception class has its own configurable retry count and
                # terminal state, keyed by the lowercased exception name
                if isinstance( e, drmaa.InvalidJobException ):
                    ecn = 'invalidjobexception'
                else:
                    ecn = 'internalexception'
                retry_param = ecn + '_retries'
                state_param = ecn + '_state'
                retries = getattr( ajs, retry_param, 0 )
                if self.runner_params[ retry_param ] > 0:
                    if retries < self.runner_params[ retry_param ]:
                        # will retry check on next iteration
                        setattr( ajs, retry_param, retries + 1 )
                        new_watched.append( ajs )
                        continue
                if self.runner_params[ state_param ] == model.Job.states.OK:
                    log.info( "(%s/%s) job left DRM queue with following message: %s", galaxy_id_tag, external_job_id, e )
                    self.work_queue.put( ( self.finish_job, ajs ) )
                elif self.runner_params[ state_param ] == model.Job.states.ERROR:
                    log.info( "(%s/%s) job check resulted in %s after %s tries: %s", galaxy_id_tag, external_job_id, ecn, retries, e )
                    self.work_queue.put( ( self.fail_job, ajs ) )
                else:
                    raise Exception( "%s is set to an invalid value (%s), this should not be possible. See galaxy.jobs.drmaa.__init__()" % ( state_param, self.runner_params[ state_param ] ) )
                continue
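                # e.g. (illustrative settings) with invalidjobexception_retries=5 and
                # invalidjobexception_state='error', five consecutive InvalidJobException
                # polls are tolerated before the job is put on the fail_job queue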
            except drmaa.DrmCommunicationException as e:
                log.warning( "(%s/%s) unable to communicate with DRM: %s", galaxy_id_tag, external_job_id, e )
                new_watched.append( ajs )
                continue
            except Exception as e:
                # so we don't kill the monitor thread
                log.exception( "(%s/%s) Unable to check job status: %s" % ( galaxy_id_tag, external_job_id, str( e ) ) )
                log.warning( "(%s/%s) job will now be errored" % ( galaxy_id_tag, external_job_id ) )
                ajs.fail_message = "Cluster could not complete job"
                self.work_queue.put( ( self.fail_job, ajs ) )
                continue
            if state != old_state:
                log.debug( "(%s/%s) state change: %s" % ( galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[ state ] ) )
            if state == drmaa.JobState.RUNNING and not ajs.running:
                ajs.running = True
                ajs.job_wrapper.change_state( model.Job.states.RUNNING )
            if state in ( drmaa.JobState.FAILED, drmaa.JobState.DONE ):
                self._complete_terminal_job( ajs, drmaa_state=state )
                continue
            ajs.old_state = state
            new_watched.append( ajs )
        # Replace the watch list with the updated version
        self.watched = new_watched

    def stop_job( self, job ):
        """Attempts to delete a job from the DRM queue"""
        try:
            ext_id = job.get_job_runner_external_id()
            assert ext_id not in ( None, 'None' ), 'External job id is None'
            if self.external_killJob_script is None:
                self.ds.control( ext_id, drmaa.JobControlAction.TERMINATE )
            else:
                # FIXME: hardcoded path
                subprocess.Popen( [ '/usr/bin/sudo', '-E', self.external_killJob_script, str( ext_id ), str( self.userid ) ], shell=False )
            log.debug( "(%s/%s) Removed from DRM queue at user's request" % ( job.get_id(), ext_id ) )
        except drmaa.InvalidJobException:
            log.debug( "(%s/%s) User killed running job, but it was already dead" % ( job.get_id(), ext_id ) )
        except Exception as e:
            log.debug( "(%s/%s) User killed running job, but error encountered removing from DRM queue: %s" % ( job.get_id(), ext_id, e ) )

    def recover( self, job, job_wrapper ):
        """Recovers jobs stuck in the queued/running state when Galaxy is restarted"""
        job_id = job.get_job_runner_external_id()
        if job_id is None:
            self.put( job_wrapper )
            return
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )
        ajs.job_id = str( job_id )
        ajs.command_line = job.get_command_line()
        ajs.job_wrapper = job_wrapper
        ajs.job_destination = job_wrapper.job_destination
        self.__old_state_paths( ajs )
        if job.get_state() == model.Job.states.RUNNING:
            log.debug( "(%s/%s) is still in running state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
            ajs.old_state = drmaa.JobState.RUNNING
            ajs.running = True
            self.monitor_queue.put( ajs )
        elif job.get_state() == model.Job.states.QUEUED:
            log.debug( "(%s/%s) is still in DRM queued state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
            ajs.old_state = drmaa.JobState.QUEUED_ACTIVE
            ajs.running = False
            self.monitor_queue.put( ajs )

    def __old_state_paths( self, ajs ):
        """For recovery of jobs started prior to standardizing the naming of
        files in the AsynchronousJobState object
        """
        if ajs.job_wrapper is not None:
            job_file = "%s/galaxy_%s.sh" % ( self.app.config.cluster_files_directory, ajs.job_wrapper.job_id )
            if not os.path.exists( ajs.job_file ) and os.path.exists( job_file ):
                ajs.output_file = "%s.drmout" % os.path.join( os.getcwd(), ajs.job_wrapper.working_directory, ajs.job_wrapper.get_id_tag() )
                ajs.error_file = "%s.drmerr" % os.path.join( os.getcwd(), ajs.job_wrapper.working_directory, ajs.job_wrapper.get_id_tag() )
                ajs.exit_code_file = "%s.drmec" % os.path.join( os.getcwd(), ajs.job_wrapper.working_directory, ajs.job_wrapper.get_id_tag() )
                ajs.job_file = job_file

    def store_jobtemplate( self, job_wrapper, jt ):
        """Stores the content of a DRMAA JobTemplate object in a file as a JSON string.
        The path is hard-coded, but it's no worse than the other paths in this module.
        Uses Galaxy's job ID, so the file is expected to be unique."""
        filename = "%s/%s.jt_json" % ( self.app.config.cluster_files_directory, job_wrapper.get_id_tag() )
        data = {}
        for attr in DRMAA_jobTemplate_attributes:
            try:
                data[ attr ] = getattr( jt, attr )
            except Exception:
                # not every attribute is set on every template
                pass
        s = json.dumps( data )
        f = open( filename, 'w' )
        f.write( s )
        f.close()
        log.debug( '(%s) Job script for external submission is: %s' % ( job_wrapper.job_id, filename ) )
        return filename
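        # The stored template is plain JSON, e.g. (illustrative paths):
        #   {"jobName": "g123_cat1", "remoteCommand": "/.../galaxy_123.sh",
        #    "outputPath": ":/.../galaxy_123.o", "errorPath": ":/.../galaxy_123.e",
        #    "nativeSpecification": "-q all.q"}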

    def external_runjob( self, jobtemplate_filename, username ):
        """Runs an external script that will submit a new job via the DRM.
        The external script will be run with sudo, and will setuid() to the specified user.
        Effectively, this submits the job as a different user (than the one running Galaxy).
        """
        script_parts = self.external_runJob_script.split()
        script = script_parts[0]
        command = [ '/usr/bin/sudo', '-E', script ]
        command.extend( script_parts[1:] )
        command.extend( [ str( username ), jobtemplate_filename ] )
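        # at this point command is, e.g. (illustrative values):
        #   [ '/usr/bin/sudo', '-E', '/path/to/external_runjob.py', '1042', '/.../123.jt_json' ]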
  314. log.info("Running command %s" % command)
  315. p = subprocess.Popen(command,
  316. shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  317. (stdoutdata, stderrdata) = p.communicate()
  318. exitcode = p.returncode
  319. #os.unlink(jobtemplate_filename)
  320. if exitcode != 0:
  321. # There was an error in the child process
  322. raise RuntimeError("External_runjob failed (exit code %s)\nChild process reported error:\n%s" % (str(exitcode), stderrdata))
  323. if not stdoutdata.strip():
  324. raise RuntimeError("External_runjob did return the job id: %s" % (stdoutdata))
  325. # The expected output is a single line containing a single numeric value:
  326. # the DRMAA job-ID. If not the case, will throw an error.
  327. jobId = stdoutdata
  328. return jobId