/lib/galaxy/jobs/runners/sge.py

https://bitbucket.org/cistrome/cistrome-harvard/

import os, logging, threading, time
from Queue import Queue, Empty

from galaxy import model
from galaxy.jobs.runners import BaseJobRunner

from paste.deploy.converters import asbool

import pkg_resources

egg_message = """
The 'sge' runner depends on 'DRMAA_python' which is not installed. Galaxy's
"scramble" system should make this installation simple, please follow the
instructions found at:
http://wiki.g2.bx.psu.edu/Admin/Config/Performance/Cluster
Additional errors may follow:
%s
"""

try:
    pkg_resources.require( "DRMAA_python" )
    import DRMAA
except Exception, e:
    raise Exception( egg_message % str( e ) )

log = logging.getLogger( __name__ )

__all__ = [ 'SGEJobRunner' ]
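# Map DRMAA job-status constants to human-readable descriptions; the monitor
# thread uses these strings when logging state changes for watched jobs.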
DRMAA_state = {
    DRMAA.Session.UNDETERMINED: 'process status cannot be determined',
    DRMAA.Session.QUEUED_ACTIVE: 'job is queued and waiting to be scheduled',
    DRMAA.Session.SYSTEM_ON_HOLD: 'job is queued and in system hold',
    DRMAA.Session.USER_ON_HOLD: 'job is queued and in user hold',
    DRMAA.Session.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
    DRMAA.Session.RUNNING: 'job is running',
    DRMAA.Session.SYSTEM_SUSPENDED: 'job is system suspended',
    DRMAA.Session.USER_SUSPENDED: 'job is user suspended',
    DRMAA.Session.DONE: 'job finished normally',
    DRMAA.Session.FAILED: 'job finished, but failed',
}
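# Shell script template for submitted jobs. The three %s placeholders are
# filled in by queue_job() below with the Galaxy lib directory, the job's
# working directory, and the tool's command line, in that order.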
sge_template = """#!/bin/sh
#$ -S /bin/sh
GALAXY_LIB="%s"
if [ "$GALAXY_LIB" != "None" ]; then
    if [ -n "$PYTHONPATH" ]; then
        PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
    else
        PYTHONPATH="$GALAXY_LIB"
    fi
    export PYTHONPATH
fi
cd %s
%s
"""
class SGEJobState( object ):
    def __init__( self ):
        """
        Encapsulates state related to a job that is being run via SGE and
        that we need to monitor.
        """
        self.job_wrapper = None
        self.job_id = None
        self.old_state = None
        self.running = False
        self.job_file = None
        self.ofile = None
        self.efile = None
        self.runner_url = None

class SGEJobRunner( BaseJobRunner ):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling
    """
    STOP_SIGNAL = object()

    def __init__( self, app ):
        """Initialize this job runner and start the monitor thread"""
        self.app = app
        self.sa_session = app.model.context
        # 'watched' and 'queue' are both used to keep track of jobs to watch.
        # 'queue' is used to add new watched jobs, and can be called from
        # any thread (usually by the 'queue_job' method). 'watched' must only
        # be modified by the monitor thread, which will move items from 'queue'
        # to 'watched' and then manage the watched jobs.
        self.watched = []
        self.monitor_queue = Queue()
        self.default_cell = self.determine_sge_cell( self.app.config.default_cluster_job_runner )
        self.ds = DRMAA.Session()
        self.ds.init( self.default_cell )
        self.monitor_thread = threading.Thread( target=self.monitor )
        self.monitor_thread.start()
        self.work_queue = Queue()
        self.work_threads = []
        nworkers = app.config.cluster_job_queue_workers
        for i in range( nworkers ):
            worker = threading.Thread( target=self.run_next )
            worker.start()
            self.work_threads.append( worker )
        log.debug( "%d workers ready" % nworkers )

    def determine_sge_cell( self, url ):
        """Determine what SGE cell we are using"""
        url_split = url.split("/")
        if url_split[0] == 'sge:':
            return url_split[2]
        # this could happen if sge is started, but is not the default runner
        else:
            return ''

    def determine_sge_queue( self, url ):
        """Determine what SGE queue we are submitting to"""
        try:
            return url.split('/')[3] or None
        except:
            return None

    def determine_sge_project( self, url ):
        """Determine what SGE project we are submitting to"""
        try:
            return url.split('/')[4] or None
        except:
            return None

    def determine_sge_tool_parameters( self, url ):
        """Determine the tool's specific parameters"""
        try:
            return url.split('/')[5] or None
        except:
            return None
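    # The determine_sge_* helpers above all parse the tool's job runner URL.
    # Illustrative example (values are hypothetical, not from this source):
    # a runner URL of the form 'sge://cell/queue/project/native_options'
    # yields the SGE cell, queue, project, and extra submission parameters by
    # splitting on '/'; trailing components may be omitted or left empty, in
    # which case the corresponding helper returns None.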

    def run_next( self ):
        """
        Run the next item in the queue (a job waiting to run or finish)
        """
        while 1:
            ( op, obj ) = self.work_queue.get()
            if op is self.STOP_SIGNAL:
                return
            try:
                if op == 'queue':
                    self.queue_job( obj )
                elif op == 'finish':
                    self.finish_job( obj )
                elif op == 'fail':
                    self.fail_job( obj )
            except:
                log.exception( "Uncaught exception %sing job" % op )

    def queue_job( self, job_wrapper ):
        """Create SGE script for a job and submit it to the SGE queue"""
        try:
            job_wrapper.prepare()
            command_line = self.build_command_line( job_wrapper, include_metadata = True )
        except:
            job_wrapper.fail( "failure preparing job", exception=True )
            log.exception("failure running job %d" % job_wrapper.job_id)
            return
        runner_url = job_wrapper.tool.job_runner
        # This is silly, why would we queue a job with no command line?
        if not command_line:
            job_wrapper.finish( '', '' )
            return
        # Check for deletion before we change state
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the SGE queue" % job_wrapper.job_id )
            job_wrapper.cleanup()
            return
        # Change to queued state immediately
        job_wrapper.change_state( model.Job.states.QUEUED )
        if self.determine_sge_cell( runner_url ) != self.default_cell:
            # TODO: support multiple cells
            log.warning( "(%s) Using multiple SGE cells is not supported. This job will be submitted to the default cell." % job_wrapper.job_id )
        sge_queue_name = self.determine_sge_queue( runner_url )
        sge_project_name = self.determine_sge_project( runner_url )
        sge_extra_params = self.determine_sge_tool_parameters( runner_url )
        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        jt = self.ds.createJobTemplate()
        jt.remoteCommand = "%s/database/pbs/galaxy_%s.sh" % (os.getcwd(), job_wrapper.job_id)
        jt.outputPath = ":%s" % ofile
        jt.errorPath = ":%s" % efile
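        # The leading ':' on the output/error paths follows the DRMAA path
        # convention (the optional hostname before the ':' is left empty).
        # The native specification assembled below is passed through to SGE as
        # raw submission options: '-q' for the queue, '-P' for the project,
        # plus any extra parameters taken from the runner URL.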
        nativeSpec = []
        if sge_queue_name is not None:
            nativeSpec.append( "-q '%s'" % sge_queue_name )
        if sge_project_name is not None:
            nativeSpec.append( "-P '%s'" % sge_project_name )
        if sge_extra_params is not None:
            nativeSpec.append( sge_extra_params )
        if len(nativeSpec) > 0:
            jt.nativeSpecification = ' '.join(nativeSpec)

        script = sge_template % (job_wrapper.galaxy_lib_dir, os.path.abspath( job_wrapper.working_directory ), command_line)
        fh = file( jt.remoteCommand, "w" )
        fh.write( script )
        fh.close()
        os.chmod( jt.remoteCommand, 0750 )

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the SGE queue" % job_wrapper.job_id )
            self.cleanup( ( ofile, efile, jt.remoteCommand ) )
            job_wrapper.cleanup()
            return

        galaxy_job_id = job_wrapper.job_id
        log.debug("(%s) submitting file %s" % ( galaxy_job_id, jt.remoteCommand ) )
        log.debug("(%s) command is: %s" % ( galaxy_job_id, command_line ) )
        # runJob will raise if there's a submit problem
        job_id = self.ds.runJob(jt)
        if sge_queue_name is None:
            log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
        else:
            log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, sge_queue_name, job_id) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_runner( runner_url, job_id )

        # Store SGE related state information for job
        sge_job_state = SGEJobState()
        sge_job_state.job_wrapper = job_wrapper
        sge_job_state.job_id = job_id
        sge_job_state.ofile = ofile
        sge_job_state.efile = efile
        sge_job_state.job_file = jt.remoteCommand
        sge_job_state.old_state = 'new'
        sge_job_state.running = False
        sge_job_state.runner_url = runner_url

        # delete the job template
        self.ds.deleteJobTemplate( jt )

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( sge_job_state )

    def monitor( self ):
        """
        Watches jobs currently in the SGE queue and deals with state changes
        (queued to running) and job completion
        """
        while 1:
            # Take any new watched jobs and put them on the monitor list
            try:
                while 1:
                    sge_job_state = self.monitor_queue.get_nowait()
                    if sge_job_state is self.STOP_SIGNAL:
                        # TODO: This is where any cleanup would occur
                        self.ds.exit()
                        return
                    self.watched.append( sge_job_state )
            except Empty:
                pass
            # Iterate over the list of watched jobs and check state
            self.check_watched_items()
            # Sleep a bit before the next state check
            time.sleep( 1 )

    def check_watched_items( self ):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
        new_watched = []
        for sge_job_state in self.watched:
            job_id = sge_job_state.job_id
            galaxy_job_id = sge_job_state.job_wrapper.job_id
            old_state = sge_job_state.old_state
            try:
                state = self.ds.getJobProgramStatus( job_id )
            except DRMAA.InvalidJobError:
                # we should only get here if an orphaned job was put into the queue at app startup
                log.debug("(%s/%s) job left SGE queue" % ( galaxy_job_id, job_id ) )
                self.work_queue.put( ( 'finish', sge_job_state ) )
                continue
            except Exception, e:
                # so we don't kill the monitor thread
                log.exception("(%s/%s) Unable to check job status" % ( galaxy_job_id, job_id ) )
                log.warning("(%s/%s) job will now be errored" % ( galaxy_job_id, job_id ) )
                sge_job_state.fail_message = "Cluster could not complete job"
                self.work_queue.put( ( 'fail', sge_job_state ) )
                continue
            if state != old_state:
                log.debug("(%s/%s) state change: %s" % ( galaxy_job_id, job_id, DRMAA_state[state] ) )
            if state == DRMAA.Session.RUNNING and not sge_job_state.running:
                sge_job_state.running = True
                sge_job_state.job_wrapper.change_state( model.Job.states.RUNNING )
            if state in ( DRMAA.Session.DONE, DRMAA.Session.FAILED ):
                self.work_queue.put( ( 'finish', sge_job_state ) )
                continue
            sge_job_state.old_state = state
            new_watched.append( sge_job_state )
        # Replace the watch list with the updated version
        self.watched = new_watched

    def finish_job( self, sge_job_state ):
        """
        Get the output/error for a finished job, pass to `job_wrapper.finish`
        and cleanup all the SGE temporary files.
        """
        ofile = sge_job_state.ofile
        efile = sge_job_state.efile
        job_file = sge_job_state.job_file
        # collect the output
        try:
            ofh = file(ofile, "r")
            efh = file(efile, "r")
            stdout = ofh.read( 32768 )
            stderr = efh.read( 32768 )
        except:
            stdout = ''
            stderr = 'Job output not returned from cluster'
            log.debug(stderr)
        try:
            sge_job_state.job_wrapper.finish( stdout, stderr )
        except:
            log.exception("Job wrapper finish method failed")
        # clean up the sge files
        self.cleanup( ( ofile, efile, job_file ) )

    def fail_job( self, sge_job_state ):
        """
        Separated out so we can use the worker threads for it.
        """
        self.stop_job( self.sa_session.query( self.app.model.Job ).get( sge_job_state.job_wrapper.job_id ) )
        sge_job_state.job_wrapper.fail( sge_job_state.fail_message )
        self.cleanup( ( sge_job_state.ofile, sge_job_state.efile, sge_job_state.job_file ) )

    def cleanup( self, files ):
        # Job files are only removed when Galaxy's 'debug' option is off, so
        # they remain on disk for troubleshooting when debugging is enabled.
        if not asbool( self.app.config.get( 'debug', False ) ):
            for file in files:
                if os.access( file, os.R_OK ):
                    os.unlink( file )

    def put( self, job_wrapper ):
        """Add a job to the queue (by job identifier)"""
        # Change to queued state before handing to worker thread so the runner won't pick it up again
        job_wrapper.change_state( model.Job.states.QUEUED )
        self.work_queue.put( ( 'queue', job_wrapper ) )

    def shutdown( self ):
        """Attempts to gracefully shut down the monitor and worker threads"""
        log.info( "sending stop signal to worker threads" )
        self.monitor_queue.put( self.STOP_SIGNAL )
        for i in range( len( self.work_threads ) ):
            self.work_queue.put( ( self.STOP_SIGNAL, None ) )
        log.info( "sge job runner stopped" )

    def stop_job( self, job ):
        """Attempts to delete a job from the SGE queue"""
        try:
            self.ds.control( job.job_runner_external_id, DRMAA.Session.TERMINATE )
            log.debug( "(%s/%s) Removed from SGE queue at user's request" % ( job.id, job.job_runner_external_id ) )
        except DRMAA.InvalidJobError:
            log.debug( "(%s/%s) User killed running job, but it was already dead" % ( job.id, job.job_runner_external_id ) )

    def recover( self, job, job_wrapper ):
        """Recovers jobs stuck in the queued/running state when Galaxy started"""
        sge_job_state = SGEJobState()
        sge_job_state.ofile = "%s/database/pbs/%s.o" % (os.getcwd(), job.id)
        sge_job_state.efile = "%s/database/pbs/%s.e" % (os.getcwd(), job.id)
        sge_job_state.job_file = "%s/database/pbs/galaxy_%s.sh" % (os.getcwd(), job.id)
        sge_job_state.job_id = str( job.job_runner_external_id )
        sge_job_state.runner_url = job_wrapper.tool.job_runner
        job_wrapper.command_line = job.command_line
        sge_job_state.job_wrapper = job_wrapper
        if job.state == model.Job.states.RUNNING:
            log.debug( "(%s/%s) is still in running state, adding to the SGE queue" % ( job.id, job.job_runner_external_id ) )
            sge_job_state.old_state = DRMAA.Session.RUNNING
            sge_job_state.running = True
            self.monitor_queue.put( sge_job_state )
        elif job.state == model.Job.states.QUEUED:
            log.debug( "(%s/%s) is still in SGE queued state, adding to the SGE queue" % ( job.id, job.job_runner_external_id ) )
            sge_job_state.old_state = DRMAA.Session.QUEUED_ACTIVE
            sge_job_state.running = False
            self.monitor_queue.put( sge_job_state )