
/lib/galaxy/jobs/runners/sge.py

https://bitbucket.org/cistrome/cistrome-harvard/
import os, logging, threading, time
from Queue import Queue, Empty

from galaxy import model
from galaxy.jobs.runners import BaseJobRunner

from paste.deploy.converters import asbool

import pkg_resources

egg_message = """

The 'sge' runner depends on 'DRMAA_python' which is not installed.  Galaxy's
"scramble" system should make this installation simple; please follow the
instructions found at:

  http://wiki.g2.bx.psu.edu/Admin/Config/Performance/Cluster

Additional errors may follow:
%s
"""


try:
    pkg_resources.require( "DRMAA_python" )
    import DRMAA
except Exception, e:
    raise Exception( egg_message % str( e ) )


log = logging.getLogger( __name__ )

__all__ = [ 'SGEJobRunner' ]

DRMAA_state = {
    DRMAA.Session.UNDETERMINED: 'process status cannot be determined',
    DRMAA.Session.QUEUED_ACTIVE: 'job is queued and waiting to be scheduled',
    DRMAA.Session.SYSTEM_ON_HOLD: 'job is queued and in system hold',
    DRMAA.Session.USER_ON_HOLD: 'job is queued and in user hold',
    DRMAA.Session.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
    DRMAA.Session.RUNNING: 'job is running',
    DRMAA.Session.SYSTEM_SUSPENDED: 'job is system suspended',
    DRMAA.Session.USER_SUSPENDED: 'job is user suspended',
    DRMAA.Session.DONE: 'job finished normally',
    DRMAA.Session.FAILED: 'job finished, but failed',
}

sge_template = """#!/bin/sh
#$ -S /bin/sh
GALAXY_LIB="%s"
if [ "$GALAXY_LIB" != "None" ]; then
    if [ -n "$PYTHONPATH" ]; then
        PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
    else
        PYTHONPATH="$GALAXY_LIB"
    fi
    export PYTHONPATH
fi
cd %s
%s
"""
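# For orientation, a rendered submission script might look like the following
# sketch (purely hypothetical values for the three %s placeholders, which are
# filled in queue_job() with galaxy_lib_dir, the job's working directory, and
# the tool command line):
#
#   #!/bin/sh
#   #$ -S /bin/sh
#   GALAXY_LIB="/opt/galaxy/lib"
#   ...
#   cd /opt/galaxy/database/job_working_directory/000/42
#   python some_tool_wrapper.py input.dat output.dat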

class SGEJobState( object ):
    def __init__( self ):
        """
        Encapsulates state related to a job that is being run via SGE and
        that we need to monitor.
        """
        self.job_wrapper = None
        self.job_id = None
        self.old_state = None
        self.running = False
        self.job_file = None
        self.ofile = None
        self.efile = None
        self.runner_url = None

class SGEJobRunner( BaseJobRunner ):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling
    """
    STOP_SIGNAL = object()
    def __init__( self, app ):
        """Initialize this job runner and start the monitor thread"""
        self.app = app
        self.sa_session = app.model.context
        # 'watched' and 'queue' are both used to keep track of jobs to watch.
        # 'queue' is used to add new watched jobs, and can be called from
        # any thread (usually by the 'queue_job' method). 'watched' must only
        # be modified by the monitor thread, which will move items from 'queue'
        # to 'watched' and then manage the watched jobs.
        self.watched = []
        self.monitor_queue = Queue()
        self.default_cell = self.determine_sge_cell( self.app.config.default_cluster_job_runner )
        self.ds = DRMAA.Session()
        self.ds.init( self.default_cell )
        self.monitor_thread = threading.Thread( target=self.monitor )
        self.monitor_thread.start()
        self.work_queue = Queue()
        self.work_threads = []
        nworkers = app.config.cluster_job_queue_workers
        for i in range( nworkers ):
            worker = threading.Thread( target=self.run_next )
            worker.start()
            self.work_threads.append( worker )
        log.debug( "%d workers ready" % nworkers )

    def determine_sge_cell( self, url ):
        """Determine what SGE cell we are using"""
        url_split = url.split("/")
        if url_split[0] == 'sge:':
            return url_split[2]
        # this could happen if sge is started, but is not the default runner
        else:
            return ''

    def determine_sge_queue( self, url ):
        """Determine what SGE queue we are submitting to"""
        try:
            return url.split('/')[3] or None
        except:
            return None

    def determine_sge_project( self, url ):
        """Determine what SGE project we are submitting to"""
        try:
            return url.split('/')[4] or None
        except:
            return None

    def determine_sge_tool_parameters( self, url ):
        """Determine the tool's specific parameters"""
        try:
            return url.split('/')[5] or None
        except:
            return None

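    # The determine_sge_* helpers above all slice the same runner URL. As a
    # purely illustrative example (the real URL comes from the tool or the
    # Galaxy config, e.g. default_cluster_job_runner), a URL of the assumed
    # form
    #
    #     sge://cell/queue/project/native_options
    #
    # such as "sge://default/all.q/galaxy_project/-l h_vmem=8G" would yield
    # cell "default", queue "all.q", project "galaxy_project" and extra
    # parameters "-l h_vmem=8G"; missing or empty trailing components
    # (queue, project, parameters) come back as None.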
    def run_next( self ):
        """
        Run the next item in the queue (a job waiting to run or finish)
        """
        while 1:
            ( op, obj ) = self.work_queue.get()
            if op is self.STOP_SIGNAL:
                return
            try:
                if op == 'queue':
                    self.queue_job( obj )
                elif op == 'finish':
                    self.finish_job( obj )
                elif op == 'fail':
                    self.fail_job( obj )
            except:
                log.exception( "Uncaught exception %sing job" % op )

    def queue_job( self, job_wrapper ):
        """Create SGE script for a job and submit it to the SGE queue"""

        try:
            job_wrapper.prepare()
            command_line = self.build_command_line( job_wrapper, include_metadata = True )
        except:
            job_wrapper.fail( "failure preparing job", exception=True )
            log.exception("failure running job %d" % job_wrapper.job_id)
            return

        runner_url = job_wrapper.tool.job_runner

        # This is silly, why would we queue a job with no command line?
        if not command_line:
            job_wrapper.finish( '', '' )
            return

        # Check for deletion before we change state
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the SGE queue" % job_wrapper.job_id )
            job_wrapper.cleanup()
            return

        # Change to queued state immediately
        job_wrapper.change_state( model.Job.states.QUEUED )

        if self.determine_sge_cell( runner_url ) != self.default_cell:
            # TODO: support multiple cells
            log.warning( "(%s) Using multiple SGE cells is not supported.  This job will be submitted to the default cell." % job_wrapper.job_id )
        sge_queue_name = self.determine_sge_queue( runner_url )
        sge_project_name = self.determine_sge_project( runner_url )
        sge_extra_params = self.determine_sge_tool_parameters( runner_url )

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        jt = self.ds.createJobTemplate()
        jt.remoteCommand = "%s/database/pbs/galaxy_%s.sh" % (os.getcwd(), job_wrapper.job_id)
        jt.outputPath = ":%s" % ofile
        jt.errorPath = ":%s" % efile
        nativeSpec = []
        if sge_queue_name is not None:
            nativeSpec.append( "-q '%s'" % sge_queue_name )
        if sge_project_name is not None:
            nativeSpec.append( "-P '%s'" % sge_project_name)
        if sge_extra_params is not None:
            nativeSpec.append( sge_extra_params )
        if len(nativeSpec)>0:
            jt.nativeSpecification = ' '.join(nativeSpec)
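        # For the illustrative URL sketched above, the assembled native
        # specification would read: -q 'all.q' -P 'galaxy_project' -l h_vmem=8G
        # (hypothetical values; whatever the URL supplies is handed to SGE
        # verbatim).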

        script = sge_template % (job_wrapper.galaxy_lib_dir, os.path.abspath( job_wrapper.working_directory ), command_line)

        fh = file( jt.remoteCommand, "w" )
        fh.write( script )
        fh.close()
        os.chmod( jt.remoteCommand, 0750 )

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the SGE queue" % job_wrapper.job_id )
            self.cleanup( ( ofile, efile, jt.remoteCommand ) )
            job_wrapper.cleanup()
            return

        galaxy_job_id = job_wrapper.job_id
        log.debug("(%s) submitting file %s" % ( galaxy_job_id, jt.remoteCommand ) )
        log.debug("(%s) command is: %s" % ( galaxy_job_id, command_line ) )
        # runJob will raise if there's a submit problem
        job_id = self.ds.runJob(jt)
        if sge_queue_name is None:
            log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
        else:
            log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, sge_queue_name, job_id) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_runner( runner_url, job_id )

        # Store SGE related state information for job
        sge_job_state = SGEJobState()
        sge_job_state.job_wrapper = job_wrapper
        sge_job_state.job_id = job_id
        sge_job_state.ofile = ofile
        sge_job_state.efile = efile
        sge_job_state.job_file = jt.remoteCommand
        sge_job_state.old_state = 'new'
        sge_job_state.running = False
        sge_job_state.runner_url = runner_url

        # delete the job template
        self.ds.deleteJobTemplate( jt )

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( sge_job_state )

    def monitor( self ):
        """
        Watches jobs currently in the SGE queue and deals with state changes
        (queued to running) and job completion
        """
        while 1:
            # Take any new watched jobs and put them on the monitor list
            try:
                while 1:
                    sge_job_state = self.monitor_queue.get_nowait()
                    if sge_job_state is self.STOP_SIGNAL:
                        # TODO: This is where any cleanup would occur
                        self.ds.exit()
                        return
                    self.watched.append( sge_job_state )
            except Empty:
                pass
            # Iterate over the list of watched jobs and check state
            self.check_watched_items()
            # Sleep a bit before the next state check
            time.sleep( 1 )

    def check_watched_items( self ):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
        new_watched = []
        for sge_job_state in self.watched:
            job_id = sge_job_state.job_id
            galaxy_job_id = sge_job_state.job_wrapper.job_id
            old_state = sge_job_state.old_state
            try:
                state = self.ds.getJobProgramStatus( job_id )
            except DRMAA.InvalidJobError:
                # we should only get here if an orphaned job was put into the queue at app startup
                log.debug("(%s/%s) job left SGE queue" % ( galaxy_job_id, job_id ) )
                self.work_queue.put( ( 'finish', sge_job_state ) )
                continue
            except Exception, e:
                # so we don't kill the monitor thread
                log.exception("(%s/%s) Unable to check job status" % ( galaxy_job_id, job_id ) )
                log.warning("(%s/%s) job will now be errored" % ( galaxy_job_id, job_id ) )
                sge_job_state.fail_message = "Cluster could not complete job"
                self.work_queue.put( ( 'fail', sge_job_state ) )
                continue
            if state != old_state:
                log.debug("(%s/%s) state change: %s" % ( galaxy_job_id, job_id, DRMAA_state[state] ) )
            if state == DRMAA.Session.RUNNING and not sge_job_state.running:
                sge_job_state.running = True
                sge_job_state.job_wrapper.change_state( model.Job.states.RUNNING )
            if state in ( DRMAA.Session.DONE, DRMAA.Session.FAILED ):
                self.work_queue.put( ( 'finish', sge_job_state ) )
                continue
            sge_job_state.old_state = state
            new_watched.append( sge_job_state )
        # Replace the watch list with the updated version
        self.watched = new_watched

    def finish_job( self, sge_job_state ):
        """
        Get the output/error for a finished job, pass to `job_wrapper.finish`
        and cleanup all the SGE temporary files.
        """
        ofile = sge_job_state.ofile
        efile = sge_job_state.efile
        job_file = sge_job_state.job_file
        # collect the output
        try:
            ofh = file(ofile, "r")
            efh = file(efile, "r")
            stdout = ofh.read( 32768 )
            stderr = efh.read( 32768 )
        except:
            stdout = ''
            stderr = 'Job output not returned from cluster'
            log.debug(stderr)

        try:
            sge_job_state.job_wrapper.finish( stdout, stderr )
        except:
            log.exception("Job wrapper finish method failed")

        # clean up the sge files
        self.cleanup( ( ofile, efile, job_file ) )

    def fail_job( self, sge_job_state ):
        """
        Separated out so we can use the worker threads for it.
        """
        self.stop_job( self.sa_session.query( self.app.model.Job ).get( sge_job_state.job_wrapper.job_id ) )
        sge_job_state.job_wrapper.fail( sge_job_state.fail_message )
        self.cleanup( ( sge_job_state.ofile, sge_job_state.efile, sge_job_state.job_file ) )

    def cleanup( self, files ):
        if not asbool( self.app.config.get( 'debug', False ) ):
            for file in files:
                if os.access( file, os.R_OK ):
                    os.unlink( file )

    def put( self, job_wrapper ):
        """Add a job to the queue (by job identifier)"""
        # Change to queued state before handing to worker thread so the runner won't pick it up again
        job_wrapper.change_state( model.Job.states.QUEUED )
        self.work_queue.put( ( 'queue', job_wrapper ) )

    def shutdown( self ):
        """Attempts to gracefully shut down the monitor and worker threads"""
        log.info( "sending stop signal to worker threads" )
        self.monitor_queue.put( self.STOP_SIGNAL )
        for i in range( len( self.work_threads ) ):
            self.work_queue.put( ( self.STOP_SIGNAL, None ) )
        log.info( "sge job runner stopped" )

    def stop_job( self, job ):
        """Attempts to delete a job from the SGE queue"""
        try:
            self.ds.control( job.job_runner_external_id, DRMAA.Session.TERMINATE )
            log.debug( "(%s/%s) Removed from SGE queue at user's request" % ( job.id, job.job_runner_external_id ) )
        except DRMAA.InvalidJobError:
            log.debug( "(%s/%s) User killed running job, but it was already dead" % ( job.id, job.job_runner_external_id ) )

    def recover( self, job, job_wrapper ):
        """Recovers jobs stuck in the queued/running state when Galaxy started"""
        sge_job_state = SGEJobState()
        sge_job_state.ofile = "%s/database/pbs/%s.o" % (os.getcwd(), job.id)
        sge_job_state.efile = "%s/database/pbs/%s.e" % (os.getcwd(), job.id)
        sge_job_state.job_file = "%s/database/pbs/galaxy_%s.sh" % (os.getcwd(), job.id)
        sge_job_state.job_id = str( job.job_runner_external_id )
        sge_job_state.runner_url = job_wrapper.tool.job_runner
        job_wrapper.command_line = job.command_line
        sge_job_state.job_wrapper = job_wrapper
        if job.state == model.Job.states.RUNNING:
            log.debug( "(%s/%s) is still in running state, adding to the SGE queue" % ( job.id, job.job_runner_external_id ) )
            sge_job_state.old_state = DRMAA.Session.RUNNING
            sge_job_state.running = True
            self.monitor_queue.put( sge_job_state )
        elif job.state == model.Job.states.QUEUED:
            log.debug( "(%s/%s) is still in SGE queued state, adding to the SGE queue" % ( job.id, job.job_runner_external_id ) )
            sge_job_state.old_state = DRMAA.Session.QUEUED_ACTIVE
            sge_job_state.running = False
            self.monitor_queue.put( sge_job_state )
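
# Configuration sketch (an assumption for illustration, not part of this
# module): the app.config attributes read above would typically be set in
# Galaxy's ini-style configuration, for example:
#
#   default_cluster_job_runner = sge:///
#   cluster_files_directory = database/pbs
#   cluster_job_queue_workers = 5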