/lib/galaxy/jobs/runners/sge.py

https://bitbucket.org/cistrome/cistrome-harvard/

import os, logging, threading, time
from Queue import Queue, Empty

from galaxy import model
from galaxy.jobs.runners import BaseJobRunner

from paste.deploy.converters import asbool

import pkg_resources

egg_message = """
The 'sge' runner depends on 'DRMAA_python' which is not installed. Galaxy's
"scramble" system should make this installation simple, please follow the
instructions found at:
http://wiki.g2.bx.psu.edu/Admin/Config/Performance/Cluster
Additional errors may follow:
%s
"""

try:
    pkg_resources.require( "DRMAA_python" )
    import DRMAA
except Exception, e:
    raise Exception( egg_message % str( e ) )

log = logging.getLogger( __name__ )

__all__ = [ 'SGEJobRunner' ]
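# Map DRMAA job-status constants to human-readable descriptions; the monitor
# thread uses these strings when logging state changes for watched jobs.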
DRMAA_state = {
    DRMAA.Session.UNDETERMINED: 'process status cannot be determined',
    DRMAA.Session.QUEUED_ACTIVE: 'job is queued and waiting to be scheduled',
    DRMAA.Session.SYSTEM_ON_HOLD: 'job is queued and in system hold',
    DRMAA.Session.USER_ON_HOLD: 'job is queued and in user hold',
    DRMAA.Session.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
    DRMAA.Session.RUNNING: 'job is running',
    DRMAA.Session.SYSTEM_SUSPENDED: 'job is system suspended',
    DRMAA.Session.USER_SUSPENDED: 'job is user suspended',
    DRMAA.Session.DONE: 'job finished normally',
    DRMAA.Session.FAILED: 'job finished, but failed',
}
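# Shell script template for submitted jobs. The three %s placeholders are
# filled in by queue_job() below with the Galaxy lib directory, the job's
# working directory, and the tool's command line, in that order.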
sge_template = """#!/bin/sh
#$ -S /bin/sh
GALAXY_LIB="%s"
if [ "$GALAXY_LIB" != "None" ]; then
    if [ -n "$PYTHONPATH" ]; then
        PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
    else
        PYTHONPATH="$GALAXY_LIB"
    fi
    export PYTHONPATH
fi
cd %s
%s
"""
class SGEJobState( object ):
    def __init__( self ):
        """
        Encapsulates state related to a job that is being run via SGE and
        that we need to monitor.
        """
        self.job_wrapper = None
        self.job_id = None
        self.old_state = None
        self.running = False
        self.job_file = None
        self.ofile = None
        self.efile = None
        self.runner_url = None

class SGEJobRunner( BaseJobRunner ):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling
    """
    STOP_SIGNAL = object()

    def __init__( self, app ):
        """Initialize this job runner and start the monitor thread"""
        self.app = app
        self.sa_session = app.model.context
        # 'watched' and 'queue' are both used to keep track of jobs to watch.
        # 'queue' is used to add new watched jobs, and can be called from
        # any thread (usually by the 'queue_job' method). 'watched' must only
        # be modified by the monitor thread, which will move items from 'queue'
        # to 'watched' and then manage the watched jobs.
        self.watched = []
        self.monitor_queue = Queue()
        self.default_cell = self.determine_sge_cell( self.app.config.default_cluster_job_runner )
        self.ds = DRMAA.Session()
        self.ds.init( self.default_cell )
        self.monitor_thread = threading.Thread( target=self.monitor )
        self.monitor_thread.start()
        self.work_queue = Queue()
        self.work_threads = []
        nworkers = app.config.cluster_job_queue_workers
        for i in range( nworkers ):
            worker = threading.Thread( target=self.run_next )
            worker.start()
            self.work_threads.append( worker )
        log.debug( "%d workers ready" % nworkers )

    def determine_sge_cell( self, url ):
        """Determine what SGE cell we are using"""
        url_split = url.split("/")
        if url_split[0] == 'sge:':
            return url_split[2]
        # this could happen if sge is started, but is not the default runner
        else:
            return ''

    def determine_sge_queue( self, url ):
        """Determine what SGE queue we are submitting to"""
        try:
            return url.split('/')[3] or None
        except:
            return None

    def determine_sge_project( self, url ):
        """Determine what SGE project we are submitting to"""
        try:
            return url.split('/')[4] or None
        except:
            return None

    def determine_sge_tool_parameters( self, url ):
        """Determine the tool's specific parameters"""
        try:
            return url.split('/')[5] or None
        except:
            return None
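    # The determine_sge_* helpers above all parse the tool's job runner URL.
    # Illustrative example (values are hypothetical, not from this source):
    # a runner URL of the form 'sge://cell/queue/project/native_options'
    # yields the SGE cell, queue, project, and extra submission parameters by
    # splitting on '/'; trailing components may be omitted or left empty, in
    # which case the corresponding helper returns None.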

    def run_next( self ):
        """
        Run the next item in the queue (a job waiting to run or finish)
        """
        while 1:
            ( op, obj ) = self.work_queue.get()
            if op is self.STOP_SIGNAL:
                return
            try:
                if op == 'queue':
                    self.queue_job( obj )
                elif op == 'finish':
                    self.finish_job( obj )
                elif op == 'fail':
                    self.fail_job( obj )
            except:
                log.exception( "Uncaught exception %sing job" % op )

    def queue_job( self, job_wrapper ):
        """Create SGE script for a job and submit it to the SGE queue"""
        try:
            job_wrapper.prepare()
            command_line = self.build_command_line( job_wrapper, include_metadata = True )
        except:
            job_wrapper.fail( "failure preparing job", exception=True )
            log.exception("failure running job %d" % job_wrapper.job_id)
            return
        runner_url = job_wrapper.tool.job_runner
        # This is silly, why would we queue a job with no command line?
        if not command_line:
            job_wrapper.finish( '', '' )
            return
        # Check for deletion before we change state
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the SGE queue" % job_wrapper.job_id )
            job_wrapper.cleanup()
            return
        # Change to queued state immediately
        job_wrapper.change_state( model.Job.states.QUEUED )
        if self.determine_sge_cell( runner_url ) != self.default_cell:
            # TODO: support multiple cells
            log.warning( "(%s) Using multiple SGE cells is not supported. This job will be submitted to the default cell." % job_wrapper.job_id )
        sge_queue_name = self.determine_sge_queue( runner_url )
        sge_project_name = self.determine_sge_project( runner_url )
        sge_extra_params = self.determine_sge_tool_parameters( runner_url )
        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        jt = self.ds.createJobTemplate()
        jt.remoteCommand = "%s/database/pbs/galaxy_%s.sh" % (os.getcwd(), job_wrapper.job_id)
        jt.outputPath = ":%s" % ofile
        jt.errorPath = ":%s" % efile
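        # The leading ':' on the output/error paths follows the DRMAA path
        # convention (the optional hostname before the ':' is left empty).
        # The native specification assembled below is passed through to SGE as
        # raw submission options: '-q' for the queue, '-P' for the project,
        # plus any extra parameters taken from the runner URL.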
        nativeSpec = []
        if sge_queue_name is not None:
            nativeSpec.append( "-q '%s'" % sge_queue_name )
        if sge_project_name is not None:
            nativeSpec.append( "-P '%s'" % sge_project_name )
        if sge_extra_params is not None:
            nativeSpec.append( sge_extra_params )
        if len(nativeSpec) > 0:
            jt.nativeSpecification = ' '.join(nativeSpec)

        script = sge_template % (job_wrapper.galaxy_lib_dir, os.path.abspath( job_wrapper.working_directory ), command_line)
        fh = file( jt.remoteCommand, "w" )
        fh.write( script )
        fh.close()
        os.chmod( jt.remoteCommand, 0750 )

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the SGE queue" % job_wrapper.job_id )
            self.cleanup( ( ofile, efile, jt.remoteCommand ) )
            job_wrapper.cleanup()
            return

        galaxy_job_id = job_wrapper.job_id
        log.debug("(%s) submitting file %s" % ( galaxy_job_id, jt.remoteCommand ) )
        log.debug("(%s) command is: %s" % ( galaxy_job_id, command_line ) )
        # runJob will raise if there's a submit problem
        job_id = self.ds.runJob(jt)
        if sge_queue_name is None:
            log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
        else:
            log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, sge_queue_name, job_id) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_runner( runner_url, job_id )

        # Store SGE related state information for job
        sge_job_state = SGEJobState()
        sge_job_state.job_wrapper = job_wrapper
        sge_job_state.job_id = job_id
        sge_job_state.ofile = ofile
        sge_job_state.efile = efile
        sge_job_state.job_file = jt.remoteCommand
        sge_job_state.old_state = 'new'
        sge_job_state.running = False
        sge_job_state.runner_url = runner_url

        # delete the job template
        self.ds.deleteJobTemplate( jt )

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( sge_job_state )

    def monitor( self ):
        """
        Watches jobs currently in the SGE queue and deals with state changes
        (queued to running) and job completion
        """
        while 1:
            # Take any new watched jobs and put them on the monitor list
            try:
                while 1:
                    sge_job_state = self.monitor_queue.get_nowait()
                    if sge_job_state is self.STOP_SIGNAL:
                        # TODO: This is where any cleanup would occur
                        self.ds.exit()
                        return
                    self.watched.append( sge_job_state )
            except Empty:
                pass
            # Iterate over the list of watched jobs and check state
            self.check_watched_items()
            # Sleep a bit before the next state check
            time.sleep( 1 )

    def check_watched_items( self ):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
        new_watched = []
        for sge_job_state in self.watched:
            job_id = sge_job_state.job_id
            galaxy_job_id = sge_job_state.job_wrapper.job_id
            old_state = sge_job_state.old_state
            try:
                state = self.ds.getJobProgramStatus( job_id )
            except DRMAA.InvalidJobError:
                # we should only get here if an orphaned job was put into the queue at app startup
                log.debug("(%s/%s) job left SGE queue" % ( galaxy_job_id, job_id ) )
                self.work_queue.put( ( 'finish', sge_job_state ) )
                continue
            except Exception, e:
                # so we don't kill the monitor thread
                log.exception("(%s/%s) Unable to check job status" % ( galaxy_job_id, job_id ) )
                log.warning("(%s/%s) job will now be errored" % ( galaxy_job_id, job_id ) )
                sge_job_state.fail_message = "Cluster could not complete job"
                self.work_queue.put( ( 'fail', sge_job_state ) )
                continue
            if state != old_state:
                log.debug("(%s/%s) state change: %s" % ( galaxy_job_id, job_id, DRMAA_state[state] ) )
            if state == DRMAA.Session.RUNNING and not sge_job_state.running:
                sge_job_state.running = True
                sge_job_state.job_wrapper.change_state( model.Job.states.RUNNING )
            if state in ( DRMAA.Session.DONE, DRMAA.Session.FAILED ):
                self.work_queue.put( ( 'finish', sge_job_state ) )
                continue
            sge_job_state.old_state = state
            new_watched.append( sge_job_state )
        # Replace the watch list with the updated version
        self.watched = new_watched

    def finish_job( self, sge_job_state ):
        """
        Get the output/error for a finished job, pass to `job_wrapper.finish`
        and cleanup all the SGE temporary files.
        """
        ofile = sge_job_state.ofile
        efile = sge_job_state.efile
        job_file = sge_job_state.job_file
        # collect the output
        try:
            ofh = file(ofile, "r")
            efh = file(efile, "r")
            stdout = ofh.read( 32768 )
            stderr = efh.read( 32768 )
        except:
            stdout = ''
            stderr = 'Job output not returned from cluster'
            log.debug(stderr)
        try:
            sge_job_state.job_wrapper.finish( stdout, stderr )
        except:
            log.exception("Job wrapper finish method failed")
        # clean up the sge files
        self.cleanup( ( ofile, efile, job_file ) )

    def fail_job( self, sge_job_state ):
        """
        Separated out so we can use the worker threads for it.
        """
        self.stop_job( self.sa_session.query( self.app.model.Job ).get( sge_job_state.job_wrapper.job_id ) )
        sge_job_state.job_wrapper.fail( sge_job_state.fail_message )
        self.cleanup( ( sge_job_state.ofile, sge_job_state.efile, sge_job_state.job_file ) )

    def cleanup( self, files ):
        # Job files are only removed when Galaxy's 'debug' option is off, so
        # they remain on disk for troubleshooting when debugging is enabled.
        if not asbool( self.app.config.get( 'debug', False ) ):
            for file in files:
                if os.access( file, os.R_OK ):
                    os.unlink( file )

    def put( self, job_wrapper ):
        """Add a job to the queue (by job identifier)"""
        # Change to queued state before handing to worker thread so the runner won't pick it up again
        job_wrapper.change_state( model.Job.states.QUEUED )
        self.work_queue.put( ( 'queue', job_wrapper ) )

    def shutdown( self ):
        """Attempts to gracefully shut down the monitor and worker threads"""
        log.info( "sending stop signal to worker threads" )
        self.monitor_queue.put( self.STOP_SIGNAL )
        for i in range( len( self.work_threads ) ):
            self.work_queue.put( ( self.STOP_SIGNAL, None ) )
        log.info( "sge job runner stopped" )

    def stop_job( self, job ):
        """Attempts to delete a job from the SGE queue"""
        try:
            self.ds.control( job.job_runner_external_id, DRMAA.Session.TERMINATE )
            log.debug( "(%s/%s) Removed from SGE queue at user's request" % ( job.id, job.job_runner_external_id ) )
        except DRMAA.InvalidJobError:
            log.debug( "(%s/%s) User killed running job, but it was already dead" % ( job.id, job.job_runner_external_id ) )

    def recover( self, job, job_wrapper ):
        """Recovers jobs stuck in the queued/running state when Galaxy started"""
        sge_job_state = SGEJobState()
        sge_job_state.ofile = "%s/database/pbs/%s.o" % (os.getcwd(), job.id)
        sge_job_state.efile = "%s/database/pbs/%s.e" % (os.getcwd(), job.id)
        sge_job_state.job_file = "%s/database/pbs/galaxy_%s.sh" % (os.getcwd(), job.id)
        sge_job_state.job_id = str( job.job_runner_external_id )
        sge_job_state.runner_url = job_wrapper.tool.job_runner
        job_wrapper.command_line = job.command_line
        sge_job_state.job_wrapper = job_wrapper
        if job.state == model.Job.states.RUNNING:
            log.debug( "(%s/%s) is still in running state, adding to the SGE queue" % ( job.id, job.job_runner_external_id ) )
            sge_job_state.old_state = DRMAA.Session.RUNNING
            sge_job_state.running = True
            self.monitor_queue.put( sge_job_state )
        elif job.state == model.Job.states.QUEUED:
            log.debug( "(%s/%s) is still in SGE queued state, adding to the SGE queue" % ( job.id, job.job_runner_external_id ) )
            sge_job_state.old_state = DRMAA.Session.QUEUED_ACTIVE
            sge_job_state.running = False
            self.monitor_queue.put( sge_job_state )