
/lib/galaxy/jobs/runners/drmaa.py

https://bitbucket.org/cistrome/cistrome-harvard/
  1"""
  2Job control via the DRMAA API.
  3"""
  4
  5import json
  6import logging
  7import os
  8import string
  9import subprocess
 10import sys
 11import time
 12
 13from galaxy import eggs
 14from galaxy import model
 15from galaxy.jobs import JobDestination
 16from galaxy.jobs.runners import AsynchronousJobState, AsynchronousJobRunner
 17
 18eggs.require( "drmaa" )
 19
 20log = logging.getLogger( __name__ )
 21
 22__all__ = [ 'DRMAAJobRunner' ]
 23
 24drmaa = None
 25
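# JobTemplate attributes that store_jobtemplate() serializes to JSON for the
# external (run-as-user) submission path.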
DRMAA_jobTemplate_attributes = [ 'args', 'remoteCommand', 'outputPath', 'errorPath', 'nativeSpecification',
                                 'jobName', 'email', 'project' ]


class DRMAAJobRunner( AsynchronousJobRunner ):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling
    """
    runner_name = "DRMAARunner"

    def __init__( self, app, nworkers, **kwargs ):
        """Start the job runner"""

        global drmaa

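        # The specs below are normally populated from <param> children of the
        # drmaa <plugin> element in job_conf.xml. Illustrative snippet (element
        # and attribute names assumed from Galaxy's job_conf.xml conventions):
        #   <plugin id="drmaa" type="runner" load="galaxy.jobs.runners.drmaa:DRMAAJobRunner">
        #       <param id="invalidjobexception_retries">5</param>
        #       <param id="internalexception_state">error</param>
        #   </plugin>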
        runner_param_specs = dict(
            drmaa_library_path = dict( map = str, default = os.environ.get( 'DRMAA_LIBRARY_PATH', None ) ),
            invalidjobexception_state = dict( map = str, valid = lambda x: x in ( model.Job.states.OK, model.Job.states.ERROR ), default = model.Job.states.OK ),
            invalidjobexception_retries = dict( map = int, valid = lambda x: x >= 0, default = 0 ),
            internalexception_state = dict( map = str, valid = lambda x: x in ( model.Job.states.OK, model.Job.states.ERROR ), default = model.Job.states.OK ),
            internalexception_retries = dict( map = int, valid = lambda x: x >= 0, default = 0 ) )

        if 'runner_param_specs' not in kwargs:
            kwargs[ 'runner_param_specs' ] = dict()
        kwargs[ 'runner_param_specs' ].update( runner_param_specs )

        super( DRMAAJobRunner, self ).__init__( app, nworkers, **kwargs )

        # This allows multiple drmaa runners (although only one per handler) in the same job config file
        if 'drmaa_library_path' in kwargs:
            log.info( 'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path )
            os.environ['DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path

        # We foolishly named this file the same as the name exported by the drmaa
        # library... 'import drmaa' imports itself.
        drmaa = __import__( "drmaa" )

        # Subclasses may need access to state constants
        self.drmaa_job_states = drmaa.JobState

        # Descriptive state strings pulled from the drmaa lib itself
        self.drmaa_job_state_strings = {
            drmaa.JobState.UNDETERMINED: 'process status cannot be determined',
            drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active',
            drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold',
            drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold',
            drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
            drmaa.JobState.RUNNING: 'job is running',
            drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended',
            drmaa.JobState.USER_SUSPENDED: 'job is user suspended',
            drmaa.JobState.DONE: 'job finished normally',
            drmaa.JobState.FAILED: 'job finished, but failed',
        }

        self.ds = drmaa.Session()
        self.ds.initialize()

        # external_runJob_script can be None, in which case it's not used.
        self.external_runJob_script = app.config.drmaa_external_runjob_script
        self.external_killJob_script = app.config.drmaa_external_killjob_script
        self.userid = None

        self._init_monitor_thread()
        self._init_worker_threads()

    def url_to_destination(self, url):
        """Convert a legacy URL to a job destination"""
        if not url:
            return
        native_spec = url.split('/')[2]
        if native_spec:
            params = dict( nativeSpecification=native_spec )
            log.debug( "Converted URL '%s' to destination runner=drmaa, params=%s" % ( url, params ) )
            return JobDestination( runner='drmaa', params=params )
        else:
            log.debug( "Converted URL '%s' to destination runner=drmaa" % url )
            return JobDestination( runner='drmaa' )

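    # Illustrative conversion by url_to_destination() above (legacy URL form
    # 'drmaa://<native specification>/' assumed from the parsing it performs):
    #   url_to_destination( 'drmaa://-q main.q -pe smp 4/' )
    #     -> JobDestination( runner='drmaa', params={ 'nativeSpecification': '-q main.q -pe smp 4' } )
    #   url_to_destination( 'drmaa:///' )
    #     -> JobDestination( runner='drmaa' )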
    def get_native_spec( self, url ):
        """Get any native DRM arguments specified by the site configuration"""
        try:
            return url.split('/')[2] or None
        except:
            return None

    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=True ):
            return

        # command line has been added to the wrapper by prepare_job()
        command_line = job_wrapper.runner_command_line

        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if self.external_runJob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join( map( lambda x: x if x in ( string.letters + string.digits + '_' ) else '_', job_name ) )
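        # e.g. (illustrative) 'g123_bwa_user@example.org' becomes
        # 'g123_bwa_user_example_org' after the substitution above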
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name )

        # set up the drmaa job template
        jt = self.ds.createJobTemplate()
        jt.remoteCommand = ajs.job_file
        jt.jobName = ajs.job_name
        jt.outputPath = ":%s" % ajs.output_file
        jt.errorPath = ":%s" % ajs.error_file
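        # DRMAA output/error paths use the form '[hostname]:path'; the leading
        # ':' (empty hostname) means a plain filesystem path on the execution host.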

        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get('nativeSpecification', None)
        if native_spec is not None:
            jt.nativeSpecification = native_spec

        # fill in the DRM's job run template
        script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file)
        try:
            fh = file( ajs.job_file, "w" )
            fh.write( script )
            fh.close()
            os.chmod( ajs.job_file, 0755 )
        except:
            job_wrapper.fail( "failure preparing job script", exception=True )
            log.exception( "(%s) failure writing job script" % galaxy_id_tag )
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if self.app.config.cleanup_job in ( "always", "onsuccess" ):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file %s", galaxy_id_tag, ajs.job_file )
        log.debug( "(%s) command is: %s", galaxy_id_tag, command_line )
        if native_spec:
            log.debug( "(%s) native specification is: %s", galaxy_id_tag, native_spec )

        # runJob will raise if there's a submit problem
        if self.external_runJob_script is None:
            # TODO: create a queue for retrying submission indefinitely
            # TODO: configurable max tries and sleep
            trynum = 0
            external_job_id = None
            while external_job_id is None and trynum < 5:
                try:
                    external_job_id = self.ds.runJob(jt)
                except drmaa.InternalException, e:
                    trynum += 1
                    log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e )
                    time.sleep( 5 )
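            # Note: if all five attempts raise, external_job_id is still None
            # here and the job is logged/recorded below with a None id.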
        else:
            job_wrapper.change_ownership_for_run()
            log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, job_wrapper.user_system_pwent[0], job_wrapper.user_system_pwent[2] ) )
            filename = self.store_jobtemplate(job_wrapper, jt)
            self.userid = job_wrapper.user_system_pwent[2]
            external_job_id = self.external_runjob(filename, job_wrapper.user_system_pwent[2]).strip()
        log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # delete the job template
        self.ds.deleteJobTemplate( jt )

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )

    def _complete_terminal_job( self, ajs, drmaa_state, **kwargs ):
        """
        Handle a job upon its termination in the DRM. This method is meant to
        be overridden by subclasses to improve post-mortem and reporting of
        failures.
        """
        if drmaa_state == drmaa.JobState.FAILED:
            if ajs.job_wrapper.get_state() != model.Job.states.DELETED:
                ajs.stop_job = False
                ajs.fail_message = "The cluster DRM system terminated this job"
                self.work_queue.put( ( self.fail_job, ajs ) )
        elif drmaa_state == drmaa.JobState.DONE:
            super( DRMAAJobRunner, self )._complete_terminal_job( ajs )

    def check_watched_items( self ):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
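        # Policy when the DRM can no longer report on a job id: the
        # invalidjobexception_*/internalexception_* runner params decide how
        # many times to re-check and whether to then treat the job as having
        # finished normally (OK) or as failed (ERROR).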
        new_watched = []
        for ajs in self.watched:
            external_job_id = ajs.job_id
            galaxy_id_tag = ajs.job_wrapper.get_id_tag()
            old_state = ajs.old_state
            try:
                assert external_job_id not in ( None, 'None' ), '(%s/%s) Invalid job id' % ( galaxy_id_tag, external_job_id )
                state = self.ds.jobStatus( external_job_id )
            except ( drmaa.InternalException, drmaa.InvalidJobException ), e:
                if isinstance( e, drmaa.InvalidJobException ):
                    ecn = "InvalidJobException".lower()
                else:
                    ecn = "InternalException".lower()
                retry_param = ecn + '_retries'
                state_param = ecn + '_state'
                retries = getattr( ajs, retry_param, 0 )
                if self.runner_params[ retry_param ] > 0:
                    if retries < self.runner_params[ retry_param ]:
                        # will retry check on next iteration
                        setattr( ajs, retry_param, retries + 1 )
                        continue
                if self.runner_params[ state_param ] == model.Job.states.OK:
                    log.info( "(%s/%s) job left DRM queue with following message: %s", galaxy_id_tag, external_job_id, e )
                    self.work_queue.put( ( self.finish_job, ajs ) )
                elif self.runner_params[ state_param ] == model.Job.states.ERROR:
                    log.info( "(%s/%s) job check resulted in %s after %s tries: %s", galaxy_id_tag, external_job_id, ecn, retries, e )
                    self.work_queue.put( ( self.fail_job, ajs ) )
                else:
                    raise Exception( "%s is set to an invalid value (%s), this should not be possible. See galaxy.jobs.drmaa.__init__()" % ( state_param, self.runner_params[ state_param ] ) )
                continue
            except drmaa.DrmCommunicationException, e:
                log.warning( "(%s/%s) unable to communicate with DRM: %s", galaxy_id_tag, external_job_id, e )
                new_watched.append( ajs )
                continue
            except Exception, e:
                # so we don't kill the monitor thread
                log.exception( "(%s/%s) Unable to check job status: %s" % ( galaxy_id_tag, external_job_id, str( e ) ) )
                log.warning( "(%s/%s) job will now be errored" % ( galaxy_id_tag, external_job_id ) )
                ajs.fail_message = "Cluster could not complete job"
                self.work_queue.put( ( self.fail_job, ajs ) )
                continue
            if state != old_state:
                log.debug( "(%s/%s) state change: %s" % ( galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state] ) )
            if state == drmaa.JobState.RUNNING and not ajs.running:
                ajs.running = True
                ajs.job_wrapper.change_state( model.Job.states.RUNNING )
            if state in ( drmaa.JobState.FAILED, drmaa.JobState.DONE ):
                self._complete_terminal_job( ajs, drmaa_state = state )
                continue
            ajs.old_state = state
            new_watched.append( ajs )
        # Replace the watch list with the updated version
        self.watched = new_watched

    def stop_job( self, job ):
        """Attempts to delete a job from the DRM queue"""
        try:
            ext_id = job.get_job_runner_external_id()
            assert ext_id not in ( None, 'None' ), 'External job id is None'
            if self.external_killJob_script is None:
                self.ds.control( ext_id, drmaa.JobControlAction.TERMINATE )
            else:
                # FIXME: hardcoded path
                subprocess.Popen( [ '/usr/bin/sudo', '-E', self.external_killJob_script, str( ext_id ), str( self.userid ) ], shell=False )
            log.debug( "(%s/%s) Removed from DRM queue at user's request" % ( job.get_id(), ext_id ) )
        except drmaa.InvalidJobException:
            log.debug( "(%s/%s) User killed running job, but it was already dead" % ( job.get_id(), ext_id ) )
        except Exception, e:
            log.debug( "(%s/%s) User killed running job, but error encountered removing from DRM queue: %s" % ( job.get_id(), ext_id, e ) )

    def recover( self, job, job_wrapper ):
        """Recovers jobs stuck in the queued/running state when Galaxy started"""
        job_id = job.get_job_runner_external_id()
        if job_id is None:
            self.put( job_wrapper )
            return
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )
        ajs.job_id = str( job_id )
        ajs.command_line = job.get_command_line()
        ajs.job_wrapper = job_wrapper
        ajs.job_destination = job_wrapper.job_destination
        self.__old_state_paths( ajs )
        if job.state == model.Job.states.RUNNING:
            log.debug( "(%s/%s) is still in running state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
            ajs.old_state = drmaa.JobState.RUNNING
            ajs.running = True
            self.monitor_queue.put( ajs )
        elif job.get_state() == model.Job.states.QUEUED:
            log.debug( "(%s/%s) is still in DRM queued state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
            ajs.old_state = drmaa.JobState.QUEUED_ACTIVE
            ajs.running = False
            self.monitor_queue.put( ajs )

    def __old_state_paths( self, ajs ):
        """For recovery of jobs started prior to standardizing the naming of
        files in the AsynchronousJobState object
        """
        if ajs.job_wrapper is not None:
            job_file = "%s/galaxy_%s.sh" % (self.app.config.cluster_files_directory, ajs.job_wrapper.job_id)
            if not os.path.exists( ajs.job_file ) and os.path.exists( job_file ):
                ajs.output_file = "%s.drmout" % os.path.join(os.getcwd(), ajs.job_wrapper.working_directory, ajs.job_wrapper.get_id_tag())
                ajs.error_file = "%s.drmerr" % os.path.join(os.getcwd(), ajs.job_wrapper.working_directory, ajs.job_wrapper.get_id_tag())
                ajs.exit_code_file = "%s.drmec" % os.path.join(os.getcwd(), ajs.job_wrapper.working_directory, ajs.job_wrapper.get_id_tag())
                ajs.job_file = job_file

    def store_jobtemplate(self, job_wrapper, jt):
        """ Stores the content of a DRMAA JobTemplate object in a file as a JSON string.
        The path is hard-coded, but it's no worse than other paths in this module.
        Uses Galaxy's job ID, so the file is expected to be unique."""
        filename = "%s/%s.jt_json" % (self.app.config.cluster_files_directory, job_wrapper.get_id_tag())
        data = {}
        for attr in DRMAA_jobTemplate_attributes:
            try:
                data[attr] = getattr(jt, attr)
            except:
                pass
        s = json.dumps(data)
        f = open(filename, 'w')
        f.write(s)
        f.close()
        log.debug( '(%s) Job script for external submission is: %s' % ( job_wrapper.job_id, filename ) )
        return filename

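    # Illustrative contents of the <id>.jt_json file written by
    # store_jobtemplate() above (paths and values are hypothetical; only
    # attributes actually set on the template are present):
    #   {"remoteCommand": "/.../galaxy_123.sh", "jobName": "g123_bwa",
    #    "outputPath": ":/.../123.o", "errorPath": ":/.../123.e",
    #    "nativeSpecification": "-q main.q"}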
    def external_runjob(self, jobtemplate_filename, username):
        """ Runs an external script that will qsub a new job.
        The external script will be run with sudo, and will setuid() to the specified user.
        Effectively, this submits the job as a different user (than the one running Galaxy).
        """
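        # Assumed contract of the external script (inferred from how it is
        # invoked here and from store_jobtemplate()): it receives the target
        # username and the .jt_json path, setuid()s to that user, submits the
        # stored job template through its own DRMAA session, and prints the
        # resulting DRM job id on stdout.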
        script_parts = self.external_runJob_script.split()
        script = script_parts[0]
        command = [ '/usr/bin/sudo', '-E', script]
        for script_argument in script_parts[1:]:
            command.append(script_argument)

        command.extend( [ str(username), jobtemplate_filename ] )
        log.info("Running command %s" % command)
        p = subprocess.Popen(command,
                shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = p.communicate()
        exitcode = p.returncode
        #os.unlink(jobtemplate_filename)
        if exitcode != 0:
            # There was an error in the child process
            raise RuntimeError("External_runjob failed (exit code %s)\nChild process reported error:\n%s" % (str(exitcode), stderrdata))
        if not stdoutdata.strip():
            raise RuntimeError("External_runjob did not return a job id: %s" % (stdoutdata))

        # The expected output is a single line containing the numeric DRMAA
        # job id; the caller strips surrounding whitespace.
        jobId = stdoutdata
        return jobId