
/lib/galaxy/jobs/deferred/data_transfer.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 384 lines | 360 code | 6 blank | 18 comment
  1"""
  2Module for managing data transfer jobs.
  3"""
  4import logging, urllib2, re, shutil
  5
  6from galaxy import eggs
  7from sqlalchemy import and_
  8
  9from galaxy.util.odict import odict
 10from galaxy.workflow.modules import module_factory
 11from galaxy.jobs.actions.post import ActionBox
 12from galaxy.jobs.deferred import FakeTrans
 13
 14from galaxy.tools.parameters import visit_input_values
 15from galaxy.tools.parameters.basic import DataToolParameter
 16from galaxy.datatypes import sniff
 17
 18log = logging.getLogger( __name__ )
 19
 20__all__ = [ 'DataTransfer' ]
 21
 22class DataTransfer( object ):
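    # Base class for deferred data transfer plugins.  Subclasses are expected to
    # implement create_job() and check_job(); run_job() below drives the shared
    # 'init_transfer' / 'finish_transfer' life cycle.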
    check_interval = 15
    dataset_name_re = re.compile( r'(dataset\d+)_(name)' )
    dataset_datatype_re = re.compile( r'(dataset\d+)_(datatype)' )
    def __init__( self, app ):
        self.app = app
        self.sa_session = app.model.context.current
    def create_job( self, trans, **kwd ):
        raise Exception( "Unimplemented Method" )
    def check_job( self, job ):
        raise Exception( "Unimplemented Method" )
    def run_job( self, job ):
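        # Deferred transfer jobs arrive here in one of two phases, identified by
        # job.params[ 'type' ]: 'init_transfer' followed later by 'finish_transfer'.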
        if job.params[ 'type' ] == 'init_transfer':
            # TODO: don't create new downloads on restart.
            if job.params[ 'protocol' ] in [ 'http', 'https' ]:
                results = []
                for result in job.params[ 'results' ].values():
                    result[ 'transfer_job' ] = self.app.transfer_manager.new( protocol=job.params[ 'protocol' ],
                                                                              name=result[ 'name' ],
                                                                              datatype=result[ 'datatype' ],
                                                                              url=result[ 'url' ] )
                    results.append( result )
            elif job.params[ 'protocol' ] == 'scp':
                results = []
                result = {}
                sample_datasets_dict = job.params[ 'sample_datasets_dict' ]
                # sample_datasets_dict looks something like the following.  The outer dictionary keys are SampleDataset ids.
                # {'7': {'status': 'Not started', 'name': '3.bed', 'file_path': '/tmp/library/3.bed', 'sample_id': 7,
                #        'external_service_id': 2, 'error_msg': '', 'size': '8.0K'}}
                for sample_dataset_id, sample_dataset_info_dict in sample_datasets_dict.items():
                    result = {}
                    result[ 'transfer_job' ] = self.app.transfer_manager.new( protocol=job.params[ 'protocol' ],
                                                                              host=job.params[ 'host' ],
                                                                              user_name=job.params[ 'user_name' ],
                                                                              password=job.params[ 'password' ],
                                                                              sample_dataset_id=sample_dataset_id,
                                                                              status=sample_dataset_info_dict[ 'status' ],
                                                                              name=sample_dataset_info_dict[ 'name' ],
                                                                              file_path=sample_dataset_info_dict[ 'file_path' ],
                                                                              sample_id=sample_dataset_info_dict[ 'sample_id' ],
                                                                              external_service_id=sample_dataset_info_dict[ 'external_service_id' ],
                                                                              error_msg=sample_dataset_info_dict[ 'error_msg' ],
                                                                              size=sample_dataset_info_dict[ 'size' ] )
                    results.append( result )
            self.app.transfer_manager.run( [ r[ 'transfer_job' ] for r in results ] )
            for result in results:
                transfer_job = result.pop( 'transfer_job' )
                self.create_job( None,
                                 transfer_job_id=transfer_job.id,
                                 result=transfer_job.params,
                                 sample_id=job.params[ 'sample_id' ] )
                # Update the state of the relevant SampleDataset
                new_status = self.app.model.SampleDataset.transfer_status.IN_QUEUE
                self._update_sample_dataset_status( protocol=job.params[ 'protocol' ],
                                                    sample_id=job.params[ 'sample_id' ],
                                                    result_dict=transfer_job.params,
                                                    new_status=new_status,
                                                    error_msg='' )
            job.state = self.app.model.DeferredJob.states.OK
            self.sa_session.add( job )
            self.sa_session.flush()
            # TODO: Error handling: failure executing, or errors returned from the manager
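        # A 'finish_transfer' job imports the completed transfer into a new
        # LibraryDataset/LDDA under the sample's folder, then hands the dataset
        # to the sample's workflow or history if one was provided.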
        if job.params[ 'type' ] == 'finish_transfer':
            protocol = job.params[ 'protocol' ]
            # Update the state of the relevant SampleDataset
            new_status = self.app.model.SampleDataset.transfer_status.ADD_TO_LIBRARY
            if protocol in [ 'http', 'https' ]:
                result_dict = job.params[ 'result' ]
                library_dataset_name = result_dict[ 'name' ]
                extension = result_dict[ 'datatype' ]
            elif protocol in [ 'scp' ]:
                # In this case, job.params will be a dictionary that contains a key named 'result'.  The value
                # of the result key is a dictionary that looks something like:
                # {'sample_dataset_id': '8', 'status': 'Not started', 'protocol': 'scp', 'name': '3.bed',
                #  'file_path': '/data/library/3.bed', 'host': '127.0.0.1', 'sample_id': 8, 'external_service_id': 2,
                #  'local_path': '/tmp/kjl2Ss4', 'password': 'galaxy', 'user_name': 'gvk', 'error_msg': '', 'size': '8.0K'}
                try:
                    tj = self.sa_session.query( self.app.model.TransferJob ).get( int( job.params['transfer_job_id'] ) )
                    result_dict = tj.params
                    result_dict['local_path'] = tj.path
                except Exception, e:
                    log.error( "Updated transfer result unavailable, using old result.  Error was: %s" % str( e ) )
                    result_dict = job.params[ 'result' ]
                library_dataset_name = result_dict[ 'name' ]
                # Determine the data format (see the relevant TODO item in the manual_data_transfer plugin).
                extension = sniff.guess_ext( result_dict[ 'local_path' ], sniff_order=self.app.datatypes_registry.sniff_order )
            self._update_sample_dataset_status( protocol=job.params[ 'protocol' ],
                                                sample_id=int( job.params[ 'sample_id' ] ),
                                                result_dict=result_dict,
                                                new_status=new_status,
                                                error_msg='' )
            sample = self.sa_session.query( self.app.model.Sample ).get( int( job.params[ 'sample_id' ] ) )
            ld = self.app.model.LibraryDataset( folder=sample.folder, name=library_dataset_name )
            self.sa_session.add( ld )
            self.sa_session.flush()
            self.app.security_agent.copy_library_permissions( FakeTrans( self.app ), sample.folder, ld )
            ldda = self.app.model.LibraryDatasetDatasetAssociation( name = library_dataset_name,
                                                                    extension = extension,
                                                                    dbkey = '?',
                                                                    library_dataset = ld,
                                                                    create_dataset = True,
                                                                    sa_session = self.sa_session )
            ldda.message = 'Transferred by the Data Transfer Plugin'
            self.sa_session.add( ldda )
            self.sa_session.flush()
            ldda.state = ldda.states.QUEUED # flushed in the state property setter
            ld.library_dataset_dataset_association_id = ldda.id
            self.sa_session.add( ld )
            self.sa_session.flush()
            try:
                # Move the dataset from its temporary location
                shutil.move( job.transfer_job.path, ldda.file_name )
                ldda.init_meta()
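                # Seed datatype-defined default metadata values; the external
                # metadata tool invoked below fills in the rest.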
                for name, spec in ldda.metadata.spec.items():
                    if name not in [ 'name', 'info', 'dbkey', 'base_name' ]:
                        if spec.get( 'default' ):
                            setattr( ldda.metadata, name, spec.unwrap( spec.get( 'default' ) ) )
                self.app.datatypes_registry.set_external_metadata_tool.tool_action.execute( self.app.datatypes_registry.set_external_metadata_tool,
                                                                                            FakeTrans( self.app,
                                                                                                       history=sample.history,
                                                                                                       user=sample.request.user ),
                                                                                            incoming = { 'input1':ldda } )
                ldda.state = ldda.states.OK
                # TODO: not sure if this flush is necessary
                self.sa_session.add( ldda )
                self.sa_session.flush()
            except Exception, e:
                log.exception( 'Failure preparing library dataset for finished transfer job (id: %s) via deferred job (id: %s):' % \
                               ( str( job.transfer_job.id ), str( job.id ) ) )
                ldda.state = ldda.states.ERROR
            if sample.workflow:
                log.debug( "\n\nLogging sample mappings as: %s" % sample.workflow[ 'mappings' ] )
                log.debug( "job.params: %s" % job.params )
                # We have a workflow.  Update all mappings to ldda's, and when the final one is done
                # execute_workflow with either the provided history, or a new one.
                sub_done = True
                rep_done = False
                for k, v in sample.workflow[ 'mappings' ].iteritems():
                    if 'hda' not in v and v[ 'ds_tag' ].startswith( 'hi|' ):
                        sample.workflow[ 'mappings' ][ k ][ 'hda' ] = self.app.security.decode_id( v[ 'ds_tag' ][3:] )
                for key, value in sample.workflow[ 'mappings' ].iteritems():
                    if 'url' in value and value[ 'url' ] == job.params[ 'result' ][ 'url' ]:
                        # DBTODO Make sure all ds| mappings get the URL of the dataset, for linking to later.
                        # If this dataset maps to what we just finished, update the ldda id in the sample.
                        sample.workflow[ 'mappings' ][ key ][ 'ldda' ] = ldda.id
                        rep_done = True
                    # DBTODO replace the hi| mappings with the hda here.  Just rip off the first three chars.
                    elif 'ldda' not in value and 'hda' not in value:
                        # We're not done if some mappings still don't have ldda or hda mappings.
                        sub_done = False
                if sub_done and rep_done:
                    if not sample.history:
                        new_history = self.app.model.History( name="New History From %s" % sample.name, user=sample.request.user )
                        self.sa_session.add( new_history )
                        sample.history = new_history
                        self.sa_session.flush()
                    self._execute_workflow( sample )
                # Check the workflow for substitution done-ness
                self.sa_session.add( sample )
                self.sa_session.flush()
            elif sample.history:
                # We don't have a workflow, but a history was provided.
                # No processing, go ahead and chunk everything in the history.
                if ldda.dataset.state in [ 'new', 'upload', 'queued', 'running', 'empty', 'discarded' ]:
                    log.error("Cannot import dataset '%s' to user history since its state is '%s'.  " % ( ldda.name, ldda.dataset.state ))
                elif ldda.dataset.state in [ 'ok', 'error' ]:
                    ldda.to_history_dataset_association( target_history=sample.history, add_to_history=True )
            # Finished
            job.state = self.app.model.DeferredJob.states.OK
            self.sa_session.add( job )
            self.sa_session.flush()
            # Update the state of the relevant SampleDataset
            new_status = self.app.model.SampleDataset.transfer_status.COMPLETE
            self._update_sample_dataset_status( protocol=job.params[ 'protocol' ],
                                                sample_id=int( job.params[ 'sample_id' ] ),
                                                result_dict=job.params[ 'result' ],
                                                new_status=new_status,
                                                error_msg='' )
            if sample.datasets and not sample.untransferred_dataset_files:
                # Update the state of the sample to the sample's request type's final state.
                new_state = sample.request.type.final_sample_state
                self._update_sample_state( sample.id, new_state )
                # Update the state of the request, if possible
                self._update_request_state( sample.request.id )
    def _missing_params( self, params, required_params ):
        missing_params = filter( lambda x: x not in params, required_params )
        if missing_params:
            log.error( 'Job parameters missing required keys: %s' % ', '.join( missing_params ) )
            return True
        return False
    def _update_sample_dataset_status( self, protocol, sample_id, result_dict, new_status, error_msg=None ):
        # result_dict looks something like:
        # {'url': '127.0.0.1/data/filtered_subreads.fa', 'name': 'Filtered reads'}
        # Check if the new status is a valid transfer status
        valid_statuses = [ v[1] for v in self.app.model.SampleDataset.transfer_status.items() ]
        # TODO: error checking on valid new_status value
        if protocol in [ 'http', 'https' ]:
            sample_dataset = self.sa_session.query( self.app.model.SampleDataset ) \
                                            .filter( and_( self.app.model.SampleDataset.table.c.sample_id == sample_id,
                                                           self.app.model.SampleDataset.table.c.name == result_dict[ 'name' ],
                                                           self.app.model.SampleDataset.table.c.file_path == result_dict[ 'url' ] ) ) \
                                            .first()
        elif protocol in [ 'scp' ]:
            sample_dataset = self.sa_session.query( self.app.model.SampleDataset ).get( int( result_dict[ 'sample_dataset_id' ] ) )
        sample_dataset.status = new_status
        sample_dataset.error_msg = error_msg
        self.sa_session.add( sample_dataset )
        self.sa_session.flush()
    def _update_sample_state( self, sample_id, new_state, comment=None ):
        sample = self.sa_session.query( self.app.model.Sample ).get( sample_id )
        if comment is None:
            comment = 'Sample state set to %s' % str( new_state )
        event = self.app.model.SampleEvent( sample, new_state, comment )
        self.sa_session.add( event )
        self.sa_session.flush()
    def _update_request_state( self, request_id ):
        request = self.sa_session.query( self.app.model.Request ).get( request_id )
        # Make sure all the samples of the current request have the same state
        common_state = request.samples_have_common_state
        if not common_state:
            # If the current request state is complete and one of its samples moved from
            # the final sample state, then move the request state to In-progress
            if request.is_complete:
                message = "At least 1 sample state moved from the final sample state, so now the request's state is (%s)" % request.states.SUBMITTED
                event = self.app.model.RequestEvent( request, request.states.SUBMITTED, message )
                self.sa_session.add( event )
                self.sa_session.flush()
        else:
            final_state = False
            request_type_state = request.type.final_sample_state
            if common_state.id == request_type_state.id:
                # Since all the samples are in the final state, change the request state to 'Complete'
                comment = "All samples of this sequencing request are in the final sample state (%s). " % request_type_state.name
                state = request.states.COMPLETE
                final_state = True
            else:
                comment = "All samples of this sequencing request are in the (%s) sample state. " % common_state.name
                state = request.states.SUBMITTED
            event = self.app.model.RequestEvent( request, state, comment )
            self.sa_session.add( event )
            self.sa_session.flush()
            # TODO: handle email notification if it is configured to be sent when the samples are in this state.
    def _execute_workflow( self, sample ):
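        # Copy any workflow-mapped library datasets into the sample's history,
        # normalize the stored mapping keys to integer step ids, then prepare and
        # run each workflow step, recording the run as a WorkflowInvocation.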
        for key, value in sample.workflow['mappings'].iteritems():
            if 'hda' not in value and 'ldda' in value:
                # If HDA is already here, it's an external input, we're not copying anything.
                ldda = self.sa_session.query( self.app.model.LibraryDatasetDatasetAssociation ).get( value['ldda'] )
                if ldda.dataset.state in [ 'new', 'upload', 'queued', 'running', 'empty', 'discarded' ]:
                    log.error("Cannot import dataset '%s' to user history since its state is '%s'.  " % ( ldda.name, ldda.dataset.state ))
                elif ldda.dataset.state in [ 'ok', 'error' ]:
                    hda = ldda.to_history_dataset_association( target_history=sample.history, add_to_history=True )
                    sample.workflow['mappings'][key]['hda'] = hda.id
                    self.sa_session.add( sample )
                    self.sa_session.flush()
        workflow_dict = sample.workflow
        import copy
        new_wf_dict = copy.deepcopy(workflow_dict)
        for key in workflow_dict['mappings']:
            if not isinstance(key, int):
                new_wf_dict['mappings'][int(key)] = workflow_dict['mappings'][key]
        workflow_dict = new_wf_dict
        fk_trans = FakeTrans(self.app, history = sample.history, user=sample.request.user)
        workflow = self.sa_session.query(self.app.model.Workflow).get(workflow_dict['id'])
        if not workflow:
            log.error("Workflow mapping failure.")
            return
        if len( workflow.steps ) == 0:
            log.error( "Workflow cannot be run because it does not have any steps" )
            return
        if workflow.has_cycles:
            log.error( "Workflow cannot be run because it contains cycles" )
            return
        if workflow.has_errors:
            log.error( "Workflow cannot be run because of validation errors in some steps" )
            return
        # Build the state for each step
        errors = {}
        has_upgrade_messages = False
        has_errors = False
        # Build a fake dictionary prior to execution.
        # Prepare each step
        for step in workflow.steps:
            step.upgrade_messages = {}
            # Construct modules
            if step.type == 'tool' or step.type is None:
                # Restore the tool state for the step
                step.module = module_factory.from_workflow_step( fk_trans, step )
                # Fix any missing parameters
                step.upgrade_messages = step.module.check_and_update_state()
                if step.upgrade_messages:
                    has_upgrade_messages = True
                # Any connected input needs to have value DummyDataset (these
                # are not persisted so we need to do it every time)
                step.module.add_dummy_datasets( connections=step.input_connections )
                # Store state with the step
                step.state = step.module.state
                # Error dict
                if step.tool_errors:
                    has_errors = True
                    errors[step.id] = step.tool_errors
            else:
                ## Non-tool specific stuff?
                step.module = module_factory.from_workflow_step( fk_trans, step )
                step.state = step.module.get_runtime_state()
            # Connections by input name
            step.input_connections_by_name = dict( ( conn.input_name, conn ) for conn in step.input_connections )
        for step in workflow.steps:
            step.upgrade_messages = {}
            # Connections by input name
            step.input_connections_by_name = \
                dict( ( conn.input_name, conn ) for conn in step.input_connections )
            # Extract just the arguments for this step by prefix
            step_errors = None
            if step.type == 'tool' or step.type is None:
                module = module_factory.from_workflow_step( fk_trans, step )
                # Fix any missing parameters
                step.upgrade_messages = module.check_and_update_state()
                if step.upgrade_messages:
                    has_upgrade_messages = True
                # Any connected input needs to have value DummyDataset (these
                # are not persisted so we need to do it every time)
                module.add_dummy_datasets( connections=step.input_connections )
                # Get the tool
                tool = module.tool
                # Get the state
                step.state = state = module.state
                # Get old errors
                old_errors = state.inputs.pop( "__errors__", {} )
            if step_errors:
                errors[step.id] = state.inputs["__errors__"] = step_errors
        # Run each step, connecting outputs to inputs
        workflow_invocation = self.app.model.WorkflowInvocation()
        workflow_invocation.workflow = workflow
        outputs = odict()
        for i, step in enumerate( workflow.steps ):
            job = None
            if step.type == 'tool' or step.type is None:
                tool = self.app.toolbox.tools_by_id[ step.tool_id ]
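                # Wire each connected data input to the output dataset produced by
                # its upstream step before executing the tool.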
                def callback( input, value, prefixed_name, prefixed_label ):
                    if isinstance( input, DataToolParameter ):
                        if prefixed_name in step.input_connections_by_name:
                            conn = step.input_connections_by_name[ prefixed_name ]
                            return outputs[ conn.output_step.id ][ conn.output_name ]
                visit_input_values( tool.inputs, step.state.inputs, callback )
                job, out_data = tool.execute( fk_trans, step.state.inputs, history=sample.history)
                outputs[ step.id ] = out_data
                for pja in step.post_job_actions:
                    if pja.action_type in ActionBox.immediate_actions:
                        ActionBox.execute(self.app, self.sa_session, pja, job, replacement_dict=None)
                    else:
                        job.add_post_job_action(pja)
            else:
                job, out_data = step.module.execute( fk_trans, step.state)
                outputs[ step.id ] = out_data
                if step.id in workflow_dict['mappings']:
                    data = self.sa_session.query( self.app.model.HistoryDatasetAssociation ).get( workflow_dict['mappings'][str(step.id)]['hda'] )
                    outputs[ step.id ]['output'] = data
            workflow_invocation_step = self.app.model.WorkflowInvocationStep()
            workflow_invocation_step.workflow_invocation = workflow_invocation
            workflow_invocation_step.workflow_step = step
            workflow_invocation_step.job = job
        self.sa_session.add( workflow_invocation )
        self.sa_session.flush()