
/lib/galaxy/tools/actions/__init__.py

https://bitbucket.org/cistrome/cistrome-harvard/
import os

from galaxy.exceptions import ObjectInvalid
from galaxy.model import LibraryDatasetDatasetAssociation
from galaxy.tools.parameters import DataToolParameter
from galaxy.tools.parameters.wrapped import WrappedParameters
from galaxy.util.json import from_json_string
from galaxy.util.json import to_json_string
from galaxy.util.none_like import NoneDataset
from galaxy.util.odict import odict
from galaxy.util.template import fill_template
from galaxy.web import url_for

import logging
log = logging.getLogger( __name__ )


class ToolAction( object ):
    """
    The actions to be taken when a tool is run (after parameters have
    been converted and validated).
    """
    def execute( self, tool, trans, incoming={}, set_output_hid=True ):
        raise TypeError("Abstract method")


class DefaultToolAction( object ):
    """Default tool action is to run an external command"""

    def collect_input_datasets( self, tool, param_values, trans ):
        """
        Collect any dataset inputs from incoming. Returns a mapping from
        parameter name to Dataset instance for each tool parameter that is
        of the DataToolParameter type.
        """
        input_datasets = dict()

        def visitor( prefix, input, value, parent=None ):

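            # process_dataset (below) handles implicit format conversion: if a
            # supplied dataset does not match one of the parameter's accepted
            # formats, an already converted dataset is reused when available,
            # otherwise the datatype converter is run and its output is used.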
            def process_dataset( data, formats=None ):
                if not data:
                    return data
                if formats is None:
                    formats = input.formats
                if not data.datatype.matches_any( formats ):
                    # Need to refresh in case this conversion just took place, i.e. an input above in the tool performed the same conversion
                    trans.sa_session.refresh( data )
                    target_ext, converted_dataset = data.find_conversion_destination( formats )
                    if target_ext:
                        if converted_dataset:
                            data = converted_dataset
                        else:
                            # run converter here
                            new_data = data.datatype.convert_dataset( trans, data, target_ext, return_output=True, visible=False ).values()[0]
                            new_data.hid = data.hid
                            new_data.name = data.name
                            trans.sa_session.add( new_data )
                            assoc = trans.app.model.ImplicitlyConvertedDatasetAssociation( parent=data, file_type=target_ext, dataset=new_data, metadata_safe=False )
                            trans.sa_session.add( assoc )
                            trans.sa_session.flush()
                            data = new_data
                current_user_roles = trans.get_current_user_roles()
                if not trans.app.security_agent.can_access_dataset( current_user_roles, data.dataset ):
                    raise Exception( "User does not have permission to use a dataset (%s) provided for input." % data.id )
                return data
            if isinstance( input, DataToolParameter ):
                if isinstance( value, list ):
                    # If there are multiple inputs with the same name, they
                    # are stored as name1, name2, ...
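                    # For example, a multi-valued data parameter named 'input'
                    # with two selected datasets yields the keys 'input' (the
                    # first item, used as the metadata source), 'input1' and
                    # 'input2' in input_datasets, plus one key per declared
                    # conversion.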
                    for i, v in enumerate( value ):
                        processed_dataset = process_dataset( v )
                        if i == 0:
                            # Allow copying metadata to output, first item will be source.
                            input_datasets[ prefix + input.name ] = processed_dataset
                        input_datasets[ prefix + input.name + str( i + 1 ) ] = processed_dataset
                        conversions = []
                        for conversion_name, conversion_extensions, conversion_datatypes in input.conversions:
                            new_data = process_dataset( input_datasets[ prefix + input.name + str( i + 1 ) ], conversion_datatypes )
                            if not new_data or new_data.datatype.matches_any( conversion_datatypes ):
                                input_datasets[ prefix + conversion_name + str( i + 1 ) ] = new_data
                                conversions.append( ( conversion_name, new_data ) )
                            else:
                                raise Exception('A path for explicit datatype conversion has not been found: %s --/--> %s' % ( input_datasets[ prefix + input.name + str( i + 1 ) ].extension, conversion_extensions ) )
                        if parent:
                            parent[input.name][i] = input_datasets[ prefix + input.name + str( i + 1 ) ]
                            for conversion_name, conversion_data in conversions:
                                # allow explicit conversion to be stored in job_parameter table
                                parent[ conversion_name ][i] = conversion_data.id  # a more robust way to determine JSONable value is desired
                        else:
                            param_values[input.name][i] = input_datasets[ prefix + input.name + str( i + 1 ) ]
                            for conversion_name, conversion_data in conversions:
                                # allow explicit conversion to be stored in job_parameter table
                                param_values[ conversion_name ][i] = conversion_data.id  # a more robust way to determine JSONable value is desired
                else:
                    input_datasets[ prefix + input.name ] = process_dataset( value )
                    conversions = []
                    for conversion_name, conversion_extensions, conversion_datatypes in input.conversions:
                        new_data = process_dataset( input_datasets[ prefix + input.name ], conversion_datatypes )
                        if not new_data or new_data.datatype.matches_any( conversion_datatypes ):
                            input_datasets[ prefix + conversion_name ] = new_data
                            conversions.append( ( conversion_name, new_data ) )
                        else:
                            raise Exception( 'A path for explicit datatype conversion has not been found: %s --/--> %s' % ( input_datasets[ prefix + input.name ].extension, conversion_extensions ) )
                    target_dict = parent
                    if not target_dict:
                        target_dict = param_values
                    target_dict[ input.name ] = input_datasets[ prefix + input.name ]
                    for conversion_name, conversion_data in conversions:
                        # allow explicit conversion to be stored in job_parameter table
                        target_dict[ conversion_name ] = conversion_data.id  # a more robust way to determine JSONable value is desired
        tool.visit_inputs( param_values, visitor )
        return input_datasets

    def execute(self, tool, trans, incoming={}, return_job=False, set_output_hid=True, set_output_history=True, history=None, job_params=None, rerun_remap_job_id=None):
        """
        Executes a tool, creating job and tool outputs, associating them, and
        submitting the job to the job queue. If history is not specified, use
        trans.history as destination for tool's output datasets.
        """
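        # Typical invocation (assumed wiring, not shown in this file): Galaxy's
        # Tool.execute delegates here, roughly as
        #   job, out_data = self.tool_action.execute( self, trans, incoming=params, history=history )
        # with `incoming` holding the already converted and validated parameter values.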
        # Set history.
        if not history:
            history = tool.get_default_history_by_trans( trans, create=True )

        out_data = odict()
        # Collect any input datasets from the incoming parameters
        inp_data = self.collect_input_datasets( tool, incoming, trans )

        # Deal with input dataset names, 'dbkey' and types
        input_names = []
        input_ext = 'data'
        input_dbkey = incoming.get( "dbkey", "?" )
        for name, data in inp_data.items():
            if not data:
                data = NoneDataset( datatypes_registry=trans.app.datatypes_registry )
                continue

            # Convert LDDA to an HDA.
            if isinstance(data, LibraryDatasetDatasetAssociation):
                data = data.to_history_dataset_association( None )
                inp_data[name] = data

            else:  # HDA
                if data.hid:
                    input_names.append( 'data %s' % data.hid )
            input_ext = data.ext

            if data.dbkey not in [None, '?']:
                input_dbkey = data.dbkey

        # Collect chromInfo dataset and add as parameters to incoming
        db_datasets = {}
        db_dataset = trans.db_dataset_for( input_dbkey )
        if db_dataset:
            db_datasets[ "chromInfo" ] = db_dataset
            incoming[ "chromInfo" ] = db_dataset.file_name
        else:
            # -- Get chrom_info (len file) from either a custom or built-in build. --

            chrom_info = None
            if trans.user and ( 'dbkeys' in trans.user.preferences ) and ( input_dbkey in from_json_string( trans.user.preferences[ 'dbkeys' ] ) ):
                # Custom build.
                custom_build_dict = from_json_string( trans.user.preferences[ 'dbkeys' ] )[ input_dbkey ]
                # HACK: the attempt to get chrom_info below will trigger the
                # fasta-to-len converter if the dataset is not yet available,
                # which will in turn create a recursive loop when running the
                # fasta-to-len tool itself. So, use a hack in the second
                # condition below to avoid getting chrom_info when running the
                # fasta-to-len converter.
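                # The custom build entry is a small dict keyed by the dbkey; it
                # is expected to contain either a 'fasta' key (the id of an HDA
                # holding the build's FASTA, convertible to a .len file) or a
                # 'len' key (the id of an HDA that already is a .len file).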
                if 'fasta' in custom_build_dict and tool.id != 'CONVERTER_fasta_to_len':
                    # Build is defined by fasta; get len file, which is obtained from converting fasta.
                    build_fasta_dataset = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( custom_build_dict[ 'fasta' ] )
                    chrom_info = build_fasta_dataset.get_converted_dataset( trans, 'len' ).file_name
                elif 'len' in custom_build_dict:
                    # Build is defined by len file, so use it.
                    chrom_info = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( custom_build_dict[ 'len' ] ).file_name

            if not chrom_info:
                # Default to built-in build.
                chrom_info = os.path.join( trans.app.config.len_file_path, "%s.len" % input_dbkey )
            incoming[ "chromInfo" ] = os.path.abspath( chrom_info )
        inp_data.update( db_datasets )

        # Determine output dataset permission/roles list
        existing_datasets = [ inp for inp in inp_data.values() if inp ]
        if existing_datasets:
            output_permissions = trans.app.security_agent.guess_derived_permissions_for_datasets( existing_datasets )
        else:
            # No valid inputs, we will use history defaults
            output_permissions = trans.app.security_agent.history_get_default_permissions( history )

        # Build name for output datasets based on tool name and input names
        on_text = on_text_for_names( input_names )

        # Add the dbkey to the incoming parameters
        incoming[ "dbkey" ] = input_dbkey
        # wrapped params are used by change_format action and by output.label; only perform this wrapping once, as needed
        wrapped_params = WrappedParameters( trans, tool, incoming )
        # Keep track of parent / child relationships, we'll create all the
        # datasets first, then create the associations
        parent_to_child_pairs = []
        child_dataset_names = set()
        object_store_id = None
        for name, output in tool.outputs.items():
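            # Output filters come from the tool XML, e.g. (illustrative, not
            # taken from this file):
            #   <data name="out1" format="tabular">
            #       <filter>some_param == "yes"</filter>
            #   </data>
            # Each filter expression is eval'd against the incoming parameters;
            # a False result means the corresponding output is not created.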
            for filter in output.filters:
                try:
                    if not eval( filter.text.strip(), globals(), incoming ):
                        break  # do not create this dataset
                except Exception, e:
                    log.debug( 'Dataset output filter failed: %s' % e )
            else:  # all filters passed
                if output.parent:
                    parent_to_child_pairs.append( ( output.parent, name ) )
                    child_dataset_names.add( name )
                ## What is the following hack for? Need to document under what
                ## conditions the following can occur. (james@bx.psu.edu)
                # HACK: the output data has already been created;
                #       this happens e.g. as a result of the async controller
                if name in incoming:
                    dataid = incoming[name]
                    data = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( dataid )
                    assert data is not None
                    out_data[name] = data
                else:
                    # the type should match the input
                    ext = output.format
                    if ext == "input":
                        ext = input_ext
                    if output.format_source is not None and output.format_source in inp_data:
                        try:
                            input_dataset = inp_data[output.format_source]
                            input_extension = input_dataset.ext
                            ext = input_extension
                        except Exception, e:
                            pass

                    # process change_format tags
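                    # A change_format block in the tool XML looks roughly like
                    # (illustrative, not taken from this file):
                    #   <change_format>
                    #       <when input="out_format" value="bam" format="bam"/>
                    #   </change_format>
                    # i.e. each <when> compares a parameter value (or a dataset
                    # attribute via input_dataset/attribute) and, on a match,
                    # overrides the output extension chosen above.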
                    if output.change_format:
                        for change_elem in output.change_format:
                            for when_elem in change_elem.findall( 'when' ):
                                check = when_elem.get( 'input', None )
                                if check is not None:
                                    try:
                                        if '$' not in check:
                                            # allow a simple name or more complex specifications
                                            check = '${%s}' % check
                                        if str( fill_template( check, context=wrapped_params.params ) ) == when_elem.get( 'value', None ):
                                            ext = when_elem.get( 'format', ext )
                                    except:  # bad tag input value; possibly referencing a param within a different conditional when block or other nonexistent grouping construct
                                        continue
                                else:
                                    check = when_elem.get( 'input_dataset', None )
                                    if check is not None:
                                        check = inp_data.get( check, None )
                                        if check is not None:
                                            if str( getattr( check, when_elem.get( 'attribute' ) ) ) == when_elem.get( 'value', None ):
                                                ext = when_elem.get( 'format', ext )
                    data = trans.app.model.HistoryDatasetAssociation( extension=ext, create_dataset=True, sa_session=trans.sa_session )
                    if output.hidden:
                        data.visible = False
                    # Commit the dataset immediately so it gets a database-assigned unique id
                    trans.sa_session.add( data )
                    trans.sa_session.flush()
                    trans.app.security_agent.set_all_dataset_permissions( data.dataset, output_permissions )
                # Create an empty file immediately.  The first dataset will be
                # created in the "default" store, all others will be created in
                # the same store as the first.
                data.dataset.object_store_id = object_store_id
                try:
                    trans.app.object_store.create( data.dataset )
                except ObjectInvalid:
                    raise Exception('Unable to create output dataset: object store is full')
                object_store_id = data.dataset.object_store_id      # these will be the same thing after the first output
                # This may not be necessary with the new parent/child associations
                data.designation = name
                # Copy metadata from one of the inputs if requested.
                if output.metadata_source:
                    data.init_meta( copy_from=inp_data[output.metadata_source] )
                else:
                    data.init_meta()
                # Take dbkey from LAST input
                data.dbkey = str(input_dbkey)
                # Set state
                # FIXME: shouldn't this be NEW until the job runner changes it?
                data.state = data.states.QUEUED
                data.blurb = "queued"
                # Set output label
                data.name = self.get_output_name( output, data, tool, on_text, trans, incoming, history, wrapped_params.params, job_params )
                # Store output
                out_data[ name ] = data
                if output.actions:
                    # Apply pre-job tool-output-dataset actions; e.g. setting metadata, changing format
                    output_action_params = dict( out_data )
                    output_action_params.update( incoming )
                    output.actions.apply_action( data, output_action_params )
                # Store all changes to database
                trans.sa_session.flush()
        # Add all the top-level (non-child) datasets to the history unless otherwise specified
        for name in out_data.keys():
            if name not in child_dataset_names and name not in incoming:  # don't add children; or already existing datasets, i.e. async created
                data = out_data[ name ]
                if set_output_history:
                    history.add_dataset( data, set_hid=set_output_hid )
                trans.sa_session.add( data )
                trans.sa_session.flush()
        # Add all the children to their parents
        for parent_name, child_name in parent_to_child_pairs:
            parent_dataset = out_data[ parent_name ]
            child_dataset = out_data[ child_name ]
            parent_dataset.children.append( child_dataset )
        # Store data after custom code runs
        trans.sa_session.flush()
        # Create the job object
        job = trans.app.model.Job()
        galaxy_session = trans.get_galaxy_session()
        # If we're submitting from the API, there won't be a session.
        if type( galaxy_session ) == trans.model.GalaxySession:
            job.session_id = galaxy_session.id
        if trans.user is not None:
            job.user_id = trans.user.id
        job.history_id = history.id
        job.tool_id = tool.id
        try:
            # For backward compatibility, some tools may not have versions yet.
            job.tool_version = tool.version
        except:
            job.tool_version = "1.0.0"
        # FIXME: Don't need all of incoming here, just the defined parameters
        #        from the tool. We need to deal with tools that pass all post
        #        parameters to the command as a special case.
        for name, value in tool.params_to_strings( incoming, trans.app ).iteritems():
            job.add_parameter( name, value )
        current_user_roles = trans.get_current_user_roles()
        for name, dataset in inp_data.iteritems():
            if dataset:
                if not trans.app.security_agent.can_access_dataset( current_user_roles, dataset.dataset ):
                    raise Exception( "User does not have permission to use a dataset (%s) provided for input." % dataset.id )
                job.add_input_dataset( name, dataset )
            else:
                job.add_input_dataset( name, None )
        for name, dataset in out_data.iteritems():
            job.add_output_dataset( name, dataset )
        job.object_store_id = object_store_id
        if job_params:
            job.params = to_json_string( job_params )
        job.set_handler(tool.get_job_handler(job_params))
        trans.sa_session.add( job )
        # Now that we have a job id, we can remap any outputs if this is a rerun and the user chose to continue dependent jobs
        # This functionality requires tracking jobs in the database.
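        # In outline: find the previous job, check that it belongs to the same
        # tool and user/session, then repoint any dependent (paused) jobs that
        # consumed the old job's outputs at this job's new output datasets and
        # move them back to the NEW state so they can run.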
        if trans.app.config.track_jobs_in_database and rerun_remap_job_id is not None:
            try:
                old_job = trans.sa_session.query( trans.app.model.Job ).get(rerun_remap_job_id)
                assert old_job is not None, '(%s/%s): Old job id is invalid' % (rerun_remap_job_id, job.id)
                assert old_job.tool_id == job.tool_id, '(%s/%s): Old tool id (%s) does not match rerun tool id (%s)' % (old_job.id, job.id, old_job.tool_id, job.tool_id)
                if trans.user is not None:
                    assert old_job.user_id == trans.user.id, '(%s/%s): Old user id (%s) does not match rerun user id (%s)' % (old_job.id, job.id, old_job.user_id, trans.user.id)
                elif trans.user is None and type( galaxy_session ) == trans.model.GalaxySession:
                    assert old_job.session_id == galaxy_session.id, '(%s/%s): Old session id (%s) does not match rerun session id (%s)' % (old_job.id, job.id, old_job.session_id, galaxy_session.id)
                else:
                    raise Exception('(%s/%s): Remapping via the API is not (yet) supported' % (old_job.id, job.id))
                for jtod in old_job.output_datasets:
                    for (job_to_remap, jtid) in [(jtid.job, jtid) for jtid in jtod.dataset.dependent_jobs]:
                        if (trans.user is not None and job_to_remap.user_id == trans.user.id) or (trans.user is None and job_to_remap.session_id == galaxy_session.id):
                            if job_to_remap.state == job_to_remap.states.PAUSED:
                                job_to_remap.state = job_to_remap.states.NEW
                            for hda in [ dep_jtod.dataset for dep_jtod in job_to_remap.output_datasets ]:
                                if hda.state == hda.states.PAUSED:
                                    hda.state = hda.states.NEW
                                    hda.info = None
                            for p in job_to_remap.parameters:
                                if p.name == jtid.name and p.value == str(jtod.dataset.id):
                                    p.value = str(out_data[jtod.name].id)
                            jtid.dataset = out_data[jtod.name]
                            jtid.dataset.hid = jtod.dataset.hid
                            log.info('Job %s input HDA %s remapped to new HDA %s' % (job_to_remap.id, jtod.dataset.id, jtid.dataset.id))
                            trans.sa_session.add(job_to_remap)
                            trans.sa_session.add(jtid)
                    jtod.dataset.visible = False
                    trans.sa_session.add(jtod)
            except Exception, e:
                log.exception('Cannot remap rerun dependencies.')
        trans.sa_session.flush()
        # Some tools are not really executable, but jobs are still created for them ( for record keeping ).
        # Examples include tools that redirect to other applications ( epigraph ).  These special tools must
        # include something that can be retrieved from the params ( e.g., REDIRECT_URL ) to keep the job
        # from being queued.
        if 'REDIRECT_URL' in incoming:
            # Get the dataset - there should only be 1
            for name in inp_data.keys():
                dataset = inp_data[ name ]
            redirect_url = tool.parse_redirect_url( dataset, incoming )
            # GALAXY_URL should be included in the tool params to enable the external application
            # to send back to the current Galaxy instance
            GALAXY_URL = incoming.get( 'GALAXY_URL', None )
            assert GALAXY_URL is not None, "GALAXY_URL parameter missing in tool config."
            redirect_url += "&GALAXY_URL=%s" % GALAXY_URL
            # Job should not be queued, so set state to ok
            job.state = trans.app.model.Job.states.OK
            job.info = "Redirected to: %s" % redirect_url
            trans.sa_session.add( job )
            trans.sa_session.flush()
            trans.response.send_redirect( url_for( controller='tool_runner', action='redirect', redirect_url=redirect_url ) )
        else:
            # Put the job in the queue if tracking in memory
            trans.app.job_queue.put( job.id, job.tool_id )
            trans.log_event( "Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id )
            return job, out_data

    def get_output_name( self, output, dataset, tool, on_text, trans, incoming, history, params, job_params ):
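        # output.label is a template string from the tool XML; a typical value
        # (illustrative, not taken from this file) is "${tool.name} on ${on_string}",
        # filled in below with the wrapped parameters plus 'tool' and 'on_string'.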
        if output.label:
            params['tool'] = tool
            params['on_string'] = on_text
            return fill_template( output.label, context=params )
        else:
            return self._get_default_data_name( dataset, tool, on_text=on_text, trans=trans, incoming=incoming, history=history, params=params, job_params=job_params )

    def _get_default_data_name( self, dataset, tool, on_text=None, trans=None, incoming=None, history=None, params=None, job_params=None, **kwd ):
        name = tool.name
        if on_text:
            name += ( " on " + on_text )
        return name


def on_text_for_names( input_names ):
    # Build name for output datasets based on tool name and input names
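    # e.g. ['data 1'] -> 'data 1'; ['data 1', 'data 2'] -> 'data 1 and data 2';
    # four or more names -> 'data 1, data 2, and others'.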
    if len( input_names ) == 1:
        on_text = input_names[0]
    elif len( input_names ) == 2:
        on_text = '%s and %s' % tuple(input_names[0:2])
    elif len( input_names ) == 3:
        on_text = '%s, %s, and %s' % tuple(input_names[0:3])
    elif len( input_names ) > 3:
        on_text = '%s, %s, and others' % tuple(input_names[0:2])
    else:
        on_text = ""
    return on_text
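

# A minimal sketch (illustrative only, not part of the original module) of a
# custom action that reuses DefaultToolAction and overrides only the default
# output name; the class name and the "(filtered)" suffix are assumptions.
class RenamingToolAction( DefaultToolAction ):

    def _get_default_data_name( self, dataset, tool, on_text=None, trans=None, incoming=None, history=None, params=None, job_params=None, **kwd ):
        # Start from the stock "<tool name> on <inputs>" name, then tag it.
        name = super( RenamingToolAction, self )._get_default_data_name( dataset, tool, on_text=on_text, trans=trans, incoming=incoming, history=history, params=params, job_params=job_params, **kwd )
        return "%s (filtered)" % name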