
/lib/galaxy/tools/actions/__init__.py

https://bitbucket.org/cistrome/cistrome-harvard/
import os

from galaxy.exceptions import ObjectInvalid
from galaxy.model import LibraryDatasetDatasetAssociation
from galaxy.tools.parameters import DataToolParameter
from galaxy.tools.parameters.wrapped import WrappedParameters
from galaxy.util.json import from_json_string
from galaxy.util.json import to_json_string
from galaxy.util.none_like import NoneDataset
from galaxy.util.odict import odict
from galaxy.util.template import fill_template
from galaxy.web import url_for

import logging
log = logging.getLogger( __name__ )


class ToolAction( object ):
    """
    The actions to be taken when a tool is run (after parameters have
    been converted and validated).
    """
    def execute( self, tool, trans, incoming={}, set_output_hid=True ):
        raise TypeError("Abstract method")


class DefaultToolAction( object ):
    """Default tool action is to run an external command"""

    def collect_input_datasets( self, tool, param_values, trans ):
        """
        Collect any dataset inputs from incoming. Returns a mapping from
        parameter name to Dataset instance for each tool parameter that is
        of the DataToolParameter type.
        """
        input_datasets = dict()

        def visitor( prefix, input, value, parent=None ):

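            # process_dataset (below) handles implicit format conversion: if a
            # supplied dataset does not match one of the parameter's accepted
            # formats, an already converted dataset is reused when available,
            # otherwise the datatype converter is run and its output is used.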
            def process_dataset( data, formats=None ):
                if not data:
                    return data
                if formats is None:
                    formats = input.formats
                if not data.datatype.matches_any( formats ):
                    # Need to refresh in case this conversion just took place, i.e. an input above in the tool performed the same conversion
                    trans.sa_session.refresh( data )
                    target_ext, converted_dataset = data.find_conversion_destination( formats )
                    if target_ext:
                        if converted_dataset:
                            data = converted_dataset
                        else:
                            # run converter here
                            new_data = data.datatype.convert_dataset( trans, data, target_ext, return_output=True, visible=False ).values()[0]
                            new_data.hid = data.hid
                            new_data.name = data.name
                            trans.sa_session.add( new_data )
                            assoc = trans.app.model.ImplicitlyConvertedDatasetAssociation( parent=data, file_type=target_ext, dataset=new_data, metadata_safe=False )
                            trans.sa_session.add( assoc )
                            trans.sa_session.flush()
                            data = new_data
                current_user_roles = trans.get_current_user_roles()
                if not trans.app.security_agent.can_access_dataset( current_user_roles, data.dataset ):
                    raise Exception( "User does not have permission to use a dataset (%s) provided for input." % data.id )
                return data
            if isinstance( input, DataToolParameter ):
                if isinstance( value, list ):
                    # If there are multiple inputs with the same name, they
                    # are stored as name1, name2, ...
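                    # For example, a multi-valued data parameter named 'input'
                    # with two selected datasets yields the keys 'input' (the
                    # first item, used as the metadata source), 'input1' and
                    # 'input2' in input_datasets, plus one key per declared
                    # conversion.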
                    for i, v in enumerate( value ):
                        processed_dataset = process_dataset( v )
                        if i == 0:
                            # Allow copying metadata to output, first item will be source.
                            input_datasets[ prefix + input.name ] = processed_dataset
                        input_datasets[ prefix + input.name + str( i + 1 ) ] = processed_dataset
                        conversions = []
                        for conversion_name, conversion_extensions, conversion_datatypes in input.conversions:
                            new_data = process_dataset( input_datasets[ prefix + input.name + str( i + 1 ) ], conversion_datatypes )
                            if not new_data or new_data.datatype.matches_any( conversion_datatypes ):
                                input_datasets[ prefix + conversion_name + str( i + 1 ) ] = new_data
                                conversions.append( ( conversion_name, new_data ) )
                            else:
                                raise Exception('A path for explicit datatype conversion has not been found: %s --/--> %s' % ( input_datasets[ prefix + input.name + str( i + 1 ) ].extension, conversion_extensions ) )
                        if parent:
                            parent[input.name][i] = input_datasets[ prefix + input.name + str( i + 1 ) ]
                            for conversion_name, conversion_data in conversions:
                                # allow explicit conversion to be stored in job_parameter table
                                parent[ conversion_name ][i] = conversion_data.id  # a more robust way to determine JSONable value is desired
                        else:
                            param_values[input.name][i] = input_datasets[ prefix + input.name + str( i + 1 ) ]
                            for conversion_name, conversion_data in conversions:
                                # allow explicit conversion to be stored in job_parameter table
                                param_values[ conversion_name ][i] = conversion_data.id  # a more robust way to determine JSONable value is desired
                else:
                    input_datasets[ prefix + input.name ] = process_dataset( value )
                    conversions = []
                    for conversion_name, conversion_extensions, conversion_datatypes in input.conversions:
                        new_data = process_dataset( input_datasets[ prefix + input.name ], conversion_datatypes )
                        if not new_data or new_data.datatype.matches_any( conversion_datatypes ):
                            input_datasets[ prefix + conversion_name ] = new_data
                            conversions.append( ( conversion_name, new_data ) )
                        else:
                            raise Exception( 'A path for explicit datatype conversion has not been found: %s --/--> %s' % ( input_datasets[ prefix + input.name ].extension, conversion_extensions ) )
                    target_dict = parent
                    if not target_dict:
                        target_dict = param_values
                    target_dict[ input.name ] = input_datasets[ prefix + input.name ]
                    for conversion_name, conversion_data in conversions:
                        # allow explicit conversion to be stored in job_parameter table
                        target_dict[ conversion_name ] = conversion_data.id  # a more robust way to determine JSONable value is desired
        tool.visit_inputs( param_values, visitor )
        return input_datasets

    def execute(self, tool, trans, incoming={}, return_job=False, set_output_hid=True, set_output_history=True, history=None, job_params=None, rerun_remap_job_id=None):
        """
        Executes a tool, creating job and tool outputs, associating them, and
        submitting the job to the job queue. If history is not specified, use
        trans.history as destination for tool's output datasets.
        """
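        # Typical invocation (assumed wiring, not shown in this file): Galaxy's
        # Tool.execute delegates here, roughly as
        #   job, out_data = self.tool_action.execute( self, trans, incoming=params, history=history )
        # with `incoming` holding the already converted and validated parameter values.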
        # Set history.
        if not history:
            history = tool.get_default_history_by_trans( trans, create=True )

        out_data = odict()
        # Collect any input datasets from the incoming parameters
        inp_data = self.collect_input_datasets( tool, incoming, trans )

        # Deal with input dataset names, 'dbkey' and types
        input_names = []
        input_ext = 'data'
        input_dbkey = incoming.get( "dbkey", "?" )
        for name, data in inp_data.items():
            if not data:
                data = NoneDataset( datatypes_registry=trans.app.datatypes_registry )
                continue

            # Convert LDDA to an HDA.
            if isinstance(data, LibraryDatasetDatasetAssociation):
                data = data.to_history_dataset_association( None )
                inp_data[name] = data

            else:  # HDA
                if data.hid:
                    input_names.append( 'data %s' % data.hid )
            input_ext = data.ext

            if data.dbkey not in [None, '?']:
                input_dbkey = data.dbkey

        # Collect chromInfo dataset and add as parameters to incoming
        db_datasets = {}
        db_dataset = trans.db_dataset_for( input_dbkey )
        if db_dataset:
            db_datasets[ "chromInfo" ] = db_dataset
            incoming[ "chromInfo" ] = db_dataset.file_name
        else:
            # -- Get chrom_info (len file) from either a custom or built-in build. --

            chrom_info = None
            if trans.user and ( 'dbkeys' in trans.user.preferences ) and ( input_dbkey in from_json_string( trans.user.preferences[ 'dbkeys' ] ) ):
                # Custom build.
                custom_build_dict = from_json_string( trans.user.preferences[ 'dbkeys' ] )[ input_dbkey ]
                # HACK: the attempt to get chrom_info below will trigger the
                # fasta-to-len converter if the dataset is not yet available,
                # which will in turn create a recursive loop when running the
                # fasta-to-len tool itself. So, use a hack in the second
                # condition below to avoid getting chrom_info when running the
                # fasta-to-len converter.
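                # The custom build entry is a small dict keyed by the dbkey; it
                # is expected to contain either a 'fasta' key (the id of an HDA
                # holding the build's FASTA, convertible to a .len file) or a
                # 'len' key (the id of an HDA that already is a .len file).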
                if 'fasta' in custom_build_dict and tool.id != 'CONVERTER_fasta_to_len':
                    # Build is defined by fasta; get len file, which is obtained from converting fasta.
                    build_fasta_dataset = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( custom_build_dict[ 'fasta' ] )
                    chrom_info = build_fasta_dataset.get_converted_dataset( trans, 'len' ).file_name
                elif 'len' in custom_build_dict:
                    # Build is defined by len file, so use it.
                    chrom_info = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( custom_build_dict[ 'len' ] ).file_name

            if not chrom_info:
                # Default to built-in build.
                chrom_info = os.path.join( trans.app.config.len_file_path, "%s.len" % input_dbkey )
            incoming[ "chromInfo" ] = os.path.abspath( chrom_info )
        inp_data.update( db_datasets )

        # Determine output dataset permission/roles list
        existing_datasets = [ inp for inp in inp_data.values() if inp ]
        if existing_datasets:
            output_permissions = trans.app.security_agent.guess_derived_permissions_for_datasets( existing_datasets )
        else:
            # No valid inputs, we will use history defaults
            output_permissions = trans.app.security_agent.history_get_default_permissions( history )

        # Build name for output datasets based on tool name and input names
        on_text = on_text_for_names( input_names )

        # Add the dbkey to the incoming parameters
        incoming[ "dbkey" ] = input_dbkey
        # wrapped params are used by change_format action and by output.label; only perform this wrapping once, as needed
        wrapped_params = WrappedParameters( trans, tool, incoming )
        # Keep track of parent / child relationships, we'll create all the
        # datasets first, then create the associations
        parent_to_child_pairs = []
        child_dataset_names = set()
        object_store_id = None
        for name, output in tool.outputs.items():
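            # Output filters come from the tool XML, e.g. (illustrative, not
            # taken from this file):
            #   <data name="out1" format="tabular">
            #       <filter>some_param == "yes"</filter>
            #   </data>
            # Each filter expression is eval'd against the incoming parameters;
            # a False result means the corresponding output is not created.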
            for filter in output.filters:
                try:
                    if not eval( filter.text.strip(), globals(), incoming ):
                        break  # do not create this dataset
                except Exception, e:
                    log.debug( 'Dataset output filter failed: %s' % e )
            else:  # all filters passed
                if output.parent:
                    parent_to_child_pairs.append( ( output.parent, name ) )
                    child_dataset_names.add( name )
                ## What is the following hack for? Need to document under what
                ## conditions the following can occur. (james@bx.psu.edu)
                # HACK: the output data has already been created;
                #       this happens e.g. as a result of the async controller
                if name in incoming:
                    dataid = incoming[name]
                    data = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( dataid )
                    assert data is not None
                    out_data[name] = data
                else:
                    # the type should match the input
                    ext = output.format
                    if ext == "input":
                        ext = input_ext
                    if output.format_source is not None and output.format_source in inp_data:
                        try:
                            input_dataset = inp_data[output.format_source]
                            input_extension = input_dataset.ext
                            ext = input_extension
                        except Exception, e:
                            pass

                    # process change_format tags
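                    # A change_format block in the tool XML looks roughly like
                    # (illustrative, not taken from this file):
                    #   <change_format>
                    #       <when input="out_format" value="bam" format="bam"/>
                    #   </change_format>
                    # i.e. each <when> compares a parameter value (or a dataset
                    # attribute via input_dataset/attribute) and, on a match,
                    # overrides the output extension chosen above.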
                    if output.change_format:
                        for change_elem in output.change_format:
                            for when_elem in change_elem.findall( 'when' ):
                                check = when_elem.get( 'input', None )
                                if check is not None:
                                    try:
                                        if '$' not in check:
                                            # allow a simple name or more complex specifications
                                            check = '${%s}' % check
                                        if str( fill_template( check, context=wrapped_params.params ) ) == when_elem.get( 'value', None ):
                                            ext = when_elem.get( 'format', ext )
                                    except:  # bad tag input value; possibly referencing a param within a different conditional when block or other nonexistent grouping construct
                                        continue
                                else:
                                    check = when_elem.get( 'input_dataset', None )
                                    if check is not None:
                                        check = inp_data.get( check, None )
                                        if check is not None:
                                            if str( getattr( check, when_elem.get( 'attribute' ) ) ) == when_elem.get( 'value', None ):
                                                ext = when_elem.get( 'format', ext )
                    data = trans.app.model.HistoryDatasetAssociation( extension=ext, create_dataset=True, sa_session=trans.sa_session )
                    if output.hidden:
                        data.visible = False
                    # Commit the dataset immediately so it gets a database-assigned unique id
                    trans.sa_session.add( data )
                    trans.sa_session.flush()
                    trans.app.security_agent.set_all_dataset_permissions( data.dataset, output_permissions )
                # Create an empty file immediately.  The first dataset will be
                # created in the "default" store, all others will be created in
                # the same store as the first.
                data.dataset.object_store_id = object_store_id
                try:
                    trans.app.object_store.create( data.dataset )
                except ObjectInvalid:
                    raise Exception('Unable to create output dataset: object store is full')
                object_store_id = data.dataset.object_store_id      # these will be the same thing after the first output
                # This may not be necessary with the new parent/child associations
                data.designation = name
                # Copy metadata from one of the inputs if requested.
                if output.metadata_source:
                    data.init_meta( copy_from=inp_data[output.metadata_source] )
                else:
                    data.init_meta()
                # Take dbkey from LAST input
                data.dbkey = str(input_dbkey)
                # Set state
                # FIXME: shouldn't this be NEW until the job runner changes it?
                data.state = data.states.QUEUED
                data.blurb = "queued"
                # Set output label
                data.name = self.get_output_name( output, data, tool, on_text, trans, incoming, history, wrapped_params.params, job_params )
                # Store output
                out_data[ name ] = data
                if output.actions:
                    # Apply pre-job tool-output-dataset actions; e.g. setting metadata, changing format
                    output_action_params = dict( out_data )
                    output_action_params.update( incoming )
                    output.actions.apply_action( data, output_action_params )
                # Store all changes to database
                trans.sa_session.flush()
        # Add all the top-level (non-child) datasets to the history unless otherwise specified
        for name in out_data.keys():
            if name not in child_dataset_names and name not in incoming:  # don't add children; or already existing datasets, i.e. async created
                data = out_data[ name ]
                if set_output_history:
                    history.add_dataset( data, set_hid=set_output_hid )
                trans.sa_session.add( data )
                trans.sa_session.flush()
        # Add all the children to their parents
        for parent_name, child_name in parent_to_child_pairs:
            parent_dataset = out_data[ parent_name ]
            child_dataset = out_data[ child_name ]
            parent_dataset.children.append( child_dataset )
        # Store data after custom code runs
        trans.sa_session.flush()
        # Create the job object
        job = trans.app.model.Job()
        galaxy_session = trans.get_galaxy_session()
        # If we're submitting from the API, there won't be a session.
        if type( galaxy_session ) == trans.model.GalaxySession:
            job.session_id = galaxy_session.id
        if trans.user is not None:
            job.user_id = trans.user.id
        job.history_id = history.id
        job.tool_id = tool.id
        try:
            # For backward compatibility, some tools may not have versions yet.
            job.tool_version = tool.version
        except:
            job.tool_version = "1.0.0"
        # FIXME: Don't need all of incoming here, just the defined parameters
        #        from the tool. We need to deal with tools that pass all post
        #        parameters to the command as a special case.
        for name, value in tool.params_to_strings( incoming, trans.app ).iteritems():
            job.add_parameter( name, value )
        current_user_roles = trans.get_current_user_roles()
        for name, dataset in inp_data.iteritems():
            if dataset:
                if not trans.app.security_agent.can_access_dataset( current_user_roles, dataset.dataset ):
                    raise Exception( "User does not have permission to use a dataset (%s) provided for input." % dataset.id )
                job.add_input_dataset( name, dataset )
            else:
                job.add_input_dataset( name, None )
        for name, dataset in out_data.iteritems():
            job.add_output_dataset( name, dataset )
        job.object_store_id = object_store_id
        if job_params:
            job.params = to_json_string( job_params )
        job.set_handler(tool.get_job_handler(job_params))
        trans.sa_session.add( job )
        # Now that we have a job id, we can remap any outputs if this is a rerun and the user chose to continue dependent jobs
        # This functionality requires tracking jobs in the database.
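        # In outline: find the previous job, check that it belongs to the same
        # tool and user/session, then repoint any dependent (paused) jobs that
        # consumed the old job's outputs at this job's new output datasets and
        # move them back to the NEW state so they can run.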
        if trans.app.config.track_jobs_in_database and rerun_remap_job_id is not None:
            try:
                old_job = trans.sa_session.query( trans.app.model.Job ).get(rerun_remap_job_id)
                assert old_job is not None, '(%s/%s): Old job id is invalid' % (rerun_remap_job_id, job.id)
                assert old_job.tool_id == job.tool_id, '(%s/%s): Old tool id (%s) does not match rerun tool id (%s)' % (old_job.id, job.id, old_job.tool_id, job.tool_id)
                if trans.user is not None:
                    assert old_job.user_id == trans.user.id, '(%s/%s): Old user id (%s) does not match rerun user id (%s)' % (old_job.id, job.id, old_job.user_id, trans.user.id)
                elif trans.user is None and type( galaxy_session ) == trans.model.GalaxySession:
                    assert old_job.session_id == galaxy_session.id, '(%s/%s): Old session id (%s) does not match rerun session id (%s)' % (old_job.id, job.id, old_job.session_id, galaxy_session.id)
                else:
                    raise Exception('(%s/%s): Remapping via the API is not (yet) supported' % (old_job.id, job.id))
                for jtod in old_job.output_datasets:
                    for (job_to_remap, jtid) in [(jtid.job, jtid) for jtid in jtod.dataset.dependent_jobs]:
                        if (trans.user is not None and job_to_remap.user_id == trans.user.id) or (trans.user is None and job_to_remap.session_id == galaxy_session.id):
                            if job_to_remap.state == job_to_remap.states.PAUSED:
                                job_to_remap.state = job_to_remap.states.NEW
                            for hda in [ dep_jtod.dataset for dep_jtod in job_to_remap.output_datasets ]:
                                if hda.state == hda.states.PAUSED:
                                    hda.state = hda.states.NEW
                                    hda.info = None
                            for p in job_to_remap.parameters:
                                if p.name == jtid.name and p.value == str(jtod.dataset.id):
                                    p.value = str(out_data[jtod.name].id)
                            jtid.dataset = out_data[jtod.name]
                            jtid.dataset.hid = jtod.dataset.hid
                            log.info('Job %s input HDA %s remapped to new HDA %s' % (job_to_remap.id, jtod.dataset.id, jtid.dataset.id))
                            trans.sa_session.add(job_to_remap)
                            trans.sa_session.add(jtid)
                    jtod.dataset.visible = False
                    trans.sa_session.add(jtod)
            except Exception, e:
                log.exception('Cannot remap rerun dependencies.')
        trans.sa_session.flush()
        # Some tools are not really executable, but jobs are still created for them ( for record keeping ).
        # Examples include tools that redirect to other applications ( epigraph ).  These special tools must
        # include something that can be retrieved from the params ( e.g., REDIRECT_URL ) to keep the job
        # from being queued.
        if 'REDIRECT_URL' in incoming:
            # Get the dataset - there should only be 1
            for name in inp_data.keys():
                dataset = inp_data[ name ]
            redirect_url = tool.parse_redirect_url( dataset, incoming )
            # GALAXY_URL should be included in the tool params to enable the external application
            # to send back to the current Galaxy instance
            GALAXY_URL = incoming.get( 'GALAXY_URL', None )
            assert GALAXY_URL is not None, "GALAXY_URL parameter missing in tool config."
            redirect_url += "&GALAXY_URL=%s" % GALAXY_URL
            # Job should not be queued, so set state to ok
            job.state = trans.app.model.Job.states.OK
            job.info = "Redirected to: %s" % redirect_url
            trans.sa_session.add( job )
            trans.sa_session.flush()
            trans.response.send_redirect( url_for( controller='tool_runner', action='redirect', redirect_url=redirect_url ) )
        else:
            # Put the job in the queue if tracking in memory
            trans.app.job_queue.put( job.id, job.tool_id )
            trans.log_event( "Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id )
            return job, out_data

    def get_output_name( self, output, dataset, tool, on_text, trans, incoming, history, params, job_params ):
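        # output.label is a template string from the tool XML; a typical value
        # (illustrative, not taken from this file) is "${tool.name} on ${on_string}",
        # filled in below with the wrapped parameters plus 'tool' and 'on_string'.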
        if output.label:
            params['tool'] = tool
            params['on_string'] = on_text
            return fill_template( output.label, context=params )
        else:
            return self._get_default_data_name( dataset, tool, on_text=on_text, trans=trans, incoming=incoming, history=history, params=params, job_params=job_params )

    def _get_default_data_name( self, dataset, tool, on_text=None, trans=None, incoming=None, history=None, params=None, job_params=None, **kwd ):
        name = tool.name
        if on_text:
            name += ( " on " + on_text )
        return name


def on_text_for_names( input_names ):
    # Build name for output datasets based on tool name and input names
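    # e.g. ['data 1'] -> 'data 1'; ['data 1', 'data 2'] -> 'data 1 and data 2';
    # four or more names -> 'data 1, data 2, and others'.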
    if len( input_names ) == 1:
        on_text = input_names[0]
    elif len( input_names ) == 2:
        on_text = '%s and %s' % tuple(input_names[0:2])
    elif len( input_names ) == 3:
        on_text = '%s, %s, and %s' % tuple(input_names[0:3])
    elif len( input_names ) > 3:
        on_text = '%s, %s, and others' % tuple(input_names[0:2])
    else:
        on_text = ""
    return on_text
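

# A minimal sketch (illustrative only, not part of the original module) of a
# custom action that reuses DefaultToolAction and overrides only the default
# output name; the class name and the "(filtered)" suffix are assumptions.
class RenamingToolAction( DefaultToolAction ):

    def _get_default_data_name( self, dataset, tool, on_text=None, trans=None, incoming=None, history=None, params=None, job_params=None, **kwd ):
        # Start from the stock "<tool name> on <inputs>" name, then tag it.
        name = super( RenamingToolAction, self )._get_default_data_name( dataset, tool, on_text=on_text, trans=trans, incoming=incoming, history=history, params=params, job_params=job_params, **kwd )
        return "%s (filtered)" % name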