microbial_import_code.py - This is a post-processing function for the “microbial_data” tool in Galaxy.

/tools/data_source/microbial_import_code.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 154 lines · 137 code · 7 blank · 10 comment · 24 complexity · b23f4367caf4c357fbd2bd548c7e11ba MD5 · raw file


def load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' ):
    """
    Parse ``microbial_data.loc`` into a nested dictionary grouped by kingdom.

    The file ``<GALAXY_DATA_INDEX_DIR>/microbial_data.loc`` is read line by
    line.  Blank lines and lines starting with ``#`` are ignored.  Each
    remaining line is split on ``sep``; its first field (case-insensitive)
    selects the record type:

    * ``ORG``  - org_num, name, kingdom, group, chromosomes, info_url, link_site
    * ``CHR``  - org_num, chr_acc, name, length, gi, gb, info_url
    * ``DATA`` - uid, org_num, chr_acc, feature, filetype, path

    Records with too few fields, records of any other type, and ``DATA``
    records that refer to a chromosome not yet declared by a ``CHR`` record
    are skipped.  Extra trailing fields are ignored.

    Returns a dictionary shaped like::

        { kingdom: { org_num: { 'name': ..., 'kingdom': ..., 'group': ...,
                                'chromosomes': ..., 'info_url': ...,
                                'link_site': ...,
                                'chrs': { chr_acc: { 'name': ..., 'length': ...,
                                                     'gi': ..., 'gb': ...,
                                                     'info_url': ...,
                                                     'data': { uid: { 'filetype': ...,
                                                                      'path': ...,
                                                                      'feature': ... } } } } } } }

    The ``'data'`` key is only present on chromosomes that have at least one
    ``DATA`` record.  Organisms that never received a valid ``ORG`` record
    (and therefore have no kingdom) are left out of the result.

    Raises whatever ``open()`` raises when the file cannot be read.
    """
    # FIXME: this function is duplicated in the DynamicOptions class.  It is used here only to
    # set data.name in exec_after_process().
    org_field_count = 7
    chr_field_count = 7
    data_field_count = 6
    orgs = {}

    def get_org( org_num ):
        # Organism entries are created on first mention, whichever record type mentions them.
        return orgs.setdefault( org_num, { 'chrs': {} } )

    filename = "%s/microbial_data.loc" % GALAXY_DATA_INDEX_DIR
    with open( filename ) as loc_file:
        for line in loc_file:
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            fields = line.split( sep )
            info_type = fields[0].upper()
            values = fields[1:]
            if info_type == "ORG":
                #ORG     12521   Clostridium perfringens SM101   bacteria        Firmicutes      CP000312,CP000313,CP000314,CP000315     http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=12521
                if len( values ) < org_field_count:
                    continue
                org_num, name, kingdom, group, chromosomes, info_url, link_site = values[ :org_field_count ]
                org = get_org( org_num )
                org[ 'name' ] = name
                org[ 'kingdom' ] = kingdom
                org[ 'group' ] = group
                org[ 'chromosomes' ] = chromosomes
                org[ 'info_url' ] = info_url
                org[ 'link_site' ] = link_site
            elif info_type == "CHR":
                #CHR     12521   CP000315        Clostridium perfringens phage phiSM101, complete genome 38092   110684521       CP000315.1
                if len( values ) < chr_field_count:
                    continue
                org_num, chr_acc, name, length, gi, gb, info_url = values[ :chr_field_count ]
                # A repeated CHR record replaces the earlier chromosome entry wholesale.
                get_org( org_num )[ 'chrs' ][ chr_acc ] = {
                    'name': name,
                    'length': length,
                    'gi': gi,
                    'gb': gb,
                    'info_url': info_url,
                }
            elif info_type == "DATA":
                #DATA    12521_12521_CDS 12521   CP000315        CDS     bed     /home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed
                if len( values ) < data_field_count:
                    continue
                uid, org_num, chr_acc, feature, filetype, path = values[ :data_field_count ]
                chromosome = get_org( org_num )[ 'chrs' ].get( chr_acc )
                if chromosome is None:
                    # DATA for a chromosome that has not been declared yet: skip the record.
                    continue
                chromosome.setdefault( 'data', {} )[ uid ] = {
                    'filetype': filetype,
                    'path': path,
                    'feature': feature,
                }

    microbe_info = {}
    for org_num, org in orgs.items():
        kingdom = org.get( 'kingdom' )
        if kingdom is None:
            # Only CHR/DATA records were seen for this organism; without an ORG
            # record there is no kingdom to file it under.
            continue
        microbe_info.setdefault( kingdom, {} )[ org_num ] = org
    return microbe_info

#post processing, set build for data and add additional data to history
from galaxy import datatypes, config, tools
from shutil import copyfile

def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
    """
    Post-process the tool run: label the primary output and add extra datasets.

    ``stdout`` is scanned line by line; each line is split on tabs and the
    first field selects the action:

    * ``#File1  <description>  <chr>  <dbkey>  <file_type>`` - the first
      dataset in ``out_data`` is resized, renamed, given ``dbkey``, switched
      to ``file_type`` through the datatypes registry and flushed.
    * ``#NewFile  <description>  <chr>  <dbkey>  <filepath>  <file_type>`` -
      a new HistoryDatasetAssociation is created, given the permissions of
      the first output dataset, added to that dataset's history, and
      ``filepath`` is copied into it.  If the copy fails the new dataset is
      put in the ERROR state with an explanatory ``info`` message.

    Display names are built from the table returned by
    ``load_microbial_data()`` using the ``kingdom`` and ``org`` entries of
    ``param_dict``; a missing entry in that table raises ``KeyError``.

    ``inp_data``, ``tool`` and ``stderr`` are accepted but not used.
    Nothing is returned.
    """
    # list() keeps the indexing valid whether items() returns a list or a view.
    base_dataset = list( out_data.items() )[0][1]
    history = base_dataset.history
    if history is None:
        print( "unknown history!" )
        return
    kingdom = param_dict.get( 'kingdom', None )
    org = param_dict.get( 'org', None )

    if not (kingdom or org):
        # NOTE(review): processing continues after this message; any "#File1" /
        # "#NewFile" line will then fail on the microbe_info lookup - confirm
        # whether an early return is wanted here.
        print( "Parameters are not available." )
    #workflow passes galaxy.tools.parameters.basic.UnvalidatedValue instead of values
    if isinstance( kingdom, tools.parameters.basic.UnvalidatedValue ):
        kingdom = kingdom.value
    if isinstance( org, tools.parameters.basic.UnvalidatedValue ):
        org = org.value

    GALAXY_DATA_INDEX_DIR = app.config.tool_data_path
    microbe_info = load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' )

    def name_suffix( description, chr_acc ):
        # Builds " (<feature> for <organism name>:<chromosome>)" from the loaded table.
        feature = microbe_info[kingdom][org]['chrs'][chr_acc]['data'][description]['feature']
        return " (" + feature + " for " + microbe_info[kingdom][org]['name'] + ":" + chr_acc + ")"

    # Name of the primary output before relabelling; stays "" until a "#File1" line is seen.
    basic_name = ""
    for line in stdout.split("\n"):
        fields = line.split("\t")
        if fields[0] == "#File1":
            description = fields[1]
            chr_acc = fields[2]
            dbkey = fields[3]
            file_type = fields[4]
            data = base_dataset
            data.set_size()
            basic_name = data.name
            data.name = data.name + name_suffix( description, chr_acc )
            data.dbkey = dbkey
            data.info = data.name
            data = app.datatypes_registry.change_datatype( data, file_type )
            data.init_meta()
            data.set_peek()
            app.model.context.add( data )
            app.model.context.flush()
        elif fields[0] == "#NewFile":
            description = fields[1]
            chr_acc = fields[2]
            dbkey = fields[3]
            filepath = fields[4]
            file_type = fields[5]
            newdata = app.model.HistoryDatasetAssociation( create_dataset = True, sa_session = app.model.context ) #This import should become a library
            newdata.set_size()
            newdata.extension = file_type
            newdata.name = basic_name + name_suffix( description, chr_acc )
            app.model.context.add( newdata )
            app.model.context.flush()
            app.security_agent.copy_dataset_permissions( base_dataset.dataset, newdata.dataset )
            history.add_dataset( newdata )
            app.model.context.add( history )
            app.model.context.flush()
            try:
                copyfile(filepath,newdata.file_name)
                newdata.info = newdata.name
                newdata.state = newdata.states.OK
            except Exception:
                # Best effort: any copy failure marks the dataset as errored instead
                # of aborting the hook.  KeyboardInterrupt/SystemExit still propagate.
                newdata.info = "The requested file is missing from the system."
                newdata.state = newdata.states.ERROR
            newdata.dbkey = dbkey
            newdata.init_meta()
            newdata.set_peek()
            app.model.context.flush()

Summary ✨

This is a post-processing function for the “microbial_data” tool in Galaxy. It takes the output of the tool and adds additional information to the history, such as the name of the organism and chromosome that the data corresponds to. The function uses the load_microbial_data function to load the microbial data index file and retrieve the necessary information about the organism and chromosome for each output dataset.

Alerts (9)

'def' Ensure functions have docstrings for documentation
1 85
'open(' Use 'with open()' to ensure Files are properly closed
8
'try:' Ensure try blocks have corresponding except or finally blocks
13
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
72 147
'== None' Use 'is' for None comparisons (e.g., x is None)
88
'isinstance(' Overuse may indicate design issues; consider polymorphism
99 101