ucsc_filter.py - This is a custom post-processing script for the UCSC Genome Browser tool in Galaxy.

/tools/data_source/ucsc_filter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 68 lines · 66 code · 1 blank · 1 comment · 0 complexity · 7b6a192f88566f421b41d5c7fb21401f MD5 · raw file


# runs after the job (and after the default post-filter)
from galaxy import datatypes, jobs

def validate(incoming):
    """Validation hook for the incoming parameters.

    No checks are performed: `incoming` is accepted as-is and nothing is
    returned or raised.
    """
    return None

def exec_before_job( app, inp_data, out_data, param_dict, tool=None):
    """Name every output dataset and assign its datatype before the job runs.

    Each dataset in `out_data` gets its name from the 'display' parameter
    (keeping the current name when absent) and its dbkey from the 'dbkey'
    parameter ('???' when absent).  The datatype extension is chosen from
    'hgta_outputType' when it matches a known output type; otherwise from
    the first recognised 'hgta_do*' key present in `param_dict`; otherwise
    it is "interval".  The dataset returned by
    `app.datatypes_registry.change_datatype` replaces the entry in `out_data`.
    """
    requested = param_dict.get( 'hgta_outputType', None )
    # When a non-empty list is supplied, only its last entry is considered.
    if isinstance(requested, list) and len(requested) > 0:
        requested = requested[-1]

    # Ordered (value, extension) pairs.  They are compared with == rather
    # than hashed, so an unhashable value (e.g. an empty list) simply
    # matches nothing and falls through to the parameter-key lookup.
    by_output_type = (
        ('wigData', "wig"),
        ('maf', "maf"),
        ('gff', "gff"),
        ('gff3', "gff3"),
    )
    # Ordered (parameter key, extension) pairs; the first key present wins.
    by_request_key = (
        ('hgta_doPrintSelectedFields', "interval"),
        ('hgta_doGetBed', "bed"),
        ('hgta_doGenomicDna', "fasta"),
        ('hgta_doGenePredSequence', "fasta"),
    )

    extension = None
    for output_type, mapped in by_output_type:
        if requested == output_type:
            extension = mapped
            break
    if extension is None:
        extension = "interval"
        for request_key, mapped in by_request_key:
            if request_key in param_dict:
                extension = mapped
                break

    for key, dataset in out_data.items():
        dataset.name = param_dict.get('display', dataset.name)
        dataset.dbkey = param_dict.get('dbkey', '???')
        out_data[key] = app.datatypes_registry.change_datatype(dataset, extension)
        
def exec_after_process( app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
    """Verify every output dataset after the job has run.

    For each dataset in `out_data`:

    * its size and peek are refreshed;
    * its file is scanned for lines beginning with '-'.  Each such line is
      collected into an "Errors:" message together with its line number,
      except that when the first one sits exactly three lines before the
      end of the file it is reported as a truncation warning instead;
    * an interval dataset with missing metadata is switched to the
      'tabular' datatype and the entry in `out_data` is replaced.

    Nothing is raised to the caller for these checks: any problem found (or
    any exception raised while checking) is appended to the dataset's `info`
    and its `blurb` is set to "error".
    """
    truncation_warning = "Warning: It appears that your results have been truncated by UCSC. View the bottom of your result file for details."

    for name, data in out_data.items():
        data.set_size()
        try:
            err_msg, err_flag = 'Errors:', False
            # Read the file once and close the handle deterministically;
            # the total line count is needed to recognise the truncation case.
            with open(data.file_name) as handle:
                lines = handle.readlines()
            num_lines = len(lines)
            for line_count, line in enumerate(lines, 1):
                if not line.startswith('-'):
                    continue
                if line_count + 3 == num_lines and not err_flag:
                    # First '-' line, three lines from the end: reported as
                    # truncation, replacing the generic error message.
                    err_flag = True
                    err_msg = truncation_warning
                    break
                err_flag = True
                err_msg = err_msg + " (line " + str(line_count) + ")" + line
            data.set_peek()
            if isinstance(data.datatype, datatypes.interval.Interval) and data.missing_meta():
                data = app.datatypes_registry.change_datatype(data, 'tabular')
                out_data[name] = data
            if err_flag:
                # Routed through the handler below so scan errors and
                # unexpected failures are recorded the same way.
                raise Exception(err_msg)
        except Exception as exc:
            # `info` may be unset; treat that as empty text rather than
            # failing while reporting the original problem.
            data.info = (data.info or '') + "\n" + str(exc)
            data.blurb = "error"

Summary ✨

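This is a custom post-processing script for the UCSC Genome Browser tool in Galaxy. Before the job runs, it sets the name and dbkey of each output dataset and assigns a datatype (“wig”, “maf”, “gff”, “gff3”, “interval”, “bed”, or “fasta”) based on the selected output type and request parameters. After the job runs, it scans each result file for lines beginning with “-”, records them as errors (or as a warning that UCSC truncated the results), and switches an interval dataset with missing metadata to the “tabular” datatype.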

Alerts (4)

'isinstance(' Overuse may indicate design issues; consider polymorphism
12 61
'try:' Ensure try blocks have corresponding except or finally blocks
47
'raise Exception(' Raise specific exception types for better error handling
65