corrtf_code.py - This is a Python script that filters and m…

/tools/expression/corrtf_code.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 162 lines · 132 code · 28 blank · 2 comment · 46 complexity · af525d4f234de80a46f4eada5981dbac MD5 · raw file


#build list of available data
import string, os, sys, glob, shutil
import galaxy.util
from galaxy import datatypes, config, jobs
from com.lilly.cistrome.gs import getGeneSymbols

repository = "/usr/local/galaxy/data/rg/library"

def mapProbesetToGene(db, filename, coe, geneName):
    if (not db.endswith(".db")):
        db += ".db"

    f = file(filename, 'r')
    lines = f.readlines()
    f.close()

    if (len(lines) == 0):
        return
    elif (len(lines) == 1 and lines[0].strip() == '""'):
        try:
            f = file(filename, 'w')
            f.write("Gene %s not found in expression set\n" % (geneName))
            return
        finally:
            f.close()

    header = "Gene\tCorrelation"

    lines = lines[1:]
    probesetIds = []
    for (line) in lines:
        tokens = line.split()
        probesetId = tokens[0].strip('"')
        if (len(probesetId) > 0):
            probesetIds.append(probesetId)

    if (len(probesetIds) > 0):
        geneSymbols = getGeneSymbols(db, probesetIds)
    else:
        geneSymbols = {}

    if (len(lines) == 1):
        try:
            f = file(filename, 'w')
            f.write("No gene correlated to %s with correlation coefficient of %s\n" % (geneSymbols[probesetIds[0]], coe))
            return
        finally:
            f.close()

    f = file(filename, 'w')
    f.write("%s\n" % (header))
    for (line) in lines:
        tokens = line.split()
        probesetId = tokens[0]
        gene = geneSymbols[probesetId.strip('"')]
        tokens[0] = gene
        f.write(" ".join(tokens) + '\n')
    f.close()

def filter(outfile, log, species, coA, coR, tf, tr):
    if (species == 'Nil'):
        return

    logLines = log.split("\n")

    s = [x for x in logLines if x.split('\t')[0] == "### cwd"]
    if (len(s)) > 0:
        s = [x.strip().split('\t')[1:] for x in s]
        cwd = s[0][0].strip('\n')

    header = None
    hasFilter = False
    if (coA):
        (header,coAMap) = getMap(os.path.join(cwd, 'coA' + species))
        hasFilter = True
    if (coR):
        (header,coRMap) = getMap(os.path.join(cwd, 'coR' + species))
        hasFilter = True
    if (tf):
        (header,tfMap) = getMap(os.path.join(cwd, 'tf' + species))
        hasFilter = True
    if (tr):
        (header,trMap) = getMap(os.path.join(cwd, 'tr' + species))
        hasFilter = True

    if (hasFilter):
        header = "Gene\tCorrelation\t" + header
    else:
        return

    try:
        f = file(outfile, 'r')
        lines = f.readlines()
    finally:
        f.close()

    try:
        f = file(outfile, 'w')
        f.write(header)
        for (line) in lines[1:]:
            gene = line.split()[0].strip()
            s = None
            if (coA and gene in coAMap):
                s = "%s\t%s" % ("\t".join(line.split()), coAMap[gene])
            elif (coR and gene in coRMap):
                s = "%s\t%s" % ("\t".join(line.split()), coRMap[gene])
            elif (tf and gene in tfMap):
                s = "%s\t%s" % ("\t".join(line.split()), tfMap[gene])
            elif (tr and gene in trMap):
                s = "%s\t%s" % ("\t".join(line.split()), trMap[gene])

            if (s <> None):
                f.write(s)
    finally:
        f.close()

def getMap(which):
    if (not os.path.isfile(which)):
        return (which, {})

    f = file(which, 'r')
    try:
        lines = f.readlines()
    finally:
        f.close()

    header = lines[0]
    headerTokens = header.split('\t')
    header = "\t".join(headerTokens[1:])
    lines = lines[1:]
    map = {}
    for (line) in lines:
        tokens = line.split('\t')
        map[tokens[0].strip()] = '\t'.join(tokens[1:])

    return (header,map)

def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
    infile = inp_data['i1'].file_name
    db = inp_data['i1'].dbkey
    outfile = out_data['logmeta'].file_name

    coe = inp_data.get('y')

    param_dict = param_dict.get("opMode")
    geneName = param_dict.get('geneName')

    if (db == '?'):
        return

    mapProbesetToGene(db, outfile, coe, geneName)
    filter(outfile, out_data['logmeta'].info, param_dict.get('filterSpecies'), param_dict.get('coA'), param_dict.get('coR'), param_dict.get('tf'), param_dict.get('tr'))

    # First column has changed from probesetId to gene
    try:
        f = file(outfile, 'r')
        pk = f.read()
    finally:
        f.close()
    out_data['logmeta'].peek = pk
    out_data['logmeta'].set_peek()

Summary ✨

This is a Python script that filters and maps probe sets to genes for a Galaxy workflow. It uses the getGeneSymbols function from the com.lilly.cistrome.gs module to map probesets to genes based on a database file. The script then filters the output based on user-defined parameters such as correlation coefficient, transcription factor, and transcription rate. Finally, it updates the peek of the output dataset with the filtered data.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (7)

'def' Ensure functions have docstrings for documentation
9 60 117 138
Complexity hotspot; lines 66 to 68 (total complexity: 4)
66 67 68