diffexp_code.py - This is a Python script that analyzes gen…

/tools/expression/diffexp_code.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 186 lines · 168 code · 7 blank · 11 comment · 15 complexity · 3ced7ac0aaf9561975a64a4c3c9819d8 MD5 · raw file


#build list of available data
import string, os, sys, glob, shutil
import galaxy.util, shlex
from galaxy import datatypes, config, jobs 

from com.lilly.cistrome import htmlParser

#repository = "/usr/local/galaxy/data/rg/library"

def getHasPheno(phef):
  if not phef:
    return [('Missing input','nothing',False)]
  hasPheno = phef.metadata.pheno
  phenoOptions = [] 
  if (hasPheno == None):
    phenoOptions.append(('Manual', 'manual', True))
  else:
    phenoOptions.append(('Use uploaded pheno file', 'file', True))
    phenoOptions.append(('Manual', 'manual', False))
  return phenoOptions


def getSamples(phef):
  if not phef:
    return []
  #phe = phef.metadata.pheno
  infile = open(phef.get_file_name())
  header = infile.readline().strip()
#  samples = shlex.split(header)
  samples = header.split('\t')
  if samples[0]=="":
    samples = samples[1:]
  samples = map(lambda x : (x, x, False), samples)
  infile.close()
  return samples

def get_phecols(phef):
   """ this version only gives dichotomous columns
   return phenotype column names for an rexpression - eg eset or affybatch
   A column with only 1 value doesn't change, so is not interesting for
   analysis. A column with a different value in every row is equivalent to a unique
   identifier so is also not interesting for anova or limma analysis so both these
   are removed after the concordance (count of unique terms) is constructed for each
   column. 
   """
   sep = 'XXX'
   phe = phef.metadata.pheno
   if (phe == None):
      res = [('1. No dichotomous phenotype columns found','',False),]
      return res
   phelist = phe.split('\n')
   head = None 
   for nrows,row in enumerate(phelist): # construct concordance
        row = row.strip().split('\t')
        if nrows == 0:
           head = row
           totcols = len(row) 
           concordance = [{} for x in head] # list of dicts
        else:
            for col,code in enumerate(row): # keep column order correct
                if not concordance[col].get(code,None):
                    concordance[col][code] = 1 # first
                else:
                    concordance[col][code] += 1
                
   useCols = []
   useConc = [] # columns of interest to keep
   colNames = []
   nrows -= 1 # drop head from count
   for c,conc in enumerate(concordance): # c is column number
        ck = len(conc.keys())
        if (ck > 1) and (ck < nrows): # not all same and not all different!!
            useConc.append(conc) # keep concordance
            useCols.append(c) # keep column
            colNames.append(head[c])
   nuse = len(useCols) 
   res = [] 
   for c,conc in enumerate(useConc): # these are all unique columns for the design matrix
            if len(conc.keys()) == 2: # dichotomous
                ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
                ccounts.sort()
                cc = ['%s (%d)' % (x[1],x[0]) for x in ccounts]
                retc = [x[1] for x in ccounts] # for design matrix, just want values, not counts
                show = '%s%s%s' % (colNames[c],sep,sep.join(cc)) # undecorate to get codes + counts in order
                rets = '%s%s%s' % (colNames[c],sep,sep.join(retc)) # for design
                res.append((rets,rets,False))
   if len(res) >= 1:
      x,y,z = res[0] # 0,1 = fid,iid
      res[0] = (x,y,True) # set second selected
   else:
      res = [('2. No dichotomous phenotype columns found','',False),]
   return res


def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
    """Sets the name of the html file to the title"""
    trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
    job_name = param_dict.get( 'title', 'AffyExpress' ).translate(trantab)
    data = out_data['outfile']
    newname = '%s.html' % job_name
    data = app.datatypes_registry.change_datatype(data,'html')
    data.name = newname
    out_data['outfile'] = data

    logpath = out_data['outfile'].file_name
    loglist = file(logpath, 'r').readlines()
    mng = '### makenewgalaxy'
    txtpath = out_data['txtoutfile'].file_name
#    txtf = file(txtpath, 'w')
#    txtf.write('Probe\tSymbol\tDescription\tGene\tCytoband\tLog2Ratio\tPValue\n')
    atLeastOne = False
    newfile = [x for x in loglist if x.split('\t')[0] == mng]
    newfile = [x.strip().split('\t')[1:] for x in newfile]
    for (file_path, title) in newfile:
        okToOpen = os.path.exists(file_path)
        atLeastOne = okToOpen or atLeastOne
        if (not okToOpen):
            continue

        htmlParser.html2Txt(file_path, txtpath)
#        html = file(file_path, 'r').readlines()
        """
        data = [x for x in html if x.split(' ')[0] == '<td']
        i = 0
        for (line) in data:
            i = i + 1
            if (i == 1):
                td = string.split(line, '>')
                probeset = td[2]
                td = string.split(probeset, '<')
                probeset = td[0].strip()
            if (i == 2):
                td = string.split(line, '>')
                geneSymbol = td[1]
                td = string.split(geneSymbol, '<')
                geneSymbol = td[0].strip()
            if (i == 3):
                td = string.split(line, '>')
                description = td[1]
                td = string.split(description , '<')
                description = td[0].strip()
            if (i == 4):
                td = string.split(line, '>')
                entrezId = td[2]
                td = string.split(entrezId , '<')
                entrezId = td[0].strip()
            if (i == 5):
                td = string.split(line, '>')
                cytoband = td[2]
                td = string.split(cytoband , '<')
                cytoband = td[0].strip()
            if (i == 6):
                td = string.split(line, '>')
                log_2_ratio = td[1]
                td = string.split(log_2_ratio , '<')
                log_2_ratio = td[0].strip()
            if (i == 7):
                td = string.split(line, '>')
                pvalue = td[1]
                td = string.split(pvalue , '<')
                pvalue = td[0].strip()

                i = 0
                s = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (probeset, geneSymbol, description, entrezId, cytoband, log_2_ratio, pvalue)
                txtf.write(s)

    txtf.close()
"""

    if (not atLeastOne):
        # overwrite file and output a message
        txtf = file(txtpath, 'w')
        infname = param_dict['rexprIn'].name
        pthresh = param_dict['pthresh']
        txtf.write("No differentially expressed gene found in %s at p-value %s" % (infname, pthresh))
        txtf.close()

    out_data['txtoutfile'].set_peek()
    out_data['txtoutfile'].set_meta()
    out_data['txtoutfile'].set_size()
    title = param_dict.get('title', 'Differential_expression')
    out_data['txtoutfile'].name = title + "TargetGenes.txt"
    out_data['txtoutfile'].info = title + " target genes"

    app.model.context.flush()

Summary ✨

This is a Python script that analyzes gene expression data and identifies differentially expressed genes based on a specified p-value threshold. It takes in a file with gene expression data, performs statistical analysis, and outputs a list of differentially expressed genes along with their corresponding p-values.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (11)

'def' Ensure functions have docstrings for documentation
10 23
'== None' Use 'is' for None comparisons (e.g., x is None)
15 48
'open(' Use 'with open()' to ensure Files are properly closed
27
Complexity hotspot; lines 41 to 43 (total complexity: 4)
41 42 43
Complexity hotspot; lines 112 to 114 (total complexity: 4)
112 113 114