eiplot.py - This Python script uses R to create a plot of t…

/tools/expression/eiplot.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 296 lines · 207 code · 43 blank · 46 comment · 22 complexity · 6ff6ccbcbc5d789a340f2cd2cd4413bc MD5 · raw file


"""
exptf.py
A galaxy tool to find the top-n highest expressed TFs
"""
import os.path
import sys
import tempfile
import os
import re
import string
import subprocess

rprog="""rplot=function(tdir, title, txtfname, logfpath, output1, output2)
#  rcall = "('%s','%s')" % (tdir, txtfname, logfpath)
# /home/c104611/cistrome/dist/galaxy/database/tmp
# /home/c104611/cistrome/src/gx/GX/SmallExpression.txt
# /home/c104611/cistrome/dist/galaxy/database/tmp/test.dat
{
sink(file=file(logfpath, "at"), type="message")
sink(file=file(logfpath, "at"), type="output")

newfiles = c()
newnames = c()
newtypes = c()

setwd(tdir)

library(oligo)
library(affy)
library(genefilter)
library(geneplotter)
library(limma)
library(RColorBrewer)
library(genefilter)

palette(brewer.pal(8, "Dark2"))
pdffname = file.path(tdir, title) #provides full file path

#for output
newnames = c(newnames, title)
newfiles = c(newfiles, pdffname)
newtypes = c(newtypes, 'pdf')

x <- read.table(output1,header=TRUE,sep="\t")
xx <- read.table(output2,header=TRUE,sep="\t")

require(graphics)

par(mfrow=c(3,1))
pdf(file=pdffname, paper='a4')

boxplot(x[,2],xx[,2], names= TRUE,las=1, cex.axis=0.7, main="Expression Intensity of two Factors", col=c("lightblue","lightgreen"))
hist(x[,2], col='lightblue', names= FALSE, xlab="Intensity", main="Distribution of Intensity Factor1")
hist(xx[,2], col='lightgreen', names= FALSE,xlab="Intensity",  main="Distribution of Intensity Factor2")

dev.off()

sink()
return(list(newfiles=newfiles,newnames=newnames,newtypes=newtypes))
}
"""

mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
tmp = '### tmp' # used by exec_after_process hook to clean up tmp directory

def findExpression(inputExprFilePath, inputSampleName, inputCompareGeneFilePath, inputPartialFilePath, refChoice, logFileName, output1, output2):

    #read infpath
    #drop header
    #get sselect sample name
    #Get Full/Partial Reference Genes and write into file
        #genes = extract 1st column (genes) in vector
        #exp_val = extract n-th column in vector
        #map(exp_val, gene)
        #sort exp_val
        #loop sorted_exp_val and print gene until sorted_exp_val < cutoff
    #write into log file as matrix

    #read expression index file
    f = file(inputExprFilePath, 'r')
    exprFilelines = f.readlines()
    f.close()      
   
    #find sample_name & remove if doubel quotes
    header_line =  string.strip(exprFilelines[0])

    samples = []
    x_re = re.compile(r'\t')
    headerItems = x_re.split(header_line)

    for (sample) in headerItems:
        sample = string.strip(sample, '"')
        samples.append(sample)

    sample_name = samples[inputSampleName-1]
    
    #drop header
    exprFilelines = exprFilelines[1:]

    #prepare mapping for ALL & Partial    
    expr_genes_arr = []
    exp_val_arr = []
    full_mapping = {}

    partial_exp_val_arr = []
    partial_mapping = {}

    if refChoice=="All":
        #when reference genes are ALL
        for (line) in exprFilelines:
            columns = line.strip().split()
            geneName = columns[0].strip('"')            
            value = float(columns[inputSampleName])
            expr_genes_arr.append(geneName)
            exp_val_arr.append(value)
            full_mapping[value] = geneName
    else:
        #when reference genes are partial
        #read (gene = expr_value) from expression index file
        fpartial = file(inputPartialFilePath, 'r')
        partial_genes = fpartial.readlines()
        fpartial.close()
        
        #Match inputPartialFileFenes with expression_index file genes and list out
        for (partialGeneName) in partial_genes:
            partialGeneName = partialGeneName.strip().strip('"')
            for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
                columns = eachLine.strip().split()
                exprGeneName = columns[0].strip('"')
                exprValue = float(columns[inputSampleName])
                #check and add partial gene
                if exprGeneName==partialGeneName:
                    partial_exp_val_arr.append(exprValue)
                    partial_mapping[exprValue] = exprGeneName

    #write mapping into logs
    out1 = file(output1, 'w')

    #write mapping. 1st Preference to ALL    
    if len(exp_val_arr)>0:  
        exp_val_arr.sort()
        exp_val_arr.reverse()
        out1.write("%s\t%s\n" % ("GeneName",sample_name ))
        n=0
        for (evalue) in exp_val_arr:
            out1.write("%s\t%f" % (full_mapping[evalue], evalue)) # maintains value & gene
            n=n+1
            #if n<len(exp_val_arr):
            out1.write("\n")
    else:
        
        partial_exp_val_arr.sort()
        partial_exp_val_arr.reverse()
        out1.write("%s\t%s\n" % ("GeneName",sample_name ))
        m=0
        for (pevalue) in partial_exp_val_arr:
            out1.write("%s\t%f" % (partial_mapping[pevalue], pevalue)) # maintains value & gene
            m=m+1            
            out1.write("\n")

    out1.close()

    #read compare gene lists expression index
    #read expression index file
    cf = file(inputCompareGeneFilePath, 'r')
    comp_genes = cf.readlines()
    cf.close()

    comp_exp_val_arr = []
    comp_mapping = {}
    #Match inputPartialFileFenes with expression_index file genes and list out
    for (cGeneName) in comp_genes:
        cGeneName = cGeneName.strip().strip('"')
        for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
            columns = eachLine.strip().split()
            cExprGeneName = columns[0].strip('"')
            cExprValue = float(columns[inputSampleName])
            #check and add partial gene
            if cExprGeneName==cGeneName:
                comp_exp_val_arr.append(cExprValue)
                comp_mapping[cExprValue] = cExprGeneName
                
    #write mapping into logs
    out2 = file(output2, 'w')

    #write mapping. 1st Preference to ALL
    if len(comp_exp_val_arr)>0:
        comp_exp_val_arr.sort()
        comp_exp_val_arr.reverse()
        out2.write("%s\t%s\n" % ("GeneName",sample_name ))
        i=0
        for (cevalue) in comp_exp_val_arr:
            i = i+1
            out2.write("%s\t%f" % (comp_mapping[cevalue], cevalue)) # maintains value & gene           
            out2.write("\n")
    out2.close()
       
            
def main():
    """called as 
<command interpreter="python">
exptf.py '$expVal' '$i' '$cutoff' '$logmeta'
</command>
    """
    nparm = 8
    appname = sys.argv[0]

   # print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
    
    if (len(sys.argv) < nparm):
        print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
        print 'usage: %s expression_values n out_file' % (appname)
        sys.exit(1)
        # short command line error
    title = sys.argv[1].strip()
    inputExprFilePath = sys.argv[2].strip()
    sampleName = int(sys.argv[3].strip())
    geneList1 =  sys.argv[4].strip()
    refChoice = sys.argv[5].strip()
    geneList2 = sys.argv[6].strip()
    logFileName = sys.argv[7].strip()
    
    #create temp directory
    title = title.strip(' ').replace('  ',' ').replace(' ','_')
    tdir = tempfile.mkdtemp(prefix=title)
    title = title + '.pdf'   

    #dirName = os.path.dirname(tdir)
    
    out1 = os.path.join(tdir, "out1.txt")
    out2 = os.path.join(tdir, "out2.txt")

    #remove temp files if already exists
    if os.path.exists(out1):
        os.remove(out1)
        
    if os.path.exists(out2):
        os.remove(out2)
    
    #create temp files
    output1_file = open(out1, "w")    
    output1_file.close()

    output2_file = open(out2, "w")    
    output2_file.close()

    #output1 = sys.argv[6].strip()
    #output2 = sys.argv[6].strip()

    print 'title:%s'% (title)
    print 'ExpressionFilePath:%s'% (inputExprFilePath)
    print 'Genes To be compared:%s'% (geneList1)
    print 'Choice:%s'% (refChoice)
    print 'Partial Genes:%s'% (geneList2)
    print 'logfilename:%s'% (logFileName)
    print 'output1:%s'% (out1)
    print 'output2:%s'% (out2)

    #write {gene expr_val) into tmp logfname
    #or
    #write {partial_gene expr_val) into tmp logFileName
    findExpression(inputExprFilePath, sampleName, geneList1, geneList2, refChoice, logFileName, out1, out2)
   
    # call R function & read return array
    rprogname = 'rplot'
    rcall = "%s('%s','%s','%s','%s','%s','%s')" % (rprogname,tdir,title,inputExprFilePath,logFileName,out1,out2)
    print '##rcall=',rcall
    
    p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
    p.communicate(rprog + "\n" + rcall)
    makenew = getFilenames(tdir, title)

    newfiles = makenew[0]
    newnames = makenew[1]
    newtypes = makenew[2]

    #Write PDF file path,name and type to Log file for plot_code reading. i.e, exec_after_process()
    logf = file(logFileName,'a')
    s = 'R code and call\n%s\n\n\nCalled as\n%s\n' % (rprog, rcall)
    logf.write(s)
    try:
        outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles[i],newnames[i],newtypes[i]) for i in range(len(newfiles))]
    except:
        outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles,newnames,newtypes),] # only 1
    tmpString = '\n%s\t%s\n' % (tmp, tdir)
    logf.write(tmpString)
    logf.write('\n'.join(outlist))
    logf.write('\n')
    logf.close()
    
def getFilenames(tdir, title):
    return [[os.path.join(tdir, title)], [title], ['pdf']]

if __name__ == "__main__":
    main()

Summary ✨

This Python script uses R to create a plot of the expression levels of genes from two different datasets, with one dataset being used as a reference. The script takes in several arguments, including the path to an expression data file, the names of the genes to be compared, and the choice of which dataset should be used as the reference. It then creates temporary files containing the expression levels for each gene and calls an R function to create a plot comparing the two datasets. The output is a PDF file with the plot.

Tech Fingerprint

Alerts (6)

'def' Avoid long function definitions; keep signatures concise for readability
66
'def' Ensure functions have docstrings for documentation
66 291
'open(' Use 'with open()' to ensure Files are properly closed
241 244
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
283