PageRenderTime 48ms CodeModel.GetById 24ms app.highlight 19ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/expression/eiplot.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 296 lines | 291 code | 1 blank | 4 comment | 0 complexity | 6ff6ccbcbc5d789a340f2cd2cd4413bc MD5 | raw file
  1"""
  2exptf.py
  3A galaxy tool to find the top-n highest expressed TFs
  4"""
  5import os.path
  6import sys
  7import tempfile
  8import os
  9import re
 10import string
 11import subprocess
 12
 13rprog="""rplot=function(tdir, title, txtfname, logfpath, output1, output2)
 14#  rcall = "('%s','%s')" % (tdir, txtfname, logfpath)
 15# /home/c104611/cistrome/dist/galaxy/database/tmp
 16# /home/c104611/cistrome/src/gx/GX/SmallExpression.txt
 17# /home/c104611/cistrome/dist/galaxy/database/tmp/test.dat
 18{
 19sink(file=file(logfpath, "at"), type="message")
 20sink(file=file(logfpath, "at"), type="output")
 21
 22newfiles = c()
 23newnames = c()
 24newtypes = c()
 25
 26setwd(tdir)
 27
 28library(oligo)
 29library(affy)
 30library(genefilter)
 31library(geneplotter)
 32library(limma)
 33library(RColorBrewer)
 34library(genefilter)
 35
 36palette(brewer.pal(8, "Dark2"))
 37pdffname = file.path(tdir, title) #provides full file path
 38
 39#for output
 40newnames = c(newnames, title)
 41newfiles = c(newfiles, pdffname)
 42newtypes = c(newtypes, 'pdf')
 43
 44x <- read.table(output1,header=TRUE,sep="\t")
 45xx <- read.table(output2,header=TRUE,sep="\t")
 46
 47require(graphics)
 48
 49par(mfrow=c(3,1))
 50pdf(file=pdffname, paper='a4')
 51
 52boxplot(x[,2],xx[,2], names= TRUE,las=1, cex.axis=0.7, main="Expression Intensity of two Factors", col=c("lightblue","lightgreen"))
 53hist(x[,2], col='lightblue', names= FALSE, xlab="Intensity", main="Distribution of Intensity Factor1")
 54hist(xx[,2], col='lightgreen', names= FALSE,xlab="Intensity",  main="Distribution of Intensity Factor2")
 55
 56dev.off()
 57
 58sink()
 59return(list(newfiles=newfiles,newnames=newnames,newtypes=newtypes))
 60}
 61"""
 62
 63mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
 64tmp = '### tmp' # used by exec_after_process hook to clean up tmp directory
 65
 66def findExpression(inputExprFilePath, inputSampleName, inputCompareGeneFilePath, inputPartialFilePath, refChoice, logFileName, output1, output2):
 67
 68    #read infpath
 69    #drop header
 70    #get sselect sample name
 71    #Get Full/Partial Reference Genes and write into file
 72        #genes = extract 1st column (genes) in vector
 73        #exp_val = extract n-th column in vector
 74        #map(exp_val, gene)
 75        #sort exp_val
 76        #loop sorted_exp_val and print gene until sorted_exp_val < cutoff
 77    #write into log file as matrix
 78
 79    #read expression index file
 80    f = file(inputExprFilePath, 'r')
 81    exprFilelines = f.readlines()
 82    f.close()      
 83   
 84    #find sample_name & remove if doubel quotes
 85    header_line =  string.strip(exprFilelines[0])
 86
 87    samples = []
 88    x_re = re.compile(r'\t')
 89    headerItems = x_re.split(header_line)
 90
 91    for (sample) in headerItems:
 92        sample = string.strip(sample, '"')
 93        samples.append(sample)
 94
 95    sample_name = samples[inputSampleName-1]
 96    
 97    #drop header
 98    exprFilelines = exprFilelines[1:]
 99
100    #prepare mapping for ALL & Partial    
101    expr_genes_arr = []
102    exp_val_arr = []
103    full_mapping = {}
104
105    partial_exp_val_arr = []
106    partial_mapping = {}
107
108    if refChoice=="All":
109        #when reference genes are ALL
110        for (line) in exprFilelines:
111            columns = line.strip().split()
112            geneName = columns[0].strip('"')            
113            value = float(columns[inputSampleName])
114            expr_genes_arr.append(geneName)
115            exp_val_arr.append(value)
116            full_mapping[value] = geneName
117    else:
118        #when reference genes are partial
119        #read (gene = expr_value) from expression index file
120        fpartial = file(inputPartialFilePath, 'r')
121        partial_genes = fpartial.readlines()
122        fpartial.close()
123        
124        #Match inputPartialFileFenes with expression_index file genes and list out
125        for (partialGeneName) in partial_genes:
126            partialGeneName = partialGeneName.strip().strip('"')
127            for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
128                columns = eachLine.strip().split()
129                exprGeneName = columns[0].strip('"')
130                exprValue = float(columns[inputSampleName])
131                #check and add partial gene
132                if exprGeneName==partialGeneName:
133                    partial_exp_val_arr.append(exprValue)
134                    partial_mapping[exprValue] = exprGeneName
135
136    #write mapping into logs
137    out1 = file(output1, 'w')
138
139    #write mapping. 1st Preference to ALL    
140    if len(exp_val_arr)>0:  
141        exp_val_arr.sort()
142        exp_val_arr.reverse()
143        out1.write("%s\t%s\n" % ("GeneName",sample_name ))
144        n=0
145        for (evalue) in exp_val_arr:
146            out1.write("%s\t%f" % (full_mapping[evalue], evalue)) # maintains value & gene
147            n=n+1
148            #if n<len(exp_val_arr):
149            out1.write("\n")
150    else:
151        
152        partial_exp_val_arr.sort()
153        partial_exp_val_arr.reverse()
154        out1.write("%s\t%s\n" % ("GeneName",sample_name ))
155        m=0
156        for (pevalue) in partial_exp_val_arr:
157            out1.write("%s\t%f" % (partial_mapping[pevalue], pevalue)) # maintains value & gene
158            m=m+1            
159            out1.write("\n")
160
161    out1.close()
162
163    #read compare gene lists expression index
164    #read expression index file
165    cf = file(inputCompareGeneFilePath, 'r')
166    comp_genes = cf.readlines()
167    cf.close()
168
169    comp_exp_val_arr = []
170    comp_mapping = {}
171    #Match inputPartialFileFenes with expression_index file genes and list out
172    for (cGeneName) in comp_genes:
173        cGeneName = cGeneName.strip().strip('"')
174        for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
175            columns = eachLine.strip().split()
176            cExprGeneName = columns[0].strip('"')
177            cExprValue = float(columns[inputSampleName])
178            #check and add partial gene
179            if cExprGeneName==cGeneName:
180                comp_exp_val_arr.append(cExprValue)
181                comp_mapping[cExprValue] = cExprGeneName
182                
183    #write mapping into logs
184    out2 = file(output2, 'w')
185
186    #write mapping. 1st Preference to ALL
187    if len(comp_exp_val_arr)>0:
188        comp_exp_val_arr.sort()
189        comp_exp_val_arr.reverse()
190        out2.write("%s\t%s\n" % ("GeneName",sample_name ))
191        i=0
192        for (cevalue) in comp_exp_val_arr:
193            i = i+1
194            out2.write("%s\t%f" % (comp_mapping[cevalue], cevalue)) # maintains value & gene           
195            out2.write("\n")
196    out2.close()
197       
198            
199def main():
200    """called as 
201<command interpreter="python">
202exptf.py '$expVal' '$i' '$cutoff' '$logmeta'
203</command>
204    """
205    nparm = 8
206    appname = sys.argv[0]
207
208   # print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
209    
210    if (len(sys.argv) < nparm):
211        print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
212        print 'usage: %s expression_values n out_file' % (appname)
213        sys.exit(1)
214        # short command line error
215    title = sys.argv[1].strip()
216    inputExprFilePath = sys.argv[2].strip()
217    sampleName = int(sys.argv[3].strip())
218    geneList1 =  sys.argv[4].strip()
219    refChoice = sys.argv[5].strip()
220    geneList2 = sys.argv[6].strip()
221    logFileName = sys.argv[7].strip()
222    
223    #create temp directory
224    title = title.strip(' ').replace('  ',' ').replace(' ','_')
225    tdir = tempfile.mkdtemp(prefix=title)
226    title = title + '.pdf'   
227
228    #dirName = os.path.dirname(tdir)
229    
230    out1 = os.path.join(tdir, "out1.txt")
231    out2 = os.path.join(tdir, "out2.txt")
232
233    #remove temp files if already exists
234    if os.path.exists(out1):
235        os.remove(out1)
236        
237    if os.path.exists(out2):
238        os.remove(out2)
239    
240    #create temp files
241    output1_file = open(out1, "w")    
242    output1_file.close()
243
244    output2_file = open(out2, "w")    
245    output2_file.close()
246
247    #output1 = sys.argv[6].strip()
248    #output2 = sys.argv[6].strip()
249
250    print 'title:%s'% (title)
251    print 'ExpressionFilePath:%s'% (inputExprFilePath)
252    print 'Genes To be compared:%s'% (geneList1)
253    print 'Choice:%s'% (refChoice)
254    print 'Partial Genes:%s'% (geneList2)
255    print 'logfilename:%s'% (logFileName)
256    print 'output1:%s'% (out1)
257    print 'output2:%s'% (out2)
258
259    #write {gene expr_val) into tmp logfname
260    #or
261    #write {partial_gene expr_val) into tmp logFileName
262    findExpression(inputExprFilePath, sampleName, geneList1, geneList2, refChoice, logFileName, out1, out2)
263   
264    # call R function & read return array
265    rprogname = 'rplot'
266    rcall = "%s('%s','%s','%s','%s','%s','%s')" % (rprogname,tdir,title,inputExprFilePath,logFileName,out1,out2)
267    print '##rcall=',rcall
268    
269    p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
270    p.communicate(rprog + "\n" + rcall)
271    makenew = getFilenames(tdir, title)
272
273    newfiles = makenew[0]
274    newnames = makenew[1]
275    newtypes = makenew[2]
276
277    #Write PDF file path,name and type to Log file for plot_code reading. i.e, exec_after_process()
278    logf = file(logFileName,'a')
279    s = 'R code and call\n%s\n\n\nCalled as\n%s\n' % (rprog, rcall)
280    logf.write(s)
281    try:
282        outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles[i],newnames[i],newtypes[i]) for i in range(len(newfiles))]
283    except:
284        outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles,newnames,newtypes),] # only 1
285    tmpString = '\n%s\t%s\n' % (tmp, tdir)
286    logf.write(tmpString)
287    logf.write('\n'.join(outlist))
288    logf.write('\n')
289    logf.close()
290    
291def getFilenames(tdir, title):
292    return [[os.path.join(tdir, title)], [title], ['pdf']]
293
294if __name__ == "__main__":
295    main()
296