/tools/expression/eiplot.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 296 lines · 207 code · 43 blank · 46 comment · 22 complexity · 6ff6ccbcbc5d789a340f2cd2cd4413bc MD5 · raw file
- """
- exptf.py
- A galaxy tool to find the top-n highest expressed TFs
- """
- import os.path
- import sys
- import tempfile
- import os
- import re
- import string
- import subprocess
- rprog="""rplot=function(tdir, title, txtfname, logfpath, output1, output2)
- # rcall = "('%s','%s')" % (tdir, txtfname, logfpath)
- # /home/c104611/cistrome/dist/galaxy/database/tmp
- # /home/c104611/cistrome/src/gx/GX/SmallExpression.txt
- # /home/c104611/cistrome/dist/galaxy/database/tmp/test.dat
- {
- sink(file=file(logfpath, "at"), type="message")
- sink(file=file(logfpath, "at"), type="output")
- newfiles = c()
- newnames = c()
- newtypes = c()
- setwd(tdir)
- library(oligo)
- library(affy)
- library(genefilter)
- library(geneplotter)
- library(limma)
- library(RColorBrewer)
- library(genefilter)
- palette(brewer.pal(8, "Dark2"))
- pdffname = file.path(tdir, title) #provides full file path
- #for output
- newnames = c(newnames, title)
- newfiles = c(newfiles, pdffname)
- newtypes = c(newtypes, 'pdf')
- x <- read.table(output1,header=TRUE,sep="\t")
- xx <- read.table(output2,header=TRUE,sep="\t")
- require(graphics)
- par(mfrow=c(3,1))
- pdf(file=pdffname, paper='a4')
- boxplot(x[,2],xx[,2], names= TRUE,las=1, cex.axis=0.7, main="Expression Intensity of two Factors", col=c("lightblue","lightgreen"))
- hist(x[,2], col='lightblue', names= FALSE, xlab="Intensity", main="Distribution of Intensity Factor1")
- hist(xx[,2], col='lightgreen', names= FALSE,xlab="Intensity", main="Distribution of Intensity Factor2")
- dev.off()
- sink()
- return(list(newfiles=newfiles,newnames=newnames,newtypes=newtypes))
- }
- """
- mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
- tmp = '### tmp' # used by exec_after_process hook to clean up tmp directory
- def findExpression(inputExprFilePath, inputSampleName, inputCompareGeneFilePath, inputPartialFilePath, refChoice, logFileName, output1, output2):
- #read infpath
- #drop header
- #get sselect sample name
- #Get Full/Partial Reference Genes and write into file
- #genes = extract 1st column (genes) in vector
- #exp_val = extract n-th column in vector
- #map(exp_val, gene)
- #sort exp_val
- #loop sorted_exp_val and print gene until sorted_exp_val < cutoff
- #write into log file as matrix
- #read expression index file
- f = file(inputExprFilePath, 'r')
- exprFilelines = f.readlines()
- f.close()
-
- #find sample_name & remove if doubel quotes
- header_line = string.strip(exprFilelines[0])
- samples = []
- x_re = re.compile(r'\t')
- headerItems = x_re.split(header_line)
- for (sample) in headerItems:
- sample = string.strip(sample, '"')
- samples.append(sample)
- sample_name = samples[inputSampleName-1]
-
- #drop header
- exprFilelines = exprFilelines[1:]
- #prepare mapping for ALL & Partial
- expr_genes_arr = []
- exp_val_arr = []
- full_mapping = {}
- partial_exp_val_arr = []
- partial_mapping = {}
- if refChoice=="All":
- #when reference genes are ALL
- for (line) in exprFilelines:
- columns = line.strip().split()
- geneName = columns[0].strip('"')
- value = float(columns[inputSampleName])
- expr_genes_arr.append(geneName)
- exp_val_arr.append(value)
- full_mapping[value] = geneName
- else:
- #when reference genes are partial
- #read (gene = expr_value) from expression index file
- fpartial = file(inputPartialFilePath, 'r')
- partial_genes = fpartial.readlines()
- fpartial.close()
-
- #Match inputPartialFileFenes with expression_index file genes and list out
- for (partialGeneName) in partial_genes:
- partialGeneName = partialGeneName.strip().strip('"')
- for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
- columns = eachLine.strip().split()
- exprGeneName = columns[0].strip('"')
- exprValue = float(columns[inputSampleName])
- #check and add partial gene
- if exprGeneName==partialGeneName:
- partial_exp_val_arr.append(exprValue)
- partial_mapping[exprValue] = exprGeneName
- #write mapping into logs
- out1 = file(output1, 'w')
- #write mapping. 1st Preference to ALL
- if len(exp_val_arr)>0:
- exp_val_arr.sort()
- exp_val_arr.reverse()
- out1.write("%s\t%s\n" % ("GeneName",sample_name ))
- n=0
- for (evalue) in exp_val_arr:
- out1.write("%s\t%f" % (full_mapping[evalue], evalue)) # maintains value & gene
- n=n+1
- #if n<len(exp_val_arr):
- out1.write("\n")
- else:
-
- partial_exp_val_arr.sort()
- partial_exp_val_arr.reverse()
- out1.write("%s\t%s\n" % ("GeneName",sample_name ))
- m=0
- for (pevalue) in partial_exp_val_arr:
- out1.write("%s\t%f" % (partial_mapping[pevalue], pevalue)) # maintains value & gene
- m=m+1
- out1.write("\n")
- out1.close()
- #read compare gene lists expression index
- #read expression index file
- cf = file(inputCompareGeneFilePath, 'r')
- comp_genes = cf.readlines()
- cf.close()
- comp_exp_val_arr = []
- comp_mapping = {}
- #Match inputPartialFileFenes with expression_index file genes and list out
- for (cGeneName) in comp_genes:
- cGeneName = cGeneName.strip().strip('"')
- for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
- columns = eachLine.strip().split()
- cExprGeneName = columns[0].strip('"')
- cExprValue = float(columns[inputSampleName])
- #check and add partial gene
- if cExprGeneName==cGeneName:
- comp_exp_val_arr.append(cExprValue)
- comp_mapping[cExprValue] = cExprGeneName
-
- #write mapping into logs
- out2 = file(output2, 'w')
- #write mapping. 1st Preference to ALL
- if len(comp_exp_val_arr)>0:
- comp_exp_val_arr.sort()
- comp_exp_val_arr.reverse()
- out2.write("%s\t%s\n" % ("GeneName",sample_name ))
- i=0
- for (cevalue) in comp_exp_val_arr:
- i = i+1
- out2.write("%s\t%f" % (comp_mapping[cevalue], cevalue)) # maintains value & gene
- out2.write("\n")
- out2.close()
-
-
- def main():
- """called as
- <command interpreter="python">
- exptf.py '$expVal' '$i' '$cutoff' '$logmeta'
- </command>
- """
- nparm = 8
- appname = sys.argv[0]
- # print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
-
- if (len(sys.argv) < nparm):
- print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
- print 'usage: %s expression_values n out_file' % (appname)
- sys.exit(1)
- # short command line error
- title = sys.argv[1].strip()
- inputExprFilePath = sys.argv[2].strip()
- sampleName = int(sys.argv[3].strip())
- geneList1 = sys.argv[4].strip()
- refChoice = sys.argv[5].strip()
- geneList2 = sys.argv[6].strip()
- logFileName = sys.argv[7].strip()
-
- #create temp directory
- title = title.strip(' ').replace(' ',' ').replace(' ','_')
- tdir = tempfile.mkdtemp(prefix=title)
- title = title + '.pdf'
- #dirName = os.path.dirname(tdir)
-
- out1 = os.path.join(tdir, "out1.txt")
- out2 = os.path.join(tdir, "out2.txt")
- #remove temp files if already exists
- if os.path.exists(out1):
- os.remove(out1)
-
- if os.path.exists(out2):
- os.remove(out2)
-
- #create temp files
- output1_file = open(out1, "w")
- output1_file.close()
- output2_file = open(out2, "w")
- output2_file.close()
- #output1 = sys.argv[6].strip()
- #output2 = sys.argv[6].strip()
- print 'title:%s'% (title)
- print 'ExpressionFilePath:%s'% (inputExprFilePath)
- print 'Genes To be compared:%s'% (geneList1)
- print 'Choice:%s'% (refChoice)
- print 'Partial Genes:%s'% (geneList2)
- print 'logfilename:%s'% (logFileName)
- print 'output1:%s'% (out1)
- print 'output2:%s'% (out2)
- #write {gene expr_val) into tmp logfname
- #or
- #write {partial_gene expr_val) into tmp logFileName
- findExpression(inputExprFilePath, sampleName, geneList1, geneList2, refChoice, logFileName, out1, out2)
-
- # call R function & read return array
- rprogname = 'rplot'
- rcall = "%s('%s','%s','%s','%s','%s','%s')" % (rprogname,tdir,title,inputExprFilePath,logFileName,out1,out2)
- print '##rcall=',rcall
-
- p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
- p.communicate(rprog + "\n" + rcall)
- makenew = getFilenames(tdir, title)
- newfiles = makenew[0]
- newnames = makenew[1]
- newtypes = makenew[2]
- #Write PDF file path,name and type to Log file for plot_code reading. i.e, exec_after_process()
- logf = file(logFileName,'a')
- s = 'R code and call\n%s\n\n\nCalled as\n%s\n' % (rprog, rcall)
- logf.write(s)
- try:
- outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles[i],newnames[i],newtypes[i]) for i in range(len(newfiles))]
- except:
- outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles,newnames,newtypes),] # only 1
- tmpString = '\n%s\t%s\n' % (tmp, tdir)
- logf.write(tmpString)
- logf.write('\n'.join(outlist))
- logf.write('\n')
- logf.close()
-
- def getFilenames(tdir, title):
- return [[os.path.join(tdir, title)], [title], ['pdf']]
- if __name__ == "__main__":
- main()