/tools/expression/eiplot.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 296 lines · 207 code · 43 blank · 46 comment · 22 complexity · 6ff6ccbcbc5d789a340f2cd2cd4413bc MD5 · raw file

  1. """
  2. exptf.py
  3. A galaxy tool to find the top-n highest expressed TFs
  4. """
  5. import os.path
  6. import sys
  7. import tempfile
  8. import os
  9. import re
  10. import string
  11. import subprocess
  12. rprog="""rplot=function(tdir, title, txtfname, logfpath, output1, output2)
  13. # rcall = "('%s','%s')" % (tdir, txtfname, logfpath)
  14. # /home/c104611/cistrome/dist/galaxy/database/tmp
  15. # /home/c104611/cistrome/src/gx/GX/SmallExpression.txt
  16. # /home/c104611/cistrome/dist/galaxy/database/tmp/test.dat
  17. {
  18. sink(file=file(logfpath, "at"), type="message")
  19. sink(file=file(logfpath, "at"), type="output")
  20. newfiles = c()
  21. newnames = c()
  22. newtypes = c()
  23. setwd(tdir)
  24. library(oligo)
  25. library(affy)
  26. library(genefilter)
  27. library(geneplotter)
  28. library(limma)
  29. library(RColorBrewer)
  30. library(genefilter)
  31. palette(brewer.pal(8, "Dark2"))
  32. pdffname = file.path(tdir, title) #provides full file path
  33. #for output
  34. newnames = c(newnames, title)
  35. newfiles = c(newfiles, pdffname)
  36. newtypes = c(newtypes, 'pdf')
  37. x <- read.table(output1,header=TRUE,sep="\t")
  38. xx <- read.table(output2,header=TRUE,sep="\t")
  39. require(graphics)
  40. par(mfrow=c(3,1))
  41. pdf(file=pdffname, paper='a4')
  42. boxplot(x[,2],xx[,2], names= TRUE,las=1, cex.axis=0.7, main="Expression Intensity of two Factors", col=c("lightblue","lightgreen"))
  43. hist(x[,2], col='lightblue', names= FALSE, xlab="Intensity", main="Distribution of Intensity Factor1")
  44. hist(xx[,2], col='lightgreen', names= FALSE,xlab="Intensity", main="Distribution of Intensity Factor2")
  45. dev.off()
  46. sink()
  47. return(list(newfiles=newfiles,newnames=newnames,newtypes=newtypes))
  48. }
  49. """
  50. mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
  51. tmp = '### tmp' # used by exec_after_process hook to clean up tmp directory
  52. def findExpression(inputExprFilePath, inputSampleName, inputCompareGeneFilePath, inputPartialFilePath, refChoice, logFileName, output1, output2):
  53. #read infpath
  54. #drop header
  55. #get sselect sample name
  56. #Get Full/Partial Reference Genes and write into file
  57. #genes = extract 1st column (genes) in vector
  58. #exp_val = extract n-th column in vector
  59. #map(exp_val, gene)
  60. #sort exp_val
  61. #loop sorted_exp_val and print gene until sorted_exp_val < cutoff
  62. #write into log file as matrix
  63. #read expression index file
  64. f = file(inputExprFilePath, 'r')
  65. exprFilelines = f.readlines()
  66. f.close()
  67. #find sample_name & remove if doubel quotes
  68. header_line = string.strip(exprFilelines[0])
  69. samples = []
  70. x_re = re.compile(r'\t')
  71. headerItems = x_re.split(header_line)
  72. for (sample) in headerItems:
  73. sample = string.strip(sample, '"')
  74. samples.append(sample)
  75. sample_name = samples[inputSampleName-1]
  76. #drop header
  77. exprFilelines = exprFilelines[1:]
  78. #prepare mapping for ALL & Partial
  79. expr_genes_arr = []
  80. exp_val_arr = []
  81. full_mapping = {}
  82. partial_exp_val_arr = []
  83. partial_mapping = {}
  84. if refChoice=="All":
  85. #when reference genes are ALL
  86. for (line) in exprFilelines:
  87. columns = line.strip().split()
  88. geneName = columns[0].strip('"')
  89. value = float(columns[inputSampleName])
  90. expr_genes_arr.append(geneName)
  91. exp_val_arr.append(value)
  92. full_mapping[value] = geneName
  93. else:
  94. #when reference genes are partial
  95. #read (gene = expr_value) from expression index file
  96. fpartial = file(inputPartialFilePath, 'r')
  97. partial_genes = fpartial.readlines()
  98. fpartial.close()
  99. #Match inputPartialFileFenes with expression_index file genes and list out
  100. for (partialGeneName) in partial_genes:
  101. partialGeneName = partialGeneName.strip().strip('"')
  102. for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
  103. columns = eachLine.strip().split()
  104. exprGeneName = columns[0].strip('"')
  105. exprValue = float(columns[inputSampleName])
  106. #check and add partial gene
  107. if exprGeneName==partialGeneName:
  108. partial_exp_val_arr.append(exprValue)
  109. partial_mapping[exprValue] = exprGeneName
  110. #write mapping into logs
  111. out1 = file(output1, 'w')
  112. #write mapping. 1st Preference to ALL
  113. if len(exp_val_arr)>0:
  114. exp_val_arr.sort()
  115. exp_val_arr.reverse()
  116. out1.write("%s\t%s\n" % ("GeneName",sample_name ))
  117. n=0
  118. for (evalue) in exp_val_arr:
  119. out1.write("%s\t%f" % (full_mapping[evalue], evalue)) # maintains value & gene
  120. n=n+1
  121. #if n<len(exp_val_arr):
  122. out1.write("\n")
  123. else:
  124. partial_exp_val_arr.sort()
  125. partial_exp_val_arr.reverse()
  126. out1.write("%s\t%s\n" % ("GeneName",sample_name ))
  127. m=0
  128. for (pevalue) in partial_exp_val_arr:
  129. out1.write("%s\t%f" % (partial_mapping[pevalue], pevalue)) # maintains value & gene
  130. m=m+1
  131. out1.write("\n")
  132. out1.close()
  133. #read compare gene lists expression index
  134. #read expression index file
  135. cf = file(inputCompareGeneFilePath, 'r')
  136. comp_genes = cf.readlines()
  137. cf.close()
  138. comp_exp_val_arr = []
  139. comp_mapping = {}
  140. #Match inputPartialFileFenes with expression_index file genes and list out
  141. for (cGeneName) in comp_genes:
  142. cGeneName = cGeneName.strip().strip('"')
  143. for (eachLine) in exprFilelines: #read (gene = expr_value) from expression index file
  144. columns = eachLine.strip().split()
  145. cExprGeneName = columns[0].strip('"')
  146. cExprValue = float(columns[inputSampleName])
  147. #check and add partial gene
  148. if cExprGeneName==cGeneName:
  149. comp_exp_val_arr.append(cExprValue)
  150. comp_mapping[cExprValue] = cExprGeneName
  151. #write mapping into logs
  152. out2 = file(output2, 'w')
  153. #write mapping. 1st Preference to ALL
  154. if len(comp_exp_val_arr)>0:
  155. comp_exp_val_arr.sort()
  156. comp_exp_val_arr.reverse()
  157. out2.write("%s\t%s\n" % ("GeneName",sample_name ))
  158. i=0
  159. for (cevalue) in comp_exp_val_arr:
  160. i = i+1
  161. out2.write("%s\t%f" % (comp_mapping[cevalue], cevalue)) # maintains value & gene
  162. out2.write("\n")
  163. out2.close()
  164. def main():
  165. """called as
  166. <command interpreter="python">
  167. exptf.py '$expVal' '$i' '$cutoff' '$logmeta'
  168. </command>
  169. """
  170. nparm = 8
  171. appname = sys.argv[0]
  172. # print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
  173. if (len(sys.argv) < nparm):
  174. print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
  175. print 'usage: %s expression_values n out_file' % (appname)
  176. sys.exit(1)
  177. # short command line error
  178. title = sys.argv[1].strip()
  179. inputExprFilePath = sys.argv[2].strip()
  180. sampleName = int(sys.argv[3].strip())
  181. geneList1 = sys.argv[4].strip()
  182. refChoice = sys.argv[5].strip()
  183. geneList2 = sys.argv[6].strip()
  184. logFileName = sys.argv[7].strip()
  185. #create temp directory
  186. title = title.strip(' ').replace(' ',' ').replace(' ','_')
  187. tdir = tempfile.mkdtemp(prefix=title)
  188. title = title + '.pdf'
  189. #dirName = os.path.dirname(tdir)
  190. out1 = os.path.join(tdir, "out1.txt")
  191. out2 = os.path.join(tdir, "out2.txt")
  192. #remove temp files if already exists
  193. if os.path.exists(out1):
  194. os.remove(out1)
  195. if os.path.exists(out2):
  196. os.remove(out2)
  197. #create temp files
  198. output1_file = open(out1, "w")
  199. output1_file.close()
  200. output2_file = open(out2, "w")
  201. output2_file.close()
  202. #output1 = sys.argv[6].strip()
  203. #output2 = sys.argv[6].strip()
  204. print 'title:%s'% (title)
  205. print 'ExpressionFilePath:%s'% (inputExprFilePath)
  206. print 'Genes To be compared:%s'% (geneList1)
  207. print 'Choice:%s'% (refChoice)
  208. print 'Partial Genes:%s'% (geneList2)
  209. print 'logfilename:%s'% (logFileName)
  210. print 'output1:%s'% (out1)
  211. print 'output2:%s'% (out2)
  212. #write {gene expr_val) into tmp logfname
  213. #or
  214. #write {partial_gene expr_val) into tmp logFileName
  215. findExpression(inputExprFilePath, sampleName, geneList1, geneList2, refChoice, logFileName, out1, out2)
  216. # call R function & read return array
  217. rprogname = 'rplot'
  218. rcall = "%s('%s','%s','%s','%s','%s','%s')" % (rprogname,tdir,title,inputExprFilePath,logFileName,out1,out2)
  219. print '##rcall=',rcall
  220. p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
  221. p.communicate(rprog + "\n" + rcall)
  222. makenew = getFilenames(tdir, title)
  223. newfiles = makenew[0]
  224. newnames = makenew[1]
  225. newtypes = makenew[2]
  226. #Write PDF file path,name and type to Log file for plot_code reading. i.e, exec_after_process()
  227. logf = file(logFileName,'a')
  228. s = 'R code and call\n%s\n\n\nCalled as\n%s\n' % (rprog, rcall)
  229. logf.write(s)
  230. try:
  231. outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles[i],newnames[i],newtypes[i]) for i in range(len(newfiles))]
  232. except:
  233. outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles,newnames,newtypes),] # only 1
  234. tmpString = '\n%s\t%s\n' % (tmp, tdir)
  235. logf.write(tmpString)
  236. logf.write('\n'.join(outlist))
  237. logf.write('\n')
  238. logf.close()
  239. def getFilenames(tdir, title):
  240. return [[os.path.join(tdir, title)], [title], ['pdf']]
  241. if __name__ == "__main__":
  242. main()