/tools/expression/diffexp.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 421 lines · 355 code · 29 blank · 37 comment · 24 complexity · 2e37c81f41a8d604c3c52bbbb3f4ee5b MD5 · raw file

  1. """
  2. differential_expression.py
  3. A Galaxy tool based on the simplest dichotomous example in AffyExpress
  4. documentation I could find. Use this as a prototype to write more
  5. complex models.
  6. This code called by the job runner, parameters as shown in the corresponding
  7. xml tool descriptor.
  8. From ?regress
  9. regress package:AffyExpress R Documentation
  10. Run regression to fit genewise linear model
  11. Description:
  12. Fit genewise linear model using LIMMA package, ordinary linear
  13. regression, or permutation method.
  14. Usage:
  15. regress(object, design, contrast, method, adj="none", permute.time=1000)
  16. Arguments:
  17. object: an "ExpressionSet"
  18. design: design matrix from the make.design function
  19. contrast: contrast matrix from the make.contrast function
  20. method: Three methods are supported by this function: "L" for using
  21. LIMMA method - compute moderated t-statistics and log-odds
  22. of differential expression by empirical Bayes shrinkage of
  23. the standard errors towards a common value, "F" for using
  24. ordinary linear regression, "P" for permuation test by
  25. resampling the phenotype
  26. adj: adjustment method for multiple comparison test, including
  27. "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
  28. "fdr", "none". The default value is "none". Type
  29. help(p.adjust) for more detail.
  30. permute.time: number of permutation times, only used for the
  31. permutation method.
  32. :
  33. Value:
  34. A dataframe contains rows for all the genes from object and the
  35. following columns: ID(probeid); Log2Ratio (estimate of the effect
  36. or the contrast, on the log2 scale); F (F statistics); P.Value
  37. (raw p-value); adj.P.Value (adjusted p-value or q-value)
  38. Author(s):
  39. Xiwei Wu xwu@coh.org, Xuejun Arthur Li xueli@coh.org
  40. References:
  41. Smyth, G.K. (2005) Limma: linear models for microarray data. In:
  42. Bioinformatics and Computational Biology Solutions using R and
  43. Bioconductor, R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, W.
  44. Huber (eds.), Springer, New York, pages 397-420
  45. Examples:
  46. data(testData)
  47. normaldata<-pre.process("rma",testData)
  48. ## Create design matrix
  49. design<-make.design(pData(normaldata), "group")
  50. ## Create contrast matrix - Compare group "A" vs. "C"
  51. contrast<-make.contrast(design, "A", "C")
  52. ## Identify differentially expressed gene by using LIMMA method
  53. result<-regress(normaldata, design, contrast, "L")
  54. AffyQA(parameters = c("estrogen", "time.h"), raw = raw)
  55. result.wrapper <- AffyRegress(normal.data = filtered, cov = "estrogen",
  56. + compare1 = "present", compare2 = "absent", method = "L",
  57. + adj = "fdr", p.value = 0.05, m.value = log2(1.5))
  58. target<-data.frame(drug=(c(rep("A",4),rep("B",4),rep("C",4))),
  59. gender=factor(c(rep("M",6),rep("F",6))),
  60. group=factor(rep(c(1,2,3),4)))
  61. # Example1: Compare drug "A" vs. "B"
  62. design1<-make.design(target, "drug")
  63. contrast1<-make.contrast(design1, "A", "B")
  64. # Example2: Compare drug "A" vs. "B", adjusting for "group" variable
  65. design2<-make.design(target, c("drug","group"))
  66. contrast2<-make.contrast(design2, "A", "B")
  67. # Example3: Suppose you are interested in "drug", "group" interaction
  68. design3<-make.design(target, c("drug","group"), int=c(1,2))
  69. contrast3<-make.contrast(design3, interaction=TRUE)
  70. # Example4: Compare drug "A" vs. "B" among "male"
  71. # Notice that you must use an design matrix containing an interaction term
  72. design4<-make.design(target, c("drug","gender"), int=c(1,2))
  73. contrast4<-make.contrast(design4, "A", "B", "M")
  74. """
  75. import sys, StringIO, string, os, tempfile, shutil, time, subprocess
  76. from xml.etree.ElementTree import ElementTree
  77. from com.lilly.cistrome import htmlParser
  78. galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
  79. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  80. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  81. <head>
  82. <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  83. <meta name="generator" content="Galaxy %s tool output - see http://galaxyproject.org/" />
  84. <title></title>
  85. <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
  86. </head>
  87. <body>
  88. <div class="document">
  89. """
  90. galhtmlpostfix = """</div></body></html>"""
  91. # eg
  92. # library(AffyExpress)
  93. # load( "Golub_Merge.eset")
  94. # reFunc('test.log', "Golub_Merge.eset" , "golub", '.','eset','rma','Gender','F','M','L','fdr','0.001','1.5','test.html',0)
  95. # reFunc('test.log', "ETABM25.affyBatch" , "etabm25", '.','eset','rma','FactorValue..Age.','<40','>40','L','fdr','0.001','1.5','test.html',0)
  96. reFunc="""reFunc <- function(logfpath,infpath,infname,tDir,inftype,norm_method,phename,pheval1,
  97. pheval2,anntn, samples,groups,meth,padj,pthresh,mthresh,outhtmlName,permutations)
  98. {
  99. sink(file=file(logfpath, "at"), type="message")
  100. sink(file=file(logfpath, "at"), type="output")
  101. library("AffyExpress")
  102. library("affy")
  103. #x = load(infpath)
  104. #eset = get(x)
  105. print ("***==================***")
  106. print ("infpath: ")
  107. print (infpath)
  108. print (samples)
  109. print (groups)
  110. print (anntn)
  111. print ("***==================***")
  112. exprs <- as.matrix(read.table(infpath, sep = "\t", header = TRUE, row.names = 1, as.is = TRUE))
  113. eset <- new("ExpressionSet", exprs=exprs)
  114. ### Download annotation for a chip we have not seen before
  115. x <-(sub(' ', '',(paste(anntn,".db"))))
  116. if (x == "fly.db0.db"){x="fly.db0"}
  117. if (! require(x,character.only=T)){
  118. source("http://bioconductor.org/biocLite.R")
  119. biocLite(x)
  120. library(x,character.only=T, logical.return=T)
  121. }
  122. if ((!is.null(samples)) && samples != '' && (!is.null(groups)) && groups != '') {
  123. sampleList <- strsplit(samples, ",")
  124. samples <- unlist(sampleList)
  125. groupList <- strsplit(groups, ",")
  126. groups <- unlist(groupList)
  127. dd <- data.frame(Name=samples, Group=groups, row.names=1)
  128. pData(eset) <- dd
  129. phe <- pData(eset)
  130. annotation(eset) <- anntn
  131. }
  132. cdir = getwd()
  133. setwd(tDir)
  134. outfiles=c()
  135. outlabs=c()
  136. if (inftype == 'affybatch')
  137. {
  138. # this makes AffyQA.html and some low res png's
  139. # TODO MAKE PDF's or better resolution so the labels are legible for complex data
  140. AffyQA(parameters = phename, raw = eset)
  141. outfiles = c(outfiles,'AffyQA')
  142. outlabs = c(outlabs,paste('Output from AffyQA run on',phename))
  143. normaldata <- pre.process(method = norm_method, raw = eset)
  144. eset <- normaldata
  145. }
  146. ## Create design matrix
  147. cat('Phenotype names =')
  148. cat(names(phe))
  149. cat('\n')
  150. design<-make.design(phe,phename)
  151. cat(paste('design for',phename,'=\n'))
  152. cat(design)
  153. cat('\n')
  154. ## Create contrast matrix - Compare group "A" vs. "C"
  155. contrast<-make.contrast(design,pheval1,pheval2) # should be the list of values
  156. cat(paste('contrast for',pheval1,pheval2,'=\n'))
  157. cat(contrast)
  158. cat('\n')
  159. ## Identify differentially expressed gene by using LIMMA method
  160. if (meth == 'P')
  161. {
  162. result<-regress(eset, design=design, contrast=contrast, method=meth, adj=padj,
  163. permute.time=permutations)
  164. }
  165. else
  166. {
  167. result<-regress(eset, design=design, contrast=contrast, method=meth, adj=padj)
  168. }
  169. cdfname = annotation(eset)
  170. cdfname = paste(cdfname, "db", sep=".")
  171. if (cdfname=="fly.db0.db"){cdfname="fly.db0"}
  172. select <- select.sig.gene(top.table = result, p.value = pthresh,m.value = log2(mthresh))
  173. result2html(cdf.name = cdfname, result = select,filename = outhtmlName)
  174. outfiles = c(outfiles,outhtmlName)
  175. s = paste('Top table for',infname,'pheno=',phename,'contrast=',pheval1,'vs',pheval2,'method=',meth,
  176. 'pthresh=',pthresh,'foldthresh=',mthresh,'adj=',padj)
  177. outlabs = c(outlabs,s)
  178. setwd(cdir)
  179. return(list(outlabs=outlabs,outfiles=outfiles))
  180. }
  181. """
  182. def timenow():
  183. """return current time as a string
  184. """
  185. return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
  186. mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
  187. def makePheno(groupOne, groupTwo, logf):
  188. logf.write("\nIn makePheno, g1: %s\ng2: %s\n" % (groupOne, groupTwo))
  189. if (not groupOne and not groupTwo):
  190. return ("", "")
  191. elif (groupTwo == ""):
  192. return extractSamplesGroups(groupOne, logf)
  193. else:
  194. groupOneList = groupOne.split(',')
  195. groupTwoList = groupTwo.split(',')
  196. #if (len(groupOneList) <> len(groupTwoList)):
  197. # return ("", "")
  198. zeros = []
  199. ones = []
  200. for i in range(0, len(groupOneList)):
  201. zeros.append("0")
  202. for j in range(0, len(groupTwoList)):
  203. ones.append("1")
  204. return (",".join(groupOneList + groupTwoList), ",".join(zeros + ones))
  205. def extractSamplesGroups(phenoStr, logf):
  206. delimiter = " "
  207. phelist = phenoStr.split(delimiter);
  208. phelist = phelist[1:]
  209. logf.write("\nphelist: %s\n" % (phelist))
  210. samples = []
  211. groups = []
  212. for pheStr in phelist:
  213. row = pheStr.split('\t')
  214. samples.append(row[0])
  215. groups.append(row[1])
  216. logf.write("\n\nphenoStr: %s, \nSamples: %s\nGroups: %s\n" % (phenoStr, samples, groups))
  217. return (','.join(samples), ','.join(groups))
  218. def extractSampleSize(infpath):
  219. infile = open(infpath)
  220. header = infile.readline().strip()
  221. # samples = shlex.split(header)
  222. samples = header.split('\t')
  223. if samples[0]=="":
  224. samples = samples[1:]
  225. infile.close()
  226. return len(samples)
  227. def verifySelectedGroupSize(groups, phenoSize):
  228. groupsList = groups.split(',')
  229. if (len(groupsList) != phenoSize):
  230. print "Please select a total of %d samples for Group 1 and Group 2" %(phenoSize)
  231. #raise Exception("Please select a total of %d item from Group 1 and Group 2" %(phenoSize))
  232. sys.exit(-1)
  233. def refineHtmlForGene(htmlFilename):
  234. try:
  235. infile = open(htmlFilename)
  236. html = htmlParser.makeWellForm(infile)
  237. infile.close()
  238. tree = ElementTree()
  239. tree.parse(StringIO.StringIO(html))
  240. toModifyHtml = htmlParser.shouldModifyHtml(tree)
  241. if toModifyHtml == True:
  242. htmlParser.constructGeneHtml(tree, htmlFilename)
  243. except IOError:
  244. pass
  245. def main():
  246. """<command interpreter="python">
  247. diffexp.py '$rexprIn' '$title' '$outfile' '$rexprIn.name' '$rexprIn.extension' '$phecols' '$annotation' '$groupOne' '$groupTwo' '$method' '$permutations' '$padj' '$pthresh' '$mthresh' '$outfile.extra_files_path' '$txtoutfile'
  248. </command>
  249. """
  250. sep = 'XXX'
  251. nparm = 16
  252. appname = sys.argv[0]
  253. if (len(sys.argv) < nparm):
  254. print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
  255. sys.exit(1)
  256. # short command line error
  257. loghandle,logfname = tempfile.mkstemp(text='T')
  258. logf = file(logfname,'a')
  259. appname = sys.argv[0]
  260. infpath = sys.argv[1].strip()
  261. phenoSize = extractSampleSize(infpath)
  262. title = sys.argv[2].strip()
  263. outhtml = sys.argv[3].strip()
  264. infname = sys.argv[4].strip()
  265. infname = infname.replace('.','_')
  266. inftype = sys.argv[5].strip()
  267. phecols = sys.argv[6].strip() # columns for design/contrast
  268. annotation = sys.argv[7].strip()
  269. groupOne = sys.argv[8].strip()
  270. groupTwo = sys.argv[9].strip()
  271. (samples, groups) = makePheno(groupOne, groupTwo, logf)
  272. verifySelectedGroupSize(groups, phenoSize)
  273. logf.write( "***==========***\ninfpath: %s, infname:%s\ngroup1: %s\ngroup2: %s\nphenoSize: %d" % (infpath, infname, groupOne, groupTwo, phenoSize))
  274. # these are created as
  275. # rets = '%s~~!~~%s' % (head[useCols[i]],'~~~~~'.join(retc)) # for design
  276. phecols = phecols.split(sep)
  277. phename = phecols[0] # first is name
  278. phevals = phecols[1:] # rest are the values
  279. meth = sys.argv[10].strip() # L,F or P
  280. permutations = sys.argv[11].strip()
  281. try:
  282. p = int(permutations)
  283. except:
  284. permutations = '0'
  285. padj = sys.argv[12].strip() # fdr etc
  286. pthresh = sys.argv[13].strip()
  287. mthresh = sys.argv[14].strip()
  288. try:
  289. p = float(pthresh)
  290. except:
  291. pthresh = '1.0' # all
  292. try:
  293. m = float(mthresh)
  294. except:
  295. mthresh = '0.0' # all
  296. outpath = sys.argv[15].strip()
  297. txtoutfile = sys.argv[16].strip()
  298. norm_method='rma'
  299. replace = string.whitespace + string.punctuation
  300. ttab = string.maketrans(replace,'_'*len(replace))
  301. title = string.translate(title,ttab)
  302. #loghandle,logfname = tempfile.mkstemp(text='T')
  303. #logf = file(logfname,'a')
  304. logf.write('# %s - part of the Rexpression Galaxy toolkit http://esphealth.org/trac/rgalaxy\n' % (appname))
  305. logf.write('# Job %s Got parameters %s\n' % (title,' '.join(sys.argv)))
  306. try:
  307. os.makedirs(outpath) # not made yet?
  308. logf.write('### html file path %s made\n' % outpath)
  309. except:
  310. logf.write('### html file path %s already exists\n' % outpath)
  311. logf.close()
  312. outhtmlname = 'Dichotomous_%s_%s' % (phename,infname)
  313. rcall = "reFunc('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s',%s,%s,'%s',%s)" % (logfname,infpath,infname, outpath,inftype,'rma', phename,phevals[0].strip(),phevals[1].strip(),annotation, samples,groups,meth,padj,pthresh,mthresh,outhtmlname,permutations)
  314. p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
  315. p.communicate(reFunc + "\n" + rcall)
  316. logf = file(logfname,'a')
  317. logf.write('R code follows:\n%s\n' % reFunc)
  318. logf.write('Called as %s\n\n' % rcall)
  319. outfiles = outhtmlname
  320. outlabs = getHTMLTitle(infname, phename, phevals[0].strip(), phevals[1].strip(), meth, pthresh, mthresh, padj)
  321. logf.write('## R call got outfiles %s and outlabs %s\n' % (outfiles,outlabs))
  322. html = []
  323. html.append(galhtmlprefix)
  324. html.append('<h2>Output from job title %s run at %s</h2><ul>' % (title,timenow()))
  325. if type(outfiles) == type('string'):
  326. fn = outpath + "/" + outfiles + ".html"
  327. logf.write("%s\t%s\t%s\n" % (mng, fn, title))
  328. refineHtmlForGene(fn)
  329. if (os.path.exists(fn)):
  330. html.append('<li><a href="%s.html">%s</a></li>' % (outfiles,outlabs))
  331. txt = file(fn, 'r').readlines()
  332. else:
  333. txt = ["No differentially expressed gene found in %s at p-value %s" % (infname, pthresh)]
  334. for (line) in txt:
  335. logf.write("%s\n" % (line))
  336. else: # got lists
  337. for i,outfn in enumerate(outfiles):
  338. lab = outlabs[i]
  339. html.append('<li><a href="%s.html">%s</a></li>' % (outfn,lab))
  340. logf.close()
  341. html.append('</ul>')
  342. l = file(logfname,'r').read()
  343. html.append('<hr><h2>Job log:</h2><hr><pre>%s</pre><hr><h5>Tool was %s</h5>' % (l,appname)) # show log
  344. os.unlink(logfname)
  345. html.append(galhtmlpostfix)
  346. outf = file(outhtml,'w')
  347. outf.write('\n'.join(html))
  348. outf.close()
  349. def getHTMLTitle(infname, phename, pheval1, pheval2, meth, pthresh, mthresh, padj):
  350. return "Top table for " + infname + ", pheno=" + phename + ", contrast=" + pheval1 + " vs " + pheval2 + ", method=" + meth + ", pthresh=" + pthresh + ", mthresh=" + mthresh + ", adj=" + padj
  351. if __name__ == "__main__":
  352. main()