/tools/expression/diffexp.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 421 lines · 355 code · 29 blank · 37 comment · 24 complexity · 2e37c81f41a8d604c3c52bbbb3f4ee5b MD5 · raw file
- """
- differential_expression.py
- A Galaxy tool based on the simplest dichotomous example in AffyExpress
- documentation I could find. Use this as a prototype to write more
- complex models.
- This code called by the job runner, parameters as shown in the corresponding
- xml tool descriptor.
- From ?regress
- regress package:AffyExpress R Documentation
- Run regression to fit genewise linear model
- Description:
- Fit genewise linear model using LIMMA package, ordinary linear
- regression, or permutation method.
- Usage:
- regress(object, design, contrast, method, adj="none", permute.time=1000)
- Arguments:
- object: an "ExpressionSet"
- design: design matrix from the make.design function
- contrast: contrast matrix from the make.contrast function
- method: Three methods are supported by this function: "L" for using
- LIMMA method - compute moderated t-statistics and log-odds
- of differential expression by empirical Bayes shrinkage of
- the standard errors towards a common value, "F" for using
- ordinary linear regression, "P" for permuation test by
- resampling the phenotype
- adj: adjustment method for multiple comparison test, including
- "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
- "fdr", "none". The default value is "none". Type
- help(p.adjust) for more detail.
- permute.time: number of permutation times, only used for the
- permutation method.
- :
- Value:
- A dataframe contains rows for all the genes from object and the
- following columns: ID(probeid); Log2Ratio (estimate of the effect
- or the contrast, on the log2 scale); F (F statistics); P.Value
- (raw p-value); adj.P.Value (adjusted p-value or q-value)
- Author(s):
- Xiwei Wu xwu@coh.org, Xuejun Arthur Li xueli@coh.org
- References:
- Smyth, G.K. (2005) Limma: linear models for microarray data. In:
- Bioinformatics and Computational Biology Solutions using R and
- Bioconductor, R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, W.
- Huber (eds.), Springer, New York, pages 397-420
- Examples:
- data(testData)
- normaldata<-pre.process("rma",testData)
- ## Create design matrix
- design<-make.design(pData(normaldata), "group")
- ## Create contrast matrix - Compare group "A" vs. "C"
- contrast<-make.contrast(design, "A", "C")
- ## Identify differentially expressed gene by using LIMMA method
- result<-regress(normaldata, design, contrast, "L")
- AffyQA(parameters = c("estrogen", "time.h"), raw = raw)
- result.wrapper <- AffyRegress(normal.data = filtered, cov = "estrogen",
- + compare1 = "present", compare2 = "absent", method = "L",
- + adj = "fdr", p.value = 0.05, m.value = log2(1.5))
- target<-data.frame(drug=(c(rep("A",4),rep("B",4),rep("C",4))),
- gender=factor(c(rep("M",6),rep("F",6))),
- group=factor(rep(c(1,2,3),4)))
- # Example1: Compare drug "A" vs. "B"
- design1<-make.design(target, "drug")
- contrast1<-make.contrast(design1, "A", "B")
- # Example2: Compare drug "A" vs. "B", adjusting for "group" variable
- design2<-make.design(target, c("drug","group"))
- contrast2<-make.contrast(design2, "A", "B")
- # Example3: Suppose you are interested in "drug", "group" interaction
- design3<-make.design(target, c("drug","group"), int=c(1,2))
- contrast3<-make.contrast(design3, interaction=TRUE)
- # Example4: Compare drug "A" vs. "B" among "male"
- # Notice that you must use an design matrix containing an interaction term
- design4<-make.design(target, c("drug","gender"), int=c(1,2))
- contrast4<-make.contrast(design4, "A", "B", "M")
-
- """
- import sys, StringIO, string, os, tempfile, shutil, time, subprocess
- from xml.etree.ElementTree import ElementTree
- from com.lilly.cistrome import htmlParser
- galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
- <head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
- <meta name="generator" content="Galaxy %s tool output - see http://galaxyproject.org/" />
- <title></title>
- <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
- </head>
- <body>
- <div class="document">
- """
- galhtmlpostfix = """</div></body></html>"""
- # eg
- # library(AffyExpress)
- # load( "Golub_Merge.eset")
- # reFunc('test.log', "Golub_Merge.eset" , "golub", '.','eset','rma','Gender','F','M','L','fdr','0.001','1.5','test.html',0)
- # reFunc('test.log', "ETABM25.affyBatch" , "etabm25", '.','eset','rma','FactorValue..Age.','<40','>40','L','fdr','0.001','1.5','test.html',0)
- reFunc="""reFunc <- function(logfpath,infpath,infname,tDir,inftype,norm_method,phename,pheval1,
- pheval2,anntn, samples,groups,meth,padj,pthresh,mthresh,outhtmlName,permutations)
- {
- sink(file=file(logfpath, "at"), type="message")
- sink(file=file(logfpath, "at"), type="output")
- library("AffyExpress")
- library("affy")
- #x = load(infpath)
- #eset = get(x)
- print ("***==================***")
- print ("infpath: ")
- print (infpath)
- print (samples)
- print (groups)
- print (anntn)
- print ("***==================***")
- exprs <- as.matrix(read.table(infpath, sep = "\t", header = TRUE, row.names = 1, as.is = TRUE))
- eset <- new("ExpressionSet", exprs=exprs)
- ### Download annotation for a chip we have not seen before
- x <-(sub(' ', '',(paste(anntn,".db"))))
- if (x == "fly.db0.db"){x="fly.db0"}
- if (! require(x,character.only=T)){
- source("http://bioconductor.org/biocLite.R")
- biocLite(x)
- library(x,character.only=T, logical.return=T)
- }
- if ((!is.null(samples)) && samples != '' && (!is.null(groups)) && groups != '') {
- sampleList <- strsplit(samples, ",")
- samples <- unlist(sampleList)
- groupList <- strsplit(groups, ",")
- groups <- unlist(groupList)
- dd <- data.frame(Name=samples, Group=groups, row.names=1)
- pData(eset) <- dd
- phe <- pData(eset)
- annotation(eset) <- anntn
- }
- cdir = getwd()
- setwd(tDir)
- outfiles=c()
- outlabs=c()
- if (inftype == 'affybatch')
- {
- # this makes AffyQA.html and some low res png's
- # TODO MAKE PDF's or better resolution so the labels are legible for complex data
- AffyQA(parameters = phename, raw = eset)
- outfiles = c(outfiles,'AffyQA')
- outlabs = c(outlabs,paste('Output from AffyQA run on',phename))
- normaldata <- pre.process(method = norm_method, raw = eset)
- eset <- normaldata
- }
- ## Create design matrix
- cat('Phenotype names =')
- cat(names(phe))
- cat('\n')
- design<-make.design(phe,phename)
- cat(paste('design for',phename,'=\n'))
- cat(design)
- cat('\n')
- ## Create contrast matrix - Compare group "A" vs. "C"
- contrast<-make.contrast(design,pheval1,pheval2) # should be the list of values
- cat(paste('contrast for',pheval1,pheval2,'=\n'))
- cat(contrast)
- cat('\n')
- ## Identify differentially expressed gene by using LIMMA method
- if (meth == 'P')
- {
- result<-regress(eset, design=design, contrast=contrast, method=meth, adj=padj,
- permute.time=permutations)
- }
- else
- {
- result<-regress(eset, design=design, contrast=contrast, method=meth, adj=padj)
- }
- cdfname = annotation(eset)
- cdfname = paste(cdfname, "db", sep=".")
- if (cdfname=="fly.db0.db"){cdfname="fly.db0"}
- select <- select.sig.gene(top.table = result, p.value = pthresh,m.value = log2(mthresh))
- result2html(cdf.name = cdfname, result = select,filename = outhtmlName)
- outfiles = c(outfiles,outhtmlName)
- s = paste('Top table for',infname,'pheno=',phename,'contrast=',pheval1,'vs',pheval2,'method=',meth,
- 'pthresh=',pthresh,'foldthresh=',mthresh,'adj=',padj)
- outlabs = c(outlabs,s)
- setwd(cdir)
- return(list(outlabs=outlabs,outfiles=outfiles))
- }
- """
- def timenow():
- """return current time as a string
- """
- return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
- mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
- def makePheno(groupOne, groupTwo, logf):
- logf.write("\nIn makePheno, g1: %s\ng2: %s\n" % (groupOne, groupTwo))
- if (not groupOne and not groupTwo):
- return ("", "")
- elif (groupTwo == ""):
- return extractSamplesGroups(groupOne, logf)
- else:
- groupOneList = groupOne.split(',')
- groupTwoList = groupTwo.split(',')
- #if (len(groupOneList) <> len(groupTwoList)):
- # return ("", "")
- zeros = []
- ones = []
- for i in range(0, len(groupOneList)):
- zeros.append("0")
- for j in range(0, len(groupTwoList)):
- ones.append("1")
- return (",".join(groupOneList + groupTwoList), ",".join(zeros + ones))
- def extractSamplesGroups(phenoStr, logf):
- delimiter = " "
- phelist = phenoStr.split(delimiter);
- phelist = phelist[1:]
- logf.write("\nphelist: %s\n" % (phelist))
- samples = []
- groups = []
- for pheStr in phelist:
- row = pheStr.split('\t')
- samples.append(row[0])
- groups.append(row[1])
-
- logf.write("\n\nphenoStr: %s, \nSamples: %s\nGroups: %s\n" % (phenoStr, samples, groups))
- return (','.join(samples), ','.join(groups))
- def extractSampleSize(infpath):
- infile = open(infpath)
- header = infile.readline().strip()
- # samples = shlex.split(header)
- samples = header.split('\t')
- if samples[0]=="":
- samples = samples[1:]
- infile.close()
- return len(samples)
- def verifySelectedGroupSize(groups, phenoSize):
- groupsList = groups.split(',')
- if (len(groupsList) != phenoSize):
- print "Please select a total of %d samples for Group 1 and Group 2" %(phenoSize)
- #raise Exception("Please select a total of %d item from Group 1 and Group 2" %(phenoSize))
- sys.exit(-1)
- def refineHtmlForGene(htmlFilename):
- try:
- infile = open(htmlFilename)
- html = htmlParser.makeWellForm(infile)
- infile.close()
- tree = ElementTree()
- tree.parse(StringIO.StringIO(html))
- toModifyHtml = htmlParser.shouldModifyHtml(tree)
- if toModifyHtml == True:
- htmlParser.constructGeneHtml(tree, htmlFilename)
- except IOError:
- pass
- def main():
- """<command interpreter="python">
- diffexp.py '$rexprIn' '$title' '$outfile' '$rexprIn.name' '$rexprIn.extension' '$phecols' '$annotation' '$groupOne' '$groupTwo' '$method' '$permutations' '$padj' '$pthresh' '$mthresh' '$outfile.extra_files_path' '$txtoutfile'
- </command>
- """
- sep = 'XXX'
- nparm = 16
- appname = sys.argv[0]
- if (len(sys.argv) < nparm):
- print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
- sys.exit(1)
- # short command line error
- loghandle,logfname = tempfile.mkstemp(text='T')
- logf = file(logfname,'a')
- appname = sys.argv[0]
- infpath = sys.argv[1].strip()
- phenoSize = extractSampleSize(infpath)
- title = sys.argv[2].strip()
- outhtml = sys.argv[3].strip()
- infname = sys.argv[4].strip()
- infname = infname.replace('.','_')
- inftype = sys.argv[5].strip()
- phecols = sys.argv[6].strip() # columns for design/contrast
- annotation = sys.argv[7].strip()
- groupOne = sys.argv[8].strip()
- groupTwo = sys.argv[9].strip()
- (samples, groups) = makePheno(groupOne, groupTwo, logf)
- verifySelectedGroupSize(groups, phenoSize)
- logf.write( "***==========***\ninfpath: %s, infname:%s\ngroup1: %s\ngroup2: %s\nphenoSize: %d" % (infpath, infname, groupOne, groupTwo, phenoSize))
- # these are created as
- # rets = '%s~~!~~%s' % (head[useCols[i]],'~~~~~'.join(retc)) # for design
- phecols = phecols.split(sep)
- phename = phecols[0] # first is name
- phevals = phecols[1:] # rest are the values
- meth = sys.argv[10].strip() # L,F or P
- permutations = sys.argv[11].strip()
- try:
- p = int(permutations)
- except:
- permutations = '0'
- padj = sys.argv[12].strip() # fdr etc
- pthresh = sys.argv[13].strip()
- mthresh = sys.argv[14].strip()
- try:
- p = float(pthresh)
- except:
- pthresh = '1.0' # all
- try:
- m = float(mthresh)
- except:
- mthresh = '0.0' # all
- outpath = sys.argv[15].strip()
- txtoutfile = sys.argv[16].strip()
- norm_method='rma'
- replace = string.whitespace + string.punctuation
- ttab = string.maketrans(replace,'_'*len(replace))
- title = string.translate(title,ttab)
- #loghandle,logfname = tempfile.mkstemp(text='T')
- #logf = file(logfname,'a')
- logf.write('# %s - part of the Rexpression Galaxy toolkit http://esphealth.org/trac/rgalaxy\n' % (appname))
- logf.write('# Job %s Got parameters %s\n' % (title,' '.join(sys.argv)))
- try:
- os.makedirs(outpath) # not made yet?
- logf.write('### html file path %s made\n' % outpath)
- except:
- logf.write('### html file path %s already exists\n' % outpath)
- logf.close()
- outhtmlname = 'Dichotomous_%s_%s' % (phename,infname)
- rcall = "reFunc('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s',%s,%s,'%s',%s)" % (logfname,infpath,infname, outpath,inftype,'rma', phename,phevals[0].strip(),phevals[1].strip(),annotation, samples,groups,meth,padj,pthresh,mthresh,outhtmlname,permutations)
- p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
- p.communicate(reFunc + "\n" + rcall)
- logf = file(logfname,'a')
- logf.write('R code follows:\n%s\n' % reFunc)
- logf.write('Called as %s\n\n' % rcall)
- outfiles = outhtmlname
- outlabs = getHTMLTitle(infname, phename, phevals[0].strip(), phevals[1].strip(), meth, pthresh, mthresh, padj)
- logf.write('## R call got outfiles %s and outlabs %s\n' % (outfiles,outlabs))
- html = []
- html.append(galhtmlprefix)
- html.append('<h2>Output from job title %s run at %s</h2><ul>' % (title,timenow()))
- if type(outfiles) == type('string'):
- fn = outpath + "/" + outfiles + ".html"
- logf.write("%s\t%s\t%s\n" % (mng, fn, title))
- refineHtmlForGene(fn)
- if (os.path.exists(fn)):
- html.append('<li><a href="%s.html">%s</a></li>' % (outfiles,outlabs))
- txt = file(fn, 'r').readlines()
- else:
- txt = ["No differentially expressed gene found in %s at p-value %s" % (infname, pthresh)]
- for (line) in txt:
- logf.write("%s\n" % (line))
- else: # got lists
- for i,outfn in enumerate(outfiles):
- lab = outlabs[i]
- html.append('<li><a href="%s.html">%s</a></li>' % (outfn,lab))
- logf.close()
- html.append('</ul>')
- l = file(logfname,'r').read()
- html.append('<hr><h2>Job log:</h2><hr><pre>%s</pre><hr><h5>Tool was %s</h5>' % (l,appname)) # show log
- os.unlink(logfname)
- html.append(galhtmlpostfix)
- outf = file(outhtml,'w')
- outf.write('\n'.join(html))
- outf.close()
- def getHTMLTitle(infname, phename, pheval1, pheval2, meth, pthresh, mthresh, padj):
- return "Top table for " + infname + ", pheno=" + phename + ", contrast=" + pheval1 + " vs " + pheval2 + ", method=" + meth + ", pthresh=" + pthresh + ", mthresh=" + mthresh + ", adj=" + padj
- if __name__ == "__main__":
- main()