PageRenderTime 28ms CodeModel.GetById 3ms app.highlight 20ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/expression/diffexp.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 421 lines | 400 code | 0 blank | 21 comment | 5 complexity | 2e37c81f41a8d604c3c52bbbb3f4ee5b MD5 | raw file
  1"""
  2differential_expression.py
  3A Galaxy tool based on the simplest dichotomous example in AffyExpress
  4documentation I could find. Use this as a prototype to write more
  5complex models.
  6This code called by the job runner, parameters as shown in the corresponding
  7xml tool descriptor.
  8
  9From ?regress
 10
 11regress             package:AffyExpress             R Documentation
 12
 13Run regression to fit genewise linear model
 14
 15Description:
 16
 17     Fit genewise linear model using LIMMA package, ordinary linear
 18     regression,  or permutation method.
 19
 20Usage:
 21
 22     regress(object, design, contrast, method, adj="none", permute.time=1000) 
 23
 24Arguments:
 25
 26  object: an "ExpressionSet"
 27
 28  design: design matrix from the make.design function
 29
 30contrast: contrast matrix from the make.contrast function
 31
 32  method: Three methods are supported by this function: "L" for using
 33          LIMMA method - compute moderated t-statistics and log-odds 
 34          of differential expression by empirical Bayes shrinkage of
 35          the standard  errors towards a common value,   "F" for using
 36          ordinary linear regression,  "P" for permuation test by
 37          resampling the phenotype
 38
 39     adj: adjustment method for multiple comparison test, including
 40          "holm",  "hochberg", "hommel", "bonferroni", "BH", "BY",
 41          "fdr", "none".  The default value is "none". Type
 42          help(p.adjust) for more detail.
 43
 44permute.time: number of permutation times, only used for the
 45          permutation  method.
 46
 47:
 48Value:
 49
 50     A dataframe contains rows for all the genes from object and the
 51     following columns: ID(probeid); Log2Ratio (estimate of the effect
 52     or the contrast,  on the log2 scale);  F (F statistics); P.Value
 53     (raw p-value); adj.P.Value (adjusted  p-value or q-value)
 54
 55Author(s):
 56
 57     Xiwei Wu xwu@coh.org, Xuejun Arthur Li xueli@coh.org
 58
 59References:
 60
 61     Smyth, G.K. (2005) Limma: linear models for microarray data. In:
 62     Bioinformatics and Computational Biology Solutions using R and 
 63     Bioconductor, R. Gentleman,  V. Carey, S. Dudoit, R. Irizarry,  W.
 64     Huber (eds.), Springer, New York, pages 397-420
 65
 66Examples:
 67
 68     data(testData)
 69     normaldata<-pre.process("rma",testData)
 70
 71     ## Create design matrix
 72     design<-make.design(pData(normaldata), "group")
 73
 74     ## Create contrast matrix - Compare group "A" vs. "C"
 75     contrast<-make.contrast(design, "A", "C")
 76
 77     ## Identify differentially expressed gene by using LIMMA method
 78     result<-regress(normaldata, design, contrast, "L")
 79
 80
 81 AffyQA(parameters = c("estrogen", "time.h"), raw = raw)
 82
 83  result.wrapper <- AffyRegress(normal.data = filtered, cov = "estrogen",
 84+     compare1 = "present", compare2 = "absent", method = "L",
 85+     adj = "fdr", p.value = 0.05, m.value = log2(1.5))
 86
 87 target<-data.frame(drug=(c(rep("A",4),rep("B",4),rep("C",4))), 
 88     gender=factor(c(rep("M",6),rep("F",6))), 
 89     group=factor(rep(c(1,2,3),4)))
 90
 91     # Example1:  Compare drug "A" vs. "B"
 92     design1<-make.design(target, "drug")
 93     contrast1<-make.contrast(design1, "A", "B")
 94
 95     # Example2:  Compare drug "A" vs. "B", adjusting for "group" variable
 96     design2<-make.design(target, c("drug","group"))
 97     contrast2<-make.contrast(design2, "A", "B")
 98
 99     # Example3:  Suppose you are interested in "drug", "group" interaction
100     design3<-make.design(target, c("drug","group"), int=c(1,2))
101     contrast3<-make.contrast(design3, interaction=TRUE)
102
103     # Example4:  Compare drug "A" vs. "B" among "male"
104     # Notice that you must use an design matrix containing an interaction term
105     design4<-make.design(target, c("drug","gender"), int=c(1,2))
106     contrast4<-make.contrast(design4, "A", "B", "M")
107
108    
109"""
110import sys, StringIO, string, os, tempfile, shutil, time, subprocess
111from xml.etree.ElementTree import ElementTree
112from com.lilly.cistrome import htmlParser
113
114galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
115<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
116<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
117<head>
118<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
119<meta name="generator" content="Galaxy %s tool output - see http://galaxyproject.org/" />
120<title></title>
121<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
122</head>
123<body>
124<div class="document">
125"""
126
127galhtmlpostfix = """</div></body></html>"""
128# eg
129# library(AffyExpress)
130# load( "Golub_Merge.eset")
131
132#  reFunc('test.log', "Golub_Merge.eset" , "golub", '.','eset','rma','Gender','F','M','L','fdr','0.001','1.5','test.html',0)
133#  reFunc('test.log', "ETABM25.affyBatch" , "etabm25", '.','eset','rma','FactorValue..Age.','<40','>40','L','fdr','0.001','1.5','test.html',0)
134
135
136
137reFunc="""reFunc <- function(logfpath,infpath,infname,tDir,inftype,norm_method,phename,pheval1,
138pheval2,anntn, samples,groups,meth,padj,pthresh,mthresh,outhtmlName,permutations)
139{
140sink(file=file(logfpath, "at"), type="message")
141sink(file=file(logfpath, "at"), type="output")
142library("AffyExpress")
143library("affy")
144#x = load(infpath)
145#eset = get(x)
146print ("***==================***")
147print ("infpath: ")
148print (infpath)
149print (samples)
150print (groups)
151print (anntn)
152print ("***==================***")
153
154exprs <- as.matrix(read.table(infpath, sep = "\t",  header = TRUE, row.names = 1, as.is = TRUE))
155eset <- new("ExpressionSet", exprs=exprs)
156
157### Download annotation for a chip we have not seen before
158x <-(sub(' ', '',(paste(anntn,".db"))))
159if (x  == "fly.db0.db"){x="fly.db0"}
160
161if (! require(x,character.only=T)){
162    source("http://bioconductor.org/biocLite.R")
163    biocLite(x)
164    library(x,character.only=T, logical.return=T)
165}
166
167if ((!is.null(samples)) && samples != '' && (!is.null(groups)) && groups != '') {
168    sampleList <- strsplit(samples, ",")
169    samples <- unlist(sampleList)
170    groupList <- strsplit(groups, ",")
171    groups <- unlist(groupList)
172    dd <- data.frame(Name=samples, Group=groups, row.names=1)
173    pData(eset) <- dd
174    phe <- pData(eset)
175    annotation(eset) <- anntn
176}
177cdir = getwd()
178setwd(tDir)
179outfiles=c()
180outlabs=c()
181
182if (inftype == 'affybatch')
183    {
184        # this makes AffyQA.html and some low res png's 
185        # TODO MAKE PDF's or better resolution so the labels are legible for complex data
186        AffyQA(parameters = phename, raw = eset)
187        outfiles = c(outfiles,'AffyQA')
188        outlabs = c(outlabs,paste('Output from AffyQA run on',phename))
189        normaldata <- pre.process(method = norm_method, raw = eset)
190        eset <- normaldata
191    }
192## Create design matrix
193cat('Phenotype names =')
194cat(names(phe))
195cat('\n')
196design<-make.design(phe,phename)
197cat(paste('design for',phename,'=\n'))
198cat(design)
199cat('\n')
200## Create contrast matrix - Compare group "A" vs. "C"
201contrast<-make.contrast(design,pheval1,pheval2) # should be the list of values
202cat(paste('contrast for',pheval1,pheval2,'=\n'))
203cat(contrast)
204cat('\n')
205## Identify differentially expressed gene by using LIMMA method
206if (meth == 'P')
207    {
208    result<-regress(eset, design=design, contrast=contrast, method=meth, adj=padj, 
209    permute.time=permutations)
210    }
211else
212    {
213    result<-regress(eset, design=design, contrast=contrast, method=meth, adj=padj)
214    }  
215cdfname = annotation(eset)
216cdfname = paste(cdfname, "db", sep=".")
217if (cdfname=="fly.db0.db"){cdfname="fly.db0"}
218select <- select.sig.gene(top.table = result, p.value = pthresh,m.value = log2(mthresh))
219result2html(cdf.name = cdfname, result = select,filename = outhtmlName)  
220outfiles = c(outfiles,outhtmlName)
221s = paste('Top table for',infname,'pheno=',phename,'contrast=',pheval1,'vs',pheval2,'method=',meth,
222'pthresh=',pthresh,'foldthresh=',mthresh,'adj=',padj)
223outlabs = c(outlabs,s)
224setwd(cdir) 
225return(list(outlabs=outlabs,outfiles=outfiles))
226}
227"""
228
229
230
231def timenow():
232    """return current time as a string
233    """
234    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
235
236mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names
237
238def makePheno(groupOne, groupTwo, logf):
239  logf.write("\nIn makePheno, g1: %s\ng2: %s\n" % (groupOne, groupTwo)) 
240  if (not groupOne and not groupTwo): 
241    return ("", "")
242  elif (groupTwo == ""):
243    return extractSamplesGroups(groupOne, logf)
244  else:
245    groupOneList = groupOne.split(',')
246    groupTwoList = groupTwo.split(',')
247    #if (len(groupOneList) <> len(groupTwoList)):
248    #  return ("", "")
249
250    zeros = []
251    ones  = []
252    for i in range(0, len(groupOneList)):
253      zeros.append("0")
254    for j in range(0, len(groupTwoList)):
255      ones.append("1")
256
257    return (",".join(groupOneList + groupTwoList), ",".join(zeros + ones))
258
259def extractSamplesGroups(phenoStr, logf):
260
261  delimiter = "  "
262  phelist = phenoStr.split(delimiter);
263  phelist = phelist[1:]
264  logf.write("\nphelist: %s\n" % (phelist))
265  samples = []
266  groups = []
267  for pheStr in phelist:
268    row = pheStr.split('\t')
269    samples.append(row[0])
270    groups.append(row[1])
271  
272  logf.write("\n\nphenoStr: %s, \nSamples: %s\nGroups: %s\n" % (phenoStr, samples, groups))
273  return (','.join(samples), ','.join(groups))
274
275def extractSampleSize(infpath):
276  infile = open(infpath)
277  header = infile.readline().strip()
278#  samples = shlex.split(header)
279  samples = header.split('\t')
280  if samples[0]=="":
281    samples = samples[1:]
282  infile.close()
283  return len(samples)
284
285def verifySelectedGroupSize(groups, phenoSize):
286  groupsList = groups.split(',')
287  if (len(groupsList) != phenoSize):
288    print "Please select a total of %d samples for Group 1 and Group 2" %(phenoSize)
289    #raise Exception("Please select a total of %d item from Group 1 and Group 2" %(phenoSize))
290    sys.exit(-1)
291
292def refineHtmlForGene(htmlFilename):
293  try:
294    infile = open(htmlFilename)
295    html = htmlParser.makeWellForm(infile)
296    infile.close()
297
298    tree = ElementTree()
299    tree.parse(StringIO.StringIO(html))
300    toModifyHtml = htmlParser.shouldModifyHtml(tree)
301
302    if toModifyHtml == True:
303      htmlParser.constructGeneHtml(tree, htmlFilename)
304  except IOError:
305    pass
306
307def main():
308    """<command interpreter="python">
309    diffexp.py '$rexprIn' '$title' '$outfile' '$rexprIn.name' '$rexprIn.extension' '$phecols' '$annotation' '$groupOne' '$groupTwo' '$method' '$permutations' '$padj' '$pthresh' '$mthresh'  '$outfile.extra_files_path' '$txtoutfile'
310    </command>
311    """
312    sep = 'XXX'
313    nparm = 16
314    appname = sys.argv[0]
315    if (len(sys.argv) < nparm):
316        print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv))
317        sys.exit(1)
318        # short command line error
319
320    loghandle,logfname = tempfile.mkstemp(text='T')
321    logf = file(logfname,'a')
322
323    appname = sys.argv[0]
324    infpath = sys.argv[1].strip()
325    phenoSize = extractSampleSize(infpath)
326    title = sys.argv[2].strip()
327    outhtml = sys.argv[3].strip() 
328    infname = sys.argv[4].strip()
329    infname = infname.replace('.','_')
330    inftype = sys.argv[5].strip()
331    phecols = sys.argv[6].strip() # columns for design/contrast
332    annotation = sys.argv[7].strip()
333    groupOne = sys.argv[8].strip()
334    groupTwo = sys.argv[9].strip()
335    (samples, groups) = makePheno(groupOne, groupTwo, logf)
336    verifySelectedGroupSize(groups, phenoSize)
337    logf.write( "***==========***\ninfpath: %s, infname:%s\ngroup1: %s\ngroup2: %s\nphenoSize: %d" % (infpath, infname, groupOne, groupTwo, phenoSize))
338    # these are created as
339    # rets = '%s~~!~~%s' % (head[useCols[i]],'~~~~~'.join(retc)) # for design
340    phecols = phecols.split(sep)
341    phename = phecols[0] # first is name
342    phevals = phecols[1:] # rest are the values
343    meth = sys.argv[10].strip() # L,F or P
344    permutations = sys.argv[11].strip()
345    try:
346        p = int(permutations)
347    except:
348        permutations = '0'
349    padj = sys.argv[12].strip() # fdr etc
350    pthresh = sys.argv[13].strip()
351    mthresh = sys.argv[14].strip()
352    try:
353        p = float(pthresh)
354    except:
355        pthresh = '1.0' # all
356    try:
357        m = float(mthresh)
358    except:
359        mthresh = '0.0' # all
360    outpath = sys.argv[15].strip() 
361    txtoutfile = sys.argv[16].strip()
362    norm_method='rma'    
363    replace = string.whitespace + string.punctuation
364    ttab = string.maketrans(replace,'_'*len(replace))
365    title = string.translate(title,ttab)
366    #loghandle,logfname = tempfile.mkstemp(text='T')
367    #logf = file(logfname,'a')
368    logf.write('# %s - part of the Rexpression Galaxy toolkit http://esphealth.org/trac/rgalaxy\n' % (appname))
369    logf.write('# Job %s Got parameters %s\n' % (title,' '.join(sys.argv)))
370    try:
371        os.makedirs(outpath) # not made yet?
372        logf.write('### html file path %s made\n' % outpath)
373    except:
374        logf.write('### html file path %s already exists\n' % outpath)
375    logf.close()
376    outhtmlname = 'Dichotomous_%s_%s' % (phename,infname)
377    rcall = "reFunc('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s',%s,%s,'%s',%s)" % (logfname,infpath,infname, outpath,inftype,'rma', phename,phevals[0].strip(),phevals[1].strip(),annotation, samples,groups,meth,padj,pthresh,mthresh,outhtmlname,permutations)
378    p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE)
379    p.communicate(reFunc + "\n" + rcall)
380    logf = file(logfname,'a')
381    logf.write('R code follows:\n%s\n' % reFunc)
382    logf.write('Called as %s\n\n' % rcall)
383    outfiles = outhtmlname
384    outlabs = getHTMLTitle(infname, phename, phevals[0].strip(), phevals[1].strip(), meth, pthresh, mthresh, padj)
385    logf.write('## R call got outfiles %s and outlabs %s\n' % (outfiles,outlabs))
386    html = []
387    html.append(galhtmlprefix)
388    html.append('<h2>Output from job title %s run at %s</h2><ul>' % (title,timenow()))
389    if type(outfiles) == type('string'):
390        fn = outpath + "/" + outfiles + ".html"
391        logf.write("%s\t%s\t%s\n" % (mng, fn, title))
392	refineHtmlForGene(fn)
393
394        if (os.path.exists(fn)):
395            html.append('<li><a href="%s.html">%s</a></li>' % (outfiles,outlabs))
396            txt = file(fn, 'r').readlines()
397        else:
398            txt = ["No differentially expressed gene found in %s at p-value %s" % (infname, pthresh)]
399        for (line) in txt:
400            logf.write("%s\n" % (line))
401
402    else: # got lists
403        for i,outfn in enumerate(outfiles):
404            lab = outlabs[i]
405            html.append('<li><a href="%s.html">%s</a></li>' % (outfn,lab))
406    logf.close()
407    html.append('</ul>')
408    l = file(logfname,'r').read()
409    html.append('<hr><h2>Job log:</h2><hr><pre>%s</pre><hr><h5>Tool was %s</h5>' % (l,appname)) # show log
410    os.unlink(logfname) 
411    html.append(galhtmlpostfix)
412    outf = file(outhtml,'w')
413    outf.write('\n'.join(html))
414    outf.close()
415
416def getHTMLTitle(infname, phename, pheval1, pheval2, meth, pthresh, mthresh, padj):
417    return "Top table for " + infname + ", pheno=" + phename + ", contrast=" + pheval1 + " vs " + pheval2 + ", method=" + meth + ", pthresh=" + pthresh + ", mthresh=" + mthresh + ", adj=" + padj
418
419if __name__ == "__main__":
420    main()
421