PageRenderTime 41ms CodeModel.GetById 30ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/expression/diffexp.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 239 lines | 205 code | 33 blank | 1 comment | 0 complexity | 4025db414e20677953e234645944e73b MD5 | raw file
  1<?xml version="1.0"?>
  2<tool id="differential_expression" name="Calculate differential expression">
  3<description>to find differential expressed genes between two conditions.
  4</description>
  5<code file="diffexp_code.py"/> 
  6<command interpreter="python">
  7#if $howToGetPheno.howToGetPheno_select=="file" #diffexp.py '$rexprIn' '$title' '$outfile' '$rexprIn.name' '$rexprIn.extension' '$howToGetPheno.phecols' '${rexprIn.metadata.dbkey}' '${rexprIn.metadata.pheno}' '$howToGetPheno.groupTwo' '$method' '$permutations' '$padj' '$pthresh' '$mthresh'  '$outfile.extra_files_path' '$txtoutfile'
  8
  9#else #diffexp.py '$rexprIn' '$title' '$outfile' '$rexprIn.name' '$rexprIn.extension' '$howToGetPheno.phecols' '$howToGetPheno.annotation' '$howToGetPheno.groupOne' '$howToGetPheno.groupTwo' '$method' '$permutations' '$padj' '$pthresh' '$mthresh'  '$outfile.extra_files_path' '$txtoutfile'
 10
 11#end if
 12</command>
 13<inputs>
 14  <page>
 15    <param name="title" label="Title to label job outputs" type="text" size="80" value="Differential expression analysis" />
 16    <param name="rexprIn" type="data" format="txt" 
 17	   label="Express set text chosen from your history (Required)" optional="false"
 18	   size="120" help="Choose an expression set text having samples in the columns and genes in the rows from your current history" />     
 19  </page>
 20  <page>
 21    <conditional name="howToGetPheno">
 22      <param name="howToGetPheno_select" type="select" label="How to classify dichotomous phenotype factors" dynamic_options="getHasPheno(rexprIn)"/>
 23      <when value="file">
 24	<param name="phecols" type="hidden" value="GroupXXX0XXX1"/>
 25	<param name="groupOne" type="hidden" value=""/>
 26	<param name="groupTwo" type="hidden" value=""/>
 27	<param name="annotation" type="hidden" value=""/>
 28      </when>
 29      <when value="manual">
 30	<param name="phecols" type="hidden" value="GroupXXX0XXX1"/>
 31	<param name="annotation" type="select" label="Annotation">
 32	  <option value="hgu133a" selected="True">Homo sapiens hgu133a</option>
 33	  <option value="hgu133b">Homo sapiens hgu133b</option>
 34	  <option value="hgu133plus2">Homo sapiens hgu133plus</option>
 35	  <option value="hgu95av2">Homo sapiens hgu95av2</option>
 36	
 37	  <option value="mouse430a2.db">Mouse 430a2</option>
 38	  <option value="celegans.db">C. elengans</option>
 39	  <option value="drosophila2.db">Drosophila2</option>
 40	  <option value="org.Hs.eg">org.Hs.eg</option>
 41	  <option value="org.Mm.eg">org.Mm.eg</option>
 42	  <option value="org.Ce.eg">org.Ce.eg</option>
 43	  <option value="org.Dm.eg">org.Dm.eg</option>
 44	</param>
 45	<param name="groupOne" type="select" multiple="true" label="Group 1 samples" dynamic_options="getSamples(rexprIn)"/>
 46	<param name="groupTwo" type="select" multiple="true" label="Group 2 samples" dynamic_options="getSamples(rexprIn)"/>
 47      </when>
 48      <when value="nothing" />
 49    </conditional>
 50    <!-- ##note trick needed to extract rexprIn from the conditional on the previous page##-->
 51    
 52    <param name="method" label="Regression method" type="select">
 53      <option value="L" selected="True">Limma (moderated t-statistics and log-odds 
 54      of differential expression by empirical Bayes shrinkage)</option>
 55      <option value="F">Ordinary least-squares regression</option>
 56      <option value="P">Permutation by phenotype resampling</option>
 57    </param>
 58    <param name="permutations" label="Number of permutations if using permutation method"
 59	   type="integer" value="1000" />
 60    <param name="padj" label="Type II error control method" type="select"
 61	   help='Control either family wise error or false discovery rate using routines from R p.adjust'>
 62      <option value="fdr" selected="True">False discovery rate</option>
 63      <option value="BY">Benjamini-Yukateli FDR</option>
 64      <option value="BH">Benjamini-Hochberg FDR</option>
 65      <option value="holm">Holm FWE control</option>
 66      <option value="hochberg">Hochberg FWE control</option>
 67      <option value="bonferroni">Bonferroni FWE control (Overconservative if data correlated)</option>
 68      <option value="none">None (NOT recommended!)</option>
 69    </param>
 70    <param name="pthresh" label="Maximum P-value threshold to report (1.0 to report all)" type="float" value="0.01">
 71      <validator type="in_range" max="1.0" min="0" message="width is out of range, width has to be between 0 to 1.0" />
 72    </param>
 73    <param name="mthresh" label="Minimum fold-change value threshold to report" type="float" value="1.2">
 74      <validator type="in_range" max="10" min="1.2" message="width is out of range, width has to be between 1.2 to 10" />
 75    </param>
 76  </page>
 77</inputs>
 78
 79<outputs>  
 80    <data format='html' name="outfile" />
 81    <data format='txt' name="txtoutfile" />
 82</outputs>
 83
 84 
 85
 86<help>
 87
 88
 89**Syntax**
 90
 91- **Title** is used to name the output files - so make it meaningful
 92- **Data set to use** an expression index text file from your history
 93- **Method** LIMMA method - compute moderated t-statistics and log-odds of differential expression by empirical Bayes shrinkage ofthe standard errors towards a common value, using ordinary linea regression, permuation test by resampling the phenotype
 94- **Permutations** Number of permutations - ignored unless using permutation method
 95- **Type II error control** FDR, one of the specific methods from the R stats p.adjust function
 96- **P threshold** Sets the adjusted p value below which genes will be reported in the output "top table"
 97- **Fold-change threshold** Sets the fold change (it will be log2 transformed) above which genes will be reported in the output "top table"
 98
 99-----
100
101**Summary**
102
103Given a normalized .expression set as input, this tool will calculate differential expression of genes in 2 groups  analysis, group classification will be done by the "Group" column defined in the phenotype file (pheno.txt) provided together with the raw data ( see Upload Affymetrix .CEL or NimbleGen .XYS files from your computer1. Upload Gene Expression data), or by manually selecting which samples belong to each group.
104
105The user can also upload any standard tab delimited file containing expression values having gene/probes IDs in the rows and samples in the .columns, see example bellow::
106
107                a.cel   b.cel   c.cel   d.cel
108  1053_at	5.9716  5.4552  5.7371	5.6868
109  117_at	6.1563	6.2173	6.3502	6.3966
110  121_at	8.7508	9.2037	8.7661	9.1232
111  1255_g_at	4.3780	4.9647	4.4988	5.0353
112  1294_at	6.4600	6.4432	6.3925	6.7555
113  1316_at	5.1460	5.6332	5.3307	5.6146
114  1320_at	5.0362	5.1392	5.0129	5.1335
115  1405_i_at	3.8925	4.1668	3.9669	4.1081
116  1431_at	3.6796	3.9467	3.7574	4.1612
117  1438_at	7.5160	7.7695	7.3263	7.5756
118  1487_at	7.1885	7.2374	7.2611	7.4751
119  1494_f_at	6.3009	7.1231	6.8411	7.0828
120  1598_g_at	9.1183	9.0740	9.0078	8.9683
121  160020_at	7.4241	8.0714	7.5623	7.8344
122  1729_at	7.0726	7.1000	6.8014	7.1348
123  1773_at	6.0027	5.7266	5.5894	5.6914
124
125
126Different regression methods are available to determine differentially expressed genes: Limma moderated t-test, ordinary least-squares and permutation by resampling. Since Microarray experiments generate large multiplicity problems in which thousands of hypothesis (is gene x differentially expressed between the two groups) are tested simultaneously, correction for false positive (type I errors) and false negative (type 2) errors may be performed by using one of the available methods (Bonferroni and Benjamini FDR). The output from this tool will be a text file (.txt) containing a set of differentially expressed genes, fold-change ( Log2 transformed)  and p-values of differential expression. This file that can be saved or used as an input for the next step in the pipe-line(e.g. Find highest expressed TFs tool).
127This tool implements the AffyExpress vignette shown below:
128    
129::
130
131  5    Identifying Differentially Expressed Genes
132
133  Identifying differentially expressed genes depends on a researcher's interest and 
134  applying correct statistical models during the analysis process. We will illustrate 
135  a few basic statistical models on this data set.
136
137  5.1     Single Factor
138  Suppose we would like to identify how differentially expressed genes respond to 
139  estrogen regardless of time period. Analysis on a single categorical variable is 
140  the same as One Way ANOVA. Since we only have two levels, present and absent, 
141  for the estrogen variable, this type of analysis is also equivalent to a two-sample t test.
142
143    5.1.1    Design Matrix and Contrast Matrix
144
145    To run the analysis, we need to create a design and a contrast matrix. One of the 
146  major strengths of this package that we can use the built-in function to create the
147  design matrix and the contrast matrix using standard statistics approaches which
148  is different from the design matrix from the LIMMA package. To create a design
149  matrix, we will use the make.design function.
150  > design = make.design(target = pData(filtered), cov = "estrogen")
151  > design
152      (Intercept) estrogen/present
153  1              1                    0
154  2              1                    0
155  3              1                    1
156  4              1                    1
157  5              1                    0
158  6              1                    0
159  7              1                    1
160  8              1                    1
161
162  attr(,"assign")
163  [1] 0 1
164  attr(,"contrasts")
165  attr(,"contrasts")$estrogen
166  [1] "contr.treatment"
167  Notice that the name of the second column of the design matrix is estrogen/present,
168  where estrogen is the name of the variable and present tells us that present
169  corresponds to 1. Thus, the design matrix above is equvalent to the equation
170  below:
171                               y = alpha + betaE xE +                                (1)
172  where
173                                 1 if estrogen = present
174                        xE =
175                                 0 if estrogen = absent
176  Next we need to create a contrast matrix. Since we are comparing present
177  versus absent, we will create the following contrast:
178  > contrast = make.contrast(design.matrix = design, compare1 = "present",
179  +       compare2 = "absent")
180  > contrast
181         [,1]
182  [1,]      0
183  [2,]      1
184
185  5.1.2    Analysis
186  Once the design matrix and contrast matrix are created, we can run the analysis
187  by using the regress function. There are three types of regression methods that
188  are being supported: LIMMA (computing moderated t-statistics and log-odds of
189  differential expressions by empirical Bayes shrinkage of the standard errors towards
190  a common value), permutation test (resampling the phenotype), and ordinary
191  linear regression. Also, we can apply multiple comparison corrections by using the
192  adj option, such as fdr. The default value for the adj is none
193  > result = regress(object = filtered, design = design, contrast = contrast,
194  +       method = "L", adj = "fdr")
195  Here are the first three genes of the result
196  > result[1:3, ]
197                       ID Log2Ratio.1              F    P.Value adj.P.Val
198  1000_at         1000_at -0.23625810 2.9566239 0.1195380 0.4969571
199  1001_at         1001_at 0.12495314 1.0542776 0.3312475 0.7979252
200  1003_s_at 1003_s_at 0.06103375 0.2581607 0.6235681 0.9536450
201                                            4
202    For the next step, we can select differentaly expressed genes based on p value
203  and/or fold change. Suppose that we would like to select genes with a p value
204  LT 0.05 and a fold change value greater than 1.5.
205  > select = select.sig.gene(top.table = result, p.value = 0.05,
206  +       m.value = log2(1.5))
207  [1] "There are 381 differentially expressed genes"
208  [1] "based on your selection criteria."
209  The select.sig.gene function adds an additional column, significant, which
210  gives a value of either TRUE or FALSE indicating whether the gene is differen-
211  tially expressed based on your selection criteria. In this example, there are 381
212  differentially expressed genes being selected.
213
214  5.1.3    Output Your Result
215  To output the differentially expressed genes along with annotations to an HTML
216  file in your current working directory, we can use the result2html function.
217  > result2html(cdf.name = annotation(filtered), result = select,
218  +       filename = "singleFactor")
219
220  5.1.4    A Wrapper Function
221  There is a wrapper function, AffyRegress, that can acomplish all of the above
222  steps together including: create a design and contrast matrix, run regression, select
223  differentaly expressed genes, and output the differentally expressed gene to an html
224  file.
225
226  > result.wrapper = AffyRegress(normal.data = filtered, cov = "estrogen",
227  +       compare1 = "present", compare2 = "absent", method = "L",
228  +       adj = "fdr", p.value = 0.05, m.value = log2(1.5))
229  [1] "There are 381 differentially expressed genes"
230  [1] "based on your selection criteria."
231
232
233Note that this tool uses ideas documented at the R and BioConductor manual:
234http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/R_BioCondManual.html#biocon_affypack and uses standard R (www.r-project.org) and Bioconductor (htt://bioconductor.org) packages.
235
236Some base code for this tool was designed and written for the Rexpression BioC/Galaxy tools by ross lazarus (ross d0t lazarus at gmail d0t com) August 2008 Copyright Ross Lazarus 2008 Licensed under the LGPL as documented at http://www.gnu.org/licenses/lgpl.html
237
238</help>
239</tool>