PageRenderTime 28ms CodeModel.GetById 17ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/ngs_rna/cuffcompare_wrapper.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 217 lines | 167 code | 44 blank | 6 comment | 0 complexity | 630f8f315156c6c540e3f4d8c023e295 MD5 | raw file
  1<tool id="cuffcompare" name="Cuffcompare" version="0.0.5">
  2    <!-- Wrapper supports Cuffcompare versions v1.0.0-v1.0.3 -->
  3    <description>compare assembled transcripts to a reference annotation and track Cufflinks transcripts across multiple experiments</description>
  4    <requirements>
  5        <requirement type="package">cufflinks</requirement>
  6    </requirements>
  7    <command interpreter="python">
  8        cuffcompare_wrapper.py 
  9            
 10            ## Use annotation reference?
 11            #if $annotation.use_ref_annotation == "Yes":
 12                -r $annotation.reference_annotation
 13                #if $annotation.ignore_nonoverlapping_reference:
 14                    -R
 15                #end if
 16            #end if
 17            
 18            ## Use sequence data?
 19            #if $seq_data.use_seq_data == "Yes":
 20	        -s
 21                #if $seq_data.seq_source.index_source == "history":
 22                    --ref_file=$seq_data.seq_source.ref_file
 23                #else:
 24                    --ref_file="None"
 25                #end if
 26                --dbkey=${first_input.metadata.dbkey} 
 27                --index_dir=${GALAXY_DATA_INDEX_DIR}
 28            #end if
 29            
 30            ## Outputs.
 31            --combined-transcripts=${transcripts_combined}
 32            
 33            ## Inputs.
 34            ${first_input}
 35            #for $input_file in $input_files:
 36              ${input_file.additional_input}
 37            #end for
 38            
 39    </command>
 40    <inputs>
 41        <param format="gtf" name="first_input" type="data" label="GTF file produced by Cufflinks" help=""/>
 42        <repeat name="input_files" title="Additional GTF Input Files">
 43            <param format="gtf" name="additional_input" type="data" label="GTF file produced by Cufflinks" help=""/>
 44        </repeat>
 45        <conditional name="annotation">
 46            <param name="use_ref_annotation" type="select" label="Use Reference Annotation">
 47                <option value="No">No</option>
 48                <option value="Yes">Yes</option>
 49            </param>
 50            <when value="Yes">
 51                <param format="gtf" name="reference_annotation" type="data" label="Reference Annotation" help="Make sure your annotation file is in GTF format and that Galaxy knows that your file is GTF--not GFF."/>    
 52                <param name="ignore_nonoverlapping_reference" type="boolean" label="Ignore reference transcripts that are not overlapped by any transcript in input files"/>
 53            </when>
 54            <when value="No">
 55            </when>
 56        </conditional>
 57        <conditional name="seq_data">
 58            <param name="use_seq_data" type="select" label="Use Sequence Data" help="Use sequence data for some optional classification functions, including the addition of the p_id attribute required by Cuffdiff.">
 59                <option value="Yes">Yes</option>
 60                <option value="No">No</option>
 61            </param>
 62            <when value="No"></when>
 63            <when value="Yes">
 64                <conditional name="seq_source">
 65                  <param name="index_source" type="select" label="Choose the source for the reference list">
 66                    <option value="cached">Locally cached</option>
 67                    <option value="history">History</option>
 68                  </param>
 69                  <when value="cached"></when>
 70                  <when value="history">
 71                      <param name="ref_file" type="data" format="fasta" label="Using reference file" />
 72                  </when>
 73                </conditional>
 74            </when>
 75        </conditional>
 76    </inputs>
 77
 78    <outputs>
 79        <data format="txt" name="transcripts_accuracy" label="${tool.name} on ${on_string}: transcript accuracy" 
 80            from_work_dir="cc_output.stats" />
 81        <data format="tabular" name="input1_tmap" label="${tool.name} on ${on_string}: data ${first_input.hid} tmap file"
 82            from_work_dir="cc_output.input1.tmap" />
 83        <data format="tabular" name="input1_refmap" label="${tool.name} on ${on_string}: data ${first_input.hid} refmap file" 
 84            from_work_dir="cc_output.input1.refmap"/>
 85        <data format="tabular" name="input2_tmap" label="${tool.name} on ${on_string}: data ${input_files[0]['additional_input'].hid} tmap file" from_work_dir="cc_output.input2.tmap">
 86            <filter>len( input_files ) > 0</filter>
 87        </data>
 88        <data format="tabular" name="input2_refmap" label="${tool.name} on ${on_string}: data ${input_files[0]['additional_input'].hid} refmap file" from_work_dir="cc_output.input2.refmap">
 89            <filter>len( input_files ) > 0</filter>
 90        </data>
 91        <data format="tabular" name="transcripts_tracking" label="${tool.name} on ${on_string}: transcript tracking" from_work_dir="cc_output.tracking">
 92            <filter>len( input_files ) > 0</filter>
 93        </data>
 94        <data format="gtf" name="transcripts_combined" label="${tool.name} on ${on_string}: combined transcripts"/>
 95    </outputs>
 96
 97    <tests>
 98        <!-- 
 99            cuffcompare -r cuffcompare_in3.gtf -R cuffcompare_in1.gtf cuffcompare_in2.gtf
100        -->
101        <test>
102            <param name="first_input" value="cuffcompare_in1.gtf" ftype="gtf"/>
103            <param name="additional_input" value="cuffcompare_in2.gtf" ftype="gtf"/>
104            <param name="use_ref_annotation" value="Yes"/>
105            <param name="reference_annotation" value="cuffcompare_in3.gtf" ftype="gtf"/>
106            <param name="ignore_nonoverlapping_reference" value="Yes"/>
107            <param name="use_seq_data" value="No"/>
108            <!-- Line diffs are the result of different locations for input files; this cannot be fixed as cuffcompare outputs
109                full input path for each input. -->
110            <output name="transcripts_accuracy" file="cuffcompare_out7.txt" lines_diff="16"/>
111            <output name="input1_tmap" file="cuffcompare_out1.tmap"/>
112            <output name="input1_refmap" file="cuffcompare_out2.refmap"/>
113            <output name="input2_tmap" file="cuffcompare_out3.tmap"/>
114            <output name="input2_refmap" file="cuffcompare_out4.refmap"/>
115            <output name="transcripts_tracking" file="cuffcompare_out6.tracking"/>
116            <output name="transcripts_combined" file="cuffcompare_out5.gtf"/>
117        </test>
118    </tests>
119
120    <help>
121**Cuffcompare Overview**
122
123Cuffcompare is part of Cufflinks_. Cuffcompare helps you: (a) compare your assembled transcripts to a reference annotation and (b) track Cufflinks transcripts across multiple experiments (e.g. across a time course). Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621
124
125.. _Cufflinks: http://cufflinks.cbcb.umd.edu/
126        
127------
128
129**Know what you are doing**
130
131.. class:: warningmark
132
133There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
134
135.. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffcompare
136
137------
138
139**Input format**
140
141Cuffcompare takes Cufflinks' GTF output as input, and optionally can take a "reference" annotation (such as from Ensembl_).
142
143.. _Ensembl: http://www.ensembl.org 
144
145------
146
147**Outputs**
148
149Cuffcompare produces the following output files:
150
151Transcripts Accuracy File:
152
153Cuffcompare reports various statistics related to the "accuracy" of the transcripts in each sample when compared to the reference annotation data. The typical gene finding measures of "sensitivity" and "specificity" (as defined in Burset, M., Guigó, R. : Evaluation of gene structure prediction programs (1996) Genomics, 34 (3), pp. 353-367. doi: 10.1006/geno.1996.0298) are calculated at various levels (nucleotide, exon, intron, transcript, gene) for each input file and reported in this file. The Sn and Sp columns show sensitivity and specificity values at each level, while the fSn and fSp columns are "fuzzy" variants of these same accuracy calculations, allowing for a very small variation in exon boundaries to still be counted as a "match".
154
155Transcripts Combined File:
156
157Cuffcompare reports a GTF file containing the "union" of all transfrags in each sample. If a transfrag is present in both samples, it is thus reported once in the combined gtf. 
158
159Transcripts Tracking File:
160
161This file matches transcripts up between samples. Each row contains a transcript structure that is present in one or more input GTF files. Because the transcripts will generally have different IDs (unless you assembled your RNA-Seq reads against a reference transcriptome), cuffcompare examines the structure of each of the transcripts, matching transcripts that agree on the coordinates and order of all of their introns, as well as strand. Matching transcripts are allowed to differ on the length of the first and last exons, since these lengths will naturally vary from sample to sample due to the random nature of sequencing.
162If you ran cuffcompare with the -r option, the third and fourth columns contain the gene and transcript ids of the closest matching reference transcript to the one described by each row.
163
164Here's an example of a line from the tracking file::
165
166  TCONS_00000045 XLOC_000023 Tcea|uc007afj.1	j	\
167     q1:exp.115|exp.115.0|100|3.061355|0.350242|0.350207 \
168     q2:60hr.292|60hr.292.0|100|4.094084|0.000000|0.000000
169
170In this example, a transcript present in the two input files, called exp.115.0 in the first and 60hr.292.0 in the second, doesn't match any reference transcript exactly, but shares exons with uc007afj.1, an isoform of the gene Tcea, as indicated by the class code j. The first five columns are as follows::
171
172  Column number   Column name               Example          Description
173  -----------------------------------------------------------------------
174  1               Cufflinks transfrag id    TCONS_00000045   A unique internal id for the transfrag
175  2               Cufflinks locus id        XLOC_000023      A unique internal id for the locus
176  3               Reference gene id         Tcea             The gene_name attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript
177  4               Reference transcript id   uc007afj.1       The transcript_id attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript
178  5               Class code                j                The type of match between the Cufflinks transcripts in column 6 and the reference transcript. See the Class Codes section below
179  
180Each of the columns after the fifth has the following format:
181  qJ:gene_id|transcript_id|FMI|FPKM|conf_lo|conf_hi
182
183A transcript need not be present in all samples to be reported in the tracking file. A sample not containing a transcript will have a "-" in its entry in the row for that transcript.
184
185Class Codes
186
187If you ran cuffcompare with the -r option, tracking rows will contain the following values. If you did not use -r, the rows will all contain "-" in their class code column::
188
189  Priority	 Code	   Description
190  ---------------------------------
191  1	         =	       Match
192  2	         c	       Contained	
193  3	         j	       New isoform	
194  4	         e	       A single exon transcript overlapping a reference exon and at least 10 bp of a reference intron, indicating a possible pre-mRNA fragment.	
195  5	         i	       A single exon transcript falling entirely within a reference intron	
196  6	         r	       Repeat. Currently determined by looking at the reference sequence and applied to transcripts where at least 50% of the bases are lower case	
197  7	         p	       Possible polymerase run-on fragment	
198  8	         u	       Unknown, intergenic transcript	
199  9	         o	       Unknown, generic overlap with reference	
200  10             .	       (.tracking file only, indicates multiple classifications)
201    
202-------
203
204**Settings**
205
206All of the options have a default value. You can change any of them. Most of the options in Cuffcompare have been implemented here.
207
208------
209
210**Cuffcompare parameter list**
211
212This is a list of implemented Cuffcompare options::
213
214  -r    An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.
215  -R    If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts_accuracy file
216    </help>
217</tool>