PageRenderTime 24ms CodeModel.GetById 17ms app.highlight 3ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/samtools/pileup_parser.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 387 lines | 305 code | 82 blank | 0 comment | 0 complexity | e455534db0a4b927f33e051d43ee36ab MD5 | raw file
  1<tool id="pileup_parser" name="Filter pileup" version="1.0.2">>
  2  <description>on coverage and SNPs</description>
  3  <command interpreter="perl">
  4    #if   $pileup_type.type_select == "six"    #pileup_parser.pl $input "3" "5" "6" "4" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base
  5    #elif $pileup_type.type_select == "ten"    #pileup_parser.pl $input "3" "9" "10" "8" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base
  6    #elif $pileup_type.type_select == "manual" #pileup_parser.pl $input $pileup_type.ref_base_column $pileup_type.read_bases_column $pileup_type.read_qv_column $pileup_type.cvrg_column $qv_cutoff $cvrg_cutoff $snps_only $interval $pileup_type.coord_column $out_file1 $diff $qc_base
  7    #end if#
  8  </command>
  9  <inputs>
 10    <param name="input" type="data" format="tabular" label="Select dataset"/>
 11    <conditional name="pileup_type">
 12        <param name="type_select" type="select" label="which contains" help="See &quot;Types of pileup datasets&quot; below for examples">
 13            <option value="six" selected="true">Pileup with six columns (simple)</option>
 14            <option value="ten">Pileup with ten columns (with consensus)</option>
 15            <option value="manual">Set columns manually</option>
 16        </param>
 17        <when value="manual">
 18            <param name="ref_base_column" label="Select column with reference base" type="data_column" numerical="false" data_ref="input" />
 19            <param name="read_bases_column" label="Select column with read bases" type="data_column" numerical="false" data_ref="input" help="something like this: ..,a.."/>
 20            <param name="read_qv_column" label="Select column with base qualities" type="data_column" numerical="false" data_ref="input" help="something like this: IIIGIAI"/>
 21            <param name="cvrg_column" label="Select column with coverage" type="data_column" numerical="true" data_ref="input" />
 22            <param name="coord_column" label="Select coordinate column" type="data_column" numerical="true" data_ref="input" />
 23        </when>
 24        <when value="six">
 25        </when>
 26        <when value="ten">
 27        </when>
 28    </conditional>
 29    <param name="qv_cutoff" label="Do not consider read bases with quality lower than" type="integer" value="20" help="No variants with quality below this value will be reported"/>
 30    <param name="cvrg_cutoff" label="Do not report positions with coverage lower than" type="integer" value="3" help="Pileup lines with coverage lower than this value will be skipped"/>
 31    <param name="snps_only" label="Only report variants?" type="select" help="See &quot;Examples 1 and 2&quot; below for explanation">
 32        <option value="No">No</option>
 33        <option value="Yes" selected="true">Yes</option>
 34    </param>
 35    <param name="interval" label="Convert coordinates to intervals?" type="select" help="See &quot;Output format&quot; below for explanation">
 36        <option value="No" selected="true">No</option>
 37        <option value="Yes">Yes</option>
 38    </param>
 39    <param name="diff" label="Print total number of differences?" type="select" help="See &quot;Example 3&quot; below for explanation">
 40        <option value="No" selected="true">No</option>
 41        <option value="Yes">Yes</option>
 42    </param>
 43    <param name="qc_base" label="Print quality and base string?" type="select" help="See &quot;Example 4&quot; below for explanation">
 44        <option value="No">No</option>
 45        <option value="Yes" selected="true">Yes</option>
 46    </param>
 47        
 48  </inputs>
 49  <outputs>
 50    <data format="tabular" name="out_file1">
 51        <change_format>
 52            <when input="interval" value="Yes" format="interval" />
 53        </change_format>
 54   </data>
 55  </outputs>
 56  <tests>
 57    <test>
 58      <param name="input" value="pileup_parser.6col.pileup"/>
 59      <output name="out_file1" file="pileup_parser.6col.20-3-yes-yes.pileup.out"/>
 60      <param name="type_select" value="six"/>
 61      <param name="qv_cutoff" value="20" />
 62      <param name="cvrg_cutoff" value="3" />
 63      <param name="snps_only" value="Yes"/>
 64      <param name="interval" value="Yes" />
 65      <param name="diff" value="No" />
 66      <param name="qc_base" value="Yes" />
 67    </test>
 68    <test>
 69      <param name="input" value="pileup_parser.6col.pileup"/>
 70      <output name="out_file1" file="pileup_parser.6col.20-3-yes-no.pileup.out"/>
 71      <param name="type_select" value="six"/>
 72      <param name="qv_cutoff" value="20" />
 73      <param name="cvrg_cutoff" value="3" />
 74      <param name="snps_only" value="Yes"/>
 75      <param name="interval" value="No" />
 76       <param name="diff" value="No" />
 77      <param name="qc_base" value="Yes" />
 78    </test>
 79    <test>
 80      <param name="input" value="pileup_parser.6col.pileup"/>
 81      <output name="out_file1" file="pileup_parser.6col.20-3-no-no.pileup.out"/>
 82      <param name="type_select" value="six"/>
 83      <param name="qv_cutoff" value="20" />
 84      <param name="cvrg_cutoff" value="3" />
 85      <param name="snps_only" value="No"/>
 86      <param name="interval" value="No" />
 87       <param name="diff" value="No" />
 88      <param name="qc_base" value="Yes" />
 89    </test>
 90    <test>
 91      <param name="input" value="pileup_parser.10col.pileup"/>
 92      <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes.pileup.out"/>
 93      <param name="type_select" value="ten"/>
 94      <param name="qv_cutoff" value="20" />
 95      <param name="cvrg_cutoff" value="3" />
 96      <param name="snps_only" value="Yes"/>q
 97      <param name="interval" value="Yes" />
 98       <param name="diff" value="No" />
 99      <param name="qc_base" value="Yes" />
100    </test>
101    <test>
102      <param name="input" value="pileup_parser.10col.pileup"/>
103      <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes.pileup.out"/>
104      <param name="type_select" value="manual"/>
105      <param name="ref_base_column" value="3"/>
106      <param name="read_bases_column" value="9"/>
107      <param name="read_qv_column" value="10"/>
108      <param name="cvrg_column" value="8"/>
109      <param name="coord_column" value="2"/>
110      <param name="qv_cutoff" value="20" />
111      <param name="cvrg_cutoff" value="3" />
112      <param name="snps_only" value="Yes"/>
113      <param name="interval" value="Yes" />
114       <param name="diff" value="No" />
115      <param name="qc_base" value="Yes" />
116    </test>
117        <test>
118      <param name="input" value="pileup_parser.10col.pileup"/>
119      <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-yes.pileup.out"/>
120      <param name="type_select" value="manual"/>
121      <param name="ref_base_column" value="3"/>
122      <param name="read_bases_column" value="9"/>
123      <param name="read_qv_column" value="10"/>
124      <param name="cvrg_column" value="8"/>
125      <param name="coord_column" value="2"/>
126      <param name="qv_cutoff" value="20" />
127      <param name="cvrg_cutoff" value="3" />
128      <param name="snps_only" value="Yes"/>
129      <param name="interval" value="Yes" />
130       <param name="diff" value="Yes" />
131      <param name="qc_base" value="Yes" />
132    </test>
133    <test>
134      <param name="input" value="pileup_parser.10col.pileup"/>
135      <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-no.pileup.out"/>
136      <param name="type_select" value="manual"/>
137      <param name="ref_base_column" value="3"/>
138      <param name="read_bases_column" value="9"/>
139      <param name="read_qv_column" value="10"/>
140      <param name="cvrg_column" value="8"/>
141      <param name="coord_column" value="2"/>
142      <param name="qv_cutoff" value="20" />
143      <param name="cvrg_cutoff" value="3" />
144      <param name="snps_only" value="Yes"/>
145      <param name="interval" value="Yes" />
146       <param name="diff" value="Yes" />
147      <param name="qc_base" value="No" />
148    </test>
149
150
151 </tests>
152<help>
153
154**What it does**
155
156Allows one to find sequence variants and/or sites covered by a specified number of reads with bases above a set quality threshold. The tool works on six and ten column pileup formats produced with *samtools pileup* command. However, it also allows you to specify columns in the input file manually. The tool assumes the following:
157
158- the quality scores follow phred33 convention, where input qualities are ASCII characters equal to the Phred quality plus 33.
159- the pileup dataset was produced by the *samtools pileup* command (although you can override this by setting column assignments manually).
160
161--------
162
163**Types of pileup datasets**
164
165The descriptions of the following pileup formats are largely based on information that can be found on the SAMTools_ documentation page. The 6- and 10-column variants are described below.
166
167.. _SAMTools: http://samtools.sourceforge.net/pileup.shtml
168
169**Six column pileup**::
170
171    1    2  3  4        5        6
172 ---------------------------------   
173 chrM  412  A  2       .,       II
174 chrM  413  G  4     ..t,     IIIH
175 chrM  414  C  4     ..Ta     III2
176 chrM  415  C  4     TTTt     III7
177   
178where::
179
180  Column Definition
181 ------- ----------------------------
182       1 Chromosome
183       2 Position (1-based)
184       3 Reference base at that position
185       4 Coverage (# reads aligning over that position)
186       5 Bases within reads
187       6 Quality values (phred33 scale, see Galaxy wiki for more)
188       
189**Ten column pileup**
190
191The `ten-column`__ pileup incorporates additional consensus information generated with the *-c* option of the *samtools pileup* command::
192
193
194    1    2  3  4   5   6   7   8       9       10
195 ------------------------------------------------
196 chrM  412  A  A  75   0  25  2       .,       II
197 chrM  413  G  G  72   0  25  4     ..t,     IIIH
198 chrM  414  C  C  75   0  25  4     ..Ta     III2
199 chrM  415  C  T  75  75  25  4     TTTt     III7
200
201where::
202
203  Column Definition
204 ------- ----------------------------
205       1 Chromosome
206       2 Position (1-based)
207       3 Reference base at that position
208       4 Consensus bases
209       5 Consensus quality
210       6 SNP quality
211       7 Maximum mapping quality
212       8 Coverage (# reads aligning over that position)
213       9 Bases within reads
214      10 Quality values (phred33 scale, see Galaxy wiki for more)
215
216
217.. __: http://samtools.sourceforge.net/cns0.shtml
218
219------
220
221**The output format**
222
223The tool modifies the input dataset in two ways:
224
2251. It appends five columns to the end of every reported line:
226
227- Number of **A** variants
228- Number of **C** variants
229- Number of **G** variants
230- Number of **T** variants
231- Number of read bases covering this position, where quality is equal to or higher than the value set by **Do not consider read bases with  quality lower than** option. 
232
233Optionally, if **Print total number of differences?** is set to **Yes**, the tool will append the sixth column with the total number of deviants (see below).
234
2352. If **Convert coordinates to intervals?** is set to **Yes**, the tool replaces the position column (typically the second column) with a pair of tab-delimited start/end values.
236
237For example, if you are calling variants with base quality above 20 on this dataset::
238
239 chrM  412  A  2       .,       II
240 chrM  413  G  4     ..t,     III2
241 chrM  414  C  4     ..Ta     III2
242 chrM  415  C  4     TTTt     III7
243
244you will get::
245
246 chrM  413  G  4  ..t,  IIIH  0  0  2  1  3
247 chrM  414  C  4  ..Ta  III2  1  1  0  1  3
248 chrM  415  C  4  TTTt  III7  0  0  0  4  4
249 
250where::
251
252  Column Definition
253 ------- ----------------------------
254       1 Chromosome
255       2 Position (1-based)
256       3 Reference base at that position
257       4 Coverage (# reads aligning over that position)
258       5 Bases within reads where
259       6 Quality values (phred33 scale, see Galaxy wiki for more)
260       7 Number of A variants
261       8 Number of C variants
262       9 Number of G variants
263      10 Number of T variants
264      11 Quality adjusted coverage:
265      12 Number of read bases (i.e., # of reads) with quality above the set threshold
266      13 Total number of deviants (if Convert coordinates to intervals? is set to yes)
267         
268if **Print total number of differences?** is set to **Yes**, you will get::
269
270 chrM  413  G  4  ..t,  IIIH  0  0  2  1  3  1
271 chrM  414  C  4  ..Ta  III2  1  2  0  1  3  2
272 chrM  415  C  4  TTTt  III7  0  0  0  4  4  0 
273 
274Note the additional column 13, that contains the number of deviant reads (e.g., there are two deviants, T and a, for position 414).
275
276 
277Finally, if **Convert coordinates to intervals?** is set to **Yes**, you will get one additional column with the end coordinate::
278 
279 chrM  412 413  G  4  ..t,  III2  0  0  2  1  3
280 chrM  414 415  C  4  ..Ta  III2  1  2  0  1  3
281 chrM  414 415  C  4  TTTt  III7  0  0  0  4  4
282 
283where::
284
285  Column Definition
286 ------- ----------------------------
287       1 Chromosome
288       2 Start position (0-based)
289       3 End position (1-based)
290       4 Reference base at that position
291       5 Coverage (# reads aligning over that position)
292       6 Bases within reads
293       7 Quality values (phred33 scale, see Galaxy wiki for more)
294       8 Number of A variants
295       9 Number of C variants
296      10 Number of G variants
297      11 Number of T variants
298      12 Quality adjusted coverage
299      13 Total number of deviants (if Convert coordinates to intervals? is set to yes)
300
301
302Note that in this case the coordinates of SNPs were converted to intervals, where the start coordinate is 0-based and the end coordinate in 1-based using the UCSC Table Browser convention. 
303 
304Although three positions have variants in the original file (413, 414, and 415), only 413 and 415 are reported because the quality values associated with these two SNPs are above the threshold of 20. In the case of 414 the **a** allele has a quality value of 17 ( ord("2")-33 ), and is therefore not reported. Note that five columns have been added to each of the reported lines::
305
306  chrM  413  G  4  ..t,  IIIH  0  0  2  1  3
307  
308Here, there is one variant, **t**. Because the fourth column represents **T** counts, it is incremented by 1. The last column shows that at this position, three reads have bases above the quality threshold of 20.
309
310-----
311
312**Example 1**: Just variants
313
314In this mode, the tool only outputs the lines from the input datasets where at least one read contains a sequence variant with quality above the threshold set by the **Do not consider read bases with quality lower than** option. For example, suppose one has a pileup dataset like the following::
315
316 chrM  412  A  2       .,       II
317 chrM  413  G  4     ..t,     III2
318 chrM  414  C  4     ..Ta     III2
319 chrM  415  C  4     TTTt     III7
320 
321To call all variants (with no restriction by coverage) with quality above phred value of 20, we will need to set the parameters as follows:
322
323.. image:: ./static/images/pileup_parser_help1.png 
324
325Running the tool with these parameters will return::
326
327 chrM  413  G  4  ..t,  IIIH  0  0  0  1  3
328 chrM  414  C  4  ..Ta  III2  0  2  0  1  3
329 chrM  415  C  4  TTTt  III7  0  0  0  4  4
330 
331**Note** that position 414 is not reported because the *a* variant has associated quality value of 17 (because ord('2')-33 = 17) and is below the phred threshold of 20 set by the **Count variants with quality above this value** parameter.
332
333-----
334
335**Example 2**: Report everything
336
337In addition to calling variants, it is often useful to know the quality adjusted coverage. Running the tool with these parameters:
338
339.. image:: ./static/images/pileup_parser_help2.png 
340
341will report everything from the original file::
342
343 chrM  412  A  2  .,    II    2  0  0  0  2
344 chrM  413  G  4  ..t,  III2  0  0  2  1  3
345 chrM  414  C  4  ..Ta  III2  0  2  0  1  3
346 chrM  415  C  4  TTTt  III7  0  0  0  4  4
347 
348Here, you can see that although the total coverage at position 414 is 4 (column 4), the quality adjusted coverage is 3 (last column). This is because only three out of four reads have bases with quality above the set threshold of 20 (the actual qualities are III2 or, after conversion,  40, 40, 40, 17).
349
350One can use the last column of this dataset to filter out (using Galaxy's **Filter** tool) positions where quality adjusted coverage (last column) is below a set threshold.
351
352------
353
354**Example 3**: Report everything and print total number of differences
355
356If you set the **Print total number of differences?** to **Yes** the tool will print an additional column with the total number of reads where a devinat base is above the quality threshold. So, seetiing parametrs like this:
357
358.. image:: ./static/images/pileup_parser_help3.png
359
360will produce this::
361
362 chrM  412  A  2  .,    II    2  0  0  0  2  0
363 chrM  413  G  4  ..t,  III2  0  0  2  1  3  1
364 chrM  414  C  4  ..Ta  III2  0  2  0  1  3  1
365 chrM  415  C  4  TTTt  III7  0  0  0  4  4  0
366 
367 
368-----
369
370**Example 4**: Report everything, print total number of differences, and ignore qualities and read bases
371
372Setting **Print quality and base string?** to **Yes** as shown here:
373
374.. image:: ./static/images/pileup_parser_help4.png
375
376will produce this::
377
378 chrM  412  A  2  2  0  0  0  2  0
379 chrM  413  G  4  0  0  2  1  3  1
380 chrM  414  C  4  0  2  0  1  3  1
381 chrM  415  C  4  0  0  0  4  4  0
382
383
384
385 
386</help>
387</tool>