PageRenderTime 66ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/samtools/pileup_parser.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 387 lines | 305 code | 82 blank | 0 comment | 0 complexity | e455534db0a4b927f33e051d43ee36ab MD5 | raw file
  1. <tool id="pileup_parser" name="Filter pileup" version="1.0.2">>
  2. <description>on coverage and SNPs</description>
  3. <command interpreter="perl">
  4. #if $pileup_type.type_select == "six" #pileup_parser.pl $input "3" "5" "6" "4" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base
  5. #elif $pileup_type.type_select == "ten" #pileup_parser.pl $input "3" "9" "10" "8" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base
  6. #elif $pileup_type.type_select == "manual" #pileup_parser.pl $input $pileup_type.ref_base_column $pileup_type.read_bases_column $pileup_type.read_qv_column $pileup_type.cvrg_column $qv_cutoff $cvrg_cutoff $snps_only $interval $pileup_type.coord_column $out_file1 $diff $qc_base
  7. #end if#
  8. </command>
  9. <inputs>
  10. <param name="input" type="data" format="tabular" label="Select dataset"/>
  11. <conditional name="pileup_type">
  12. <param name="type_select" type="select" label="which contains" help="See &quot;Types of pileup datasets&quot; below for examples">
  13. <option value="six" selected="true">Pileup with six columns (simple)</option>
  14. <option value="ten">Pileup with ten columns (with consensus)</option>
  15. <option value="manual">Set columns manually</option>
  16. </param>
  17. <when value="manual">
  18. <param name="ref_base_column" label="Select column with reference base" type="data_column" numerical="false" data_ref="input" />
  19. <param name="read_bases_column" label="Select column with read bases" type="data_column" numerical="false" data_ref="input" help="something like this: ..,a.."/>
  20. <param name="read_qv_column" label="Select column with base qualities" type="data_column" numerical="false" data_ref="input" help="something like this: IIIGIAI"/>
  21. <param name="cvrg_column" label="Select column with coverage" type="data_column" numerical="true" data_ref="input" />
  22. <param name="coord_column" label="Select coordinate column" type="data_column" numerical="true" data_ref="input" />
  23. </when>
  24. <when value="six">
  25. </when>
  26. <when value="ten">
  27. </when>
  28. </conditional>
  29. <param name="qv_cutoff" label="Do not consider read bases with quality lower than" type="integer" value="20" help="No variants with quality below this value will be reported"/>
  30. <param name="cvrg_cutoff" label="Do not report positions with coverage lower than" type="integer" value="3" help="Pileup lines with coverage lower than this value will be skipped"/>
  31. <param name="snps_only" label="Only report variants?" type="select" help="See &quot;Examples 1 and 2&quot; below for explanation">
  32. <option value="No">No</option>
  33. <option value="Yes" selected="true">Yes</option>
  34. </param>
  35. <param name="interval" label="Convert coordinates to intervals?" type="select" help="See &quot;Output format&quot; below for explanation">
  36. <option value="No" selected="true">No</option>
  37. <option value="Yes">Yes</option>
  38. </param>
  39. <param name="diff" label="Print total number of differences?" type="select" help="See &quot;Example 3&quot; below for explanation">
  40. <option value="No" selected="true">No</option>
  41. <option value="Yes">Yes</option>
  42. </param>
  43. <param name="qc_base" label="Print quality and base string?" type="select" help="See &quot;Example 4&quot; below for explanation">
  44. <option value="No">No</option>
  45. <option value="Yes" selected="true">Yes</option>
  46. </param>
  47. </inputs>
  48. <outputs>
  49. <data format="tabular" name="out_file1">
  50. <change_format>
  51. <when input="interval" value="Yes" format="interval" />
  52. </change_format>
  53. </data>
  54. </outputs>
  55. <tests>
  56. <test>
  57. <param name="input" value="pileup_parser.6col.pileup"/>
  58. <output name="out_file1" file="pileup_parser.6col.20-3-yes-yes.pileup.out"/>
  59. <param name="type_select" value="six"/>
  60. <param name="qv_cutoff" value="20" />
  61. <param name="cvrg_cutoff" value="3" />
  62. <param name="snps_only" value="Yes"/>
  63. <param name="interval" value="Yes" />
  64. <param name="diff" value="No" />
  65. <param name="qc_base" value="Yes" />
  66. </test>
  67. <test>
  68. <param name="input" value="pileup_parser.6col.pileup"/>
  69. <output name="out_file1" file="pileup_parser.6col.20-3-yes-no.pileup.out"/>
  70. <param name="type_select" value="six"/>
  71. <param name="qv_cutoff" value="20" />
  72. <param name="cvrg_cutoff" value="3" />
  73. <param name="snps_only" value="Yes"/>
  74. <param name="interval" value="No" />
  75. <param name="diff" value="No" />
  76. <param name="qc_base" value="Yes" />
  77. </test>
  78. <test>
  79. <param name="input" value="pileup_parser.6col.pileup"/>
  80. <output name="out_file1" file="pileup_parser.6col.20-3-no-no.pileup.out"/>
  81. <param name="type_select" value="six"/>
  82. <param name="qv_cutoff" value="20" />
  83. <param name="cvrg_cutoff" value="3" />
  84. <param name="snps_only" value="No"/>
  85. <param name="interval" value="No" />
  86. <param name="diff" value="No" />
  87. <param name="qc_base" value="Yes" />
  88. </test>
  89. <test>
  90. <param name="input" value="pileup_parser.10col.pileup"/>
  91. <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes.pileup.out"/>
  92. <param name="type_select" value="ten"/>
  93. <param name="qv_cutoff" value="20" />
  94. <param name="cvrg_cutoff" value="3" />
  95. <param name="snps_only" value="Yes"/>q
  96. <param name="interval" value="Yes" />
  97. <param name="diff" value="No" />
  98. <param name="qc_base" value="Yes" />
  99. </test>
  100. <test>
  101. <param name="input" value="pileup_parser.10col.pileup"/>
  102. <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes.pileup.out"/>
  103. <param name="type_select" value="manual"/>
  104. <param name="ref_base_column" value="3"/>
  105. <param name="read_bases_column" value="9"/>
  106. <param name="read_qv_column" value="10"/>
  107. <param name="cvrg_column" value="8"/>
  108. <param name="coord_column" value="2"/>
  109. <param name="qv_cutoff" value="20" />
  110. <param name="cvrg_cutoff" value="3" />
  111. <param name="snps_only" value="Yes"/>
  112. <param name="interval" value="Yes" />
  113. <param name="diff" value="No" />
  114. <param name="qc_base" value="Yes" />
  115. </test>
  116. <test>
  117. <param name="input" value="pileup_parser.10col.pileup"/>
  118. <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-yes.pileup.out"/>
  119. <param name="type_select" value="manual"/>
  120. <param name="ref_base_column" value="3"/>
  121. <param name="read_bases_column" value="9"/>
  122. <param name="read_qv_column" value="10"/>
  123. <param name="cvrg_column" value="8"/>
  124. <param name="coord_column" value="2"/>
  125. <param name="qv_cutoff" value="20" />
  126. <param name="cvrg_cutoff" value="3" />
  127. <param name="snps_only" value="Yes"/>
  128. <param name="interval" value="Yes" />
  129. <param name="diff" value="Yes" />
  130. <param name="qc_base" value="Yes" />
  131. </test>
  132. <test>
  133. <param name="input" value="pileup_parser.10col.pileup"/>
  134. <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-no.pileup.out"/>
  135. <param name="type_select" value="manual"/>
  136. <param name="ref_base_column" value="3"/>
  137. <param name="read_bases_column" value="9"/>
  138. <param name="read_qv_column" value="10"/>
  139. <param name="cvrg_column" value="8"/>
  140. <param name="coord_column" value="2"/>
  141. <param name="qv_cutoff" value="20" />
  142. <param name="cvrg_cutoff" value="3" />
  143. <param name="snps_only" value="Yes"/>
  144. <param name="interval" value="Yes" />
  145. <param name="diff" value="Yes" />
  146. <param name="qc_base" value="No" />
  147. </test>
  148. </tests>
  149. <help>
  150. **What it does**
  151. Allows one to find sequence variants and/or sites covered by a specified number of reads with bases above a set quality threshold. The tool works on six and ten column pileup formats produced with *samtools pileup* command. However, it also allows you to specify columns in the input file manually. The tool assumes the following:
  152. - the quality scores follow phred33 convention, where input qualities are ASCII characters equal to the Phred quality plus 33.
  153. - the pileup dataset was produced by the *samtools pileup* command (although you can override this by setting column assignments manually).
  154. --------
  155. **Types of pileup datasets**
  156. The descriptions of the following pileup formats are largely based on information that can be found on the SAMTools_ documentation page. The 6- and 10-column variants are described below.
  157. .. _SAMTools: http://samtools.sourceforge.net/pileup.shtml
  158. **Six column pileup**::
  159. 1 2 3 4 5 6
  160. ---------------------------------
  161. chrM 412 A 2 ., II
  162. chrM 413 G 4 ..t, IIIH
  163. chrM 414 C 4 ..Ta III2
  164. chrM 415 C 4 TTTt III7
  165. where::
  166. Column Definition
  167. ------- ----------------------------
  168. 1 Chromosome
  169. 2 Position (1-based)
  170. 3 Reference base at that position
  171. 4 Coverage (# reads aligning over that position)
  172. 5 Bases within reads
  173. 6 Quality values (phred33 scale, see Galaxy wiki for more)
  174. **Ten column pileup**
  175. The `ten-column`__ pileup incorporates additional consensus information generated with the *-c* option of the *samtools pileup* command::
  176. 1 2 3 4 5 6 7 8 9 10
  177. ------------------------------------------------
  178. chrM 412 A A 75 0 25 2 ., II
  179. chrM 413 G G 72 0 25 4 ..t, IIIH
  180. chrM 414 C C 75 0 25 4 ..Ta III2
  181. chrM 415 C T 75 75 25 4 TTTt III7
  182. where::
  183. Column Definition
  184. ------- ----------------------------
  185. 1 Chromosome
  186. 2 Position (1-based)
  187. 3 Reference base at that position
  188. 4 Consensus bases
  189. 5 Consensus quality
  190. 6 SNP quality
  191. 7 Maximum mapping quality
  192. 8 Coverage (# reads aligning over that position)
  193. 9 Bases within reads
  194. 10 Quality values (phred33 scale, see Galaxy wiki for more)
  195. .. __: http://samtools.sourceforge.net/cns0.shtml
  196. ------
  197. **The output format**
  198. The tool modifies the input dataset in two ways:
  199. 1. It appends five columns to the end of every reported line:
  200. - Number of **A** variants
  201. - Number of **C** variants
  202. - Number of **G** variants
  203. - Number of **T** variants
  204. - Number of read bases covering this position, where quality is equal to or higher than the value set by **Do not consider read bases with quality lower than** option.
  205. Optionally, if **Print total number of differences?** is set to **Yes**, the tool will append the sixth column with the total number of deviants (see below).
  206. 2. If **Convert coordinates to intervals?** is set to **Yes**, the tool replaces the position column (typically the second column) with a pair of tab-delimited start/end values.
  207. For example, if you are calling variants with base quality above 20 on this dataset::
  208. chrM 412 A 2 ., II
  209. chrM 413 G 4 ..t, III2
  210. chrM 414 C 4 ..Ta III2
  211. chrM 415 C 4 TTTt III7
  212. you will get::
  213. chrM 413 G 4 ..t, IIIH 0 0 2 1 3
  214. chrM 414 C 4 ..Ta III2 1 1 0 1 3
  215. chrM 415 C 4 TTTt III7 0 0 0 4 4
  216. where::
  217. Column Definition
  218. ------- ----------------------------
  219. 1 Chromosome
  220. 2 Position (1-based)
  221. 3 Reference base at that position
  222. 4 Coverage (# reads aligning over that position)
  223. 5 Bases within reads where
  224. 6 Quality values (phred33 scale, see Galaxy wiki for more)
  225. 7 Number of A variants
  226. 8 Number of C variants
  227. 9 Number of G variants
  228. 10 Number of T variants
  229. 11 Quality adjusted coverage:
  230. 12 Number of read bases (i.e., # of reads) with quality above the set threshold
  231. 13 Total number of deviants (if Convert coordinates to intervals? is set to yes)
  232. if **Print total number of differences?** is set to **Yes**, you will get::
  233. chrM 413 G 4 ..t, IIIH 0 0 2 1 3 1
  234. chrM 414 C 4 ..Ta III2 1 2 0 1 3 2
  235. chrM 415 C 4 TTTt III7 0 0 0 4 4 0
  236. Note the additional column 13, that contains the number of deviant reads (e.g., there are two deviants, T and a, for position 414).
  237. Finally, if **Convert coordinates to intervals?** is set to **Yes**, you will get one additional column with the end coordinate::
  238. chrM 412 413 G 4 ..t, III2 0 0 2 1 3
  239. chrM 414 415 C 4 ..Ta III2 1 2 0 1 3
  240. chrM 414 415 C 4 TTTt III7 0 0 0 4 4
  241. where::
  242. Column Definition
  243. ------- ----------------------------
  244. 1 Chromosome
  245. 2 Start position (0-based)
  246. 3 End position (1-based)
  247. 4 Reference base at that position
  248. 5 Coverage (# reads aligning over that position)
  249. 6 Bases within reads
  250. 7 Quality values (phred33 scale, see Galaxy wiki for more)
  251. 8 Number of A variants
  252. 9 Number of C variants
  253. 10 Number of G variants
  254. 11 Number of T variants
  255. 12 Quality adjusted coverage
  256. 13 Total number of deviants (if Convert coordinates to intervals? is set to yes)
  257. Note that in this case the coordinates of SNPs were converted to intervals, where the start coordinate is 0-based and the end coordinate in 1-based using the UCSC Table Browser convention.
  258. Although three positions have variants in the original file (413, 414, and 415), only 413 and 415 are reported because the quality values associated with these two SNPs are above the threshold of 20. In the case of 414 the **a** allele has a quality value of 17 ( ord("2")-33 ), and is therefore not reported. Note that five columns have been added to each of the reported lines::
  259. chrM 413 G 4 ..t, IIIH 0 0 2 1 3
  260. Here, there is one variant, **t**. Because the fourth column represents **T** counts, it is incremented by 1. The last column shows that at this position, three reads have bases above the quality threshold of 20.
  261. -----
  262. **Example 1**: Just variants
  263. In this mode, the tool only outputs the lines from the input datasets where at least one read contains a sequence variant with quality above the threshold set by the **Do not consider read bases with quality lower than** option. For example, suppose one has a pileup dataset like the following::
  264. chrM 412 A 2 ., II
  265. chrM 413 G 4 ..t, III2
  266. chrM 414 C 4 ..Ta III2
  267. chrM 415 C 4 TTTt III7
  268. To call all variants (with no restriction by coverage) with quality above phred value of 20, we will need to set the parameters as follows:
  269. .. image:: ./static/images/pileup_parser_help1.png
  270. Running the tool with these parameters will return::
  271. chrM 413 G 4 ..t, IIIH 0 0 0 1 3
  272. chrM 414 C 4 ..Ta III2 0 2 0 1 3
  273. chrM 415 C 4 TTTt III7 0 0 0 4 4
  274. **Note** that position 414 is not reported because the *a* variant has associated quality value of 17 (because ord('2')-33 = 17) and is below the phred threshold of 20 set by the **Count variants with quality above this value** parameter.
  275. -----
  276. **Example 2**: Report everything
  277. In addition to calling variants, it is often useful to know the quality adjusted coverage. Running the tool with these parameters:
  278. .. image:: ./static/images/pileup_parser_help2.png
  279. will report everything from the original file::
  280. chrM 412 A 2 ., II 2 0 0 0 2
  281. chrM 413 G 4 ..t, III2 0 0 2 1 3
  282. chrM 414 C 4 ..Ta III2 0 2 0 1 3
  283. chrM 415 C 4 TTTt III7 0 0 0 4 4
  284. Here, you can see that although the total coverage at position 414 is 4 (column 4), the quality adjusted coverage is 3 (last column). This is because only three out of four reads have bases with quality above the set threshold of 20 (the actual qualities are III2 or, after conversion, 40, 40, 40, 17).
  285. One can use the last column of this dataset to filter out (using Galaxy's **Filter** tool) positions where quality adjusted coverage (last column) is below a set threshold.
  286. ------
  287. **Example 3**: Report everything and print total number of differences
  288. If you set the **Print total number of differences?** to **Yes** the tool will print an additional column with the total number of reads where a devinat base is above the quality threshold. So, seetiing parametrs like this:
  289. .. image:: ./static/images/pileup_parser_help3.png
  290. will produce this::
  291. chrM 412 A 2 ., II 2 0 0 0 2 0
  292. chrM 413 G 4 ..t, III2 0 0 2 1 3 1
  293. chrM 414 C 4 ..Ta III2 0 2 0 1 3 1
  294. chrM 415 C 4 TTTt III7 0 0 0 4 4 0
  295. -----
  296. **Example 4**: Report everything, print total number of differences, and ignore qualities and read bases
  297. Setting **Print quality and base string?** to **Yes** as shown here:
  298. .. image:: ./static/images/pileup_parser_help4.png
  299. will produce this::
  300. chrM 412 A 2 2 0 0 0 2 0
  301. chrM 413 G 4 0 0 2 1 3 1
  302. chrM 414 C 4 0 2 0 1 3 1
  303. chrM 415 C 4 0 0 0 4 4 0
  304. </help>
  305. </tool>