PageRenderTime 78ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/variant_detection/pileup2vcf.pl

https://bitbucket.org/montella/galaxy
Perl | 334 lines | 278 code | 32 blank | 24 comment | 59 complexity | 8cbf3a5d80ec4d12108d17602c68398d MD5 | raw file
Possible License(s): CC-BY-3.0, GPL-3.0, MIT, Apache-2.0
  1. #!/usr/bin/perl -w
  2. #
  3. # VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf4.0
  4. #
  5. # Contact: pd3@sanger, revised by Eric Tycksen: etycksen@gtac.wustl.edu
  6. # Version: 2010-04-23r
  7. # Note to self: Need to add ref base + deletion for ref allele. Need to check formatting of insertions as well. Close.....very close.
  8. use strict;
  9. use warnings;
  10. use Carp;
  11. my $opts = parse_params();
  12. do_pileup_to_vcf($opts);
  13. exit;
  14. #---------------
  15. sub error
  16. {
  17. my (@msg) = @_;
  18. if ( scalar @msg ) { croak(@msg); }
  19. die
  20. "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
  21. "Options:\n",
  22. " -h, -?, --help This help message.\n",
  23. " -i, --indels-only Ignore SNPs.\n",
  24. " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n",
  25. " -R, --keep-ref Print reference alleles as well.\n",
  26. " -s, --snps-only Ignore indels.\n",
  27. " -t, --column-title <string> The column title.\n",
  28. "\n";
  29. }
  30. sub parse_params
  31. {
  32. my %opts = ();
  33. $opts{fh_in} = *STDIN;
  34. $opts{fh_out} = *STDOUT;
  35. while (my $arg=shift(@ARGV))
  36. {
  37. if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; }
  38. if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; }
  39. if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; }
  40. if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; }
  41. if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; }
  42. if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
  43. error("Unknown parameter \"$arg\". Run -h for help.\n");
  44. }
  45. return \%opts;
  46. }
  47. sub iupac_to_gtype
  48. {
  49. my ($ref,$base) = @_;
  50. my %iupac = (
  51. 'K' => ['G','T'],
  52. 'M' => ['A','C'],
  53. 'S' => ['C','G'],
  54. 'R' => ['A','G'],
  55. 'W' => ['A','T'],
  56. 'Y' => ['C','T'],
  57. 'N' => ['G','T','C','A'],
  58. );
  59. if ( !exists($iupac{$base}) )
  60. {
  61. if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); }
  62. if ( $ref eq $base ) { return ('.','0/0'); }
  63. return ($base,'1/1');
  64. }
  65. my $gt = $iupac{$base};
  66. if ( $$gt[0] eq $ref ) { return ($$gt[1],'0/1'); }
  67. elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); }
  68. return ("$$gt[0],$$gt[1]",'1/2');
  69. }
  70. sub parse_indel
  71. {
  72. my ($cons, $ref) = @_;
  73. if ( $cons=~/^-/ ) {
  74. my $p_del = $cons;
  75. my $parse_del = substr($cons,1);
  76. my $ref_del = "$ref" . "$parse_del";
  77. my $del = longest_common_prefix($ref,$ref_del);
  78. return "$del","$ref_del","$parse_del","$p_del";
  79. }
  80. elsif ( $cons=~/^\+/ ) {
  81. my $p_ins = $cons;
  82. my $parse_ins = substr($cons,1);
  83. my $ins = "$ref" . "$parse_ins";
  84. my $ref_ins = $ref;
  85. return "$ins","$ref","$parse_ins","$p_ins";
  86. }
  87. elsif ( $cons eq '*' ) { return undef; }
  88. error("FIXME: could not parse [$cons]\n");
  89. }
  90. sub longest_common_prefix {
  91. my $prefix = shift;
  92. for (@_) {
  93. chop $prefix while (! /^\Q$prefix\E/);
  94. }
  95. return $prefix;
  96. }
  97. # An example of the pileup format:
  98. # 1 3000011 C C 32 0 98 1 ^~, A
  99. # 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0
  100. # 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0
  101. # 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0
  102. #
  103. sub do_pileup_to_vcf
  104. {
  105. my ($opts) = @_;
  106. my $fh_in = $$opts{fh_in};
  107. my $fh_out = $$opts{fh_out};
  108. my ($prev_chr,$prev_pos,$prev_ref);
  109. my $refseq;
  110. my $ignore_indels = $$opts{snps_only} ? 1 : 0;
  111. my $ignore_snps = $$opts{indels_only} ? 1 : 0;
  112. my $keep_ref = $$opts{keep_ref} ? 1 : 0;
  113. my $title = exists($$opts{title}) ? $$opts{title} : 'data';
  114. print $fh_out
  115. qq[##fileformat=VCFv4.1\n],
  116. qq[##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n],
  117. qq[##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n],
  118. qq[##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n],
  119. qq[##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n],
  120. qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n]
  121. ;
  122. while (my $line=<$fh_in>)
  123. {
  124. chomp($line);
  125. my (@items) = split(/\t/,$line);
  126. if ( scalar @items<8 )
  127. {
  128. error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n");
  129. }
  130. my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items;
  131. $ref = uc($ref);
  132. $cons = uc($cons);
  133. my ($alt,$gt);
  134. if ( $ref eq '*' )
  135. {
  136. # An indel is involved.
  137. if ( $ignore_indels )
  138. {
  139. $prev_ref = $ref;
  140. $prev_pos = $pos;
  141. $prev_chr = $chr;
  142. next;
  143. }
  144. if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos)
  145. {
  146. if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
  147. if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); }
  148. $ref = $refseq->get_base($chr,$pos);
  149. $ref = uc($ref);
  150. }
  151. else { $ref = $prev_ref; }
  152. # One of the alleles can be a reference and it can come in arbitrary order. In some
  153. # cases */* can be encountered. In such a case, look in the additional columns.
  154. my ($al1,$al2) = split('/', $cons);
  155. if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; }
  156. my ($alt1,$indel_ref_1,$indel_parse_1,$pileup_1) = parse_indel($al1,$ref);
  157. my ($alt2,$indel_ref_2,$indel_parse_2,$pileup_2) = parse_indel($al2,$ref);
  158. $alt1 = uc($alt1);
  159. $alt2 = uc($alt2);
  160. $indel_ref_1 = uc($indel_ref_1);
  161. $indel_ref_2 = uc($indel_ref_2);
  162. if ( !$alt1 && !$alt2 )
  163. {
  164. warn("FIXME: could not parse indel:\n", $line);
  165. next;
  166. }
  167. if ( !$pileup_1 )
  168. {
  169. $alt=$alt2;
  170. $ref=$indel_ref_2;
  171. $gt='0/1';
  172. }
  173. elsif ( !$pileup_2 )
  174. {
  175. $alt=$alt1;
  176. $ref=$indel_ref_1;
  177. $gt='0/1';
  178. }
  179. elsif ( $pileup_1 eq $pileup_2 )
  180. {
  181. $alt="$alt1";
  182. $ref=$indel_ref_1;
  183. $gt='1/1';
  184. }
  185. elsif (($pileup_1=~/^\+/) && ($pileup_2=~/^\+/))
  186. { #In the case of insertions, indel_parse returns the correct values#
  187. $alt = "$alt1,$alt2";
  188. $ref = $indel_ref_1;
  189. $gt ='1/2';
  190. }
  191. elsif (($pileup_1=~/^-/) && ($pileup_2=~/^-/))
  192. { #In the case of heterozygous deletions, must make subsititions to determine the alternate alleles#
  193. my %hash = ($indel_ref_1, length($indel_ref_1), $indel_ref_2, length($indel_ref_2));
  194. my $ref_length=$hash{$indel_ref_1} >= $hash{$indel_ref_2} ? $hash{$indel_ref_1} : $hash{$indel_ref_2};
  195. my %rev_hash = reverse %hash;
  196. $ref=$rev_hash{$ref_length};
  197. my $del_1 = $ref;
  198. my $del_2 = $ref;
  199. $del_1 =~ s/$indel_parse_1//;
  200. $del_2 =~ s/$indel_parse_2//;
  201. $alt = "$del_1,$del_2";
  202. $gt = '1/2';
  203. }
  204. elsif (($pileup_1=~/^\+/) && ($pileup_2=~/^-/))
  205. { #In the case of heterozygous indels (ins/del), reference is the deletion from pileup#
  206. $ref = $indel_ref_2;
  207. my $del_2 = $ref;
  208. $del_2 =~ s/$indel_parse_2//;
  209. $alt = "$alt1,$del_2";
  210. $gt = '1/2';
  211. }
  212. elsif (($pileup_1=~/^-/) && ($pileup_2=~/^\+/))
  213. { #In the case of heterozygous indels (del,ins), reference is the deletion from pileup#
  214. $ref = $indel_ref_1;
  215. my $del_1 =$ref;
  216. $del_1 =~ s/$indel_parse_1//;
  217. $alt = "$del_1,$alt2";
  218. $gt = '1/2';
  219. }
  220. else
  221. {
  222. warn("FIXME: could not parse indel:\n", $line);
  223. next;
  224. }
  225. }
  226. else
  227. {
  228. if ( $ignore_snps || (!$keep_ref && $ref eq $cons) )
  229. {
  230. $prev_ref = $ref;
  231. $prev_pos = $pos;
  232. $prev_chr = $chr;
  233. next;
  234. }
  235. # SNP
  236. ($alt,$gt) = iupac_to_gtype($ref,$cons);
  237. }
  238. print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\tPASS\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";
  239. $prev_ref = $ref;
  240. $prev_pos = $pos;
  241. $prev_chr = $chr;
  242. }
  243. }
  244. #------------- Fasta --------------------
  245. #
  246. # Uses samtools to get a requested base from a fasta file. For efficiency, preloads
  247. # a chunk to memory. The size of the cached sequence can be controlled by the 'size'
  248. # parameter.
  249. #
  250. package Fasta;
  251. use strict;
  252. use warnings;
  253. use Carp;
  254. sub Fasta::new
  255. {
  256. my ($class,@args) = @_;
  257. my $self = {@args};
  258. bless $self, ref($class) || $class;
  259. if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); }
  260. $$self{chr} = undef;
  261. $$self{from} = undef;
  262. $$self{to} = undef;
  263. if ( !$$self{size} ) { $$self{size}=10_000_000; }
  264. bless $self, ref($class) || $class;
  265. return $self;
  266. }
  267. sub read_chunk
  268. {
  269. my ($self,$chr,$pos) = @_;
  270. my $to = $pos + $$self{size};
  271. my $cmd = "samtools faidx $$self{file} $chr:$pos-$to";
  272. my @out = `$cmd`;
  273. if ( $? ) { $self->throw("$cmd: $!"); }
  274. my $line = shift(@out);
  275. if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
  276. $$self{chr} = $chr;
  277. $$self{from} = $1;
  278. $$self{to} = $2;
  279. my $chunk = '';
  280. while ($line=shift(@out))
  281. {
  282. chomp($line);
  283. $chunk .= $line;
  284. }
  285. $$self{chunk} = $chunk;
  286. return;
  287. }
  288. sub get_base
  289. {
  290. my ($self,$chr,$pos) = @_;
  291. if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
  292. {
  293. $self->read_chunk($chr,$pos);
  294. }
  295. my $idx = $pos - $$self{from};
  296. return substr($$self{chunk},$idx,1);
  297. }
  298. sub throw
  299. {
  300. my ($self,@msg) = @_;
  301. croak(@msg);
  302. }