/tools/samtools/pileup_parser.pl

https://bitbucket.org/cistrome/cistrome-harvard/ · Perl · 121 lines · 96 code · 22 blank · 3 comment · 23 complexity · 25b7d668adc30a5b81f7aa2596b92c75 MD5 · raw file

  1. #! /usr/bin/perl -w
  2. use strict;
  3. use POSIX;
  4. die "Usage: pileup_parser.pl <in_file> <ref_base_column> <read_bases_column> <base_quality_column> <coverage column> <qv cutoff> <coverage cutoff> <SNPs only?> <output bed?> <coord_column> <out_file> <total_diff> <print_qual_bases>\n" unless @ARGV == 13;
  5. my $in_file = $ARGV[0];
  6. my $ref_base_column = $ARGV[1]-1; # 1 based
  7. my $read_bases_column = $ARGV[2]-1; # 1 based
  8. my $base_quality_column = $ARGV[3]-1; # 1 based
  9. my $cvrg_column = $ARGV[4]-1; # 1 based
  10. my $quality_cutoff = $ARGV[5]; # phred scale integer
  11. my $cvrg_cutoff = $ARGV[6]; # unsigned integer
  12. my $SNPs_only = $ARGV[7]; # set to "Yes" to print only positions with SNPs; set to "No" to pring everything
  13. my $bed = $ARGV[8]; #set to "Yes" to convert coordinates to bed format (0-based start, 1-based end); set to "No" to leave as is
  14. my $coord_column = $ARGV[9]-1; #1 based
  15. my $out_file = $ARGV[10];
  16. my $total_diff = $ARGV[11]; # set to "Yes" to print total number of deviant based
  17. my $print_qual_bases = $ARGV[12]; #set to "Yes" to print quality and read base columns
  18. my $invalid_line_counter = 0;
  19. my $first_skipped_line = "";
  20. my %SNPs = ('A',0,'T',0,'C',0,'G',0);
  21. my $above_qv_bases = 0;
  22. my $SNPs_exist = 0;
  23. my $out_string = "";
  24. my $diff_count = 0;
  25. open (IN, "<$in_file") or die "Cannot open $in_file $!\n";
  26. open (OUT, ">$out_file") or die "Cannot open $out_file $!\n";
  27. while (<IN>) {
  28. chop;
  29. next if m/^\#/;
  30. my @fields = split /\t/;
  31. next if $fields[ $ref_base_column ] eq "*"; # skip indel lines
  32. my $read_bases = $fields[ $read_bases_column ];
  33. die "Coverage column" . ($cvrg_column+1) . " contains non-numeric values. Check your input parameters as well as format of input dataset." if ( not isdigit $fields[ $cvrg_column ] );
  34. next if $fields[ $cvrg_column ] < $cvrg_cutoff;
  35. my $base_quality = $fields[ $base_quality_column ];
  36. if ($read_bases =~ m/[\$\^\+-]/) {
  37. $read_bases =~ s/\^.//g; #removing the start of the read segement mark
  38. $read_bases =~ s/\$//g; #removing end of the read segment mark
  39. while ($read_bases =~ m/[\+-]{1}(\d+)/g) {
  40. my $indel_len = $1;
  41. $read_bases =~ s/[\+-]{1}$indel_len.{$indel_len}//; # remove indel info from read base field
  42. }
  43. }
  44. if ( length($read_bases) != length($base_quality) ) {
  45. $first_skipped_line = $. if $first_skipped_line eq "";
  46. ++$invalid_line_counter;
  47. next;
  48. }
  49. # after removing read block and indel data the length of read_base
  50. # field should identical to the length of base_quality field
  51. my @bases = split //, $read_bases;
  52. my @qv = split //, $base_quality;
  53. for my $base ( 0 .. @bases - 1 ) {
  54. if ( ord( $qv[ $base ] ) - 33 >= $quality_cutoff and $bases[ $base ] ne '*')
  55. {
  56. ++$above_qv_bases;
  57. if ( $bases[ $base ] =~ m/[ATGC]/i )
  58. {
  59. $SNPs_exist = 1;
  60. $SNPs{ uc( $bases[ $base ] ) } += 1;
  61. $diff_count += 1;
  62. } elsif ( $bases[ $base ] =~ m/[\.,]/ ) {
  63. $SNPs{ uc( $fields[ $ref_base_column ] ) } += 1;
  64. }
  65. }
  66. }
  67. if ($bed eq "Yes") {
  68. my $start = $fields[ $coord_column ] - 1;
  69. my $end = $fields[ $coord_column ];
  70. $fields[ $coord_column ] = "$start\t$end";
  71. }
  72. if ($print_qual_bases ne "Yes") {
  73. $fields[ $base_quality_column ] = "";
  74. $fields[ $read_bases_column ] = "";
  75. }
  76. $out_string = join("\t", @fields); # \t$read_bases\t$base_quality";
  77. foreach my $SNP (sort keys %SNPs) {
  78. $out_string .= "\t$SNPs{$SNP}";
  79. }
  80. if ($total_diff eq "Yes") {
  81. $out_string .= "\t$above_qv_bases\t$diff_count\n";
  82. } else {
  83. $out_string .= "\t$above_qv_bases\n";
  84. }
  85. $out_string =~ s/\t+/\t/g;
  86. if ( $SNPs_only eq "Yes" ) {
  87. print OUT $out_string if $SNPs_exist == 1;
  88. } else {
  89. print OUT $out_string;
  90. }
  91. %SNPs = ();
  92. %SNPs = ('A',0,'T',0,'C',0,'G',0);
  93. $above_qv_bases = 0;
  94. $SNPs_exist = 0;
  95. $diff_count = 0;
  96. }
  97. print "Skipped $invalid_line_counter invalid line(s) beginning with line $first_skipped_line\n" if $invalid_line_counter > 0;
  98. close IN;
  99. close OUT;