PageRenderTime 73ms CodeModel.GetById 47ms app.highlight 21ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/samtools/pileup_parser.pl

https://bitbucket.org/cistrome/cistrome-harvard/
Perl | 121 lines | 96 code | 22 blank | 3 comment | 23 complexity | 25b7d668adc30a5b81f7aa2596b92c75 MD5 | raw file
  1#! /usr/bin/perl -w
  2
  3use strict;
  4use POSIX;
  5
  6
  7die "Usage: pileup_parser.pl <in_file> <ref_base_column> <read_bases_column> <base_quality_column> <coverage column> <qv cutoff> <coverage cutoff> <SNPs only?> <output bed?> <coord_column> <out_file> <total_diff> <print_qual_bases>\n" unless @ARGV == 13;
  8
  9my $in_file = $ARGV[0];
 10my $ref_base_column = $ARGV[1]-1; # 1 based
 11my $read_bases_column = $ARGV[2]-1; # 1 based
 12my $base_quality_column = $ARGV[3]-1; # 1 based
 13my $cvrg_column = $ARGV[4]-1; # 1 based
 14my $quality_cutoff = $ARGV[5]; # phred scale integer
 15my $cvrg_cutoff = $ARGV[6]; # unsigned integer
 16my $SNPs_only = $ARGV[7]; # set to "Yes" to print only positions with SNPs; set to "No" to pring everything
 17my $bed = $ARGV[8]; #set to "Yes" to convert coordinates to bed format (0-based start, 1-based end); set to "No" to leave as is
 18my $coord_column = $ARGV[9]-1; #1 based 
 19my $out_file = $ARGV[10];
 20my $total_diff = $ARGV[11]; # set to "Yes" to print total number of deviant based
 21my $print_qual_bases = $ARGV[12]; #set to "Yes" to print quality and read base columns
 22
 23my $invalid_line_counter = 0;
 24my $first_skipped_line = "";
 25my %SNPs = ('A',0,'T',0,'C',0,'G',0);
 26my $above_qv_bases = 0;
 27my $SNPs_exist = 0;
 28my $out_string = "";
 29my $diff_count = 0;
 30
 31open (IN, "<$in_file") or die "Cannot open $in_file $!\n";
 32open (OUT, ">$out_file") or die "Cannot open $out_file $!\n";
 33
 34while (<IN>) {
 35	chop;
 36	next if m/^\#/;
 37	my @fields = split /\t/;
 38	next if $fields[ $ref_base_column ] eq "*"; # skip indel lines
 39 	my $read_bases   = $fields[ $read_bases_column ];
 40 	die "Coverage column" . ($cvrg_column+1) . " contains non-numeric values. Check your input parameters as well as format of input dataset." if ( not isdigit $fields[ $cvrg_column ] );
 41    next if $fields[ $cvrg_column ] < $cvrg_cutoff;
 42	my $base_quality = $fields[ $base_quality_column ];
 43	if ($read_bases =~ m/[\$\^\+-]/) {
 44		$read_bases =~ s/\^.//g; #removing the start of the read segement mark
 45		$read_bases =~ s/\$//g; #removing end of the read segment mark
 46		while ($read_bases =~ m/[\+-]{1}(\d+)/g) {
 47			my $indel_len = $1;
 48			$read_bases =~ s/[\+-]{1}$indel_len.{$indel_len}//; # remove indel info from read base field
 49		}
 50	}
 51	if ( length($read_bases) != length($base_quality) ) {
 52        $first_skipped_line = $. if $first_skipped_line eq "";
 53        ++$invalid_line_counter;
 54        next;
 55	}
 56	# after removing read block and indel data the length of read_base 
 57	# field should identical to the length of base_quality field
 58	
 59	my @bases = split //, $read_bases;
 60	my @qv    = split //, $base_quality;
 61	
 62	for my $base ( 0 .. @bases - 1 ) {
 63		if ( ord( $qv[ $base ] ) - 33 >= $quality_cutoff and $bases[ $base ] ne '*')
 64		{
 65			++$above_qv_bases;
 66			
 67			if ( $bases[ $base ] =~ m/[ATGC]/i )
 68			{
 69				$SNPs_exist = 1;	
 70				$SNPs{ uc( $bases[ $base ] ) } += 1;
 71				$diff_count += 1;
 72			} elsif ( $bases[ $base ] =~ m/[\.,]/ ) {
 73			    $SNPs{ uc( $fields[ $ref_base_column ] ) } += 1;
 74		    }		 	
 75		}
 76	} 
 77	
 78	if ($bed eq "Yes") {
 79	       my $start = $fields[ $coord_column ] - 1;
 80	       my $end   = $fields[ $coord_column ];
 81	       $fields[ $coord_column ] = "$start\t$end";
 82	} 
 83	
 84	if ($print_qual_bases ne "Yes") {
 85	       $fields[ $base_quality_column ] = "";
 86	       $fields[ $read_bases_column ] = "";
 87	}
 88	       
 89	
 90	$out_string = join("\t", @fields); # \t$read_bases\t$base_quality";
 91	foreach my $SNP (sort keys %SNPs) {
 92			$out_string .= "\t$SNPs{$SNP}";
 93	}
 94	
 95	if ($total_diff eq "Yes") {
 96	   $out_string .= "\t$above_qv_bases\t$diff_count\n";
 97	} else {
 98	   $out_string .= "\t$above_qv_bases\n";
 99	}	
100	
101	$out_string =~ s/\t+/\t/g;
102	
103	if ( $SNPs_only eq "Yes" ) {
104		print OUT $out_string if $SNPs_exist == 1;
105	} else {
106		print OUT $out_string;
107	}
108
109		
110	%SNPs = ();
111	%SNPs = ('A',0,'T',0,'C',0,'G',0);
112	$above_qv_bases = 0;
113	$SNPs_exist = 0;
114	$diff_count = 0;
115	
116
117}
118
119print "Skipped $invalid_line_counter invalid line(s) beginning with line $first_skipped_line\n" if $invalid_line_counter > 0;
120close IN;
121close OUT;