/tools/next_gen_conversion/bwa_solid2fastq_modified.pl
Perl | 89 lines | 77 code | 8 blank | 4 comment | 26 complexity | 8130b472aae8d09d7f96e7698dea606c MD5 | raw file
1#!/usr/bin/perl -w 2 3# Author: lh3 4# Note: Ideally, this script should be written in C. It is a bit slow at present. 5 6use strict; 7use warnings; 8use Getopt::Std; 9 10my %opts; 11my $version = '0.1.3'; 12my $usage = qq{ 13Usage: solid2fastq.pl <paired> <outfile1> <outfile2> <F3.csfasta> <F3.qual> <R3.csfasta> <R3.qual> 14 15Note: <in.title> is the string showed in the `# Title:' line of a 16 ".csfasta" read file. Then <in.title>F3.csfasta is read sequence 17 file and <in.title>F3_QV.qual is the quality file. If 18 <in.title>R3.csfasta is present, this script assumes reads are 19 paired; otherwise reads will be regarded as single-end. 20 21 The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3 22 tag and `2' for F3. Usually you may want to use short <out.prefix> 23 to save diskspace. Long <out.prefix> also causes troubles to maq. 24 25}; 26 27getopts('', \%opts); 28die($usage) if (@ARGV != 7); 29my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV; 30my (@fhr, @fhw); 31my $fn = ''; 32my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual); 33if ($is_paired eq "yes") { # paired end 34 for (0 .. 3) { 35 $fn = $fn_suff[$_]; 36 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); 37 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); 38 } 39 open($fhw[0], "|gzip >$outfile2") || die; 40 open($fhw[1], "|gzip >$outfile1") || die; 41 my (@df, @dr); 42 @df = &read1(1); @dr = &read1(2); 43 while (@df && @dr) { 44 if ($df[0] eq $dr[0]) { # mate pair 45 print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; 46 @df = &read1(1); @dr = &read1(2); 47 } 48 } 49 close($fhr[$_]) for (0 .. $#fhr); 50 close($fhw[$_]) for (0 .. $#fhw); 51} else { # single end 52 for (0 .. 1) { 53 my $fn = "$fn_suff[$_]"; 54 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); 55 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); 56 } 57 open($fhw[2], "|gzip >$outfile1") || die; 58 my @df; 59 while (@df = &read1(1, $fhr[0], $fhr[1])) { 60 print {$fhw[2]} $df[1]; 61 } 62 close($fhr[$_]) for (0 .. $#fhr); 63 close($fhw[2]); 64} 65 66sub read1 { 67 my $i = shift(@_); 68 my $j = ($i-1)<<1; 69 my ($key, $seq); 70 my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); 71 while (<$fhs>) { 72 my $t = <$fhq>; 73 if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { 74 $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines 75 #print $key; 76 die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t); 77 my $name = "$1_$2_$3/$i"; 78 $_ = substr(<$fhs>, 2); 79 tr/0123./ACGTN/; 80 my $s = $_; 81 $_ = <$fhq>; 82 s/^(\d+)\s*//; 83 s/(\d+)\s*/chr($1+33)/eg; 84 $seq = qq/\@$name\n$s+\n$_\n/; 85 last; 86 } 87 } 88 return defined($seq)? ($key, $seq) : (); 89}