/tools/next_gen_conversion/bwa_solid2fastq_modified.pl

https://bitbucket.org/h_morita_dbcls/galaxy-central · Perl · 89 lines · 77 code · 8 blank · 4 comment · 26 complexity · 8130b472aae8d09d7f96e7698dea606c MD5 · raw file

  1. #!/usr/bin/perl -w
  2. # Author: lh3
  3. # Note: Ideally, this script should be written in C. It is a bit slow at present.
  4. use strict;
  5. use warnings;
  6. use Getopt::Std;
  7. my %opts;
  8. my $version = '0.1.3';
  9. my $usage = qq{
  10. Usage: solid2fastq.pl <paired> <outfile1> <outfile2> <F3.csfasta> <F3.qual> <R3.csfasta> <R3.qual>
  11. Note: <in.title> is the string showed in the `# Title:' line of a
  12. ".csfasta" read file. Then <in.title>F3.csfasta is read sequence
  13. file and <in.title>F3_QV.qual is the quality file. If
  14. <in.title>R3.csfasta is present, this script assumes reads are
  15. paired; otherwise reads will be regarded as single-end.
  16. The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3
  17. tag and `2' for F3. Usually you may want to use short <out.prefix>
  18. to save diskspace. Long <out.prefix> also causes troubles to maq.
  19. };
  20. getopts('', \%opts);
  21. die($usage) if (@ARGV != 7);
  22. my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV;
  23. my (@fhr, @fhw);
  24. my $fn = '';
  25. my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual);
  26. if ($is_paired eq "yes") { # paired end
  27. for (0 .. 3) {
  28. $fn = $fn_suff[$_];
  29. $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
  30. open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
  31. }
  32. open($fhw[0], "|gzip >$outfile2") || die;
  33. open($fhw[1], "|gzip >$outfile1") || die;
  34. my (@df, @dr);
  35. @df = &read1(1); @dr = &read1(2);
  36. while (@df && @dr) {
  37. if ($df[0] eq $dr[0]) { # mate pair
  38. print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
  39. @df = &read1(1); @dr = &read1(2);
  40. }
  41. }
  42. close($fhr[$_]) for (0 .. $#fhr);
  43. close($fhw[$_]) for (0 .. $#fhw);
  44. } else { # single end
  45. for (0 .. 1) {
  46. my $fn = "$fn_suff[$_]";
  47. $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
  48. open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
  49. }
  50. open($fhw[2], "|gzip >$outfile1") || die;
  51. my @df;
  52. while (@df = &read1(1, $fhr[0], $fhr[1])) {
  53. print {$fhw[2]} $df[1];
  54. }
  55. close($fhr[$_]) for (0 .. $#fhr);
  56. close($fhw[2]);
  57. }
  58. sub read1 {
  59. my $i = shift(@_);
  60. my $j = ($i-1)<<1;
  61. my ($key, $seq);
  62. my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
  63. while (<$fhs>) {
  64. my $t = <$fhq>;
  65. if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
  66. $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
  67. #print $key;
  68. die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t);
  69. my $name = "$1_$2_$3/$i";
  70. $_ = substr(<$fhs>, 2);
  71. tr/0123./ACGTN/;
  72. my $s = $_;
  73. $_ = <$fhq>;
  74. s/^(\d+)\s*//;
  75. s/(\d+)\s*/chr($1+33)/eg;
  76. $seq = qq/\@$name\n$s+\n$_\n/;
  77. last;
  78. }
  79. }
  80. return defined($seq)? ($key, $seq) : ();
  81. }