/bin/apps/utils/splitace.pl

https://github.com/stuartpyoung/agua · Perl · 253 lines · 140 code · 90 blank · 23 comment · 14 complexity · 0091198e6642924dda1628392a968fb8 MD5 · raw file

  1. #!/usr/bin/perl -w
  #### DEBUG / TIMING
  #### NOTE THE START TIME (READ AGAIN AT THE END OF THE SCRIPT FOR THE
  #### RUN-TIME REPORT), THEN PAUSE FOR A FIXED NUMBER OF SECONDS BEFORE
  #### DOING ANY WORK
  my $time  = time;
  my $delay = 30;
  print "Sleeping $delay seconds...\n";
  sleep $delay;
  8. =head2
  9. APPLICATION splitace
  10. **** DUMMY EXECUTABLE TO TEST WORKFLOW ****
  11. PURPOSE
  12. 1. SPLIT A LARGE .ace FILE INTO SMALLER .ace FILES FOR EACH CONTIG
  13. 2. WRITE THE INDIVIDUAL CONTIG .ace FILE IN NUMBERED SUBDIRECTORIES
  14. BASED ON THE NUMBER OF THE CONTIG (E.G., THE .ace FILES FOR CONTIGS
  15. 1 TO 100 GO IN SUBDIRECTORY '100', THE .ace FILES FOR CONTIGS
  16. 701-800 GO IN SUBDIRECTORY '800', ETC.)
  17. INPUT
  18. 1. AN .ace FILE
  19. OUTPUT
  20. 1. MULTIPLE .ace FILES - ONE FOR EACH CONTIG IN THE INPUT .ace FILE
  21. USAGE
  22. ./splitace.pl <-i inputfile> <-o outputdir> [-h]
  23. -i inputfile : /full/path/to/inputfile
  24. -o outputdir : /full/path/to/outputdir
  25. -h help : print help info
  26. EXAMPLES
  27. ./splitace.pl -i /home/syoung/base/pipeline/run2-lane6-mtdna-velvet/data/s_6_1_sequence.ace -o /home/syoung/base/pipeline/run2-lane6-mtdna-velvet/assembly/acefiles
  28. =cut
  29. use strict;
  30. #### USE LIBRARY
  31. use FindBin qw($Bin);
  32. use lib "$Bin/../../../lib";
  33. #### INTERNAL MODULES
  34. use Timer;
  35. use Util;
  36. use Conf::Agua;
  37. #### EXTERNAL MODULES
  38. use Term::ANSIColor qw(:constants);
  39. use Data::Dumper;
  40. use Getopt::Long;
  #### MAIN SCRIPT BODY
  ####
  #### 1. PARSE AND VALIDATE THE COMMAND-LINE OPTIONS
  #### 2. READ THE INPUT .ace FILE ONE CONTIG RECORD AT A TIME
  #### 3. WRITE EACH CONTIG TO <outputdir>/<bucket>/contig.<number>.ace, WHERE
  ####    <bucket> IS THE CONTIG NUMBER ROUNDED UP TO THE NEXT MULTIPLE OF 100
  ####    (CONTIGS 1-100 -> '100', CONTIGS 701-800 -> '800')
  ####
  #### DIES ON: MISSING INPUT FILE, UNCREATABLE DIRECTORIES, A FILE THAT DOES
  #### NOT START WITH AN 'AS' HEADER, AN UNPARSEABLE 'CO' LINE, OR ANY
  #### FAILED open/close

  #### KEEP A COPY OF THE RAW ARGUMENTS (GetOptions CONSUMES @ARGV)
  my @arguments = @ARGV;

  #### GET OPTIONS
  my $inputfile;
  my $outputdir;
  my $help;
  if ( not GetOptions(
      'inputfile=s' => \$inputfile,
      'outputdir=s' => \$outputdir,
      'help'        => \$help
  ) )
  {
      #### BAD OPTIONS ARE A FAILURE, SO EXIT NON-ZERO (AS usage() DOES)
      print "Use option --help for usage instructions.\n";
      exit(1);
  }

  #### PRINT HELP
  if ( defined $help )
  {
      usage();
  }

  #### CHECK INPUTFILE
  if ( not defined $inputfile )
  {
      print "Input file not defined (option --inputfile)\n";
      usage();
  }
  if ( not -f $inputfile )
  {
      die "Could not find input file: $inputfile\n";
  }

  #### CHECK OUTPUT DIRECTORY
  if ( not defined $outputdir )
  {
      print "Output directory not defined (option --outputdir)\n";
      usage();
  }

  #### REPORT THE OPTIONS ONLY AFTER BOTH ARE KNOWN TO BE DEFINED, SO NO
  #### 'uninitialized value' WARNINGS ARE EMITTED UNDER -w
  print "Inputfile: $inputfile\n";
  print "Outputdir: $outputdir\n";

  if ( not -d $outputdir )
  {
      mkdir($outputdir) or die "Can't make directory: $outputdir: $!\n";
  }

  #### OPEN INPUT FILE (THREE-ARGUMENT FORM SO THE FILE NAME CANNOT BE
  #### MISREAD AS AN OPEN MODE)
  open(my $ace_fh, '<', $inputfile) or die "Can't open input file: $inputfile: $!\n";

  {
      #### SET RECORD DIVIDER: EACH READ RETURNS EVERYTHING UP TO AND INCLUDING
      #### THE NEXT "\nCO ". LOCALISED SO THE DEFAULT LINE-BY-LINE BEHAVIOUR IS
      #### RESTORED AT THE END OF THIS BLOCK
      local $/ = "\nCO ";

      #### CHECK FIRST RECORD TO MAKE SURE ITS AN .ace FILE
      #### (ACE HEADER LINE: 'AS <no. contigs> <no. reads>')
      my $header = <$ace_fh>;
      if ( not defined $header )
      {
          die "Input file is empty: $inputfile\n";
      }
      if ( $header !~ /^AS\s+\d+\s+\d+\s*$/ms )
      {
          die "Input file does not have .ace-format ('AS <no. contigs> <no. reads>') first line: $header";
      }

      while ( defined( my $record = <$ace_fh> ) )
      {
          #### THE RECORD STARTS WITH THE REMAINDER OF ITS 'CO' LINE:
          ####   <contig name> <no. bases> <no. reads> ...
          #### ONLY THE NAME AND THE READ COUNT ARE NEEDED BELOW
          my ($contig_name, undef, $read_count) = $record =~ /^(\S+)\s+(\d+)\s+(\d+)/;
          if ( not defined $contig_name )
          {
              my ($first_line) = $record =~ /\A([^\n]*)/;
              die "Could not parse contig ('CO') line in $inputfile: CO $first_line\n";
          }

          #### REMOVE ANY INFO AFTER THE FIRST "|" BAR IN THE CONTIG NUMBER
          my $contig_number = $contig_name;
          $contig_number =~ s/\|.+$//;

          #### REMOVE ANY LEADING NON-NUMERIC SYMBOLS IN CONTIG NUMBER
          $contig_number =~ s/^\D+//;

          #### THE LEADING DIGITS DECIDE THE SUBDIRECTORY; THE FULL (POSSIBLY
          #### SUFFIXED) VALUE IS STILL USED IN THE OUTPUT FILE NAME
          my ($contig_index) = $contig_number =~ /^(\d+)/;
          if ( not defined $contig_index )
          {
              die "Could not find a contig number in contig name: $contig_name\n";
          }

          #### ROUND UP TO THE NEXT MULTIPLE OF 100. SUBTRACTING 1 FIRST KEEPS
          #### EXACT MULTIPLES IN THEIR OWN BUCKET (100 -> '100', NOT '200').
          #### CONTIG 0 SHARES THE FIRST BUCKET
          my $bucket = $contig_index > 0
              ? int( ($contig_index - 1) / 100 ) + 1
              : 1;
          my $output_subdir = $bucket . "00";
          print "Output subdir: $output_subdir\n";

          my $output_dirpath = "$outputdir/$output_subdir";
          print "output_dirpath: $output_dirpath\n";
          if ( not -d $output_dirpath )
          {
              mkdir($output_dirpath) or die "Can't create directory: $output_dirpath: $!\n";
          }

          #### USE BACKSLASH SEPARATORS ON WINDOWS
          if ( $^O eq 'MSWin32' )
          {
              $output_dirpath =~ s/\//\\/g;
          }
          print "output_dirpath: $output_dirpath\n";

          #### SET OUTPUT FILE
          my $outputfile = "$output_dirpath/contig.$contig_number.ace";
          if ( $^O eq 'MSWin32' )
          {
              $outputfile =~ s/\//\\/g;
          }
          print "Outputfile: $outputfile\n";

          #### REMOVE ENDING 'CO' (THE TAIL OF THE RECORD DIVIDER) FROM OUTPUT
          $record =~ s/CO\s*$//;

          #### OPEN OUTPUT FILE
          open(my $out_fh, '>', $outputfile) or die "Can't open output file: $outputfile: $!\n";

          #### EACH OUTPUT FILE HOLDS EXACTLY ONE CONTIG, SO ITS HEADER IS
          #### 'AS 1 <no. reads in this contig>'
          print {$out_fh} "AS 1 $read_count\n\n";

          #### NOTE(review): "Contig" IS PREPENDED TO WHATEVER NAME THE INPUT
          #### USED. PRESUMABLY THE EXPECTED INPUT HAS BARE NUMERIC CONTIG NAMES;
          #### A NAME THAT ALREADY STARTS WITH "Contig" WOULD BE WRITTEN AS
          #### "ContigContig..." - CONFIRM AGAINST REAL INPUT
          print {$out_fh} "CO Contig";
          print {$out_fh} $record;

          #### BUFFERED WRITE ERRORS (E.G. DISK FULL) ONLY SURFACE AT close
          close($out_fh) or die "Can't close output file: $outputfile: $!\n";
      }
  }

  close($ace_fh) or die "Can't close input file: $inputfile: $!\n";

  #### PRINT RUN TIME
  my $runtime = Timer::runtime( $time, time() );
  print "Run time: $runtime\n";
  print "Completed $0\n";
  print Util::datetime(), "\n";
  print "****************************************\n\n\n";
  exit;
  134. #:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  135. # SUBROUTINES
  136. #:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  #### usage
  ####
  #### PRINT THE HELP TEXT IN GREEN TO THE CURRENTLY SELECTED OUTPUT HANDLE,
  #### RESET THE TERMINAL COLOUR, AND EXIT WITH STATUS 1.
  #### TAKES NO ARGUMENTS AND NEVER RETURNS
  sub usage
  {
      #### SINGLE-QUOTED HEREDOC: THE TEXT IS PRINTED VERBATIM, NO INTERPOLATION
      print GREEN <<'EOF';
APPLICATION splitace
**** DUMMY EXECUTABLE TO TEST WORKFLOW ****
PURPOSE
1. SPLIT A LARGE .ace FILE INTO SMALLER .ace FILES FOR EACH CONTIG
2. WRITE THE INDIVIDUAL CONTIG .ace FILE IN NUMBERED SUBDIRECTORIES
BASED ON THE NUMBER OF THE CONTIG (E.G., THE .ace FILES FOR CONTIGS
1 TO 100 GO IN SUBDIRECTORY '100', THE .ace FILES FOR CONTIGS
701-800 GO IN SUBDIRECTORY '800', ETC.)
INPUT
1. AN .ace FILE
OUTPUT
1. MULTIPLE .ace FILES - ONE FOR EACH CONTIG IN THE INPUT .ace FILE
USAGE
./splitace.pl <-i inputfile> <-o outputdir> [-h]
-i inputfile : /full/path/to/inputfile
-o outputdir : /full/path/to/outputdir
-h help : print help info
EXAMPLES
./splitace.pl -i /home/syoung/base/pipeline/run2-lane6-mtdna-velvet/data/s_6_1_sequence.ace -o /home/syoung/base/pipeline/run2-lane6-mtdna-velvet/assembly/acefiles
EOF
      print RESET;
      exit(1);
  }