PageRenderTime 58ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/RADpools

https://github.com/johnomics/RADtools
Perl | 848 lines | 612 code | 152 blank | 84 comment | 65 complexity | a68a18b9e8e847b94aeb791fa043c2fb MD5 | raw file
Possible License(s): GPL-3.0
  1. #!/usr/bin/env perl
  2. # RADpools
  3. # History:
  4. # 16/05/10 First public version
  5. # 02/07/10 Changed defaults to avoid trimming reads
  6. # 18/07/10 Added MID length parameter and changed default to reject zombies,
  7. # not accept them
  8. # 03/08/10 Incorporated into RADtools
  9. # 05/08/10 Run through perlcritic, tidied up options, added POD,
  10. # removed benchmarking, added main() for testing purposes
  11. # 20/08/10 Added sorting the read pools using Parallel::ForkManager
  12. # 25/08/10 Catch absence of Parallel::ForkManager
  13. # Rework trim and quality options, remove mid_length and
  14. # minimum_sequence_length options, rename zombies to fuzzy_MIDs
  15. # 25/08/10 Version 1.0
  16. # 08/09/10 1.0.1 Minor bug fixes
  17. # 29/10/10 1.0.2 Accept at most one N in restriction enzyme, MID;
  18. # Don't use species name in output filename
  19. # 30/10/10 1.1 Change species argument to directory argument
  20. # Output restriction site sequence
  21. # Add option to output FASTQ format
  22. # Load and output fastq-sanger qualities
  23. # 4/11/10 1.1.1 Improved robustness following implementation of test suite
  24. # 16/02/11 1.2 New RADmarkers output; no changes to RADpools
  25. # 15/05/11 1.2.1 Clarify verbose output
  26. # 20/07/11 1.2.2 Allow Q=41 quality scores following base caller upgrade
  27. # 12/09/11 1.2.3 If P2 MIDs given, use them to sort reads
  28. # 07/02/12 1.2.4 Allow enzymes with >1 ambiguous base such as SgrAI
  29. #############################################################################
  30. ###
  31. ### PREAMBLE
  32. ###
  33. #############################################################################
  34. # Core pragmas and modules
  35. use strict;
  36. use warnings;
  37. use English qw( -no_match_vars );
  38. use Getopt::Long qw( :config bundling no_auto_abbrev auto_version );
  39. use Pod::Usage;
  40. use File::Basename;
  41. use File::Copy;
  42. use Cwd;
  43. use RADtools qw(load_fastq_pair sort_reads_file QUAL_OFFSET);
  44. # Non-core modules
  45. eval {
  46. require Parallel::ForkManager;
  47. Parallel::ForkManager->import();
  48. };
  49. die
  50. "RADpools requires the CPAN module Parallel::ForkManager. Please install this package and add it to your Perl library path.\n"
  51. if $@;
  52. # Would like to use Carp, but an outstanding bug causes cryptic errors
  53. # when using caller(), so using die until this is fixed
  54. # http://www.nntp.perl.org/group/perl.perl5.porters/2010/03/msg157461.html
  55. local $main::VERSION = 1.2.4; # Used by Getopt::Long to provide --version
  56. local $OUTPUT_AUTOFLUSH = 1; # So reporting on progress works
  57. main(@ARGV) unless caller(); # So test suite can call script
  58. sub main {
  59. # Set up default options
  60. my $help = 0;
  61. my $usage = 0;
  62. my $man = 0;
  63. my $verbose = 0;
  64. my $read1_file = q{};
  65. my $read2_file = q{};
  66. my $directory = q{};
  67. my $max_processes = 1;
  68. my $sanger = 0;
  69. my $output_fastq = 0;
  70. my $res_site = 'TGCAGG'; #SbfI
  71. my $quality_threshold = 0;
  72. my $trim = 0;
  73. my $fuzzy_MIDs = 0;
  74. # Load user options
  75. my $options_okay = GetOptions(
  76. 'help|h' => \$help,
  77. 'usage|u' => \$usage,
  78. 'man' => \$man,
  79. 'verbose|v' => \$verbose,
  80. 'in|i=s' => \$read1_file,
  81. 'paired|p=s' => \$read2_file,
  82. 'directory|d=s' => \$directory,
  83. 'max_processes|m=i' => \$max_processes,
  84. 'sanger|s' => \$sanger,
  85. 'output_fastq|o' => \$output_fastq,
  86. 'enzyme|e=s' => \$res_site,
  87. 'quality|q=i' => \$quality_threshold,
  88. 'trim|t=i' => \$trim,
  89. 'fuzzy_MIDs|f' => \$fuzzy_MIDs,
  90. ) or pod2usage( -exitval => 3, -verbose => 0 );
  91. pod2usage( -exitval => 4, -verbose => 0 ) if $usage;
  92. pod2usage( -exitval => 5, -verbose => 1 ) if $help;
  93. pod2usage( -exitval => 6, -verbose => 2 ) if $man;
  94. # Validate options
  95. pod2usage( -exitval => 7, -verbose => 0 ) if ( $read1_file eq q{} );
  96. pod2usage( -exitval => 8, -verbose => 0 ) if ( $directory eq q{} );
  97. die "Quality threshold must be between 0 and 40\n"
  98. if ( ( $quality_threshold < 0 ) || ( $quality_threshold > 40 ) );
  99. die "Restriction enzyme must contain only IUPAC codes\n"
  100. if ( $res_site =~ /[^ACGTRYSWKMBDHVN]/ );
  101. # If directory is ., get basename of current directory
  102. if ( $directory eq '.' ) {
  103. $directory = basename(getcwd);
  104. }
  105. # Sugar: strip .pools from directory argument if it has been added
  106. $directory =~ s/\.pools$//;
  107. #############################################################################
  108. ###
  109. ### LOAD POOLS AND MIDS
  110. ###
  111. #############################################################################
  112. my $res_site_length = length $res_site;
  113. my $p1mid_length = 0;
  114. my $p2mid_length = 0;
  115. my %mid_pools = ();
  116. my %pool_handles = ();
  117. load_pools_and_mids( \%mid_pools, $directory, $verbose, $fuzzy_MIDs );
  118. my $results_ref =
  119. open_read_files( \%mid_pools, \%pool_handles, $directory, $fuzzy_MIDs );
  120. $p1mid_length = $results_ref->{p1mid_length};
  121. $p2mid_length = $results_ref->{p2mid_length};
  122. # Make fuzzy list of restriction enzyme sites
  123. my %fuzzy_res_sites = ();
  124. my @fuzzy_res_list = ();
  125. make_fuzzy_seqs( $res_site, \@fuzzy_res_list );
  126. map { $fuzzy_res_sites{$_} = $res_site; } @fuzzy_res_list;
  127. #############################################################################
  128. ###
  129. ### LOAD FASTQ RECORDS
  130. ###
  131. #############################################################################
  132. if ($verbose) { print "Loading RAD reads...\n"; }
  133. # Initialise sequence, quality and RAD variables
  134. my $quality_tail = q{};
  135. my $sequence = q{};
  136. my $record_count_total = 0;
  137. my $record_count_valid = 0;
  138. # Set up quality thresholds and characters
  139. my $qual_low_char = ';';
  140. my $qual_high_char = 'i';
  141. if ($sanger) {
  142. $qual_low_char = '!';
  143. $qual_high_char = 'J';
  144. }
  145. # Open RAD sequence file
  146. open my $read1_in, '<', $read1_file
  147. or die "Couldn't open '$read1_file': $OS_ERROR";
  148. my $read2_in;
  149. if ( $read2_file ne q{} ) {
  150. open $read2_in, '<', $read2_file
  151. or die "Couldn't open '$read2_file': $OS_ERROR";
  152. }
  153. # Output invalid/unclustered sequences
  154. my $results_file_stem = "./$directory/$directory\_" . get_timestamp();
  155. my $invalid_file = "$results_file_stem\_invalid.txt";
  156. open my $invalid_handle, '>', $invalid_file
  157. or die "Can't open $invalid_file: $OS_ERROR";
  158. print {$invalid_handle} "Input files: $read1_file"
  159. or die "Can't write to $invalid_file: $OS_ERROR\n";
  160. if ( $read2_file ne q{} ) { print {$invalid_handle} ", $read2_file"; }
  161. print {$invalid_handle}
  162. "; Directory: $directory, Enzyme site: $res_site; Trim length: $trim; Quality threshold: $quality_threshold; Processes: $max_processes; Sanger? ";
  163. if ($sanger) { print {$invalid_handle} "Yes;" }
  164. else { print {$invalid_handle} "No;" }
  165. print {$invalid_handle} " Fuzzy MIDs? ";
  166. if ($fuzzy_MIDs) { print {$invalid_handle} "Yes;" }
  167. else { print {$invalid_handle} "No;" }
  168. print {$invalid_handle} "\n";
  169. # Load RAD sequences into hash of pools with counts for identical sequences
  170. my $non_fastq_count = 0;
  171. my $wrong_rad_format_count = 0;
  172. my $wrong_ressite_count = 0;
  173. my $wrong_p1mid_count = 0;
  174. my $wrong_p2mid_count = 0;
  175. my $short_sequence_count = 0;
  176. LOAD_ONE_SEQUENCE:
  177. while (
  178. my $pair_ref = load_fastq_pair(
  179. {
  180. read1_handle => $read1_in,
  181. read2_handle => $read2_in,
  182. qual_low_char => $qual_low_char,
  183. qual_high_char => $qual_high_char,
  184. illumina => !$sanger,
  185. }
  186. )
  187. )
  188. {
  189. last if ( $pair_ref->{valid} == -1 );
  190. # Update progress meter
  191. $record_count_total++;
  192. if ( ($verbose) && ( $record_count_total % 100_000 == 0 ) ) {
  193. printf
  194. "OK: %8d | Rejected %8d (FASTQ: %8d; RAD: %8d; Res: %8d; P1 MID: %8d; P2 MID: %8d; Short: %8d)\n",
  195. $record_count_valid,
  196. $non_fastq_count +
  197. $wrong_rad_format_count +
  198. $wrong_ressite_count +
  199. $wrong_p1mid_count +
  200. $wrong_p2mid_count +
  201. $short_sequence_count,
  202. $non_fastq_count,
  203. $wrong_rad_format_count,
  204. $wrong_ressite_count, $wrong_p1mid_count, $wrong_p2mid_count,
  205. $short_sequence_count;
  206. }
  207. if ( !$pair_ref->{valid} ) {
  208. print {$invalid_handle}
  209. "Invalid FASTQ record: $pair_ref->{r1name} $pair_ref->{r1seq} $pair_ref->{r1qual}\n"
  210. or die "Can't write to $invalid_file: $OS_ERROR\n";
  211. $non_fastq_count++;
  212. next LOAD_ONE_SEQUENCE;
  213. }
  214. # Set trim to length of RAD sequence if not previously set
  215. my $tag_length = ( length $pair_ref->{r1seq} ) - $p1mid_length;
  216. if ( ( $trim <= 0 ) or ( $trim > $tag_length ) ) {
  217. $trim = $tag_length;
  218. }
  219. # Find any point where Q<threshold and
  220. # strip off sequence following this point
  221. my $qual_threshold_char = chr( $quality_threshold + QUAL_OFFSET );
  222. if ( $pair_ref->{r1qual} =~ m{(\A[$qual_threshold_char-J]+)}xms ) {
  223. $quality_tail = $1;
  224. }
  225. else {
  226. $quality_tail = q{};
  227. }
  228. # Get substring of sequence of length quality_substring
  229. $sequence = substr( $pair_ref->{r1seq}, 0, length $quality_tail );
  230. my $p1mid = q{}; # Empty string
  231. my $seq_res_site = q{};
  232. my $dna_sequence = q{};
  233. # Find restriction enzyme site; if not found, skip sequence
  234. if (
  235. $sequence =~ m{^([ACGTN]{$p1mid_length})
  236. ([ACGTN]{$res_site_length})
  237. ([ACGTN\.]+)$}xms
  238. )
  239. {
  240. $p1mid = $1;
  241. $seq_res_site = $2;
  242. $dna_sequence = $2 . $3;
  243. }
  244. else {
  245. print {$invalid_handle}
  246. "Sequence is not a valid RAD sequence : $pair_ref->{r1seq} $pair_ref->{r1qual} $sequence\n"
  247. or die "Can't write to $invalid_file: $OS_ERROR\n";
  248. $wrong_rad_format_count++;
  249. next LOAD_ONE_SEQUENCE;
  250. }
  251. if ( !( exists $fuzzy_res_sites{$seq_res_site} ) ) {
  252. print {$invalid_handle} 'Restriction enzyme site does not match'
  253. . " : $pair_ref->{r1seq} $pair_ref->{r1qual} "
  254. . "$seq_res_site $dna_sequence\n"
  255. or die "Can't write to $invalid_file: $OS_ERROR\n";
  256. $wrong_ressite_count++;
  257. next LOAD_ONE_SEQUENCE;
  258. }
  259. # Find pool for this MID; if no pool, make pool for this MID alone
  260. if ( !( exists $mid_pools{$p1mid} ) ) {
  261. # If MID not found, reject read
  262. print {$invalid_handle} 'P1 MID is not a valid MID'
  263. . " : $pair_ref->{r1seq} $pair_ref->{r1qual} "
  264. . "$p1mid $sequence\n"
  265. or die "Can't write to $invalid_file: $OS_ERROR\n";
  266. $wrong_p1mid_count++;
  267. next LOAD_ONE_SEQUENCE;
  268. }
  269. my $p2mid = substr( $pair_ref->{r2seq}, 0, $p2mid_length );
  270. if ( !( exists $mid_pools{$p1mid}{$p2mid} ) ) {
  271. # If MID not found, reject read
  272. print {$invalid_handle}
  273. 'P2 MID is not a valid MID or does not match P1 MID'
  274. . " : $pair_ref->{r2seq} $pair_ref->{r2qual} "
  275. . "P1:$p1mid P2:$p2mid\n"
  276. or die "Can't write to $invalid_file: $OS_ERROR\n";
  277. $wrong_p2mid_count++;
  278. next LOAD_ONE_SEQUENCE;
  279. }
  280. my $pool = $mid_pools{$p1mid}{$p2mid};
  281. # Strip P2 MID from read 2 sequence and quality
  282. $pair_ref->{r2seq} = substr( $pair_ref->{r2seq}, $p2mid_length );
  283. $pair_ref->{r2qual} = substr( $pair_ref->{r2qual}, $p2mid_length );
  284. # Increment the number of times this sequence has occurred
  285. if ( length($dna_sequence) >= $trim ) {
  286. # Remove MID from read 1 quality string
  287. if ( $quality_tail =~ m{^(.{$p1mid_length})(.+)$}xms ) {
  288. $quality_tail = $2;
  289. }
  290. $dna_sequence = substr $dna_sequence, 0, $trim;
  291. $quality_tail = substr $quality_tail, 0, $trim;
  292. if ( $output_fastq && ( $pair_ref->{r1name} ne q{} ) ) {
  293. print { $pool_handles{$pool} } "$pair_ref->{r1name} ";
  294. }
  295. print { $pool_handles{$pool} } "$dna_sequence $quality_tail"
  296. or die "Can't write to pools file: $OS_ERROR\n";
  297. if ( $read2_file ne q{} ) {
  298. if ( $output_fastq && ( $pair_ref->{r2name} ne q{} ) ) {
  299. print { $pool_handles{$pool} } " $pair_ref->{r2name}";
  300. }
  301. print { $pool_handles{$pool} }
  302. " $pair_ref->{r2seq} $pair_ref->{r2qual}";
  303. }
  304. print { $pool_handles{$pool} } "\n";
  305. $record_count_valid++;
  306. }
  307. # Otherwise, sequence is too short; throw it away
  308. else {
  309. print {$invalid_handle} 'Sequence shorter than '
  310. . $trim
  311. . ' base pairs : '
  312. . "$pair_ref->{r1seq} $pair_ref->{r1qual} $dna_sequence\n"
  313. or die "Can't write to $invalid_file: $OS_ERROR\n";
  314. $short_sequence_count++;
  315. }
  316. }
  317. close $read1_in or die "Can't close $read1_file: $OS_ERROR\n";
  318. if ( $read2_file ne q{} ) {
  319. close $read2_in or die "Can't close $read2_file: $OS_ERROR\n";
  320. }
  321. foreach my $pool ( keys %pool_handles ) {
  322. close $pool_handles{$pool}
  323. or die "Can't close pool files: $OS_ERROR\n";
  324. }
  325. if ($verbose) { print "Sorting pooled reads...\n"; }
  326. my @pool_ids = sort keys %pool_handles;
  327. my $pm = new Parallel::ForkManager( scalar(@pool_ids) );
  328. $pm->set_max_procs($max_processes);
  329. foreach my $pool_id (@pool_ids) {
  330. $pm->start and next;
  331. my $read_filename = "./$directory/$pool_id\.reads";
  332. if ($verbose) { print "Sorting $read_filename...\n"; }
  333. sort_reads_file(
  334. {
  335. directory => $directory,
  336. read_filename => $read_filename,
  337. sort_start => $res_site_length + 1,
  338. read1_field => $output_fastq ? 2 : 1,
  339. }
  340. );
  341. # Convert reads to FASTQ
  342. if ($output_fastq) {
  343. if ($verbose) {
  344. print "Converting $pool_id to FASTQ format...\n";
  345. }
  346. my $fastq1_file;
  347. my $fastq2_file;
  348. if ( $read2_file ne q{} ) {
  349. open $fastq1_file, '>', "./$directory/$pool_id\_1\.fastq";
  350. open $fastq2_file, '>', "./$directory/$pool_id\_2\.fastq";
  351. }
  352. else {
  353. open $fastq1_file, '>', "./$directory/$pool_id\.fastq";
  354. }
  355. open my $read_file, '<', $read_filename;
  356. while ( my $read_line = <$read_file> ) {
  357. chomp $read_line;
  358. my @read_fields = split / /, $read_line;
  359. print $fastq1_file
  360. "\@$read_fields[0]\n$read_fields[1]\n\+$read_fields[0]\n$read_fields[2]\n";
  361. if ( $read2_file ne q{} ) {
  362. print $fastq2_file
  363. "\@$read_fields[3]\n$read_fields[4]\n\+$read_fields[3]\n$read_fields[5]\n";
  364. }
  365. }
  366. close $read_file;
  367. if ( $read2_file ne q{} ) { close $fastq2_file; }
  368. close $fastq1_file;
  369. unlink($read_filename);
  370. }
  371. $pm->finish;
  372. }
  373. $pm->wait_all_children;
  374. print {$invalid_handle}
  375. "$record_count_valid records loaded, but discarded \n"
  376. . "$non_fastq_count records not in FASTQ format, \n"
  377. . "$wrong_rad_format_count records not in RAD format, \n"
  378. . "$wrong_ressite_count records not matching restriction enzyme site, \n"
  379. . "$wrong_p1mid_count records not matching any listed P1 MIDs, \n"
  380. . "$wrong_p2mid_count records not matching any listed P2 MIDs, and \n"
  381. . "$short_sequence_count records with high-quality sequences less than $trim bp long\n"
  382. or die "Can't write to $invalid_file: $OS_ERROR\n";
  383. close $invalid_handle or die "Can't close $invalid_file: $OS_ERROR\n";
  384. # Output records loaded and discarded
  385. if ($verbose) {
  386. print "$record_count_valid records loaded, but discarded \n"
  387. . "$non_fastq_count records not in FASTQ format, \n"
  388. . "$wrong_rad_format_count records not in RAD format, \n"
  389. . "$wrong_ressite_count records not matching restriction enzyme site, \n"
  390. . "$wrong_p1mid_count records not matching any listed P1 MIDs, \n"
  391. . "$wrong_p2mid_count records not matching any listed P2 MIDs, and \n"
  392. . "$short_sequence_count records with high-quality sequences less than $trim bp long\n"
  393. . "Done\n";
  394. }
  395. return;
  396. }
  397. #############################################################################
  398. ###
  399. ### SUBROUTINES
  400. ###
  401. #############################################################################
  402. sub make_fuzzy_seqs {
  403. my ( $seq, $fuzzy_list_ref ) = @_;
  404. my $seq_length = length $seq;
  405. # Unpack IUPAC bases
  406. my @unpacked_seqs;
  407. if ($seq =~ /[RYSWKMBDHVN]/) {
  408. my @bases = split //, $seq;
  409. my $iupac_stubs_ref = get_unpacked_seqs(\@bases);
  410. @unpacked_seqs = @{$iupac_stubs_ref};
  411. }
  412. else {
  413. push @unpacked_seqs, $seq;
  414. }
  415. for my $unpacked_seq (@unpacked_seqs) {
  416. for my $i ( 1 .. $seq_length ) {
  417. for my $base (qw{A C G T N}) {
  418. my $fuzzyseq = $unpacked_seq;
  419. my $prebase_i = $i - 1;
  420. my $postbase_i = $seq_length - $i;
  421. $fuzzyseq =~ s{^([ACGT]{$prebase_i})
  422. ([ACGT])
  423. ([ACGT]{$postbase_i})$}
  424. {$1$base$3}xms;
  425. push @{$fuzzy_list_ref}, $fuzzyseq;
  426. }
  427. }
  428. }
  429. return;
  430. }
  431. sub get_unpacked_seqs {
  432. my ($bases_ref) = @_;
  433. my $base1 = shift @{$bases_ref};
  434. my @iupac_stubs;
  435. @iupac_stubs = @{$bases_ref} > 0 ? @{get_unpacked_seqs($bases_ref)} : ('');
  436. # IUPAC code constants
  437. my %iupac_codes = (
  438. 'A' => 'A',
  439. 'C' => 'C',
  440. 'G' => 'G',
  441. 'T' => 'T',
  442. 'R' => 'AG',
  443. 'Y' => 'CT',
  444. 'S' => 'GC',
  445. 'W' => 'AT',
  446. 'K' => 'GT',
  447. 'M' => 'AC',
  448. 'B' => 'CGT',
  449. 'D' => 'AGT',
  450. 'H' => 'ACT',
  451. 'V' => 'ACG',
  452. 'N' => 'ACGT',
  453. );
  454. my $iupac_string = $iupac_codes{$base1};
  455. my @unpacked_seqs;
  456. for my $iupac_base (split //, $iupac_string) {
  457. for my $iupac_stub (@iupac_stubs) {
  458. push @unpacked_seqs, $iupac_base . $iupac_stub;
  459. }
  460. }
  461. return \@unpacked_seqs;
  462. }
  463. sub load_pools_and_mids {
  464. my ( $mid_pools_ref, $directory, $verbose, $fuzzy_MIDs ) = @_;
  465. if ( $directory ne q{} ) {
  466. # Load pools from file into an array
  467. my $pool_file = $directory . '.pools';
  468. open my $poolhandle, '<', $pool_file
  469. or die "Couldn't open '$pool_file': $OS_ERROR\n";
  470. if ($verbose) { print "Loading pools from $pool_file...\n"; }
  471. my @pools = <$poolhandle>;
  472. close $poolhandle or die "Can't close $pool_file: $OS_ERROR\n";
  473. if ( !-d "./$directory" ) {
  474. mkdir "./$directory"
  475. or die
  476. "Can't create directory for $directory output files: $OS_ERROR!\n";
  477. if ($verbose) { print "Created ./$directory...\n"; }
  478. }
  479. my %pool_names;
  480. # Load MIDs for all pools
  481. foreach my $pool_line (@pools) {
  482. chomp $pool_line;
  483. next if ( $pool_line eq '' );
  484. my @pool_mid_list = split /\s+/, $pool_line;
  485. if ( @pool_mid_list < 2 ) {
  486. print STDERR
  487. "Faulty pool: $pool_line - must have at least a name and one MID separated by whitespace\n";
  488. next;
  489. }
  490. my $pool_name = shift @pool_mid_list;
  491. if ( defined $pool_names{$pool_name} ) {
  492. print STDERR
  493. "Faulty pool: $pool_name - pool names must be unique\n";
  494. next;
  495. }
  496. $pool_names{$pool_name}++;
  497. # Load MIDs into array and set up hash of MIDs for pool lookup
  498. foreach my $pool_mid (@pool_mid_list) {
  499. my $p1mid = $pool_mid;
  500. my $p2mid = "";
  501. if ( $pool_mid =~ /:/ ) {
  502. my @p1p2mids = split /:/, $pool_mid;
  503. $p1mid = $p1p2mids[0];
  504. $p2mid = $p1p2mids[1];
  505. }
  506. if ( ( $p1mid !~ /[ACGT]+/ )
  507. || ( ( $p2mid ne "" ) && ( $p2mid !~ /[ACGT]+/ ) ) )
  508. {
  509. print STDERR
  510. "Faulty MID : $pool_name $pool_mid - MIDs must contain only ACGT\n";
  511. next;
  512. }
  513. if ( !$fuzzy_MIDs ) {
  514. push @{ $mid_pools_ref->{$p1mid}{$p2mid} }, $pool_name;
  515. }
  516. else {
  517. my @p1seqs;
  518. my @p2seqs;
  519. make_fuzzy_seqs( $p1mid, \@p1seqs );
  520. make_fuzzy_seqs( $p2mid, \@p2seqs );
  521. if ( @p2seqs == 0 ) { push @p2seqs, ""; }
  522. foreach my $p1seq (@p1seqs) {
  523. foreach my $p2seq (@p2seqs) {
  524. push @{ $mid_pools_ref->{$p1seq}{$p2seq} },
  525. $pool_name;
  526. }
  527. }
  528. }
  529. }
  530. }
  531. }
  532. else {
  533. die 'Please provide the name of your pools file using the -d option';
  534. }
  535. return;
  536. }
  537. sub open_read_files {
  538. my ( $mid_pools_ref, $pool_handles_ref, $directory, $fuzzy_MIDs ) = @_;
  539. my $p1mid_length = 0;
  540. my $p2mid_length = 0;
  541. # Collapse lists of pools for each MID into unique, sorted, string
  542. foreach my $p1mid ( keys %{$mid_pools_ref} ) {
  543. $p1mid_length = length $p1mid;
  544. foreach my $p2mid ( keys %{ $mid_pools_ref->{$p1mid} } ) {
  545. $p2mid_length = length $p2mid;
  546. my @pool_list = sort @{ $mid_pools_ref->{$p1mid}{$p2mid} };
  547. my $seen = q{};
  548. # Get unique elements of list
  549. @pool_list =
  550. grep { ( $_ ne $seen ) && ( ($seen) = $_ ) } @pool_list;
  551. if ( ( !$fuzzy_MIDs ) && ( @pool_list > 1 ) ) {
  552. delete $mid_pools_ref->{$p1mid}{$p2mid};
  553. }
  554. else {
  555. my $pool_string = q{};
  556. # Collapse list of pools
  557. foreach my $pool (@pool_list) {
  558. $pool_string = $pool_string . $pool . '.';
  559. }
  560. # Get rid of trailing .
  561. chop $pool_string;
  562. $mid_pools_ref->{$p1mid}{$p2mid} = $pool_string;
  563. open $pool_handles_ref->{$pool_string}, '>',
  564. "./$directory/$pool_string\.reads"
  565. or die "Can't open output file for $pool_string: $OS_ERROR\n";
  566. }
  567. }
  568. }
  569. return { p1mid_length => $p1mid_length, p2mid_length => $p2mid_length };
  570. }
  571. #############################################################################
  572. ### Name: GET TIMESTAMP
  573. ### Function: gets and formats the current time
  574. ### Parameters: None
  575. ### Returns: Current time as YYYYMMDD_HHMMSS
  576. #############################################################################
  577. sub get_timestamp {
  578. my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime time;
  579. $mon++;
  580. $year += 1900;
  581. if ( $mon < 10 ) { $mon = "0$mon"; }
  582. if ( $mday < 10 ) { $mday = "0$mday"; }
  583. if ( $hour < 10 ) { $hour = "0$hour"; }
  584. if ( $min < 10 ) { $min = "0$min"; }
  585. if ( $sec < 10 ) { $sec = "0$sec"; }
  586. return $year . $mon . $mday . '_' . $hour . $min . $sec;
  587. }
  588. # return true;
  589. 1;
  590. __END__
  591. #############################################################################
  592. ###
  593. ### DOCUMENTATION
  594. ###
  595. #############################################################################
  596. =head1 NAME
  597. RADpools - Take raw Illumina RAD reads and create read files for a set of pools
  598. =head1 VERSION
  599. This documentation refers to RADtools version $main::VERSION.
  600. =head1 SYNOPSIS
  601. =over 8
  602. =item RADpools --in <FILE> --directory <DIRECTORY> [options]
  603. =item RADpools --help
  604. =back
  605. =head1 OPTIONS
  606. =over 8
  607. =item B<-h, --help>
  608. Print a brief help message and exit
  609. =item B<-u, --usage>
  610. Print concise usage and exit
  611. =item B<--man>
  612. Print the manual page and exit
  613. =item B<--version>
  614. Print version number and exit
  615. =item B<-v, --verbose>
  616. Output status messages during run (default off)
  617. =item B<-i, --in>
  618. File containing single end Illumina RAD reads in FASTQ format (required)
  619. =item B<-p, --paired>
  620. File containing paired end Illumina RAD reads in FASTQ format (optional)
  621. =item B<-d, --directory>
  622. Name of output directory, matching name of pools file (required)
  623. =item B<-m, --max_processes>
  624. Maximum number of processes to use for sorting read pools after input files have been processed. Default 1; increasing --max_processes is highly recommended if more cores are available.
  625. =item B<-e, --enzyme>
  626. Sequence of cut site for used restriction enzyme (default TGCAGG, SbfI). IUPAC codes are allowed.
  627. =item B<-q, --quality>
  628. Minimum acceptable quality score (default 0, ie all bases accepted)
  629. =item B<-t, --trim>
  630. Trim all reads to this length (default max length of read, ie no trimming)
  631. =item B<-f, --fuzzy_MIDs>
  632. If a MID contains an error, search for the closest MID sequence. If more than one MID found, create a new MID for the intermediate sequence (default off)
  633. =item B<-s, --sanger>
  634. Specify this option if input files are in fastq-sanger format, not fastq-illumina format.
  635. =item B<-o, --output_fastq>
  636. Outputs FASTQ format rather than reads format.
  637. =back
  638. =head1 DESCRIPTION
  639. B<RADpools> accepts as input one or two FASTQ files containing single or
  640. paired end GERALD output from an Illumina RAD sequencing run. These RAD
  641. sequences contain MIDs which identify the individuals from which the
  642. sequence came. This script loads in the pools to which the MIDs belong and
  643. assigns the sequences to the appropriate pools. It then writes out the
  644. sequences to separate files, one for each pool.
  645. =head1 AUTHOR
  646. John Davey <john.davey@ed.ac.uk>
  647. =head1 LICENCE AND COPYRIGHT
  648. Copyright 2008, 2011 John Davey, University of Edinburgh <john.davey@ed.ac.uk>
  649. This program is free software: you can redistribute it and/or modify
  650. it under the terms of the GNU General Public License as published by
  651. the Free Software Foundation, either version 3 of the License, or
  652. (at your option) any later version.
  653. This program is distributed in the hope that it will be useful,
  654. but WITHOUT ANY WARRANTY; without even the implied warranty of
  655. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  656. GNU General Public License for more details.
  657. You should have received a copy of the GNU General Public License
  658. along with this program. If not, see <http://www.gnu.org/licenses/>.
  659. =cut