/RADpools
Perl | 848 lines | 612 code | 152 blank | 84 comment | 65 complexity | a68a18b9e8e847b94aeb791fa043c2fb MD5 | raw file
Possible License(s): GPL-3.0
- #!/usr/bin/env perl
- # RADpools
- # History:
- # 16/05/10 First public version
- # 02/07/10 Changed defaults to avoid trimming reads
- # 18/07/10 Added MID length parameter and changed default to reject zombies,
- # not accept them
- # 03/08/10 Incorporated into RADtools
- # 05/08/10 Run through perlcritic, tidied up options, added POD,
- # removed benchmarking, added main() for testing purposes
- # 20/08/10 Added sorting the read pools using Parallel::ForkManager
- # 25/08/10 Catch absence of Parallel::ForkManager
- # Rework trim and quality options, remove mid_length and
- # minimum_sequence_length options, rename zombies to fuzzy_MIDs
- # 25/08/10 Version 1.0
- # 08/09/10 1.0.1 Minor bug fixes
- # 29/10/10 1.0.2 Accept at most one N in restriction enzyme, MID;
- # Don't use species name in output filename
- # 30/10/10 1.1 Change species argument to directory argument
- # Output restriction site sequence
- # Add option to output FASTQ format
- # Load and output fastq-sanger qualities
- # 4/11/10 1.1.1 Improved robustness following implementation of test suite
- # 16/02/11 1.2 New RADmarkers output; no changes to RADpools
- # 15/05/11 1.2.1 Clarify verbose output
- # 20/07/11 1.2.2 Allow Q=41 quality scores following base caller upgrade
- # 12/09/11 1.2.3 If P2 MIDs given, use them to sort reads
- # 07/02/12 1.2.4 Allow enzymes with >1 ambiguous base such as SgrAI
- #############################################################################
- ###
- ### PREAMBLE
- ###
- #############################################################################
- # Core pragmas and modules
- use strict;
- use warnings;
- use English qw( -no_match_vars );
- use Getopt::Long qw( :config bundling no_auto_abbrev auto_version );
- use Pod::Usage;
- use File::Basename;
- use File::Copy;
- use Cwd;
- use RADtools qw(load_fastq_pair sort_reads_file QUAL_OFFSET);
- # Non-core modules
- eval {
- require Parallel::ForkManager;
- Parallel::ForkManager->import();
- };
- die
- "RADpools requires the CPAN module Parallel::ForkManager. Please install this package and add it to your Perl library path.\n"
- if $@;
- # Would like to use Carp, but an outstanding bug causes cryptic errors
- # when using caller(), so using die until this is fixed
- # http://www.nntp.perl.org/group/perl.perl5.porters/2010/03/msg157461.html
- local $main::VERSION = 1.2.4; # Used by Getopt::Long to provide --version
- local $OUTPUT_AUTOFLUSH = 1; # So reporting on progress works
- main(@ARGV) unless caller(); # So test suite can call script
- sub main {
- # Set up default options
- my $help = 0;
- my $usage = 0;
- my $man = 0;
- my $verbose = 0;
- my $read1_file = q{};
- my $read2_file = q{};
- my $directory = q{};
- my $max_processes = 1;
- my $sanger = 0;
- my $output_fastq = 0;
- my $res_site = 'TGCAGG'; #SbfI
- my $quality_threshold = 0;
- my $trim = 0;
- my $fuzzy_MIDs = 0;
- # Load user options
- my $options_okay = GetOptions(
- 'help|h' => \$help,
- 'usage|u' => \$usage,
- 'man' => \$man,
- 'verbose|v' => \$verbose,
- 'in|i=s' => \$read1_file,
- 'paired|p=s' => \$read2_file,
- 'directory|d=s' => \$directory,
- 'max_processes|m=i' => \$max_processes,
- 'sanger|s' => \$sanger,
- 'output_fastq|o' => \$output_fastq,
- 'enzyme|e=s' => \$res_site,
- 'quality|q=i' => \$quality_threshold,
- 'trim|t=i' => \$trim,
- 'fuzzy_MIDs|f' => \$fuzzy_MIDs,
- ) or pod2usage( -exitval => 3, -verbose => 0 );
- pod2usage( -exitval => 4, -verbose => 0 ) if $usage;
- pod2usage( -exitval => 5, -verbose => 1 ) if $help;
- pod2usage( -exitval => 6, -verbose => 2 ) if $man;
- # Validate options
- pod2usage( -exitval => 7, -verbose => 0 ) if ( $read1_file eq q{} );
- pod2usage( -exitval => 8, -verbose => 0 ) if ( $directory eq q{} );
- die "Quality threshold must be between 0 and 40\n"
- if ( ( $quality_threshold < 0 ) || ( $quality_threshold > 40 ) );
- die "Restriction enzyme must contain only IUPAC codes\n"
- if ( $res_site =~ /[^ACGTRYSWKMBDHVN]/ );
- # If directory is ., get basename of current directory
- if ( $directory eq '.' ) {
- $directory = basename(getcwd);
- }
- # Sugar: strip .pools from directory argument if it has been added
- $directory =~ s/\.pools$//;
- #############################################################################
- ###
- ### LOAD POOLS AND MIDS
- ###
- #############################################################################
- my $res_site_length = length $res_site;
- my $p1mid_length = 0;
- my $p2mid_length = 0;
- my %mid_pools = ();
- my %pool_handles = ();
- load_pools_and_mids( \%mid_pools, $directory, $verbose, $fuzzy_MIDs );
- my $results_ref =
- open_read_files( \%mid_pools, \%pool_handles, $directory, $fuzzy_MIDs );
- $p1mid_length = $results_ref->{p1mid_length};
- $p2mid_length = $results_ref->{p2mid_length};
- # Make fuzzy list of restriction enzyme sites
- my %fuzzy_res_sites = ();
- my @fuzzy_res_list = ();
- make_fuzzy_seqs( $res_site, \@fuzzy_res_list );
- map { $fuzzy_res_sites{$_} = $res_site; } @fuzzy_res_list;
- #############################################################################
- ###
- ### LOAD FASTQ RECORDS
- ###
- #############################################################################
- if ($verbose) { print "Loading RAD reads...\n"; }
- # Initialise sequence, quality and RAD variables
- my $quality_tail = q{};
- my $sequence = q{};
- my $record_count_total = 0;
- my $record_count_valid = 0;
- # Set up quality thresholds and characters
- my $qual_low_char = ';';
- my $qual_high_char = 'i';
- if ($sanger) {
- $qual_low_char = '!';
- $qual_high_char = 'J';
- }
- # Open RAD sequence file
- open my $read1_in, '<', $read1_file
- or die "Couldn't open '$read1_file': $OS_ERROR";
- my $read2_in;
- if ( $read2_file ne q{} ) {
- open $read2_in, '<', $read2_file
- or die "Couldn't open '$read2_file': $OS_ERROR";
- }
- # Output invalid/unclustered sequences
- my $results_file_stem = "./$directory/$directory\_" . get_timestamp();
- my $invalid_file = "$results_file_stem\_invalid.txt";
- open my $invalid_handle, '>', $invalid_file
- or die "Can't open $invalid_file: $OS_ERROR";
- print {$invalid_handle} "Input files: $read1_file"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- if ( $read2_file ne q{} ) { print {$invalid_handle} ", $read2_file"; }
- print {$invalid_handle}
- "; Directory: $directory, Enzyme site: $res_site; Trim length: $trim; Quality threshold: $quality_threshold; Processes: $max_processes; Sanger? ";
- if ($sanger) { print {$invalid_handle} "Yes;" }
- else { print {$invalid_handle} "No;" }
- print {$invalid_handle} " Fuzzy MIDs? ";
- if ($fuzzy_MIDs) { print {$invalid_handle} "Yes;" }
- else { print {$invalid_handle} "No;" }
- print {$invalid_handle} "\n";
- # Load RAD sequences into hash of pools with counts for identical sequences
- my $non_fastq_count = 0;
- my $wrong_rad_format_count = 0;
- my $wrong_ressite_count = 0;
- my $wrong_p1mid_count = 0;
- my $wrong_p2mid_count = 0;
- my $short_sequence_count = 0;
- LOAD_ONE_SEQUENCE:
- while (
- my $pair_ref = load_fastq_pair(
- {
- read1_handle => $read1_in,
- read2_handle => $read2_in,
- qual_low_char => $qual_low_char,
- qual_high_char => $qual_high_char,
- illumina => !$sanger,
- }
- )
- )
- {
- last if ( $pair_ref->{valid} == -1 );
- # Update progress meter
- $record_count_total++;
- if ( ($verbose) && ( $record_count_total % 100_000 == 0 ) ) {
- printf
- "OK: %8d | Rejected %8d (FASTQ: %8d; RAD: %8d; Res: %8d; P1 MID: %8d; P2 MID: %8d; Short: %8d)\n",
- $record_count_valid,
- $non_fastq_count +
- $wrong_rad_format_count +
- $wrong_ressite_count +
- $wrong_p1mid_count +
- $wrong_p2mid_count +
- $short_sequence_count,
- $non_fastq_count,
- $wrong_rad_format_count,
- $wrong_ressite_count, $wrong_p1mid_count, $wrong_p2mid_count,
- $short_sequence_count;
- }
- if ( !$pair_ref->{valid} ) {
- print {$invalid_handle}
- "Invalid FASTQ record: $pair_ref->{r1name} $pair_ref->{r1seq} $pair_ref->{r1qual}\n"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- $non_fastq_count++;
- next LOAD_ONE_SEQUENCE;
- }
- # Set trim to length of RAD sequence if not previously set
- my $tag_length = ( length $pair_ref->{r1seq} ) - $p1mid_length;
- if ( ( $trim <= 0 ) or ( $trim > $tag_length ) ) {
- $trim = $tag_length;
- }
- # Find any point where Q<threshold and
- # strip off sequence following this point
- my $qual_threshold_char = chr( $quality_threshold + QUAL_OFFSET );
- if ( $pair_ref->{r1qual} =~ m{(\A[$qual_threshold_char-J]+)}xms ) {
- $quality_tail = $1;
- }
- else {
- $quality_tail = q{};
- }
- # Get substring of sequence of length quality_substring
- $sequence = substr( $pair_ref->{r1seq}, 0, length $quality_tail );
- my $p1mid = q{}; # Empty string
- my $seq_res_site = q{};
- my $dna_sequence = q{};
- # Find restriction enzyme site; if not found, skip sequence
- if (
- $sequence =~ m{^([ACGTN]{$p1mid_length})
- ([ACGTN]{$res_site_length})
- ([ACGTN\.]+)$}xms
- )
- {
- $p1mid = $1;
- $seq_res_site = $2;
- $dna_sequence = $2 . $3;
- }
- else {
- print {$invalid_handle}
- "Sequence is not a valid RAD sequence : $pair_ref->{r1seq} $pair_ref->{r1qual} $sequence\n"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- $wrong_rad_format_count++;
- next LOAD_ONE_SEQUENCE;
- }
- if ( !( exists $fuzzy_res_sites{$seq_res_site} ) ) {
- print {$invalid_handle} 'Restriction enzyme site does not match'
- . " : $pair_ref->{r1seq} $pair_ref->{r1qual} "
- . "$seq_res_site $dna_sequence\n"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- $wrong_ressite_count++;
- next LOAD_ONE_SEQUENCE;
- }
- # Find pool for this MID; if no pool, make pool for this MID alone
- if ( !( exists $mid_pools{$p1mid} ) ) {
- # If MID not found, reject read
- print {$invalid_handle} 'P1 MID is not a valid MID'
- . " : $pair_ref->{r1seq} $pair_ref->{r1qual} "
- . "$p1mid $sequence\n"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- $wrong_p1mid_count++;
- next LOAD_ONE_SEQUENCE;
- }
- my $p2mid = substr( $pair_ref->{r2seq}, 0, $p2mid_length );
- if ( !( exists $mid_pools{$p1mid}{$p2mid} ) ) {
- # If MID not found, reject read
- print {$invalid_handle}
- 'P2 MID is not a valid MID or does not match P1 MID'
- . " : $pair_ref->{r2seq} $pair_ref->{r2qual} "
- . "P1:$p1mid P2:$p2mid\n"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- $wrong_p2mid_count++;
- next LOAD_ONE_SEQUENCE;
- }
- my $pool = $mid_pools{$p1mid}{$p2mid};
- # Strip P2 MID from read 2 sequence and quality
- $pair_ref->{r2seq} = substr( $pair_ref->{r2seq}, $p2mid_length );
- $pair_ref->{r2qual} = substr( $pair_ref->{r2qual}, $p2mid_length );
- # Increment the number of times this sequence has occurred
- if ( length($dna_sequence) >= $trim ) {
- # Remove MID from read 1 quality string
- if ( $quality_tail =~ m{^(.{$p1mid_length})(.+)$}xms ) {
- $quality_tail = $2;
- }
- $dna_sequence = substr $dna_sequence, 0, $trim;
- $quality_tail = substr $quality_tail, 0, $trim;
- if ( $output_fastq && ( $pair_ref->{r1name} ne q{} ) ) {
- print { $pool_handles{$pool} } "$pair_ref->{r1name} ";
- }
- print { $pool_handles{$pool} } "$dna_sequence $quality_tail"
- or die "Can't write to pools file: $OS_ERROR\n";
- if ( $read2_file ne q{} ) {
- if ( $output_fastq && ( $pair_ref->{r2name} ne q{} ) ) {
- print { $pool_handles{$pool} } " $pair_ref->{r2name}";
- }
- print { $pool_handles{$pool} }
- " $pair_ref->{r2seq} $pair_ref->{r2qual}";
- }
- print { $pool_handles{$pool} } "\n";
- $record_count_valid++;
- }
- # Otherwise, sequence is too short; throw it away
- else {
- print {$invalid_handle} 'Sequence shorter than '
- . $trim
- . ' base pairs : '
- . "$pair_ref->{r1seq} $pair_ref->{r1qual} $dna_sequence\n"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- $short_sequence_count++;
- }
- }
- close $read1_in or die "Can't close $read1_file: $OS_ERROR\n";
- if ( $read2_file ne q{} ) {
- close $read2_in or die "Can't close $read2_file: $OS_ERROR\n";
- }
- foreach my $pool ( keys %pool_handles ) {
- close $pool_handles{$pool}
- or die "Can't close pool files: $OS_ERROR\n";
- }
- if ($verbose) { print "Sorting pooled reads...\n"; }
- my @pool_ids = sort keys %pool_handles;
- my $pm = new Parallel::ForkManager( scalar(@pool_ids) );
- $pm->set_max_procs($max_processes);
- foreach my $pool_id (@pool_ids) {
- $pm->start and next;
- my $read_filename = "./$directory/$pool_id\.reads";
- if ($verbose) { print "Sorting $read_filename...\n"; }
- sort_reads_file(
- {
- directory => $directory,
- read_filename => $read_filename,
- sort_start => $res_site_length + 1,
- read1_field => $output_fastq ? 2 : 1,
- }
- );
- # Convert reads to FASTQ
- if ($output_fastq) {
- if ($verbose) {
- print "Converting $pool_id to FASTQ format...\n";
- }
- my $fastq1_file;
- my $fastq2_file;
- if ( $read2_file ne q{} ) {
- open $fastq1_file, '>', "./$directory/$pool_id\_1\.fastq";
- open $fastq2_file, '>', "./$directory/$pool_id\_2\.fastq";
- }
- else {
- open $fastq1_file, '>', "./$directory/$pool_id\.fastq";
- }
- open my $read_file, '<', $read_filename;
- while ( my $read_line = <$read_file> ) {
- chomp $read_line;
- my @read_fields = split / /, $read_line;
- print $fastq1_file
- "\@$read_fields[0]\n$read_fields[1]\n\+$read_fields[0]\n$read_fields[2]\n";
- if ( $read2_file ne q{} ) {
- print $fastq2_file
- "\@$read_fields[3]\n$read_fields[4]\n\+$read_fields[3]\n$read_fields[5]\n";
- }
- }
- close $read_file;
- if ( $read2_file ne q{} ) { close $fastq2_file; }
- close $fastq1_file;
- unlink($read_filename);
- }
- $pm->finish;
- }
- $pm->wait_all_children;
- print {$invalid_handle}
- "$record_count_valid records loaded, but discarded \n"
- . "$non_fastq_count records not in FASTQ format, \n"
- . "$wrong_rad_format_count records not in RAD format, \n"
- . "$wrong_ressite_count records not matching restriction enzyme site, \n"
- . "$wrong_p1mid_count records not matching any listed P1 MIDs, \n"
- . "$wrong_p2mid_count records not matching any listed P2 MIDs, and \n"
- . "$short_sequence_count records with high-quality sequences less than $trim bp long\n"
- or die "Can't write to $invalid_file: $OS_ERROR\n";
- close $invalid_handle or die "Can't close $invalid_file: $OS_ERROR\n";
- # Output records loaded and discarded
- if ($verbose) {
- print "$record_count_valid records loaded, but discarded \n"
- . "$non_fastq_count records not in FASTQ format, \n"
- . "$wrong_rad_format_count records not in RAD format, \n"
- . "$wrong_ressite_count records not matching restriction enzyme site, \n"
- . "$wrong_p1mid_count records not matching any listed P1 MIDs, \n"
- . "$wrong_p2mid_count records not matching any listed P2 MIDs, and \n"
- . "$short_sequence_count records with high-quality sequences less than $trim bp long\n"
- . "Done\n";
- }
- return;
- }
- #############################################################################
- ###
- ### SUBROUTINES
- ###
- #############################################################################
- sub make_fuzzy_seqs {
- my ( $seq, $fuzzy_list_ref ) = @_;
- my $seq_length = length $seq;
- # Unpack IUPAC bases
- my @unpacked_seqs;
- if ($seq =~ /[RYSWKMBDHVN]/) {
- my @bases = split //, $seq;
- my $iupac_stubs_ref = get_unpacked_seqs(\@bases);
- @unpacked_seqs = @{$iupac_stubs_ref};
- }
- else {
- push @unpacked_seqs, $seq;
- }
- for my $unpacked_seq (@unpacked_seqs) {
- for my $i ( 1 .. $seq_length ) {
- for my $base (qw{A C G T N}) {
- my $fuzzyseq = $unpacked_seq;
- my $prebase_i = $i - 1;
- my $postbase_i = $seq_length - $i;
- $fuzzyseq =~ s{^([ACGT]{$prebase_i})
- ([ACGT])
- ([ACGT]{$postbase_i})$}
- {$1$base$3}xms;
- push @{$fuzzy_list_ref}, $fuzzyseq;
- }
- }
- }
- return;
- }
- sub get_unpacked_seqs {
- my ($bases_ref) = @_;
- my $base1 = shift @{$bases_ref};
-
- my @iupac_stubs;
- @iupac_stubs = @{$bases_ref} > 0 ? @{get_unpacked_seqs($bases_ref)} : ('');
-
- # IUPAC code constants
- my %iupac_codes = (
- 'A' => 'A',
- 'C' => 'C',
- 'G' => 'G',
- 'T' => 'T',
- 'R' => 'AG',
- 'Y' => 'CT',
- 'S' => 'GC',
- 'W' => 'AT',
- 'K' => 'GT',
- 'M' => 'AC',
- 'B' => 'CGT',
- 'D' => 'AGT',
- 'H' => 'ACT',
- 'V' => 'ACG',
- 'N' => 'ACGT',
- );
-
-
- my $iupac_string = $iupac_codes{$base1};
-
- my @unpacked_seqs;
- for my $iupac_base (split //, $iupac_string) {
- for my $iupac_stub (@iupac_stubs) {
- push @unpacked_seqs, $iupac_base . $iupac_stub;
- }
- }
- return \@unpacked_seqs;
- }
- sub load_pools_and_mids {
- my ( $mid_pools_ref, $directory, $verbose, $fuzzy_MIDs ) = @_;
- if ( $directory ne q{} ) {
- # Load pools from file into an array
- my $pool_file = $directory . '.pools';
- open my $poolhandle, '<', $pool_file
- or die "Couldn't open '$pool_file': $OS_ERROR\n";
- if ($verbose) { print "Loading pools from $pool_file...\n"; }
- my @pools = <$poolhandle>;
- close $poolhandle or die "Can't close $pool_file: $OS_ERROR\n";
- if ( !-d "./$directory" ) {
- mkdir "./$directory"
- or die
- "Can't create directory for $directory output files: $OS_ERROR!\n";
- if ($verbose) { print "Created ./$directory...\n"; }
- }
- my %pool_names;
- # Load MIDs for all pools
- foreach my $pool_line (@pools) {
- chomp $pool_line;
- next if ( $pool_line eq '' );
- my @pool_mid_list = split /\s+/, $pool_line;
- if ( @pool_mid_list < 2 ) {
- print STDERR
- "Faulty pool: $pool_line - must have at least a name and one MID separated by whitespace\n";
- next;
- }
- my $pool_name = shift @pool_mid_list;
- if ( defined $pool_names{$pool_name} ) {
- print STDERR
- "Faulty pool: $pool_name - pool names must be unique\n";
- next;
- }
- $pool_names{$pool_name}++;
- # Load MIDs into array and set up hash of MIDs for pool lookup
- foreach my $pool_mid (@pool_mid_list) {
- my $p1mid = $pool_mid;
- my $p2mid = "";
- if ( $pool_mid =~ /:/ ) {
- my @p1p2mids = split /:/, $pool_mid;
- $p1mid = $p1p2mids[0];
- $p2mid = $p1p2mids[1];
- }
- if ( ( $p1mid !~ /[ACGT]+/ )
- || ( ( $p2mid ne "" ) && ( $p2mid !~ /[ACGT]+/ ) ) )
- {
- print STDERR
- "Faulty MID : $pool_name $pool_mid - MIDs must contain only ACGT\n";
- next;
- }
- if ( !$fuzzy_MIDs ) {
- push @{ $mid_pools_ref->{$p1mid}{$p2mid} }, $pool_name;
- }
- else {
- my @p1seqs;
- my @p2seqs;
- make_fuzzy_seqs( $p1mid, \@p1seqs );
- make_fuzzy_seqs( $p2mid, \@p2seqs );
- if ( @p2seqs == 0 ) { push @p2seqs, ""; }
- foreach my $p1seq (@p1seqs) {
- foreach my $p2seq (@p2seqs) {
- push @{ $mid_pools_ref->{$p1seq}{$p2seq} },
- $pool_name;
- }
- }
- }
- }
- }
- }
- else {
- die 'Please provide the name of your pools file using the -d option';
- }
- return;
- }
- sub open_read_files {
- my ( $mid_pools_ref, $pool_handles_ref, $directory, $fuzzy_MIDs ) = @_;
- my $p1mid_length = 0;
- my $p2mid_length = 0;
- # Collapse lists of pools for each MID into unique, sorted, string
- foreach my $p1mid ( keys %{$mid_pools_ref} ) {
- $p1mid_length = length $p1mid;
- foreach my $p2mid ( keys %{ $mid_pools_ref->{$p1mid} } ) {
- $p2mid_length = length $p2mid;
- my @pool_list = sort @{ $mid_pools_ref->{$p1mid}{$p2mid} };
- my $seen = q{};
- # Get unique elements of list
- @pool_list =
- grep { ( $_ ne $seen ) && ( ($seen) = $_ ) } @pool_list;
- if ( ( !$fuzzy_MIDs ) && ( @pool_list > 1 ) ) {
- delete $mid_pools_ref->{$p1mid}{$p2mid};
- }
- else {
- my $pool_string = q{};
- # Collapse list of pools
- foreach my $pool (@pool_list) {
- $pool_string = $pool_string . $pool . '.';
- }
- # Get rid of trailing .
- chop $pool_string;
- $mid_pools_ref->{$p1mid}{$p2mid} = $pool_string;
- open $pool_handles_ref->{$pool_string}, '>',
- "./$directory/$pool_string\.reads"
- or die "Can't open output file for $pool_string: $OS_ERROR\n";
- }
- }
- }
- return { p1mid_length => $p1mid_length, p2mid_length => $p2mid_length };
- }
- #############################################################################
- ### Name: GET TIMESTAMP
- ### Function: gets and formats the current time
- ### Parameters: None
- ### Returns: Current time as YYYYMMDD_HHMMSS
- #############################################################################
- sub get_timestamp {
- my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime time;
- $mon++;
- $year += 1900;
- if ( $mon < 10 ) { $mon = "0$mon"; }
- if ( $mday < 10 ) { $mday = "0$mday"; }
- if ( $hour < 10 ) { $hour = "0$hour"; }
- if ( $min < 10 ) { $min = "0$min"; }
- if ( $sec < 10 ) { $sec = "0$sec"; }
- return $year . $mon . $mday . '_' . $hour . $min . $sec;
- }
- # return true;
- 1;
- __END__
- #############################################################################
- ###
- ### DOCUMENTATION
- ###
- #############################################################################
- =head1 NAME
- RADpools - Take raw Illumina RAD reads and create read files for a set of pools
- =head1 VERSION
- This documentation refers to RADtools version $main::VERSION.
- =head1 SYNOPSIS
- =over 8
- =item RADpools --in <FILE> --directory <DIRECTORY> [options]
- =item RADpools --help
- =back
- =head1 OPTIONS
- =over 8
- =item B<-h, --help>
- Print a brief help message and exit
- =item B<-u, --usage>
- Print concise usage and exit
- =item B<--man>
- Print the manual page and exit
- =item B<--version>
- Print version number and exit
- =item B<-v, --verbose>
- Output status messages during run (default off)
- =item B<-i, --in>
- File containing single end Illumina RAD reads in FASTQ format (required)
- =item B<-p, --paired>
- File containing paired end Illumina RAD reads in FASTQ format (optional)
- =item B<-d, --directory>
- Name of output directory, matching name of pools file (required)
- =item B<-m, --max_processes>
- Maximum number of processes to use for sorting read pools after input files have been processed. Default 1; increasing --max_processes is highly recommended if more cores are available.
- =item B<-e, --enzyme>
- Sequence of cut site for used restriction enzyme (default TGCAGG, SbfI). IUPAC codes are allowed.
- =item B<-q, --quality>
- Minimum acceptable quality score (default 0, ie all bases accepted)
- =item B<-t, --trim>
- Trim all reads to this length (default max length of read, ie no trimming)
- =item B<-f, --fuzzy_MIDs>
- If a MID contains an error, search for the closest MID sequence. If more than one MID found, create a new MID for the intermediate sequence (default off)
- =item B<-s, --sanger>
- Specify this option if input files are in fastq-sanger format, not fastq-illumina format.
- =item B<-o, --output_fastq>
- Outputs FASTQ format rather than reads format.
- =back
- =head1 DESCRIPTION
- B<RADpools> accepts as input one or two FASTQ files containing single or
- paired end GERALD output from an Illumina RAD sequencing run. These RAD
- sequences contain MIDs which identify the individuals from which the
- sequence came. This script loads in the pools to which the MIDs belong and
- assigns the sequences to the appropriate pools. It then writes out the
- sequences to separate files, one for each pool.
- =head1 AUTHOR
- John Davey <john.davey@ed.ac.uk>
- =head1 LICENCE AND COPYRIGHT
- Copyright 2008, 2011 John Davey, University of Edinburgh <john.davey@ed.ac.uk>
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- =cut