RADpools - Rework trim and quality options, remove mid

/RADpools

https://github.com/johnomics/RADtools · Perl · 848 lines · 559 code · 189 blank · 100 comment · 93 complexity · a68a18b9e8e847b94aeb791fa043c2fb MD5 · raw file

#!/usr/bin/env perl

# RADpools

# History:
# 16/05/10 First public version
# 02/07/10 Changed defaults to avoid trimming reads
# 18/07/10 Added MID length parameter and changed default to reject zombies,
#          not accept them
# 03/08/10 Incorporated into RADtools
# 05/08/10 Run through perlcritic, tidied up options, added POD,
#          removed benchmarking, added main() for testing purposes
# 20/08/10 Added sorting the read pools using Parallel::ForkManager
# 25/08/10 Catch absence of Parallel::ForkManager
#          Rework trim and quality options, remove mid_length and
#              minimum_sequence_length options, rename zombies to fuzzy_MIDs
# 25/08/10 Version 1.0
# 08/09/10 1.0.1 Minor bug fixes
# 29/10/10 1.0.2 Accept at most one N in restriction enzyme, MID;
#                Don't use species name in output filename
# 30/10/10 1.1   Change species argument to directory argument
#                Output restriction site sequence
#                Add option to output FASTQ format
#                Load and output fastq-sanger qualities
#  4/11/10 1.1.1 Improved robustness following implementation of test suite
# 16/02/11 1.2   New RADmarkers output; no changes to RADpools
# 15/05/11 1.2.1 Clarify verbose output
# 20/07/11 1.2.2 Allow Q=41 quality scores following base caller upgrade
# 12/09/11 1.2.3 If P2 MIDs given, use them to sort reads
# 07/02/12 1.2.4 Allow enzymes with >1 ambiguous base such as SgrAI

#############################################################################
###
### PREAMBLE
###
#############################################################################

# Core pragmas and modules
use strict;
use warnings;
use English qw( -no_match_vars );
use Getopt::Long qw( :config bundling no_auto_abbrev auto_version );
use Pod::Usage;
use File::Basename;
use File::Copy;
use Cwd;

use RADtools qw(load_fastq_pair sort_reads_file QUAL_OFFSET);

# Non-core modules
eval {
    require Parallel::ForkManager;
    Parallel::ForkManager->import();
};
die
"RADpools requires the CPAN module Parallel::ForkManager. Please install this package and add it to your Perl library path.\n"
  if $@;

# Would like to use Carp, but an outstanding bug causes cryptic errors
# when using caller(), so using die until this is fixed
# http://www.nntp.perl.org/group/perl.perl5.porters/2010/03/msg157461.html

local $main::VERSION    = 1.2.4;    # Used by Getopt::Long to provide --version
local $OUTPUT_AUTOFLUSH = 1;        # So reporting on progress works

main(@ARGV) unless caller();        # So test suite can call script

sub main {

    # Set up default options

    my $help  = 0;
    my $usage = 0;
    my $man   = 0;

    my $verbose = 0;

    my $read1_file = q{};
    my $read2_file = q{};
    my $directory  = q{};

    my $max_processes = 1;
    my $sanger        = 0;
    my $output_fastq  = 0;

    my $res_site          = 'TGCAGG';    #SbfI
    my $quality_threshold = 0;
    my $trim              = 0;
    my $fuzzy_MIDs        = 0;

    # Load user options
    my $options_okay = GetOptions(

        'help|h'    => \$help,
        'usage|u'   => \$usage,
        'man'       => \$man,
        'verbose|v' => \$verbose,

        'in|i=s'        => \$read1_file,
        'paired|p=s'    => \$read2_file,
        'directory|d=s' => \$directory,

        'max_processes|m=i' => \$max_processes,
        'sanger|s'          => \$sanger,
        'output_fastq|o'    => \$output_fastq,

        'enzyme|e=s'   => \$res_site,
        'quality|q=i'  => \$quality_threshold,
        'trim|t=i'     => \$trim,
        'fuzzy_MIDs|f' => \$fuzzy_MIDs,

    ) or pod2usage( -exitval => 3, -verbose => 0 );

    pod2usage( -exitval => 4, -verbose => 0 ) if $usage;
    pod2usage( -exitval => 5, -verbose => 1 ) if $help;
    pod2usage( -exitval => 6, -verbose => 2 ) if $man;

    # Validate options
    pod2usage( -exitval => 7, -verbose => 0 ) if ( $read1_file eq q{} );
    pod2usage( -exitval => 8, -verbose => 0 ) if ( $directory  eq q{} );

    die "Quality threshold must be between 0 and 40\n"
      if ( ( $quality_threshold < 0 ) || ( $quality_threshold > 40 ) );

    die "Restriction enzyme must contain only IUPAC codes\n"
      if ( $res_site =~ /[^ACGTRYSWKMBDHVN]/ );

    # If directory is ., get basename of current directory
    if ( $directory eq '.' ) {
        $directory = basename(getcwd);
    }

    # Sugar: strip .pools from directory argument if it has been added
    $directory =~ s/\.pools$//;

#############################################################################
###
### LOAD POOLS AND MIDS
###
#############################################################################

    my $res_site_length = length $res_site;
    my $p1mid_length    = 0;
    my $p2mid_length    = 0;
    my %mid_pools       = ();
    my %pool_handles    = ();

    load_pools_and_mids( \%mid_pools, $directory, $verbose, $fuzzy_MIDs );

    my $results_ref =
      open_read_files( \%mid_pools, \%pool_handles, $directory, $fuzzy_MIDs );

    $p1mid_length = $results_ref->{p1mid_length};
    $p2mid_length = $results_ref->{p2mid_length};

    # Make fuzzy list of restriction enzyme sites

    my %fuzzy_res_sites = ();
    my @fuzzy_res_list  = ();
    make_fuzzy_seqs( $res_site, \@fuzzy_res_list );
    map { $fuzzy_res_sites{$_} = $res_site; } @fuzzy_res_list;

#############################################################################
###
### LOAD FASTQ RECORDS
###
#############################################################################

    if ($verbose) { print "Loading RAD reads...\n"; }

    # Initialise sequence, quality and RAD variables

    my $quality_tail       = q{};
    my $sequence           = q{};
    my $record_count_total = 0;
    my $record_count_valid = 0;

    # Set up quality thresholds and characters
    my $qual_low_char  = ';';
    my $qual_high_char = 'i';
    if ($sanger) {
        $qual_low_char  = '!';
        $qual_high_char = 'J';
    }

    # Open RAD sequence file
    open my $read1_in, '<', $read1_file
      or die "Couldn't open '$read1_file': $OS_ERROR";

    my $read2_in;
    if ( $read2_file ne q{} ) {
        open $read2_in, '<', $read2_file
          or die "Couldn't open '$read2_file': $OS_ERROR";
    }

    # Output invalid/unclustered sequences
    my $results_file_stem = "./$directory/$directory\_" . get_timestamp();
    my $invalid_file      = "$results_file_stem\_invalid.txt";
    open my $invalid_handle, '>', $invalid_file
      or die "Can't open $invalid_file: $OS_ERROR";

    print {$invalid_handle} "Input files: $read1_file"
      or die "Can't write to $invalid_file: $OS_ERROR\n";

    if ( $read2_file ne q{} ) { print {$invalid_handle} ", $read2_file"; }

    print {$invalid_handle}
"; Directory: $directory, Enzyme site: $res_site; Trim length: $trim; Quality threshold: $quality_threshold; Processes: $max_processes; Sanger? ";

    if   ($sanger) { print {$invalid_handle} "Yes;" }
    else           { print {$invalid_handle} "No;" }

    print {$invalid_handle} " Fuzzy MIDs? ";
    if   ($fuzzy_MIDs) { print {$invalid_handle} "Yes;" }
    else               { print {$invalid_handle} "No;" }

    print {$invalid_handle} "\n";

    # Load RAD sequences into hash of pools with counts for identical sequences

    my $non_fastq_count        = 0;
    my $wrong_rad_format_count = 0;
    my $wrong_ressite_count    = 0;
    my $wrong_p1mid_count      = 0;
    my $wrong_p2mid_count      = 0;
    my $short_sequence_count   = 0;

  LOAD_ONE_SEQUENCE:
    while (
        my $pair_ref = load_fastq_pair(
            {
                read1_handle   => $read1_in,
                read2_handle   => $read2_in,
                qual_low_char  => $qual_low_char,
                qual_high_char => $qual_high_char,
                illumina       => !$sanger,
            }
        )
      )
    {
        last if ( $pair_ref->{valid} == -1 );

        # Update progress meter
        $record_count_total++;
        if ( ($verbose) && ( $record_count_total % 100_000 == 0 ) ) {
            printf
"OK: %8d | Rejected %8d (FASTQ: %8d; RAD: %8d; Res: %8d; P1 MID: %8d; P2 MID: %8d; Short: %8d)\n",
              $record_count_valid,
              $non_fastq_count +
              $wrong_rad_format_count +
              $wrong_ressite_count +
              $wrong_p1mid_count +
              $wrong_p2mid_count +
              $short_sequence_count,
              $non_fastq_count,
              $wrong_rad_format_count,
              $wrong_ressite_count, $wrong_p1mid_count, $wrong_p2mid_count,
              $short_sequence_count;
        }

        if ( !$pair_ref->{valid} ) {
            print {$invalid_handle}
"Invalid FASTQ record: $pair_ref->{r1name} $pair_ref->{r1seq} $pair_ref->{r1qual}\n"
              or die "Can't write to $invalid_file: $OS_ERROR\n";
            $non_fastq_count++;
            next LOAD_ONE_SEQUENCE;
        }

        # Set trim to length of RAD sequence if not previously set
        my $tag_length = ( length $pair_ref->{r1seq} ) - $p1mid_length;
        if ( ( $trim <= 0 ) or ( $trim > $tag_length ) ) {
            $trim = $tag_length;
        }

        # Find any point where Q<threshold and
        # strip off sequence following this point
        my $qual_threshold_char = chr( $quality_threshold + QUAL_OFFSET );
        if ( $pair_ref->{r1qual} =~ m{(\A[$qual_threshold_char-J]+)}xms ) {
            $quality_tail = $1;
        }
        else {
            $quality_tail = q{};
        }

        # Get substring of sequence of length quality_substring
        $sequence = substr( $pair_ref->{r1seq}, 0, length $quality_tail );

        my $p1mid        = q{};    # Empty string
        my $seq_res_site = q{};
        my $dna_sequence = q{};

        # Find restriction enzyme site; if not found, skip sequence
        if (
            $sequence =~ m{^([ACGTN]{$p1mid_length})
                        ([ACGTN]{$res_site_length})
                        ([ACGTN\.]+)$}xms
          )
        {
            $p1mid        = $1;
            $seq_res_site = $2;
            $dna_sequence = $2 . $3;
        }
        else {
            print {$invalid_handle}
"Sequence is not a valid RAD sequence               : $pair_ref->{r1seq} $pair_ref->{r1qual} $sequence\n"
              or die "Can't write to $invalid_file: $OS_ERROR\n";
            $wrong_rad_format_count++;
            next LOAD_ONE_SEQUENCE;
        }

        if ( !( exists $fuzzy_res_sites{$seq_res_site} ) ) {

            print {$invalid_handle} 'Restriction enzyme site does not match'
              . "             : $pair_ref->{r1seq} $pair_ref->{r1qual} "
              . "$seq_res_site $dna_sequence\n"
              or die "Can't write to $invalid_file: $OS_ERROR\n";
            $wrong_ressite_count++;
            next LOAD_ONE_SEQUENCE;
        }

        # Find pool for this MID; if no pool, make pool for this MID alone
        if ( !( exists $mid_pools{$p1mid} ) ) {

            # If MID not found, reject read
            print {$invalid_handle} 'P1 MID is not a valid MID'
              . "                     : $pair_ref->{r1seq} $pair_ref->{r1qual} "
              . "$p1mid $sequence\n"
              or die "Can't write to $invalid_file: $OS_ERROR\n";
            $wrong_p1mid_count++;
            next LOAD_ONE_SEQUENCE;
        }

        my $p2mid = substr( $pair_ref->{r2seq}, 0, $p2mid_length );

        if ( !( exists $mid_pools{$p1mid}{$p2mid} ) ) {

            # If MID not found, reject read
            print {$invalid_handle}
              'P2 MID is not a valid MID or does not match P1 MID'
              . "                     : $pair_ref->{r2seq} $pair_ref->{r2qual} "
              . "P1:$p1mid P2:$p2mid\n"
              or die "Can't write to $invalid_file: $OS_ERROR\n";
            $wrong_p2mid_count++;
            next LOAD_ONE_SEQUENCE;

        }
        my $pool = $mid_pools{$p1mid}{$p2mid};

        # Strip P2 MID from read 2 sequence and quality
        $pair_ref->{r2seq}  = substr( $pair_ref->{r2seq},  $p2mid_length );
        $pair_ref->{r2qual} = substr( $pair_ref->{r2qual}, $p2mid_length );

        # Increment the number of times this sequence has occurred
        if ( length($dna_sequence) >= $trim ) {

            # Remove MID from read 1 quality string
            if ( $quality_tail =~ m{^(.{$p1mid_length})(.+)$}xms ) {
                $quality_tail = $2;
            }
            $dna_sequence = substr $dna_sequence, 0, $trim;
            $quality_tail = substr $quality_tail, 0, $trim;

            if ( $output_fastq && ( $pair_ref->{r1name} ne q{} ) ) {
                print { $pool_handles{$pool} } "$pair_ref->{r1name} ";
            }
            print { $pool_handles{$pool} } "$dna_sequence $quality_tail"
              or die "Can't write to pools file: $OS_ERROR\n";
            if ( $read2_file ne q{} ) {
                if ( $output_fastq && ( $pair_ref->{r2name} ne q{} ) ) {
                    print { $pool_handles{$pool} } " $pair_ref->{r2name}";
                }
                print { $pool_handles{$pool} }
                  " $pair_ref->{r2seq} $pair_ref->{r2qual}";
            }
            print { $pool_handles{$pool} } "\n";
            $record_count_valid++;
        }

        # Otherwise, sequence is too short; throw it away
        else {
            print {$invalid_handle} 'Sequence shorter than ' 
              . $trim
              . ' base pairs                : '
              . "$pair_ref->{r1seq} $pair_ref->{r1qual} $dna_sequence\n"
              or die "Can't write to $invalid_file: $OS_ERROR\n";
            $short_sequence_count++;

        }
    }
    close $read1_in or die "Can't close $read1_file: $OS_ERROR\n";

    if ( $read2_file ne q{} ) {
        close $read2_in or die "Can't close $read2_file: $OS_ERROR\n";
    }

    foreach my $pool ( keys %pool_handles ) {
        close $pool_handles{$pool}
          or die "Can't close pool files: $OS_ERROR\n";
    }

    if ($verbose) { print "Sorting pooled reads...\n"; }

    my @pool_ids = sort keys %pool_handles;
    my $pm       = new Parallel::ForkManager( scalar(@pool_ids) );
    $pm->set_max_procs($max_processes);

    foreach my $pool_id (@pool_ids) {
        $pm->start and next;

        my $read_filename = "./$directory/$pool_id\.reads";
        if ($verbose) { print "Sorting $read_filename...\n"; }

        sort_reads_file(
            {
                directory     => $directory,
                read_filename => $read_filename,
                sort_start    => $res_site_length + 1,
                read1_field   => $output_fastq ? 2 : 1,
            }
        );

        # Convert reads to FASTQ
        if ($output_fastq) {
            if ($verbose) {
                print "Converting $pool_id to FASTQ format...\n";
            }

            my $fastq1_file;
            my $fastq2_file;
            if ( $read2_file ne q{} ) {
                open $fastq1_file, '>', "./$directory/$pool_id\_1\.fastq";
                open $fastq2_file, '>', "./$directory/$pool_id\_2\.fastq";
            }
            else {
                open $fastq1_file, '>', "./$directory/$pool_id\.fastq";
            }

            open my $read_file, '<', $read_filename;
            while ( my $read_line = <$read_file> ) {
                chomp $read_line;

                my @read_fields = split / /, $read_line;

                print $fastq1_file
"\@$read_fields[0]\n$read_fields[1]\n\+$read_fields[0]\n$read_fields[2]\n";

                if ( $read2_file ne q{} ) {
                    print $fastq2_file
"\@$read_fields[3]\n$read_fields[4]\n\+$read_fields[3]\n$read_fields[5]\n";
                }
            }
            close $read_file;
            if ( $read2_file ne q{} ) { close $fastq2_file; }
            close $fastq1_file;
            unlink($read_filename);
        }

        $pm->finish;
    }

    $pm->wait_all_children;

    print {$invalid_handle}
      "$record_count_valid records loaded, but discarded \n"
      . "$non_fastq_count records not in FASTQ format, \n"
      . "$wrong_rad_format_count records not in RAD format, \n"
      . "$wrong_ressite_count records not matching restriction enzyme site, \n"
      . "$wrong_p1mid_count records not matching any listed P1 MIDs, \n"
      . "$wrong_p2mid_count records not matching any listed P2 MIDs, and \n"
      . "$short_sequence_count records with high-quality sequences less than $trim bp long\n"
      or die "Can't write to $invalid_file: $OS_ERROR\n";

    close $invalid_handle or die "Can't close $invalid_file: $OS_ERROR\n";

    # Output records loaded and discarded
    if ($verbose) {

        print "$record_count_valid records loaded, but discarded \n"
          . "$non_fastq_count records not in FASTQ format, \n"
          . "$wrong_rad_format_count records not in RAD format, \n"
          . "$wrong_ressite_count records not matching restriction enzyme site, \n"
          . "$wrong_p1mid_count records not matching any listed P1 MIDs, \n"
          . "$wrong_p2mid_count records not matching any listed P2 MIDs, and \n"
          . "$short_sequence_count records with high-quality sequences less than $trim bp long\n"
          . "Done\n";
    }

    return;
}

#############################################################################
###
### SUBROUTINES
###
#############################################################################

sub make_fuzzy_seqs {
    my ( $seq, $fuzzy_list_ref ) = @_;

    my $seq_length = length $seq;

    # Unpack IUPAC bases
    my @unpacked_seqs;

    if ($seq =~ /[RYSWKMBDHVN]/) {
        my @bases = split //, $seq;
        my $iupac_stubs_ref = get_unpacked_seqs(\@bases);
        @unpacked_seqs = @{$iupac_stubs_ref};
    }
    else {
        push @unpacked_seqs, $seq;
    }

    for my $unpacked_seq (@unpacked_seqs) {
        for my $i ( 1 .. $seq_length ) {
            for my $base (qw{A C G T N}) {
                my $fuzzyseq   = $unpacked_seq;
                my $prebase_i  = $i - 1;
                my $postbase_i = $seq_length - $i;
                $fuzzyseq =~ s{^([ACGT]{$prebase_i})
           ([ACGT])
           ([ACGT]{$postbase_i})$}
           {$1$base$3}xms;
                push @{$fuzzy_list_ref}, $fuzzyseq;
            }
        }
    }
    return;
}

sub get_unpacked_seqs {
    my ($bases_ref) = @_;

    my $base1 = shift @{$bases_ref};
    
    my @iupac_stubs;
    @iupac_stubs = @{$bases_ref} > 0 ? @{get_unpacked_seqs($bases_ref)} : ('');
    
    # IUPAC code constants
    my %iupac_codes = (
        'A' => 'A',
        'C' => 'C',
        'G' => 'G',
        'T' => 'T',
        'R' => 'AG',
        'Y' => 'CT',
        'S' => 'GC',
        'W' => 'AT',
        'K' => 'GT',
        'M' => 'AC',
        'B' => 'CGT',
        'D' => 'AGT',
        'H' => 'ACT',
        'V' => 'ACG',
        'N' => 'ACGT',
    );
    
    
    my $iupac_string = $iupac_codes{$base1};
    
    my @unpacked_seqs;
    for my $iupac_base (split //, $iupac_string) {
        for my $iupac_stub (@iupac_stubs) {
            push @unpacked_seqs, $iupac_base . $iupac_stub;
        }
    }

    return \@unpacked_seqs;
}

sub load_pools_and_mids {
    my ( $mid_pools_ref, $directory, $verbose, $fuzzy_MIDs ) = @_;

    if ( $directory ne q{} ) {

        # Load pools from file into an array
        my $pool_file = $directory . '.pools';
        open my $poolhandle, '<', $pool_file
          or die "Couldn't open '$pool_file': $OS_ERROR\n";
        if ($verbose) { print "Loading pools from $pool_file...\n"; }
        my @pools = <$poolhandle>;
        close $poolhandle or die "Can't close $pool_file: $OS_ERROR\n";

        if ( !-d "./$directory" ) {
            mkdir "./$directory"
              or die
"Can't create directory for $directory output files: $OS_ERROR!\n";
            if ($verbose) { print "Created ./$directory...\n"; }

        }

        my %pool_names;

        # Load MIDs for all pools
        foreach my $pool_line (@pools) {
            chomp $pool_line;

            next if ( $pool_line eq '' );

            my @pool_mid_list = split /\s+/, $pool_line;
            if ( @pool_mid_list < 2 ) {
                print STDERR
"Faulty pool: $pool_line - must have at least a name and one MID separated by whitespace\n";
                next;
            }

            my $pool_name = shift @pool_mid_list;

            if ( defined $pool_names{$pool_name} ) {
                print STDERR
                  "Faulty pool: $pool_name - pool names must be unique\n";
                next;
            }
            $pool_names{$pool_name}++;

            # Load MIDs into array and set up hash of MIDs for pool lookup

            foreach my $pool_mid (@pool_mid_list) {
                my $p1mid = $pool_mid;
                my $p2mid = "";
                if ( $pool_mid =~ /:/ ) {
                    my @p1p2mids = split /:/, $pool_mid;
                    $p1mid = $p1p2mids[0];
                    $p2mid = $p1p2mids[1];
                }

                if (   ( $p1mid !~ /[ACGT]+/ )
                    || ( ( $p2mid ne "" ) && ( $p2mid !~ /[ACGT]+/ ) ) )
                {
                    print STDERR
"Faulty MID : $pool_name $pool_mid - MIDs must contain only ACGT\n";
                    next;
                }

                if ( !$fuzzy_MIDs ) {
                    push @{ $mid_pools_ref->{$p1mid}{$p2mid} }, $pool_name;
                }
                else {
                    my @p1seqs;
                    my @p2seqs;
                    make_fuzzy_seqs( $p1mid, \@p1seqs );
                    make_fuzzy_seqs( $p2mid, \@p2seqs );
                    if ( @p2seqs == 0 ) { push @p2seqs, ""; }
                    foreach my $p1seq (@p1seqs) {
                        foreach my $p2seq (@p2seqs) {
                            push @{ $mid_pools_ref->{$p1seq}{$p2seq} },
                              $pool_name;
                        }
                    }
                }
            }
        }
    }
    else {
        die 'Please provide the name of your pools file using the -d option';
    }
    return;
}

sub open_read_files {
    my ( $mid_pools_ref, $pool_handles_ref, $directory, $fuzzy_MIDs ) = @_;

    my $p1mid_length = 0;
    my $p2mid_length = 0;

    # Collapse lists of pools for each MID into unique, sorted, string
    foreach my $p1mid ( keys %{$mid_pools_ref} ) {
        $p1mid_length = length $p1mid;
        foreach my $p2mid ( keys %{ $mid_pools_ref->{$p1mid} } ) {
            $p2mid_length = length $p2mid;
            my @pool_list = sort @{ $mid_pools_ref->{$p1mid}{$p2mid} };
            my $seen      = q{};

            # Get unique elements of list
            @pool_list =
              grep { ( $_ ne $seen ) && ( ($seen) = $_ ) } @pool_list;

            if ( ( !$fuzzy_MIDs ) && ( @pool_list > 1 ) ) {
                delete $mid_pools_ref->{$p1mid}{$p2mid};
            }
            else {

                my $pool_string = q{};

                # Collapse list of pools
                foreach my $pool (@pool_list) {
                    $pool_string = $pool_string . $pool . '.';
                }

                # Get rid of trailing .
                chop $pool_string;
                $mid_pools_ref->{$p1mid}{$p2mid} = $pool_string;

                open $pool_handles_ref->{$pool_string}, '>',
                  "./$directory/$pool_string\.reads"
                  or die "Can't open output file for $pool_string: $OS_ERROR\n";
            }
        }
    }
    return { p1mid_length => $p1mid_length, p2mid_length => $p2mid_length };
}

#############################################################################
### Name:       GET TIMESTAMP
### Function:   gets and formats the current time
### Parameters: None
### Returns:    Current time as YYYYMMDD_HHMMSS
#############################################################################

sub get_timestamp {
    my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime time;
    $mon++;
    $year += 1900;
    if ( $mon < 10 )  { $mon  = "0$mon"; }
    if ( $mday < 10 ) { $mday = "0$mday"; }
    if ( $hour < 10 ) { $hour = "0$hour"; }
    if ( $min < 10 )  { $min  = "0$min"; }
    if ( $sec < 10 )  { $sec  = "0$sec"; }

    return $year . $mon . $mday . '_' . $hour . $min . $sec;
}

# return true;
1;

__END__

#############################################################################
###
### DOCUMENTATION
###
#############################################################################

=head1 NAME

RADpools - Take raw Illumina RAD reads and create read files for a set of pools

=head1 VERSION

This documentation refers to RADtools version $main::VERSION.

=head1 SYNOPSIS

=over 8

=item RADpools --in <FILE> --directory <DIRECTORY> [options]

=item RADpools --help

=back

=head1 OPTIONS

=over 8

=item B<-h, --help>

Print a brief help message and exit

=item B<-u, --usage>

Print concise usage and exit

=item B<--man>

Print the manual page and exit

=item B<--version>

Print version number and exit

=item B<-v, --verbose>

Output status messages during run (default off)

=item B<-i, --in>

File containing single end Illumina RAD reads in FASTQ format (required)

=item B<-p, --paired>

File containing paired end Illumina RAD reads in FASTQ format (optional)

=item B<-d, --directory>

Name of output directory, matching name of pools file (required)

=item B<-m, --max_processes>

Maximum number of processes to use for sorting read pools after input files have been processed. Default 1; increasing --max_processes is highly recommended if more cores are available.

=item B<-e, --enzyme>

Sequence of cut site for used restriction enzyme (default TGCAGG, SbfI). IUPAC codes are allowed.

=item B<-q, --quality>

Minimum acceptable quality score (default 0, ie all bases accepted)

=item B<-t, --trim>

Trim all reads to this length (default max length of read, ie no trimming)

=item B<-f, --fuzzy_MIDs>

If a MID contains an error, search for the closest MID sequence. If more than one MID found, create a new MID for the intermediate sequence (default off)

=item B<-s, --sanger>

Specify this option if input files are in fastq-sanger format, not fastq-illumina format.

=item B<-o, --output_fastq>

Outputs FASTQ format rather than reads format.

=back

=head1 DESCRIPTION

B<RADpools> accepts as input one or two FASTQ files containing single or
paired end GERALD output from an Illumina RAD sequencing run. These RAD
sequences contain MIDs which identify the individuals from which the
sequence came. This script loads in the pools to which the MIDs belong and
assigns the sequences to the appropriate pools. It then writes out the
sequences to separate files, one for each pool.

=head1 AUTHOR

John Davey <john.davey@ed.ac.uk>

=head1 LICENCE AND COPYRIGHT

Copyright 2008, 2011 John Davey, University of Edinburgh <john.davey@ed.ac.uk>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

=cut