/lib/MonkeyShines/Sequence.pm
Perl | 2315 lines | 1549 code | 169 blank | 597 comment | 170 complexity | 09dba631b6a905d5f035d49393157456 MD5 | raw file
Large files files are truncated, but you can click here to view the full file
- package MonkeyShines::Sequence;
- use warnings;
- use strict;
- use Carp;
- our $VERSION = '0.1';
- use File::Path qw(make_path);
- use FindBin qw($Bin);
- use MonkeyShines::Utils qw (
- timestamp_id
- read_from_file
- read_from_delim
- save_to_file
- save_to_delim
- mean
- median_hash
- binomial_coeff
- random_select
- random_select_and_remove
- list_exists
- flip_hash
- execute_check
- );
- # Only need this module if we use the embl_fetch function, which is
- # currently mothballed at the bottom of this module...
- #use LWP::Simple;
- use Exporter;
- our @ISA = qw(Exporter);
- our @EXPORT_OK = qw(
- download_ucsc maf_index maf_blocks maf_blocks_fill phastcons phastcons_scores
- sequence_portion sequence_flanking flanking_lengths
- find_fasta_sequence match_fasta_sequences
- from_x_to_y from_clustal from_fasta from_maf from_maf_crop_id from_paml
- from_phylip_i from_phylip_s from_phylip
- to_clustal to_fasta to_maf to_paml to_phylip_i to_phylip_s to_phylip
- complement complement_rev
- reverse_seqs remove_gaps excise_columns prune_alignment
- nucleotide_freqs nucleotide_freqs_multiple
- gc_content gc_content_multiple at_content at_content_multiple
- pairwise_identity pairwise_identity_site all_pairwise_identities median_pairwise_identity
- consensus_sequence
- alignment_islands
- alignment_columns columns_to_alignment
- shuffle_sequence randomise_alignment shuffle_alignment random_structure
- clustalw mafft mafft_fa blastn
- blat_server_start blat_server_stop blat blat_top_hit blat_top_hits fetch_dna
- parse_structure sci
- );
- # This module contains wrapper scripts that execute various bits of software.
- # If these are in the user's execution path, the global variable $bin_dir
- # can simply be an empty string; otherwise it needs to be the path to the
- # directory that contains the executables (or that contains symbolic links to
- # the executables).
- my $bin_dir = "";
- # Sometimes scripts from other software packages need to be executed; this
- # directory defines their location.
- my $resources_dir = "$Bin/../../resources";
- =pod
- =head1 Name
- MonkeyShines::Sequence - Sequence processing.
- =head1 Description
- MonkeyShines::Sequence is a collection of functions for sequence processing,
- including alignment, format conversion, and sequence statistics.
- This module contains wrapper scripts that execute various bits of software.
- Generally, these rely on the software being in the user's execution path;
- an alternative is to set the global variable $bin_dir in this module to the
- path to the directory that contains the executables (or that contains
- symbolic links to the executables).
- List of software (executables) used by functions in this module:
- BLAT (blat faToTwoBit gfClient, gfServer)
- bx-python
- ClustalW (clustalw)
- Galaxy
- MAFFT (mafft, ginsi, linsi)
- SISSIz (sissiz)
- uShuffle (ushuffle)
- =head1 Functions
- =head2 Sequence Retrieval and Parsing
- =over
- =item B<C<undef> = download_ucsc($type, $data_dir[, $species][, $chrs][, $x_way][, $ftp_url][, $build][, $unzip])>
- Retrieve data from UCSC. The type of data is specified by $type, which must
- be either 'fasta', 'maf', or 'phastcons'. The defaults are set up to get human
- and vertebrate datafrom the MultiZ 46-way alignment: $species = 'Homo_sapiens';
- $x_way = '46way'. For other species, check the UCSC ftp site, to get these
- values. $build defaults to the current build for the reference species $species.
- To retrieve data for specific chromosomes, provide an arrayref, $chrs (useful
- for avoiding data from undetermined locations). Sometimes these files are
- enormous, so I wouldn't recommend unzipping them until you need them, unless
- you have memory and disk space to burn. All of the files are stored in
- $data_dir/$type, which is created if it doesn't exist. The program checks there
- before downloading, to see if newer copies of the file are available, and only
- downloads if necessary. The formatting of phastCons filenames is entirely
- inconsistent, so symbolic links are created with the format, $chr.wig.gz.
- =item B<C<undef> = maf_index($maf_dir)>
- For each maf file in $maf_dir, create an index file with a script from
- the bx-python project (./scripts/bx-python/maf_build_index.py). This is a
- time- and memory-intensive process, but is essential for efficiently
- executing subsequent functions. If the maf files are gzipped, then they
- will be unzipped, processed, then rezipped, one at time.
- Obviously you need to have installed bx-python, which is fairly
- straightforward: http://bitbucket.org/james_taylor/bx-python/wiki/HowToInstall
- While you're at it, you might just want to install Galaxy, as it is needed for
- other processing tasks: http://bitbucket.org/galaxy/galaxy-central/wiki/GetGalaxy
- =item B<$sub_seq = sequence_portion($seq, $start, $stop)>
- Return the sequence between $start and $stop for a given sequence $seq.
- =item B<($flanking_l, $flanking_r) = sequence_flanking($seq, $sub_seq[, $left][, $right][, $pad])>
- Return the flanking sequences on each side of $sub_seq for a given
- sequence $seq. The default amount of flanking sequence returned is
- the length of $sub_seq, on either side. To retrieve a specific amount of
- flanking, use integer values for $left and $right; $right is set to the
- same value as $left if only $left is given. If $pad is true then the
- returned flanking regions will be padded with gaps at either end, so
- that each is the specified length (this is not done by default). An error
- will occur if $sub_seq does not appear in $seq, or if it appears multiple
- times.
- =item B<$seq = find_fasta_sequence($fasta_string, $id)>
- Get a single sequence from fasta-formatted data, whose ID matches $id.
- The returned sequence $seq retains the formatting, ie newlines.
- =item B<[$%]seqs = match_fasta_sequences($fasta_string, $match)>
- Get one or more sequences from fasta-formatted data, where the IDs match $match.
- The returned sequences retain their formatting, ie newlines.
- =back
- =head2 Format Conversion
- =over
- =item B<$out = from_x_to_y($from, $to, $in[, $out_file])>
- Convert an alignment, $in, from one format to another, $out.
- $from and $to can take the following values: "clustal", "fasta", "maf",
- "paml". Some of these formats are covered by the 'readseq'
- software, but are duplicated here for ease of use. If $out_file is
- given, then $out is saved there.
- =item B<($seqs, $order) = from_<format>($in)>
- <format> can be one of "clustal", "fasta", "maf", "paml",
- e.g. from_clustal($in). Convert $in, an alignment in that format, to
- a hashref, $seqs, with ids and sequences as keys and values, respectively.
- Since the order of sequences in a file may be important, $order is a
- hashref with ids as keys and incremental numbers as values.
- =item B<($out) = to_<format>($seqs[, $order])>
- <format> can be one of "clustal", "fasta", "maf", "paml",
- e.g. to_clustal($seqs, $order). Save a hashref, $seqs with ids and
- sequences as keys and values, respectively, in the specified format.
- $order is a hashref with ids as keys and integer values that indicate
- the order of the sequences in $out.
- =item B<C<undef> = maf_blocks($bed_file, $maf_dir, $chr[, $species][, $out_file])>
- Retrieve the MAF blocks for the interval(s) specified in $bed_file, using
- the files in $maf_dir; the appropriate files are chosen according to the
- chromosome, given by $chr. Indexes are required, for tolerable speed of
- execution (see the maf_index function), and these will be created if they do
- not exist. The appropirate MAF file will be unzipped then rezipped, if
- necessary. If this function will be called repeatedly, it's wisest to unzip
- the MAF file before executing the loop that does the repeated calling, as
- this'll save a load of time. The maf data is saved to $outfile, which
- defaults to $bed_file with the file type changed from 'bed' to 'maf'.
- The function uses the Galaxy script ./scripts/galaxy/maf/interval2maf.py, so
- Galaxy needs to be installed: http://bitbucket.org/galaxy/galaxy-central/wiki/GetGalaxy
- =back
- =head2 Sequence Manipulation
- =over
- =item B<$comp_seq = complement($seq[, $rna])>
- Generate the complement of a sequence; set $rna to true for RNA sequence.
- =item B<$comp_rev_seq = complement_rev($seq[, $rna])>
- Generate the reverse complement of a sequence; set $rna to true for RNA
- sequence.
- =item B<%new_seqs | $new_seqs = reverse_seqs($seqs)>
- Reverse every sequence in the hashref $seqs (handy for HoT calculations).
- =item B<%new_seqs | $new_seqs = remove_gaps($seqs)>
- Remove gaps from sequences stored in the hashref $seqs. Gaps are defined here
- as dots, dashes, and whitespace characters.
- =item B<%new_seqs | $new_seqs = excise_columns($seqs[, $order][, $regexes])>
- Remove columns that have particular properties, e.g. those that consist
- entirely of the same character, or of gaps and a single character
- ("[\.\-]*[a-zA-Z][\.\-]*"). It can be tricky to write a single regex to
- cover all cases, so the function accepts an arrayref of regexes. The
- default is to remove columns which are entirely gaps. The alignment is
- specified by the hashref $seqs, with IDs as keys and sequences as values.
- If the order of the sequences is important (e.g. a regex relies on a
- reference sequence being the first row), $order can be used to provide
- a hashref with the same keys as $seqs, and numeric values that give the
- ordering. This function is useful when rows have been removed from a multiple
- alignment, leaving some columns that consist entirely of gaps.
- =item B<%new_seqs | $new_seqs = prune_alignment($seqs, $taxa)>
- From a hashref $seqs, with IDs as keys and sequences as values, remove
- the sequences with IDs in the arrayref $taxa, then use 'excise_columns'
- to remove any columns which consist entirely of gaps.
- =back
- =head2 Sequence Statistics
- =over
- =item B<%freqs | $freqs = nucleotide_freqs($sequence[, $k_let][, $merge][, $return_counts])>
- Calculate the mono-, di-, or tri- nucleotide frequencies of $sequence, by
- setting $k_let = 1 (default), 2, or 3, respectively. For $k_let = 2 or 3,
- if $merge is true then palindromes are considered to be equivalent, and
- the counts are combined; e.g. the totals for AC and CA are added. A hash,
- or hashref, is returned with the mono-, di-, or tri- nucleotides as keys;
- and the frequencies (to 6d.p.) as values, unless $return_counts is true,
- in which case the integer counts are the values. (In fact, $k_let can be
- any integer value, so this function could be used to detect 'words' of
- any length in sequence data.)
- =item B<%freqs | $freqs = nucleotide_freqs_multiple($sequences[, $k_let][, $merge])>
- Calculate the mono-, di-, or tri- nucleotide frequencies of an array of
- sequences, $sequences, by using the 'nucleotide_freqs' function. Any gap
- characters will be removed from the sequences before calculations are done.
- =item B<$freq = gc_content($sequence)>
- =item B<$freq = at_content($sequence)>
- Calculate the GC or AT(U) content of a sequence, to 6d.p.
- =item B<$freq = gc_content_multiple($sequences)>
- =item B<$freq = at_content_multiple($sequences)>
- Calculate the GC or AT(U) content of an arrayref of sequences, to 6d.p.
- Gap characters are removed before doing the calculation.
- =item B<$ident = pairwise_identity($sequences[, $calc_type][, $ignore_case])>
- Calculate the pairwise identity of an alignment, given by the arrayref
- $sequences; if more than two sequences are given, the mean pairwise identity
- is calculated. There are a number of ways to define pairwise
- identity, each involving slightly different calculations; options for
- $calc_type are 'include_gaps' (default), 'exclude_gaps', 'shortest', and
- 'mean_length' (cf May (2004) Structure 12:737-8). The (mean) pairwise
- identity, $ident, is returned to 6d.p. By default, the function treats
- lower and upper case verions of the alignment characters as the same;
- set $ignore_case to false to treat them differently.
- =item B<$ident = pairwise_identity_site($column[, $calc_type][, $ignore_case])>
- More or less the same as the pairwise_identity function, except the
- calculations are done for a string values, $column, that represents a column
- in an alignment.
- =item B<($median_id, $median_value) | ($pairwise_identities | %pairwise_identities) =
- all_pairwise_identities($seq, $seqs[, $calc_type][, $median])>
- For a reference sequence $seq, calculate the pairwise identity with a bunch
- of other sequences in turn, then either return all of the values
- ($|%pairwise_identities) or pick the one with the median value and return
- both the ID of the chosen sequence and it's value ($median_id, $median_value).
- The former is the default, the latter is done if $median is true. See the
- pairwise_identity function for more details on $calc_type.
- =item B<$consensus_sequence = consensus_sequence($sequences[, $threshold][, $tie_breaks])>
- There are several ways to get a consensus sequence. At the simplest end
- of the spectrum, the frequency of each character in each column is counted,
- and the highest value chosen (equivalent to setting $threshold = 0, the
- default value). If multiple characters have equally high frequencies, choose
- one at random ($tie_breaks = 'Random', the default).
- If $threshold is greater than 0, then only characters with frequencies
- greater than that are considered. [Not implemented properly yet.]
- The $tie_breaks parameter determines the course of action when characters
- appear equally frequently: 'Random' just selects one at random; 'Nonly'
- and 'Xonly' insert 'N' or 'X', respectively; 'RYN' replaces the character
- with 'R' or 'Y' if both characters are purine or pyrimidine, and 'N'
- otherwise; and 'All' places all possible nucleotides in square brackets,
- such that the returned consensus sequence is effectively a regex.
- =item B<@results | $results = phastcons($bed_file, $phastcons_dir, $chr)>
- Calculate mean, min and max scores, for an interval file, $bed_file. Requires
- the phastCons wigFix files to have been downloaded (to $phastcons_dir), and
- relies on consistent filenaming, as implemented by the 'download_ucsc'
- function. This function will create index files if they don't already exist;
- the original Galaxy file has been augmented to do this automagically
- (./scripts/galaxy/stats/aggregate_scores_in_intervals.py).
- Each array element in the results array/arrayref is a tab-delimited line from
- the bed file, with mean, min, and max columns tacked on the end, in that order.
- =item B<@results | $results = phastcons_scores($bed_file, $phastcons_dir, $chr, $fatal)>
- Get scores for each position, for an interval (.bed) file. Requires the
- phastCons index (.ba) files (which will automatically have been created by
- the 'phastcons' function). Uses the Galaxy script
- ./scripts/galaxy/stats/scores_in_intervals.py; if this script fails then
- the execution will be halted; set $fatal to zero to just display a warning.
- The return value is an arrary or arrayref where each element is a tab-delimited
- string of three variables, chr, position, score.
- =item B<@columns | $columns = alignment_columns($seqs, $order)>
- For a hashref of sequences $seqs, and a hashref with the same keys that
- specifies the ordering of the sequences, extract the columns into an
- array/arrayref.
- =item B<%seqs | $seqs = columns_to_alignment($columns, $order)>
- The inverse of the alignment_columns function, to translate an arrayref
- of columns to a hashref of sequences, indexed with the keys of $order.
- =back
- =head2 Shuffle Sequences
- =over
- =item B<$shuffled = shuffle_sequence($sequence[, $k_let])>
- Shuffle a single sequence whilst retaining nucleotide composition, where
- $k_let is the number of consecutive nucleotides to consider (default is 1).
- This a simple wrapper to ushuffle, which needs to be installed locally.
- =item B<$randomised = randomise_alignment($aln_file[, $method][, $options][, $out_file][, $fatal])>
- Randomise an alignment with either MultiPerm (the default for $method)
- or SISSIz, preserving (approximately) dinucleotide content. Additional
- arguments can be provided with the $options parameter - see the
- MultiPerm/SISSIz documentation for possible values and formatting.
- The randomised alignment is returned as a single string, and is saved
- in $out_file, if provided. If $fatal is true (the default) then any errors
- will halt execution, rather than just causing a warning.
- MultiPerm and SISSIz do much the same thing, in two different ways; SISSIz's
- approach is arguably better, but it takes roughly 10 times longer to execute,
- and the resulting randomised alignments are difficult to tell apart.
- =item B<$shuffled_seqs = shuffle_alignment($seqs, $order[, $no_replace])>
- Shuffle the columns of an alignment, given by hashrefs $seqs and $order,
- by selecting columns at random either with replacement (the default) or
- without ($no_replace set to true). Very simple, makes no attempt to preserve
- local conservation or gap structure.
- =back
- =head2 Align Sequences
- =head3 ClustalW
- =over
- =item B<$out_file = clustalw($fasta_file[, $out_file][, $options])>
- Align the sequences in $fasta_file with ClustalW (which must be locally
- installed). If $out_file is not provided, the filetype of
- $fasta_file is replaced with 'aln' and the output is saved in that
- location. Command line options to ClustalW can be provided ($options);
- see the ClustalW documentation for options and formatting details.
- =back
- =head3 MAFFT
- =over
- =item B<$out_file = mafft($fasta_file[, $out_file][, $local_or_global])>
- Align the sequences in $fasta_file with MAFFT (which must be locally
- installed). If $out_file is not provided, the filetype of
- $fasta_file is replaced with 'aln' and the output is saved in that
- location. MAFFT can be used in "local" or "global" mode by specifying
- the appropriate string as the $local_or_global parameter; the default
- is to allow MAFFT to do what it thinks is best. If $local_or_global is
- specified, then the 'linsi' or 'ginsi' executables are required, rather
- than 'mafft'.
- =back
- =head3 BLASTN
- =over
- =item B<%results | $results = blastn($sequence[, $db][, $entrez_query][, $options][, $out_file])>
- Execute BLAST in nucleotide mode for $sequence, against database $db, which is
- 'nt' by default. A local BLAST installation is used if possible, but if an
- Entrez query is given, a remote installation is used. A useful value for
- $entrez_query is 'NOT(srcdb refseq model[PROP]', which will ignore hypothetical
- proteins. Additional options can be passed via $options - see the BLAST docs
- for details. If $out_file is given, then the results are saved. The results
- hash that is returned is a hash of hashes; the first key is the sequence ID
- from BLAST (ie giXXX), and the next level of keys are 'start', 'end', 'evalue',
- 'bitscore', and 'pident'.
- =back
- =head3 BLAT
- =over
- =item B<C<undef> = blat_server_start($fasta_dir[, $host][, $port])>
- Start a local BLAT server, for the files in $fasta_dir. By default,
- $host and $port are 'localhost' and 3309, respectively. In order to
- speed up repeated starting of the BLAT server, the fasta files are
- automatically converted to 'two_bit' format, and the data is cached
- in an appropriately named directory, alongside $fasta_dir.
- =item B<C<undef> = blat_server_stop([$host][, $port])>
- Stop a local BLAT server. By default, $host and $port are 'localhost'
- and 3309, respectively.
- =item B<@results | $results = blat($seq_file, $out_file[, $host][, $port][, $verbose])>
- Query a local BLAT server. By default, $host and $port are 'localhost'
- and 3309, respectively. $seq_file should contain a sequence in fasta
- format; the results are saved in $out_file. In addition to the BLAT
- output, a score value is calculated and appended to each row; the rows
- are then sorted on this score value, with the highest values at the top.
- The array of arrays containing the output is returned.
- =item B<($chr, $start, $end, $q_start, $q_end, $strand, $blocks) = blat_top_hit($results)>
- Extract useful information for the top-scoring result from a set of BLAT
- results, as returned by the 'blat' function.
- =back
- =head1 Author
- James Allen
- (james@monkeyshines.co.uk)
- =head1 Copyright
- Copyright 2009-2011 James Allen; University of Manchester
- =head1 Licence
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- =cut
- # SVN Date: $Date: 2012-07-24 15:14:56 +0100 (Tue, 24 Jul 2012) $
- # SVN ID: $Id: Sequence.pm 927 2012-07-24 14:14:56Z Julio $
- ################################################################################
- # SEQUENCE RETRIEVAL AND PARSING
- ################################################################################
- sub download_ucsc {
- my ($type, $data_dir, $species, $chrs, $x_way, $ftp_url, $build, $unzip) = @_;
- croak "Type ('fasta', 'maf', or 'phastcons') must be specified" unless $type;
- croak "Data directory must be specified" unless $data_dir;
- $species = "Homo_sapiens" unless $species;
- $species =~ s/ /_/g;
- $species = ucfirst(lc($species));
- $x_way = "46way" unless $x_way;
- $ftp_url = "ftp://hgdownload.cse.ucsc.edu/goldenPath" unless $ftp_url;
- $build = "currentGenomes/$species" unless $build;
- $unzip = 0 unless $unzip;
-
- my $ucsc_data_dir = "$data_dir/$type";
- make_path($ucsc_data_dir) unless -e $ucsc_data_dir;
- my @ucsc_data_dirs = ($ucsc_data_dir);
- if ($type eq "phastcons") {
- if ($x_way eq "46way") {
- make_path("$ucsc_data_dir/vertebrata") unless -e "$ucsc_data_dir/vertebrata";
- make_path("$ucsc_data_dir/placentalia") unless -e "$ucsc_data_dir/placentalia";
- make_path("$ucsc_data_dir/primates") unless -e "$ucsc_data_dir/primates";
- @ucsc_data_dirs = ("$ucsc_data_dir/vertebrata", "$ucsc_data_dir/placentalia", "$ucsc_data_dir/primates");
- }
- }
-
- if ($chrs) {
- foreach my $chr (@$chrs) {
- if ($type eq "fasta") {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/chromosomes/chr$chr.fa.gz`;
- } elsif ($type eq "maf") {
- # Annoyingly, UCSC isn't consistent about where it puts maf
- # files, so we check both possible locations, but will only
- # get data from one, for a given species.
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/maf/chr$chr.maf.gz`;
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/chr$chr.maf.gz`;
- } elsif ($type eq "phastcons") {
- if ($x_way eq "46way") {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/placentalia $ftp_url/$build/phastCons$x_way/placentalMammals/chr$chr*.gz`;
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/primates $ftp_url/$build/phastCons$x_way/primates/chr$chr*.gz`;
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/vertebrata $ftp_url/$build/phastCons$x_way/vertebrate/chr$chr*.gz`;
- } else {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*chr$chr*.gz`;
- }
- }
- }
- } else {
- if ($type eq "fasta") {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/chromosomes/chr*.fa.gz`;
- } elsif ($type eq "maf") {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/maf/chr*.maf.gz`;
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/chr*.maf.gz`;
- } elsif ($type eq "phastcons") {
- if ($x_way eq "46way") {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/placentalia $ftp_url/$build/phastCons$x_way/placentalMammals/chr*.gz`;
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/primates $ftp_url/$build/phastCons$x_way/primates/chr*.gz`;
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/vertebrata $ftp_url/$build/phastCons$x_way/vertebrate/chr*.gz`;
- } else {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*chr*.gz`;
- }
- }
- }
-
- # Always handy to have the tree(s) downloaded too.
- if ($type eq "maf") {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/*.nh`;
- } elsif ($type eq "phastcons") {
- `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*.mod`;
- }
-
- # The formatting of the phastCons filenames is a dog's dinner - it's
- # entirely inconsistent, so we create links to make them easier to
- # process at a later stage - we don't want to rename them, because
- # then wget will not know that the data already exists.
- if ($type eq "phastcons") {
- foreach my $dir (@ucsc_data_dirs) {
- opendir(DIR, $dir);
- foreach my $file (grep { /\.gz$/ } readdir(DIR)) {
- my ($chr) = $file =~ /(chr\w+)/;
- `ln -s $dir/$file $dir/$chr.wig.gz`;
- }
- closedir(DIR);
- }
- }
-
- if ($unzip) {
- foreach my $dir (@ucsc_data_dirs) {
- opendir(DIR, $dir);
- foreach my $file (grep { /\.gz$/ } readdir(DIR)) {
- (my $unzipped = $file) =~ s/\.gz$//;
- `gunzip -c $dir/$file > $dir/$unzipped`;
- }
- closedir(DIR);
- }
- }
-
- return;
- }
- sub maf_index {
- my ($maf_dir) = @_;
- croak "MAF directory must be specified" unless $maf_dir;
-
- # This function relies on bx-python being installed.
- my $index_script = "$resources_dir/scripts/bx-python/maf_build_index.py";
-
- opendir(DIR, $maf_dir);
- foreach my $maf_file (grep { /\.maf$/ } readdir(DIR)) {
- unless (-e "$maf_dir/$maf_file.index") {
- execute_check("$index_script $maf_dir/$maf_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
- }
- }
- closedir(DIR);
- opendir(DIR, $maf_dir);
- foreach my $maf_file (grep { /\.maf.gz$/ } readdir(DIR)) {
- (my $index_file = $maf_file) =~ s/\.gz$/\.index/;
- unless (-e "$maf_dir/$index_file") {
- `gunzip -c $maf_dir/$maf_file > $maf_dir/unzipped.tmp`;
- execute_check("$index_script $maf_dir/unzipped.tmp $maf_dir/$index_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
- `rm $maf_dir/unzipped.tmp`;
- }
- }
- closedir(DIR);
-
- return;
- }
- sub sequence_portion {
- my ($seq, $start, $stop) = @_;
- return substr($seq, $start-1, $stop-$start);
- }
- sub sequence_flanking {
- my ($seq, $sub_seq, $left, $right, $pad) = @_;
-
- # The default for the flanking region is the length of the sub-sequence,
- # on both sides; if a single length is provided, then it is applied to
- # both sides
- $left = length($sub_seq) unless $left;
- $right = $left unless $right;
-
- # By default, do not pad the flanking regions with
- # gaps if there is insufficient sequence.
- $pad = 0 unless $pad;
-
- # Check how many times the sub-sequence occurs in the sequence
- my $occurrence = () = $seq =~ /($sub_seq)/gi;
- croak "Sub-sequence does not occur within sequence" unless $occurrence;
- croak "Multiple occurrences of sub-sequence in sequence" if $occurrence > 1;
-
- my ($flanking_l, $flanking_r) = $seq =~
- /([\w\-]{0,$left})$sub_seq([\w\-]{0,$right})/i;
-
- if ($pad) {
- if (length($flanking_l) < $left) {
- $flanking_l = "-"x($left-length($flanking_l)).$flanking_l;
- }
- if (length($flanking_r) < $right) {
- $flanking_r = $flanking_r."-"x($right-length($flanking_r));
- }
- }
-
- return ($flanking_l, $flanking_r);
- }
- sub flanking_lengths {
- my ($subseq, $aligned) = @_;
-
- $aligned =~ s/\./\-/g;
- my @aligned = split(//, $aligned);
- (my $ungapped = $aligned) =~ s/\-//g;
- my $occurences = () = $ungapped =~ /$subseq/g;
- if (!defined($occurences) || $occurences == 0) {
- carp " Couldn't find subsequence ($subseq) in sequence ($ungapped).\n";
- } elsif ($occurences > 1) {
- carp " Multiple occurences of subsequence ($subseq) in alignment ($ungapped).\n";
- }
- my $upstream_bases = index($ungapped, $subseq);
- my $upstream_and_subseq_bases = $upstream_bases + length($subseq);
-
- my $bases = 0;
- my ($upstream_length, $subseq_length, $downstream_length) = (0, 0, 0);
- foreach my $position (0..$#aligned) {
- $bases++ if $aligned[$position] ne '-';
-
- if ($bases <= $upstream_bases) {
- $upstream_length++;
- } elsif ($bases < $upstream_and_subseq_bases) {
- $subseq_length++;
- } elsif ($bases == $upstream_and_subseq_bases) {
- $subseq_length++;
- last;
- }
- }
- $downstream_length = length($aligned) - $upstream_length - $subseq_length;
-
- return ($upstream_length, $subseq_length, $downstream_length);
- }
- # Get a single sequence from fasta-formatted data, for a given ID.
- sub find_fasta_sequence {
- my ($fasta_string, $id) = @_;
-
- # The empty parentheses force a count, rather than a true/false result
- my $occurrence = () = $fasta_string =~ /^>$id\s/gms;
- croak "The id '$id' does not occur in the fasta data" unless $occurrence;
- croak "Multiple occurrences of '$id' in the fasta data" if $occurrence > 1;
-
- my ($seq) = $fasta_string =~ /^>$id[^\n]*\n([^>]*)/ms;
- $seq =~ s/\s+//gms;
- return $seq;
- }
- # Get all sequences from fasta-formatted data that match part of the ID line.
- sub match_fasta_sequences {
- my ($fasta_string, $match) = @_;
-
- # If $match is not given, parse the whole file.
- $match = "" unless $match;
-
- my %seqs = $fasta_string =~ /^>([^\n]*$match\S*)[^\n]*\n([^>]*)/gms;
- foreach my $id (keys %seqs) {
- $seqs{$id} =~ s/\s+//gms;
- }
- return wantarray ? %seqs : \%seqs;
- }
- ################################################################################
- ################################################################################
- # FORMAT CONVERSION
- ################################################################################
- sub from_clustal {
- my ($in) = @_;
- my (%seqs, %order);
- my $counter = 0;
- my @lines = split(/\n/, $in);
- foreach my $line (@lines) {
- if ($line =~ /^(\S+)\s+(\S+)$/) {
- $seqs{$1} .= $2;
- unless (exists($order{$1})) {
- $order{$1} = ++$counter;
- }
- }
- }
- return (\%seqs, \%order);
- }
- sub from_fasta {
- my ($in) = @_;
- my (%seqs, %order);
- %seqs = $in =~ /^>(\S+)[^\n]*\n([^>]*)/gms;
- foreach my $id (keys %seqs) {
- $seqs{$id} =~ s/[\n\r ]//gms;
- }
- my @ids = $in =~ /^>(\S+)/gms;
- for (my $i=0; $i<scalar(@ids); $i++) {
- $order{$ids[$i]} = $i+1;
- }
- return (\%seqs, \%order);
- }
- sub from_maf {
- my ($in) = @_;
- my (%blocks, %seqs, %order);
-
- # Ensure that we don't miss the last block if there aren't enough newlines.
- $in .= "\n\n";
- my @blocks = $in =~ /(^a.+?)\n\n/gms;
-
- my $counter = 0;
- foreach my $block (@blocks) {
- my ($start, $seq) = $block =~ /^s\s+\S+\s+(\d+)[^\n]+\s+(\S+)\s*$/ms;
- my @ids = $block =~ /^s\s+(\S+)/gms;
- my %block_seqs = $block =~ /^s\s+(\S+)[^\n]+\s+(\S+)\s*$/gms;
- $blocks{$start}{'seqs'} = \%block_seqs;
- $blocks{$start}{'size'} = length($seq);
-
- foreach my $id (@ids) {
- unless (exists($order{$id})) {
- $order{$id} = $counter++;
- }
- }
- }
-
- foreach my $start (sort keys %blocks) {
- foreach my $id (sort {$order{$a} <=> $order{$b}} keys %order) {
- if (exists($blocks{$start}{'seqs'}{$id})) {
- $seqs{$id} .= $blocks{$start}{'seqs'}{$id};
- } else {
- $seqs{$id} .= "-" x $blocks{$start}{'size'};
- }
- }
- }
-
- return (\%seqs, \%order);
- }
- # The from_maf function works if the species in the alignment are fully
- # assembled on chromosomes, but can run into trouble with multiple contigs
- # for one or more species. We make the assumption that the ids are of the
- # format 'species.contig', and remove the contig information.
- sub from_maf_crop_id {
- my ($in) = @_;
- $in =~ s/^(s\s+\w+)\S+/$1/gm;
- return from_maf($in);
- }
- sub from_paml {
- my ($in) = @_;
- my (%seqs, %order);
- my $counter = 0;
- my @lines = split(/\n/, $in);
- foreach my $line (@lines) {
- if ($line =~ /^(\S+)\s+(\S+)$/) {
- $seqs{$1} .= $2;
- unless (exists($order{$1})) {
- $order{$1} = ++$counter;
- }
- }
- }
- return (\%seqs, \%order);
- }
- sub from_phylip_i {
- my ($data, $relaxed) = @_;
- $relaxed = 1 unless defined $relaxed;
- my (%seqs, %order, @ids);
- my $counter = 0;
-
- my @lines = split(/\n[\n \t]*/, $data);
- my $info = shift @lines;
- my ($count, $length) = $info =~ /(\d+)\s+(\d+)/;
-
- foreach my $i (1..$count) {
- my $line = shift @lines;
- my ($id, $seq);
- if ($relaxed) {
- ($id, $seq) = $line =~ /^(\S+)\s+(.+)/;
- } else {
- ($id, $seq) = $line =~ /^(.{10})(.+)/;
- }
- $id =~ s/\s+$//g;
- $seqs{$id} = $seq;
- $order{$id} = ++$counter;
- push @ids, $id;
- }
-
- while (@lines) {
- foreach my $i (1..$count) {
- $seqs{$ids[$i-1]} .= shift @lines;
- }
- }
-
- foreach my $id (keys %seqs) {
- $seqs{$id} =~ s/\s+//g;
- }
-
- return (\%seqs, \%order);
- }
- sub from_phylip_s {
- my ($in) = @_;
- my (%seqs, %order);
- my $counter = 0;
- my @lines = split(/\n/, $in);
- foreach my $line (@lines) {
- if ($line =~ /^(\S+)\s+(\S+)$/) {
- $seqs{$1} .= $2;
- unless (exists($order{$1})) {
- $order{$1} = ++$counter;
- }
- }
- }
- return (\%seqs, \%order);
- }
- sub from_phylip {
- my ($in) = @_;
- carp "Conversion from PHYLIP assuming an interleaved file, strict format.";
- return from_phylip_i($in, 0);
- }
- sub to_clustal {
- my ($seqs, $order) = @_;
- my $out;
- my @ids = keys(%$seqs);
- $out = "CLUSTAL [Format Conversion] multiple sequence alignment\n\n\n";
-
- my $max_chunks = 0;
- my %seq_chunks;
- foreach my $id (@ids) {
- my @chunks = $$seqs{$id} =~ /(.{1,60})/gms;
- $max_chunks = scalar(@chunks) if scalar(@chunks) > $max_chunks;
- $seq_chunks{$id} = \@chunks;
- }
-
- unless ($order) {
- for (my $i=0; $i<scalar(@ids); $i++) {
- $$order{$ids[$i]} = $i+1;
- }
- }
- for (my $i=0; $i<$max_chunks; $i++) {
- foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
- if (exists($seq_chunks{$id})) {
- $out .= "$id ".$seq_chunks{$id}[$i]."\n";
- }
- }
- $out .= "\n\n";
- }
- return $out;
- }
- sub to_fasta {
- my ($seqs, $order) = @_;
- my $out;
- my @ids = keys(%$seqs);
- unless ($order) {
- for (my $i=0; $i<scalar(@ids); $i++) {
- $$order{$ids[$i]} = $i+1;
- }
- }
- foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
- if (exists($$seqs{$id})) {
- (my $seq = $$seqs{$id}) =~ s/([\w\-\.]{1,80})/$1\n/gms;
- $out .= ">$id\n$seq\n";
- }
- }
- return $out;
- }
- sub to_maf {
- my ($seqs, $order) = @_;
- my $out;
- my @ids = keys(%$seqs);
- my $seq_length = length($$seqs{random_select(\@ids)});
- $out = "##maf version=1\n";
- $out .= "# Format Conversion, with dummy positional data\n\n";
- $out .= "a ENTRY=Alignment\n";
- unless ($order) {
- for (my $i=0; $i<scalar(@ids); $i++) {
- $$order{$ids[$i]} = $i+1;
- }
- }
- foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
- if (exists($$seqs{$id})) {
- $out .= "s $id\t0\t$seq_length\t+\t$seq_length\t".$$seqs{$id}."\n";
- }
- }
- $out .= "\n";
- return $out;
- }
- sub to_paml {
- my ($seqs, $order) = @_;
- my $out;
- my @ids = keys(%$seqs);
- my $seq_count = scalar(keys(%$seqs));
- my $seq_length = length($$seqs{$ids[0]});
- $out = " $seq_count $seq_length\n";
- unless ($order) {
- for (my $i=0; $i<scalar(@ids); $i++) {
- $$order{$ids[$i]} = $i+1;
- }
- }
- foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
- if (exists($$seqs{$id})) {
- my $seq = $$seqs{$id};
- #$id =~ s/(.{28}).*/$1/;
- $out .= "$id $seq\n";
- }
- }
- return $out;
- }
- sub to_phylip_i {
- my ($seqs, $order, $relaxed) = @_;
- $relaxed = 1 unless defined $relaxed;
- my $out;
- my @ids = keys(%$seqs);
- my $seq_count = scalar(keys(%$seqs));
- my $seq_length = length($$seqs{$ids[0]});
- $out = " $seq_count $seq_length\n";
- unless ($order) {
- for (my $i=0; $i<scalar(@ids); $i++) {
- $$order{$ids[$i]} = $i+1;
- }
- }
- foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
- (my $seq = $$seqs{$id}) =~ s/(.{1,10})/$1 /gm;
- $seq =~ s/(.{1,54}) /$1\n/gm;
- my @seq = split(/\n/, $seq);
-
- if ($relaxed) {
- $out .= "$id ";
- } else {
- $out .= sprintf "%-10.10s", $id;
- }
- $out .= shift @seq;
- $out .= "\n";
-
- $$seqs{$id} = \@seq;
- }
-
- my $chunks = scalar(@{$$seqs{$ids[0]}});
- foreach my $i (1..$chunks) {
- $out .= "\n";
- foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
- $out .= " ";
- $out .= shift @{$$seqs{$id}};
- $out .= "\n";
- }
- }
-
- return $out;
- }
- sub to_phylip_s {
- my ($seqs, $order) = @_;
- my $out;
- my @ids = keys(%$seqs);
- my $seq_count = scalar(keys(%$seqs));
- my $seq_length = length($$seqs{$ids[0]});
- $out = " $seq_count $seq_length\n";
- unless ($order) {
- for (my $i=0; $i<scalar(@ids); $i++) {
- $$order{$ids[$i]} = $i+1;
- }
- }
- foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
- if (exists($$seqs{$id})) {
- my $seq = $$seqs{$id};
- $out .= "$id $seq\n";
- }
- }
- return $out;
- }
- sub to_phylip {
- my ($seqs, $order) = @_;
- carp "Conversion to PHYLIP assuming an interleaved file, strict format.";
- return to_phylip_i($seqs, $order, 0);
- }
- sub from_x_to_y {
- my ($from, $to, $in, $out_file) = @_;
- my ($seqs, $order, $out);
-
- if (lc($from) eq "ama") {
- ($seqs, $order) = from_maf($in);
- } elsif (lc($from) eq "clustal") {
- ($seqs, $order) = from_clustal($in);
- } elsif (lc($from) eq "fasta") {
- ($seqs, $order) = from_fasta($in);
- } elsif (lc($from) eq "maf") {
- ($seqs, $order) = from_maf($in);
- } elsif (lc($from) eq "paml") {
- ($seqs, $order) = from_paml($in);
- } elsif (lc($from) eq "phylip") {
- ($seqs, $order) = from_phylip($in);
- } else {
- croak "Unrecognised format '$from'";
- }
-
- if (lc($to) eq "ama") {
- $out = to_maf($seqs, $order);
- } elsif (lc($to) eq "clustal") {
- $out = to_clustal($seqs, $order);
- } elsif (lc($to) eq "fasta") {
- $out = to_fasta($seqs, $order);
- } elsif (lc($to) eq "maf") {
- $out = to_maf($seqs, $order);
- } elsif (lc($to) eq "paml") {
- $out = to_paml($seqs, $order);
- } elsif (lc($to) eq "phylip") {
- $out = to_phylip($seqs, $order);
- } else {
- croak "Unrecognised format '$to'";
- }
- save_to_file($out, $out_file) if $out_file;
-
- return $out;
- }
- sub maf_blocks {
- my ($bed_file, $maf_dir, $chr, $species, $out_file) = @_;
- croak "Bed file must be specified" unless $bed_file;
- croak "MAF directory must be specified" unless $maf_dir;
- croak "Chromosome must be specified" unless $chr;
- ($out_file = $bed_file) =~ s/bed$/maf/ unless $out_file;
-
- my $maf_file = "$maf_dir/$chr.maf";
- my $zip_it = 0;
- unless (-e $maf_file) {
- croak "MAF file '$maf_file' does not exist" unless -e "$maf_file.gz";
- `gunzip $maf_file.gz`;
- $zip_it = 1;
- }
-
- my $index_file = "$maf_file.index";
- unless (-e $index_file) {
- my $index_script = "$resources_dir/scripts/bx-python/maf_build_index.py";
- execute_check("$index_script $maf_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
- }
-
- # The Galaxy code requires that a build be specified, which is a bit of
- # a hassle, because it can easily be determined from the first line of
- # the maf file; and, indeed, the Galaxy code requires that the location
- # of the unzipped maf file is given as well as the index, so it must be
- # doing something with it.
- my $first_line = `grep -m 1 '^s ' $maf_file`;
- my ($build) = $first_line =~ /^s (\w+)/;
-
- my $maf_script = "$resources_dir/scripts/galaxy/maf/interval2maf.py";
- my $maf_options = "--dbkey=$build ";
- $maf_options .= "--chromCol=1 --startCol=2 --endCol=3 --strandCol=6 ";
- if ($species) {
- $maf_options .= "--species=".join(",", @$species)." ";
- }
- $maf_options .= "--mafFile=$maf_file ";
- $maf_options .= "--mafIndex=$index_file ";
- $maf_options .= "--interval_file=$bed_file ";
- $maf_options .= "--output_file=$out_file ";
-
- execute_check("$maf_script $maf_options", "MAF blocks", 1, "Galaxy: interval2maf.py", "fatal");
-
- if ($zip_it) {
- `gzip $maf_file`;
- }
-
- return;
- }
- # Sometimes the reference sequence won't be aligned to anything, which will
- # look like missing data in the maf file, which by definition contains blocks
- # with two or more species. This is a problem when we want to look at large
- # scale genomic data, when the spacing between aligned sections may be important.
- # So we splice in the appropriate bit of sequence as a single-species block.
- sub maf_blocks_fill {
- my ($maf_file, $two_bit_dir, $out_file, $chr, $start, $stop) = @_;
- $out_file = $maf_file unless $out_file;
-
- my $maf = read_from_file($maf_file);
- # Ensure that we don't miss the last block if there aren't enough newlines.
- $maf .= "\n\n";
- my ($new_maf) = $maf =~ /(^#.+\n)/m;
- my @blocks = $maf =~ /(^a.+?)\n\n/gms;
-
- # Assume that the top row in the maf file pertains to the reference species.
- my ($ref_species, $strand, $total) = $maf =~ /^s\s+(\S+)\s+\d+\s+\d+\s+(\S+)\s+(\d+)/m;
- ($chr) = $ref_species =~ /(chr\w+)/ unless $chr;
- if ($total < $stop) {
- $stop = $total;
- carp("Stop position is beyond the end of the chromosome");
- }
-
- # Our cursor is always relative to the positive strand.
- my $cursor = $start;
- foreach my $block (@blocks) {
- my ($block_start, $bases, $block_strand) = $block =~ /^s\s+$ref_species\s+(\d+)\s+(\d+)\s+(\S+)\s+\d+\s+\S+\s*$/m;
- carp "Changed strand!" if $block_strand ne $strand;
-
- # Stuff on the negative strand is a pain in the giblets.
- # In the MAF files the start position is given relative to the
- # strand: so for the -ve strand, to get the position on the +ve
- # strand we need to substract the start from the total length of
- # the chr/contig. It's clearer if we repeat ourselves for +ve
- # and -ve strands, rather than get muddled with a more concise
- # chunk of code.
- if ($block_strand eq '-') {
- my $positive_start = $total - $block_start - $bases;
- if ($cursor < $positive_start) {
- my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $positive_start);
- $seq = complement_rev($seq);
- my $length = length($seq);
- my $negative_start = $total - $cursor - $length;
- $new_maf .= "a score=0.0\n";
- $new_maf .= "s $ref_species $negative_start $length $block_strand $total $seq \n\n";
- }
- $cursor = $positive_start + $bases;
- } else {
- if ($cursor < $block_start) {
- my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $block_start);
- my $length = length($seq);
- $new_maf .= "a score=0.0\n";
- $new_maf .= "s $ref_species $cursor $length $block_strand $total $seq \n\n";
- }
- $cursor = $block_start + $bases;
- }
- $new_maf .= "$block\n\n";
- }
-
- if ($cursor < $stop) {
- my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $stop);
- my $length = length($seq);
- $new_maf .= "a score=0.0\n";
-
- if ($strand eq '-') {
- $seq = complement_rev($seq);
- my $negative_start = $total - $cursor - $length;
- $new_maf .= "s $ref_species $negative_start $length $strand $total $seq \n\n";
- } else {
- $new_maf .= "s $ref_species $cursor $length $strand $total $seq \n\n";
- }
- }
-
- save_to_file($new_maf, $out_file);
-
- return $new_maf;
- }
- ################################################################################
- ################################################################################
- # SEQUENCE MANIPULATION
- ################################################################################
- # Complement and (optionally) reverse a sequence.
- sub complement {
- my ($seq, $rna) = @_;
- if ($rna) {
- $seq =~ tr/ACGURYacgury/UGCAYRugcayr/;
- } else {
- $seq =~ tr/ACGTRYacgtry/TGCAYRtgcayr/;
- }
- return $seq;
- }
- sub complement_rev {
- my ($seq, $rna) = @_;
- return reverse(complement($seq, $rna));
- }
- # Reverse sequences - primarily useful for doing HoT calculations.
- sub reverse_seqs {
- my ($seqs) = @_;
- my %new_seqs;
- foreach my $id (sort keys %$seqs) {
- $new_seqs{$id} = reverse($$seqs{$id});
- }
- return wantarray ? %new_seqs : \%new_seqs;
- }
- sub remove_gaps {
- my ($seqs) = @_;
- my %new_seqs;
- foreach my $id (sort keys %$seqs) {
- (my $ungapped = $$seqs{$id}) =~ s/[\.\-\s]//gms;
- $new_seqs{$id} = $ungapped;
- }
- return wantarray ? %new_seqs : \%new_seqs;
- }
- sub excise_columns {
- my ($seqs, $order, $regexes) = @_;
-
- unless ($order) {
- my @ids = keys(%$seqs);
- for (my $i=0; $i<scalar(@ids); $i++) {
- $$order{$ids[$i]} = $i+1;
- }
- }
- # Default is to remove gaps.
- $regexes = ["[\.\-]+"] unless $regexes;
-
- my @columns = alignment_columns($seqs, $order);
-
- my @columns_excised;
- foreach my $seq (@columns) {
- my $ok = 1;
- foreach my $regex (@$regexes) {
- if ($seq =~ /^$regex$/) {
- $ok = 0;
- last;
- }
- }
- push @columns_excised, $seq if $ok;
- }
-
- my %seqs_excised = columns_to_alignment(\@columns_excised, $order);
-
- return wantarray ? %seqs_excised : \%seqs_excised;
- }
- # Remove taxa from an alignment, and then excise any columns of gaps.
- sub prune_alignment {
- my ($seqs, $taxa) = @_;
- my %pruned = %$seqs;
-
- foreach my $taxon (@$taxa) {
- delete($pruned{$taxon});
- }
- my %final = excise_columns(\%pruned);
-
- return wantarray ? %final : \%final;
- }
- ################################################################################
- ################################################################################
- # SEQUENCE STATISTICS
- ################################################################################
- # Calculate single, di-, or tri- nucleotide frequencies of a sequence.
- sub nucleotide_freqs {
- my ($sequence, $k_let, $merge, $return_counts) = @_;
- $k_let = 1 unless $k_let;
- $merge = 0 unless $merge;
- $return_counts = 0 unless $return_counts;
- my (%counts, $total, %freqs);
-
- # Remove any whitespace
- $sequence =~ s/\s//g;
-
- if ($k_let == 1) {
- # Single nucleotide counting can be done fairly simply
- $total = length($sequence);
- while (length($sequence) > 0) {
- $sequence =~ s/^(.)//;
- $counts{$1}++;
- }
- } else {
- # If a possible k-let doesn't occur in a particular sequence, it won't
- # appear in the results - it's useful to have all possible k-lets in
- # the results hash, so we initialise them first.
- my %chars = ();
- foreach my $char (split //, $sequence) {
- $chars{$char}++;
- }
- foreach my $char (keys %chars) {
- $counts{$char} = 0;
- }
- for (my $i=1; $i<$k_let; $i++) {
- foreach my $word (keys %counts) {
- foreach my $char (keys %chars) {
- $counts{"$char$word"} = 0;
- $counts{"$word$char"} = 0;
- }
- delete($counts{$word});
- }
- }
- # Move a window of size k across the sequence, one base at a time
- my $k_let_minus_1 = $k_let - 1;
- $total = length($sequence) - $k_let_minus_1;
- while (length($sequence) >= $k_let) {
- $sequence =~ s/^(.(.{$k_let_minus_1}))/$2/;
- if ($merge) {
- # Count the sequences regardless of direction,
- # e.g. AC and CA are treated as the same
- my $sub_seq = $1;
- if (substr($sub_seq, 0, 1) gt substr($sub_seq, -1, 1)) {
- $sub_seq = reverse $sub_seq;
- }
- $counts{$sub_seq}++;
- } else {
- $counts{$1}++;
- }
- }
- }
-
- # So far, just have counts, which might be wanted; if not,
- # divide by the total to get frequencies.
- if ($return_counts) {
- return wantarray ? %counts : \%counts;
- } else {
- foreach my $sub_seq (keys %counts) {
- $freqs{$sub_seq} = sprintf("%.6f", $counts{$sub_seq}/$total);
- }
- return wantarray ? %freqs : \%freqs;
- }
- }
- # Calculate combined nucleotide frequencies of a set of sequences.
- sub nucleotide_freqs_multiple {
- my ($sequences, $k_let, $merge) = @_;
- $k_let = 1 unless $k_let;
-
- my (%all_counts, $total, %freqs);
-
- foreach my $sequence (@$sequences) {
- $sequence =~ s/[\.\-\s]//gms;
- my %counts = nucleotide_freqs($sequence, $k_let, $merge, 1);
- foreach my $nuc (keys %counts) {
- $all_counts{$nuc} += $counts{$nuc};
- }
- $total += length($sequence)-$k_let+1;
- }
-
- foreach my $nuc (keys %all_counts) {
- $freqs{$nuc} = sprintf("%.6f", $all_counts{$nuc}/$total);
- }
-
- return wantarray ? %freqs : \%freqs;
- }
- sub gc_content {
- my ($sequence) = @_;
- $sequence =~ s/[\.\-\s]//gms;
- my $gc = () = $sequence =~ /[cCgG]/g;
- return sprintf("%.6f", $gc/length($sequence));
- }
- sub gc_content_multiple {
- my ($sequences) = @_;
- my @sequences = @$sequences;
- my ($gc, $length);
- foreach my $sequence (@sequences) {
- $sequence =~ s/[\.\-\s]//gms;
- $gc += () = $sequence =~ /[cCgG]/g;
- $length += length($sequence);
- }
- return sprintf("%.6f", $gc/$length);
- }
- sub at_content {
- my ($sequence) = @_;
- $sequence =~ s/[\.\-\s]//gms;
- my $at = () = $sequence =~ /[aAtTuU]/g;
- return sprintf("%.6f", $at/length($sequence));
- }
- sub at_content_multiple {
- my ($sequences) = @_;
- my @sequences = @$sequences;
- my ($at, $length);
- foreach my $sequence (@sequences) {
- $sequence =~ s/[\.\-\s]//gms;
- $at += () = $sequence =~ /[aAtTuU]/g;
- $length += length($sequence);
- }
- return sprintf("%.6f", $at/$length);
- }
- # Calculate pairwise identity of an alignment.
- sub pairwise_identity {
- my ($sequences, $calc_type, $ignore_case) = @_;
- $calc_type = "include_gaps" unless $calc_type;
- $ignore_case = 1 unless defined $ignore_case;
- my $seq_count = scalar(@$sequences);
- my $seq_length = length($$sequences[0]);
- my ($shortest_length, $total_length);
-
- # Count the occurence of each character, at each position.
- my %position;
- foreach my $seq (@$sequences) {
- my $length = () = $seq =~ /[^\.\-]/g;
- $shortest_length = $length if !defined($shortest_length) || $length < $shortest_length;
- $total_length += $length;
-
- my @sequence = split(//, $seq);
- for (my $i=0; $i<length($seq); $i++) {
- if ($ignore_case) {
- $position{$i}{uc($sequence[$i])}++;
- } else {
- $position{$i}{$sequence[$i]}++;
- }
- }
- }
-
- my $matches = 0;
- my $gaps = 0;
- foreach my $pos (sort {$a <=> $b} keys %position) {
- foreach my $char (sort keys %{$position{$pos}}) {
- if ($position{$pos}{$char} > 1) {
- if ($calc_type eq "include_gaps") {
- $matches += binomial_coeff($position{$pos}{$char}, 2);
- } else {
- if ($char =~ /[\.\-]/) {
- $gaps += binomial_coeff($position{$pos}{$char}, 2);
- } else {
- $matches += binomial_coeff($position{$pos}{$char}, 2);
- }
- }
- }
- }
- }
-
- my $permutations = binomial_coeff($seq_count, 2);
- my $denominator;
- if ($calc_type eq "include_gaps") {
- $denominator = $permutations*$seq_length;
- } elsif ($calc_type eq "exclude_gaps") {
- $denominator = ($permutations*$seq_length)-$gaps;
- } elsif ($calc_type eq "shortest") {
- # Not entirely sure that this one makes sense for MSAs?
- $denominator = $permutations*$shortest_length;
- } elsif ($calc_type eq "mean_length") {
- $denominator = $permutation…
Large files files are truncated, but you can click here to view the full file