Sequence.pm - Only need this module if we use the embl

/lib/MonkeyShines/Sequence.pm

https://bitbucket.org/james_monkeyshines/monkeyshines · Perl · 2315 lines · 1476 code · 222 blank · 617 comment · 201 complexity · 09dba631b6a905d5f035d49393157456 MD5 · raw file
Large files are truncated click here to view the full file

package MonkeyShines::Sequence;

use warnings;
use strict;
use Carp;

our $VERSION = '0.1';

use File::Path qw(make_path);
use FindBin qw($Bin);

use MonkeyShines::Utils qw (
	timestamp_id
	read_from_file
	read_from_delim
	save_to_file
	save_to_delim
	mean
	median_hash
	binomial_coeff
	random_select
	random_select_and_remove
	list_exists
	flip_hash
	execute_check
);

# Only need this module if we use the embl_fetch function, which is
# currently mothballed at the bottom of this module...
#use LWP::Simple;

use Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(
	download_ucsc maf_index maf_blocks maf_blocks_fill phastcons phastcons_scores
	sequence_portion sequence_flanking flanking_lengths
	find_fasta_sequence match_fasta_sequences
	from_x_to_y from_clustal from_fasta from_maf from_maf_crop_id from_paml
	from_phylip_i from_phylip_s from_phylip
	to_clustal to_fasta to_maf to_paml to_phylip_i to_phylip_s to_phylip
	complement complement_rev
	reverse_seqs remove_gaps excise_columns prune_alignment
	nucleotide_freqs nucleotide_freqs_multiple
	gc_content gc_content_multiple at_content at_content_multiple
	pairwise_identity pairwise_identity_site all_pairwise_identities median_pairwise_identity
	consensus_sequence
	alignment_islands
	alignment_columns columns_to_alignment
	shuffle_sequence randomise_alignment shuffle_alignment random_structure
	clustalw mafft mafft_fa blastn
	blat_server_start blat_server_stop blat blat_top_hit blat_top_hits fetch_dna
	parse_structure sci
);

# This module contains wrapper scripts that execute various bits of software.
# If these are in the user's execution path, the global variable $bin_dir
# can simply be an empty string; otherwise it needs to be the path to the
# directory that contains the executables (or that contains symbolic links to
# the executables).
my $bin_dir = "";

# Sometimes scripts from other software packages need to be executed; this
# directory defines their location.
my $resources_dir = "$Bin/../../resources";

=pod

=head1 Name

MonkeyShines::Sequence - Sequence processing.

=head1 Description

MonkeyShines::Sequence is a collection of functions for sequence processing,
including alignment, format conversion, and sequence statistics.

This module contains wrapper scripts that execute various bits of software.
Generally, these rely on the software being in the user's execution path;
an alternative is to set the global variable $bin_dir in this module to the
path to the directory that contains the executables (or that contains
symbolic links to the executables).

List of software (executables) used by functions in this module:
BLAT (blat faToTwoBit gfClient, gfServer)
bx-python
ClustalW (clustalw)
Galaxy
MAFFT (mafft, ginsi, linsi)
SISSIz (sissiz)
uShuffle (ushuffle)

=head1 Functions

=head2 Sequence Retrieval and Parsing

=over

=item B<C<undef> = download_ucsc($type, $data_dir[, $species][, $chrs][, $x_way][, $ftp_url][, $build][, $unzip])>

Retrieve data from UCSC. The type of data is specified by $type, which must
be either 'fasta', 'maf', or 'phastcons'. The defaults are set up to get human
and vertebrate datafrom the MultiZ 46-way alignment: $species = 'Homo_sapiens';
$x_way = '46way'. For other species, check the UCSC ftp site, to get these
values. $build defaults to the current build for the reference species $species.

To retrieve data for specific chromosomes, provide an arrayref, $chrs (useful
for avoiding data from undetermined locations). Sometimes these files are
enormous, so I wouldn't recommend unzipping them until you need them, unless
you have memory and disk space to burn. All of the files are stored in
$data_dir/$type, which is created if it doesn't exist. The program checks there
before downloading, to see if newer copies of the file are available, and only
downloads if necessary. The formatting of phastCons filenames is entirely
inconsistent, so symbolic links are created with the format, $chr.wig.gz.

=item B<C<undef> = maf_index($maf_dir)>

For each maf file in $maf_dir, create an index file with a script from
the bx-python project (./scripts/bx-python/maf_build_index.py). This is a
time- and memory-intensive process, but is essential for efficiently
executing subsequent functions. If the maf files are gzipped, then they
will be unzipped, processed, then rezipped, one at time.

Obviously you need to have installed bx-python, which is fairly
straightforward: http://bitbucket.org/james_taylor/bx-python/wiki/HowToInstall
While you're at it, you might just want to install Galaxy, as it is needed for
other processing tasks: http://bitbucket.org/galaxy/galaxy-central/wiki/GetGalaxy

=item B<$sub_seq = sequence_portion($seq, $start, $stop)>

Return the sequence between $start and $stop for a given sequence $seq.

=item B<($flanking_l, $flanking_r) = sequence_flanking($seq, $sub_seq[, $left][, $right][, $pad])>

Return the flanking sequences on each side of $sub_seq for a given
sequence $seq. The default amount of flanking sequence returned is
the length of $sub_seq, on either side. To retrieve a specific amount of
flanking, use integer values for $left and $right; $right is set to the
same value as $left if only $left is given. If $pad is true then the
returned flanking regions will be padded with gaps at either end, so
that each is the specified length (this is not done by default). An error
will occur if $sub_seq does not appear in $seq, or if it appears multiple
times.

=item B<$seq = find_fasta_sequence($fasta_string, $id)>

Get a single sequence from fasta-formatted data, whose ID matches $id.
The returned sequence $seq retains the formatting, ie newlines.

=item B<[$%]seqs = match_fasta_sequences($fasta_string, $match)>

Get one or more sequences from fasta-formatted data, where the IDs match $match.
The returned sequences retain their formatting, ie newlines.

=back

=head2 Format Conversion

=over

=item B<$out = from_x_to_y($from, $to, $in[, $out_file])>

Convert an alignment, $in, from one format to another, $out.
$from and $to can take the following values: "clustal", "fasta", "maf",
"paml". Some of these formats are covered by the 'readseq'
software, but are duplicated here for ease of use. If $out_file is
given, then $out is saved there.

=item B<($seqs, $order) = from_<format>($in)>

<format> can be one of "clustal", "fasta", "maf", "paml",
e.g. from_clustal($in). Convert $in, an alignment in that format, to
a hashref, $seqs, with ids and sequences as keys and values, respectively.
Since the order of sequences in a file may be important, $order is a
hashref with ids as keys and incremental numbers as values.

=item B<($out) = to_<format>($seqs[, $order])>

<format> can be one of "clustal", "fasta", "maf", "paml",
e.g. to_clustal($seqs, $order). Save a hashref, $seqs with ids and
sequences as keys and values, respectively, in the specified format.
$order is a hashref with ids as keys and integer values that indicate
the order of the sequences in $out.

=item B<C<undef> = maf_blocks($bed_file, $maf_dir, $chr[, $species][, $out_file])>

Retrieve the MAF blocks for the interval(s) specified in $bed_file, using
the files in $maf_dir; the appropriate files are chosen according to the
chromosome, given by $chr. Indexes are required, for tolerable speed of
execution (see the maf_index function), and these will be created if they do
not exist. The appropirate MAF file will be unzipped then rezipped, if
necessary. If this function will be called repeatedly, it's wisest to unzip
the MAF file before executing the loop that does the repeated calling, as
this'll save a load of time. The maf data is saved to $outfile, which
defaults to $bed_file with the file type changed from 'bed' to 'maf'.

The function uses the Galaxy script ./scripts/galaxy/maf/interval2maf.py, so
Galaxy needs to be installed: http://bitbucket.org/galaxy/galaxy-central/wiki/GetGalaxy

=back

=head2 Sequence Manipulation

=over

=item B<$comp_seq = complement($seq[, $rna])>

Generate the complement of a sequence; set $rna to true for RNA sequence.

=item B<$comp_rev_seq = complement_rev($seq[, $rna])>

Generate the reverse complement of a sequence; set $rna to true for RNA
sequence.

=item B<%new_seqs | $new_seqs = reverse_seqs($seqs)>

Reverse every sequence in the hashref $seqs (handy for HoT calculations).

=item B<%new_seqs | $new_seqs = remove_gaps($seqs)>

Remove gaps from sequences stored in the hashref $seqs. Gaps are defined here
as dots, dashes, and whitespace characters.

=item B<%new_seqs | $new_seqs = excise_columns($seqs[, $order][, $regexes])>

Remove columns that have particular properties, e.g. those that consist
entirely of the same character, or of gaps and a single character
("[\.\-]*[a-zA-Z][\.\-]*"). It can be tricky to write a single regex to
cover all cases, so the function accepts an arrayref of regexes. The
default is to remove columns which are entirely gaps. The alignment is
specified by the hashref $seqs, with IDs as keys and sequences as values.
If the order of the sequences is important (e.g. a regex relies on a
reference sequence being the first row), $order can be used to provide
a hashref with the same keys as $seqs, and numeric values that give the
ordering. This function is useful when rows have been removed from a multiple
alignment, leaving some columns that consist entirely of gaps.

=item B<%new_seqs | $new_seqs = prune_alignment($seqs, $taxa)>

From a hashref $seqs, with IDs as keys and sequences as values, remove
the sequences with IDs in the arrayref $taxa, then use 'excise_columns'
to remove any columns which consist entirely of gaps.

=back

=head2 Sequence Statistics

=over

=item B<%freqs | $freqs = nucleotide_freqs($sequence[, $k_let][, $merge][, $return_counts])>

Calculate the mono-, di-, or tri- nucleotide frequencies of $sequence, by
setting $k_let = 1 (default), 2, or 3, respectively. For $k_let = 2 or 3,
if $merge is true then palindromes are considered to be equivalent, and
the counts are combined; e.g. the totals for AC and CA are added. A hash,
or hashref, is returned with the mono-, di-, or tri- nucleotides as keys;
and the frequencies (to 6d.p.) as values, unless $return_counts is true,
in which case the integer counts are the values. (In fact, $k_let can be
any integer value, so this function could be used to detect 'words' of
any length in sequence data.)

=item B<%freqs | $freqs = nucleotide_freqs_multiple($sequences[, $k_let][, $merge])>

Calculate the mono-, di-, or tri- nucleotide frequencies of an array of
sequences, $sequences, by using the 'nucleotide_freqs' function. Any gap
characters will be removed from the sequences before calculations are done.

=item B<$freq = gc_content($sequence)>
=item B<$freq = at_content($sequence)>

Calculate the GC or AT(U) content of a sequence, to 6d.p.

=item B<$freq = gc_content_multiple($sequences)>
=item B<$freq = at_content_multiple($sequences)>

Calculate the GC or AT(U) content of an arrayref of sequences, to 6d.p.
Gap characters are removed before doing the calculation.

=item B<$ident = pairwise_identity($sequences[, $calc_type][, $ignore_case])>

Calculate the pairwise identity of an alignment, given by the arrayref
$sequences; if more than two sequences are given, the mean pairwise identity
is calculated. There are a number of ways to define pairwise
identity, each involving slightly different calculations; options for
$calc_type are 'include_gaps' (default), 'exclude_gaps', 'shortest', and
'mean_length' (cf May (2004) Structure 12:737-8). The (mean) pairwise
identity, $ident, is returned to 6d.p. By default, the function treats
lower and upper case verions of the alignment characters as the same;
set $ignore_case to false to treat them differently.

=item B<$ident = pairwise_identity_site($column[, $calc_type][, $ignore_case])>

More or less the same as the pairwise_identity function, except the
calculations are done for a string values, $column, that represents a column
in an alignment.

=item B<($median_id, $median_value) | ($pairwise_identities | %pairwise_identities) =
	all_pairwise_identities($seq, $seqs[, $calc_type][, $median])>

For a reference sequence $seq, calculate the pairwise identity with a bunch
of other sequences in turn, then either return all of the values
($|%pairwise_identities) or pick the one with the median value and return
both the ID of the chosen sequence and it's value ($median_id, $median_value).
The former is the default, the latter is done if $median is true. See the
pairwise_identity function for more details on $calc_type.

=item B<$consensus_sequence = consensus_sequence($sequences[, $threshold][, $tie_breaks])>

There are several ways to get a consensus sequence. At the simplest end
of the spectrum, the frequency of each character in each column is counted,
and the highest value chosen (equivalent to setting $threshold = 0, the
default value). If multiple characters have equally high frequencies, choose
one at random ($tie_breaks = 'Random', the default).

If $threshold is greater than 0, then only characters with frequencies
greater than that are considered. [Not implemented properly yet.]

The $tie_breaks parameter determines the course of action when characters
appear equally frequently: 'Random' just selects one at random; 'Nonly'
and 'Xonly' insert 'N' or 'X', respectively; 'RYN' replaces the character
with 'R' or 'Y' if both characters are purine or pyrimidine, and 'N'
otherwise; and 'All' places all possible nucleotides in square brackets,
such that the returned consensus sequence is effectively a regex.

=item B<@results | $results = phastcons($bed_file, $phastcons_dir, $chr)>

Calculate mean, min and max scores, for an interval file, $bed_file. Requires
the phastCons wigFix files to have been downloaded (to $phastcons_dir), and
relies on consistent filenaming, as implemented by the 'download_ucsc'
function. This function will create index files if they don't already exist;
the original Galaxy file has been augmented to do this automagically
(./scripts/galaxy/stats/aggregate_scores_in_intervals.py).

Each array element in the results array/arrayref is a tab-delimited line from
the bed file, with mean, min, and max columns tacked on the end, in that order.

=item B<@results | $results = phastcons_scores($bed_file, $phastcons_dir, $chr, $fatal)>

Get scores for each position, for an interval (.bed) file. Requires the
phastCons index (.ba) files (which will automatically have been created by
the 'phastcons' function). Uses the Galaxy script
./scripts/galaxy/stats/scores_in_intervals.py; if this script fails then
the execution will be halted; set $fatal to zero to just display a warning.

The return value is an arrary or arrayref where each element is a tab-delimited
string of three variables, chr, position, score.

=item B<@columns | $columns = alignment_columns($seqs, $order)>

For a hashref of sequences $seqs, and a hashref with the same keys that
specifies the ordering of the sequences, extract the columns into an
array/arrayref.

=item B<%seqs | $seqs = columns_to_alignment($columns, $order)>

The inverse of the alignment_columns function, to translate an arrayref
of columns to a hashref of sequences, indexed with the keys of $order.

=back

=head2 Shuffle Sequences

=over

=item B<$shuffled = shuffle_sequence($sequence[, $k_let])>

Shuffle a single sequence whilst retaining nucleotide composition, where
$k_let is the number of consecutive nucleotides to consider (default is 1).
This a simple wrapper to ushuffle, which needs to be installed locally.

=item B<$randomised = randomise_alignment($aln_file[, $method][, $options][, $out_file][, $fatal])>

Randomise an alignment with either MultiPerm (the default for $method)
or SISSIz, preserving (approximately) dinucleotide content. Additional
arguments can be provided with the $options parameter - see the
MultiPerm/SISSIz documentation for possible values and formatting.
The randomised alignment is returned as a single string, and is saved
in $out_file, if provided. If $fatal is true (the default) then any errors
will halt execution, rather than just causing a warning.
MultiPerm and SISSIz do much the same thing, in two different ways; SISSIz's
approach is arguably better, but it takes roughly 10 times longer to execute,
and the resulting randomised alignments are difficult to tell apart.

=item B<$shuffled_seqs = shuffle_alignment($seqs, $order[, $no_replace])>

Shuffle the columns of an alignment, given by hashrefs $seqs and $order,
by selecting columns at random either with replacement (the default) or
without ($no_replace set to true). Very simple, makes no attempt to preserve
local conservation or gap structure.

=back

=head2 Align Sequences

=head3 ClustalW

=over

=item B<$out_file = clustalw($fasta_file[, $out_file][, $options])>

Align the sequences in $fasta_file with ClustalW (which must be locally
installed). If $out_file is not provided, the filetype of
$fasta_file is replaced with 'aln' and the output is saved in that
location. Command line options to ClustalW can be provided ($options);
see the ClustalW documentation for options and formatting details.

=back

=head3 MAFFT

=over

=item B<$out_file = mafft($fasta_file[, $out_file][, $local_or_global])>

Align the sequences in $fasta_file with MAFFT (which must be locally
installed). If $out_file is not provided, the filetype of
$fasta_file is replaced with 'aln' and the output is saved in that
location. MAFFT can be used in "local" or "global" mode by specifying
the appropriate string as the $local_or_global parameter; the default
is to allow MAFFT to do what it thinks is best. If $local_or_global is
specified, then the 'linsi' or 'ginsi' executables are required, rather
than 'mafft'.

=back

=head3 BLASTN

=over

=item B<%results | $results = blastn($sequence[, $db][, $entrez_query][, $options][, $out_file])>

Execute BLAST in nucleotide mode for $sequence, against database $db, which is
'nt' by default. A local BLAST installation is used if possible, but if an
Entrez query is given, a remote installation is used. A useful value for
$entrez_query is 'NOT(srcdb refseq model[PROP]', which will ignore hypothetical
proteins. Additional options can be passed via $options - see the BLAST docs
for details. If $out_file is given, then the results are saved. The results
hash that is returned is a hash of hashes; the first key is the sequence ID
from BLAST (ie giXXX), and the next level of keys are 'start', 'end', 'evalue',
'bitscore', and 'pident'.

=back

=head3 BLAT

=over

=item B<C<undef> = blat_server_start($fasta_dir[, $host][, $port])>

Start a local BLAT server, for the files in $fasta_dir. By default,
$host and $port are 'localhost' and 3309, respectively. In order to
speed up repeated starting of the BLAT server, the fasta files are
automatically converted to 'two_bit' format, and the data is cached
in an appropriately named directory, alongside $fasta_dir.

=item B<C<undef> = blat_server_stop([$host][, $port])>

Stop a local BLAT server. By default, $host and $port are 'localhost'
and 3309, respectively.

=item B<@results | $results = blat($seq_file, $out_file[, $host][, $port][, $verbose])>

Query a local BLAT server. By default, $host and $port are 'localhost'
and 3309, respectively. $seq_file should contain a sequence in fasta
format; the results are saved in $out_file. In addition to the BLAT
output, a score value is calculated and appended to each row; the rows
are then sorted on this score value, with the highest values at the top.
The array of arrays containing the output is returned.

=item B<($chr, $start, $end, $q_start, $q_end, $strand, $blocks) = blat_top_hit($results)>

Extract useful information for the top-scoring result from a set of BLAT
results, as returned by the 'blat' function.

=back

=head1 Author

James Allen
(james@monkeyshines.co.uk)

=head1 Copyright

Copyright 2009-2011 James Allen; University of Manchester

=head1 Licence

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

=cut

# SVN Date: $Date: 2012-07-24 15:14:56 +0100 (Tue, 24 Jul 2012) $
# SVN ID: $Id: Sequence.pm 927 2012-07-24 14:14:56Z Julio $

################################################################################
# SEQUENCE RETRIEVAL AND PARSING
################################################################################
sub download_ucsc {
	my ($type, $data_dir, $species, $chrs, $x_way, $ftp_url, $build, $unzip) = @_;
	croak "Type ('fasta', 'maf', or 'phastcons') must be specified" unless $type;
	croak "Data directory must be specified" unless $data_dir;
	$species = "Homo_sapiens" unless $species;
	$species =~ s/ /_/g;
	$species = ucfirst(lc($species));
	$x_way = "46way" unless $x_way;
	$ftp_url = "ftp://hgdownload.cse.ucsc.edu/goldenPath" unless $ftp_url;
	$build = "currentGenomes/$species" unless $build;
	$unzip = 0 unless $unzip;
	
	my $ucsc_data_dir = "$data_dir/$type";
	make_path($ucsc_data_dir) unless -e $ucsc_data_dir;
	my @ucsc_data_dirs = ($ucsc_data_dir);
	if ($type eq "phastcons") {
		if ($x_way eq "46way") {
			make_path("$ucsc_data_dir/vertebrata") unless -e "$ucsc_data_dir/vertebrata";
			make_path("$ucsc_data_dir/placentalia") unless -e "$ucsc_data_dir/placentalia";
			make_path("$ucsc_data_dir/primates") unless -e "$ucsc_data_dir/primates";
			@ucsc_data_dirs = ("$ucsc_data_dir/vertebrata", "$ucsc_data_dir/placentalia", "$ucsc_data_dir/primates");
		}
	}
	
	if ($chrs) {
		foreach my $chr (@$chrs) {
			if ($type eq "fasta") {
				`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/chromosomes/chr$chr.fa.gz`;
			} elsif ($type eq "maf") {
				# Annoyingly, UCSC isn't consistent about where it puts maf
				# files, so we check both possible locations, but will only
				# get data from one, for a given species.
				`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/maf/chr$chr.maf.gz`;
				`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/chr$chr.maf.gz`;
			} elsif ($type eq "phastcons") {
				if ($x_way eq "46way") {
					`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/placentalia $ftp_url/$build/phastCons$x_way/placentalMammals/chr$chr*.gz`;
					`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/primates $ftp_url/$build/phastCons$x_way/primates/chr$chr*.gz`;
					`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/vertebrata $ftp_url/$build/phastCons$x_way/vertebrate/chr$chr*.gz`;
				} else {
					`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*chr$chr*.gz`;
				}
			}
		}
	} else {
		if ($type eq "fasta") {
			`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/chromosomes/chr*.fa.gz`;
		} elsif ($type eq "maf") {
			`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/maf/chr*.maf.gz`;
			`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/chr*.maf.gz`;
		} elsif ($type eq "phastcons") {
			if ($x_way eq "46way") {
				`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/placentalia $ftp_url/$build/phastCons$x_way/placentalMammals/chr*.gz`;
				`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/primates $ftp_url/$build/phastCons$x_way/primates/chr*.gz`;
				`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/vertebrata $ftp_url/$build/phastCons$x_way/vertebrate/chr*.gz`;
			} else {
				`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*chr*.gz`;
			}
		}
	}
	
	# Always handy to have the tree(s) downloaded too.
	if ($type eq "maf") {
		`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/*.nh`;
	} elsif ($type eq "phastcons") {
		`wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*.mod`;
	}
	
	# The formatting of the phastCons filenames is a dog's dinner - it's
	# entirely inconsistent, so we create links to make them easier to
	# process at a later stage - we don't want to rename them, because
	# then wget will not know that the data already exists.
	if ($type eq "phastcons") {
		foreach my $dir (@ucsc_data_dirs) {
			opendir(DIR, $dir);
			foreach my $file (grep { /\.gz$/ } readdir(DIR)) {
				my ($chr) = $file =~ /(chr\w+)/;
				`ln -s $dir/$file $dir/$chr.wig.gz`;
			}
			closedir(DIR);
		}
	}
	
	if ($unzip) {
		foreach my $dir (@ucsc_data_dirs) {
			opendir(DIR, $dir);
			foreach my $file (grep { /\.gz$/ } readdir(DIR)) {
				(my $unzipped = $file) =~ s/\.gz$//;
				`gunzip -c $dir/$file > $dir/$unzipped`;
			}
			closedir(DIR);
		}
	}
	
	return;
}

sub maf_index {
	my ($maf_dir) = @_;
	croak "MAF directory must be specified" unless $maf_dir;
	
	# This function relies on bx-python being installed.
	my $index_script = "$resources_dir/scripts/bx-python/maf_build_index.py";
	
	opendir(DIR, $maf_dir);
	foreach my $maf_file (grep { /\.maf$/ } readdir(DIR)) {
		unless (-e "$maf_dir/$maf_file.index") {
			execute_check("$index_script $maf_dir/$maf_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
		}
	}
	closedir(DIR);
	opendir(DIR, $maf_dir);
	foreach my $maf_file (grep { /\.maf.gz$/ } readdir(DIR)) {
		(my $index_file = $maf_file) =~ s/\.gz$/\.index/;
		unless (-e "$maf_dir/$index_file") {
			`gunzip -c $maf_dir/$maf_file > $maf_dir/unzipped.tmp`;
			execute_check("$index_script $maf_dir/unzipped.tmp $maf_dir/$index_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
			`rm $maf_dir/unzipped.tmp`;
		}
	}
	closedir(DIR);
	
	return;
}

sub sequence_portion {
	my ($seq, $start, $stop) = @_;
	return substr($seq, $start-1, $stop-$start);
}

sub sequence_flanking {
	my ($seq, $sub_seq, $left, $right, $pad) = @_;
	
	# The default for the flanking region is the length of the sub-sequence,
	# on both sides; if a single length is provided, then it is applied to
	# both sides
	$left = length($sub_seq) unless $left;
	$right = $left unless $right;
	
	# By default, do not pad the flanking regions with
	# gaps if there is insufficient sequence.
	$pad = 0 unless $pad;
	
	# Check how many times the sub-sequence occurs in the sequence
	my $occurrence = () = $seq =~ /($sub_seq)/gi;
	croak "Sub-sequence does not occur within sequence" unless $occurrence;
	croak "Multiple occurrences of sub-sequence in sequence" if $occurrence > 1;
	
	my ($flanking_l, $flanking_r) = $seq =~ 
		/([\w\-]{0,$left})$sub_seq([\w\-]{0,$right})/i;
	
	if ($pad) {
		if (length($flanking_l) < $left) {
			$flanking_l = "-"x($left-length($flanking_l)).$flanking_l;
		}
		if (length($flanking_r) < $right) {
			$flanking_r = $flanking_r."-"x($right-length($flanking_r));
		}
	}
	
	return ($flanking_l, $flanking_r);
}

sub flanking_lengths {
	my ($subseq, $aligned) = @_;
	
	$aligned =~ s/\./\-/g;
	my @aligned = split(//, $aligned);
	(my $ungapped = $aligned) =~ s/\-//g;
	my $occurences = () = $ungapped =~ /$subseq/g;
	if (!defined($occurences) || $occurences == 0) {
		carp "  Couldn't find subsequence ($subseq) in sequence ($ungapped).\n";
	} elsif ($occurences > 1) {
		carp "  Multiple occurences of subsequence ($subseq) in alignment ($ungapped).\n";
	}
	my $upstream_bases = index($ungapped, $subseq);
	my $upstream_and_subseq_bases = $upstream_bases + length($subseq);
	
	my $bases = 0;
	my ($upstream_length, $subseq_length, $downstream_length) = (0, 0, 0);
	foreach my $position (0..$#aligned) {
		$bases++ if $aligned[$position] ne '-';
		
		if ($bases <= $upstream_bases) {
			$upstream_length++;
		} elsif ($bases < $upstream_and_subseq_bases) {
			$subseq_length++;
		} elsif ($bases == $upstream_and_subseq_bases) {
			$subseq_length++;
			last;
		}
	}
	$downstream_length = length($aligned) - $upstream_length - $subseq_length;
	
	return ($upstream_length, $subseq_length, $downstream_length);
}

# Get a single sequence from fasta-formatted data, for a given ID.
sub find_fasta_sequence {
	my ($fasta_string, $id) = @_;
	
	# The empty parentheses force a count, rather than a true/false result
	my $occurrence = () = $fasta_string =~ /^>$id\s/gms;
	croak "The id '$id' does not occur in the fasta data" unless $occurrence;
	croak "Multiple occurrences of '$id' in the fasta data" if $occurrence > 1;
	
	my ($seq) = $fasta_string =~ /^>$id[^\n]*\n([^>]*)/ms;
	$seq =~ s/\s+//gms;
	return $seq;
}

# Get all sequences from fasta-formatted data that match part of the ID line.
sub match_fasta_sequences {
	my ($fasta_string, $match) = @_;
	
	# If $match is not given, parse the whole file.
	$match = "" unless $match;
	
	my %seqs = $fasta_string =~ /^>([^\n]*$match\S*)[^\n]*\n([^>]*)/gms;
	foreach my $id (keys %seqs) {
		$seqs{$id} =~ s/\s+//gms;
	}
	return wantarray ? %seqs : \%seqs;
}
################################################################################

################################################################################
# FORMAT CONVERSION
################################################################################
sub from_clustal {
	my ($in) = @_;
	my (%seqs, %order);
	my $counter = 0;
	my @lines = split(/\n/, $in);
	foreach my $line (@lines) {
		if ($line =~ /^(\S+)\s+(\S+)$/) {
			$seqs{$1} .= $2;
			unless (exists($order{$1})) {
				$order{$1} = ++$counter;
			}
		}
	}
	return (\%seqs, \%order);
}

sub from_fasta {
	my ($in) = @_;
	my (%seqs, %order);
	%seqs = $in =~ /^>(\S+)[^\n]*\n([^>]*)/gms;
	foreach my $id (keys %seqs) {
		$seqs{$id} =~ s/[\n\r ]//gms;
	}
	my @ids = $in =~ /^>(\S+)/gms;
	for (my $i=0; $i<scalar(@ids); $i++) {
		$order{$ids[$i]} = $i+1;
	}
	return (\%seqs, \%order);
}

sub from_maf {
	my ($in) = @_;
	my (%blocks, %seqs, %order);
	
	# Ensure that we don't miss the last block if there aren't enough newlines.
	$in .= "\n\n";
	my @blocks = $in =~ /(^a.+?)\n\n/gms;
	
	my $counter = 0;
	foreach my $block (@blocks) {
		my ($start, $seq) = $block =~ /^s\s+\S+\s+(\d+)[^\n]+\s+(\S+)\s*$/ms;
		my @ids = $block =~ /^s\s+(\S+)/gms;
		my %block_seqs = $block =~ /^s\s+(\S+)[^\n]+\s+(\S+)\s*$/gms;
		$blocks{$start}{'seqs'} = \%block_seqs;
		$blocks{$start}{'size'} = length($seq);
		
		foreach my $id (@ids) {
			unless (exists($order{$id})) {
				$order{$id} = $counter++;
			}
		}
	}
	
	foreach my $start (sort keys %blocks) {
		foreach my $id (sort {$order{$a} <=> $order{$b}} keys %order) {
			if (exists($blocks{$start}{'seqs'}{$id})) {
				$seqs{$id} .= $blocks{$start}{'seqs'}{$id};
			} else {
				$seqs{$id} .= "-" x $blocks{$start}{'size'};
			}
		}
	}
	
	return (\%seqs, \%order);
}

# The from_maf function works if the species in the alignment are fully
# assembled on chromosomes, but can run into trouble with multiple contigs
# for one or more species. We make the assumption that the ids are of the
# format 'species.contig', and remove the contig information.
sub from_maf_crop_id {
	my ($in) = @_;
	$in =~ s/^(s\s+\w+)\S+/$1/gm;
	return from_maf($in);
}

sub from_paml {
	my ($in) = @_;
	my (%seqs, %order);
	my $counter = 0;
	my @lines = split(/\n/, $in);
	foreach my $line (@lines) {
		if ($line =~ /^(\S+)\s+(\S+)$/) {
			$seqs{$1} .= $2;
			unless (exists($order{$1})) {
				$order{$1} = ++$counter;
			}
		}
	}
	return (\%seqs, \%order);
}

sub from_phylip_i {
	my ($data, $relaxed) = @_;
	$relaxed = 1 unless defined $relaxed;
	my (%seqs, %order, @ids);
	my $counter = 0;
	
	my @lines = split(/\n[\n \t]*/, $data);
	my $info = shift @lines;
	my ($count, $length) = $info =~ /(\d+)\s+(\d+)/;
	
	foreach my $i (1..$count) {
		my $line = shift @lines;
		my ($id, $seq);
		if ($relaxed) {
			($id, $seq) = $line =~ /^(\S+)\s+(.+)/;
		} else {
			($id, $seq) = $line =~ /^(.{10})(.+)/;
		}
		$id =~ s/\s+$//g;
		$seqs{$id} = $seq;
		$order{$id} = ++$counter;
		push @ids, $id;
	}
	
	while (@lines) {
		foreach my $i (1..$count) {
			$seqs{$ids[$i-1]} .= shift @lines;
		}
	}
	
	foreach my $id (keys %seqs) {
		$seqs{$id} =~ s/\s+//g;
	}
	
	return (\%seqs, \%order);
}

sub from_phylip_s {
	my ($in) = @_;
	my (%seqs, %order);
	my $counter = 0;
	my @lines = split(/\n/, $in);
	foreach my $line (@lines) {
		if ($line =~ /^(\S+)\s+(\S+)$/) {
			$seqs{$1} .= $2;
			unless (exists($order{$1})) {
				$order{$1} = ++$counter;
			}
		}
	}
	return (\%seqs, \%order);
}

sub from_phylip {
	my ($in) = @_;
	carp "Conversion from PHYLIP assuming an interleaved file, strict format.";
	return from_phylip_i($in, 0);
}

sub to_clustal {
	my ($seqs, $order) = @_;
	my $out;
	my @ids = keys(%$seqs);
	$out = "CLUSTAL [Format Conversion] multiple sequence alignment\n\n\n";
	
	my $max_chunks = 0;
	my %seq_chunks;
	foreach my $id (@ids) {
		my @chunks = $$seqs{$id} =~ /(.{1,60})/gms;
		$max_chunks = scalar(@chunks) if scalar(@chunks) > $max_chunks;
		$seq_chunks{$id} = \@chunks;
	}
	
	unless ($order) {
		for (my $i=0; $i<scalar(@ids); $i++) {
			$$order{$ids[$i]} = $i+1;
		}
	}
	for (my $i=0; $i<$max_chunks; $i++) {
		foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
			if (exists($seq_chunks{$id})) {
				$out .= "$id  ".$seq_chunks{$id}[$i]."\n";
			}
		}
		$out .= "\n\n";
	}
	return $out;
}

sub to_fasta {
	my ($seqs, $order) = @_;
	my $out;
	my @ids = keys(%$seqs);
	unless ($order) {
		for (my $i=0; $i<scalar(@ids); $i++) {
			$$order{$ids[$i]} = $i+1;
		}
	}
	foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
		if (exists($$seqs{$id})) {
			(my $seq = $$seqs{$id}) =~ s/([\w\-\.]{1,80})/$1\n/gms;
			$out .= ">$id\n$seq\n";
		}
	}
	return $out;
}

sub to_maf {
	my ($seqs, $order) = @_;
	my $out;
	my @ids = keys(%$seqs);
	my $seq_length = length($$seqs{random_select(\@ids)});
	$out = "##maf version=1\n";
	$out .= "# Format Conversion, with dummy positional data\n\n";
	$out .= "a ENTRY=Alignment\n";
	unless ($order) {
		for (my $i=0; $i<scalar(@ids); $i++) {
			$$order{$ids[$i]} = $i+1;
		}
	}
	foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
		if (exists($$seqs{$id})) {
			$out .= "s $id\t0\t$seq_length\t+\t$seq_length\t".$$seqs{$id}."\n";
		}
	}
	$out .= "\n";
	return $out;
}

sub to_paml {
	my ($seqs, $order) = @_;
	my $out;
	my @ids = keys(%$seqs);
	my $seq_count = scalar(keys(%$seqs));
	my $seq_length = length($$seqs{$ids[0]});
	$out = "  $seq_count $seq_length\n";
	unless ($order) {
		for (my $i=0; $i<scalar(@ids); $i++) {
			$$order{$ids[$i]} = $i+1;
		}
	}
	foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
		if (exists($$seqs{$id})) {
			my $seq = $$seqs{$id};
			#$id =~ s/(.{28}).*/$1/;
			$out .= "$id  $seq\n";
		}
	}
	return $out;
}

sub to_phylip_i {
	my ($seqs, $order, $relaxed) = @_;
	$relaxed = 1 unless defined $relaxed;
	my $out;
	my @ids = keys(%$seqs);
	my $seq_count = scalar(keys(%$seqs));
	my $seq_length = length($$seqs{$ids[0]});
	$out = "  $seq_count $seq_length\n";
	unless ($order) {
		for (my $i=0; $i<scalar(@ids); $i++) {
			$$order{$ids[$i]} = $i+1;
		}
	}
	foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
		(my $seq = $$seqs{$id}) =~ s/(.{1,10})/$1 /gm;
		$seq =~ s/(.{1,54}) /$1\n/gm;
		my @seq = split(/\n/, $seq);
		
		if ($relaxed) {
			$out .= "$id ";
		} else {
			$out .= sprintf "%-10.10s", $id;
		}
		$out .= shift @seq;
		$out .= "\n";
		
		$$seqs{$id} = \@seq;
	}
	
	my $chunks = scalar(@{$$seqs{$ids[0]}});
	foreach my $i (1..$chunks) {
		$out .= "\n";
		foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
			$out .= "          ";
			$out .= shift @{$$seqs{$id}};
			$out .= "\n";
		}
	}
	
	return $out;
}

sub to_phylip_s {
	my ($seqs, $order) = @_;
	my $out;
	my @ids = keys(%$seqs);
	my $seq_count = scalar(keys(%$seqs));
	my $seq_length = length($$seqs{$ids[0]});
	$out = "  $seq_count $seq_length\n";
	unless ($order) {
		for (my $i=0; $i<scalar(@ids); $i++) {
			$$order{$ids[$i]} = $i+1;
		}
	}
	foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
		if (exists($$seqs{$id})) {
			my $seq = $$seqs{$id};
			$out .= "$id  $seq\n";
		}
	}
	return $out;
}

sub to_phylip {
	my ($seqs, $order) = @_;
	carp "Conversion to PHYLIP assuming an interleaved file, strict format.";
	return to_phylip_i($seqs, $order, 0);
}

sub from_x_to_y {
	my ($from, $to, $in, $out_file) = @_;
	my ($seqs, $order, $out);
	
	if (lc($from) eq "ama") {
		($seqs, $order) = from_maf($in);
	} elsif (lc($from) eq "clustal") {
		($seqs, $order) = from_clustal($in);
	} elsif (lc($from) eq "fasta") {
		($seqs, $order) = from_fasta($in);
	} elsif (lc($from) eq "maf") {
		($seqs, $order) = from_maf($in);
	} elsif (lc($from) eq "paml") {
		($seqs, $order) = from_paml($in);
	} elsif (lc($from) eq "phylip") {
		($seqs, $order) = from_phylip($in);
	} else {
		croak "Unrecognised format '$from'";
	}
	
	if (lc($to) eq "ama") {
		$out = to_maf($seqs, $order);
	} elsif (lc($to) eq "clustal") {
		$out = to_clustal($seqs, $order);
	} elsif (lc($to) eq "fasta") {
		$out = to_fasta($seqs, $order);
	} elsif (lc($to) eq "maf") {
		$out = to_maf($seqs, $order);
	} elsif (lc($to) eq "paml") {
		$out = to_paml($seqs, $order);
	} elsif (lc($to) eq "phylip") {
		$out = to_phylip($seqs, $order);
	} else {
		croak "Unrecognised format '$to'";
	}
	save_to_file($out, $out_file) if $out_file;
	
	return $out;
}

sub maf_blocks {
	my ($bed_file, $maf_dir, $chr, $species, $out_file) = @_;
	croak "Bed file must be specified" unless $bed_file;
	croak "MAF directory must be specified" unless $maf_dir;
	croak "Chromosome must be specified" unless $chr;
	($out_file = $bed_file) =~ s/bed$/maf/ unless $out_file;
	
	my $maf_file = "$maf_dir/$chr.maf";
	my $zip_it = 0;
	unless (-e $maf_file) {
		croak "MAF file '$maf_file' does not exist" unless -e "$maf_file.gz";
		`gunzip $maf_file.gz`;
		$zip_it = 1;
	}
	
	my $index_file = "$maf_file.index";
	unless (-e $index_file) {
		my $index_script = "$resources_dir/scripts/bx-python/maf_build_index.py";
		execute_check("$index_script $maf_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
	}
	
	# The Galaxy code requires that a build be specified, which is a bit of
	# a hassle, because it can easily be determined from the first line of
	# the maf file; and, indeed, the Galaxy code requires that the location
	# of the unzipped maf file is given as well as the index, so it must be
	# doing something with it.
	my $first_line = `grep -m 1 '^s ' $maf_file`;
	my ($build) = $first_line =~ /^s (\w+)/;
	
	my $maf_script = "$resources_dir/scripts/galaxy/maf/interval2maf.py";
	my $maf_options = "--dbkey=$build ";
	$maf_options .= "--chromCol=1 --startCol=2 --endCol=3 --strandCol=6 ";
	if ($species) {
		$maf_options .= "--species=".join(",", @$species)." ";
	}
	$maf_options .= "--mafFile=$maf_file ";
	$maf_options .= "--mafIndex=$index_file ";
	$maf_options .= "--interval_file=$bed_file ";
	$maf_options .= "--output_file=$out_file ";
	
	execute_check("$maf_script $maf_options", "MAF blocks", 1, "Galaxy: interval2maf.py", "fatal");
	
	if ($zip_it) {
		`gzip $maf_file`;
	}
	
	return;
}

# Sometimes the reference sequence won't be aligned to anything, which will
# look like missing data in the maf file, which by definition contains blocks
# with two or more species. This is a problem when we want to look at large
# scale genomic data, when the spacing between aligned sections may be important.
# So we splice in the appropriate bit of sequence as a single-species block.
sub maf_blocks_fill {
	my ($maf_file, $two_bit_dir, $out_file, $chr, $start, $stop) = @_;
	$out_file = $maf_file unless $out_file;
	
	my $maf = read_from_file($maf_file);
	# Ensure that we don't miss the last block if there aren't enough newlines.
	$maf .= "\n\n";
	my ($new_maf) = $maf =~ /(^#.+\n)/m;
	my @blocks = $maf =~ /(^a.+?)\n\n/gms;
	
	# Assume that the top row in the maf file pertains to the reference species.
	my ($ref_species, $strand, $total) = $maf =~ /^s\s+(\S+)\s+\d+\s+\d+\s+(\S+)\s+(\d+)/m;
	($chr) = $ref_species =~ /(chr\w+)/ unless $chr;
	if ($total < $stop) {
		$stop = $total;
		carp("Stop position is beyond the end of the chromosome");
	}
	
	# Our cursor is always relative to the positive strand.
	my $cursor = $start;
	foreach my $block (@blocks) {
		my ($block_start, $bases, $block_strand) = $block =~ /^s\s+$ref_species\s+(\d+)\s+(\d+)\s+(\S+)\s+\d+\s+\S+\s*$/m;
		carp "Changed strand!" if $block_strand ne $strand;
		
		# Stuff on the negative strand is a pain in the giblets.
		# In the MAF files the start position is given relative to the
		# strand: so for the -ve strand, to get the position on the +ve
		# strand we need to substract the start from the total length of
		# the chr/contig. It's clearer if we repeat ourselves for +ve
		# and -ve strands, rather than get muddled with a more concise
		# chunk of code.
		if ($block_strand eq '-') {
			my $positive_start = $total - $block_start - $bases;
			if ($cursor < $positive_start) {
				my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $positive_start);
				$seq = complement_rev($seq);
				my $length = length($seq);
				my $negative_start = $total - $cursor - $length;
				$new_maf .= "a score=0.0\n";
				$new_maf .= "s $ref_species  $negative_start $length $block_strand $total $seq \n\n";
			}
			$cursor = $positive_start + $bases;
		} else {
			if ($cursor < $block_start) {
				my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $block_start);
				my $length = length($seq);
				$new_maf .= "a score=0.0\n";
				$new_maf .= "s $ref_species  $cursor $length $block_strand $total $seq \n\n";
			}
			$cursor = $block_start + $bases;
		}
		$new_maf .= "$block\n\n";
	}
	
	if ($cursor < $stop) {
		my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $stop);
		my $length = length($seq);
		$new_maf .= "a score=0.0\n";
		
		if ($strand eq '-') {
			$seq = complement_rev($seq);
			my $negative_start = $total - $cursor - $length;
			$new_maf .= "s $ref_species  $negative_start $length $strand $total $seq \n\n";
		} else {
			$new_maf .= "s $ref_species  $cursor $length $strand $total $seq \n\n";
		}
	}
	
	save_to_file($new_maf, $out_file);
	
	return $new_maf;
}
################################################################################

################################################################################
# SEQUENCE MANIPULATION
################################################################################
# Complement and (optionally) reverse a sequence.
sub complement {
	my ($seq, $rna) = @_;
	if ($rna) {
		$seq =~ tr/ACGURYacgury/UGCAYRugcayr/;
	} else {
		$seq =~ tr/ACGTRYacgtry/TGCAYRtgcayr/;
	}
	return $seq;
}

sub complement_rev {
	my ($seq, $rna) = @_;
	return reverse(complement($seq, $rna));
}

# Reverse sequences - primarily useful for doing HoT calculations.
sub reverse_seqs {
	my ($seqs) = @_;
	my %new_seqs;
	foreach my $id (sort keys %$seqs) {
		$new_seqs{$id} = reverse($$seqs{$id});
	}
	return wantarray ? %new_seqs : \%new_seqs;
}

sub remove_gaps {
	my ($seqs) = @_;
	my %new_seqs;
	foreach my $id (sort keys %$seqs) {
		(my $ungapped = $$seqs{$id}) =~ s/[\.\-\s]//gms;
		$new_seqs{$id} = $ungapped;
	}
	return wantarray ? %new_seqs : \%new_seqs;
}

sub excise_columns {
	my ($seqs, $order, $regexes) = @_;
	
	unless ($order) {
		my @ids = keys(%$seqs);
		for (my $i=0; $i<scalar(@ids); $i++) {
			$$order{$ids[$i]} = $i+1;
		}
	}
	# Default is to remove gaps.
	$regexes = ["[\.\-]+"] unless $regexes;
	
	my @columns = alignment_columns($seqs, $order);
	
	my @columns_excised;
	foreach my $seq (@columns) {
		my $ok = 1;
		foreach my $regex (@$regexes) {
			if ($seq =~ /^$regex$/) {
				$ok = 0;
				last;
			}
		}
		push @columns_excised, $seq if $ok;
	}
	
	my %seqs_excised = columns_to_alignment(\@columns_excised, $order);
	
	return wantarray ? %seqs_excised : \%seqs_excised;
}

# Remove taxa from an alignment, and then excise any columns of gaps.
sub prune_alignment {
	my ($seqs, $taxa) = @_;
	my %pruned = %$seqs;
	
	foreach my $taxon (@$taxa) {
		delete($pruned{$taxon});
	}
	my %final = excise_columns(\%pruned);
	
	return wantarray ? %final : \%final;
}
################################################################################

################################################################################
# SEQUENCE STATISTICS
################################################################################
# Calculate single, di-, or tri- nucleotide frequencies of a sequence.
sub nucleotide_freqs {
	my ($sequence, $k_let, $merge, $return_counts) = @_;
	$k_let = 1 unless $k_let;
	$merge = 0 unless $merge;
	$return_counts = 0 unless $return_counts;
	my (%counts, $total, %freqs);
	
	# Remove any whitespace
	$sequence =~ s/\s//g;
	
	if ($k_let == 1) {
		# Single nucleotide counting can be done fairly simply
		$total = length($sequence);
		while (length($sequence) > 0) {
			$sequence =~ s/^(.)//;
			$counts{$1}++;
		}
	} else {
		# If a possible k-let doesn't occur in a particular sequence, it won't
		# appear in the results - it's useful to have all possible k-lets in
		# the results hash, so we initialise them first.
		my %chars = ();
		foreach my $char (split //, $sequence) {
			$chars{$char}++;
		}
		foreach my $char (keys %chars) {
			$counts{$char} = 0;
		}
		for (my $i=1; $i<$k_let; $i++) {
			foreach my $word (keys %counts) {
				foreach my $char (keys %chars) {
					$counts{"$char$word"} = 0;
					$counts{"$word$char"} = 0;
				}
				delete($counts{$word});
			}
		}

		# Move a window of size k across the sequence, one base at a time
		my $k_let_minus_1 = $k_let - 1;
		$total = length($sequence) - $k_let_minus_1;
		while (length($sequence) >= $k_let) {
			$sequence =~ s/^(.(.{$k_let_minus_1}))/$2/;
			if ($merge) {
				# Count the sequences regardless of direction,
				# e.g. AC and CA are treated as the same
				my $sub_seq = $1;
				if (substr($sub_seq, 0, 1) gt substr($sub_seq, -1, 1)) {
					$sub_seq = reverse $sub_seq;
				}
				$counts{$sub_seq}++;
			} else {
				$counts{$1}++;
			}
		}
	}
	
	# So far, just have counts, which might be wanted; if not,
	# divide by the total to get frequencies.
	if ($return_counts) {
		return wantarray ? %counts : \%counts;
	} else {
		foreach my $sub_seq (keys %counts) {
			$freqs{$sub_seq} = sprintf("%.6f", $counts{$sub_seq}/$total);
		}
		return wantarray ? %freqs : \%freqs;
	}
}

# Calculate combined nucleotide frequencies of a set of sequences.
sub nucleotide_freqs_multiple {
	my ($sequences, $k_let, $merge) = @_;
	$k_let = 1 unless $k_let;
	
	my (%all_counts, $total, %freqs);
	
	foreach my $sequence (@$sequences) {
		$sequence =~ s/[\.\-\s]//gms;
		my %counts = nucleotide_freqs($sequence, $k_let, $merge, 1);
		foreach my $nuc (keys %counts) {
			$all_counts{$nuc} += $counts{$nuc};
		}
		$total += length($sequence)-$k_let+1;
	}
	
	foreach my $nuc (keys %all_counts) {
		$freqs{$nuc} = sprintf("%.6f", $all_counts{$nuc}/$total);
	}
	
	return wantarray ? %freqs : \%freqs;
}

sub gc_content {
	my ($sequence) = @_;
	$sequence =~ s/[\.\-\s]//gms;
	my $gc = () = $sequence =~ /[cCgG]/g;
	return sprintf("%.6f", $gc/length($sequence));
}

sub gc_content_multiple {
	my ($sequences) = @_;
	my @sequences = @$sequences;
	my ($gc, $length);
	foreach my $sequence (@sequences) {
		$sequence =~ s/[\.\-\s]//gms;
		$gc += () = $sequence =~ /[cCgG]/g;
		$length += length($sequence);
	}
	return sprintf("%.6f", $gc/$length);
}

sub at_content {
	my ($sequence) = @_;
	$sequence =~ s/[\.\-\s]//gms;
	my $at = () = $sequence =~ /[aAtTuU]/g;
	return sprintf("%.6f", $at/length($sequence));
}

sub at_content_multiple {
	my ($sequences) = @_;
	my @sequences = @$sequences;
	my ($at, $length);
	foreach my $sequence (@sequences) {
		$sequence =~ s/[\.\-\s]//gms;
		$at += () = $sequence =~ /[aAtTuU]/g;
		$length += length($sequence);
	}
	return sprintf("%.6f", $at/$length);
}

# Calculate pairwise identity of an alignment.
sub pairwise_identity {
	my ($sequences, $calc_type, $ignore_case) = @_;
	$calc_type = "include_gaps" unless $calc_type;
	$ignore_case = 1 unless defined $ignore_case;
	my $seq_count = scalar(@$sequences);
	my $seq_length = length($$sequences[0]);
	my ($shortest_length, $total_length);
	
	# Count the occurence of each character, at each position.
	my %position;
	foreach my $seq (@$sequences) {
		my $length = () = $seq =~ /[^\.\-]/g;
		$shortest_length = $length if !defined($shortest_length) || $length < $shortest_length;
		$total_length += $length;
		
		my @sequence = split(//, $seq);
		for (my $i=0; $i<length($seq); $i++) {
			if ($ignore_case) {
				$position{$i}{uc($sequence[$i])}++;
			} else {
				$position{$i}{$sequence[$i]}++;
			}
		}
	}
	
	my $matches = 0;
	my $gaps = 0;
	foreach my $pos (sort {$a <=> $b} keys %position) {
		foreach my $char (sort keys %{$position{$pos}}) {
			if ($position{$pos}{$char} > 1) {
				if ($calc_type eq "include_gaps") {
					$matches += binomial_coeff($position{$pos}{$char}, 2);
				} else {
					if ($char =~ /[\.\-]/) {
						$gaps += binomial_coeff($position{$pos}{$char}, 2);
					} else {
						$matches += binomial_coeff($position{$pos}{$char}, 2);
					}
				}
			}
		}
	}
	
	my $permutations = binomial_coeff($seq_count, 2);
	my $denominator;
	if ($calc_type eq "include_gaps") {
		$denominator = $permutations*$seq_length;
	} elsif ($calc_type eq "exclude_gaps") {
		$denominator = ($permutations*$seq_length)-$gaps;
	} elsif ($calc_type eq "shortest") {
		# Not entirely sure that this one makes sense for MSAs?
		$denominator = $permutations*$shortest_length;
	} elsif ($calc_type eq "mean_length") {
		$denominator = $permutation…