PageRenderTime 65ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/MonkeyShines/Sequence.pm

https://bitbucket.org/james_monkeyshines/monkeyshines
Perl | 2315 lines | 1549 code | 169 blank | 597 comment | 170 complexity | 09dba631b6a905d5f035d49393157456 MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  1. package MonkeyShines::Sequence;
  2. use warnings;
  3. use strict;
  4. use Carp;
  5. our $VERSION = '0.1';
  6. use File::Path qw(make_path);
  7. use FindBin qw($Bin);
  8. use MonkeyShines::Utils qw (
  9. timestamp_id
  10. read_from_file
  11. read_from_delim
  12. save_to_file
  13. save_to_delim
  14. mean
  15. median_hash
  16. binomial_coeff
  17. random_select
  18. random_select_and_remove
  19. list_exists
  20. flip_hash
  21. execute_check
  22. );
  23. # Only need this module if we use the embl_fetch function, which is
  24. # currently mothballed at the bottom of this module...
  25. #use LWP::Simple;
  26. use Exporter;
  27. our @ISA = qw(Exporter);
  28. our @EXPORT_OK = qw(
  29. download_ucsc maf_index maf_blocks maf_blocks_fill phastcons phastcons_scores
  30. sequence_portion sequence_flanking flanking_lengths
  31. find_fasta_sequence match_fasta_sequences
  32. from_x_to_y from_clustal from_fasta from_maf from_maf_crop_id from_paml
  33. from_phylip_i from_phylip_s from_phylip
  34. to_clustal to_fasta to_maf to_paml to_phylip_i to_phylip_s to_phylip
  35. complement complement_rev
  36. reverse_seqs remove_gaps excise_columns prune_alignment
  37. nucleotide_freqs nucleotide_freqs_multiple
  38. gc_content gc_content_multiple at_content at_content_multiple
  39. pairwise_identity pairwise_identity_site all_pairwise_identities median_pairwise_identity
  40. consensus_sequence
  41. alignment_islands
  42. alignment_columns columns_to_alignment
  43. shuffle_sequence randomise_alignment shuffle_alignment random_structure
  44. clustalw mafft mafft_fa blastn
  45. blat_server_start blat_server_stop blat blat_top_hit blat_top_hits fetch_dna
  46. parse_structure sci
  47. );
  48. # This module contains wrapper scripts that execute various bits of software.
  49. # If these are in the user's execution path, the global variable $bin_dir
  50. # can simply be an empty string; otherwise it needs to be the path to the
  51. # directory that contains the executables (or that contains symbolic links to
  52. # the executables).
  53. my $bin_dir = "";
  54. # Sometimes scripts from other software packages need to be executed; this
  55. # directory defines their location.
  56. my $resources_dir = "$Bin/../../resources";
  57. =pod
  58. =head1 Name
  59. MonkeyShines::Sequence - Sequence processing.
  60. =head1 Description
  61. MonkeyShines::Sequence is a collection of functions for sequence processing,
  62. including alignment, format conversion, and sequence statistics.
  63. This module contains wrapper scripts that execute various bits of software.
  64. Generally, these rely on the software being in the user's execution path;
  65. an alternative is to set the global variable $bin_dir in this module to the
  66. path to the directory that contains the executables (or that contains
  67. symbolic links to the executables).
  68. List of software (executables) used by functions in this module:
  69. BLAT (blat faToTwoBit gfClient, gfServer)
  70. bx-python
  71. ClustalW (clustalw)
  72. Galaxy
  73. MAFFT (mafft, ginsi, linsi)
  74. SISSIz (sissiz)
  75. uShuffle (ushuffle)
  76. =head1 Functions
  77. =head2 Sequence Retrieval and Parsing
  78. =over
  79. =item B<C<undef> = download_ucsc($type, $data_dir[, $species][, $chrs][, $x_way][, $ftp_url][, $build][, $unzip])>
  80. Retrieve data from UCSC. The type of data is specified by $type, which must
  81. be either 'fasta', 'maf', or 'phastcons'. The defaults are set up to get human
  82. and vertebrate datafrom the MultiZ 46-way alignment: $species = 'Homo_sapiens';
  83. $x_way = '46way'. For other species, check the UCSC ftp site, to get these
  84. values. $build defaults to the current build for the reference species $species.
  85. To retrieve data for specific chromosomes, provide an arrayref, $chrs (useful
  86. for avoiding data from undetermined locations). Sometimes these files are
  87. enormous, so I wouldn't recommend unzipping them until you need them, unless
  88. you have memory and disk space to burn. All of the files are stored in
  89. $data_dir/$type, which is created if it doesn't exist. The program checks there
  90. before downloading, to see if newer copies of the file are available, and only
  91. downloads if necessary. The formatting of phastCons filenames is entirely
  92. inconsistent, so symbolic links are created with the format, $chr.wig.gz.
  93. =item B<C<undef> = maf_index($maf_dir)>
  94. For each maf file in $maf_dir, create an index file with a script from
  95. the bx-python project (./scripts/bx-python/maf_build_index.py). This is a
  96. time- and memory-intensive process, but is essential for efficiently
  97. executing subsequent functions. If the maf files are gzipped, then they
  98. will be unzipped, processed, then rezipped, one at time.
  99. Obviously you need to have installed bx-python, which is fairly
  100. straightforward: http://bitbucket.org/james_taylor/bx-python/wiki/HowToInstall
  101. While you're at it, you might just want to install Galaxy, as it is needed for
  102. other processing tasks: http://bitbucket.org/galaxy/galaxy-central/wiki/GetGalaxy
  103. =item B<$sub_seq = sequence_portion($seq, $start, $stop)>
  104. Return the sequence between $start and $stop for a given sequence $seq.
  105. =item B<($flanking_l, $flanking_r) = sequence_flanking($seq, $sub_seq[, $left][, $right][, $pad])>
  106. Return the flanking sequences on each side of $sub_seq for a given
  107. sequence $seq. The default amount of flanking sequence returned is
  108. the length of $sub_seq, on either side. To retrieve a specific amount of
  109. flanking, use integer values for $left and $right; $right is set to the
  110. same value as $left if only $left is given. If $pad is true then the
  111. returned flanking regions will be padded with gaps at either end, so
  112. that each is the specified length (this is not done by default). An error
  113. will occur if $sub_seq does not appear in $seq, or if it appears multiple
  114. times.
  115. =item B<$seq = find_fasta_sequence($fasta_string, $id)>
  116. Get a single sequence from fasta-formatted data, whose ID matches $id.
  117. The returned sequence $seq retains the formatting, ie newlines.
  118. =item B<[$%]seqs = match_fasta_sequences($fasta_string, $match)>
  119. Get one or more sequences from fasta-formatted data, where the IDs match $match.
  120. The returned sequences retain their formatting, ie newlines.
  121. =back
  122. =head2 Format Conversion
  123. =over
  124. =item B<$out = from_x_to_y($from, $to, $in[, $out_file])>
  125. Convert an alignment, $in, from one format to another, $out.
  126. $from and $to can take the following values: "clustal", "fasta", "maf",
  127. "paml". Some of these formats are covered by the 'readseq'
  128. software, but are duplicated here for ease of use. If $out_file is
  129. given, then $out is saved there.
  130. =item B<($seqs, $order) = from_<format>($in)>
  131. <format> can be one of "clustal", "fasta", "maf", "paml",
  132. e.g. from_clustal($in). Convert $in, an alignment in that format, to
  133. a hashref, $seqs, with ids and sequences as keys and values, respectively.
  134. Since the order of sequences in a file may be important, $order is a
  135. hashref with ids as keys and incremental numbers as values.
  136. =item B<($out) = to_<format>($seqs[, $order])>
  137. <format> can be one of "clustal", "fasta", "maf", "paml",
  138. e.g. to_clustal($seqs, $order). Save a hashref, $seqs with ids and
  139. sequences as keys and values, respectively, in the specified format.
  140. $order is a hashref with ids as keys and integer values that indicate
  141. the order of the sequences in $out.
  142. =item B<C<undef> = maf_blocks($bed_file, $maf_dir, $chr[, $species][, $out_file])>
  143. Retrieve the MAF blocks for the interval(s) specified in $bed_file, using
  144. the files in $maf_dir; the appropriate files are chosen according to the
  145. chromosome, given by $chr. Indexes are required, for tolerable speed of
  146. execution (see the maf_index function), and these will be created if they do
  147. not exist. The appropirate MAF file will be unzipped then rezipped, if
  148. necessary. If this function will be called repeatedly, it's wisest to unzip
  149. the MAF file before executing the loop that does the repeated calling, as
  150. this'll save a load of time. The maf data is saved to $outfile, which
  151. defaults to $bed_file with the file type changed from 'bed' to 'maf'.
  152. The function uses the Galaxy script ./scripts/galaxy/maf/interval2maf.py, so
  153. Galaxy needs to be installed: http://bitbucket.org/galaxy/galaxy-central/wiki/GetGalaxy
  154. =back
  155. =head2 Sequence Manipulation
  156. =over
  157. =item B<$comp_seq = complement($seq[, $rna])>
  158. Generate the complement of a sequence; set $rna to true for RNA sequence.
  159. =item B<$comp_rev_seq = complement_rev($seq[, $rna])>
  160. Generate the reverse complement of a sequence; set $rna to true for RNA
  161. sequence.
  162. =item B<%new_seqs | $new_seqs = reverse_seqs($seqs)>
  163. Reverse every sequence in the hashref $seqs (handy for HoT calculations).
  164. =item B<%new_seqs | $new_seqs = remove_gaps($seqs)>
  165. Remove gaps from sequences stored in the hashref $seqs. Gaps are defined here
  166. as dots, dashes, and whitespace characters.
  167. =item B<%new_seqs | $new_seqs = excise_columns($seqs[, $order][, $regexes])>
  168. Remove columns that have particular properties, e.g. those that consist
  169. entirely of the same character, or of gaps and a single character
  170. ("[\.\-]*[a-zA-Z][\.\-]*"). It can be tricky to write a single regex to
  171. cover all cases, so the function accepts an arrayref of regexes. The
  172. default is to remove columns which are entirely gaps. The alignment is
  173. specified by the hashref $seqs, with IDs as keys and sequences as values.
  174. If the order of the sequences is important (e.g. a regex relies on a
  175. reference sequence being the first row), $order can be used to provide
  176. a hashref with the same keys as $seqs, and numeric values that give the
  177. ordering. This function is useful when rows have been removed from a multiple
  178. alignment, leaving some columns that consist entirely of gaps.
  179. =item B<%new_seqs | $new_seqs = prune_alignment($seqs, $taxa)>
  180. From a hashref $seqs, with IDs as keys and sequences as values, remove
  181. the sequences with IDs in the arrayref $taxa, then use 'excise_columns'
  182. to remove any columns which consist entirely of gaps.
  183. =back
  184. =head2 Sequence Statistics
  185. =over
  186. =item B<%freqs | $freqs = nucleotide_freqs($sequence[, $k_let][, $merge][, $return_counts])>
  187. Calculate the mono-, di-, or tri- nucleotide frequencies of $sequence, by
  188. setting $k_let = 1 (default), 2, or 3, respectively. For $k_let = 2 or 3,
  189. if $merge is true then palindromes are considered to be equivalent, and
  190. the counts are combined; e.g. the totals for AC and CA are added. A hash,
  191. or hashref, is returned with the mono-, di-, or tri- nucleotides as keys;
  192. and the frequencies (to 6d.p.) as values, unless $return_counts is true,
  193. in which case the integer counts are the values. (In fact, $k_let can be
  194. any integer value, so this function could be used to detect 'words' of
  195. any length in sequence data.)
  196. =item B<%freqs | $freqs = nucleotide_freqs_multiple($sequences[, $k_let][, $merge])>
  197. Calculate the mono-, di-, or tri- nucleotide frequencies of an array of
  198. sequences, $sequences, by using the 'nucleotide_freqs' function. Any gap
  199. characters will be removed from the sequences before calculations are done.
  200. =item B<$freq = gc_content($sequence)>
  201. =item B<$freq = at_content($sequence)>
  202. Calculate the GC or AT(U) content of a sequence, to 6d.p.
  203. =item B<$freq = gc_content_multiple($sequences)>
  204. =item B<$freq = at_content_multiple($sequences)>
  205. Calculate the GC or AT(U) content of an arrayref of sequences, to 6d.p.
  206. Gap characters are removed before doing the calculation.
  207. =item B<$ident = pairwise_identity($sequences[, $calc_type][, $ignore_case])>
  208. Calculate the pairwise identity of an alignment, given by the arrayref
  209. $sequences; if more than two sequences are given, the mean pairwise identity
  210. is calculated. There are a number of ways to define pairwise
  211. identity, each involving slightly different calculations; options for
  212. $calc_type are 'include_gaps' (default), 'exclude_gaps', 'shortest', and
  213. 'mean_length' (cf May (2004) Structure 12:737-8). The (mean) pairwise
  214. identity, $ident, is returned to 6d.p. By default, the function treats
  215. lower and upper case verions of the alignment characters as the same;
  216. set $ignore_case to false to treat them differently.
  217. =item B<$ident = pairwise_identity_site($column[, $calc_type][, $ignore_case])>
  218. More or less the same as the pairwise_identity function, except the
  219. calculations are done for a string values, $column, that represents a column
  220. in an alignment.
  221. =item B<($median_id, $median_value) | ($pairwise_identities | %pairwise_identities) =
  222. all_pairwise_identities($seq, $seqs[, $calc_type][, $median])>
  223. For a reference sequence $seq, calculate the pairwise identity with a bunch
  224. of other sequences in turn, then either return all of the values
  225. ($|%pairwise_identities) or pick the one with the median value and return
  226. both the ID of the chosen sequence and it's value ($median_id, $median_value).
  227. The former is the default, the latter is done if $median is true. See the
  228. pairwise_identity function for more details on $calc_type.
  229. =item B<$consensus_sequence = consensus_sequence($sequences[, $threshold][, $tie_breaks])>
  230. There are several ways to get a consensus sequence. At the simplest end
  231. of the spectrum, the frequency of each character in each column is counted,
  232. and the highest value chosen (equivalent to setting $threshold = 0, the
  233. default value). If multiple characters have equally high frequencies, choose
  234. one at random ($tie_breaks = 'Random', the default).
  235. If $threshold is greater than 0, then only characters with frequencies
  236. greater than that are considered. [Not implemented properly yet.]
  237. The $tie_breaks parameter determines the course of action when characters
  238. appear equally frequently: 'Random' just selects one at random; 'Nonly'
  239. and 'Xonly' insert 'N' or 'X', respectively; 'RYN' replaces the character
  240. with 'R' or 'Y' if both characters are purine or pyrimidine, and 'N'
  241. otherwise; and 'All' places all possible nucleotides in square brackets,
  242. such that the returned consensus sequence is effectively a regex.
  243. =item B<@results | $results = phastcons($bed_file, $phastcons_dir, $chr)>
  244. Calculate mean, min and max scores, for an interval file, $bed_file. Requires
  245. the phastCons wigFix files to have been downloaded (to $phastcons_dir), and
  246. relies on consistent filenaming, as implemented by the 'download_ucsc'
  247. function. This function will create index files if they don't already exist;
  248. the original Galaxy file has been augmented to do this automagically
  249. (./scripts/galaxy/stats/aggregate_scores_in_intervals.py).
  250. Each array element in the results array/arrayref is a tab-delimited line from
  251. the bed file, with mean, min, and max columns tacked on the end, in that order.
  252. =item B<@results | $results = phastcons_scores($bed_file, $phastcons_dir, $chr, $fatal)>
  253. Get scores for each position, for an interval (.bed) file. Requires the
  254. phastCons index (.ba) files (which will automatically have been created by
  255. the 'phastcons' function). Uses the Galaxy script
  256. ./scripts/galaxy/stats/scores_in_intervals.py; if this script fails then
  257. the execution will be halted; set $fatal to zero to just display a warning.
  258. The return value is an arrary or arrayref where each element is a tab-delimited
  259. string of three variables, chr, position, score.
  260. =item B<@columns | $columns = alignment_columns($seqs, $order)>
  261. For a hashref of sequences $seqs, and a hashref with the same keys that
  262. specifies the ordering of the sequences, extract the columns into an
  263. array/arrayref.
  264. =item B<%seqs | $seqs = columns_to_alignment($columns, $order)>
  265. The inverse of the alignment_columns function, to translate an arrayref
  266. of columns to a hashref of sequences, indexed with the keys of $order.
  267. =back
  268. =head2 Shuffle Sequences
  269. =over
  270. =item B<$shuffled = shuffle_sequence($sequence[, $k_let])>
  271. Shuffle a single sequence whilst retaining nucleotide composition, where
  272. $k_let is the number of consecutive nucleotides to consider (default is 1).
  273. This a simple wrapper to ushuffle, which needs to be installed locally.
  274. =item B<$randomised = randomise_alignment($aln_file[, $method][, $options][, $out_file][, $fatal])>
  275. Randomise an alignment with either MultiPerm (the default for $method)
  276. or SISSIz, preserving (approximately) dinucleotide content. Additional
  277. arguments can be provided with the $options parameter - see the
  278. MultiPerm/SISSIz documentation for possible values and formatting.
  279. The randomised alignment is returned as a single string, and is saved
  280. in $out_file, if provided. If $fatal is true (the default) then any errors
  281. will halt execution, rather than just causing a warning.
  282. MultiPerm and SISSIz do much the same thing, in two different ways; SISSIz's
  283. approach is arguably better, but it takes roughly 10 times longer to execute,
  284. and the resulting randomised alignments are difficult to tell apart.
  285. =item B<$shuffled_seqs = shuffle_alignment($seqs, $order[, $no_replace])>
  286. Shuffle the columns of an alignment, given by hashrefs $seqs and $order,
  287. by selecting columns at random either with replacement (the default) or
  288. without ($no_replace set to true). Very simple, makes no attempt to preserve
  289. local conservation or gap structure.
  290. =back
  291. =head2 Align Sequences
  292. =head3 ClustalW
  293. =over
  294. =item B<$out_file = clustalw($fasta_file[, $out_file][, $options])>
  295. Align the sequences in $fasta_file with ClustalW (which must be locally
  296. installed). If $out_file is not provided, the filetype of
  297. $fasta_file is replaced with 'aln' and the output is saved in that
  298. location. Command line options to ClustalW can be provided ($options);
  299. see the ClustalW documentation for options and formatting details.
  300. =back
  301. =head3 MAFFT
  302. =over
  303. =item B<$out_file = mafft($fasta_file[, $out_file][, $local_or_global])>
  304. Align the sequences in $fasta_file with MAFFT (which must be locally
  305. installed). If $out_file is not provided, the filetype of
  306. $fasta_file is replaced with 'aln' and the output is saved in that
  307. location. MAFFT can be used in "local" or "global" mode by specifying
  308. the appropriate string as the $local_or_global parameter; the default
  309. is to allow MAFFT to do what it thinks is best. If $local_or_global is
  310. specified, then the 'linsi' or 'ginsi' executables are required, rather
  311. than 'mafft'.
  312. =back
  313. =head3 BLASTN
  314. =over
  315. =item B<%results | $results = blastn($sequence[, $db][, $entrez_query][, $options][, $out_file])>
  316. Execute BLAST in nucleotide mode for $sequence, against database $db, which is
  317. 'nt' by default. A local BLAST installation is used if possible, but if an
  318. Entrez query is given, a remote installation is used. A useful value for
  319. $entrez_query is 'NOT(srcdb refseq model[PROP]', which will ignore hypothetical
  320. proteins. Additional options can be passed via $options - see the BLAST docs
  321. for details. If $out_file is given, then the results are saved. The results
  322. hash that is returned is a hash of hashes; the first key is the sequence ID
  323. from BLAST (ie giXXX), and the next level of keys are 'start', 'end', 'evalue',
  324. 'bitscore', and 'pident'.
  325. =back
  326. =head3 BLAT
  327. =over
  328. =item B<C<undef> = blat_server_start($fasta_dir[, $host][, $port])>
  329. Start a local BLAT server, for the files in $fasta_dir. By default,
  330. $host and $port are 'localhost' and 3309, respectively. In order to
  331. speed up repeated starting of the BLAT server, the fasta files are
  332. automatically converted to 'two_bit' format, and the data is cached
  333. in an appropriately named directory, alongside $fasta_dir.
  334. =item B<C<undef> = blat_server_stop([$host][, $port])>
  335. Stop a local BLAT server. By default, $host and $port are 'localhost'
  336. and 3309, respectively.
  337. =item B<@results | $results = blat($seq_file, $out_file[, $host][, $port][, $verbose])>
  338. Query a local BLAT server. By default, $host and $port are 'localhost'
  339. and 3309, respectively. $seq_file should contain a sequence in fasta
  340. format; the results are saved in $out_file. In addition to the BLAT
  341. output, a score value is calculated and appended to each row; the rows
  342. are then sorted on this score value, with the highest values at the top.
  343. The array of arrays containing the output is returned.
  344. =item B<($chr, $start, $end, $q_start, $q_end, $strand, $blocks) = blat_top_hit($results)>
  345. Extract useful information for the top-scoring result from a set of BLAT
  346. results, as returned by the 'blat' function.
  347. =back
  348. =head1 Author
  349. James Allen
  350. (james@monkeyshines.co.uk)
  351. =head1 Copyright
  352. Copyright 2009-2011 James Allen; University of Manchester
  353. =head1 Licence
  354. This program is free software: you can redistribute it and/or modify
  355. it under the terms of the GNU General Public License as published by
  356. the Free Software Foundation, either version 3 of the License, or
  357. (at your option) any later version.
  358. This program is distributed in the hope that it will be useful,
  359. but WITHOUT ANY WARRANTY; without even the implied warranty of
  360. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  361. GNU General Public License for more details.
  362. You should have received a copy of the GNU General Public License
  363. along with this program. If not, see <http://www.gnu.org/licenses/>.
  364. =cut
  365. # SVN Date: $Date: 2012-07-24 15:14:56 +0100 (Tue, 24 Jul 2012) $
  366. # SVN ID: $Id: Sequence.pm 927 2012-07-24 14:14:56Z Julio $
  367. ################################################################################
  368. # SEQUENCE RETRIEVAL AND PARSING
  369. ################################################################################
  370. sub download_ucsc {
  371. my ($type, $data_dir, $species, $chrs, $x_way, $ftp_url, $build, $unzip) = @_;
  372. croak "Type ('fasta', 'maf', or 'phastcons') must be specified" unless $type;
  373. croak "Data directory must be specified" unless $data_dir;
  374. $species = "Homo_sapiens" unless $species;
  375. $species =~ s/ /_/g;
  376. $species = ucfirst(lc($species));
  377. $x_way = "46way" unless $x_way;
  378. $ftp_url = "ftp://hgdownload.cse.ucsc.edu/goldenPath" unless $ftp_url;
  379. $build = "currentGenomes/$species" unless $build;
  380. $unzip = 0 unless $unzip;
  381. my $ucsc_data_dir = "$data_dir/$type";
  382. make_path($ucsc_data_dir) unless -e $ucsc_data_dir;
  383. my @ucsc_data_dirs = ($ucsc_data_dir);
  384. if ($type eq "phastcons") {
  385. if ($x_way eq "46way") {
  386. make_path("$ucsc_data_dir/vertebrata") unless -e "$ucsc_data_dir/vertebrata";
  387. make_path("$ucsc_data_dir/placentalia") unless -e "$ucsc_data_dir/placentalia";
  388. make_path("$ucsc_data_dir/primates") unless -e "$ucsc_data_dir/primates";
  389. @ucsc_data_dirs = ("$ucsc_data_dir/vertebrata", "$ucsc_data_dir/placentalia", "$ucsc_data_dir/primates");
  390. }
  391. }
  392. if ($chrs) {
  393. foreach my $chr (@$chrs) {
  394. if ($type eq "fasta") {
  395. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/chromosomes/chr$chr.fa.gz`;
  396. } elsif ($type eq "maf") {
  397. # Annoyingly, UCSC isn't consistent about where it puts maf
  398. # files, so we check both possible locations, but will only
  399. # get data from one, for a given species.
  400. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/maf/chr$chr.maf.gz`;
  401. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/chr$chr.maf.gz`;
  402. } elsif ($type eq "phastcons") {
  403. if ($x_way eq "46way") {
  404. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/placentalia $ftp_url/$build/phastCons$x_way/placentalMammals/chr$chr*.gz`;
  405. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/primates $ftp_url/$build/phastCons$x_way/primates/chr$chr*.gz`;
  406. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/vertebrata $ftp_url/$build/phastCons$x_way/vertebrate/chr$chr*.gz`;
  407. } else {
  408. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*chr$chr*.gz`;
  409. }
  410. }
  411. }
  412. } else {
  413. if ($type eq "fasta") {
  414. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/chromosomes/chr*.fa.gz`;
  415. } elsif ($type eq "maf") {
  416. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/maf/chr*.maf.gz`;
  417. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/chr*.maf.gz`;
  418. } elsif ($type eq "phastcons") {
  419. if ($x_way eq "46way") {
  420. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/placentalia $ftp_url/$build/phastCons$x_way/placentalMammals/chr*.gz`;
  421. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/primates $ftp_url/$build/phastCons$x_way/primates/chr*.gz`;
  422. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir/vertebrata $ftp_url/$build/phastCons$x_way/vertebrate/chr*.gz`;
  423. } else {
  424. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*chr*.gz`;
  425. }
  426. }
  427. }
  428. # Always handy to have the tree(s) downloaded too.
  429. if ($type eq "maf") {
  430. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/multiz$x_way/*.nh`;
  431. } elsif ($type eq "phastcons") {
  432. `wget --no-verbose --timestamping --directory-prefix=$ucsc_data_dir $ftp_url/$build/phastCons$x_way/*.mod`;
  433. }
  434. # The formatting of the phastCons filenames is a dog's dinner - it's
  435. # entirely inconsistent, so we create links to make them easier to
  436. # process at a later stage - we don't want to rename them, because
  437. # then wget will not know that the data already exists.
  438. if ($type eq "phastcons") {
  439. foreach my $dir (@ucsc_data_dirs) {
  440. opendir(DIR, $dir);
  441. foreach my $file (grep { /\.gz$/ } readdir(DIR)) {
  442. my ($chr) = $file =~ /(chr\w+)/;
  443. `ln -s $dir/$file $dir/$chr.wig.gz`;
  444. }
  445. closedir(DIR);
  446. }
  447. }
  448. if ($unzip) {
  449. foreach my $dir (@ucsc_data_dirs) {
  450. opendir(DIR, $dir);
  451. foreach my $file (grep { /\.gz$/ } readdir(DIR)) {
  452. (my $unzipped = $file) =~ s/\.gz$//;
  453. `gunzip -c $dir/$file > $dir/$unzipped`;
  454. }
  455. closedir(DIR);
  456. }
  457. }
  458. return;
  459. }
  460. sub maf_index {
  461. my ($maf_dir) = @_;
  462. croak "MAF directory must be specified" unless $maf_dir;
  463. # This function relies on bx-python being installed.
  464. my $index_script = "$resources_dir/scripts/bx-python/maf_build_index.py";
  465. opendir(DIR, $maf_dir);
  466. foreach my $maf_file (grep { /\.maf$/ } readdir(DIR)) {
  467. unless (-e "$maf_dir/$maf_file.index") {
  468. execute_check("$index_script $maf_dir/$maf_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
  469. }
  470. }
  471. closedir(DIR);
  472. opendir(DIR, $maf_dir);
  473. foreach my $maf_file (grep { /\.maf.gz$/ } readdir(DIR)) {
  474. (my $index_file = $maf_file) =~ s/\.gz$/\.index/;
  475. unless (-e "$maf_dir/$index_file") {
  476. `gunzip -c $maf_dir/$maf_file > $maf_dir/unzipped.tmp`;
  477. execute_check("$index_script $maf_dir/unzipped.tmp $maf_dir/$index_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
  478. `rm $maf_dir/unzipped.tmp`;
  479. }
  480. }
  481. closedir(DIR);
  482. return;
  483. }
  484. sub sequence_portion {
  485. my ($seq, $start, $stop) = @_;
  486. return substr($seq, $start-1, $stop-$start);
  487. }
  488. sub sequence_flanking {
  489. my ($seq, $sub_seq, $left, $right, $pad) = @_;
  490. # The default for the flanking region is the length of the sub-sequence,
  491. # on both sides; if a single length is provided, then it is applied to
  492. # both sides
  493. $left = length($sub_seq) unless $left;
  494. $right = $left unless $right;
  495. # By default, do not pad the flanking regions with
  496. # gaps if there is insufficient sequence.
  497. $pad = 0 unless $pad;
  498. # Check how many times the sub-sequence occurs in the sequence
  499. my $occurrence = () = $seq =~ /($sub_seq)/gi;
  500. croak "Sub-sequence does not occur within sequence" unless $occurrence;
  501. croak "Multiple occurrences of sub-sequence in sequence" if $occurrence > 1;
  502. my ($flanking_l, $flanking_r) = $seq =~
  503. /([\w\-]{0,$left})$sub_seq([\w\-]{0,$right})/i;
  504. if ($pad) {
  505. if (length($flanking_l) < $left) {
  506. $flanking_l = "-"x($left-length($flanking_l)).$flanking_l;
  507. }
  508. if (length($flanking_r) < $right) {
  509. $flanking_r = $flanking_r."-"x($right-length($flanking_r));
  510. }
  511. }
  512. return ($flanking_l, $flanking_r);
  513. }
  514. sub flanking_lengths {
  515. my ($subseq, $aligned) = @_;
  516. $aligned =~ s/\./\-/g;
  517. my @aligned = split(//, $aligned);
  518. (my $ungapped = $aligned) =~ s/\-//g;
  519. my $occurences = () = $ungapped =~ /$subseq/g;
  520. if (!defined($occurences) || $occurences == 0) {
  521. carp " Couldn't find subsequence ($subseq) in sequence ($ungapped).\n";
  522. } elsif ($occurences > 1) {
  523. carp " Multiple occurences of subsequence ($subseq) in alignment ($ungapped).\n";
  524. }
  525. my $upstream_bases = index($ungapped, $subseq);
  526. my $upstream_and_subseq_bases = $upstream_bases + length($subseq);
  527. my $bases = 0;
  528. my ($upstream_length, $subseq_length, $downstream_length) = (0, 0, 0);
  529. foreach my $position (0..$#aligned) {
  530. $bases++ if $aligned[$position] ne '-';
  531. if ($bases <= $upstream_bases) {
  532. $upstream_length++;
  533. } elsif ($bases < $upstream_and_subseq_bases) {
  534. $subseq_length++;
  535. } elsif ($bases == $upstream_and_subseq_bases) {
  536. $subseq_length++;
  537. last;
  538. }
  539. }
  540. $downstream_length = length($aligned) - $upstream_length - $subseq_length;
  541. return ($upstream_length, $subseq_length, $downstream_length);
  542. }
  543. # Get a single sequence from fasta-formatted data, for a given ID.
  544. sub find_fasta_sequence {
  545. my ($fasta_string, $id) = @_;
  546. # The empty parentheses force a count, rather than a true/false result
  547. my $occurrence = () = $fasta_string =~ /^>$id\s/gms;
  548. croak "The id '$id' does not occur in the fasta data" unless $occurrence;
  549. croak "Multiple occurrences of '$id' in the fasta data" if $occurrence > 1;
  550. my ($seq) = $fasta_string =~ /^>$id[^\n]*\n([^>]*)/ms;
  551. $seq =~ s/\s+//gms;
  552. return $seq;
  553. }
  554. # Get all sequences from fasta-formatted data that match part of the ID line.
  555. sub match_fasta_sequences {
  556. my ($fasta_string, $match) = @_;
  557. # If $match is not given, parse the whole file.
  558. $match = "" unless $match;
  559. my %seqs = $fasta_string =~ /^>([^\n]*$match\S*)[^\n]*\n([^>]*)/gms;
  560. foreach my $id (keys %seqs) {
  561. $seqs{$id} =~ s/\s+//gms;
  562. }
  563. return wantarray ? %seqs : \%seqs;
  564. }
  565. ################################################################################
  566. ################################################################################
  567. # FORMAT CONVERSION
  568. ################################################################################
  569. sub from_clustal {
  570. my ($in) = @_;
  571. my (%seqs, %order);
  572. my $counter = 0;
  573. my @lines = split(/\n/, $in);
  574. foreach my $line (@lines) {
  575. if ($line =~ /^(\S+)\s+(\S+)$/) {
  576. $seqs{$1} .= $2;
  577. unless (exists($order{$1})) {
  578. $order{$1} = ++$counter;
  579. }
  580. }
  581. }
  582. return (\%seqs, \%order);
  583. }
  584. sub from_fasta {
  585. my ($in) = @_;
  586. my (%seqs, %order);
  587. %seqs = $in =~ /^>(\S+)[^\n]*\n([^>]*)/gms;
  588. foreach my $id (keys %seqs) {
  589. $seqs{$id} =~ s/[\n\r ]//gms;
  590. }
  591. my @ids = $in =~ /^>(\S+)/gms;
  592. for (my $i=0; $i<scalar(@ids); $i++) {
  593. $order{$ids[$i]} = $i+1;
  594. }
  595. return (\%seqs, \%order);
  596. }
  597. sub from_maf {
  598. my ($in) = @_;
  599. my (%blocks, %seqs, %order);
  600. # Ensure that we don't miss the last block if there aren't enough newlines.
  601. $in .= "\n\n";
  602. my @blocks = $in =~ /(^a.+?)\n\n/gms;
  603. my $counter = 0;
  604. foreach my $block (@blocks) {
  605. my ($start, $seq) = $block =~ /^s\s+\S+\s+(\d+)[^\n]+\s+(\S+)\s*$/ms;
  606. my @ids = $block =~ /^s\s+(\S+)/gms;
  607. my %block_seqs = $block =~ /^s\s+(\S+)[^\n]+\s+(\S+)\s*$/gms;
  608. $blocks{$start}{'seqs'} = \%block_seqs;
  609. $blocks{$start}{'size'} = length($seq);
  610. foreach my $id (@ids) {
  611. unless (exists($order{$id})) {
  612. $order{$id} = $counter++;
  613. }
  614. }
  615. }
  616. foreach my $start (sort keys %blocks) {
  617. foreach my $id (sort {$order{$a} <=> $order{$b}} keys %order) {
  618. if (exists($blocks{$start}{'seqs'}{$id})) {
  619. $seqs{$id} .= $blocks{$start}{'seqs'}{$id};
  620. } else {
  621. $seqs{$id} .= "-" x $blocks{$start}{'size'};
  622. }
  623. }
  624. }
  625. return (\%seqs, \%order);
  626. }
  627. # The from_maf function works if the species in the alignment are fully
  628. # assembled on chromosomes, but can run into trouble with multiple contigs
  629. # for one or more species. We make the assumption that the ids are of the
  630. # format 'species.contig', and remove the contig information.
  631. sub from_maf_crop_id {
  632. my ($in) = @_;
  633. $in =~ s/^(s\s+\w+)\S+/$1/gm;
  634. return from_maf($in);
  635. }
  636. sub from_paml {
  637. my ($in) = @_;
  638. my (%seqs, %order);
  639. my $counter = 0;
  640. my @lines = split(/\n/, $in);
  641. foreach my $line (@lines) {
  642. if ($line =~ /^(\S+)\s+(\S+)$/) {
  643. $seqs{$1} .= $2;
  644. unless (exists($order{$1})) {
  645. $order{$1} = ++$counter;
  646. }
  647. }
  648. }
  649. return (\%seqs, \%order);
  650. }
  651. sub from_phylip_i {
  652. my ($data, $relaxed) = @_;
  653. $relaxed = 1 unless defined $relaxed;
  654. my (%seqs, %order, @ids);
  655. my $counter = 0;
  656. my @lines = split(/\n[\n \t]*/, $data);
  657. my $info = shift @lines;
  658. my ($count, $length) = $info =~ /(\d+)\s+(\d+)/;
  659. foreach my $i (1..$count) {
  660. my $line = shift @lines;
  661. my ($id, $seq);
  662. if ($relaxed) {
  663. ($id, $seq) = $line =~ /^(\S+)\s+(.+)/;
  664. } else {
  665. ($id, $seq) = $line =~ /^(.{10})(.+)/;
  666. }
  667. $id =~ s/\s+$//g;
  668. $seqs{$id} = $seq;
  669. $order{$id} = ++$counter;
  670. push @ids, $id;
  671. }
  672. while (@lines) {
  673. foreach my $i (1..$count) {
  674. $seqs{$ids[$i-1]} .= shift @lines;
  675. }
  676. }
  677. foreach my $id (keys %seqs) {
  678. $seqs{$id} =~ s/\s+//g;
  679. }
  680. return (\%seqs, \%order);
  681. }
  682. sub from_phylip_s {
  683. my ($in) = @_;
  684. my (%seqs, %order);
  685. my $counter = 0;
  686. my @lines = split(/\n/, $in);
  687. foreach my $line (@lines) {
  688. if ($line =~ /^(\S+)\s+(\S+)$/) {
  689. $seqs{$1} .= $2;
  690. unless (exists($order{$1})) {
  691. $order{$1} = ++$counter;
  692. }
  693. }
  694. }
  695. return (\%seqs, \%order);
  696. }
  697. sub from_phylip {
  698. my ($in) = @_;
  699. carp "Conversion from PHYLIP assuming an interleaved file, strict format.";
  700. return from_phylip_i($in, 0);
  701. }
  702. sub to_clustal {
  703. my ($seqs, $order) = @_;
  704. my $out;
  705. my @ids = keys(%$seqs);
  706. $out = "CLUSTAL [Format Conversion] multiple sequence alignment\n\n\n";
  707. my $max_chunks = 0;
  708. my %seq_chunks;
  709. foreach my $id (@ids) {
  710. my @chunks = $$seqs{$id} =~ /(.{1,60})/gms;
  711. $max_chunks = scalar(@chunks) if scalar(@chunks) > $max_chunks;
  712. $seq_chunks{$id} = \@chunks;
  713. }
  714. unless ($order) {
  715. for (my $i=0; $i<scalar(@ids); $i++) {
  716. $$order{$ids[$i]} = $i+1;
  717. }
  718. }
  719. for (my $i=0; $i<$max_chunks; $i++) {
  720. foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
  721. if (exists($seq_chunks{$id})) {
  722. $out .= "$id ".$seq_chunks{$id}[$i]."\n";
  723. }
  724. }
  725. $out .= "\n\n";
  726. }
  727. return $out;
  728. }
  729. sub to_fasta {
  730. my ($seqs, $order) = @_;
  731. my $out;
  732. my @ids = keys(%$seqs);
  733. unless ($order) {
  734. for (my $i=0; $i<scalar(@ids); $i++) {
  735. $$order{$ids[$i]} = $i+1;
  736. }
  737. }
  738. foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
  739. if (exists($$seqs{$id})) {
  740. (my $seq = $$seqs{$id}) =~ s/([\w\-\.]{1,80})/$1\n/gms;
  741. $out .= ">$id\n$seq\n";
  742. }
  743. }
  744. return $out;
  745. }
  746. sub to_maf {
  747. my ($seqs, $order) = @_;
  748. my $out;
  749. my @ids = keys(%$seqs);
  750. my $seq_length = length($$seqs{random_select(\@ids)});
  751. $out = "##maf version=1\n";
  752. $out .= "# Format Conversion, with dummy positional data\n\n";
  753. $out .= "a ENTRY=Alignment\n";
  754. unless ($order) {
  755. for (my $i=0; $i<scalar(@ids); $i++) {
  756. $$order{$ids[$i]} = $i+1;
  757. }
  758. }
  759. foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
  760. if (exists($$seqs{$id})) {
  761. $out .= "s $id\t0\t$seq_length\t+\t$seq_length\t".$$seqs{$id}."\n";
  762. }
  763. }
  764. $out .= "\n";
  765. return $out;
  766. }
  767. sub to_paml {
  768. my ($seqs, $order) = @_;
  769. my $out;
  770. my @ids = keys(%$seqs);
  771. my $seq_count = scalar(keys(%$seqs));
  772. my $seq_length = length($$seqs{$ids[0]});
  773. $out = " $seq_count $seq_length\n";
  774. unless ($order) {
  775. for (my $i=0; $i<scalar(@ids); $i++) {
  776. $$order{$ids[$i]} = $i+1;
  777. }
  778. }
  779. foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
  780. if (exists($$seqs{$id})) {
  781. my $seq = $$seqs{$id};
  782. #$id =~ s/(.{28}).*/$1/;
  783. $out .= "$id $seq\n";
  784. }
  785. }
  786. return $out;
  787. }
  788. sub to_phylip_i {
  789. my ($seqs, $order, $relaxed) = @_;
  790. $relaxed = 1 unless defined $relaxed;
  791. my $out;
  792. my @ids = keys(%$seqs);
  793. my $seq_count = scalar(keys(%$seqs));
  794. my $seq_length = length($$seqs{$ids[0]});
  795. $out = " $seq_count $seq_length\n";
  796. unless ($order) {
  797. for (my $i=0; $i<scalar(@ids); $i++) {
  798. $$order{$ids[$i]} = $i+1;
  799. }
  800. }
  801. foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
  802. (my $seq = $$seqs{$id}) =~ s/(.{1,10})/$1 /gm;
  803. $seq =~ s/(.{1,54}) /$1\n/gm;
  804. my @seq = split(/\n/, $seq);
  805. if ($relaxed) {
  806. $out .= "$id ";
  807. } else {
  808. $out .= sprintf "%-10.10s", $id;
  809. }
  810. $out .= shift @seq;
  811. $out .= "\n";
  812. $$seqs{$id} = \@seq;
  813. }
  814. my $chunks = scalar(@{$$seqs{$ids[0]}});
  815. foreach my $i (1..$chunks) {
  816. $out .= "\n";
  817. foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
  818. $out .= " ";
  819. $out .= shift @{$$seqs{$id}};
  820. $out .= "\n";
  821. }
  822. }
  823. return $out;
  824. }
  825. sub to_phylip_s {
  826. my ($seqs, $order) = @_;
  827. my $out;
  828. my @ids = keys(%$seqs);
  829. my $seq_count = scalar(keys(%$seqs));
  830. my $seq_length = length($$seqs{$ids[0]});
  831. $out = " $seq_count $seq_length\n";
  832. unless ($order) {
  833. for (my $i=0; $i<scalar(@ids); $i++) {
  834. $$order{$ids[$i]} = $i+1;
  835. }
  836. }
  837. foreach my $id (sort {$$order{$a} <=> $$order{$b}} keys %$order) {
  838. if (exists($$seqs{$id})) {
  839. my $seq = $$seqs{$id};
  840. $out .= "$id $seq\n";
  841. }
  842. }
  843. return $out;
  844. }
  845. sub to_phylip {
  846. my ($seqs, $order) = @_;
  847. carp "Conversion to PHYLIP assuming an interleaved file, strict format.";
  848. return to_phylip_i($seqs, $order, 0);
  849. }
  850. sub from_x_to_y {
  851. my ($from, $to, $in, $out_file) = @_;
  852. my ($seqs, $order, $out);
  853. if (lc($from) eq "ama") {
  854. ($seqs, $order) = from_maf($in);
  855. } elsif (lc($from) eq "clustal") {
  856. ($seqs, $order) = from_clustal($in);
  857. } elsif (lc($from) eq "fasta") {
  858. ($seqs, $order) = from_fasta($in);
  859. } elsif (lc($from) eq "maf") {
  860. ($seqs, $order) = from_maf($in);
  861. } elsif (lc($from) eq "paml") {
  862. ($seqs, $order) = from_paml($in);
  863. } elsif (lc($from) eq "phylip") {
  864. ($seqs, $order) = from_phylip($in);
  865. } else {
  866. croak "Unrecognised format '$from'";
  867. }
  868. if (lc($to) eq "ama") {
  869. $out = to_maf($seqs, $order);
  870. } elsif (lc($to) eq "clustal") {
  871. $out = to_clustal($seqs, $order);
  872. } elsif (lc($to) eq "fasta") {
  873. $out = to_fasta($seqs, $order);
  874. } elsif (lc($to) eq "maf") {
  875. $out = to_maf($seqs, $order);
  876. } elsif (lc($to) eq "paml") {
  877. $out = to_paml($seqs, $order);
  878. } elsif (lc($to) eq "phylip") {
  879. $out = to_phylip($seqs, $order);
  880. } else {
  881. croak "Unrecognised format '$to'";
  882. }
  883. save_to_file($out, $out_file) if $out_file;
  884. return $out;
  885. }
  886. sub maf_blocks {
  887. my ($bed_file, $maf_dir, $chr, $species, $out_file) = @_;
  888. croak "Bed file must be specified" unless $bed_file;
  889. croak "MAF directory must be specified" unless $maf_dir;
  890. croak "Chromosome must be specified" unless $chr;
  891. ($out_file = $bed_file) =~ s/bed$/maf/ unless $out_file;
  892. my $maf_file = "$maf_dir/$chr.maf";
  893. my $zip_it = 0;
  894. unless (-e $maf_file) {
  895. croak "MAF file '$maf_file' does not exist" unless -e "$maf_file.gz";
  896. `gunzip $maf_file.gz`;
  897. $zip_it = 1;
  898. }
  899. my $index_file = "$maf_file.index";
  900. unless (-e $index_file) {
  901. my $index_script = "$resources_dir/scripts/bx-python/maf_build_index.py";
  902. execute_check("$index_script $maf_file", "usage", 0, "bx-python: maf_build_index.py", "fatal");
  903. }
  904. # The Galaxy code requires that a build be specified, which is a bit of
  905. # a hassle, because it can easily be determined from the first line of
  906. # the maf file; and, indeed, the Galaxy code requires that the location
  907. # of the unzipped maf file is given as well as the index, so it must be
  908. # doing something with it.
  909. my $first_line = `grep -m 1 '^s ' $maf_file`;
  910. my ($build) = $first_line =~ /^s (\w+)/;
  911. my $maf_script = "$resources_dir/scripts/galaxy/maf/interval2maf.py";
  912. my $maf_options = "--dbkey=$build ";
  913. $maf_options .= "--chromCol=1 --startCol=2 --endCol=3 --strandCol=6 ";
  914. if ($species) {
  915. $maf_options .= "--species=".join(",", @$species)." ";
  916. }
  917. $maf_options .= "--mafFile=$maf_file ";
  918. $maf_options .= "--mafIndex=$index_file ";
  919. $maf_options .= "--interval_file=$bed_file ";
  920. $maf_options .= "--output_file=$out_file ";
  921. execute_check("$maf_script $maf_options", "MAF blocks", 1, "Galaxy: interval2maf.py", "fatal");
  922. if ($zip_it) {
  923. `gzip $maf_file`;
  924. }
  925. return;
  926. }
  927. # Sometimes the reference sequence won't be aligned to anything, which will
  928. # look like missing data in the maf file, which by definition contains blocks
  929. # with two or more species. This is a problem when we want to look at large
  930. # scale genomic data, when the spacing between aligned sections may be important.
  931. # So we splice in the appropriate bit of sequence as a single-species block.
  932. sub maf_blocks_fill {
  933. my ($maf_file, $two_bit_dir, $out_file, $chr, $start, $stop) = @_;
  934. $out_file = $maf_file unless $out_file;
  935. my $maf = read_from_file($maf_file);
  936. # Ensure that we don't miss the last block if there aren't enough newlines.
  937. $maf .= "\n\n";
  938. my ($new_maf) = $maf =~ /(^#.+\n)/m;
  939. my @blocks = $maf =~ /(^a.+?)\n\n/gms;
  940. # Assume that the top row in the maf file pertains to the reference species.
  941. my ($ref_species, $strand, $total) = $maf =~ /^s\s+(\S+)\s+\d+\s+\d+\s+(\S+)\s+(\d+)/m;
  942. ($chr) = $ref_species =~ /(chr\w+)/ unless $chr;
  943. if ($total < $stop) {
  944. $stop = $total;
  945. carp("Stop position is beyond the end of the chromosome");
  946. }
  947. # Our cursor is always relative to the positive strand.
  948. my $cursor = $start;
  949. foreach my $block (@blocks) {
  950. my ($block_start, $bases, $block_strand) = $block =~ /^s\s+$ref_species\s+(\d+)\s+(\d+)\s+(\S+)\s+\d+\s+\S+\s*$/m;
  951. carp "Changed strand!" if $block_strand ne $strand;
  952. # Stuff on the negative strand is a pain in the giblets.
  953. # In the MAF files the start position is given relative to the
  954. # strand: so for the -ve strand, to get the position on the +ve
  955. # strand we need to substract the start from the total length of
  956. # the chr/contig. It's clearer if we repeat ourselves for +ve
  957. # and -ve strands, rather than get muddled with a more concise
  958. # chunk of code.
  959. if ($block_strand eq '-') {
  960. my $positive_start = $total - $block_start - $bases;
  961. if ($cursor < $positive_start) {
  962. my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $positive_start);
  963. $seq = complement_rev($seq);
  964. my $length = length($seq);
  965. my $negative_start = $total - $cursor - $length;
  966. $new_maf .= "a score=0.0\n";
  967. $new_maf .= "s $ref_species $negative_start $length $block_strand $total $seq \n\n";
  968. }
  969. $cursor = $positive_start + $bases;
  970. } else {
  971. if ($cursor < $block_start) {
  972. my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $block_start);
  973. my $length = length($seq);
  974. $new_maf .= "a score=0.0\n";
  975. $new_maf .= "s $ref_species $cursor $length $block_strand $total $seq \n\n";
  976. }
  977. $cursor = $block_start + $bases;
  978. }
  979. $new_maf .= "$block\n\n";
  980. }
  981. if ($cursor < $stop) {
  982. my $seq = fetch_dna($two_bit_dir, undef, $chr, $cursor, $stop);
  983. my $length = length($seq);
  984. $new_maf .= "a score=0.0\n";
  985. if ($strand eq '-') {
  986. $seq = complement_rev($seq);
  987. my $negative_start = $total - $cursor - $length;
  988. $new_maf .= "s $ref_species $negative_start $length $strand $total $seq \n\n";
  989. } else {
  990. $new_maf .= "s $ref_species $cursor $length $strand $total $seq \n\n";
  991. }
  992. }
  993. save_to_file($new_maf, $out_file);
  994. return $new_maf;
  995. }
  996. ################################################################################
  997. ################################################################################
  998. # SEQUENCE MANIPULATION
  999. ################################################################################
  1000. # Complement and (optionally) reverse a sequence.
  1001. sub complement {
  1002. my ($seq, $rna) = @_;
  1003. if ($rna) {
  1004. $seq =~ tr/ACGURYacgury/UGCAYRugcayr/;
  1005. } else {
  1006. $seq =~ tr/ACGTRYacgtry/TGCAYRtgcayr/;
  1007. }
  1008. return $seq;
  1009. }
  1010. sub complement_rev {
  1011. my ($seq, $rna) = @_;
  1012. return reverse(complement($seq, $rna));
  1013. }
  1014. # Reverse sequences - primarily useful for doing HoT calculations.
  1015. sub reverse_seqs {
  1016. my ($seqs) = @_;
  1017. my %new_seqs;
  1018. foreach my $id (sort keys %$seqs) {
  1019. $new_seqs{$id} = reverse($$seqs{$id});
  1020. }
  1021. return wantarray ? %new_seqs : \%new_seqs;
  1022. }
  1023. sub remove_gaps {
  1024. my ($seqs) = @_;
  1025. my %new_seqs;
  1026. foreach my $id (sort keys %$seqs) {
  1027. (my $ungapped = $$seqs{$id}) =~ s/[\.\-\s]//gms;
  1028. $new_seqs{$id} = $ungapped;
  1029. }
  1030. return wantarray ? %new_seqs : \%new_seqs;
  1031. }
  1032. sub excise_columns {
  1033. my ($seqs, $order, $regexes) = @_;
  1034. unless ($order) {
  1035. my @ids = keys(%$seqs);
  1036. for (my $i=0; $i<scalar(@ids); $i++) {
  1037. $$order{$ids[$i]} = $i+1;
  1038. }
  1039. }
  1040. # Default is to remove gaps.
  1041. $regexes = ["[\.\-]+"] unless $regexes;
  1042. my @columns = alignment_columns($seqs, $order);
  1043. my @columns_excised;
  1044. foreach my $seq (@columns) {
  1045. my $ok = 1;
  1046. foreach my $regex (@$regexes) {
  1047. if ($seq =~ /^$regex$/) {
  1048. $ok = 0;
  1049. last;
  1050. }
  1051. }
  1052. push @columns_excised, $seq if $ok;
  1053. }
  1054. my %seqs_excised = columns_to_alignment(\@columns_excised, $order);
  1055. return wantarray ? %seqs_excised : \%seqs_excised;
  1056. }
  1057. # Remove taxa from an alignment, and then excise any columns of gaps.
  1058. sub prune_alignment {
  1059. my ($seqs, $taxa) = @_;
  1060. my %pruned = %$seqs;
  1061. foreach my $taxon (@$taxa) {
  1062. delete($pruned{$taxon});
  1063. }
  1064. my %final = excise_columns(\%pruned);
  1065. return wantarray ? %final : \%final;
  1066. }
  1067. ################################################################################
  1068. ################################################################################
  1069. # SEQUENCE STATISTICS
  1070. ################################################################################
  1071. # Calculate single, di-, or tri- nucleotide frequencies of a sequence.
  1072. sub nucleotide_freqs {
  1073. my ($sequence, $k_let, $merge, $return_counts) = @_;
  1074. $k_let = 1 unless $k_let;
  1075. $merge = 0 unless $merge;
  1076. $return_counts = 0 unless $return_counts;
  1077. my (%counts, $total, %freqs);
  1078. # Remove any whitespace
  1079. $sequence =~ s/\s//g;
  1080. if ($k_let == 1) {
  1081. # Single nucleotide counting can be done fairly simply
  1082. $total = length($sequence);
  1083. while (length($sequence) > 0) {
  1084. $sequence =~ s/^(.)//;
  1085. $counts{$1}++;
  1086. }
  1087. } else {
  1088. # If a possible k-let doesn't occur in a particular sequence, it won't
  1089. # appear in the results - it's useful to have all possible k-lets in
  1090. # the results hash, so we initialise them first.
  1091. my %chars = ();
  1092. foreach my $char (split //, $sequence) {
  1093. $chars{$char}++;
  1094. }
  1095. foreach my $char (keys %chars) {
  1096. $counts{$char} = 0;
  1097. }
  1098. for (my $i=1; $i<$k_let; $i++) {
  1099. foreach my $word (keys %counts) {
  1100. foreach my $char (keys %chars) {
  1101. $counts{"$char$word"} = 0;
  1102. $counts{"$word$char"} = 0;
  1103. }
  1104. delete($counts{$word});
  1105. }
  1106. }
  1107. # Move a window of size k across the sequence, one base at a time
  1108. my $k_let_minus_1 = $k_let - 1;
  1109. $total = length($sequence) - $k_let_minus_1;
  1110. while (length($sequence) >= $k_let) {
  1111. $sequence =~ s/^(.(.{$k_let_minus_1}))/$2/;
  1112. if ($merge) {
  1113. # Count the sequences regardless of direction,
  1114. # e.g. AC and CA are treated as the same
  1115. my $sub_seq = $1;
  1116. if (substr($sub_seq, 0, 1) gt substr($sub_seq, -1, 1)) {
  1117. $sub_seq = reverse $sub_seq;
  1118. }
  1119. $counts{$sub_seq}++;
  1120. } else {
  1121. $counts{$1}++;
  1122. }
  1123. }
  1124. }
  1125. # So far, just have counts, which might be wanted; if not,
  1126. # divide by the total to get frequencies.
  1127. if ($return_counts) {
  1128. return wantarray ? %counts : \%counts;
  1129. } else {
  1130. foreach my $sub_seq (keys %counts) {
  1131. $freqs{$sub_seq} = sprintf("%.6f", $counts{$sub_seq}/$total);
  1132. }
  1133. return wantarray ? %freqs : \%freqs;
  1134. }
  1135. }
  1136. # Calculate combined nucleotide frequencies of a set of sequences.
  1137. sub nucleotide_freqs_multiple {
  1138. my ($sequences, $k_let, $merge) = @_;
  1139. $k_let = 1 unless $k_let;
  1140. my (%all_counts, $total, %freqs);
  1141. foreach my $sequence (@$sequences) {
  1142. $sequence =~ s/[\.\-\s]//gms;
  1143. my %counts = nucleotide_freqs($sequence, $k_let, $merge, 1);
  1144. foreach my $nuc (keys %counts) {
  1145. $all_counts{$nuc} += $counts{$nuc};
  1146. }
  1147. $total += length($sequence)-$k_let+1;
  1148. }
  1149. foreach my $nuc (keys %all_counts) {
  1150. $freqs{$nuc} = sprintf("%.6f", $all_counts{$nuc}/$total);
  1151. }
  1152. return wantarray ? %freqs : \%freqs;
  1153. }
  1154. sub gc_content {
  1155. my ($sequence) = @_;
  1156. $sequence =~ s/[\.\-\s]//gms;
  1157. my $gc = () = $sequence =~ /[cCgG]/g;
  1158. return sprintf("%.6f", $gc/length($sequence));
  1159. }
  1160. sub gc_content_multiple {
  1161. my ($sequences) = @_;
  1162. my @sequences = @$sequences;
  1163. my ($gc, $length);
  1164. foreach my $sequence (@sequences) {
  1165. $sequence =~ s/[\.\-\s]//gms;
  1166. $gc += () = $sequence =~ /[cCgG]/g;
  1167. $length += length($sequence);
  1168. }
  1169. return sprintf("%.6f", $gc/$length);
  1170. }
  1171. sub at_content {
  1172. my ($sequence) = @_;
  1173. $sequence =~ s/[\.\-\s]//gms;
  1174. my $at = () = $sequence =~ /[aAtTuU]/g;
  1175. return sprintf("%.6f", $at/length($sequence));
  1176. }
  1177. sub at_content_multiple {
  1178. my ($sequences) = @_;
  1179. my @sequences = @$sequences;
  1180. my ($at, $length);
  1181. foreach my $sequence (@sequences) {
  1182. $sequence =~ s/[\.\-\s]//gms;
  1183. $at += () = $sequence =~ /[aAtTuU]/g;
  1184. $length += length($sequence);
  1185. }
  1186. return sprintf("%.6f", $at/$length);
  1187. }
  1188. # Calculate pairwise identity of an alignment.
  1189. sub pairwise_identity {
  1190. my ($sequences, $calc_type, $ignore_case) = @_;
  1191. $calc_type = "include_gaps" unless $calc_type;
  1192. $ignore_case = 1 unless defined $ignore_case;
  1193. my $seq_count = scalar(@$sequences);
  1194. my $seq_length = length($$sequences[0]);
  1195. my ($shortest_length, $total_length);
  1196. # Count the occurence of each character, at each position.
  1197. my %position;
  1198. foreach my $seq (@$sequences) {
  1199. my $length = () = $seq =~ /[^\.\-]/g;
  1200. $shortest_length = $length if !defined($shortest_length) || $length < $shortest_length;
  1201. $total_length += $length;
  1202. my @sequence = split(//, $seq);
  1203. for (my $i=0; $i<length($seq); $i++) {
  1204. if ($ignore_case) {
  1205. $position{$i}{uc($sequence[$i])}++;
  1206. } else {
  1207. $position{$i}{$sequence[$i]}++;
  1208. }
  1209. }
  1210. }
  1211. my $matches = 0;
  1212. my $gaps = 0;
  1213. foreach my $pos (sort {$a <=> $b} keys %position) {
  1214. foreach my $char (sort keys %{$position{$pos}}) {
  1215. if ($position{$pos}{$char} > 1) {
  1216. if ($calc_type eq "include_gaps") {
  1217. $matches += binomial_coeff($position{$pos}{$char}, 2);
  1218. } else {
  1219. if ($char =~ /[\.\-]/) {
  1220. $gaps += binomial_coeff($position{$pos}{$char}, 2);
  1221. } else {
  1222. $matches += binomial_coeff($position{$pos}{$char}, 2);
  1223. }
  1224. }
  1225. }
  1226. }
  1227. }
  1228. my $permutations = binomial_coeff($seq_count, 2);
  1229. my $denominator;
  1230. if ($calc_type eq "include_gaps") {
  1231. $denominator = $permutations*$seq_length;
  1232. } elsif ($calc_type eq "exclude_gaps") {
  1233. $denominator = ($permutations*$seq_length)-$gaps;
  1234. } elsif ($calc_type eq "shortest") {
  1235. # Not entirely sure that this one makes sense for MSAs?
  1236. $denominator = $permutations*$shortest_length;
  1237. } elsif ($calc_type eq "mean_length") {
  1238. $denominator = $permutation

Large files files are truncated, but you can click here to view the full file