PageRenderTime 33ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/Bio/SimpleAlign.pm

https://bitbucket.org/antismash/clusean
Perl | 3213 lines | 2534 code | 545 blank | 134 comment | 294 complexity | 4e6c8816b00e40960b17aaf4d793ee81 MD5 | raw file
  1. # BioPerl module for SimpleAlign
  2. #
  3. # Please direct questions and support issues to <bioperl-l@bioperl.org>
  4. #
  5. # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
  6. #
  7. # Copyright Ewan Birney
  8. #
  9. # You may distribute this module under the same terms as perl itself
  10. # POD documentation - main docs before the code
  11. #
  12. # History:
  13. # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
  14. # May 2001 major rewrite - Heikki Lehvaslaiho
  15. =head1 NAME
  16. Bio::SimpleAlign - Multiple alignments held as a set of sequences
  17. =head1 SYNOPSIS
  18. # Use Bio::AlignIO to read in the alignment
  19. $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  20. $aln = $str->next_aln();
  21. # Describe
  22. print $aln->length;
  23. print $aln->num_residues;
  24. print $aln->is_flush;
  25. print $aln->num_sequences;
  26. print $aln->score;
  27. print $aln->percentage_identity;
  28. print $aln->consensus_string(50);
  29. # Find the position in the alignment for a sequence location
  30. $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  31. # Extract sequences and check values for the alignment column $pos
  32. foreach $seq ($aln->each_seq) {
  33. $res = $seq->subseq($pos, $pos);
  34. $count{$res}++;
  35. }
  36. foreach $res (keys %count) {
  37. printf "Res: %s Count: %2d\n", $res, $count{$res};
  38. }
  39. # Manipulate
  40. $aln->remove_seq($seq);
  41. $mini_aln = $aln->slice(20,30); # get a block of columns
  42. $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  43. $new_aln = $aln->remove_columns([20,30]); # remove by position
  44. $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  45. # Analyze
  46. $str = $aln->consensus_string($threshold_percent);
  47. $str = $aln->match_line();
  48. $str = $aln->cigar_line();
  49. $id = $aln->percentage_identity;
  50. # See the module documentation for details and more methods.
  51. =head1 DESCRIPTION
  52. SimpleAlign is an object that handles a multiple sequence alignment
  53. (MSA). It is very permissive of types (it does not insist on sequences
  54. being all same length, for example). Think of it as a set of sequences
  55. with a whole series of built-in manipulations and methods for reading and
  56. writing alignments.
  57. SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  58. to store its sequences. These are subsequences with a start and end
  59. positions in the parent reference sequence. Each sequence in the
  60. SimpleAlign object is a Bio::LocatableSeq.
  61. SimpleAlign expects the combination of name, start, and end for a
  62. given sequence to be unique in the alignment, and this is the key for the
  63. internal hashes (name, start, end are abbreviated C<nse> in the code).
  64. However, in some cases people do not want the name/start-end to be displayed:
  65. either multiple names in an alignment or names specific to the alignment
  66. (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  67. C<displayname>, and generally is what is used to print out the
  68. alignment. They default to name/start-end.
  69. The SimpleAlign Module is derived from the Align module by Ewan Birney.
  70. =head1 FEEDBACK
  71. =head2 Mailing Lists
  72. User feedback is an integral part of the evolution of this and other
  73. Bioperl modules. Send your comments and suggestions preferably to one
  74. of the Bioperl mailing lists. Your participation is much appreciated.
  75. bioperl-l@bioperl.org - General discussion
  76. http://bioperl.org/wiki/Mailing_lists - About the mailing lists
  77. =head2 Support
  78. Please direct usage questions or support issues to the mailing list:
  79. I<bioperl-l@bioperl.org>
  80. rather than to the module maintainer directly. Many experienced and
  81. reponsive experts will be able look at the problem and quickly
  82. address it. Please include a thorough description of the problem
  83. with code and data examples if at all possible.
  84. =head2 Reporting Bugs
  85. Report bugs to the Bioperl bug tracking system to help us keep track
  86. the bugs and their resolution. Bug reports can be submitted via the
  87. web:
  88. http://bugzilla.open-bio.org/
  89. =head1 AUTHOR
  90. Ewan Birney, birney@ebi.ac.uk
  91. =head1 CONTRIBUTORS
  92. Allen Day, allenday-at-ucla.edu,
  93. Richard Adams, Richard.Adams-at-ed.ac.uk,
  94. David J. Evans, David.Evans-at-vir.gla.ac.uk,
  95. Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
  96. Allen Smith, allens-at-cpan.org,
  97. Jason Stajich, jason-at-bioperl.org,
  98. Anthony Underwood, aunderwood-at-phls.org.uk,
  99. Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
  100. Brian Osborne, bosborne at alum.mit.edu
  101. Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
  102. Hongyu Zhang, forward at hongyu.org
  103. Jay Hannah, jay at jays.net
  104. Alexandr Bezginov, albezg at gmail.com
  105. =head1 SEE ALSO
  106. L<Bio::LocatableSeq>
  107. =head1 APPENDIX
  108. The rest of the documentation details each of the object
  109. methods. Internal methods are usually preceded with a _
  110. =cut
  111. # 'Let the code begin...
  112. package Bio::SimpleAlign;
  113. use vars qw(%CONSERVATION_GROUPS);
  114. use strict;
  115. use Bio::LocatableSeq; # uses Seq's as list
  116. use Bio::Seq;
  117. use Bio::SeqFeature::Generic;
  118. BEGIN {
  119. # This data should probably be in a more centralized module...
  120. # it is taken from Clustalw documentation.
  121. # These are all the positively scoring groups that occur in the
  122. # Gonnet Pam250 matrix. The strong and weak groups are
  123. # defined as strong score >0.5 and weak score =<0.5 respectively.
  124. %CONSERVATION_GROUPS = (
  125. 'strong' => [ qw(
  126. STA
  127. NEQK
  128. NHQK
  129. NDEQ
  130. QHRK
  131. MILV
  132. MILF
  133. HY
  134. FYW )],
  135. 'weak' => [ qw(
  136. CSA
  137. ATV
  138. SAG
  139. STNK
  140. STPA
  141. SGND
  142. SNDEQK
  143. NDEQHK
  144. NEQHRK
  145. FVLIM
  146. HFY )],);
  147. }
  148. use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
  149. Bio::FeatureHolderI);
  150. =head2 new
  151. Title : new
  152. Usage : my $aln = Bio::SimpleAlign->new();
  153. Function : Creates a new simple align object
  154. Returns : Bio::SimpleAlign
  155. Args : -source => string representing the source program
  156. where this alignment came from
  157. -annotation => Bio::AnnotationCollectionI
  158. -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
  159. -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
  160. -consensus => consensus string
  161. -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
  162. =cut
  163. sub new {
  164. my($class,@args) = @_;
  165. my $self = $class->SUPER::new(@args);
  166. my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
  167. SOURCE
  168. SCORE
  169. ID
  170. ACCESSION
  171. DESCRIPTION
  172. SEQS
  173. FEATURES
  174. ANNOTATION
  175. SEQ_ANNOTATION
  176. CONSENSUS
  177. CONSENSUS_META
  178. )], @args);
  179. $src && $self->source($src);
  180. defined $score && $self->score($score);
  181. # we need to set up internal hashs first!
  182. $self->{'_seq'} = {};
  183. $self->{'_order'} = {};
  184. $self->{'_start_end_lists'} = {};
  185. $self->{'_dis_name'} = {};
  186. $self->{'_id'} = 'NoName';
  187. # maybe we should automatically read in from args. Hmmm...
  188. $id && $self->id($id);
  189. $acc && $self->accession($acc);
  190. $desc && $self->description($desc);
  191. $coll && $self->annotation($coll);
  192. # sequence annotation is layered into a provided annotation collection (or dies)
  193. if ($sa) {
  194. $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
  195. "with a sequence annotation collection")
  196. if !$coll;
  197. $coll->add_Annotation('seq_annotation', $sa);
  198. }
  199. if ($feats && ref $feats eq 'ARRAY') {
  200. for my $feat (@$feats) {
  201. $self->add_SeqFeature($feat);
  202. }
  203. }
  204. $con && $self->consensus($con);
  205. $cmeta && $self->consensus_meta($cmeta);
  206. # assumes these are in correct alignment order
  207. if ($seqs && ref($seqs) eq 'ARRAY') {
  208. for my $seq (@$seqs) {
  209. $self->add_seq($seq);
  210. }
  211. }
  212. return $self; # success - we hope!
  213. }
  214. =head1 Modifier methods
  215. These methods modify the MSA by adding, removing or shuffling complete
  216. sequences.
  217. =head2 add_seq
  218. Title : add_seq
  219. Usage : $myalign->add_seq($newseq);
  220. $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
  221. Function : Adds another sequence to the alignment. *Does not* align
  222. it - just adds it to the hashes.
  223. If -ORDER is specified, the sequence is inserted at the
  224. the position spec'd by -ORDER, and existing sequences
  225. are pushed down the storage array.
  226. Returns : nothing
  227. Args : A Bio::LocatableSeq object
  228. Positive integer for the sequence position (optional)
  229. See L<Bio::LocatableSeq> for more information
  230. =cut
  231. sub addSeq {
  232. my $self = shift;
  233. $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
  234. $self->add_seq(@_);
  235. }
  236. sub add_seq {
  237. my $self = shift;
  238. my @args = @_;
  239. my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
  240. my ($name,$id,$start,$end);
  241. unless ($seq) {
  242. $self->throw("LocatableSeq argument required");
  243. }
  244. if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
  245. $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
  246. }
  247. !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
  248. $order--; # jay's patch (user-specified order is 1-origin)
  249. if ($order < 0) {
  250. $self->throw("User-specified value for ORDER must be >= 1");
  251. }
  252. $id = $seq->id() ||$seq->display_id || $seq->primary_id;
  253. # build the symbol list for this sequence,
  254. # will prune out the gap and missing/match chars
  255. # when actually asked for the symbol list in the
  256. # symbol_chars
  257. # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
  258. $name = $seq->get_nse;
  259. if( $self->{'_seq'}->{$name} ) {
  260. $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
  261. }
  262. else {
  263. $self->debug( "Assigning $name to $order\n");
  264. my $ordh = $self->{'_order'};
  265. if ($ordh->{$order}) {
  266. # make space to insert
  267. # $c->() returns (in reverse order) the first subsequence
  268. # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
  269. # (3,2,1), and $c->(2,4,5) returns (2).
  270. my $c;
  271. $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
  272. map {
  273. $ordh->{$_+1} = $ordh->{$_}
  274. } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
  275. }
  276. $ordh->{$order} = $name;
  277. unless( exists( $self->{'_start_end_lists'}->{$id})) {
  278. $self->{'_start_end_lists'}->{$id} = [];
  279. }
  280. push @{$self->{'_start_end_lists'}->{$id}}, $seq;
  281. }
  282. $self->{'_seq'}->{$name} = $seq;
  283. }
  284. =head2 remove_seq
  285. Title : remove_seq
  286. Usage : $aln->remove_seq($seq);
  287. Function : Removes a single sequence from an alignment
  288. Returns :
  289. Argument : a Bio::LocatableSeq object
  290. =cut
  291. sub removeSeq {
  292. my $self = shift;
  293. $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
  294. $self->remove_seq(@_);
  295. }
  296. sub remove_seq {
  297. my $self = shift;
  298. my $seq = shift;
  299. my ($name,$id,$start,$end);
  300. $self->throw("Need Bio::Locatable seq argument ")
  301. unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
  302. $id = $seq->id();
  303. $start = $seq->start();
  304. $end = $seq->end();
  305. $name = sprintf("%s/%d-%d",$id,$start,$end);
  306. if( !exists $self->{'_seq'}->{$name} ) {
  307. $self->throw("Sequence $name does not exist in the alignment to remove!");
  308. }
  309. delete $self->{'_seq'}->{$name};
  310. # we need to remove this seq from the start_end_lists hash
  311. if (exists $self->{'_start_end_lists'}->{$id}) {
  312. # we need to find the sequence in the array.
  313. my ($i, $found);;
  314. for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
  315. if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
  316. $found = 1;
  317. last;
  318. }
  319. }
  320. if ($found) {
  321. splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
  322. }
  323. else {
  324. $self->throw("Could not find the sequence to remoce from the start-end list");
  325. }
  326. }
  327. else {
  328. $self->throw("There is no seq list for the name $id");
  329. }
  330. # we need to shift order hash
  331. my %rev_order = reverse %{$self->{'_order'}};
  332. my $no = $rev_order{$name};
  333. my $num_sequences = $self->num_sequences;
  334. for (; $no < $num_sequences; $no++) {
  335. $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
  336. }
  337. delete $self->{'_order'}->{$no};
  338. return 1;
  339. }
  340. =head2 purge
  341. Title : purge
  342. Usage : $aln->purge(0.7);
  343. Function: Removes sequences above given sequence similarity
  344. This function will grind on large alignments. Beware!
  345. Example :
  346. Returns : An array of the removed sequences
  347. Args : float, threshold for similarity
  348. =cut
  349. sub purge {
  350. my ($self,$perc) = @_;
  351. my (%duplicate, @dups);
  352. my @seqs = $self->each_seq();
  353. for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
  354. my $seq = $seqs[$i];
  355. #skip if already in duplicate hash
  356. next if exists $duplicate{$seq->display_id} ;
  357. my $one = $seq->seq();
  358. my @one = split '', $one; #split to get 1aa per array element
  359. for (my $j=$i+1;$j < @seqs;$j++) {
  360. my $seq2 = $seqs[$j];
  361. #skip if already in duplicate hash
  362. next if exists $duplicate{$seq2->display_id} ;
  363. my $two = $seq2->seq();
  364. my @two = split '', $two;
  365. my $count = 0;
  366. my $res = 0;
  367. for (my $k=0;$k<@one;$k++) {
  368. if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
  369. $one[$k] eq $two[$k]) {
  370. $count++;
  371. }
  372. if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
  373. $two[$k] ne '.' && $two[$k] ne '-' ) {
  374. $res++;
  375. }
  376. }
  377. my $ratio = 0;
  378. $ratio = $count/$res unless $res == 0;
  379. # if above threshold put in duplicate hash and push onto
  380. # duplicate array for returning to get_unique
  381. if ( $ratio > $perc ) {
  382. $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
  383. $duplicate{$seq2->display_id} = 1;
  384. push @dups, $seq2;
  385. }
  386. }
  387. }
  388. foreach my $seq (@dups) {
  389. $self->remove_seq($seq);
  390. }
  391. return @dups;
  392. }
  393. =head2 sort_alphabetically
  394. Title : sort_alphabetically
  395. Usage : $ali->sort_alphabetically
  396. Function : Changes the order of the alignment to alphabetical on name
  397. followed by numerical by number.
  398. Returns :
  399. Argument :
  400. =cut
  401. sub sort_alphabetically {
  402. my $self = shift;
  403. my ($seq,$nse,@arr,%hash,$count);
  404. foreach $seq ( $self->each_seq() ) {
  405. $nse = $seq->get_nse;
  406. $hash{$nse} = $seq;
  407. }
  408. $count = 0;
  409. %{$self->{'_order'}} = (); # reset the hash;
  410. foreach $nse ( sort _alpha_startend keys %hash) {
  411. $self->{'_order'}->{$count} = $nse;
  412. $count++;
  413. }
  414. 1;
  415. }
  416. =head2 sort_by_list
  417. Title : sort_by_list
  418. Usage : $aln_ordered=$aln->sort_by_list($list_file)
  419. Function : Arbitrarily order sequences in an alignment
  420. Returns : A new Bio::SimpleAlign object
  421. Argument : a file listing sequence names in intended order (one name per line)
  422. =cut
  423. sub sort_by_list {
  424. my ($self, $list) = @_;
  425. my (@seq, @ids, %order);
  426. foreach my $seq ( $self->each_seq() ) {
  427. push @seq, $seq;
  428. push @ids, $seq->display_id;
  429. }
  430. my $ct=1;
  431. open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
  432. while (<$listfh>) {
  433. chomp;
  434. my $name=$_;
  435. $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
  436. $order{$name}=$ct++;
  437. }
  438. close($listfh);
  439. # use the map-sort-map idiom:
  440. my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
  441. my $aln = $self->new;
  442. foreach (@sorted) { $aln->add_seq($_) }
  443. return $aln;
  444. }
  445. =head2 set_new_reference
  446. Title : set_new_reference
  447. Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
  448. the sequence whoes name is "B31" (full, exact, and case-sensitive),
  449. as the reference (1st) sequence
  450. Function : Change/Set a new reference (i.e., the first) sequence
  451. Returns : a new Bio::SimpleAlign object.
  452. Throws an exception if designated sequence not found
  453. Argument : a positive integer of sequence order, or a sequence name
  454. in the original alignment
  455. =cut
  456. sub set_new_reference {
  457. my ($self, $seqid) = @_;
  458. my $aln = $self->new;
  459. my (@seq, @ids, @new_seq);
  460. my $is_num=0;
  461. foreach my $seq ( $self->each_seq() ) {
  462. push @seq, $seq;
  463. push @ids, $seq->display_id;
  464. }
  465. if ($seqid =~ /^\d+$/) { # argument is seq position
  466. $is_num=1;
  467. $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
  468. } else { # argument is a seq name
  469. $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
  470. }
  471. for (my $i=0; $i<=$#seq; $i++) {
  472. my $pos=$i+1;
  473. if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
  474. unshift @new_seq, $seq[$i];
  475. } else {
  476. push @new_seq, $seq[$i];
  477. }
  478. }
  479. foreach (@new_seq) { $aln->add_seq($_); }
  480. return $aln;
  481. }
  482. sub _in_aln { # check if input name exists in the alignment
  483. my ($str, $ref) = @_;
  484. foreach (@$ref) {
  485. return 1 if $str eq $_;
  486. }
  487. return 0;
  488. }
  489. =head2 uniq_seq
  490. Title : uniq_seq
  491. Usage : $aln->uniq_seq(): Remove identical sequences in
  492. in the alignment. Ambiguous base ("N", "n") and
  493. leading and ending gaps ("-") are NOT counted as
  494. differences.
  495. Function : Make a new alignment of unique sequence types (STs)
  496. Returns : 1a. if called in a scalar context,
  497. a new Bio::SimpleAlign object (all sequences renamed as "ST")
  498. 1b. if called in an array context,
  499. a new Bio::SimpleAlign object, and a hashref whose keys
  500. are sequence types, and whose values are arrayrefs to
  501. lists of sequence ids within the corresponding sequence type
  502. 2. if $aln->verbose > 0, ST of each sequence is sent to
  503. STDERR (in a tabular format)
  504. Argument : None
  505. =cut
  506. sub uniq_seq {
  507. my ($self, $seqid) = @_;
  508. my $aln = $self->new;
  509. my (%member, %order, @seq, @uniq_str, $st);
  510. my $order=0;
  511. my $len = $self->length();
  512. $st = {};
  513. foreach my $seq ( $self->each_seq() ) {
  514. my $str = $seq->seq();
  515. # it's necessary to ignore "n", "N", leading gaps and ending gaps in
  516. # comparing two sequence strings
  517. # 1st, convert "n", "N" to "?" (for DNA sequence only):
  518. $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
  519. # 2nd, convert leading and ending gaps to "?":
  520. $str = &_convert_leading_ending_gaps($str, '-', '?');
  521. # Note that '?' also can mean unknown residue.
  522. # I don't like making global class member changes like this, too
  523. # prone to errors... -- cjfields 08-11-18
  524. local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
  525. my $new = Bio::LocatableSeq->new(
  526. -id => $seq->id(),
  527. -alphabet=> $seq->alphabet,
  528. -seq => $str,
  529. -start => $seq->start,
  530. -end => $seq->end
  531. );
  532. push @seq, $new;
  533. }
  534. foreach my $seq (@seq) {
  535. my $str = $seq->seq();
  536. my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
  537. if ($seen) { # seen before
  538. my @memb = @{$member{$key}};
  539. push @memb, $seq;
  540. $member{$key} = \@memb;
  541. } else { # not seen
  542. push @uniq_str, $key;
  543. $order++;
  544. $member{$key} = [ ($seq) ];
  545. $order{$key} = $order;
  546. }
  547. }
  548. foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
  549. # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
  550. my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
  551. # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
  552. $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
  553. my $gap='-';
  554. my $end= CORE::length($str2);
  555. $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
  556. my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
  557. -seq =>$str2,
  558. -start=>1,
  559. -end =>$end
  560. );
  561. $aln->add_seq($new);
  562. foreach (@{$member{$str}}) {
  563. push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
  564. $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
  565. }
  566. }
  567. return wantarray ? ($aln, $st) : $aln;
  568. }
  569. sub _check_uniq { # check if same seq exists in the alignment
  570. my ($str1, $ref, $length) = @_;
  571. my @char1=split //, $str1;
  572. my @array=@$ref;
  573. return (0, $str1) if @array==0; # not seen (1st sequence)
  574. foreach my $str2 (@array) {
  575. my $diff=0;
  576. my @char2=split //, $str2;
  577. for (my $i=0; $i<=$length-1; $i++) {
  578. next if $char1[$i] eq '?';
  579. next if $char2[$i] eq '?';
  580. $diff++ if $char1[$i] ne $char2[$i];
  581. }
  582. return (1, $str2) if $diff == 0; # seen before
  583. }
  584. return (0, $str1); # not seen
  585. }
  586. sub _convert_leading_ending_gaps {
  587. my $s=shift;
  588. my $sym1=shift;
  589. my $sym2=shift;
  590. my @array=split //, $s;
  591. # convert leading char:
  592. for (my $i=0; $i<=$#array; $i++) {
  593. ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
  594. }
  595. # convert ending char:
  596. for (my $i = $#array; $i>= 0; $i--) {
  597. ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
  598. }
  599. my $s_new=join '', @array;
  600. return $s_new;
  601. }
  602. =head1 Sequence selection methods
  603. Methods returning one or more sequences objects.
  604. =head2 each_seq
  605. Title : each_seq
  606. Usage : foreach $seq ( $align->each_seq() )
  607. Function : Gets a Seq object from the alignment
  608. Returns : Seq object
  609. Argument :
  610. =cut
  611. sub eachSeq {
  612. my $self = shift;
  613. $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
  614. $self->each_seq();
  615. }
  616. sub each_seq {
  617. my $self = shift;
  618. my (@arr,$order);
  619. foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
  620. if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
  621. push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
  622. }
  623. }
  624. return @arr;
  625. }
  626. =head2 each_alphabetically
  627. Title : each_alphabetically
  628. Usage : foreach $seq ( $ali->each_alphabetically() )
  629. Function : Returns a sequence object, but the objects are returned
  630. in alphabetically sorted order.
  631. Does not change the order of the alignment.
  632. Returns : Seq object
  633. Argument :
  634. =cut
  635. sub each_alphabetically {
  636. my $self = shift;
  637. my ($seq,$nse,@arr,%hash,$count);
  638. foreach $seq ( $self->each_seq() ) {
  639. $nse = $seq->get_nse;
  640. $hash{$nse} = $seq;
  641. }
  642. foreach $nse ( sort _alpha_startend keys %hash) {
  643. push(@arr,$hash{$nse});
  644. }
  645. return @arr;
  646. }
  647. sub _alpha_startend {
  648. my ($aname,$astart,$bname,$bstart);
  649. ($aname,$astart) = split (/-/,$a);
  650. ($bname,$bstart) = split (/-/,$b);
  651. if( $aname eq $bname ) {
  652. return $astart <=> $bstart;
  653. }
  654. else {
  655. return $aname cmp $bname;
  656. }
  657. }
  658. =head2 each_seq_with_id
  659. Title : each_seq_with_id
  660. Usage : foreach $seq ( $align->each_seq_with_id() )
  661. Function : Gets a Seq objects from the alignment, the contents
  662. being those sequences with the given name (there may be
  663. more than one)
  664. Returns : Seq object
  665. Argument : a seq name
  666. =cut
  667. sub eachSeqWithId {
  668. my $self = shift;
  669. $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
  670. $self->each_seq_with_id(@_);
  671. }
  672. sub each_seq_with_id {
  673. my $self = shift;
  674. my $id = shift;
  675. $self->throw("Method each_seq_with_id needs a sequence name argument")
  676. unless defined $id;
  677. my (@arr, $seq);
  678. if (exists($self->{'_start_end_lists'}->{$id})) {
  679. @arr = @{$self->{'_start_end_lists'}->{$id}};
  680. }
  681. return @arr;
  682. }
  683. =head2 get_seq_by_pos
  684. Title : get_seq_by_pos
  685. Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
  686. Function : Gets a sequence based on its position in the alignment.
  687. Numbering starts from 1. Sequence positions larger than
  688. num_sequences() will thow an error.
  689. Returns : a Bio::LocatableSeq object
  690. Args : positive integer for the sequence position
  691. =cut
  692. sub get_seq_by_pos {
  693. my $self = shift;
  694. my ($pos) = @_;
  695. $self->throw("Sequence position has to be a positive integer, not [$pos]")
  696. unless $pos =~ /^\d+$/ and $pos > 0;
  697. $self->throw("No sequence at position [$pos]")
  698. unless $pos <= $self->num_sequences ;
  699. my $nse = $self->{'_order'}->{--$pos};
  700. return $self->{'_seq'}->{$nse};
  701. }
  702. =head2 get_seq_by_id
  703. Title : get_seq_by_id
  704. Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
  705. Function : Gets a sequence based on its name.
  706. Sequences that do not exist will warn and return undef
  707. Returns : a Bio::LocatableSeq object
  708. Args : string for sequence name
  709. =cut
  710. sub get_seq_by_id {
  711. my ($self,$name) = @_;
  712. unless( defined $name ) {
  713. $self->warn("Must provide a sequence name");
  714. return;
  715. }
  716. for my $seq ( values %{$self->{'_seq'}} ) {
  717. if ( $seq->id eq $name) {
  718. return $seq;
  719. }
  720. }
  721. return;
  722. }
  723. =head2 seq_with_features
  724. Title : seq_with_features
  725. Usage : $seq = $aln->seq_with_features(-pos => 1,
  726. -consensus => 60
  727. -mask =>
  728. sub { my $consensus = shift;
  729. for my $i (1..5){
  730. my $n = 'N' x $i;
  731. my $q = '\?' x $i;
  732. while($consensus =~ /[^?]$q[^?]/){
  733. $consensus =~ s/([^?])$q([^?])/$1$n$2/;
  734. }
  735. }
  736. return $consensus;
  737. }
  738. );
  739. Function: produces a Bio::Seq object by first splicing gaps from -pos
  740. (by means of a splice_by_seq_pos() call), then creating
  741. features using non-? chars (by means of a consensus_string()
  742. call with stringency -consensus).
  743. Returns : a Bio::Seq object
  744. Args : -pos : required. sequence from which to build the Bio::Seq
  745. object
  746. -consensus : optional, defaults to consensus_string()'s
  747. default cutoff value
  748. -mask : optional, a coderef to apply to consensus_string()'s
  749. output before building features. this may be useful for
  750. closing gaps of 1 bp by masking over them with N, for
  751. instance
  752. =cut
  753. sub seq_with_features{
  754. my ($self,%arg) = @_;
  755. #first do the preparatory splice
  756. $self->throw("must provide a -pos argument") unless $arg{-pos};
  757. $self->splice_by_seq_pos($arg{-pos});
  758. my $consensus_string = $self->consensus_string($arg{-consensus});
  759. $consensus_string = $arg{-mask}->($consensus_string)
  760. if defined($arg{-mask});
  761. my(@bs,@es);
  762. push @bs, 1 if $consensus_string =~ /^[^?]/;
  763. while($consensus_string =~ /\?[^?]/g){
  764. push @bs, pos($consensus_string);
  765. }
  766. while($consensus_string =~ /[^?]\?/g){
  767. push @es, pos($consensus_string);
  768. }
  769. push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
  770. my $seq = Bio::Seq->new();
  771. # my $rootfeature = Bio::SeqFeature::Generic->new(
  772. # -source_tag => 'location',
  773. # -start => $self->get_seq_by_pos($arg{-pos})->start,
  774. # -end => $self->get_seq_by_pos($arg{-pos})->end,
  775. # );
  776. # $seq->add_SeqFeature($rootfeature);
  777. while(my $b = shift @bs){
  778. my $e = shift @es;
  779. $seq->add_SeqFeature(
  780. Bio::SeqFeature::Generic->new(
  781. -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
  782. -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
  783. -source_tag => $self->source || 'MSA',
  784. )
  785. );
  786. }
  787. return $seq;
  788. }
  789. =head1 Create new alignments
  790. The result of these methods are horizontal or vertical subsets of the
  791. current MSA.
  792. =head2 select
  793. Title : select
  794. Usage : $aln2 = $aln->select(1, 3) # three first sequences
  795. Function : Creates a new alignment from a continuous subset of
  796. sequences. Numbering starts from 1. Sequence positions
  797. larger than num_sequences() will thow an error.
  798. Returns : a Bio::SimpleAlign object
  799. Args : positive integer for the first sequence
  800. positive integer for the last sequence to include (optional)
  801. =cut
  802. sub select {
  803. my $self = shift;
  804. my ($start, $end) = @_;
  805. $self->throw("Select start has to be a positive integer, not [$start]")
  806. unless $start =~ /^\d+$/ and $start > 0;
  807. $self->throw("Select end has to be a positive integer, not [$end]")
  808. unless $end =~ /^\d+$/ and $end > 0;
  809. $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
  810. unless $start <= $end;
  811. my $aln = $self->new;
  812. foreach my $pos ($start .. $end) {
  813. $aln->add_seq($self->get_seq_by_pos($pos));
  814. }
  815. $aln->id($self->id);
  816. # fix for meta, sf, ann
  817. return $aln;
  818. }
  819. =head2 select_noncont
  820. Title : select_noncont
  821. Usage : # 1st and 3rd sequences, sorted
  822. $aln2 = $aln->select_noncont(1, 3)
  823. # 1st and 3rd sequences, sorted (same as first)
  824. $aln2 = $aln->select_noncont(3, 1)
  825. # 1st and 3rd sequences, unsorted
  826. $aln2 = $aln->select_noncont('nosort',3, 1)
  827. Function : Creates a new alignment from a subset of sequences. Numbering
  828. starts from 1. Sequence positions larger than num_sequences() will
  829. throw an error. Sorts the order added to new alignment by default,
  830. to prevent sorting pass 'nosort' as the first argument in the list.
  831. Returns : a Bio::SimpleAlign object
  832. Args : array of integers for the sequences. If the string 'nosort' is
  833. passed as the first argument, the sequences will not be sorted
  834. in the new alignment but will appear in the order listed.
  835. =cut
  836. sub select_noncont {
  837. my $self = shift;
  838. my $nosort = 0;
  839. my (@pos) = @_;
  840. if ($pos[0] !~ m{^\d+$}) {
  841. my $sortcmd = shift @pos;
  842. if ($sortcmd eq 'nosort') {
  843. $nosort = 1;
  844. } else {
  845. $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
  846. }
  847. }
  848. my $end = $self->num_sequences;
  849. foreach ( @pos ) {
  850. $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
  851. unless( /^\d+$/ && $_ > 0 && $_ <= $end );
  852. }
  853. @pos = sort {$a <=> $b} @pos unless $nosort;
  854. my $aln = $self->new;
  855. foreach my $p (@pos) {
  856. $aln->add_seq($self->get_seq_by_pos($p));
  857. }
  858. $aln->id($self->id);
  859. # fix for meta, sf, ann
  860. return $aln;
  861. }
  862. =head2 slice
  863. Title : slice
  864. Usage : $aln2 = $aln->slice(20,30)
  865. Function : Creates a slice from the alignment inclusive of start and
  866. end columns, and the first column in the alignment is denoted 1.
  867. Sequences with no residues in the slice are excluded from the
  868. new alignment and a warning is printed. Slice beyond the length of
  869. the sequence does not do padding.
  870. Returns : A Bio::SimpleAlign object
  871. Args : Positive integer for start column, positive integer for end column,
  872. optional boolean which if true will keep gap-only columns in the newly
  873. created slice. Example:
  874. $aln2 = $aln->slice(20,30,1)
  875. =cut
  876. sub slice {
  877. my $self = shift;
  878. my ($start, $end, $keep_gap_only) = @_;
  879. $self->throw("Slice start has to be a positive integer, not [$start]")
  880. unless $start =~ /^\d+$/ and $start > 0;
  881. $self->throw("Slice end has to be a positive integer, not [$end]")
  882. unless $end =~ /^\d+$/ and $end > 0;
  883. $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
  884. unless $start <= $end;
  885. $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
  886. "[$start] is too big.") if $start > $self->length;
  887. my $cons_meta = $self->consensus_meta;
  888. my $aln = $self->new;
  889. $aln->id($self->id);
  890. foreach my $seq ( $self->each_seq() ) {
  891. my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
  892. Bio::Seq::Meta->new
  893. (-id => $seq->id,
  894. -alphabet => $seq->alphabet,
  895. -strand => $seq->strand,
  896. -verbose => $self->verbose) :
  897. Bio::LocatableSeq->new
  898. (-id => $seq->id,
  899. -alphabet => $seq->alphabet,
  900. -strand => $seq->strand,
  901. -verbose => $self->verbose);
  902. # seq
  903. my $seq_end = $end;
  904. $seq_end = $seq->length if( $end > $seq->length );
  905. my $slice_seq = $seq->subseq($start, $seq_end);
  906. $new_seq->seq( $slice_seq );
  907. $slice_seq =~ s/\W//g;
  908. if ($start > 1) {
  909. my $pre_start_seq = $seq->subseq(1, $start - 1);
  910. $pre_start_seq =~ s/\W//g;
  911. if (!defined($seq->strand)) {
  912. $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
  913. } elsif ($seq->strand < 0){
  914. $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
  915. } else {
  916. $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
  917. }
  918. } else {
  919. if ((defined $seq->strand)&&($seq->strand < 0)){
  920. $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
  921. } else {
  922. $new_seq->start( $seq->start);
  923. }
  924. }
  925. if ($new_seq->isa('Bio::Seq::MetaI')) {
  926. for my $meta_name ($seq->meta_names) {
  927. $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
  928. }
  929. }
  930. $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
  931. if ($new_seq->start and $new_seq->end >= $new_seq->start) {
  932. $aln->add_seq($new_seq);
  933. } else {
  934. if( $keep_gap_only ) {
  935. $aln->add_seq($new_seq);
  936. } else {
  937. my $nse = $seq->get_nse();
  938. $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
  939. " Sequence excluded from the new alignment.");
  940. }
  941. }
  942. }
  943. if ($cons_meta) {
  944. my $new = Bio::Seq::Meta->new();
  945. for my $meta_name ($cons_meta->meta_names) {
  946. $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
  947. }
  948. $aln->consensus_meta($new);
  949. }
  950. $aln->annotation($self->annotation);
  951. # fix for meta, sf, ann
  952. return $aln;
  953. }
  954. =head2 remove_columns
  955. Title : remove_columns
  956. Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
  957. $aln2 = $aln->remove_columns([0,0],[6,8])
  958. Function : Creates an aligment with columns removed corresponding to
  959. the specified type or by specifying the columns by number.
  960. Returns : Bio::SimpleAlign object
  961. Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
  962. 'all_gaps_columns') or array ref where the referenced array
  963. contains a pair of integers that specify a range.
  964. The first column is 0
  965. =cut
  966. sub remove_columns {
  967. my ($self,@args) = @_;
  968. @args || $self->throw("Must supply column ranges or column types");
  969. my $aln;
  970. if ($args[0][0] =~ /^[a-z_]+$/i) {
  971. $aln = $self->_remove_columns_by_type($args[0]);
  972. } elsif ($args[0][0] =~ /^\d+$/) {
  973. $aln = $self->_remove_columns_by_num(\@args);
  974. } else {
  975. $self->throw("You must pass array references to remove_columns(), not @args");
  976. }
  977. # fix for meta, sf, ann
  978. $aln;
  979. }
  980. =head2 remove_gaps
  981. Title : remove_gaps
  982. Usage : $aln2 = $aln->remove_gaps
  983. Function : Creates an aligment with gaps removed
  984. Returns : a Bio::SimpleAlign object
  985. Args : a gap character(optional) if none specified taken
  986. from $self->gap_char,
  987. [optional] $all_gaps_columns flag (1 or 0, default is 0)
  988. indicates that only all-gaps columns should be deleted
  989. Used from method L<remove_columns> in most cases. Set gap character
  990. using L<gap_char()|gap_char>.
  991. =cut
  992. sub remove_gaps {
  993. my ($self,$gapchar,$all_gaps_columns) = @_;
  994. my $gap_line;
  995. if ($all_gaps_columns) {
  996. $gap_line = $self->all_gap_line($gapchar);
  997. } else {
  998. $gap_line = $self->gap_line($gapchar);
  999. }
  1000. my $aln = $self->new;
  1001. my @remove;
  1002. my $length = 0;
  1003. my $del_char = $gapchar || $self->gap_char;
  1004. # Do the matching to get the segments to remove
  1005. while ($gap_line =~ m/[$del_char]/g) {
  1006. my $start = pos($gap_line)-1;
  1007. $gap_line=~/\G[$del_char]+/gc;
  1008. my $end = pos($gap_line)-1;
  1009. #have to offset the start and end for subsequent removes
  1010. $start-=$length;
  1011. $end -=$length;
  1012. $length += ($end-$start+1);
  1013. push @remove, [$start,$end];
  1014. }
  1015. #remove the segments
  1016. $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
  1017. # fix for meta, sf, ann
  1018. return $aln;
  1019. }
  1020. sub _remove_col {
  1021. my ($self,$aln,$remove) = @_;
  1022. my @new;
  1023. my $gap = $self->gap_char;
  1024. # splice out the segments and create new seq
  1025. foreach my $seq($self->each_seq){
  1026. my $new_seq = Bio::LocatableSeq->new(
  1027. -id => $seq->id,
  1028. -alphabet=> $seq->alphabet,
  1029. -strand => $seq->strand,
  1030. -verbose => $self->verbose);
  1031. my $sequence = $seq->seq;
  1032. foreach my $pair(@{$remove}){
  1033. my $start = $pair->[0];
  1034. my $end = $pair->[1];
  1035. $sequence = $seq->seq unless $sequence;
  1036. my $orig = $sequence;
  1037. my $head = $start > 0 ? substr($sequence, 0, $start) : '';
  1038. my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
  1039. $sequence = $head.$tail;
  1040. # start
  1041. unless (defined $new_seq->start) {
  1042. if ($start == 0) {
  1043. my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
  1044. $new_seq->start($seq->start + $end + 1 - $start_adjust);
  1045. }
  1046. else {
  1047. my $start_adjust = $orig =~ /^$gap+/;
  1048. if ($start_adjust) {
  1049. $start_adjust = $+[0] == $start;
  1050. }
  1051. $new_seq->start($seq->start + $start_adjust);
  1052. }
  1053. }
  1054. # end
  1055. if (($end + 1) >= CORE::length($orig)) {
  1056. my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
  1057. $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
  1058. }
  1059. else {
  1060. $new_seq->end($seq->end);
  1061. }
  1062. }
  1063. if ($new_seq->end < $new_seq->start) {
  1064. # we removed all columns except for gaps: set to 0 to indicate no
  1065. # sequence
  1066. $new_seq->start(0);
  1067. $new_seq->end(0);
  1068. }
  1069. $new_seq->seq($sequence) if $sequence;
  1070. push @new, $new_seq;
  1071. }
  1072. # add the new seqs to the alignment
  1073. foreach my $new(@new){
  1074. $aln->add_seq($new);
  1075. }
  1076. # fix for meta, sf, ann
  1077. return $aln;
  1078. }
  1079. sub _remove_columns_by_type {
  1080. my ($self,$type) = @_;
  1081. my $aln = $self->new;
  1082. my @remove;
  1083. my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
  1084. my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
  1085. my %matchchars = ( 'match' => '\*',
  1086. 'weak' => '\.',
  1087. 'strong' => ':',
  1088. 'mismatch' => ' ',
  1089. 'gaps' => '',
  1090. 'all_gaps_columns' => ''
  1091. );
  1092. # get the characters to delete against
  1093. my $del_char;
  1094. foreach my $type (@{$type}){
  1095. $del_char.= $matchchars{$type};
  1096. }
  1097. my $length = 0;
  1098. my $match_line = $self->match_line;
  1099. # do the matching to get the segments to remove
  1100. if($del_char){
  1101. while($match_line =~ m/[$del_char]/g ){
  1102. my $start = pos($match_line)-1;
  1103. $match_line=~/\G[$del_char]+/gc;
  1104. my $end = pos($match_line)-1;
  1105. #have to offset the start and end for subsequent removes
  1106. $start-=$length;
  1107. $end -=$length;
  1108. $length += ($end-$start+1);
  1109. push @remove, [$start,$end];
  1110. }
  1111. }
  1112. # remove the segments
  1113. $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
  1114. $aln = $aln->remove_gaps() if $gap;
  1115. $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
  1116. # fix for meta, sf, ann
  1117. $aln;
  1118. }
  1119. sub _remove_columns_by_num {
  1120. my ($self,$positions) = @_;
  1121. my $aln = $self->new;
  1122. # sort the positions
  1123. @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
  1124. my @remove;
  1125. my $length = 0;
  1126. foreach my $pos (@{$positions}) {
  1127. my ($start, $end) = @{$pos};
  1128. #have to offset the start and end for subsequent removes
  1129. $start-=$length;
  1130. $end -=$length;
  1131. $length += ($end-$start+1);
  1132. push @remove, [$start,$end];
  1133. }
  1134. #remove the segments
  1135. $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
  1136. # fix for meta, sf, ann
  1137. $aln;
  1138. }
  1139. =head1 Change sequences within the MSA
  1140. These methods affect characters in all sequences without changing the
  1141. alignment.
  1142. =head2 splice_by_seq_pos
  1143. Title : splice_by_seq_pos
  1144. Usage : $status = splice_by_seq_pos(1);
  1145. Function: splices all aligned sequences where the specified sequence
  1146. has gaps.
  1147. Example :
  1148. Returns : 1 on success
  1149. Args : position of sequence to splice by
  1150. =cut
  1151. sub splice_by_seq_pos{
  1152. my ($self,$pos) = @_;
  1153. my $guide = $self->get_seq_by_pos($pos);
  1154. my $guide_seq = $guide->seq;
  1155. $guide_seq =~ s/\./\-/g;
  1156. my @gaps = ();
  1157. $pos = -1;
  1158. while(($pos = index($guide_seq, '-', $pos)) > -1 ){
  1159. unshift @gaps, $pos;
  1160. $pos++;
  1161. }
  1162. foreach my $seq ($self->each_seq){
  1163. my @bases = split '', $seq->seq;
  1164. splice(@bases, $_, 1) foreach @gaps;
  1165. $seq->seq(join('', @bases));
  1166. }
  1167. 1;
  1168. }
  1169. =head2 map_chars
  1170. Title : map_chars
  1171. Usage : $ali->map_chars('\.','-')
  1172. Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
  1173. characters
  1174. Notice that the from (arg1) is interpretted as a regex,
  1175. so be careful about quoting meta characters (eg
  1176. $ali->map_chars('.','-') wont do what you want)
  1177. Returns :
  1178. Argument : 'from' rexexp
  1179. 'to' string
  1180. =cut
  1181. sub map_chars {
  1182. my $self = shift;
  1183. my $from = shift;
  1184. my $to = shift;
  1185. my ($seq,$temp);
  1186. $self->throw("Need exactly two arguments")
  1187. unless defined $from and defined $to;
  1188. foreach $seq ( $self->each_seq() ) {
  1189. $temp = $seq->seq();
  1190. $temp =~ s/$from/$to/g;
  1191. $seq->seq($temp);
  1192. }
  1193. return 1;
  1194. }
  1195. =head2 uppercase
  1196. Title : uppercase()
  1197. Usage : $ali->uppercase()
  1198. Function : Sets all the sequences to uppercase
  1199. Returns :
  1200. Argument :
  1201. =cut
  1202. sub uppercase {
  1203. my $self = shift;
  1204. my $seq;
  1205. my $temp;
  1206. foreach $seq ( $self->each_seq() ) {
  1207. $temp = $seq->seq();
  1208. $temp =~ tr/[a-z]/[A-Z]/;
  1209. $seq->seq($temp);
  1210. }
  1211. return 1;
  1212. }
  1213. =head2 cigar_line
  1214. Title : cigar_line()
  1215. Usage : %cigars = $align->cigar_line()
  1216. Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
  1217. Report) line for each sequence in the alignment. Examples are
  1218. "1,60" or "5,10:12,58", where the numbers refer to conserved
  1219. positions within the alignment. The keys of the hash are the
  1220. NSEs (name/start/end) assigned to each sequence.
  1221. Args : threshold (optional, defaults to 100)
  1222. Returns : Hash of strings (cigar lines)
  1223. =cut
  1224. sub cigar_line {
  1225. my $self = shift;
  1226. my $thr=shift||100;
  1227. my %cigars;
  1228. my @consensus = split "",($self->consensus_string($thr));
  1229. my $len = $self->length;
  1230. my $gapchar = $self->gap_char;
  1231. # create a precursor, something like (1,4,5,6,7,33,45),
  1232. # where each number corresponds to a conserved position
  1233. foreach my $seq ( $self->each_seq ) {
  1234. my @seq = split "", uc ($seq->seq);
  1235. my $pos = 1;
  1236. for (my $x = 0 ; $x < $len ; $x++ ) {
  1237. if ($seq[$x] eq $consensus[$x]) {
  1238. push @{$cigars{$seq->get_nse}},$pos;
  1239. $pos++;
  1240. } elsif ($seq[$x] ne $gapchar) {
  1241. $pos++;
  1242. }
  1243. }
  1244. }
  1245. # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
  1246. for my $name (keys %cigars) {
  1247. splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
  1248. ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
  1249. push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
  1250. ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
  1251. ${$cigars{$name}}[$#{$cigars{$name}}] );
  1252. for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
  1253. if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
  1254. ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
  1255. splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
  1256. }
  1257. }
  1258. }
  1259. # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
  1260. for my $name (keys %cigars) {
  1261. my @remove;
  1262. for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
  1263. if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
  1264. ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
  1265. unshift @remove,$x;
  1266. }
  1267. }
  1268. for my $pos (@remove) {
  1269. splice @{$cigars{$name}}, $pos, 1;
  1270. }
  1271. }
  1272. # join and punctuate
  1273. for my $name (keys %cigars) {
  1274. my ($start,$end,$str) = "";
  1275. while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
  1276. $str .= ($start . "," . $end . ":");
  1277. }
  1278. $str =~ s/:$//;
  1279. $cigars{$name} = $str;
  1280. }
  1281. %cigars;
  1282. }
  1283. =head2 match_line
  1284. Title : match_line()
  1285. Usage : $line = $align->match_line()
  1286. Function : Generates a match line - much like consensus string
  1287. except that a line indicating the '*' for a match.
  1288. Args : (optional) Match line characters ('*' by default)
  1289. (optional) Strong match char (':' by default)
  1290. (optional) Weak match char ('.' by default)
  1291. Returns : String
  1292. =cut
  1293. sub match_line {
  1294. my ($self,$matchlinechar, $strong, $weak) = @_;
  1295. my %matchchars = ('match' => $matchlinechar || '*',
  1296. 'weak' => $weak || '.',
  1297. 'strong' => $strong || ':',
  1298. 'mismatch' => ' ',
  1299. );
  1300. my @seqchars;
  1301. my $alphabet;
  1302. foreach my $seq ( $self->each_seq ) {
  1303. push @seqchars, [ split(//, uc ($seq->seq)) ];
  1304. $alphabet = $seq->alphabet unless defined $alphabet;
  1305. }
  1306. my $refseq = shift @seqchars;
  1307. # let's just march down the columns
  1308. my $matchline;
  1309. POS:
  1310. foreach my $pos ( 0..$self->length ) {
  1311. my $refchar = $refseq->[$pos];
  1312. my $char = $matchchars{'mismatch'};
  1313. unless( defined $refchar ) {
  1314. last if $pos == $self->length; # short circuit on last residue
  1315. # this in place to handle jason's soon-to-be-committed
  1316. # intron mapping code
  1317. goto bottom;
  1318. }
  1319. my %col = ($refchar => 1);
  1320. my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
  1321. foreach my $seq ( @seqchars ) {
  1322. next if $pos >= scalar @$seq;
  1323. $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
  1324. $seq->[$pos] eq ' ' );
  1325. $col{$seq->[$pos]}++ if defined $seq->[$pos];
  1326. }
  1327. my @colresidues = sort keys %col;
  1328. # if all the values are the same
  1329. if( $dash ) { $char = $matchchars{'mismatch'} }
  1330. elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
  1331. elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
  1332. # matches for protein seqs
  1333. TYPE:
  1334. foreach my $type ( qw(strong weak) ) {
  1335. # iterate through categories
  1336. my %groups;
  1337. # iterate through each of the aa in the col
  1338. # look to see which groups it is in
  1339. foreach my $c ( @colresidues ) {
  1340. foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
  1341. push @{$groups{$f}},$c;
  1342. }
  1343. }
  1344. GRP:
  1345. foreach my $cols ( values %groups ) {
  1346. @$cols = sort @$cols;
  1347. # now we are just testing to see if two arrays
  1348. # are identical w/o changing either one
  1349. # have to be same len
  1350. next if( scalar @$cols != scalar @colresidues );
  1351. # walk down the length and check each slot
  1352. for($_=0;$_ < (scalar @$cols);$_++ ) {
  1353. next GRP if( $cols->[$_] ne $colresidues[$_] );
  1354. }
  1355. $char = $matchchars{$type};
  1356. last TYPE;
  1357. }
  1358. }
  1359. }
  1360. bottom:
  1361. $matchline .= $char;
  1362. }
  1363. return $matchline;
  1364. }
  1365. =head2 gap_line
  1366. Title : gap_line()
  1367. Usage : $line = $align->gap_line()
  1368. Function : Generates a gap line - much like consensus string
  1369. except that a line where '-' represents gap
  1370. Args : (optional) gap line characters ('-' by default)
  1371. Returns : string
  1372. =cut
  1373. sub gap_line {
  1374. my ($self,$gapchar) = @_;
  1375. $gapchar = $gapchar || $self->gap_char;
  1376. my %gap_hsh; # column gaps vector
  1377. foreach my $seq ( $self->each_seq ) {
  1378. my $i = 0;
  1379. map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
  1380. map {[$i++, $_]} split(//, uc ($seq->seq));
  1381. }
  1382. my $gap_line;
  1383. foreach my $pos ( 0..$self->length-1 ) {
  1384. $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
  1385. }
  1386. return $gap_line;
  1387. }
  1388. =head2 all_gap_line
  1389. Title : all_gap_line()
  1390. Usage : $line = $align->all_gap_line()
  1391. Function : Generates a gap line - much like consensus string
  1392. except that a line where '-' represents all-gap column
  1393. Args : (optional) gap line characters ('-' by default)
  1394. Returns : string
  1395. =cut
  1396. sub all_gap_line {
  1397. my ($self,$gapchar) = @_;
  1398. $gapchar = $gapchar || $self->gap_char;
  1399. my %gap_hsh; # column gaps counter hash
  1400. my @seqs = $self->each_seq;
  1401. foreach my $seq ( @seqs ) {
  1402. my $i = 0;
  1403. map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
  1404. map {[$i++, $_]} split(//, uc ($seq->seq));
  1405. }
  1406. my $gap_line;
  1407. foreach my $pos ( 0..$self->length-1 ) {
  1408. if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
  1409. # gaps column
  1410. $gap_line .= $gapchar;
  1411. } else {
  1412. $gap_line .= '.';
  1413. }
  1414. }
  1415. return $gap_line;
  1416. }
  1417. =head2 gap_col_matrix
  1418. Title : gap_col_matrix()
  1419. Usage : my $cols = $align->gap_col_matrix()
  1420. Function : Generates an array of hashes where
  1421. each entry in the array is a hash reference
  1422. with keys of all the sequence names and
  1423. and value of 1 or 0 if the sequence has a gap at that column
  1424. Args : (optional) gap line characters ($aln->gap_char or '-' by default)
  1425. =cut
  1426. sub gap_col_matrix {
  1427. my ($self,$gapchar) = @_;
  1428. $gapchar = $gapchar || $self->gap_char;
  1429. my %gap_hsh; # column gaps vector
  1430. my @cols;
  1431. foreach my $seq ( $self->each_seq ) {
  1432. my $i = 0;
  1433. my $str = $seq->seq;
  1434. my $len = $seq->length;
  1435. my $ch;
  1436. my $id = $seq->display_id;
  1437. while( $i < $len ) {
  1438. $ch = substr($str, $i, 1);
  1439. $cols[$i++]->{$id} = ($ch eq $gapchar);
  1440. }
  1441. }
  1442. return \@cols;
  1443. }
  1444. =head2 match
  1445. Title : match()
  1446. Usage : $ali->match()
  1447. Function : Goes through all columns and changes residues that are
  1448. identical to residue in first sequence to match '.'
  1449. character. Sets match_char.
  1450. USE WITH CARE: Most MSA formats do not support match
  1451. characters in sequences, so this is mostly for output
  1452. only. NEXUS format (Bio::AlignIO::nexus) can handle
  1453. it.
  1454. Returns : 1
  1455. Argument : a match character, optional, defaults to '.'
  1456. =cut
  1457. sub match {
  1458. my ($self, $match) = @_;
  1459. $match ||= '.';
  1460. my ($matching_char) = $match;
  1461. $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
  1462. $self->map_chars($matching_char, '-');
  1463. my @seqs = $self->each_seq();
  1464. return 1 unless scalar @seqs > 1;
  1465. my $refseq = shift @seqs ;
  1466. my @refseq = split //, $refseq->seq;
  1467. my $gapchar = $self->gap_char;
  1468. foreach my $seq ( @seqs ) {
  1469. my @varseq = split //, $seq->seq();
  1470. for ( my $i=0; $i < scalar @varseq; $i++) {
  1471. $varseq[$i] = $match if defined $refseq[$i] &&
  1472. ( $refseq[$i] =~ /[A-Za-z\*]/ ||
  1473. $refseq[$i] =~ /$gapchar/ )
  1474. && $refseq[$i] eq $varseq[$i];
  1475. }
  1476. $seq->seq(join '', @varseq);
  1477. }
  1478. $self->match_char($match);
  1479. return 1;
  1480. }
  1481. =head2 unmatch
  1482. Title : unmatch()
  1483. Usage : $ali->unmatch()
  1484. Function : Undoes the effect of method match. Unsets match_char.
  1485. Returns : 1
  1486. Argument : a match character, optional, defaults to '.'
  1487. See L<match> and L<match_char>
  1488. =cut
  1489. sub unmatch {
  1490. my ($self, $match) = @_;
  1491. $match ||= '.';
  1492. my @seqs = $self->each_seq();
  1493. return 1 unless scalar @seqs > 1;
  1494. my $refseq = shift @seqs ;
  1495. my @refseq = split //, $refseq->seq;
  1496. my $gapchar = $self->gap_char;
  1497. foreach my $seq ( @seqs ) {
  1498. my @varseq = split //, $seq->seq();
  1499. for ( my $i=0; $i < scalar @varseq; $i++) {
  1500. $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
  1501. ( $refseq[$i] =~ /[A-Za-z\*]/ ||
  1502. $refseq[$i] =~ /$gapchar/ ) &&
  1503. $varseq[$i] eq $match;
  1504. }
  1505. $seq->seq(join '', @varseq);
  1506. }
  1507. $self->match_char('');
  1508. return 1;
  1509. }
  1510. =head1 MSA attributes
  1511. Methods for setting and reading the MSA attributes.
  1512. Note that the methods defining character semantics depend on the user
  1513. to set them sensibly. They are needed only by certain input/output
  1514. methods. Unset them by setting to an empty string ('').
  1515. =head2 id
  1516. Title : id
  1517. Usage : $myalign->id("Ig")
  1518. Function : Gets/sets the id field of the alignment
  1519. Returns : An id string
  1520. Argument : An id string (optional)
  1521. =cut
  1522. sub id {
  1523. my ($self, $name) = @_;
  1524. if (defined( $name )) {
  1525. $self->{'_id'} = $name;
  1526. }
  1527. return $self->{'_id'};
  1528. }
  1529. =head2 accession
  1530. Title : accession
  1531. Usage : $myalign->accession("PF00244")
  1532. Function : Gets/sets the accession field of the alignment
  1533. Returns : An acc string
  1534. Argument : An acc string (optional)
  1535. =cut
  1536. sub accession {
  1537. my ($self, $acc) = @_;
  1538. if (defined( $acc )) {
  1539. $self->{'_accession'} = $acc;
  1540. }
  1541. return $self->{'_accession'};
  1542. }
  1543. =head2 description
  1544. Title : description
  1545. Usage : $myalign->description("14-3-3 proteins")
  1546. Function : Gets/sets the description field of the alignment
  1547. Returns : An description string
  1548. Argument : An description string (optional)
  1549. =cut
  1550. sub description {
  1551. my ($self, $name) = @_;
  1552. if (defined( $name )) {
  1553. $self->{'_description'} = $name;
  1554. }
  1555. return $self->{'_description'};
  1556. }
  1557. =head2 missing_char
  1558. Title : missing_char
  1559. Usage : $myalign->missing_char("?")
  1560. Function : Gets/sets the missing_char attribute of the alignment
  1561. It is generally recommended to set it to 'n' or 'N'
  1562. for nucleotides and to 'X' for protein.
  1563. Returns : An missing_char string,
  1564. Argument : An missing_char string (optional)
  1565. =cut
  1566. sub missing_char {
  1567. my ($self, $char) = @_;
  1568. if (defined $char ) {
  1569. $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
  1570. $self->{'_missing_char'} = $char;
  1571. }
  1572. return $self->{'_missing_char'};
  1573. }
  1574. =head2 match_char
  1575. Title : match_char
  1576. Usage : $myalign->match_char('.')
  1577. Function : Gets/sets the match_char attribute of the alignment
  1578. Returns : An match_char string,
  1579. Argument : An match_char string (optional)
  1580. =cut
  1581. sub match_char {
  1582. my ($self, $char) = @_;
  1583. if (defined $char ) {
  1584. $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
  1585. $self->{'_match_char'} = $char;
  1586. }
  1587. return $self->{'_match_char'};
  1588. }
  1589. =head2 gap_char
  1590. Title : gap_char
  1591. Usage : $myalign->gap_char('-')
  1592. Function : Gets/sets the gap_char attribute of the alignment
  1593. Returns : An gap_char string, defaults to '-'
  1594. Argument : An gap_char string (optional)
  1595. =cut
  1596. sub gap_char {
  1597. my ($self, $char) = @_;
  1598. if (defined $char || ! defined $self->{'_gap_char'} ) {
  1599. $char= '-' unless defined $char;
  1600. $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
  1601. $self->{'_gap_char'} = $char;
  1602. }
  1603. return $self->{'_gap_char'};
  1604. }
  1605. =head2 symbol_chars
  1606. Title : symbol_chars
  1607. Usage : my @symbolchars = $aln->symbol_chars;
  1608. Function: Returns all the seen symbols (other than gaps)
  1609. Returns : array of characters that are the seen symbols
  1610. Args : boolean to include the gap/missing/match characters
  1611. =cut
  1612. sub symbol_chars{
  1613. my ($self,$includeextra) = @_;
  1614. unless ($self->{'_symbols'}) {
  1615. foreach my $seq ($self->each_seq) {
  1616. map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
  1617. }
  1618. }
  1619. my %copy = %{$self->{'_symbols'}};
  1620. if( ! $includeextra ) {
  1621. foreach my $char ( $self->gap_char, $self->match_char,
  1622. $self->missing_char) {
  1623. delete $copy{$char} if( defined $char );
  1624. }
  1625. }
  1626. return keys %copy;
  1627. }
  1628. =head1 Alignment descriptors
  1629. These read only methods describe the MSA in various ways.
  1630. =head2 score
  1631. Title : score
  1632. Usage : $str = $ali->score()
  1633. Function : get/set a score of the alignment
  1634. Returns : a score for the alignment
  1635. Argument : an optional score to set
  1636. =cut
  1637. sub score {
  1638. my $self = shift;
  1639. $self->{score} = shift if @_;
  1640. return $self->{score};
  1641. }
  1642. =head2 consensus_string
  1643. Title : consensus_string
  1644. Usage : $str = $ali->consensus_string($threshold_percent)
  1645. Function : Makes a strict consensus
  1646. Returns : Consensus string
  1647. Argument : Optional treshold ranging from 0 to 100.
  1648. The consensus residue has to appear at least threshold %
  1649. of the sequences at a given location, otherwise a '?'
  1650. character will be placed at that location.
  1651. (Default value = 0%)
  1652. =cut
  1653. sub consensus_string {
  1654. my $self = shift;
  1655. my $threshold = shift;
  1656. my $out = "";
  1657. my $len = $self->length - 1;
  1658. foreach ( 0 .. $len ) {
  1659. $out .= $self->_consensus_aa($_,$threshold);
  1660. }
  1661. return $out;
  1662. }
  1663. sub _consensus_aa {
  1664. my $self = shift;
  1665. my $point = shift;
  1666. my $threshold_percent = shift || -1 ;
  1667. my ($seq,%hash,$count,$letter,$key);
  1668. my $gapchar = $self->gap_char;
  1669. foreach $seq ( $self->each_seq() ) {
  1670. $letter = substr($seq->seq,$point,1);
  1671. $self->throw("--$point-----------") if $letter eq '';
  1672. ($letter eq $gapchar || $letter =~ /\./) && next;
  1673. # print "Looking at $letter\n";
  1674. $hash{$letter}++;
  1675. }
  1676. my $number_of_sequences = $self->num_sequences();
  1677. my $threshold = $number_of_sequences * $threshold_percent / 100. ;
  1678. $count = -1;
  1679. $letter = '?';
  1680. foreach $key ( sort keys %hash ) {
  1681. # print "Now at $key $hash{$key}\n";
  1682. if( $hash{$key} > $count && $hash{$key} >= $threshold) {
  1683. $letter = $key;
  1684. $count = $hash{$key};
  1685. }
  1686. }
  1687. return $letter;
  1688. }
  1689. =head2 consensus_iupac
  1690. Title : consensus_iupac
  1691. Usage : $str = $ali->consensus_iupac()
  1692. Function : Makes a consensus using IUPAC ambiguity codes from DNA
  1693. and RNA. The output is in upper case except when gaps in
  1694. a column force output to be in lower case.
  1695. Note that if your alignment sequences contain a lot of
  1696. IUPAC ambiquity codes you often have to manually set
  1697. alphabet. Bio::PrimarySeq::_guess_type thinks they
  1698. indicate a protein sequence.
  1699. Returns : consensus string
  1700. Argument : none
  1701. Throws : on protein sequences
  1702. =cut
  1703. sub consensus_iupac {
  1704. my $self = shift;
  1705. my $out = "";
  1706. my $len = $self->length-1;
  1707. # only DNA and RNA sequences are valid
  1708. foreach my $seq ( $self->each_seq() ) {
  1709. $self->throw("Seq [". $seq->get_nse. "] is a protein")
  1710. if $seq->alphabet eq 'protein';
  1711. }
  1712. # loop over the alignment columns
  1713. foreach my $count ( 0 .. $len ) {
  1714. $out .= $self->_consensus_iupac($count);
  1715. }
  1716. return $out;
  1717. }
  1718. sub _consensus_iupac {
  1719. my ($self, $column) = @_;
  1720. my ($string, $char, $rna);
  1721. #determine all residues in a column
  1722. foreach my $seq ( $self->each_seq() ) {
  1723. $string .= substr($seq->seq, $column, 1);
  1724. }
  1725. $string = uc $string;
  1726. # quick exit if there's an N in the string
  1727. if ($string =~ /N/) {
  1728. $string =~ /\W/ ? return 'n' : return 'N';
  1729. }
  1730. # ... or if there are only gap characters
  1731. return '-' if $string =~ /^\W+$/;
  1732. # treat RNA as DNA in regexps
  1733. if ($string =~ /U/) {
  1734. $string =~ s/U/T/;
  1735. $rna = 1;
  1736. }
  1737. # the following s///'s only need to be done to the _first_ ambiguity code
  1738. # as we only need to see the _range_ of characters in $string
  1739. if ($string =~ /[VDHB]/) {
  1740. $string =~ s/V/AGC/;
  1741. $string =~ s/D/AGT/;
  1742. $string =~ s/H/ACT/;
  1743. $string =~ s/B/CTG/;
  1744. }
  1745. if ($string =~ /[SKYRWM]/) {
  1746. $string =~ s/S/GC/;
  1747. $string =~ s/K/GT/;
  1748. $string =~ s/Y/CT/;
  1749. $string =~ s/R/AG/;
  1750. $string =~ s/W/AT/;
  1751. $string =~ s/M/AC/;
  1752. }
  1753. # and now the guts of the thing
  1754. if ($string =~ /A/) {
  1755. $char = 'A'; # A A
  1756. if ($string =~ /G/) {
  1757. $char = 'R'; # A and G (purines) R
  1758. if ($string =~ /C/) {
  1759. $char = 'V'; # A and G and C V
  1760. if ($string =~ /T/) {
  1761. $char = 'N'; # A and G and C and T N
  1762. }
  1763. } elsif ($string =~ /T/) {
  1764. $char = 'D'; # A and G and T D
  1765. }
  1766. } elsif ($string =~ /C/) {
  1767. $char = 'M'; # A and C M
  1768. if ($string =~ /T/) {
  1769. $char = 'H'; # A and C and T H
  1770. }
  1771. } elsif ($string =~ /T/) {
  1772. $char = 'W'; # A and T W
  1773. }
  1774. } elsif ($string =~ /C/) {
  1775. $char = 'C'; # C C
  1776. if ($string =~ /T/) {
  1777. $char = 'Y'; # C and T (pyrimidines) Y
  1778. if ($string =~ /G/) {
  1779. $char = 'B'; # C and T and G B
  1780. }
  1781. } elsif ($string =~ /G/) {
  1782. $char = 'S'; # C and G S
  1783. }
  1784. } elsif ($string =~ /G/) {
  1785. $char = 'G'; # G G
  1786. if ($string =~ /C/) {
  1787. $char = 'S'; # G and C S
  1788. } elsif ($string =~ /T/) {
  1789. $char = 'K'; # G and T K
  1790. }
  1791. } elsif ($string =~ /T/) {
  1792. $char = 'T'; # T T
  1793. }
  1794. $char = 'U' if $rna and $char eq 'T';
  1795. $char = lc $char if $string =~ /\W/;
  1796. return $char;
  1797. }
  1798. =head2 consensus_meta
  1799. Title : consensus_meta
  1800. Usage : $seqmeta = $ali->consensus_meta()
  1801. Function : Returns a Bio::Seq::Meta object containing the consensus
  1802. strings derived from meta data analysis.
  1803. Returns : Bio::Seq::Meta
  1804. Argument : Bio::Seq::Meta
  1805. Throws : non-MetaI object
  1806. =cut
  1807. sub consensus_meta {
  1808. my ($self, $meta) = @_;
  1809. if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
  1810. $self->throw('Not a Bio::Seq::MetaI object');
  1811. }
  1812. return $self->{'_aln_meta'} = $meta if $meta;
  1813. return $self->{'_aln_meta'}
  1814. }
  1815. =head2 is_flush
  1816. Title : is_flush
  1817. Usage : if ( $ali->is_flush() )
  1818. Function : Tells you whether the alignment
  1819. : is flush, i.e. all of the same length
  1820. Returns : 1 or 0
  1821. Argument :
  1822. =cut
  1823. sub is_flush {
  1824. my ($self,$report) = @_;
  1825. my $seq;
  1826. my $length = (-1);
  1827. my $temp;
  1828. foreach $seq ( $self->each_seq() ) {
  1829. if( $length == (-1) ) {
  1830. $length = CORE::length($seq->seq());
  1831. next;
  1832. }
  1833. $temp = CORE::length($seq->seq());
  1834. if( $temp != $length ) {
  1835. $self->warn("expecting $length not $temp from ".
  1836. $seq->display_id) if( $report );
  1837. $self->debug("expecting $length not $temp from ".
  1838. $seq->display_id);
  1839. $self->debug($seq->seq(). "\n");
  1840. return 0;
  1841. }
  1842. }
  1843. return 1;
  1844. }
  1845. =head2 length
  1846. Title : length()
  1847. Usage : $len = $ali->length()
  1848. Function : Returns the maximum length of the alignment.
  1849. To be sure the alignment is a block, use is_flush
  1850. Returns : Integer
  1851. Argument :
  1852. =cut
  1853. sub length_aln {
  1854. my $self = shift;
  1855. $self->deprecated("length_aln - deprecated method. Use length() instead.");
  1856. $self->length(@_);
  1857. }
  1858. sub length {
  1859. my $self = shift;
  1860. my $seq;
  1861. my $length = -1;
  1862. my $temp;
  1863. foreach $seq ( $self->each_seq() ) {
  1864. $temp = $seq->length();
  1865. if( $temp > $length ) {
  1866. $length = $temp;
  1867. }
  1868. }
  1869. return $length;
  1870. }
  1871. =head2 maxdisplayname_length
  1872. Title : maxdisplayname_length
  1873. Usage : $ali->maxdisplayname_length()
  1874. Function : Gets the maximum length of the displayname in the
  1875. alignment. Used in writing out various MSA formats.
  1876. Returns : integer
  1877. Argument :
  1878. =cut
  1879. sub maxname_length {
  1880. my $self = shift;
  1881. $self->deprecated("maxname_length - deprecated method.".
  1882. " Use maxdisplayname_length() instead.");
  1883. $self->maxdisplayname_length();
  1884. }
  1885. sub maxnse_length {
  1886. my $self = shift;
  1887. $self->deprecated("maxnse_length - deprecated method.".
  1888. " Use maxnse_length() instead.");
  1889. $self->maxdisplayname_length();
  1890. }
  1891. sub maxdisplayname_length {
  1892. my $self = shift;
  1893. my $maxname = (-1);
  1894. my ($seq,$len);
  1895. foreach $seq ( $self->each_seq() ) {
  1896. $len = CORE::length $self->displayname($seq->get_nse());
  1897. if( $len > $maxname ) {
  1898. $maxname = $len;
  1899. }
  1900. }
  1901. return $maxname;
  1902. }
  1903. =head2 max_metaname_length
  1904. Title : max_metaname_length
  1905. Usage : $ali->max_metaname_length()
  1906. Function : Gets the maximum length of the meta name tags in the
  1907. alignment for the sequences and for the alignment.
  1908. Used in writing out various MSA formats.
  1909. Returns : integer
  1910. Argument : None
  1911. =cut
  1912. sub max_metaname_length {
  1913. my $self = shift;
  1914. my $maxname = (-1);
  1915. my ($seq,$len);
  1916. # check seq meta first
  1917. for $seq ( $self->each_seq() ) {
  1918. next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
  1919. for my $mtag ($seq->meta_names) {
  1920. $len = CORE::length $mtag;
  1921. if( $len > $maxname ) {
  1922. $maxname = $len;
  1923. }
  1924. }
  1925. }
  1926. # alignment meta
  1927. for my $meta ($self->consensus_meta) {
  1928. next unless $meta;
  1929. for my $name ($meta->meta_names) {
  1930. $len = CORE::length $name;
  1931. if( $len > $maxname ) {
  1932. $maxname = $len;
  1933. }
  1934. }
  1935. }
  1936. return $maxname;
  1937. }
  1938. =head2 num_residues
  1939. Title : num_residues
  1940. Usage : $no = $ali->num_residues
  1941. Function : number of residues in total in the alignment
  1942. Returns : integer
  1943. Argument :
  1944. Note : replaces no_residues()
  1945. =cut
  1946. sub num_residues {
  1947. my $self = shift;
  1948. my $count = 0;
  1949. foreach my $seq ($self->each_seq) {
  1950. my $str = $seq->seq();
  1951. $count += ($str =~ s/[A-Za-z]//g);
  1952. }
  1953. return $count;
  1954. }
  1955. =head2 num_sequences
  1956. Title : num_sequences
  1957. Usage : $depth = $ali->num_sequences
  1958. Function : number of sequence in the sequence alignment
  1959. Returns : integer
  1960. Argument : none
  1961. Note : replaces no_sequences()
  1962. =cut
  1963. sub num_sequences {
  1964. my $self = shift;
  1965. return scalar($self->each_seq);
  1966. }
  1967. =head2 average_percentage_identity
  1968. Title : average_percentage_identity
  1969. Usage : $id = $align->average_percentage_identity
  1970. Function: The function uses a fast method to calculate the average
  1971. percentage identity of the alignment
  1972. Returns : The average percentage identity of the alignment
  1973. Args : None
  1974. Notes : This method implemented by Kevin Howe calculates a figure that is
  1975. designed to be similar to the average pairwise identity of the
  1976. alignment (identical in the absence of gaps), without having to
  1977. explicitly calculate pairwise identities proposed by Richard Durbin.
  1978. Validated by Ewan Birney ad Alex Bateman.
  1979. =cut
  1980. sub average_percentage_identity{
  1981. my ($self,@args) = @_;
  1982. my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
  1983. 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
  1984. my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
  1985. if (! $self->is_flush()) {
  1986. $self->throw("All sequences in the alignment must be the same length");
  1987. }
  1988. @seqs = $self->each_seq();
  1989. $len = $self->length();
  1990. # load the each hash with correct keys for existence checks
  1991. for( my $index=0; $index < $len; $index++) {
  1992. foreach my $letter (@alphabet) {
  1993. $countHashes[$index]->{$letter} = 0;
  1994. }
  1995. }
  1996. foreach my $seq (@seqs) {
  1997. my @seqChars = split //, $seq->seq();
  1998. for( my $column=0; $column < @seqChars; $column++ ) {
  1999. my $char = uc($seqChars[$column]);
  2000. if (exists $countHashes[$column]->{$char}) {
  2001. $countHashes[$column]->{$char}++;
  2002. }
  2003. }
  2004. }
  2005. $total = 0;
  2006. $divisor = 0;
  2007. for(my $column =0; $column < $len; $column++) {
  2008. my %hash = %{$countHashes[$column]};
  2009. $subdivisor = 0;
  2010. foreach my $res (keys %hash) {
  2011. $total += $hash{$res}*($hash{$res} - 1);
  2012. $subdivisor += $hash{$res};
  2013. }
  2014. $divisor += $subdivisor * ($subdivisor - 1);
  2015. }
  2016. return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
  2017. }
  2018. =head2 percentage_identity
  2019. Title : percentage_identity
  2020. Usage : $id = $align->percentage_identity
  2021. Function: The function calculates the average percentage identity
  2022. (aliased to average_percentage_identity)
  2023. Returns : The average percentage identity
  2024. Args : None
  2025. =cut
  2026. sub percentage_identity {
  2027. my $self = shift;
  2028. return $self->average_percentage_identity();
  2029. }
  2030. =head2 overall_percentage_identity
  2031. Title : overall_percentage_identity
  2032. Usage : $id = $align->overall_percentage_identity
  2033. $id = $align->overall_percentage_identity('short')
  2034. Function: The function calculates the percentage identity of
  2035. the conserved columns
  2036. Returns : The percentage identity of the conserved columns
  2037. Args : length value to use, optional defaults to alignment length
  2038. possible values: 'align', 'short', 'long'
  2039. The argument values 'short' and 'long' refer to shortest and longest
  2040. sequence in the alignment. Method modification code by Hongyu Zhang.
  2041. =cut
  2042. sub overall_percentage_identity{
  2043. my ($self, $length_measure) = @_;
  2044. my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
  2045. my %enum = map {$_ => undef} qw (align short long);
  2046. $self->throw("Unknown argument [$length_measure]")
  2047. if $length_measure and not exists $enum{$length_measure};
  2048. $length_measure ||= 'align';
  2049. if (! $self->is_flush()) {
  2050. $self->throw("All sequences in the alignment must be the same length");
  2051. }
  2052. # Count the residues seen at each position
  2053. my $len;
  2054. my $total = 0; # number of positions with identical residues
  2055. my @countHashes;
  2056. my @seqs = $self->each_seq;
  2057. my $nof_seqs = scalar @seqs;
  2058. my $aln_len = $self->length();
  2059. for my $seq (@seqs) {
  2060. my $seqstr = $seq->seq;
  2061. # Count residues for given sequence
  2062. for my $column (0 .. $aln_len-1) {
  2063. my $char = uc( substr($seqstr, $column, 1) );
  2064. if ( exists $alphabet{$char} ) {
  2065. # This is a valid char
  2066. if ( defined $countHashes[$column]->{$char} ) {
  2067. $countHashes[$column]->{$char}++;
  2068. } else {
  2069. $countHashes[$column]->{$char} = 1;
  2070. }
  2071. if ( $countHashes[$column]->{$char} == $nof_seqs ) {
  2072. # All sequences have this same residue
  2073. $total++;
  2074. }
  2075. }
  2076. }
  2077. # Sequence length
  2078. if ($length_measure eq 'short' || $length_measure eq 'long') {
  2079. my $seq_len = $seqstr =~ tr/[A-Za-z]//;
  2080. if ($length_measure eq 'short') {
  2081. if ( (not defined $len) || ($seq_len < $len) ) {
  2082. $len = $seq_len;
  2083. }
  2084. } elsif ($length_measure eq 'long') {
  2085. if ( (not defined $len) || ($seq_len > $len) ) {
  2086. $len = $seq_len;
  2087. }
  2088. }
  2089. }
  2090. }
  2091. if ($length_measure eq 'align') {
  2092. $len = $aln_len;
  2093. }
  2094. return ($total / $len ) * 100.0;
  2095. }
  2096. =head1 Alignment positions
  2097. Methods to map a sequence position into an alignment column and back.
  2098. column_from_residue_number() does the former. The latter is really a
  2099. property of the sequence object and can done using
  2100. L<Bio::LocatableSeq::location_from_column>:
  2101. # select somehow a sequence from the alignment, e.g.
  2102. my $seq = $aln->get_seq_by_pos(1);
  2103. #$loc is undef or Bio::LocationI object
  2104. my $loc = $seq->location_from_column(5);
  2105. =head2 column_from_residue_number
  2106. Title : column_from_residue_number
  2107. Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
  2108. Function: This function gives the position in the alignment
  2109. (i.e. column number) of the given residue number in the
  2110. sequence with the given name. For example, for the
  2111. alignment
  2112. Seq1/91-97 AC..DEF.GH.
  2113. Seq2/24-30 ACGG.RTY...
  2114. Seq3/43-51 AC.DDEF.GHI
  2115. column_from_residue_number( "Seq1", 94 ) returns 6.
  2116. column_from_residue_number( "Seq2", 25 ) returns 2.
  2117. column_from_residue_number( "Seq3", 50 ) returns 10.
  2118. An exception is thrown if the residue number would lie
  2119. outside the length of the aligment
  2120. (e.g. column_from_residue_number( "Seq2", 22 )
  2121. Note: If the the parent sequence is represented by more than
  2122. one alignment sequence and the residue number is present in
  2123. them, this method finds only the first one.
  2124. Returns : A column number for the position in the alignment of the
  2125. given residue in the given sequence (1 = first column)
  2126. Args : A sequence id/name (not a name/start-end)
  2127. A residue number in the whole sequence (not just that
  2128. segment of it in the alignment)
  2129. =cut
  2130. sub column_from_residue_number {
  2131. my ($self, $name, $resnumber) = @_;
  2132. $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
  2133. $self->throw("Second argument residue number missing") unless $resnumber;
  2134. foreach my $seq ($self->each_seq_with_id($name)) {
  2135. my $col;
  2136. eval {
  2137. $col = $seq->column_from_residue_number($resnumber);
  2138. };
  2139. next if $@;
  2140. return $col;
  2141. }
  2142. $self->throw("Could not find a sequence segment in $name ".
  2143. "containing residue number $resnumber");
  2144. }
  2145. =head1 Sequence names
  2146. Methods to manipulate the display name. The default name based on the
  2147. sequence id and subsequence positions can be overridden in various
  2148. ways.
  2149. =head2 displayname
  2150. Title : displayname
  2151. Usage : $myalign->displayname("Ig", "IgA")
  2152. Function : Gets/sets the display name of a sequence in the alignment
  2153. Returns : A display name string
  2154. Argument : name of the sequence
  2155. displayname of the sequence (optional)
  2156. =cut
  2157. sub get_displayname {
  2158. my $self = shift;
  2159. $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
  2160. $self->displayname(@_);
  2161. }
  2162. sub set_displayname {
  2163. my $self = shift;
  2164. $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
  2165. $self->displayname(@_);
  2166. }
  2167. sub displayname {
  2168. my ($self, $name, $disname) = @_;
  2169. $self->throw("No sequence with name [$name]")
  2170. unless defined $self->{'_seq'}->{$name};
  2171. if( $disname and $name) {
  2172. $self->{'_dis_name'}->{$name} = $disname;
  2173. return $disname;
  2174. }
  2175. elsif( defined $self->{'_dis_name'}->{$name} ) {
  2176. return $self->{'_dis_name'}->{$name};
  2177. } else {
  2178. return $name;
  2179. }
  2180. }
  2181. =head2 set_displayname_count
  2182. Title : set_displayname_count
  2183. Usage : $ali->set_displayname_count
  2184. Function : Sets the names to be name_# where # is the number of
  2185. times this name has been used.
  2186. Returns : 1, on success
  2187. Argument :
  2188. =cut
  2189. sub set_displayname_count {
  2190. my $self= shift;
  2191. my (@arr,$name,$seq,$count,$temp,$nse);
  2192. foreach $seq ( $self->each_alphabetically() ) {
  2193. $nse = $seq->get_nse();
  2194. #name will be set when this is the second
  2195. #time (or greater) is has been seen
  2196. if( defined $name and $name eq ($seq->id()) ) {
  2197. $temp = sprintf("%s_%s",$name,$count);
  2198. $self->displayname($nse,$temp);
  2199. $count++;
  2200. } else {
  2201. $count = 1;
  2202. $name = $seq->id();
  2203. $temp = sprintf("%s_%s",$name,$count);
  2204. $self->displayname($nse,$temp);
  2205. $count++;
  2206. }
  2207. }
  2208. return 1;
  2209. }
  2210. =head2 set_displayname_flat
  2211. Title : set_displayname_flat
  2212. Usage : $ali->set_displayname_flat()
  2213. Function : Makes all the sequences be displayed as just their name,
  2214. not name/start-end
  2215. Returns : 1
  2216. Argument :
  2217. =cut
  2218. sub set_displayname_flat {
  2219. my $self = shift;
  2220. my ($nse,$seq);
  2221. foreach $seq ( $self->each_seq() ) {
  2222. $nse = $seq->get_nse();
  2223. $self->displayname($nse,$seq->id());
  2224. }
  2225. return 1;
  2226. }
  2227. =head2 set_displayname_normal
  2228. Title : set_displayname_normal
  2229. Usage : $ali->set_displayname_normal()
  2230. Function : Makes all the sequences be displayed as name/start-end
  2231. Returns : 1, on success
  2232. Argument :
  2233. =cut
  2234. sub set_displayname_normal {
  2235. my $self = shift;
  2236. my ($nse,$seq);
  2237. foreach $seq ( $self->each_seq() ) {
  2238. $nse = $seq->get_nse();
  2239. $self->displayname($nse,$nse);
  2240. }
  2241. return 1;
  2242. }
  2243. =head2 source
  2244. Title : source
  2245. Usage : $obj->source($newval)
  2246. Function: sets the Alignment source program
  2247. Example :
  2248. Returns : value of source
  2249. Args : newvalue (optional)
  2250. =cut
  2251. sub source{
  2252. my ($self,$value) = @_;
  2253. if( defined $value) {
  2254. $self->{'_source'} = $value;
  2255. }
  2256. return $self->{'_source'};
  2257. }
  2258. =head2 set_displayname_safe
  2259. Title : set_displayname_safe
  2260. Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
  2261. Function : Assign machine-generated serial names to sequences in input order.
  2262. Designed to protect names during PHYLIP runs. Assign 10-char string
  2263. in the form of "S000000001" to "S999999999". Restore the original
  2264. names using "restore_displayname".
  2265. Returns : 1. a new $aln with system names;
  2266. 2. a hash ref for restoring names
  2267. Argument : Number for id length (default 10)
  2268. =cut
  2269. sub set_displayname_safe {
  2270. my $self = shift;
  2271. my $idlength = shift || 10;
  2272. my ($seq, %phylip_name);
  2273. my $ct=0;
  2274. my $new=Bio::SimpleAlign->new();
  2275. foreach $seq ( $self->each_seq() ) {
  2276. $ct++;
  2277. my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
  2278. $phylip_name{$pname}=$seq->id();
  2279. my $new_seq= Bio::LocatableSeq->new(-id => $pname,
  2280. -seq => $seq->seq(),
  2281. -alphabet => $seq->alphabet,
  2282. -start => $seq->{_start},
  2283. -end => $seq->{_end}
  2284. );
  2285. $new->add_seq($new_seq);
  2286. }
  2287. $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
  2288. return ($new, \%phylip_name);
  2289. }
  2290. =head2 restore_displayname
  2291. Title : restore_displayname
  2292. Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
  2293. Function : Restore original sequence names (after running
  2294. $ali->set_displayname_safe)
  2295. Returns : a new $aln with names restored.
  2296. Argument : a hash reference of names from "set_displayname_safe".
  2297. =cut
  2298. sub restore_displayname {
  2299. my $self = shift;
  2300. my $ref=shift;
  2301. my %name=%$ref;
  2302. my $new=Bio::SimpleAlign->new();
  2303. foreach my $seq ( $self->each_seq() ) {
  2304. $self->throw("No sequence with name") unless defined $name{$seq->id()};
  2305. my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
  2306. -seq => $seq->seq(),
  2307. -alphabet => $seq->alphabet,
  2308. -start => $seq->{_start},
  2309. -end => $seq->{_end}
  2310. );
  2311. $new->add_seq($new_seq);
  2312. }
  2313. return $new;
  2314. }
  2315. =head2 sort_by_start
  2316. Title : sort_by_start
  2317. Usage : $ali->sort_by_start
  2318. Function : Changes the order of the alignment to the start position of each
  2319. subalignment
  2320. Returns :
  2321. Argument :
  2322. =cut
  2323. sub sort_by_start {
  2324. my $self = shift;
  2325. my ($seq,$nse,@arr,%hash,$count);
  2326. foreach $seq ( $self->each_seq() ) {
  2327. $nse = $seq->get_nse;
  2328. $hash{$nse} = $seq;
  2329. }
  2330. $count = 0;
  2331. %{$self->{'_order'}} = (); # reset the hash;
  2332. foreach $nse ( sort _startend keys %hash) {
  2333. $self->{'_order'}->{$count} = $nse;
  2334. $count++;
  2335. }
  2336. 1;
  2337. }
  2338. sub _startend
  2339. {
  2340. my ($aname,$arange) = split (/[\/]/,$a);
  2341. my ($bname,$brange) = split (/[\/]/,$b);
  2342. my ($astart,$aend) = split(/\-/,$arange);
  2343. my ($bstart,$bend) = split(/\-/,$brange);
  2344. return $astart <=> $bstart;
  2345. }
  2346. =head2 bracket_string
  2347. Title : bracket_string
  2348. Usage : my @params = (-refseq => 'testseq',
  2349. -allele1 => 'allele1',
  2350. -allele2 => 'allele2',
  2351. -delimiters => '{}',
  2352. -separator => '/');
  2353. $str = $aln->bracket_string(@params)
  2354. Function : When supplied with a list of parameters (see below), returns a
  2355. string in BIC format. This is used for allelic comparisons.
  2356. Briefly, if either allele contains a base change when compared to
  2357. the refseq, the base or gap for each allele is represented in
  2358. brackets in the order present in the 'alleles' parameter.
  2359. For the following data:
  2360. >testseq
  2361. GGATCCATTGCTACT
  2362. >allele1
  2363. GGATCCATTCCTACT
  2364. >allele2
  2365. GGAT--ATTCCTCCT
  2366. the returned string with parameters 'refseq => testseq' and
  2367. 'alleles => [qw(allele1 allele2)]' would be:
  2368. GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
  2369. Returns : BIC-formatted string
  2370. Argument : Required args
  2371. refseq : string (ID) of the reference sequence used
  2372. as basis for comparison
  2373. allele1 : string (ID) of the first allele
  2374. allele2 : string (ID) of the second allele
  2375. Optional args
  2376. delimiters: two symbol string of left and right delimiters.
  2377. Only the first two symbols are used
  2378. default = '[]'
  2379. separator : string used as a separator. Only the first
  2380. symbol is used
  2381. default = '/'
  2382. Throws : On no refseq/alleles, or invalid refseq/alleles.
  2383. =cut
  2384. sub bracket_string {
  2385. my ($self, @args) = @_;
  2386. my ($ref, $a1, $a2, $delim, $sep) =
  2387. $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
  2388. $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
  2389. my ($ld, $rd);
  2390. ($ld, $rd) = split('', $delim, 2) if $delim;
  2391. $ld ||= '[';
  2392. $rd ||= ']';
  2393. $sep ||= '/';
  2394. my ($refseq, $allele1, $allele2) =
  2395. map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
  2396. if (!$refseq || !$allele1 || !$allele2) {
  2397. $self->throw("One of your refseq/allele IDs is invalid!");
  2398. }
  2399. my $len = $self->length-1;
  2400. my $bic = '';
  2401. # loop over the alignment columns
  2402. for my $column ( 0 .. $len ) {
  2403. my $string;
  2404. my ($compres, $res1, $res2) =
  2405. map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
  2406. # are any of the allele symbols different from the refseq?
  2407. $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
  2408. $ld.$res1.$sep.$res2.$rd;
  2409. $bic .= $string;
  2410. }
  2411. return $bic;
  2412. }
  2413. =head2 methods implementing Bio::FeatureHolderI
  2414. FeatureHolderI implementation to support labeled character sets like one
  2415. would get from NEXUS represented data.
  2416. =head2 get_SeqFeatures
  2417. Usage : @features = $aln->get_SeqFeatures
  2418. Function: Get the feature objects held by this feature holder.
  2419. Example :
  2420. Returns : an array of Bio::SeqFeatureI implementing objects
  2421. Args : optional filter coderef, taking a Bio::SeqFeatureI
  2422. : as argument, returning TRUE if wanted, FALSE if
  2423. : unwanted
  2424. =cut
  2425. sub get_SeqFeatures {
  2426. my $self = shift;
  2427. my $filter_cb = shift;
  2428. $self->throw("Arg (filter callback) must be a coderef") unless
  2429. !defined($filter_cb) or ref($filter_cb) eq 'CODE';
  2430. if( !defined $self->{'_as_feat'} ) {
  2431. $self->{'_as_feat'} = [];
  2432. }
  2433. if ($filter_cb) {
  2434. return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
  2435. }
  2436. return @{$self->{'_as_feat'}};
  2437. }
  2438. =head2 add_SeqFeature
  2439. Usage : $aln->add_SeqFeature($subfeat);
  2440. Function: adds a SeqFeature into the SeqFeature array.
  2441. Example :
  2442. Returns : true on success
  2443. Args : a Bio::SeqFeatureI object
  2444. Note : This implementation is not compliant
  2445. with Bio::FeatureHolderI
  2446. =cut
  2447. sub add_SeqFeature {
  2448. my ($self,@feat) = @_;
  2449. $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
  2450. foreach my $feat ( @feat ) {
  2451. if( !$feat->isa("Bio::SeqFeatureI") ) {
  2452. $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
  2453. }
  2454. push(@{$self->{'_as_feat'}},$feat);
  2455. }
  2456. return 1;
  2457. }
  2458. =head2 remove_SeqFeatures
  2459. Usage : $obj->remove_SeqFeatures
  2460. Function: Removes all SeqFeatures. If you want to remove only a subset,
  2461. remove that subset from the returned array, and add back the rest.
  2462. Returns : The array of Bio::SeqFeatureI features that was
  2463. deleted from this alignment.
  2464. Args : none
  2465. =cut
  2466. sub remove_SeqFeatures {
  2467. my $self = shift;
  2468. return () unless $self->{'_as_feat'};
  2469. my @feats = @{$self->{'_as_feat'}};
  2470. $self->{'_as_feat'} = [];
  2471. return @feats;
  2472. }
  2473. =head2 feature_count
  2474. Title : feature_count
  2475. Usage : $obj->feature_count()
  2476. Function: Return the number of SeqFeatures attached to the alignment
  2477. Returns : integer representing the number of SeqFeatures
  2478. Args : None
  2479. =cut
  2480. sub feature_count {
  2481. my ($self) = @_;
  2482. if (defined($self->{'_as_feat'})) {
  2483. return ($#{$self->{'_as_feat'}} + 1);
  2484. } else {
  2485. return 0;
  2486. }
  2487. }
  2488. =head2 get_all_SeqFeatures
  2489. Title : get_all_SeqFeatures
  2490. Usage :
  2491. Function: Get all SeqFeatures.
  2492. Example :
  2493. Returns : an array of Bio::SeqFeatureI implementing objects
  2494. Args : none
  2495. Note : Falls through to Bio::FeatureHolderI implementation.
  2496. =cut
  2497. =head2 methods for Bio::AnnotatableI
  2498. AnnotatableI implementation to support sequence alignments which
  2499. contain annotation (NEXUS, Stockholm).
  2500. =head2 annotation
  2501. Title : annotation
  2502. Usage : $ann = $aln->annotation or
  2503. $aln->annotation($ann)
  2504. Function: Gets or sets the annotation
  2505. Returns : Bio::AnnotationCollectionI object
  2506. Args : None or Bio::AnnotationCollectionI object
  2507. See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
  2508. for more information
  2509. =cut
  2510. sub annotation {
  2511. my ($obj,$value) = @_;
  2512. if( defined $value ) {
  2513. $obj->throw("object of class ".ref($value)." does not implement ".
  2514. "Bio::AnnotationCollectionI. Too bad.")
  2515. unless $value->isa("Bio::AnnotationCollectionI");
  2516. $obj->{'_annotation'} = $value;
  2517. } elsif( ! defined $obj->{'_annotation'}) {
  2518. $obj->{'_annotation'} = Bio::Annotation::Collection->new();
  2519. }
  2520. return $obj->{'_annotation'};
  2521. }
  2522. =head1 Deprecated methods
  2523. =cut
  2524. =head2 no_residues
  2525. Title : no_residues
  2526. Usage : $no = $ali->no_residues
  2527. Function : number of residues in total in the alignment
  2528. Returns : integer
  2529. Argument :
  2530. Note : deprecated in favor of num_residues()
  2531. =cut
  2532. sub no_residues {
  2533. my $self = shift;
  2534. $self->deprecated(-warn_version => 1.0069,
  2535. -throw_version => 1.0075,
  2536. -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
  2537. $self->num_residues(@_);
  2538. }
  2539. =head2 no_sequences
  2540. Title : no_sequences
  2541. Usage : $depth = $ali->no_sequences
  2542. Function : number of sequence in the sequence alignment
  2543. Returns : integer
  2544. Argument :
  2545. Note : deprecated in favor of num_sequences()
  2546. =cut
  2547. sub no_sequences {
  2548. my $self = shift;
  2549. $self->deprecated(-warn_version => 1.0069,
  2550. -throw_version => 1.0075,
  2551. -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
  2552. $self->num_sequences(@_);
  2553. }
  2554. =head2 mask_columns
  2555. Title : mask_columns
  2556. Usage : $aln2 = $aln->mask_columns(20,30)
  2557. Function : Masks a slice of the alignment inclusive of start and
  2558. end columns, and the first column in the alignment is denoted 1.
  2559. Mask beyond the length of the sequence does not do padding.
  2560. Returns : A Bio::SimpleAlign object
  2561. Args : Positive integer for start column, positive integer for end column,
  2562. optional string value use for the mask. Example:
  2563. $aln2 = $aln->mask_columns(20,30,'?')
  2564. Note : Masking must use a character that is not used for gaps or
  2565. frameshifts. These can be adjusted using the relevant global
  2566. variables, but be aware these may be (uncontrollably) modified
  2567. elsewhere within BioPerl (see bug 2715)
  2568. =cut
  2569. sub mask_columns {
  2570. #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
  2571. my $self = shift;
  2572. my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
  2573. $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
  2574. # coordinates are alignment-based, not sequence-based
  2575. my ($start, $end, $mask_char) = @_;
  2576. unless (defined $mask_char) { $mask_char = 'N' }
  2577. $self->throw("Mask start has to be a positive integer and less than ".
  2578. "alignment length, not [$start]")
  2579. unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
  2580. $self->throw("Mask end has to be a positive integer and less than ".
  2581. "alignment length, not [$end]")
  2582. unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
  2583. $self->throw("Mask start [$start] has to be smaller than or equal to ".
  2584. "end [$end]") unless $start <= $end;
  2585. $self->throw("Mask character $mask_char has to be a single character ".
  2586. "and not a gap or frameshift symbol")
  2587. unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
  2588. my $aln = $self->new;
  2589. $aln->id($self->id);
  2590. foreach my $seq ( $self->each_seq() ) {
  2591. my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
  2592. -alphabet => $seq->alphabet,
  2593. -strand => $seq->strand,
  2594. -verbose => $self->verbose);
  2595. # convert from 1-based alignment coords!
  2596. my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
  2597. $masked_string =~ s{[^$nonres]}{$mask_char}g;
  2598. my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
  2599. $new_seq->seq($new_dna_string);
  2600. $aln->add_seq($new_seq);
  2601. }
  2602. return $aln;
  2603. }
  2604. 1;