PageRenderTime 68ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/external/lib/perl/Bio/SimpleAlign.pm

https://bitbucket.org/intogen/intogen-sm
Perl | 1934 lines | 1455 code | 404 blank | 75 comment | 158 complexity | c7debe379a82500d67e1a97a1a291789 MD5 | raw file
Possible License(s): GPL-3.0
  1. # $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $
  2. # BioPerl module for SimpleAlign
  3. #
  4. # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
  5. #
  6. # Copyright Ewan Birney
  7. #
  8. # You may distribute this module under the same terms as perl itself
  9. # POD documentation - main docs before the code
  10. #
  11. # History:
  12. # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
  13. # May 2001 major rewrite - Heikki Lehvaslaiho
  14. =head1 NAME
  15. Bio::SimpleAlign - Multiple alignments held as a set of sequences
  16. =head1 SYNOPSIS
  17. # use Bio::AlignIO to read in the alignment
  18. $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam');
  19. $aln = $str->next_aln();
  20. # some descriptors
  21. print $aln->length, "\n";
  22. print $aln->no_residues, "\n";
  23. print $aln->is_flush, "\n";
  24. print $aln->no_sequences, "\n";
  25. print $aln->percentage_identity, "\n";
  26. print $aln->consensus_string(50), "\n";
  27. # find the position in the alignment for a sequence location
  28. $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  29. # extract sequences and check values for the alignment column $pos
  30. foreach $seq ($aln->each_seq) {
  31. $res = $seq->subseq($pos, $pos);
  32. $count{$res}++;
  33. }
  34. foreach $res (keys %count) {
  35. printf "Res: %s Count: %2d\n", $res, $count{$res};
  36. }
  37. =head1 DESCRIPTION
  38. SimpleAlign handles multiple alignments of sequences. It is very
  39. permissive of types (it won't insist on things being all same length
  40. etc): really it is a SequenceSet explicitly held in memory with a
  41. whole series of built in manipulations and especially file format
  42. systems for read/writing alignments.
  43. SimpleAlign basically views an alignment as an immutable block of
  44. text. SimpleAlign *is not* the object to be using if you want to
  45. perform complex alignment manipulations.
  46. However for lightweight display/formatting and minimal manipulation
  47. (e.g. removing all-gaps columns) - this is the one to use.
  48. SimpleAlign uses a subclass of L<Bio::PrimarySeq> class
  49. L<Bio::LocatableSeq> to store its sequences. These are subsequences
  50. with a start and end positions in the parent reference sequence.
  51. Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in
  52. the alignment, and this is the key for the internal hashes.
  53. (name,start,end is abbreviated nse in the code). However, in many
  54. cases people don't want the name/start-end to be displayed: either
  55. multiple names in an alignment or names specific to the alignment
  56. (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  57. 'displayname', and generally is what is used to print out the
  58. alignment. They default to name/start-end.
  59. The SimpleAlign Module came from Ewan Birney's Align module.
  60. =head1 PROGRESS
  61. SimpleAlign is being slowly converted to bioperl coding standards,
  62. mainly by Ewan.
  63. =over 3
  64. =item Use Bio::Root::Object - done
  65. =item Use proper exceptions - done
  66. =item Use hashed constructor - not done!
  67. =back
  68. =head1 FEEDBACK
  69. =head2 Mailing Lists
  70. User feedback is an integral part of the evolution of this and other
  71. Bioperl modules. Send your comments and suggestions preferably to one
  72. of the Bioperl mailing lists. Your participation is much appreciated.
  73. bioperl-l@bioperl.org - General discussion
  74. http://bioperl.org/MailList.shtml - About the mailing lists
  75. =head2 Reporting Bugs
  76. Report bugs to the Bioperl bug tracking system to help us keep track
  77. the bugs and their resolution. Bug reports can be submitted via email
  78. or the web:
  79. bioperl-bugs@bio.perl.org
  80. http://bugzilla.bioperl.org/
  81. =head1 AUTHOR
  82. Ewan Birney, birney@sanger.ac.uk
  83. =head1 CONTRIBUTORS
  84. Richard Adams, Richard.Adams@ed.ac.uk,
  85. David J. Evans, David.Evans@vir.gla.ac.uk,
  86. Heikki Lehvaslaiho, heikki@ebi.ac.uk,
  87. Allen Smith, allens@cpan.org,
  88. Jason Stajich, jason@bioperl.org,
  89. Anthony Underwood, aunderwood@phls.org.uk,
  90. Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu
  91. =head1 SEE ALSO
  92. L<Bio::LocatableSeq.pm>
  93. =head1 APPENDIX
  94. The rest of the documentation details each of the object
  95. methods. Internal methods are usually preceded with a _
  96. =cut
  97. # 'Let the code begin...
  98. package Bio::SimpleAlign;
  99. use vars qw(@ISA %CONSERVATION_GROUPS);
  100. use strict;
  101. use Bio::Root::Root;
  102. use Bio::LocatableSeq; # uses Seq's as list
  103. use Bio::Align::AlignI;
  104. BEGIN {
  105. # This data should probably be in a more centralized module...
  106. # it is taken from Clustalw documentation
  107. # These are all the positively scoring groups that occur in the
  108. # Gonnet Pam250 matrix. The strong and weak groups are
  109. # defined as strong score >0.5 and weak score =<0.5 respectively.
  110. %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA
  111. NEQK
  112. NHQK
  113. NDEQ
  114. QHRK
  115. MILV
  116. MILF
  117. HY
  118. FYW)
  119. ],
  120. 'weak' => [ qw(CSA
  121. ATV
  122. SAG
  123. STNK
  124. STPA
  125. SGND
  126. SNDEQK
  127. NDEQHK
  128. NEQHRK
  129. FVLIM
  130. HFY) ],
  131. );
  132. }
  133. @ISA = qw(Bio::Root::Root Bio::Align::AlignI);
  134. =head2 new
  135. Title : new
  136. Usage : my $aln = new Bio::SimpleAlign();
  137. Function : Creates a new simple align object
  138. Returns : Bio::SimpleAlign
  139. Args : -source => string representing the source program
  140. where this alignment came from
  141. =cut
  142. sub new {
  143. my($class,@args) = @_;
  144. my $self = $class->SUPER::new(@args);
  145. my ($src) = $self->_rearrange([qw(SOURCE)], @args);
  146. $src && $self->source($src);
  147. # we need to set up internal hashs first!
  148. $self->{'_seq'} = {};
  149. $self->{'_order'} = {};
  150. $self->{'_start_end_lists'} = {};
  151. $self->{'_dis_name'} = {};
  152. $self->{'_id'} = 'NoName';
  153. $self->{'_symbols'} = {};
  154. # maybe we should automatically read in from args. Hmmm...
  155. return $self; # success - we hope!
  156. }
  157. =head1 Modifier methods
  158. These methods modify the MSE by adding, removing or shuffling complete
  159. sequences.
  160. =head2 add_seq
  161. Title : add_seq
  162. Usage : $myalign->add_seq($newseq);
  163. Function : Adds another sequence to the alignment. *Does not* align
  164. it - just adds it to the hashes.
  165. Returns : nothing
  166. Args : a Bio::LocatableSeq object
  167. order (optional)
  168. See L<Bio::LocatableSeq> for more information
  169. =cut
  170. sub addSeq {
  171. my $self = shift;
  172. $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead.");
  173. $self->add_seq(@_);
  174. }
  175. sub add_seq {
  176. my $self = shift;
  177. my $seq = shift;
  178. my $order = shift;
  179. my ($name,$id,$start,$end);
  180. if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
  181. $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
  182. }
  183. $id = $seq->id() ||$seq->display_id || $seq->primary_id;
  184. $start = $seq->start();
  185. $end = $seq->end();
  186. # build the symbol list for this sequence,
  187. # will prune out the gap and missing/match chars
  188. # when actually asked for the symbol list in the
  189. # symbol_chars
  190. map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
  191. if( !defined $order ) {
  192. $order = keys %{$self->{'_seq'}};
  193. }
  194. $name = sprintf("%s/%d-%d",$id,$start,$end);
  195. if( $self->{'_seq'}->{$name} ) {
  196. $self->warn("Replacing one sequence [$name]\n");
  197. }
  198. else {
  199. #print STDERR "Assigning $name to $order\n";
  200. $self->{'_order'}->{$order} = $name;
  201. unless( exists( $self->{'_start_end_lists'}->{$id})) {
  202. $self->{'_start_end_lists'}->{$id} = [];
  203. }
  204. push @{$self->{'_start_end_lists'}->{$id}}, $seq;
  205. }
  206. $self->{'_seq'}->{$name} = $seq;
  207. }
  208. =head2 remove_seq
  209. Title : remove_seq
  210. Usage : $aln->remove_seq($seq);
  211. Function : Removes a single sequence from an alignment
  212. Returns :
  213. Argument : a Bio::LocatableSeq object
  214. =cut
  215. sub removeSeq {
  216. my $self = shift;
  217. $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead.");
  218. $self->remove_seq(@_);
  219. }
  220. sub remove_seq {
  221. my $self = shift;
  222. my $seq = shift;
  223. my ($name,$id,$start,$end);
  224. $self->throw("Need Bio::Locatable seq argument ")
  225. unless ref $seq && $seq->isa('Bio::LocatableSeq');
  226. $id = $seq->id();
  227. $start = $seq->start();
  228. $end = $seq->end();
  229. $name = sprintf("%s/%d-%d",$id,$start,$end);
  230. if( !exists $self->{'_seq'}->{$name} ) {
  231. $self->throw("Sequence $name does not exist in the alignment to remove!");
  232. }
  233. delete $self->{'_seq'}->{$name};
  234. # we need to remove this seq from the start_end_lists hash
  235. if (exists $self->{'_start_end_lists'}->{$id}) {
  236. # we need to find the sequence in the array.
  237. my ($i, $found);;
  238. for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
  239. if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
  240. $found = 1;
  241. last;
  242. }
  243. }
  244. if ($found) {
  245. splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
  246. }
  247. else {
  248. $self->throw("Could not find the sequence to remoce from the start-end list");
  249. }
  250. }
  251. else {
  252. $self->throw("There is no seq list for the name $id");
  253. }
  254. return 1;
  255. # we can't do anything about the order hash but that is ok
  256. # because each_seq will handle it
  257. }
  258. =head2 purge
  259. Title : purge
  260. Usage : $aln->purge(0.7);
  261. Function:
  262. Removes sequences above given sequence similarity
  263. This function will grind on large alignments. Beware!
  264. Example :
  265. Returns : An array of the removed sequences
  266. Args : float, threshold for similarity
  267. =cut
  268. sub purge {
  269. my ($self,$perc) = @_;
  270. my (%duplicate, @dups);
  271. my @seqs = $self->each_seq();
  272. for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
  273. my $seq = $seqs[$i];
  274. #skip if already in duplicate hash
  275. next if exists $duplicate{$seq->display_id} ;
  276. my $one = $seq->seq();
  277. my @one = split '', $one; #split to get 1aa per array element
  278. for (my $j=$i+1;$j < @seqs;$j++) {
  279. my $seq2 = $seqs[$j];
  280. #skip if already in duplicate hash
  281. next if exists $duplicate{$seq2->display_id} ;
  282. my $two = $seq2->seq();
  283. my @two = split '', $two;
  284. my $count = 0;
  285. my $res = 0;
  286. for (my $k=0;$k<@one;$k++) {
  287. if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
  288. $one[$k] eq $two[$k]) {
  289. $count++;
  290. }
  291. if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
  292. $two[$k] ne '.' && $two[$k] ne '-' ) {
  293. $res++;
  294. }
  295. }
  296. my $ratio = 0;
  297. $ratio = $count/$res unless $res == 0;
  298. # if above threshold put in duplicate hash and push onto
  299. # duplicate array for returning to get_unique
  300. if ( $ratio > $perc ) {
  301. print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0;
  302. $duplicate{$seq2->display_id} = 1;
  303. push @dups, $seq2;
  304. }
  305. }
  306. }
  307. foreach my $seq (@dups) {
  308. $self->remove_seq($seq);
  309. }
  310. return @dups;
  311. }
  312. =head2 sort_alphabetically
  313. Title : sort_alphabetically
  314. Usage : $ali->sort_alphabetically
  315. Function :
  316. Changes the order of the alignemnt to alphabetical on name
  317. followed by numerical by number.
  318. Returns :
  319. Argument :
  320. =cut
  321. sub sort_alphabetically {
  322. my $self = shift;
  323. my ($seq,$nse,@arr,%hash,$count);
  324. foreach $seq ( $self->each_seq() ) {
  325. $nse = $seq->get_nse;
  326. $hash{$nse} = $seq;
  327. }
  328. $count = 0;
  329. %{$self->{'_order'}} = (); # reset the hash;
  330. foreach $nse ( sort _alpha_startend keys %hash) {
  331. $self->{'_order'}->{$count} = $nse;
  332. $count++;
  333. }
  334. 1;
  335. }
  336. =head1 Sequence selection methods
  337. Methods returning one or more sequences objects.
  338. =head2 each_seq
  339. Title : each_seq
  340. Usage : foreach $seq ( $align->each_seq() )
  341. Function : Gets an array of Seq objects from the alignment
  342. Returns : an array
  343. Argument :
  344. =cut
  345. sub eachSeq {
  346. my $self = shift;
  347. $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead.");
  348. $self->each_seq();
  349. }
  350. sub each_seq {
  351. my $self = shift;
  352. my (@arr,$order);
  353. foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
  354. if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
  355. push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
  356. }
  357. }
  358. return @arr;
  359. }
  360. =head2 each_alphabetically
  361. Title : each_alphabetically
  362. Usage : foreach $seq ( $ali->each_alphabetically() )
  363. Function :
  364. Returns an array of sequence object sorted alphabetically
  365. by name and then by start point.
  366. Does not change the order of the alignment
  367. Returns :
  368. Argument :
  369. =cut
  370. sub each_alphabetically {
  371. my $self = shift;
  372. my ($seq,$nse,@arr,%hash,$count);
  373. foreach $seq ( $self->each_seq() ) {
  374. $nse = $seq->get_nse;
  375. $hash{$nse} = $seq;
  376. }
  377. foreach $nse ( sort _alpha_startend keys %hash) {
  378. push(@arr,$hash{$nse});
  379. }
  380. return @arr;
  381. }
  382. sub _alpha_startend {
  383. my ($aname,$astart,$bname,$bstart);
  384. ($aname,$astart) = split (/-/,$a);
  385. ($bname,$bstart) = split (/-/,$b);
  386. if( $aname eq $bname ) {
  387. return $astart <=> $bstart;
  388. }
  389. else {
  390. return $aname cmp $bname;
  391. }
  392. }
  393. =head2 each_seq_with_id
  394. Title : each_seq_with_id
  395. Usage : foreach $seq ( $align->each_seq_with_id() )
  396. Function :
  397. Gets an array of Seq objects from the
  398. alignment, the contents being those sequences
  399. with the given name (there may be more than one)
  400. Returns : an array
  401. Argument : a seq name
  402. =cut
  403. sub eachSeqWithId {
  404. my $self = shift;
  405. $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
  406. $self->each_seq_with_id(@_);
  407. }
  408. sub each_seq_with_id {
  409. my $self = shift;
  410. my $id = shift;
  411. $self->throw("Method each_seq_with_id needs a sequence name argument")
  412. unless defined $id;
  413. my (@arr, $seq);
  414. if (exists($self->{'_start_end_lists'}->{$id})) {
  415. @arr = @{$self->{'_start_end_lists'}->{$id}};
  416. }
  417. return @arr;
  418. }
  419. =head2 get_seq_by_pos
  420. Title : get_seq_by_pos
  421. Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
  422. Function :
  423. Gets a sequence based on its position in the alignment.
  424. Numbering starts from 1. Sequence positions larger than
  425. no_sequences() will thow an error.
  426. Returns : a Bio::LocatableSeq object
  427. Args : positive integer for the sequence osition
  428. =cut
  429. sub get_seq_by_pos {
  430. my $self = shift;
  431. my ($pos) = @_;
  432. $self->throw("Sequence position has to be a positive integer, not [$pos]")
  433. unless $pos =~ /^\d+$/ and $pos > 0;
  434. $self->throw("No sequence at position [$pos]")
  435. unless $pos <= $self->no_sequences ;
  436. my $nse = $self->{'_order'}->{--$pos};
  437. return $self->{'_seq'}->{$nse};
  438. }
  439. =head1 Create new alignments
  440. The result of these methods are horizontal or vertical subsets of the
  441. current MSE.
  442. =head2 select
  443. Title : select
  444. Usage : $aln2 = $aln->select(1, 3) # three first sequences
  445. Function :
  446. Creates a new alignment from a continuous subset of
  447. sequences. Numbering starts from 1. Sequence positions
  448. larger than no_sequences() will thow an error.
  449. Returns : a Bio::SimpleAlign object
  450. Args : positive integer for the first sequence
  451. positive integer for the last sequence to include (optional)
  452. =cut
  453. sub select {
  454. my $self = shift;
  455. my ($start, $end) = @_;
  456. $self->throw("Select start has to be a positive integer, not [$start]")
  457. unless $start =~ /^\d+$/ and $start > 0;
  458. $self->throw("Select end has to be a positive integer, not [$end]")
  459. unless $end =~ /^\d+$/ and $end > 0;
  460. $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
  461. unless $start <= $end;
  462. my $aln = new $self;
  463. foreach my $pos ($start .. $end) {
  464. $aln->add_seq($self->get_seq_by_pos($pos));
  465. }
  466. $aln->id($self->id);
  467. return $aln;
  468. }
  469. =head2 select_noncont
  470. Title : select_noncont
  471. Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences
  472. Function :
  473. Creates a new alignment from a subset of
  474. sequences. Numbering starts from 1. Sequence positions
  475. larger than no_sequences() will thow an error.
  476. Returns : a Bio::SimpleAlign object
  477. Args : array of integers for the sequences
  478. =cut
  479. sub select_noncont {
  480. my $self = shift;
  481. my (@pos) = @_;
  482. my $end = $self->no_sequences;
  483. foreach ( @pos ) {
  484. $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
  485. unless( /^\d+$/ && $_ > 0 && $_ <= $end );
  486. }
  487. my $aln = new $self;
  488. foreach my $p (@pos) {
  489. $aln->add_seq($self->get_seq_by_pos($p));
  490. }
  491. $aln->id($self->id);
  492. return $aln;
  493. }
  494. =head2 slice
  495. Title : slice
  496. Usage : $aln2 = $aln->slice(20, 30)
  497. Function :
  498. Creates a slice from the alignment inclusive of start and
  499. end columns. Sequences with no residues in the slice are
  500. excluded from the new alignment and a warning is printed.
  501. Slice beyond the length of the sequence does not do
  502. padding.
  503. Returns : a Bio::SimpleAlign object
  504. Args : positive integer for start column
  505. positive integer for end column
  506. =cut
  507. sub slice {
  508. my $self = shift;
  509. my ($start, $end) = @_;
  510. $self->throw("Slice start has to be a positive integer, not [$start]")
  511. unless $start =~ /^\d+$/ and $start > 0;
  512. $self->throw("Slice end has to be a positive integer, not [$end]")
  513. unless $end =~ /^\d+$/ and $end > 0;
  514. $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]")
  515. unless $start <= $end;
  516. my $aln_length = $self->length;
  517. $self->throw("This alignment has only ". $self->length.
  518. " residues. Slice start [$start] is too bigger.")
  519. if $start > $self->length;
  520. my $aln = new $self;
  521. $aln->id($self->id);
  522. foreach my $seq ( $self->each_seq() ) {
  523. my $new_seq = new Bio::LocatableSeq (-id => $seq->id);
  524. # seq
  525. my $seq_end = $end;
  526. $seq_end = $seq->length if $end > $seq->length;
  527. my $slice_seq = $seq->subseq($start, $seq_end);
  528. $new_seq->seq( $slice_seq );
  529. # start
  530. if ($start > 1) {
  531. my $pre_start_seq = $seq->subseq(1, $start - 1);
  532. $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n";
  533. $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
  534. } else {
  535. $new_seq->start( $seq->start);
  536. }
  537. # end
  538. $slice_seq =~ s/\W//g;
  539. $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
  540. if ($new_seq->start and $new_seq->end >= $new_seq->start) {
  541. $aln->add_seq($new_seq);
  542. } else {
  543. my $nse = $seq->get_nse();
  544. $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
  545. " Sequence excluded from the new alignment.");
  546. }
  547. }
  548. return $aln;
  549. }
  550. =head2 remove_columns
  551. Title : remove_column
  552. Usage : $aln2 = $aln->remove_columns(['mismatch','weak'])
  553. Function :
  554. Creates an aligment with columns removed corresponding to
  555. the specified criteria.
  556. Returns : a L<Bio::SimpleAlign> object
  557. Args : array ref of types, 'match'|'weak'|'strong'|'mismatch'
  558. =cut
  559. sub remove_columns{
  560. my ($self,$type) = @_;
  561. my %matchchars = ( 'match' => '\*',
  562. 'weak' => '\.',
  563. 'strong' => ':',
  564. 'mismatch'=> ' ',
  565. );
  566. #get the characters to delete against
  567. my $del_char;
  568. foreach my $type(@{$type}){
  569. $del_char.= $matchchars{$type};
  570. }
  571. my $match_line = $self->match_line;
  572. my $aln = new $self;
  573. my @remove;
  574. my $length = 0;
  575. #do the matching to get the segments to remove
  576. while($match_line=~m/[$del_char]/g){
  577. my $start = pos($match_line)-1;
  578. $match_line=~/\G[$del_char]+/gc;
  579. my $end = pos($match_line)-1;
  580. #have to offset the start and end for subsequent removes
  581. $start-=$length;
  582. $end -=$length;
  583. $length += ($end-$start+1);
  584. push @remove, [$start,$end];
  585. }
  586. #remove the segments
  587. $aln = $self->_remove_col($aln,\@remove);
  588. return $aln;
  589. }
  590. sub _remove_col {
  591. my ($self,$aln,$remove) = @_;
  592. my @new;
  593. #splice out the segments and create new seq
  594. foreach my $seq($self->each_seq){
  595. my $new_seq = new Bio::LocatableSeq(-id=>$seq->id);
  596. my $sequence;
  597. foreach my $pair(@{$remove}){
  598. my $start = $pair->[0];
  599. my $end = $pair->[1];
  600. $sequence = $seq->seq unless $sequence;
  601. my $spliced;
  602. $spliced .= $start > 0 ? substr($sequence,0,$start) : '';
  603. $spliced .= substr($sequence,$end+1,$seq->length-$end+1);
  604. $sequence = $spliced;
  605. if ($start == 1) {
  606. $new_seq->start($end);
  607. }
  608. else {
  609. $new_seq->start( $seq->start);
  610. }
  611. # end
  612. if($end >= $seq->end){
  613. $new_seq->end( $start);
  614. }
  615. else {
  616. $new_seq->end($seq->end);
  617. }
  618. }
  619. $new_seq->seq($sequence);
  620. push @new, $new_seq;
  621. }
  622. #add the new seqs to the alignment
  623. foreach my $new(@new){
  624. $aln->add_seq($new);
  625. }
  626. return $aln;
  627. }
  628. =head1 Change sequences within the MSE
  629. These methods affect characters in all sequences without changeing the
  630. alignment.
  631. =head2 map_chars
  632. Title : map_chars
  633. Usage : $ali->map_chars('\.','-')
  634. Function :
  635. Does a s/$arg1/$arg2/ on the sequences. Useful for gap
  636. characters
  637. Notice that the from (arg1) is interpretted as a regex,
  638. so be careful about quoting meta characters (eg
  639. $ali->map_chars('.','-') wont do what you want)
  640. Returns :
  641. Argument : 'from' rexexp
  642. 'to' string
  643. =cut
  644. sub map_chars {
  645. my $self = shift;
  646. my $from = shift;
  647. my $to = shift;
  648. my ($seq,$temp);
  649. $self->throw("Need exactly two arguments")
  650. unless defined $from and defined $to;
  651. foreach $seq ( $self->each_seq() ) {
  652. $temp = $seq->seq();
  653. $temp =~ s/$from/$to/g;
  654. $seq->seq($temp);
  655. }
  656. return 1;
  657. }
  658. =head2 uppercase
  659. Title : uppercase()
  660. Usage : $ali->uppercase()
  661. Function : Sets all the sequences to uppercase
  662. Returns :
  663. Argument :
  664. =cut
  665. sub uppercase {
  666. my $self = shift;
  667. my $seq;
  668. my $temp;
  669. foreach $seq ( $self->each_seq() ) {
  670. $temp = $seq->seq();
  671. $temp =~ tr/[a-z]/[A-Z]/;
  672. $seq->seq($temp);
  673. }
  674. return 1;
  675. }
  676. =head2 cigar_line
  677. Title : cigar_line()
  678. Usage : $align->cigar_line()
  679. Function : Generates a "cigar" line for each sequence in the alignment
  680. The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58
  681. where A,B,C,etc. are the sequence identifiers, and the numbers
  682. refer to conserved positions within the alignment
  683. Args : none
  684. =cut
  685. sub cigar_line {
  686. my ($self) = @_;
  687. my %cigar;
  688. my %clines;
  689. my @seqchars;
  690. my $seqcount = 0;
  691. my $sc;
  692. foreach my $seq ( $self->each_seq ) {
  693. push @seqchars, [ split(//, uc ($seq->seq)) ];
  694. $sc = scalar(@seqchars);
  695. }
  696. foreach my $pos ( 0..$self->length ) {
  697. my $i=0;
  698. foreach my $seq ( @seqchars ) {
  699. $i++;
  700. # print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n";
  701. if ($seq->[$pos] eq '.') {
  702. if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) {
  703. $clines{$i}.=$cigar{$i};
  704. }
  705. }
  706. else {
  707. if (! defined $cigar{$i}) {
  708. $clines{$i}.=($pos+1).",";
  709. }
  710. $cigar{$i}=$pos+1;
  711. }
  712. if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) {
  713. $clines{$i}.=$cigar{$i};
  714. }
  715. }
  716. }
  717. for(my $i=1; $i<$sc+1;$i++) {
  718. print STDERR "Seq $i cigar line ".$clines{$i}."\n";
  719. }
  720. return %clines;
  721. }
  722. =head2 match_line
  723. Title : match_line()
  724. Usage : $align->match_line()
  725. Function : Generates a match line - much like consensus string
  726. except that a line indicating the '*' for a match.
  727. Args : (optional) Match line characters ('*' by default)
  728. (optional) Strong match char (':' by default)
  729. (optional) Weak match char ('.' by default)
  730. =cut
  731. sub match_line {
  732. my ($self,$matchlinechar, $strong, $weak) = @_;
  733. my %matchchars = ( 'match' => $matchlinechar || '*',
  734. 'weak' => $weak || '.',
  735. 'strong' => $strong || ':',
  736. 'mismatch'=> ' ',
  737. );
  738. my @seqchars;
  739. my $seqcount = 0;
  740. my $alphabet;
  741. foreach my $seq ( $self->each_seq ) {
  742. push @seqchars, [ split(//, uc ($seq->seq)) ];
  743. $alphabet = $seq->alphabet unless defined $alphabet;
  744. }
  745. my $refseq = shift @seqchars;
  746. # let's just march down the columns
  747. my $matchline;
  748. POS: foreach my $pos ( 0..$self->length ) {
  749. my $refchar = $refseq->[$pos];
  750. next unless $refchar; # skip ''
  751. my %col = ($refchar => 1);
  752. my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
  753. foreach my $seq ( @seqchars ) {
  754. $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
  755. $seq->[$pos] eq ' ' );
  756. $col{$seq->[$pos]}++;
  757. }
  758. my @colresidues = sort keys %col;
  759. my $char = $matchchars{'mismatch'};
  760. # if all the values are the same
  761. if( $dash ) { $char = $matchchars{'mismatch'} }
  762. elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
  763. elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
  764. # matches for protein seqs
  765. TYPE: foreach my $type ( qw(strong weak) ) {
  766. # iterate through categories
  767. my %groups;
  768. # iterate through each of the aa in the col
  769. # look to see which groups it is in
  770. foreach my $c ( @colresidues ) {
  771. foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) {
  772. push @{$groups{$f}},$c;
  773. }
  774. }
  775. GRP: foreach my $cols ( values %groups ) {
  776. @$cols = sort @$cols;
  777. # now we are just testing to see if two arrays
  778. # are identical w/o changing either one
  779. # have to be same len
  780. next if( scalar @$cols != scalar @colresidues );
  781. # walk down the length and check each slot
  782. for($_=0;$_ < (scalar @$cols);$_++ ) {
  783. next GRP if( $cols->[$_] ne $colresidues[$_] );
  784. }
  785. $char = $matchchars{$type};
  786. last TYPE;
  787. }
  788. }
  789. }
  790. $matchline .= $char;
  791. }
  792. return $matchline;
  793. }
  794. =head2 match
  795. Title : match()
  796. Usage : $ali->match()
  797. Function :
  798. Goes through all columns and changes residues that are
  799. identical to residue in first sequence to match '.'
  800. character. Sets match_char.
  801. USE WITH CARE: Most MSE formats do not support match
  802. characters in sequences, so this is mostly for output
  803. only. NEXUS format (Bio::AlignIO::nexus) can handle
  804. it.
  805. Returns : 1
  806. Argument : a match character, optional, defaults to '.'
  807. =cut
  808. sub match {
  809. my ($self, $match) = @_;
  810. $match ||= '.';
  811. my ($matching_char) = $match;
  812. $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
  813. $self->map_chars($matching_char, '-');
  814. my @seqs = $self->each_seq();
  815. return 1 unless scalar @seqs > 1;
  816. my $refseq = shift @seqs ;
  817. my @refseq = split //, $refseq->seq;
  818. my $gapchar = $self->gap_char;
  819. foreach my $seq ( @seqs ) {
  820. my @varseq = split //, $seq->seq();
  821. for ( my $i=0; $i < scalar @varseq; $i++) {
  822. $varseq[$i] = $match if defined $refseq[$i] &&
  823. ( $refseq[$i] =~ /[A-Za-z\*]/ ||
  824. $refseq[$i] =~ /$gapchar/ )
  825. && $refseq[$i] eq $varseq[$i];
  826. }
  827. $seq->seq(join '', @varseq);
  828. }
  829. $self->match_char($match);
  830. return 1;
  831. }
  832. =head2 unmatch
  833. Title : unmatch()
  834. Usage : $ali->unmatch()
  835. Function : Undoes the effect of method match. Unsets match_char.
  836. Returns : 1
  837. Argument : a match character, optional, defaults to '.'
  838. See L<match> and L<match_char>
  839. =cut
  840. sub unmatch {
  841. my ($self, $match) = @_;
  842. $match ||= '.';
  843. my @seqs = $self->each_seq();
  844. return 1 unless scalar @seqs > 1;
  845. my $refseq = shift @seqs ;
  846. my @refseq = split //, $refseq->seq;
  847. my $gapchar = $self->gap_char;
  848. foreach my $seq ( @seqs ) {
  849. my @varseq = split //, $seq->seq();
  850. for ( my $i=0; $i < scalar @varseq; $i++) {
  851. $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
  852. ( $refseq[$i] =~ /[A-Za-z\*]/ ||
  853. $refseq[$i] =~ /$gapchar/ ) &&
  854. $varseq[$i] eq $match;
  855. }
  856. $seq->seq(join '', @varseq);
  857. }
  858. $self->match_char('');
  859. return 1;
  860. }
  861. =head1 MSE attibutes
  862. Methods for setting and reading the MSE attributes.
  863. Note that the methods defining character semantics depend on the user
  864. to set them sensibly. They are needed only by certain input/output
  865. methods. Unset them by setting to an empty string ('').
  866. =head2 id
  867. Title : id
  868. Usage : $myalign->id("Ig")
  869. Function : Gets/sets the id field of the alignment
  870. Returns : An id string
  871. Argument : An id string (optional)
  872. =cut
  873. sub id {
  874. my ($self, $name) = @_;
  875. if (defined( $name )) {
  876. $self->{'_id'} = $name;
  877. }
  878. return $self->{'_id'};
  879. }
  880. =head2 missing_char
  881. Title : missing_char
  882. Usage : $myalign->missing_char("?")
  883. Function : Gets/sets the missing_char attribute of the alignment
  884. It is generally recommended to set it to 'n' or 'N'
  885. for nucleotides and to 'X' for protein.
  886. Returns : An missing_char string,
  887. Argument : An missing_char string (optional)
  888. =cut
  889. sub missing_char {
  890. my ($self, $char) = @_;
  891. if (defined $char ) {
  892. $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
  893. $self->{'_missing_char'} = $char;
  894. }
  895. return $self->{'_missing_char'};
  896. }
  897. =head2 match_char
  898. Title : match_char
  899. Usage : $myalign->match_char('.')
  900. Function : Gets/sets the match_char attribute of the alignment
  901. Returns : An match_char string,
  902. Argument : An match_char string (optional)
  903. =cut
  904. sub match_char {
  905. my ($self, $char) = @_;
  906. if (defined $char ) {
  907. $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
  908. $self->{'_match_char'} = $char;
  909. }
  910. return $self->{'_match_char'};
  911. }
  912. =head2 gap_char
  913. Title : gap_char
  914. Usage : $myalign->gap_char('-')
  915. Function : Gets/sets the gap_char attribute of the alignment
  916. Returns : An gap_char string, defaults to '-'
  917. Argument : An gap_char string (optional)
  918. =cut
  919. sub gap_char {
  920. my ($self, $char) = @_;
  921. if (defined $char || ! defined $self->{'_gap_char'} ) {
  922. $char= '-' unless defined $char;
  923. $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
  924. $self->{'_gap_char'} = $char;
  925. }
  926. return $self->{'_gap_char'};
  927. }
  928. =head2 symbol_chars
  929. Title : symbol_chars
  930. Usage : my @symbolchars = $aln->symbol_chars;
  931. Function: Returns all the seen symbols (other than gaps)
  932. Returns : array of characters that are the seen symbols
  933. Args : boolean to include the gap/missing/match characters
  934. =cut
  935. sub symbol_chars{
  936. my ($self,$includeextra) = @_;
  937. if( ! defined $self->{'_symbols'} ) {
  938. $self->warn("Symbol list was not initialized");
  939. return ();
  940. }
  941. my %copy = %{$self->{'_symbols'}};
  942. if( ! $includeextra ) {
  943. foreach my $char ( $self->gap_char, $self->match_char,
  944. $self->missing_char) {
  945. delete $copy{$char} if( defined $char );
  946. }
  947. }
  948. return keys %copy;
  949. }
  950. =head1 Alignment descriptors
  951. These read only methods describe the MSE in various ways.
  952. =head2 consensus_string
  953. Title : consensus_string
  954. Usage : $str = $ali->consensus_string($threshold_percent)
  955. Function : Makes a strict consensus
  956. Returns :
  957. Argument : Optional treshold ranging from 0 to 100.
  958. The consensus residue has to appear at least threshold %
  959. of the sequences at a given location, otherwise a '?'
  960. character will be placed at that location.
  961. (Default value = 0%)
  962. =cut
  963. sub consensus_string {
  964. my $self = shift;
  965. my $threshold = shift;
  966. my $len;
  967. my ($out,$count);
  968. $out = "";
  969. $len = $self->length - 1;
  970. foreach $count ( 0 .. $len ) {
  971. $out .= $self->_consensus_aa($count,$threshold);
  972. }
  973. return $out;
  974. }
  975. sub _consensus_aa {
  976. my $self = shift;
  977. my $point = shift;
  978. my $threshold_percent = shift || -1 ;
  979. my ($seq,%hash,$count,$letter,$key);
  980. foreach $seq ( $self->each_seq() ) {
  981. $letter = substr($seq->seq,$point,1);
  982. $self->throw("--$point-----------") if $letter eq '';
  983. ($letter =~ /\./) && next;
  984. # print "Looking at $letter\n";
  985. $hash{$letter}++;
  986. }
  987. my $number_of_sequences = $self->no_sequences();
  988. my $threshold = $number_of_sequences * $threshold_percent / 100. ;
  989. $count = -1;
  990. $letter = '?';
  991. foreach $key ( sort keys %hash ) {
  992. # print "Now at $key $hash{$key}\n";
  993. if( $hash{$key} > $count && $hash{$key} >= $threshold) {
  994. $letter = $key;
  995. $count = $hash{$key};
  996. }
  997. }
  998. return $letter;
  999. }
  1000. =head2 consensus_iupac
  1001. Title : consensus_iupac
  1002. Usage : $str = $ali->consensus_iupac()
  1003. Function :
  1004. Makes a consensus using IUPAC ambiguity codes from DNA
  1005. and RNA. The output is in upper case except when gaps in
  1006. a column force output to be in lower case.
  1007. Note that if your alignment sequences contain a lot of
  1008. IUPAC ambiquity codes you often have to manually set
  1009. alphabet. Bio::PrimarySeq::_guess_type thinks they
  1010. indicate a protein sequence.
  1011. Returns : consensus string
  1012. Argument : none
  1013. Throws : on protein sequences
  1014. =cut
  1015. sub consensus_iupac {
  1016. my $self = shift;
  1017. my $out = "";
  1018. my $len = $self->length-1;
  1019. # only DNA and RNA sequences are valid
  1020. foreach my $seq ( $self->each_seq() ) {
  1021. $self->throw("Seq [". $seq->get_nse. "] is a protein")
  1022. if $seq->alphabet eq 'protein';
  1023. }
  1024. # loop over the alignment columns
  1025. foreach my $count ( 0 .. $len ) {
  1026. $out .= $self->_consensus_iupac($count);
  1027. }
  1028. return $out;
  1029. }
  1030. sub _consensus_iupac {
  1031. my ($self, $column) = @_;
  1032. my ($string, $char, $rna);
  1033. #determine all residues in a column
  1034. foreach my $seq ( $self->each_seq() ) {
  1035. $string .= substr($seq->seq, $column, 1);
  1036. }
  1037. $string = uc $string;
  1038. # quick exit if there's an N in the string
  1039. if ($string =~ /N/) {
  1040. $string =~ /\W/ ? return 'n' : return 'N';
  1041. }
  1042. # ... or if there are only gap characters
  1043. return '-' if $string =~ /^\W+$/;
  1044. # treat RNA as DNA in regexps
  1045. if ($string =~ /U/) {
  1046. $string =~ s/U/T/;
  1047. $rna = 1;
  1048. }
  1049. # the following s///'s only need to be done to the _first_ ambiguity code
  1050. # as we only need to see the _range_ of characters in $string
  1051. if ($string =~ /[VDHB]/) {
  1052. $string =~ s/V/AGC/;
  1053. $string =~ s/D/AGT/;
  1054. $string =~ s/H/ACT/;
  1055. $string =~ s/B/CTG/;
  1056. }
  1057. if ($string =~ /[SKYRWM]/) {
  1058. $string =~ s/S/GC/;
  1059. $string =~ s/K/GT/;
  1060. $string =~ s/Y/CT/;
  1061. $string =~ s/R/AG/;
  1062. $string =~ s/W/AT/;
  1063. $string =~ s/M/AC/;
  1064. }
  1065. # and now the guts of the thing
  1066. if ($string =~ /A/) {
  1067. $char = 'A'; # A A
  1068. if ($string =~ /G/) {
  1069. $char = 'R'; # A and G (purines) R
  1070. if ($string =~ /C/) {
  1071. $char = 'V'; # A and G and C V
  1072. if ($string =~ /T/) {
  1073. $char = 'N'; # A and G and C and T N
  1074. }
  1075. } elsif ($string =~ /T/) {
  1076. $char = 'D'; # A and G and T D
  1077. }
  1078. } elsif ($string =~ /C/) {
  1079. $char = 'M'; # A and C M
  1080. if ($string =~ /T/) {
  1081. $char = 'H'; # A and C and T H
  1082. }
  1083. } elsif ($string =~ /T/) {
  1084. $char = 'W'; # A and T W
  1085. }
  1086. } elsif ($string =~ /C/) {
  1087. $char = 'C'; # C C
  1088. if ($string =~ /T/) {
  1089. $char = 'Y'; # C and T (pyrimidines) Y
  1090. if ($string =~ /G/) {
  1091. $char = 'B'; # C and T and G B
  1092. }
  1093. } elsif ($string =~ /G/) {
  1094. $char = 'S'; # C and G S
  1095. }
  1096. } elsif ($string =~ /G/) {
  1097. $char = 'G'; # G G
  1098. if ($string =~ /C/) {
  1099. $char = 'S'; # G and C S
  1100. } elsif ($string =~ /T/) {
  1101. $char = 'K'; # G and T K
  1102. }
  1103. } elsif ($string =~ /T/) {
  1104. $char = 'T'; # T T
  1105. }
  1106. $char = 'U' if $rna and $char eq 'T';
  1107. $char = lc $char if $string =~ /\W/;
  1108. return $char;
  1109. }
  1110. =head2 is_flush
  1111. Title : is_flush
  1112. Usage : if( $ali->is_flush() )
  1113. :
  1114. :
  1115. Function : Tells you whether the alignment
  1116. : is flush, ie all of the same length
  1117. :
  1118. :
  1119. Returns : 1 or 0
  1120. Argument :
  1121. =cut
  1122. sub is_flush {
  1123. my ($self,$report) = @_;
  1124. my $seq;
  1125. my $length = (-1);
  1126. my $temp;
  1127. foreach $seq ( $self->each_seq() ) {
  1128. if( $length == (-1) ) {
  1129. $length = CORE::length($seq->seq());
  1130. next;
  1131. }
  1132. $temp = CORE::length($seq->seq());
  1133. if( $temp != $length ) {
  1134. $self->warn("expecting $length not $temp from ".
  1135. $seq->display_id) if( $report );
  1136. $self->debug("expecting $length not $temp from ".
  1137. $seq->display_id);
  1138. $self->debug($seq->seq(). "\n");
  1139. return 0;
  1140. }
  1141. }
  1142. return 1;
  1143. }
  1144. =head2 length
  1145. Title : length()
  1146. Usage : $len = $ali->length()
  1147. Function : Returns the maximum length of the alignment.
  1148. To be sure the alignment is a block, use is_flush
  1149. Returns :
  1150. Argument :
  1151. =cut
  1152. sub length_aln {
  1153. my $self = shift;
  1154. $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead.");
  1155. $self->length(@_);
  1156. }
  1157. sub length {
  1158. my $self = shift;
  1159. my $seq;
  1160. my $length = (-1);
  1161. my ($temp,$len);
  1162. foreach $seq ( $self->each_seq() ) {
  1163. $temp = CORE::length($seq->seq());
  1164. if( $temp > $length ) {
  1165. $length = $temp;
  1166. }
  1167. }
  1168. return $length;
  1169. }
  1170. =head2 maxdisplayname_length
  1171. Title : maxdisplayname_length
  1172. Usage : $ali->maxdisplayname_length()
  1173. Function :
  1174. Gets the maximum length of the displayname in the
  1175. alignment. Used in writing out various MSE formats.
  1176. Returns : integer
  1177. Argument :
  1178. =cut
  1179. sub maxname_length {
  1180. my $self = shift;
  1181. $self->warn(ref($self). "::maxname_length - deprecated method.".
  1182. " Use maxdisplayname_length() instead.");
  1183. $self->maxdisplayname_length();
  1184. }
  1185. sub maxnse_length {
  1186. my $self = shift;
  1187. $self->warn(ref($self). "::maxnse_length - deprecated method.".
  1188. " Use maxnse_length() instead.");
  1189. $self->maxdisplayname_length();
  1190. }
  1191. sub maxdisplayname_length {
  1192. my $self = shift;
  1193. my $maxname = (-1);
  1194. my ($seq,$len);
  1195. foreach $seq ( $self->each_seq() ) {
  1196. $len = CORE::length $self->displayname($seq->get_nse());
  1197. if( $len > $maxname ) {
  1198. $maxname = $len;
  1199. }
  1200. }
  1201. return $maxname;
  1202. }
  1203. =head2 no_residues
  1204. Title : no_residues
  1205. Usage : $no = $ali->no_residues
  1206. Function : number of residues in total in the alignment
  1207. Returns : integer
  1208. Argument :
  1209. =cut
  1210. sub no_residues {
  1211. my $self = shift;
  1212. my $count = 0;
  1213. foreach my $seq ($self->each_seq) {
  1214. my $str = $seq->seq();
  1215. $count += ($str =~ s/[^A-Za-z]//g);
  1216. }
  1217. return $count;
  1218. }
  1219. =head2 no_sequences
  1220. Title : no_sequences
  1221. Usage : $depth = $ali->no_sequences
  1222. Function : number of sequence in the sequence alignment
  1223. Returns : integer
  1224. Argument :
  1225. =cut
  1226. sub no_sequences {
  1227. my $self = shift;
  1228. return scalar($self->each_seq);
  1229. }
  1230. =head2 average_percentage_identity
  1231. Title : average_percentage_identity
  1232. Usage : $id = $align->average_percentage_identity
  1233. Function: The function uses a fast method to calculate the average
  1234. percentage identity of the alignment
  1235. Returns : The average percentage identity of the alignment
  1236. Args : None
  1237. Notes : This method implemented by Kevin Howe calculates a figure that is
  1238. designed to be similar to the average pairwise identity of the
  1239. alignment (identical in the absence of gaps), without having to
  1240. explicitly calculate pairwise identities proposed by Richard Durbin.
  1241. Validated by Ewan Birney ad Alex Bateman.
  1242. =cut
  1243. sub average_percentage_identity{
  1244. my ($self,@args) = @_;
  1245. my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
  1246. 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
  1247. my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
  1248. if (! $self->is_flush()) {
  1249. $self->throw("All sequences in the alignment must be the same length");
  1250. }
  1251. @seqs = $self->each_seq();
  1252. $len = $self->length();
  1253. # load the each hash with correct keys for existence checks
  1254. for( my $index=0; $index < $len; $index++) {
  1255. foreach my $letter (@alphabet) {
  1256. $countHashes[$index]->{$letter} = 0;
  1257. }
  1258. }
  1259. foreach my $seq (@seqs) {
  1260. my @seqChars = split //, $seq->seq();
  1261. for( my $column=0; $column < @seqChars; $column++ ) {
  1262. my $char = uc($seqChars[$column]);
  1263. if (exists $countHashes[$column]->{$char}) {
  1264. $countHashes[$column]->{$char}++;
  1265. }
  1266. }
  1267. }
  1268. $total = 0;
  1269. $divisor = 0;
  1270. for(my $column =0; $column < $len; $column++) {
  1271. my %hash = %{$countHashes[$column]};
  1272. $subdivisor = 0;
  1273. foreach my $res (keys %hash) {
  1274. $total += $hash{$res}*($hash{$res} - 1);
  1275. $subdivisor += $hash{$res};
  1276. }
  1277. $divisor += $subdivisor * ($subdivisor - 1);
  1278. }
  1279. return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
  1280. }
  1281. =head2 percentage_identity
  1282. Title : percentage_identity
  1283. Usage : $id = $align->percentage_identity
  1284. Function: The function calculates the average percentage identity
  1285. (aliased for average_percentage_identity)
  1286. Returns : The average percentage identity
  1287. Args : None
  1288. =cut
  1289. sub percentage_identity {
  1290. my $self = shift;
  1291. return $self->average_percentage_identity();
  1292. }
  1293. =head2 overall_percentage_identity
  1294. Title : percentage_identity
  1295. Usage : $id = $align->percentage_identity
  1296. Function: The function calculates the percentage identity of
  1297. the conserved columns
  1298. Returns : The percentage identity of the conserved columns
  1299. Args : None
  1300. =cut
  1301. sub overall_percentage_identity{
  1302. my ($self,@args) = @_;
  1303. my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
  1304. 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
  1305. my ($len, $total, @seqs, @countHashes);
  1306. if (! $self->is_flush()) {
  1307. $self->throw("All sequences in the alignment must be the same length");
  1308. }
  1309. @seqs = $self->each_seq();
  1310. $len = $self->length();
  1311. # load the each hash with correct keys for existence checks
  1312. for( my $index=0; $index < $len; $index++) {
  1313. foreach my $letter (@alphabet) {
  1314. $countHashes[$index]->{$letter} = 0;
  1315. }
  1316. }
  1317. foreach my $seq (@seqs) {
  1318. my @seqChars = split //, $seq->seq();
  1319. for( my $column=0; $column < @seqChars; $column++ ) {
  1320. my $char = uc($seqChars[$column]);
  1321. if (exists $countHashes[$column]->{$char}) {
  1322. $countHashes[$column]->{$char}++;
  1323. }
  1324. }
  1325. }
  1326. $total = 0;
  1327. for(my $column =0; $column < $len; $column++) {
  1328. my %hash = %{$countHashes[$column]};
  1329. foreach ( values %hash ) {
  1330. next if( $_ == 0 );
  1331. $total++ if( $_ == scalar @seqs );
  1332. last;
  1333. }
  1334. }
  1335. return ($total / $len ) * 100.0;
  1336. }
  1337. =head1 Alignment positions
  1338. Methods to map a sequence position into an alignment column and back.
  1339. column_from_residue_number() does the former. The latter is really a
  1340. property of the sequence object and can done using
  1341. L<Bio::LocatableSeq::location_from_column>:
  1342. # select somehow a sequence from the alignment, e.g.
  1343. my $seq = $aln->get_seq_by_pos(1);
  1344. #$loc is undef or Bio::LocationI object
  1345. my $loc = $seq->location_from_column(5);
  1346. =head2 column_from_residue_number
  1347. Title : column_from_residue_number
  1348. Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
  1349. Function:
  1350. This function gives the position in the alignment
  1351. (i.e. column number) of the given residue number in the
  1352. sequence with the given name. For example, for the
  1353. alignment
  1354. Seq1/91-97 AC..DEF.GH
  1355. Seq2/24-30 ACGG.RTY..
  1356. Seq3/43-51 AC.DDEFGHI
  1357. column_from_residue_number( "Seq1", 94 ) returns 5.
  1358. column_from_residue_number( "Seq2", 25 ) returns 2.
  1359. column_from_residue_number( "Seq3", 50 ) returns 9.
  1360. An exception is thrown if the residue number would lie
  1361. outside the length of the aligment
  1362. (e.g. column_from_residue_number( "Seq2", 22 )
  1363. Note: If the the parent sequence is represented by more than
  1364. one alignment sequence and the residue number is present in
  1365. them, this method finds only the first one.
  1366. Returns : A column number for the position in the alignment of the
  1367. given residue in the given sequence (1 = first column)
  1368. Args : A sequence id/name (not a name/start-end)
  1369. A residue number in the whole sequence (not just that
  1370. segment of it in the alignment)
  1371. =cut
  1372. sub column_from_residue_number {
  1373. my ($self, $name, $resnumber) = @_;
  1374. $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
  1375. $self->throw("Second argument residue number missing") unless $resnumber;
  1376. foreach my $seq ($self->each_seq_with_id($name)) {
  1377. my $col;
  1378. eval {
  1379. $col = $seq->column_from_residue_number($resnumber);
  1380. };
  1381. next if $@;
  1382. return $col;
  1383. }
  1384. $self->throw("Could not find a sequence segment in $name ".
  1385. "containing residue number $resnumber");
  1386. }
  1387. =head1 Sequence names
  1388. Methods to manipulate the display name. The default name based on the
  1389. sequence id and subsequence positions can be overridden in various
  1390. ways.
  1391. =head2 displayname
  1392. Title : displayname
  1393. Usage : $myalign->displayname("Ig", "IgA")
  1394. Function : Gets/sets the display name of a sequence in the alignment
  1395. :
  1396. Returns : A display name string
  1397. Argument : name of the sequence
  1398. displayname of the sequence (optional)
  1399. =cut
  1400. sub get_displayname {
  1401. my $self = shift;
  1402. $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead.");
  1403. $self->displayname(@_);
  1404. }
  1405. sub set_displayname {
  1406. my $self = shift;
  1407. $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead.");
  1408. $self->displayname(@_);
  1409. }
  1410. sub displayname {
  1411. my ($self, $name, $disname) = @_;
  1412. $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name};
  1413. if( $disname and $name) {
  1414. $self->{'_dis_name'}->{$name} = $disname;
  1415. return $disname;
  1416. }
  1417. elsif( defined $self->{'_dis_name'}->{$name} ) {
  1418. return $self->{'_dis_name'}->{$name};
  1419. } else {
  1420. return $name;
  1421. }
  1422. }
  1423. =head2 set_displayname_count
  1424. Title : set_displayname_count
  1425. Usage : $ali->set_displayname_count
  1426. Function :
  1427. Sets the names to be name_# where # is the number of
  1428. times this name has been used.
  1429. Returns :
  1430. Argument :
  1431. =cut
  1432. sub set_displayname_count {
  1433. my $self= shift;
  1434. my (@arr,$name,$seq,$count,$temp,$nse);
  1435. foreach $seq ( $self->each_alphabetically() ) {
  1436. $nse = $seq->get_nse();
  1437. #name will be set when this is the second
  1438. #time (or greater) is has been seen
  1439. if( defined $name and $name eq ($seq->id()) ) {
  1440. $temp = sprintf("%s_%s",$name,$count);
  1441. $self->displayname($nse,$temp);
  1442. $count++;
  1443. } else {
  1444. $count = 1;
  1445. $name = $seq->id();
  1446. $temp = sprintf("%s_%s",$name,$count);
  1447. $self->displayname($nse,$temp);
  1448. $count++;
  1449. }
  1450. }
  1451. return 1;
  1452. }
  1453. =head2 set_displayname_flat
  1454. Title : set_displayname_flat
  1455. Usage : $ali->set_displayname_flat()
  1456. Function : Makes all the sequences be displayed as just their name,
  1457. not name/start-end
  1458. Returns : 1
  1459. Argument :
  1460. =cut
  1461. sub set_displayname_flat {
  1462. my $self = shift;
  1463. my ($nse,$seq);
  1464. foreach $seq ( $self->each_seq() ) {
  1465. $nse = $seq->get_nse();
  1466. $self->displayname($nse,$seq->id());
  1467. }
  1468. return 1;
  1469. }
  1470. =head2 set_displayname_normal
  1471. Title : set_displayname_normal
  1472. Usage : $ali->set_displayname_normal()
  1473. Function : Makes all the sequences be displayed as name/start-end
  1474. Returns :
  1475. Argument :
  1476. =cut
  1477. sub set_displayname_normal {
  1478. my $self = shift;
  1479. my ($nse,$seq);
  1480. foreach $seq ( $self->each_seq() ) {
  1481. $nse = $seq->get_nse();
  1482. $self->displayname($nse,$nse);
  1483. }
  1484. return 1;
  1485. }
  1486. =head2 source
  1487. Title : source
  1488. Usage : $obj->source($newval)
  1489. Function: sets the Alignment source program
  1490. Example :
  1491. Returns : value of source
  1492. Args : newvalue (optional)
  1493. =cut
  1494. sub source{
  1495. my ($self,$value) = @_;
  1496. if( defined $value) {
  1497. $self->{'_source'} = $value;
  1498. }
  1499. return $self->{'_source'};
  1500. }
  1501. 1;