PageRenderTime 66ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 1ms

/Bio/SimpleAlign.pm

https://bitbucket.org/antismash/clusean
Perl | 3213 lines | 2534 code | 545 blank | 134 comment | 294 complexity | 4e6c8816b00e40960b17aaf4d793ee81 MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  1. # BioPerl module for SimpleAlign
  2. #
  3. # Please direct questions and support issues to <bioperl-l@bioperl.org>
  4. #
  5. # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
  6. #
  7. # Copyright Ewan Birney
  8. #
  9. # You may distribute this module under the same terms as perl itself
  10. # POD documentation - main docs before the code
  11. #
  12. # History:
  13. # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
  14. # May 2001 major rewrite - Heikki Lehvaslaiho
  15. =head1 NAME
  16. Bio::SimpleAlign - Multiple alignments held as a set of sequences
  17. =head1 SYNOPSIS
  18. # Use Bio::AlignIO to read in the alignment
  19. $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  20. $aln = $str->next_aln();
  21. # Describe
  22. print $aln->length;
  23. print $aln->num_residues;
  24. print $aln->is_flush;
  25. print $aln->num_sequences;
  26. print $aln->score;
  27. print $aln->percentage_identity;
  28. print $aln->consensus_string(50);
  29. # Find the position in the alignment for a sequence location
  30. $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  31. # Extract sequences and check values for the alignment column $pos
  32. foreach $seq ($aln->each_seq) {
  33. $res = $seq->subseq($pos, $pos);
  34. $count{$res}++;
  35. }
  36. foreach $res (keys %count) {
  37. printf "Res: %s Count: %2d\n", $res, $count{$res};
  38. }
  39. # Manipulate
  40. $aln->remove_seq($seq);
  41. $mini_aln = $aln->slice(20,30); # get a block of columns
  42. $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  43. $new_aln = $aln->remove_columns([20,30]); # remove by position
  44. $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  45. # Analyze
  46. $str = $aln->consensus_string($threshold_percent);
  47. $str = $aln->match_line();
  48. $str = $aln->cigar_line();
  49. $id = $aln->percentage_identity;
  50. # See the module documentation for details and more methods.
  51. =head1 DESCRIPTION
  52. SimpleAlign is an object that handles a multiple sequence alignment
  53. (MSA). It is very permissive of types (it does not insist on sequences
  54. being all same length, for example). Think of it as a set of sequences
  55. with a whole series of built-in manipulations and methods for reading and
  56. writing alignments.
  57. SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  58. to store its sequences. These are subsequences with a start and end
  59. positions in the parent reference sequence. Each sequence in the
  60. SimpleAlign object is a Bio::LocatableSeq.
  61. SimpleAlign expects the combination of name, start, and end for a
  62. given sequence to be unique in the alignment, and this is the key for the
  63. internal hashes (name, start, end are abbreviated C<nse> in the code).
  64. However, in some cases people do not want the name/start-end to be displayed:
  65. either multiple names in an alignment or names specific to the alignment
  66. (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  67. C<displayname>, and generally is what is used to print out the
  68. alignment. They default to name/start-end.
  69. The SimpleAlign Module is derived from the Align module by Ewan Birney.
  70. =head1 FEEDBACK
  71. =head2 Mailing Lists
  72. User feedback is an integral part of the evolution of this and other
  73. Bioperl modules. Send your comments and suggestions preferably to one
  74. of the Bioperl mailing lists. Your participation is much appreciated.
  75. bioperl-l@bioperl.org - General discussion
  76. http://bioperl.org/wiki/Mailing_lists - About the mailing lists
  77. =head2 Support
  78. Please direct usage questions or support issues to the mailing list:
  79. I<bioperl-l@bioperl.org>
  80. rather than to the module maintainer directly. Many experienced and
  81. reponsive experts will be able look at the problem and quickly
  82. address it. Please include a thorough description of the problem
  83. with code and data examples if at all possible.
  84. =head2 Reporting Bugs
  85. Report bugs to the Bioperl bug tracking system to help us keep track
  86. the bugs and their resolution. Bug reports can be submitted via the
  87. web:
  88. http://bugzilla.open-bio.org/
  89. =head1 AUTHOR
  90. Ewan Birney, birney@ebi.ac.uk
  91. =head1 CONTRIBUTORS
  92. Allen Day, allenday-at-ucla.edu,
  93. Richard Adams, Richard.Adams-at-ed.ac.uk,
  94. David J. Evans, David.Evans-at-vir.gla.ac.uk,
  95. Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
  96. Allen Smith, allens-at-cpan.org,
  97. Jason Stajich, jason-at-bioperl.org,
  98. Anthony Underwood, aunderwood-at-phls.org.uk,
  99. Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
  100. Brian Osborne, bosborne at alum.mit.edu
  101. Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
  102. Hongyu Zhang, forward at hongyu.org
  103. Jay Hannah, jay at jays.net
  104. Alexandr Bezginov, albezg at gmail.com
  105. =head1 SEE ALSO
  106. L<Bio::LocatableSeq>
  107. =head1 APPENDIX
  108. The rest of the documentation details each of the object
  109. methods. Internal methods are usually preceded with a _
  110. =cut
  111. # 'Let the code begin...
  112. package Bio::SimpleAlign;
  113. use vars qw(%CONSERVATION_GROUPS);
  114. use strict;
  115. use Bio::LocatableSeq; # uses Seq's as list
  116. use Bio::Seq;
  117. use Bio::SeqFeature::Generic;
  118. BEGIN {
  119. # This data should probably be in a more centralized module...
  120. # it is taken from Clustalw documentation.
  121. # These are all the positively scoring groups that occur in the
  122. # Gonnet Pam250 matrix. The strong and weak groups are
  123. # defined as strong score >0.5 and weak score =<0.5 respectively.
  124. %CONSERVATION_GROUPS = (
  125. 'strong' => [ qw(
  126. STA
  127. NEQK
  128. NHQK
  129. NDEQ
  130. QHRK
  131. MILV
  132. MILF
  133. HY
  134. FYW )],
  135. 'weak' => [ qw(
  136. CSA
  137. ATV
  138. SAG
  139. STNK
  140. STPA
  141. SGND
  142. SNDEQK
  143. NDEQHK
  144. NEQHRK
  145. FVLIM
  146. HFY )],);
  147. }
  148. use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
  149. Bio::FeatureHolderI);
  150. =head2 new
  151. Title : new
  152. Usage : my $aln = Bio::SimpleAlign->new();
  153. Function : Creates a new simple align object
  154. Returns : Bio::SimpleAlign
  155. Args : -source => string representing the source program
  156. where this alignment came from
  157. -annotation => Bio::AnnotationCollectionI
  158. -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
  159. -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
  160. -consensus => consensus string
  161. -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
  162. =cut
  163. sub new {
  164. my($class,@args) = @_;
  165. my $self = $class->SUPER::new(@args);
  166. my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
  167. SOURCE
  168. SCORE
  169. ID
  170. ACCESSION
  171. DESCRIPTION
  172. SEQS
  173. FEATURES
  174. ANNOTATION
  175. SEQ_ANNOTATION
  176. CONSENSUS
  177. CONSENSUS_META
  178. )], @args);
  179. $src && $self->source($src);
  180. defined $score && $self->score($score);
  181. # we need to set up internal hashs first!
  182. $self->{'_seq'} = {};
  183. $self->{'_order'} = {};
  184. $self->{'_start_end_lists'} = {};
  185. $self->{'_dis_name'} = {};
  186. $self->{'_id'} = 'NoName';
  187. # maybe we should automatically read in from args. Hmmm...
  188. $id && $self->id($id);
  189. $acc && $self->accession($acc);
  190. $desc && $self->description($desc);
  191. $coll && $self->annotation($coll);
  192. # sequence annotation is layered into a provided annotation collection (or dies)
  193. if ($sa) {
  194. $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
  195. "with a sequence annotation collection")
  196. if !$coll;
  197. $coll->add_Annotation('seq_annotation', $sa);
  198. }
  199. if ($feats && ref $feats eq 'ARRAY') {
  200. for my $feat (@$feats) {
  201. $self->add_SeqFeature($feat);
  202. }
  203. }
  204. $con && $self->consensus($con);
  205. $cmeta && $self->consensus_meta($cmeta);
  206. # assumes these are in correct alignment order
  207. if ($seqs && ref($seqs) eq 'ARRAY') {
  208. for my $seq (@$seqs) {
  209. $self->add_seq($seq);
  210. }
  211. }
  212. return $self; # success - we hope!
  213. }
  214. =head1 Modifier methods
  215. These methods modify the MSA by adding, removing or shuffling complete
  216. sequences.
  217. =head2 add_seq
  218. Title : add_seq
  219. Usage : $myalign->add_seq($newseq);
  220. $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
  221. Function : Adds another sequence to the alignment. *Does not* align
  222. it - just adds it to the hashes.
  223. If -ORDER is specified, the sequence is inserted at the
  224. the position spec'd by -ORDER, and existing sequences
  225. are pushed down the storage array.
  226. Returns : nothing
  227. Args : A Bio::LocatableSeq object
  228. Positive integer for the sequence position (optional)
  229. See L<Bio::LocatableSeq> for more information
  230. =cut
  231. sub addSeq {
  232. my $self = shift;
  233. $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
  234. $self->add_seq(@_);
  235. }
  236. sub add_seq {
  237. my $self = shift;
  238. my @args = @_;
  239. my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
  240. my ($name,$id,$start,$end);
  241. unless ($seq) {
  242. $self->throw("LocatableSeq argument required");
  243. }
  244. if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
  245. $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
  246. }
  247. !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
  248. $order--; # jay's patch (user-specified order is 1-origin)
  249. if ($order < 0) {
  250. $self->throw("User-specified value for ORDER must be >= 1");
  251. }
  252. $id = $seq->id() ||$seq->display_id || $seq->primary_id;
  253. # build the symbol list for this sequence,
  254. # will prune out the gap and missing/match chars
  255. # when actually asked for the symbol list in the
  256. # symbol_chars
  257. # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
  258. $name = $seq->get_nse;
  259. if( $self->{'_seq'}->{$name} ) {
  260. $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
  261. }
  262. else {
  263. $self->debug( "Assigning $name to $order\n");
  264. my $ordh = $self->{'_order'};
  265. if ($ordh->{$order}) {
  266. # make space to insert
  267. # $c->() returns (in reverse order) the first subsequence
  268. # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
  269. # (3,2,1), and $c->(2,4,5) returns (2).
  270. my $c;
  271. $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
  272. map {
  273. $ordh->{$_+1} = $ordh->{$_}
  274. } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
  275. }
  276. $ordh->{$order} = $name;
  277. unless( exists( $self->{'_start_end_lists'}->{$id})) {
  278. $self->{'_start_end_lists'}->{$id} = [];
  279. }
  280. push @{$self->{'_start_end_lists'}->{$id}}, $seq;
  281. }
  282. $self->{'_seq'}->{$name} = $seq;
  283. }
  284. =head2 remove_seq
  285. Title : remove_seq
  286. Usage : $aln->remove_seq($seq);
  287. Function : Removes a single sequence from an alignment
  288. Returns :
  289. Argument : a Bio::LocatableSeq object
  290. =cut
  291. sub removeSeq {
  292. my $self = shift;
  293. $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
  294. $self->remove_seq(@_);
  295. }
  296. sub remove_seq {
  297. my $self = shift;
  298. my $seq = shift;
  299. my ($name,$id,$start,$end);
  300. $self->throw("Need Bio::Locatable seq argument ")
  301. unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
  302. $id = $seq->id();
  303. $start = $seq->start();
  304. $end = $seq->end();
  305. $name = sprintf("%s/%d-%d",$id,$start,$end);
  306. if( !exists $self->{'_seq'}->{$name} ) {
  307. $self->throw("Sequence $name does not exist in the alignment to remove!");
  308. }
  309. delete $self->{'_seq'}->{$name};
  310. # we need to remove this seq from the start_end_lists hash
  311. if (exists $self->{'_start_end_lists'}->{$id}) {
  312. # we need to find the sequence in the array.
  313. my ($i, $found);;
  314. for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
  315. if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
  316. $found = 1;
  317. last;
  318. }
  319. }
  320. if ($found) {
  321. splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
  322. }
  323. else {
  324. $self->throw("Could not find the sequence to remoce from the start-end list");
  325. }
  326. }
  327. else {
  328. $self->throw("There is no seq list for the name $id");
  329. }
  330. # we need to shift order hash
  331. my %rev_order = reverse %{$self->{'_order'}};
  332. my $no = $rev_order{$name};
  333. my $num_sequences = $self->num_sequences;
  334. for (; $no < $num_sequences; $no++) {
  335. $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
  336. }
  337. delete $self->{'_order'}->{$no};
  338. return 1;
  339. }
  340. =head2 purge
  341. Title : purge
  342. Usage : $aln->purge(0.7);
  343. Function: Removes sequences above given sequence similarity
  344. This function will grind on large alignments. Beware!
  345. Example :
  346. Returns : An array of the removed sequences
  347. Args : float, threshold for similarity
  348. =cut
  349. sub purge {
  350. my ($self,$perc) = @_;
  351. my (%duplicate, @dups);
  352. my @seqs = $self->each_seq();
  353. for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
  354. my $seq = $seqs[$i];
  355. #skip if already in duplicate hash
  356. next if exists $duplicate{$seq->display_id} ;
  357. my $one = $seq->seq();
  358. my @one = split '', $one; #split to get 1aa per array element
  359. for (my $j=$i+1;$j < @seqs;$j++) {
  360. my $seq2 = $seqs[$j];
  361. #skip if already in duplicate hash
  362. next if exists $duplicate{$seq2->display_id} ;
  363. my $two = $seq2->seq();
  364. my @two = split '', $two;
  365. my $count = 0;
  366. my $res = 0;
  367. for (my $k=0;$k<@one;$k++) {
  368. if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
  369. $one[$k] eq $two[$k]) {
  370. $count++;
  371. }
  372. if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
  373. $two[$k] ne '.' && $two[$k] ne '-' ) {
  374. $res++;
  375. }
  376. }
  377. my $ratio = 0;
  378. $ratio = $count/$res unless $res == 0;
  379. # if above threshold put in duplicate hash and push onto
  380. # duplicate array for returning to get_unique
  381. if ( $ratio > $perc ) {
  382. $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
  383. $duplicate{$seq2->display_id} = 1;
  384. push @dups, $seq2;
  385. }
  386. }
  387. }
  388. foreach my $seq (@dups) {
  389. $self->remove_seq($seq);
  390. }
  391. return @dups;
  392. }
  393. =head2 sort_alphabetically
  394. Title : sort_alphabetically
  395. Usage : $ali->sort_alphabetically
  396. Function : Changes the order of the alignment to alphabetical on name
  397. followed by numerical by number.
  398. Returns :
  399. Argument :
  400. =cut
  401. sub sort_alphabetically {
  402. my $self = shift;
  403. my ($seq,$nse,@arr,%hash,$count);
  404. foreach $seq ( $self->each_seq() ) {
  405. $nse = $seq->get_nse;
  406. $hash{$nse} = $seq;
  407. }
  408. $count = 0;
  409. %{$self->{'_order'}} = (); # reset the hash;
  410. foreach $nse ( sort _alpha_startend keys %hash) {
  411. $self->{'_order'}->{$count} = $nse;
  412. $count++;
  413. }
  414. 1;
  415. }
  416. =head2 sort_by_list
  417. Title : sort_by_list
  418. Usage : $aln_ordered=$aln->sort_by_list($list_file)
  419. Function : Arbitrarily order sequences in an alignment
  420. Returns : A new Bio::SimpleAlign object
  421. Argument : a file listing sequence names in intended order (one name per line)
  422. =cut
  423. sub sort_by_list {
  424. my ($self, $list) = @_;
  425. my (@seq, @ids, %order);
  426. foreach my $seq ( $self->each_seq() ) {
  427. push @seq, $seq;
  428. push @ids, $seq->display_id;
  429. }
  430. my $ct=1;
  431. open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
  432. while (<$listfh>) {
  433. chomp;
  434. my $name=$_;
  435. $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
  436. $order{$name}=$ct++;
  437. }
  438. close($listfh);
  439. # use the map-sort-map idiom:
  440. my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
  441. my $aln = $self->new;
  442. foreach (@sorted) { $aln->add_seq($_) }
  443. return $aln;
  444. }
  445. =head2 set_new_reference
  446. Title : set_new_reference
  447. Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
  448. the sequence whoes name is "B31" (full, exact, and case-sensitive),
  449. as the reference (1st) sequence
  450. Function : Change/Set a new reference (i.e., the first) sequence
  451. Returns : a new Bio::SimpleAlign object.
  452. Throws an exception if designated sequence not found
  453. Argument : a positive integer of sequence order, or a sequence name
  454. in the original alignment
  455. =cut
  456. sub set_new_reference {
  457. my ($self, $seqid) = @_;
  458. my $aln = $self->new;
  459. my (@seq, @ids, @new_seq);
  460. my $is_num=0;
  461. foreach my $seq ( $self->each_seq() ) {
  462. push @seq, $seq;
  463. push @ids, $seq->display_id;
  464. }
  465. if ($seqid =~ /^\d+$/) { # argument is seq position
  466. $is_num=1;
  467. $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
  468. } else { # argument is a seq name
  469. $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
  470. }
  471. for (my $i=0; $i<=$#seq; $i++) {
  472. my $pos=$i+1;
  473. if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
  474. unshift @new_seq, $seq[$i];
  475. } else {
  476. push @new_seq, $seq[$i];
  477. }
  478. }
  479. foreach (@new_seq) { $aln->add_seq($_); }
  480. return $aln;
  481. }
  482. sub _in_aln { # check if input name exists in the alignment
  483. my ($str, $ref) = @_;
  484. foreach (@$ref) {
  485. return 1 if $str eq $_;
  486. }
  487. return 0;
  488. }
  489. =head2 uniq_seq
  490. Title : uniq_seq
  491. Usage : $aln->uniq_seq(): Remove identical sequences in
  492. in the alignment. Ambiguous base ("N", "n") and
  493. leading and ending gaps ("-") are NOT counted as
  494. differences.
  495. Function : Make a new alignment of unique sequence types (STs)
  496. Returns : 1a. if called in a scalar context,
  497. a new Bio::SimpleAlign object (all sequences renamed as "ST")
  498. 1b. if called in an array context,
  499. a new Bio::SimpleAlign object, and a hashref whose keys
  500. are sequence types, and whose values are arrayrefs to
  501. lists of sequence ids within the corresponding sequence type
  502. 2. if $aln->verbose > 0, ST of each sequence is sent to
  503. STDERR (in a tabular format)
  504. Argument : None
  505. =cut
  506. sub uniq_seq {
  507. my ($self, $seqid) = @_;
  508. my $aln = $self->new;
  509. my (%member, %order, @seq, @uniq_str, $st);
  510. my $order=0;
  511. my $len = $self->length();
  512. $st = {};
  513. foreach my $seq ( $self->each_seq() ) {
  514. my $str = $seq->seq();
  515. # it's necessary to ignore "n", "N", leading gaps and ending gaps in
  516. # comparing two sequence strings
  517. # 1st, convert "n", "N" to "?" (for DNA sequence only):
  518. $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
  519. # 2nd, convert leading and ending gaps to "?":
  520. $str = &_convert_leading_ending_gaps($str, '-', '?');
  521. # Note that '?' also can mean unknown residue.
  522. # I don't like making global class member changes like this, too
  523. # prone to errors... -- cjfields 08-11-18
  524. local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
  525. my $new = Bio::LocatableSeq->new(
  526. -id => $seq->id(),
  527. -alphabet=> $seq->alphabet,
  528. -seq => $str,
  529. -start => $seq->start,
  530. -end => $seq->end
  531. );
  532. push @seq, $new;
  533. }
  534. foreach my $seq (@seq) {
  535. my $str = $seq->seq();
  536. my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
  537. if ($seen) { # seen before
  538. my @memb = @{$member{$key}};
  539. push @memb, $seq;
  540. $member{$key} = \@memb;
  541. } else { # not seen
  542. push @uniq_str, $key;
  543. $order++;
  544. $member{$key} = [ ($seq) ];
  545. $order{$key} = $order;
  546. }
  547. }
  548. foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
  549. # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
  550. my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
  551. # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
  552. $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
  553. my $gap='-';
  554. my $end= CORE::length($str2);
  555. $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
  556. my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
  557. -seq =>$str2,
  558. -start=>1,
  559. -end =>$end
  560. );
  561. $aln->add_seq($new);
  562. foreach (@{$member{$str}}) {
  563. push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
  564. $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
  565. }
  566. }
  567. return wantarray ? ($aln, $st) : $aln;
  568. }
  569. sub _check_uniq { # check if same seq exists in the alignment
  570. my ($str1, $ref, $length) = @_;
  571. my @char1=split //, $str1;
  572. my @array=@$ref;
  573. return (0, $str1) if @array==0; # not seen (1st sequence)
  574. foreach my $str2 (@array) {
  575. my $diff=0;
  576. my @char2=split //, $str2;
  577. for (my $i=0; $i<=$length-1; $i++) {
  578. next if $char1[$i] eq '?';
  579. next if $char2[$i] eq '?';
  580. $diff++ if $char1[$i] ne $char2[$i];
  581. }
  582. return (1, $str2) if $diff == 0; # seen before
  583. }
  584. return (0, $str1); # not seen
  585. }
  586. sub _convert_leading_ending_gaps {
  587. my $s=shift;
  588. my $sym1=shift;
  589. my $sym2=shift;
  590. my @array=split //, $s;
  591. # convert leading char:
  592. for (my $i=0; $i<=$#array; $i++) {
  593. ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
  594. }
  595. # convert ending char:
  596. for (my $i = $#array; $i>= 0; $i--) {
  597. ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
  598. }
  599. my $s_new=join '', @array;
  600. return $s_new;
  601. }
  602. =head1 Sequence selection methods
  603. Methods returning one or more sequences objects.
  604. =head2 each_seq
  605. Title : each_seq
  606. Usage : foreach $seq ( $align->each_seq() )
  607. Function : Gets a Seq object from the alignment
  608. Returns : Seq object
  609. Argument :
  610. =cut
  611. sub eachSeq {
  612. my $self = shift;
  613. $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
  614. $self->each_seq();
  615. }
  616. sub each_seq {
  617. my $self = shift;
  618. my (@arr,$order);
  619. foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
  620. if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
  621. push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
  622. }
  623. }
  624. return @arr;
  625. }
  626. =head2 each_alphabetically
  627. Title : each_alphabetically
  628. Usage : foreach $seq ( $ali->each_alphabetically() )
  629. Function : Returns a sequence object, but the objects are returned
  630. in alphabetically sorted order.
  631. Does not change the order of the alignment.
  632. Returns : Seq object
  633. Argument :
  634. =cut
  635. sub each_alphabetically {
  636. my $self = shift;
  637. my ($seq,$nse,@arr,%hash,$count);
  638. foreach $seq ( $self->each_seq() ) {
  639. $nse = $seq->get_nse;
  640. $hash{$nse} = $seq;
  641. }
  642. foreach $nse ( sort _alpha_startend keys %hash) {
  643. push(@arr,$hash{$nse});
  644. }
  645. return @arr;
  646. }
  647. sub _alpha_startend {
  648. my ($aname,$astart,$bname,$bstart);
  649. ($aname,$astart) = split (/-/,$a);
  650. ($bname,$bstart) = split (/-/,$b);
  651. if( $aname eq $bname ) {
  652. return $astart <=> $bstart;
  653. }
  654. else {
  655. return $aname cmp $bname;
  656. }
  657. }
  658. =head2 each_seq_with_id
  659. Title : each_seq_with_id
  660. Usage : foreach $seq ( $align->each_seq_with_id() )
  661. Function : Gets a Seq objects from the alignment, the contents
  662. being those sequences with the given name (there may be
  663. more than one)
  664. Returns : Seq object
  665. Argument : a seq name
  666. =cut
  667. sub eachSeqWithId {
  668. my $self = shift;
  669. $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
  670. $self->each_seq_with_id(@_);
  671. }
  672. sub each_seq_with_id {
  673. my $self = shift;
  674. my $id = shift;
  675. $self->throw("Method each_seq_with_id needs a sequence name argument")
  676. unless defined $id;
  677. my (@arr, $seq);
  678. if (exists($self->{'_start_end_lists'}->{$id})) {
  679. @arr = @{$self->{'_start_end_lists'}->{$id}};
  680. }
  681. return @arr;
  682. }
  683. =head2 get_seq_by_pos
  684. Title : get_seq_by_pos
  685. Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
  686. Function : Gets a sequence based on its position in the alignment.
  687. Numbering starts from 1. Sequence positions larger than
  688. num_sequences() will thow an error.
  689. Returns : a Bio::LocatableSeq object
  690. Args : positive integer for the sequence position
  691. =cut
  692. sub get_seq_by_pos {
  693. my $self = shift;
  694. my ($pos) = @_;
  695. $self->throw("Sequence position has to be a positive integer, not [$pos]")
  696. unless $pos =~ /^\d+$/ and $pos > 0;
  697. $self->throw("No sequence at position [$pos]")
  698. unless $pos <= $self->num_sequences ;
  699. my $nse = $self->{'_order'}->{--$pos};
  700. return $self->{'_seq'}->{$nse};
  701. }
  702. =head2 get_seq_by_id
  703. Title : get_seq_by_id
  704. Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
  705. Function : Gets a sequence based on its name.
  706. Sequences that do not exist will warn and return undef
  707. Returns : a Bio::LocatableSeq object
  708. Args : string for sequence name
  709. =cut
  710. sub get_seq_by_id {
  711. my ($self,$name) = @_;
  712. unless( defined $name ) {
  713. $self->warn("Must provide a sequence name");
  714. return;
  715. }
  716. for my $seq ( values %{$self->{'_seq'}} ) {
  717. if ( $seq->id eq $name) {
  718. return $seq;
  719. }
  720. }
  721. return;
  722. }
  723. =head2 seq_with_features
  724. Title : seq_with_features
  725. Usage : $seq = $aln->seq_with_features(-pos => 1,
  726. -consensus => 60
  727. -mask =>
  728. sub { my $consensus = shift;
  729. for my $i (1..5){
  730. my $n = 'N' x $i;
  731. my $q = '\?' x $i;
  732. while($consensus =~ /[^?]$q[^?]/){
  733. $consensus =~ s/([^?])$q([^?])/$1$n$2/;
  734. }
  735. }
  736. return $consensus;
  737. }
  738. );
  739. Function: produces a Bio::Seq object by first splicing gaps from -pos
  740. (by means of a splice_by_seq_pos() call), then creating
  741. features using non-? chars (by means of a consensus_string()
  742. call with stringency -consensus).
  743. Returns : a Bio::Seq object
  744. Args : -pos : required. sequence from which to build the Bio::Seq
  745. object
  746. -consensus : optional, defaults to consensus_string()'s
  747. default cutoff value
  748. -mask : optional, a coderef to apply to consensus_string()'s
  749. output before building features. this may be useful for
  750. closing gaps of 1 bp by masking over them with N, for
  751. instance
  752. =cut
  753. sub seq_with_features{
  754. my ($self,%arg) = @_;
  755. #first do the preparatory splice
  756. $self->throw("must provide a -pos argument") unless $arg{-pos};
  757. $self->splice_by_seq_pos($arg{-pos});
  758. my $consensus_string = $self->consensus_string($arg{-consensus});
  759. $consensus_string = $arg{-mask}->($consensus_string)
  760. if defined($arg{-mask});
  761. my(@bs,@es);
  762. push @bs, 1 if $consensus_string =~ /^[^?]/;
  763. while($consensus_string =~ /\?[^?]/g){
  764. push @bs, pos($consensus_string);
  765. }
  766. while($consensus_string =~ /[^?]\?/g){
  767. push @es, pos($consensus_string);
  768. }
  769. push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
  770. my $seq = Bio::Seq->new();
  771. # my $rootfeature = Bio::SeqFeature::Generic->new(
  772. # -source_tag => 'location',
  773. # -start => $self->get_seq_by_pos($arg{-pos})->start,
  774. # -end => $self->get_seq_by_pos($arg{-pos})->end,
  775. # );
  776. # $seq->add_SeqFeature($rootfeature);
  777. while(my $b = shift @bs){
  778. my $e = shift @es;
  779. $seq->add_SeqFeature(
  780. Bio::SeqFeature::Generic->new(
  781. -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
  782. -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
  783. -source_tag => $self->source || 'MSA',
  784. )
  785. );
  786. }
  787. return $seq;
  788. }
  789. =head1 Create new alignments
  790. The result of these methods are horizontal or vertical subsets of the
  791. current MSA.
  792. =head2 select
  793. Title : select
  794. Usage : $aln2 = $aln->select(1, 3) # three first sequences
  795. Function : Creates a new alignment from a continuous subset of
  796. sequences. Numbering starts from 1. Sequence positions
  797. larger than num_sequences() will thow an error.
  798. Returns : a Bio::SimpleAlign object
  799. Args : positive integer for the first sequence
  800. positive integer for the last sequence to include (optional)
  801. =cut
  802. sub select {
  803. my $self = shift;
  804. my ($start, $end) = @_;
  805. $self->throw("Select start has to be a positive integer, not [$start]")
  806. unless $start =~ /^\d+$/ and $start > 0;
  807. $self->throw("Select end has to be a positive integer, not [$end]")
  808. unless $end =~ /^\d+$/ and $end > 0;
  809. $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
  810. unless $start <= $end;
  811. my $aln = $self->new;
  812. foreach my $pos ($start .. $end) {
  813. $aln->add_seq($self->get_seq_by_pos($pos));
  814. }
  815. $aln->id($self->id);
  816. # fix for meta, sf, ann
  817. return $aln;
  818. }
  819. =head2 select_noncont
  820. Title : select_noncont
  821. Usage : # 1st and 3rd sequences, sorted
  822. $aln2 = $aln->select_noncont(1, 3)
  823. # 1st and 3rd sequences, sorted (same as first)
  824. $aln2 = $aln->select_noncont(3, 1)
  825. # 1st and 3rd sequences, unsorted
  826. $aln2 = $aln->select_noncont('nosort',3, 1)
  827. Function : Creates a new alignment from a subset of sequences. Numbering
  828. starts from 1. Sequence positions larger than num_sequences() will
  829. throw an error. Sorts the order added to new alignment by default,
  830. to prevent sorting pass 'nosort' as the first argument in the list.
  831. Returns : a Bio::SimpleAlign object
  832. Args : array of integers for the sequences. If the string 'nosort' is
  833. passed as the first argument, the sequences will not be sorted
  834. in the new alignment but will appear in the order listed.
  835. =cut
  836. sub select_noncont {
  837. my $self = shift;
  838. my $nosort = 0;
  839. my (@pos) = @_;
  840. if ($pos[0] !~ m{^\d+$}) {
  841. my $sortcmd = shift @pos;
  842. if ($sortcmd eq 'nosort') {
  843. $nosort = 1;
  844. } else {
  845. $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
  846. }
  847. }
  848. my $end = $self->num_sequences;
  849. foreach ( @pos ) {
  850. $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
  851. unless( /^\d+$/ && $_ > 0 && $_ <= $end );
  852. }
  853. @pos = sort {$a <=> $b} @pos unless $nosort;
  854. my $aln = $self->new;
  855. foreach my $p (@pos) {
  856. $aln->add_seq($self->get_seq_by_pos($p));
  857. }
  858. $aln->id($self->id);
  859. # fix for meta, sf, ann
  860. return $aln;
  861. }
  862. =head2 slice
  863. Title : slice
  864. Usage : $aln2 = $aln->slice(20,30)
  865. Function : Creates a slice from the alignment inclusive of start and
  866. end columns, and the first column in the alignment is denoted 1.
  867. Sequences with no residues in the slice are excluded from the
  868. new alignment and a warning is printed. Slice beyond the length of
  869. the sequence does not do padding.
  870. Returns : A Bio::SimpleAlign object
  871. Args : Positive integer for start column, positive integer for end column,
  872. optional boolean which if true will keep gap-only columns in the newly
  873. created slice. Example:
  874. $aln2 = $aln->slice(20,30,1)
  875. =cut
  876. sub slice {
  877. my $self = shift;
  878. my ($start, $end, $keep_gap_only) = @_;
  879. $self->throw("Slice start has to be a positive integer, not [$start]")
  880. unless $start =~ /^\d+$/ and $start > 0;
  881. $self->throw("Slice end has to be a positive integer, not [$end]")
  882. unless $end =~ /^\d+$/ and $end > 0;
  883. $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
  884. unless $start <= $end;
  885. $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
  886. "[$start] is too big.") if $start > $self->length;
  887. my $cons_meta = $self->consensus_meta;
  888. my $aln = $self->new;
  889. $aln->id($self->id);
  890. foreach my $seq ( $self->each_seq() ) {
  891. my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
  892. Bio::Seq::Meta->new
  893. (-id => $seq->id,
  894. -alphabet => $seq->alphabet,
  895. -strand => $seq->strand,
  896. -verbose => $self->verbose) :
  897. Bio::LocatableSeq->new
  898. (-id => $seq->id,
  899. -alphabet => $seq->alphabet,
  900. -strand => $seq->strand,
  901. -verbose => $self->verbose);
  902. # seq
  903. my $seq_end = $end;
  904. $seq_end = $seq->length if( $end > $seq->length );
  905. my $slice_seq = $seq->subseq($start, $seq_end);
  906. $new_seq->seq( $slice_seq );
  907. $slice_seq =~ s/\W//g;
  908. if ($start > 1) {
  909. my $pre_start_seq = $seq->subseq(1, $start - 1);
  910. $pre_start_seq =~ s/\W//g;
  911. if (!defined($seq->strand)) {
  912. $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
  913. } elsif ($seq->strand < 0){
  914. $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
  915. } else {
  916. $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
  917. }
  918. } else {
  919. if ((defined $seq->strand)&&($seq->strand < 0)){
  920. $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
  921. } else {
  922. $new_seq->start( $seq->start);
  923. }
  924. }
  925. if ($new_seq->isa('Bio::Seq::MetaI')) {
  926. for my $meta_name ($seq->meta_names) {
  927. $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
  928. }
  929. }
  930. $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
  931. if ($new_seq->start and $new_seq->end >= $new_seq->start) {
  932. $aln->add_seq($new_seq);
  933. } else {
  934. if( $keep_gap_only ) {
  935. $aln->add_seq($new_seq);
  936. } else {
  937. my $nse = $seq->get_nse();
  938. $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
  939. " Sequence excluded from the new alignment.");
  940. }
  941. }
  942. }
  943. if ($cons_meta) {
  944. my $new = Bio::Seq::Meta->new();
  945. for my $meta_name ($cons_meta->meta_names) {
  946. $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
  947. }
  948. $aln->consensus_meta($new);
  949. }
  950. $aln->annotation($self->annotation);
  951. # fix for meta, sf, ann
  952. return $aln;
  953. }
  954. =head2 remove_columns
  955. Title : remove_columns
  956. Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
  957. $aln2 = $aln->remove_columns([0,0],[6,8])
  958. Function : Creates an aligment with columns removed corresponding to
  959. the specified type or by specifying the columns by number.
  960. Returns : Bio::SimpleAlign object
  961. Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
  962. 'all_gaps_columns') or array ref where the referenced array
  963. contains a pair of integers that specify a range.
  964. The first column is 0
  965. =cut
  966. sub remove_columns {
  967. my ($self,@args) = @_;
  968. @args || $self->throw("Must supply column ranges or column types");
  969. my $aln;
  970. if ($args[0][0] =~ /^[a-z_]+$/i) {
  971. $aln = $self->_remove_columns_by_type($args[0]);
  972. } elsif ($args[0][0] =~ /^\d+$/) {
  973. $aln = $self->_remove_columns_by_num(\@args);
  974. } else {
  975. $self->throw("You must pass array references to remove_columns(), not @args");
  976. }
  977. # fix for meta, sf, ann
  978. $aln;
  979. }
  980. =head2 remove_gaps
  981. Title : remove_gaps
  982. Usage : $aln2 = $aln->remove_gaps
  983. Function : Creates an aligment with gaps removed
  984. Returns : a Bio::SimpleAlign object
  985. Args : a gap character(optional) if none specified taken
  986. from $self->gap_char,
  987. [optional] $all_gaps_columns flag (1 or 0, default is 0)
  988. indicates that only all-gaps columns should be deleted
  989. Used from method L<remove_columns> in most cases. Set gap character
  990. using L<gap_char()|gap_char>.
  991. =cut
  992. sub remove_gaps {
  993. my ($self,$gapchar,$all_gaps_columns) = @_;
  994. my $gap_line;
  995. if ($all_gaps_columns) {
  996. $gap_line = $self->all_gap_line($gapchar);
  997. } else {
  998. $gap_line = $self->gap_line($gapchar);
  999. }
  1000. my $aln = $self->new;
  1001. my @remove;
  1002. my $length = 0;
  1003. my $del_char = $gapchar || $self->gap_char;
  1004. # Do the matching to get the segments to remove
  1005. while ($gap_line =~ m/[$del_char]/g) {
  1006. my $start = pos($gap_line)-1;
  1007. $gap_line=~/\G[$del_char]+/gc;
  1008. my $end = pos($gap_line)-1;
  1009. #have to offset the start and end for subsequent removes
  1010. $start-=$length;
  1011. $end -=$length;
  1012. $length += ($end-$start+1);
  1013. push @remove, [$start,$end];
  1014. }
  1015. #remove the segments
  1016. $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
  1017. # fix for meta, sf, ann
  1018. return $aln;
  1019. }
  1020. sub _remove_col {
  1021. my ($self,$aln,$remove) = @_;
  1022. my @new;
  1023. my $gap = $self->gap_char;
  1024. # splice out the segments and create new seq
  1025. foreach my $seq($self->each_seq){
  1026. my $new_seq = Bio::LocatableSeq->new(
  1027. -id => $seq->id,
  1028. -alphabet=> $seq->alphabet,
  1029. -strand => $seq->strand,
  1030. -verbose => $self->verbose);
  1031. my $sequence = $seq->seq;
  1032. foreach my $pair(@{$remove}){
  1033. my $start = $pair->[0];
  1034. my $end = $pair->[1];
  1035. $sequence = $seq->seq unless $sequence;
  1036. my $orig = $sequence;
  1037. my $head = $start > 0 ? substr($sequence, 0, $start) : '';
  1038. my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
  1039. $sequence = $head.$tail;
  1040. # start
  1041. unless (defined $new_seq->start) {
  1042. if ($start == 0) {
  1043. my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
  1044. $new_seq->start($seq->start + $end + 1 - $start_adjust);
  1045. }
  1046. else {
  1047. my $start_adjust = $orig =~ /^$gap+/;
  1048. if ($start_adjust) {
  1049. $start_adjust = $+[0] == $start;
  1050. }
  1051. $new_seq->start($seq->start + $start_adjust);
  1052. }
  1053. }
  1054. # end
  1055. if (($end + 1) >= CORE::length($orig)) {
  1056. my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
  1057. $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
  1058. }
  1059. else {
  1060. $new_seq->end($seq->end);
  1061. }
  1062. }
  1063. if ($new_seq->end < $new_seq->start) {
  1064. # we removed all columns except for gaps: set to 0 to indicate no
  1065. # sequence
  1066. $new_seq->start(0);
  1067. $new_seq->end(0);
  1068. }
  1069. $new_seq->seq($sequence) if $sequence;
  1070. push @new, $new_seq;
  1071. }
  1072. # add the new seqs to the alignment
  1073. foreach my $new(@new){
  1074. $aln->add_seq($new);
  1075. }
  1076. # fix for meta, sf, ann
  1077. return $aln;
  1078. }
  1079. sub _remove_columns_by_type {
  1080. my ($self,$type) = @_;
  1081. my $aln = $self->new;
  1082. my @remove;
  1083. my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
  1084. my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
  1085. my %matchchars = ( 'match' => '\*',
  1086. 'weak' => '\.',
  1087. 'strong' => ':',
  1088. 'mismatch' => ' ',
  1089. 'gaps' => '',
  1090. 'all_gaps_columns' => ''
  1091. );
  1092. # get the characters to delete against
  1093. my $del_char;
  1094. foreach my $type (@{$type}){
  1095. $del_char.= $matchchars{$type};
  1096. }
  1097. my $length = 0;
  1098. my $match_line = $self->match_line;
  1099. # do the matching to get the segments to remove
  1100. if($del_char){
  1101. while($match_line =~ m/[$del_char]/g ){
  1102. my $start = pos($match_line)-1;
  1103. $match_line=~/\G[$del_char]+/gc;
  1104. my $end = pos($match_line)-1;
  1105. #have to offset the start and end for subsequent removes
  1106. $start-=$length;
  1107. $end -=$length;
  1108. $length += ($end-$start+1);
  1109. push @remove, [$start,$end];
  1110. }
  1111. }
  1112. # remove the segments
  1113. $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
  1114. $aln = $aln->remove_gaps() if $gap;
  1115. $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
  1116. # fix for meta, sf, ann
  1117. $aln;
  1118. }
  1119. sub _remove_columns_by_num {
  1120. my ($self,$positions) = @_;
  1121. my $aln = $self->new;
  1122. # sort the positions
  1123. @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
  1124. my @remove;
  1125. my $length = 0;
  1126. foreach my $pos (@{$positions}) {
  1127. my ($start, $end) = @{$pos};
  1128. #have to offset the start and end for subsequent removes
  1129. $start-=$length;
  1130. $end -=$length;
  1131. $length += ($end-$start+1);
  1132. push @remove, [$start,$end];
  1133. }
  1134. #remove the segments
  1135. $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
  1136. # fix for meta, sf, ann
  1137. $aln;
  1138. }
  1139. =head1 Change sequences within the MSA
  1140. These methods affect characters in all sequences without changing the
  1141. alignment.
  1142. =head2 splice_by_seq_pos
  1143. Title : splice_by_seq_pos
  1144. Usage : $status = splice_by_seq_pos(1);
  1145. Function: splices all aligned sequences where the specified sequence
  1146. has gaps.
  1147. Example :
  1148. Returns : 1 on success
  1149. Args : position of sequence to splice by
  1150. =cut
  1151. sub splice_by_seq_pos{
  1152. my ($self,$pos) = @_;
  1153. my $guide = $self->get_seq_by_pos($pos);
  1154. my $guide_seq = $guide->seq;
  1155. $guide_seq =~ s/\./\-/g;
  1156. my @gaps = ();
  1157. $pos = -1;
  1158. while(($pos = index($guide_seq, '-', $pos)) > -1 ){
  1159. unshift @gaps, $pos;
  1160. $pos++;
  1161. }
  1162. foreach my $seq ($self->each_seq){
  1163. my @bases = split '', $seq->seq;
  1164. splice(@bases, $_, 1) foreach @gaps;
  1165. $seq->seq(join('', @bases));
  1166. }
  1167. 1;
  1168. }
  1169. =head2 map_chars
  1170. Title : map_chars
  1171. Usage : $ali->map_chars('\.','-')
  1172. Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
  1173. characters
  1174. Notice that the from (arg1) is interpretted as a regex,
  1175. so be careful about quoting meta characters (eg
  1176. $ali->map_chars('.','-') wont do what you want)
  1177. Returns :
  1178. Argument : 'from' rexexp
  1179. 'to' string
  1180. =cut
  1181. sub map_chars {
  1182. my $self = shift;
  1183. my $from = shift;
  1184. my $to = shift;
  1185. my ($seq,$temp);
  1186. $self->throw("Need exactly two arguments")
  1187. unless defined $from and defined $to;
  1188. foreach $seq ( $self->each_seq() ) {
  1189. $temp = $seq->seq();
  1190. $temp =~ s/$from/$to/g;
  1191. $seq->seq($temp);
  1192. }
  1193. return 1;
  1194. }
  1195. =head2 uppercase
  1196. Title : uppercase()
  1197. Usage : $ali->uppercase()
  1198. Function : Sets all the sequences to uppercase
  1199. Returns :
  1200. Argument :
  1201. =cut
  1202. sub uppercase {
  1203. my $self = shift;
  1204. my $seq;
  1205. my $temp;
  1206. foreach $seq ( $self->each_seq() ) {
  1207. $temp = $seq->seq();
  1208. $temp =~ tr/[a-z]/[A-Z]/;
  1209. $seq->seq($temp);
  1210. }
  1211. return 1;
  1212. }
  1213. =head2 cigar_line
  1214. Title : cigar_line()
  1215. Usage : %cigars = $align->cigar_line()
  1216. Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
  1217. Report) line for each sequence in the alignment. Examples are
  1218. "1,60" or "5,10:12,58", where the numbers refer to conserved
  1219. positions within the alignment. The keys of the hash are the
  1220. NSEs (name/start/end) assigned to each sequence.
  1221. Args : threshold (optional, defaults to 100)
  1222. Returns : Hash of strings (cigar lines)
  1223. =cut
  1224. sub cigar_line {
  1225. my $self = shift;
  1226. my $thr=shift||100;
  1227. my %cigars;
  1228. my @consensus = split "",($self->consensus_string($thr));
  1229. my $len = $self->length;
  1230. my $gapchar = $self->gap_char;
  1231. # create a precursor, something like (1,4,5,6,7,33,45),
  1232. # where each number corresponds to a conserved position
  1233. foreach my $seq ( $self->each_seq ) {
  1234. my @seq = split "", uc ($seq->seq);
  1235. my $pos = 1;
  1236. for (my $x = 0 ; $x < $len ; $x++ ) {
  1237. if ($seq[$x] eq $consensus[$x]) {
  1238. push @{$cigars{$seq->get_nse}},$pos;
  1239. $pos++;
  1240. } elsif ($seq[$x] ne $gapchar) {
  1241. $pos++;
  1242. }
  1243. }
  1244. }
  1245. # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
  1246. for my $name (keys %cigars) {
  1247. splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
  1248. ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
  1249. push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
  1250. ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
  1251. ${$cigars{$name}}[$#{$cigars{$name}}] );
  1252. for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
  1253. if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
  1254. ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
  1255. splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
  1256. }
  1257. }
  1258. }
  1259. # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
  1260. for my $name (keys %cigars) {
  1261. my @remove;
  1262. for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
  1263. if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
  1264. ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
  1265. unshift @remove,$x;
  1266. }
  1267. }
  1268. for my $pos (@remove) {
  1269. splice @{$cigars{$name}}, $pos, 1;
  1270. }
  1271. }
  1272. # join and punctuate
  1273. for my $name (keys %cigars) {
  1274. my ($start,$end,$str) = "";
  1275. while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
  1276. $str .= ($start . "," . $end . ":");
  1277. }
  1278. $str =~ s/:$//;
  1279. $cigars{$name} = $str;
  1280. }
  1281. %cigars;
  1282. }
  1283. =head2 match_line
  1284. Title : match_line()
  1285. Usage : $line = $align->match_line()
  1286. Function : Generates a match line - much like consensus string
  1287. except that a line indicating the '*' for a match.
  1288. Args : (optional) Match line characters ('*' by default)
  1289. (optional) Strong match char (':' by default)
  1290. (optional) Weak match char ('.' by default)
  1291. Returns : String
  1292. =cut
  1293. sub match_line {
  1294. my ($self,$matchlinechar, $strong, $weak) = @_;
  1295. my %matchchars = ('match' => $matchlinechar || '*',
  1296. 'weak' => $weak || '.',
  1297. 'strong' => $strong || ':',
  1298. 'mismatch' => ' ',
  1299. );
  1300. my @seqchars;
  1301. my $alphabet;
  1302. foreach my $seq ( $self->each_seq ) {
  1303. push @seqchars, [ split(//, uc ($seq->seq)) ];
  1304. $alphabet = $seq->alphabet unless defined $alphabet;
  1305. }
  1306. my $refseq = shift @seqchars;
  1307. # let's just march down the columns
  1308. my $matchline;
  1309. POS:
  1310. foreach my $pos ( 0..$self->length ) {
  1311. my $refchar = $refseq->[$pos];
  1312. my $char = $matchchars{'mismatch'};
  1313. unless( defined $refchar ) {
  1314. last if $pos == $self->length; # short circuit on last residue
  1315. # this in place to handle jason's soon-to-be-committed
  1316. # intron mapping code
  1317. goto bottom;
  1318. }
  1319. my %col = ($refchar => 1);
  1320. my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
  1321. foreach my $seq ( @seqchars ) {
  1322. next if $pos >= scalar @$seq;
  1323. $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
  1324. $seq->[$pos] eq ' ' );
  1325. $col{$seq->[$pos]}++ if defined $seq->[$pos];
  1326. }
  1327. my @colresidues = sort keys %col;
  1328. # if all the values are the same
  1329. if( $dash ) { $char = $matchchars{'mismatch'} }
  1330. elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
  1331. elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
  1332. # matches for protein seqs
  1333. TYPE:
  1334. foreach my $type ( qw(strong weak) ) {
  1335. # iterate through categories
  1336. my %groups;
  1337. # iterate through each of the aa in the col
  1338. # look to see which groups it is in
  1339. foreach my $c ( @colresidues ) {
  1340. foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
  1341. push @{$groups{$f}},$c;
  1342. }
  1343. }
  1344. GRP:
  1345. foreach my $cols ( values %groups ) {
  1346. @$cols = sort @$cols;
  1347. # now we are just testing to see if two arrays
  1348. # are identical w/o changing either one
  1349. # have to be same len
  1350. next if( scalar @$cols != scalar @colresidues );
  1351. # walk down the length and check each slot
  1352. for($_=0;$_ < (scalar @$cols);$_++ ) {
  1353. next GRP if( $cols->[$_] ne $colresidues[$_] );
  1354. }
  1355. $char = $matchchars{$type};
  1356. last TYPE;
  1357. }
  1358. }
  1359. }
  1360. bottom:
  1361. $matchline .= $char;
  1362. }
  1363. return $matchline;
  1364. }
  1365. =head2 gap_line
  1366. Title : gap_line()
  1367. Usage : $line = $align->gap_line()
  1368. Function : Generates a gap line - much like consensus string
  1369. except that a line where '-' represents gap
  1370. Args : (optional) gap line characters ('-' by default)
  1371. Returns : string
  1372. =cut
  1373. sub gap_line {
  1374. my ($self,$gapchar) = @_;
  1375. $gapchar = $gapchar || $self->gap_char;
  1376. my %gap_hsh; # column gaps vector
  1377. foreach my $seq ( $self->each_seq ) {
  1378. my $i = 0;
  1379. map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
  1380. map {[$i++, $_]} split(//, uc ($seq->seq));
  1381. }
  1382. my $gap_line;
  1383. foreach my $pos ( 0..$self->length-1 ) {
  1384. $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
  1385. }
  1386. return $gap_line;
  1387. }
  1388. =head2 all_gap_line
  1389. Title : all_gap_line()
  1390. Usage : $line = $align->all_gap_line()
  1391. Function : Generates a gap line - much like consensus string
  1392. except that a line where '-' represents all-gap column
  1393. Args : (optional) gap line characters ('-' by default)
  1394. Returns : string
  1395. =cut
  1396. sub all_gap_line {
  1397. my ($self,$gapchar) = @_;
  1398. $gapchar = $gapchar || $self->gap_char;
  1399. my %gap_hsh; # column gaps counter hash
  1400. my @seqs = $self->each_seq;
  1401. foreach my $seq ( @seqs ) {
  1402. my $i = 0;
  1403. map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
  1404. map {[$i++, $_]} split(//, uc ($seq->seq));
  1405. }
  1406. my

Large files files are truncated, but you can click here to view the full file