/Bio/AlignIO/nexus.pm

http://github.com/bioperl/bioperl-live · Perl · 469 lines · 344 code · 80 blank · 45 comment · 77 complexity · 82612091fba0a6d9d198379e12c26fa2 MD5 · raw file

  1. #
  2. # BioPerl module for Bio::AlignIO::nexus
  3. #
  4. # Copyright Heikki Lehvaslaiho
  5. #
  6. =head1 NAME
  7. Bio::AlignIO::nexus - NEXUS format sequence input/output stream
  8. =head1 SYNOPSIS
  9. Do not use this module directly. Use it via the L<Bio::AlignIO> class.
  10. use Bio::AlignIO;
  11. my $in = Bio::AlignIO->new(-format => 'nexus',
  12. -file => 'aln.nexus');
  13. while( my $aln = $in->next_aln ) {
  14. # do something with the alignment
  15. }
  16. =head1 DESCRIPTION
  17. This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
  18. data blocks. See method documentation for supported NEXUS features.
  19. =head1 ACKNOWLEDGEMENTS
  20. Will Fisher has written an excellent standalone NEXUS format parser in
  21. Perl, readnexus. A number of tricks were adapted from it.
  22. =head1 FEEDBACK
  23. =head2 Support
  24. Please direct usage questions or support issues to the mailing list:
  25. I<bioperl-l@bioperl.org>
  26. rather than to the module maintainer directly. Many experienced and
  27. reponsive experts will be able look at the problem and quickly
  28. address it. Please include a thorough description of the problem
  29. with code and data examples if at all possible.
  30. =head2 Reporting Bugs
  31. Report bugs to the Bioperl bug tracking system to help us keep track
  32. the bugs and their resolution. Bug reports can be submitted via the
  33. web:
  34. https://github.com/bioperl/bioperl-live/issues
  35. =head1 AUTHORS - Heikki Lehvaslaiho
  36. Email: heikki-at-bioperl-dot-org
  37. =head1 APPENDIX
  38. The rest of the documentation details each of the object
  39. methods. Internal methods are usually preceded with a _
  40. =cut
  41. # Let the code begin...
  42. package Bio::AlignIO::nexus;
  43. use vars qw(%valid_type);
  44. use strict;
  45. no strict "refs";
  46. use base qw(Bio::AlignIO);
  47. BEGIN {
  48. %valid_type = map {$_, 1} qw( dna rna protein standard );
  49. # standard throws error: inherited from Bio::PrimarySeq
  50. }
  51. =head2 new
  52. Title : new
  53. Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
  54. Function: returns a new Bio::AlignIO object to handle clustalw files
  55. Returns : Bio::AlignIO::clustalw object
  56. Args : -verbose => verbosity setting (-1,0,1,2)
  57. -file => name of file to read in or with ">" - writeout
  58. -fh => alternative to -file param - provide a filehandle
  59. to read from/write to
  60. -format => type of Alignment Format to process or produce
  61. Customization of nexus flavor output
  62. -show_symbols => print the symbols="ATGC" in the data definition
  63. (MrBayes does not like this)
  64. boolean [default is 1]
  65. -show_endblock => print an 'endblock;' at the end of the data
  66. (MyBayes does not like this)
  67. boolean [default is 1]
  68. =cut
  69. sub _initialize {
  70. my ($self, @args) = @_;
  71. $self->SUPER::_initialize(@args);
  72. my ($show_symbols, $endblock) =
  73. $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
  74. my @names = qw(symbols endblock);
  75. for my $v ( $show_symbols, $endblock ) {
  76. $v = 1 unless defined $v; # default value is 1
  77. my $n = shift @names;
  78. $self->flag($n, $v);
  79. }
  80. }
  81. =head2 next_aln
  82. Title : next_aln
  83. Usage : $aln = $stream->next_aln()
  84. Function: Returns the next alignment in the stream.
  85. Supports the following NEXUS format features:
  86. - The file has to start with '#NEXUS'
  87. - Reads in the name of the alignment from a comment
  88. (anything after 'TITLE: ') .
  89. - Sequence names can be given in a taxa block, too.
  90. - If matchchar notation is used, converts
  91. them back to sequence characters.
  92. - Does character conversions specified in the
  93. NEXUS equate command.
  94. - Sequence names of type 'Homo sapiens' and
  95. Homo_sapiens are treated identically.
  96. Returns : L<Bio::Align::AlignI> object
  97. Args :
  98. =cut
  99. sub next_aln {
  100. my $self = shift;
  101. my $entry;
  102. my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
  103. $match, $gap, $missing, $equate, $interleave,
  104. $name,$str,@names,$seqname,$start,$end,$count,$seq);
  105. local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
  106. local $Bio::LocatableSeq::GAP_SYMBOLS = '\-';
  107. my $aln = Bio::SimpleAlign->new(-source => 'nexus');
  108. # file starts with '#NEXUS' but we allow white space only lines before it
  109. $entry = $self->_readline;
  110. $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
  111. return unless $entry;
  112. $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
  113. unless ($entry && $entry =~ /^#NEXUS/i);
  114. # skip anything before either the taxa or data block
  115. # but read in the optional title in a comment
  116. while (defined($entry = $self->_readline)) {
  117. local ($_) = $entry;
  118. /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
  119. last if /^begin +data/i || /^begin +taxa/i;
  120. }
  121. $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
  122. # data and taxa blocks
  123. my $incomment;
  124. while (defined ($entry = $self->_readline)) {
  125. local ($_) = $entry;
  126. next if s/\[[^\]]+\]//g; # remove comments
  127. if( s/\[[^\]]+$// ) {
  128. $incomment = 1;
  129. # skip line if it is now empty or contains only whitespace
  130. next if /^\s*$/;
  131. } elsif($incomment) {
  132. if( s/^[^\]]*\]// ) {
  133. $incomment = 0;
  134. } else {
  135. next;
  136. }
  137. } elsif( /taxlabels/i ) {
  138. # doesn't deal with taxlabels adequately and can mess things up!
  139. # @names = $self->_read_taxlabels;
  140. } else {
  141. /ntax\s*=\s*(\d+)/i and $seqcount = $1;
  142. /nchar\s*=\s*(\d+)/i and $residuecount = $1;
  143. /matchchar\s*=\s*(.)/i and $match = $1;
  144. /gap\s*=\s*(.)/i and $gap = $1;
  145. /missing\s*=\s*(.)/i and $missing = $1;
  146. /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
  147. /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
  148. /interleave/i and $interleave = 1 ;
  149. last if /matrix/io;
  150. }
  151. }
  152. $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
  153. unless $alphabet;
  154. $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
  155. unless $valid_type{$alphabet};
  156. $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
  157. if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
  158. $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
  159. if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
  160. $aln->gap_char($gap);
  161. $aln->missing_char($missing);
  162. #
  163. # if data is not right after the matrix line
  164. # read the empty lines out
  165. #
  166. while ($entry = $self->_readline) {
  167. unless ($entry =~ /^\s+$/) {
  168. $self->_pushback($entry);
  169. last;
  170. }
  171. }
  172. #
  173. # matrix command
  174. #
  175. # first alignment section
  176. if (@names == 0) { # taxa block did not exist
  177. while ($entry = $self->_readline) {
  178. local ($_) = $entry;
  179. if( s/\[[^\]]+\]//g ) { #] remove comments
  180. next if /^\s*$/;
  181. # skip line if it is now empty or contains only whitespace
  182. }
  183. if ($interleave && defined$count && ($count <= $seqcount)) {
  184. /^\s+$/ and last;
  185. } else {
  186. /^\s+$/ and next;
  187. }
  188. /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
  189. #/^\s*;\s*$/ and last;
  190. if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
  191. # get single and double quoted names, or all the first
  192. # nonwhite word as the name, and remained is seq
  193. #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
  194. $name = ($2 || $3);
  195. if ($4) {
  196. # seq is on same line as name
  197. # this is the usual NEXUS format
  198. $str = $4;
  199. } else {
  200. # otherwise get seq from following lines. No comments allowed
  201. # a less common matrix format, usually used for very long seqs
  202. $str='';
  203. while (local ($_) = $self->_readline) {
  204. my $str_tmp = $_;
  205. $str_tmp =~ s/[\s;]//g;
  206. $str .= $str_tmp;
  207. last if length$str == $residuecount;
  208. }
  209. }
  210. $name =~ s/ /_/g;
  211. push @names, $name;
  212. $str =~ s/[\s;]//g;
  213. $count = @names;
  214. $hash{$count} = $str;
  215. }
  216. $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
  217. /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
  218. }
  219. }
  220. # interleaved sections
  221. $count = 0;
  222. if ( $interleave ) { # only read next section if file is interleaved
  223. while( $entry = $self->_readline) {
  224. local ($_) = $entry;
  225. if( s/\[[^\]]+\]//g ) { #] remove comments
  226. next if /^\s*$/; # skip line if it is now empty or contains only whitespace
  227. }
  228. /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
  229. $count = 0, next if $entry =~ /^\s*$/;
  230. if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
  231. $str = $4;
  232. $str =~ s/[\s;]//g;
  233. $count++;
  234. $hash{$count} .= $str;
  235. };
  236. $self->throw("Not a valid interleaved NEXUS file!
  237. seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
  238. /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
  239. }
  240. }
  241. return if @names < 1;
  242. # sequence creation
  243. $count = 0;
  244. foreach $name ( @names ) {
  245. $count++;
  246. if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
  247. ($seqname,$start,$end) = ($1,$2,$3);
  248. } else {
  249. ($seqname,$start,$str) = ($name,1,$hash{$count});
  250. $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
  251. $end = length($str);
  252. }
  253. # consistency test
  254. $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count}))
  255. unless CORE::length($hash{$count}) == $residuecount;
  256. $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
  257. '-display_id' => $seqname,
  258. '-start' => $start,
  259. '-end' => $end,
  260. '-alphabet' => $alphabet
  261. );
  262. $aln->add_seq($seq);
  263. }
  264. # if matchchar is used
  265. $aln->unmatch($match) if $match;
  266. # if equate ( e.g. equate="T=C G=A") is used
  267. if ($equate) {
  268. $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
  269. }
  270. while (defined $entry &&
  271. $entry !~ /endblock/i) {
  272. $entry = $self->_readline;
  273. }
  274. return $aln if $aln->num_sequences;
  275. return;
  276. }
  277. sub _read_taxlabels {
  278. my ($self) = @_;
  279. my ($name, @names);
  280. while (my $entry = $self->_readline) {
  281. last if $entry =~ m/^\s*(END)?;/i;
  282. if( $entry =~ m/\s*(\S+)\s+/ ) {
  283. ($name) = ($1);
  284. $name =~ s/\[[^\[]+\]//g;
  285. $name =~ s/\W/_/g;
  286. push @names, $name;
  287. }
  288. }
  289. return @names;
  290. }
  291. =head2 write_aln
  292. Title : write_aln
  293. Usage : $stream->write_aln(@aln)
  294. Function: Writes the $aln object into the stream in interleaved NEXUS
  295. format. Everything is written into a data block.
  296. SimpleAlign methods match_char, missing_char and gap_char must be set
  297. if you want to see them in the output.
  298. Returns : 1 for success and 0 for error
  299. Args : L<Bio::Align::AlignI> object
  300. =cut
  301. sub write_aln {
  302. my ($self,@aln) = @_;
  303. my $count = 0;
  304. my $wrapped = 0;
  305. my $maxname;
  306. my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
  307. my ($match, $missing, $gap,$symbols) = ('', '', '','');
  308. foreach my $aln (@aln) {
  309. if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
  310. $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
  311. next;
  312. }
  313. $self->throw("All sequences in the alignment must be the same length")
  314. unless $aln->is_flush($self->verbose);
  315. $length = $aln->length();
  316. $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
  317. $aln->id, $aln->num_sequences, $length));
  318. $match = "match=". $aln->match_char if $aln->match_char;
  319. $missing = "missing=". $aln->missing_char if $aln->missing_char;
  320. $gap = "gap=". $aln->gap_char if $aln->gap_char;
  321. $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
  322. if( $self->flag('symbols') && $aln->symbol_chars);
  323. $self->_print
  324. (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
  325. $aln->get_seq_by_pos(1)->alphabet, $match,
  326. $missing, $gap, $symbols));
  327. # account for single quotes round names
  328. my $indent = $aln->maxdisplayname_length+2;
  329. $aln->set_displayname_flat();
  330. foreach $seq ( $aln->each_seq() ) {
  331. my $nmid = $aln->displayname($seq->get_nse());
  332. if( $nmid =~ /[^\w\d\.]/ ) {
  333. # put name in single quotes incase it contains any of
  334. # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
  335. # allowed in PAUP* and possible other software
  336. $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
  337. } else {
  338. $name = sprintf("%-${indent}s", $nmid);
  339. }
  340. $hash{$name} = $seq->seq;
  341. push(@arr,$name);
  342. }
  343. while( $count < $length ) {
  344. # there is another block to go!
  345. foreach $name ( @arr ) {
  346. my $dispname = $name;
  347. # $dispname = '' if $wrapped;
  348. $self->_print (sprintf("%${indent}s ",$dispname));
  349. $tempcount = $count;
  350. $index = 0;
  351. while( ($tempcount + 10 < $length) && ($index < 5) ) {
  352. $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
  353. $tempcount += 10;
  354. $index++;
  355. }
  356. # last
  357. if( $index < 5) {
  358. # space to print!
  359. $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
  360. $tempcount += 10;
  361. }
  362. $self->_print ("\n");
  363. }
  364. $self->_print ("\n\n");
  365. $count = $tempcount;
  366. $wrapped = 1;
  367. }
  368. if( $self->flag('endblock') ) {
  369. $self->_print (";\n\nendblock;\n");
  370. } else {
  371. $self->_print (";\n\nend;\n");
  372. }
  373. }
  374. $self->flush if $self->_flush_on_write && defined $self->_fh;
  375. return 1;
  376. }
  377. =head2 flag
  378. Title : flag
  379. Usage : $obj->flag($name,$value)
  380. Function: Get/Set a flag value
  381. Returns : value of flag (a scalar)
  382. Args : on set, new value (a scalar or undef, optional)
  383. =cut
  384. sub flag{
  385. my ($self,$name,$val) = @_;
  386. return $self->{'flag'}->{$name} = $val if defined $val;
  387. return $self->{'flag'}->{$name};
  388. }
  389. 1;