/interfaces/Perl/RNA/Files.pm
https://github.com/ViennaRNA/ViennaRNA · Perl · 187 lines · 139 code · 47 blank · 1 comment · 28 complexity · 399c0294f102dba986a241ec92e4aea1 MD5 · raw file
- package RNA::Files;
- use strict;
- use Exporter;
- use warnings;
- use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
- $VERSION = 1.00;
- @ISA = qw(Exporter);
- @EXPORT = ();
- @EXPORT_OK = qw(ParseDotPlotPS
- ParseRNAstrandDP);
- %EXPORT_TAGS = (ALL => [qw(&ParseDotPlotPS &ParseRNAstrandDP)]);
- =head1 AUTHOR
- Ronny Lorenz (ronny@tbi.univie.ac.at)
- =head1 NAME
- RNA::Files - A set of subroutines to parse RNA secondary structure
- data containing file types
- =head1 DESCRIPTION
- This package provides various subroutines for parsing and extracting data
- from several file formats that contain RNA secondary structure data.
- =head1 METHODS
- =cut
- =head2 ParseDotplotPS($filename)
- I<Parse a dot-plot postscript file>
- This subroutine parses a PostScript (PS) dot-plot file as generated by
- programs of the RNA Package. From the dot-plot file, it extracts
- the RNA sequence, MFE structure, and base pair probabilities.
- =cut
- sub ParseDotPlotPS{
- my $dotplot = shift;
- my %mfe;
- my %probs;
- my $seq;
- my $seq_start;
- if( -e $dotplot ){
- open(DOT, "<".$dotplot);
- while(<DOT>){
- if(defined($seq_start)){
- undef($seq_start), next unless /^([AUCGTNaucgtn]+)/;
- $seq .= $1;
- }
- $seq_start = 1 if /^\/sequence/;
- next unless /(\d+)\s+(\d+)\s+([0-9.Ee-]+)\s+(ubox|lbox)/;
- my ($i, $j, $p) = ($1,$2,$3);
- if($4 eq "ubox"){
- $p *= $p;
- $probs{$i,$j} = $p;
- } else {
- $mfe{$i,$j} = 1;
- }
- }
- close(DOT);
- return ($seq, \%mfe, \%probs);
- } else {
- return undef;
- }
- }
- =head2 ParseRNAstrandDP($filename)
- I<Parse an RNAstrand generated Dot-Parenthesis file>
- This function parses an RNAstrand Dot-Parenthesis (DP) file to extract the
- contained sequences, reference structures, and data sources.
- It returns a reference to an array of hashes with the RNAstrand data set
- where the following hash keys are available:
- =over
- =item * B<NAME> ... The name/identifier of the data set entry
- =item * B<EXTERNAL> ... The source (short version) of the data
- =item * B<EXTERNAL_FULL> ... The source (full version) of the data
- =item * B<SEQUENCE> ... The RNA sequence
- =item * B<STRUCTURE> ... The RNA secondary structure
- =back
- I<For now, the RNAstrand file must not contain pseudo-knots!>
- Otherwise, parsing will fail due to unrecognized characters within the
- structure string.
- =cut
- sub ParseRNAstrandDP{
- open( FILE, $_[0] );
- my @data = ();
- my @tmp = ();
- while ( my $line = <FILE> ) {
- if ( $line =~ m/^#/ ) {
- if ( $#tmp != -1 and $tmp[$#tmp] !~ m/^#/ ) {
- push @data, _parse_block( \@tmp );
- @tmp = ();
- }
- }
- chomp $line;
- push @tmp, $line;
- }
- push @data, _parse_block( \@tmp ) if ( $#tmp != -1 );
- return \@data;
- }
- sub _parse_block {
- my @tmp = @{ $_[0] };
- my @seq = ();
- my @struct = ();
- my $external = '';
- my $external_full = '';
- my $name = '';
- for my $i ( 0 .. $#tmp ) {
- if ( $tmp[$i] =~ m/^#\sFile\s(.*)/ ) {
- $name = $1;
- next;
- }
- if ( $tmp[$i] =~ m/^#\sExternal source:\s([^,]+),?/ ) {
- $external = $1;
- }
- if ( $tmp[$i] =~ m/^#\sExternal source:\s(.*)/ ) {
- $external_full = $1;
- }
- next if ( $tmp[$i] =~ m/^#/ );
- my $seq_flag = ( $tmp[$i] =~ m/.*[A-Z|~].*/i ) ? 1 : 0;
- my $struct_flag = ( $tmp[$i] =~ m/.*[\.\(\)].*/ ) ? 1 : 0;
- push @seq, $tmp[$i] if ( $seq_flag == 1 and $struct_flag == 0 );
- push @struct, $tmp[$i] if ( $seq_flag == 0 and $struct_flag == 1 );
- if ( $seq_flag == 1 and $struct_flag == 1 ) {
- my $count = ( $tmp[$i] =~ tr/[A-Z]// );
- if ( $count / length( $tmp[$i] ) > 0.5 ) {
- push @seq, $tmp[$i];
- }
- }
- }
- my $tmphash = {};
- $tmphash->{NAME} = $name;
- $tmphash->{EXTERNAL} = $external;
- $tmphash->{EXTERNAL_FULL} = $external_full;
- $tmphash->{SEQUENCE} = join( "", @seq );
- $tmphash->{SEQUENCE} =~ s/\x{C2}\x{B0}/N/g;
- #$tmphash->{sequence} =~ s/([^[:ascii:]])/sprintf("&#%d;",ord($1))/eg;
- $tmphash->{STRUCTURE} = join( "", @struct );
- if ( length( $tmphash->{SEQUENCE} ) != length( $tmphash->{STRUCTURE} ) ) {
- print STDERR "Unable to parse RNAstrand file correctly!\nMaybe it contains pseudo-knotted structures?\n";
- print STDERR join( "\n", @tmp );
- print STDERR "SEQ: ", $tmphash->{SEQUENCE}, "\n";
- print STDERR "STR: ", $tmphash->{STRUCTURE}, "\n";
- print STDERR "SEQ: ", length( $tmphash->{SEQUENCE} ), "\n";
- print STDERR "STR: ", length( $tmphash->{STRUCTURE} ), "\n";
- exit 1;
- }
- return $tmphash;
- }
- 1;