Files.pm | searchcode

/interfaces/Perl/RNA/Files.pm

https://github.com/ViennaRNA/ViennaRNA · Perl · 187 lines · 139 code · 47 blank · 1 comment · 28 complexity · 399c0294f102dba986a241ec92e4aea1 MD5 · raw file

package RNA::Files;

use strict;
use Exporter;
use warnings;

use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

$VERSION     = 1.00;
@ISA         = qw(Exporter);
@EXPORT      = ();
@EXPORT_OK   = qw(ParseDotPlotPS
                  ParseRNAstrandDP);
%EXPORT_TAGS = (ALL => [qw(&ParseDotPlotPS &ParseRNAstrandDP)]);

=head1 AUTHOR

Ronny Lorenz (ronny@tbi.univie.ac.at)

=head1 NAME

RNA::Files - A set of subroutines to parse RNA secondary structure
data containing file types

=head1 DESCRIPTION

This package provides various subroutines for parsing and extracting data
from several file formats that contain RNA secondary structure data.

=head1 METHODS

=cut

=head2  ParseDotplotPS($filename)

I<Parse a dot-plot postscript file>

This subroutine parses a PostScript (PS) dot-plot file as generated by
programs of the RNA Package. From the dot-plot file, it extracts
the RNA sequence, MFE structure, and base pair probabilities.


=cut

sub ParseDotPlotPS{
  my $dotplot = shift;
  my %mfe;
  my %probs;
  my $seq;
  my $seq_start;

  if( -e $dotplot ){
    open(DOT, "<".$dotplot);
    while(<DOT>){
      if(defined($seq_start)){
        undef($seq_start), next  unless /^([AUCGTNaucgtn]+)/;
        $seq .= $1;
      }

      $seq_start = 1 if /^\/sequence/;

      next unless /(\d+)\s+(\d+)\s+([0-9.Ee-]+)\s+(ubox|lbox)/;
      my ($i, $j, $p) = ($1,$2,$3);
      if($4 eq "ubox"){
        $p *= $p;
        $probs{$i,$j} = $p;
      } else {
        $mfe{$i,$j} = 1;
      }
    }
    close(DOT);
    return ($seq, \%mfe, \%probs);
  } else {
    return undef;
  }
}

=head2 ParseRNAstrandDP($filename)

I<Parse an RNAstrand generated Dot-Parenthesis file>

This function parses an RNAstrand Dot-Parenthesis (DP) file to extract the
contained sequences, reference structures, and data sources.

It returns a reference to an array of hashes with the RNAstrand data set
where the following hash keys are available:

=over

=item * B<NAME> ... The name/identifier of the data set entry

=item * B<EXTERNAL> ... The source (short version) of the data

=item * B<EXTERNAL_FULL> ... The source (full version) of the data

=item * B<SEQUENCE> ... The RNA sequence

=item * B<STRUCTURE> ... The RNA secondary structure

=back

I<For now, the RNAstrand file must not contain pseudo-knots!>

Otherwise, parsing will fail due to unrecognized characters within the
structure string.

=cut

sub ParseRNAstrandDP{

  open( FILE, $_[0] );
  my @data = ();
  my @tmp  = ();
  while ( my $line = <FILE> ) {
    if ( $line =~ m/^#/ ) {
      if ( $#tmp != -1 and $tmp[$#tmp] !~ m/^#/ ) {
        push @data, _parse_block( \@tmp );
        @tmp = ();
      }
    }
    chomp $line;
    push @tmp, $line;
  }
  push @data, _parse_block( \@tmp ) if ( $#tmp != -1 );

  return \@data;
}

sub _parse_block {
  my @tmp = @{ $_[0] };

  my @seq           = ();
  my @struct        = ();
  my $external      = '';
  my $external_full = '';
  my $name          = '';
  for my $i ( 0 .. $#tmp ) {
    if ( $tmp[$i] =~ m/^#\sFile\s(.*)/ ) {
      $name = $1;
      next;
    }
    if ( $tmp[$i] =~ m/^#\sExternal source:\s([^,]+),?/ ) {
      $external = $1;
    }
    if ( $tmp[$i] =~ m/^#\sExternal source:\s(.*)/ ) {
      $external_full = $1;
    }
    next if ( $tmp[$i] =~ m/^#/ );
    my $seq_flag    = ( $tmp[$i] =~ m/.*[A-Z|~].*/i ) ? 1 : 0;
    my $struct_flag = ( $tmp[$i] =~ m/.*[\.\(\)].*/ ) ? 1 : 0;
    push @seq,    $tmp[$i] if ( $seq_flag == 1 and $struct_flag == 0 );
    push @struct, $tmp[$i] if ( $seq_flag == 0 and $struct_flag == 1 );
    if ( $seq_flag == 1 and $struct_flag == 1 ) {
      my $count = ( $tmp[$i] =~ tr/[A-Z]// );
      if ( $count / length( $tmp[$i] ) > 0.5 ) {
        push @seq, $tmp[$i];
      }

    }
  }

  my $tmphash = {};
  $tmphash->{NAME}          = $name;
  $tmphash->{EXTERNAL}      = $external;
  $tmphash->{EXTERNAL_FULL} = $external_full;
  $tmphash->{SEQUENCE}      = join( "", @seq );
  $tmphash->{SEQUENCE} =~ s/\x{C2}\x{B0}/N/g;

  #$tmphash->{sequence} =~ s/([^[:ascii:]])/sprintf("&#%d;",ord($1))/eg;
  $tmphash->{STRUCTURE} = join( "", @struct );

  if ( length( $tmphash->{SEQUENCE} ) != length( $tmphash->{STRUCTURE} ) ) {
    print STDERR "Unable to parse RNAstrand file correctly!\nMaybe it contains pseudo-knotted structures?\n";
    print STDERR join( "\n", @tmp );
    print STDERR "SEQ: ", $tmphash->{SEQUENCE},  "\n";
    print STDERR "STR: ", $tmphash->{STRUCTURE}, "\n";
    print STDERR "SEQ: ", length( $tmphash->{SEQUENCE} ),  "\n";
    print STDERR "STR: ", length( $tmphash->{STRUCTURE} ), "\n";

    exit 1;
  }

  return $tmphash;
}


1;