/interfaces/Perl/RNA/Files.pm

https://github.com/ViennaRNA/ViennaRNA · Perl · 187 lines · 139 code · 47 blank · 1 comment · 28 complexity · 399c0294f102dba986a241ec92e4aea1 MD5 · raw file

  1. package RNA::Files;
  2. use strict;
  3. use Exporter;
  4. use warnings;
  5. use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  6. $VERSION = 1.00;
  7. @ISA = qw(Exporter);
  8. @EXPORT = ();
  9. @EXPORT_OK = qw(ParseDotPlotPS
  10. ParseRNAstrandDP);
  11. %EXPORT_TAGS = (ALL => [qw(&ParseDotPlotPS &ParseRNAstrandDP)]);
  12. =head1 AUTHOR
  13. Ronny Lorenz (ronny@tbi.univie.ac.at)
  14. =head1 NAME
  15. RNA::Files - A set of subroutines to parse RNA secondary structure
  16. data containing file types
  17. =head1 DESCRIPTION
  18. This package provides various subroutines for parsing and extracting data
  19. from several file formats that contain RNA secondary structure data.
  20. =head1 METHODS
  21. =cut
  22. =head2 ParseDotplotPS($filename)
  23. I<Parse a dot-plot postscript file>
  24. This subroutine parses a PostScript (PS) dot-plot file as generated by
  25. programs of the RNA Package. From the dot-plot file, it extracts
  26. the RNA sequence, MFE structure, and base pair probabilities.
  27. =cut
  28. sub ParseDotPlotPS{
  29. my $dotplot = shift;
  30. my %mfe;
  31. my %probs;
  32. my $seq;
  33. my $seq_start;
  34. if( -e $dotplot ){
  35. open(DOT, "<".$dotplot);
  36. while(<DOT>){
  37. if(defined($seq_start)){
  38. undef($seq_start), next unless /^([AUCGTNaucgtn]+)/;
  39. $seq .= $1;
  40. }
  41. $seq_start = 1 if /^\/sequence/;
  42. next unless /(\d+)\s+(\d+)\s+([0-9.Ee-]+)\s+(ubox|lbox)/;
  43. my ($i, $j, $p) = ($1,$2,$3);
  44. if($4 eq "ubox"){
  45. $p *= $p;
  46. $probs{$i,$j} = $p;
  47. } else {
  48. $mfe{$i,$j} = 1;
  49. }
  50. }
  51. close(DOT);
  52. return ($seq, \%mfe, \%probs);
  53. } else {
  54. return undef;
  55. }
  56. }
  57. =head2 ParseRNAstrandDP($filename)
  58. I<Parse an RNAstrand generated Dot-Parenthesis file>
  59. This function parses an RNAstrand Dot-Parenthesis (DP) file to extract the
  60. contained sequences, reference structures, and data sources.
  61. It returns a reference to an array of hashes with the RNAstrand data set
  62. where the following hash keys are available:
  63. =over
  64. =item * B<NAME> ... The name/identifier of the data set entry
  65. =item * B<EXTERNAL> ... The source (short version) of the data
  66. =item * B<EXTERNAL_FULL> ... The source (full version) of the data
  67. =item * B<SEQUENCE> ... The RNA sequence
  68. =item * B<STRUCTURE> ... The RNA secondary structure
  69. =back
  70. I<For now, the RNAstrand file must not contain pseudo-knots!>
  71. Otherwise, parsing will fail due to unrecognized characters within the
  72. structure string.
  73. =cut
  74. sub ParseRNAstrandDP{
  75. open( FILE, $_[0] );
  76. my @data = ();
  77. my @tmp = ();
  78. while ( my $line = <FILE> ) {
  79. if ( $line =~ m/^#/ ) {
  80. if ( $#tmp != -1 and $tmp[$#tmp] !~ m/^#/ ) {
  81. push @data, _parse_block( \@tmp );
  82. @tmp = ();
  83. }
  84. }
  85. chomp $line;
  86. push @tmp, $line;
  87. }
  88. push @data, _parse_block( \@tmp ) if ( $#tmp != -1 );
  89. return \@data;
  90. }
  91. sub _parse_block {
  92. my @tmp = @{ $_[0] };
  93. my @seq = ();
  94. my @struct = ();
  95. my $external = '';
  96. my $external_full = '';
  97. my $name = '';
  98. for my $i ( 0 .. $#tmp ) {
  99. if ( $tmp[$i] =~ m/^#\sFile\s(.*)/ ) {
  100. $name = $1;
  101. next;
  102. }
  103. if ( $tmp[$i] =~ m/^#\sExternal source:\s([^,]+),?/ ) {
  104. $external = $1;
  105. }
  106. if ( $tmp[$i] =~ m/^#\sExternal source:\s(.*)/ ) {
  107. $external_full = $1;
  108. }
  109. next if ( $tmp[$i] =~ m/^#/ );
  110. my $seq_flag = ( $tmp[$i] =~ m/.*[A-Z|~].*/i ) ? 1 : 0;
  111. my $struct_flag = ( $tmp[$i] =~ m/.*[\.\(\)].*/ ) ? 1 : 0;
  112. push @seq, $tmp[$i] if ( $seq_flag == 1 and $struct_flag == 0 );
  113. push @struct, $tmp[$i] if ( $seq_flag == 0 and $struct_flag == 1 );
  114. if ( $seq_flag == 1 and $struct_flag == 1 ) {
  115. my $count = ( $tmp[$i] =~ tr/[A-Z]// );
  116. if ( $count / length( $tmp[$i] ) > 0.5 ) {
  117. push @seq, $tmp[$i];
  118. }
  119. }
  120. }
  121. my $tmphash = {};
  122. $tmphash->{NAME} = $name;
  123. $tmphash->{EXTERNAL} = $external;
  124. $tmphash->{EXTERNAL_FULL} = $external_full;
  125. $tmphash->{SEQUENCE} = join( "", @seq );
  126. $tmphash->{SEQUENCE} =~ s/\x{C2}\x{B0}/N/g;
  127. #$tmphash->{sequence} =~ s/([^[:ascii:]])/sprintf("&#%d;",ord($1))/eg;
  128. $tmphash->{STRUCTURE} = join( "", @struct );
  129. if ( length( $tmphash->{SEQUENCE} ) != length( $tmphash->{STRUCTURE} ) ) {
  130. print STDERR "Unable to parse RNAstrand file correctly!\nMaybe it contains pseudo-knotted structures?\n";
  131. print STDERR join( "\n", @tmp );
  132. print STDERR "SEQ: ", $tmphash->{SEQUENCE}, "\n";
  133. print STDERR "STR: ", $tmphash->{STRUCTURE}, "\n";
  134. print STDERR "SEQ: ", length( $tmphash->{SEQUENCE} ), "\n";
  135. print STDERR "STR: ", length( $tmphash->{STRUCTURE} ), "\n";
  136. exit 1;
  137. }
  138. return $tmphash;
  139. }
  140. 1;