PageRenderTime 1030ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/Bio/ClusterIO/unigene.pm

http://github.com/bioperl/bioperl-live
Perl | 276 lines | 198 code | 42 blank | 36 comment | 11 complexity | 7dd9ebc6f24ebd2e41e78d9fe3b2ccde MD5 | raw file
Possible License(s): GPL-3.0, AGPL-1.0
  1. # BioPerl module for Bio::ClusterIO::unigene
  2. #
  3. # Please direct questions and support issues to <bioperl-l@bioperl.org>
  4. #
  5. # Cared for by Andrew Macgregor <andrew at cbbc.murdoch.edu.au>
  6. #
  7. # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
  8. # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
  9. # http://meg.otago.ac.nz
  10. #
  11. # You may distribute this module under the same terms as perl itself
  12. #
  13. # _history
  14. # April 17, 2002 - Initial implementation by Andrew Macgregor
  15. # POD documentation - main docs before the code
  16. =head1 NAME
  17. Bio::ClusterIO::unigene - UniGene input stream
  18. =head1 SYNOPSIS
  19. Do not use this module directly. Use it via the Bio::ClusterIO class.
  20. =head1 DESCRIPTION
  21. This object reads from Unigene *.data files downloaded from
  22. ftp://ftp.ncbi.nih.gov/repository/UniGene/. It does not download and
  23. decompress the file, you have to do that yourself.
  24. =head1 FEEDBACK
  25. =head2 Mailing Lists
  26. User feedback is an integral part of the evolution of this and other
  27. Bioperl modules. Send your comments and suggestions preferably to one
  28. of the Bioperl mailing lists. Your participation is much appreciated.
  29. bioperl-l@bioperl.org - General discussion
  30. http://bioperl.org/wiki/Mailing_lists - About the mailing lists
  31. =head2 Support
  32. Please direct usage questions or support issues to the mailing list:
  33. I<bioperl-l@bioperl.org>
  34. rather than to the module maintainer directly. Many experienced and
  35. reponsive experts will be able look at the problem and quickly
  36. address it. Please include a thorough description of the problem
  37. with code and data examples if at all possible.
  38. =head2 Reporting Bugs
  39. Report bugs to the Bioperl bug tracking system to help us keep track
  40. the bugs and their resolution. Bug reports can be submitted via the
  41. web:
  42. https://github.com/bioperl/bioperl-live/issues
  43. =head1 AUTHORS - Andrew Macgregor
  44. Email: andrew at cbbc.murdoch.edu.au
  45. =head1 APPENDIX
  46. The rest of the documentation details each of the object
  47. methods. Internal methods are usually preceded with a _
  48. =cut
  49. #'
  50. # Let the code begin...
  51. package Bio::ClusterIO::unigene;
  52. use strict;
  53. use Bio::Cluster::UniGene;
  54. use Bio::Cluster::ClusterFactory;
  55. use base qw(Bio::ClusterIO);
  56. my %line_is = (
  57. ID => q/ID\s+(\w{2,3}\.\d+)/,
  58. TITLE => q/TITLE\s+(\S.*)/,
  59. GENE => q/GENE\s+(\S.*)/,
  60. CYTOBAND => q/CYTOBAND\s+(\S.*)/,
  61. MGI => q/MGI\s+(\S.*)/,
  62. LOCUSLINK => q/LOCUSLINK\s+(\S.*)/,
  63. HOMOL => q/HOMOL\s+(\S.*)/,
  64. EXPRESS => q/EXPRESS\s+(\S.*)/,
  65. RESTR_EXPR => q/RESTR_EXPR\s+(\S.*)/,
  66. GNM_TERMINUS => q/GNM_TERMINUS\s+(\S.*)/,
  67. CHROMOSOME => q/CHROMOSOME\s+(\S.*)/,
  68. STS => q/STS\s+(\S.*)/,
  69. TXMAP => q/TXMAP\s+(\S.*)/,
  70. PROTSIM => q/PROTSIM\s+(\S.*)/,
  71. SCOUNT => q/SCOUNT\s+(\S.*)/,
  72. SEQUENCE => q/SEQUENCE\s+(\S.*)/,
  73. ACC => q/ACC=(\w+)(\.\d+)?/,
  74. NID => q/NID=\s*(\S.*)/,
  75. PID => q/PID=\s*(\S.*)/,
  76. CLONE => q/CLONE=\s*(\S.*)/,
  77. END => q/END=\s*(\S.*)/,
  78. LID => q/LID=\s*(\S.*)/,
  79. MGC => q/MGC=\s*(\S.*)/,
  80. SEQTYPE => q/SEQTYPE=\s*(\S.*)/,
  81. TRACE => q/TRACE=\s*(\S.*)/,
  82. PERIPHERAL => q/PERIPHERAL=\s*(\S.*)/,
  83. DELIMITER => q{^//},
  84. );
  85. # we set the right factory here
  86. sub _initialize {
  87. my($self, @args) = @_;
  88. $self->SUPER::_initialize(@args);
  89. if(! $self->cluster_factory()) {
  90. $self->cluster_factory(Bio::Cluster::ClusterFactory->new(
  91. -type => 'Bio::Cluster::UniGene'));
  92. }
  93. }
  94. =head2 next_cluster
  95. Title : next_cluster
  96. Usage : $unigene = $stream->next_cluster()
  97. Function: returns the next unigene in the stream
  98. Returns : Bio::Cluster::UniGene object
  99. Args : NONE
  100. =cut
  101. sub next_cluster {
  102. my( $self) = @_;
  103. local $/ = "\n//";
  104. return unless my $entry = $self->_readline;
  105. # set up the variables we'll need
  106. my (%unigene,@express,@locuslink,@chromosome,
  107. @sts,@txmap,@protsim,@sequence);
  108. my $UGobj;
  109. # set up the regexes
  110. # add whitespace parsing and precompile regexes
  111. #foreach (values %line_is) {
  112. # $_ =~ s/\s+/\\s+/g;
  113. # print STDERR "Regex is $_\n";
  114. # #$_ = qr/$_/x;
  115. #}
  116. #$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/;
  117. # run each line in an entry against the regexes
  118. foreach my $line (split /\n/, $entry) {
  119. #print STDERR "Wanting to match $line\n";
  120. if ($line =~ /$line_is{ID}/gcx) {
  121. $unigene{ID} = $1;
  122. }
  123. elsif ($line =~ /$line_is{TITLE}/gcx ) {
  124. #print STDERR "MATCHED with [$1]\n";
  125. $unigene{TITLE} = $1;
  126. }
  127. elsif ($line =~ /$line_is{GENE}/gcx) {
  128. $unigene{GENE} = $1;
  129. }
  130. elsif ($line =~ /$line_is{CYTOBAND}/gcx) {
  131. $unigene{CYTOBAND} = $1;
  132. }
  133. elsif ($line =~ /$line_is{MGI}/gcx) {
  134. $unigene{MGI} = $1;
  135. }
  136. elsif ($line =~ /$line_is{LOCUSLINK}/gcx) {
  137. @locuslink = split /;/, $1;
  138. }
  139. elsif ($line =~ /$line_is{HOMOL}/gcx) {
  140. $unigene{HOMOL} = $1;
  141. }
  142. elsif ($line =~ /$line_is{EXPRESS}/gcx) {
  143. my $express = $1;
  144. # remove initial semicolon if present
  145. $express =~ s/^;//;
  146. @express = split /\s*;/, $express;
  147. }
  148. elsif ($line =~ /$line_is{RESTR_EXPR}/gcx) {
  149. $unigene{RESTR_EXPR} = $1;
  150. }
  151. elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) {
  152. $unigene{GNM_TERMINUS} = $1;
  153. }
  154. elsif ($line =~ /$line_is{CHROMOSOME}/gcx) {
  155. push @chromosome, $1;
  156. }
  157. elsif ($line =~ /$line_is{TXMAP}/gcx) {
  158. push @txmap, $1;
  159. }
  160. elsif ($line =~ /$line_is{STS}/gcx) {
  161. push @sts, $1;
  162. }
  163. elsif ($line =~ /$line_is{PROTSIM}/gcx) {
  164. push @protsim, $1;
  165. }
  166. elsif ($line =~ /$line_is{SCOUNT}/gcx) {
  167. $unigene{SCOUNT} = $1;
  168. }
  169. elsif ($line =~ /$line_is{SEQUENCE}/gcx) {
  170. # parse into each sequence line
  171. my $seq = {};
  172. # add unigene id to each seq
  173. #$seq->{unigene_id} = $unigene{ID};
  174. my @items = split(/;/, $1);
  175. foreach (@items) {
  176. if (/$line_is{ACC}/gcx) {
  177. $seq->{acc} = $1;
  178. # remove leading dot if version pattern matched
  179. $seq->{version} = substr($2,1) if defined $2;
  180. }
  181. elsif (/$line_is{NID}/gcx) {
  182. $seq->{nid} = $1;
  183. }
  184. elsif (/$line_is{PID}/gcx) {
  185. $seq->{pid} = $1;
  186. }
  187. elsif (/$line_is{CLONE}/gcx) {
  188. $seq->{clone} = $1;
  189. }
  190. elsif (/$line_is{END}/gcx) {
  191. $seq->{end} = $1;
  192. }
  193. elsif (/$line_is{LID}/gcx) {
  194. $seq->{lid} = $1;
  195. }
  196. elsif (/$line_is{MGC}/gcx) {
  197. $seq->{mgc} = $1;
  198. }
  199. elsif (/$line_is{SEQTYPE}/gcx) {
  200. $seq->{seqtype} = $1;
  201. }
  202. elsif (/$line_is{TRACE}/gcx) {
  203. $seq->{trace} = $1;
  204. }
  205. elsif (/$line_is{PERIPHERAL}/gcx) {
  206. $seq->{peripheral} = $1;
  207. }
  208. }
  209. push @sequence, $seq;
  210. }
  211. elsif ($line =~ /$line_is{DELIMITER}/gcx) {
  212. # at the end of the record, add data to the object
  213. $UGobj = $self->cluster_factory->create_object(
  214. -display_id => $unigene{ID},
  215. -description => $unigene{TITLE},
  216. -size => $unigene{SCOUNT},
  217. -members => \@sequence);
  218. $UGobj->gene($unigene{GENE}) if defined ($unigene{GENE});
  219. $UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND});
  220. $UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI});
  221. $UGobj->locuslink(\@locuslink);
  222. $UGobj->homol($unigene{HOMOL}) if defined ($unigene{HOMOL});
  223. $UGobj->express(\@express);
  224. $UGobj->restr_expr($unigene{RESTR_EXPR}) if defined ($unigene{RESTR_EXPR});
  225. $UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS});
  226. $UGobj->chromosome(\@chromosome);
  227. $UGobj->sts(\@sts);
  228. $UGobj->txmap(\@txmap);
  229. $UGobj->protsim(\@protsim);
  230. }
  231. }
  232. return $UGobj;
  233. }
  234. 1;