PageRenderTime 59ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/Bio/Factory/FTLocationFactory.pm

http://github.com/bioperl/bioperl-live
Perl | 375 lines | 239 code | 65 blank | 71 comment | 61 complexity | 5a28db5f80de7d2315de29f30b8e3078 MD5 | raw file
Possible License(s): GPL-3.0, AGPL-1.0
  1. #
  2. # BioPerl module for Bio::Factory::FTLocationFactory
  3. #
  4. # Please direct questions and support issues to <bioperl-l@bioperl.org>
  5. #
  6. # Cared for by Hilmar Lapp <hlapp at gmx.net>
  7. #
  8. # Copyright Hilmar Lapp
  9. #
  10. # You may distribute this module under the same terms as perl itself
  11. #
  12. # (c) Hilmar Lapp, hlapp at gnf.org, 2002.
  13. # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
  14. #
  15. # You may distribute this module under the same terms as perl itself.
  16. # Refer to the Perl Artistic License (see the license accompanying this
  17. # software package, or see http://www.perl.com/language/misc/Artistic.html)
  18. # for the terms under which you may use, modify, and redistribute this module.
  19. #
  20. # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
  21. # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  22. # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  23. #
  24. # POD documentation - main docs before the code
  25. =head1 NAME
  26. Bio::Factory::FTLocationFactory - A FeatureTable Location Parser
  27. =head1 SYNOPSIS
  28. # parse a string into a location object
  29. $loc = Bio::Factory::FTLocationFactory->from_string("join(100..200,
  30. 400..500");
  31. =head1 DESCRIPTION
  32. Implementation of string-encoded location parsing for the Genbank feature
  33. table encoding of locations.
  34. =head1 FEEDBACK
  35. =head2 Mailing Lists
  36. User feedback is an integral part of the evolution of this and other
  37. Bioperl modules. Send your comments and suggestions preferably to
  38. the Bioperl mailing list. Your participation is much appreciated.
  39. bioperl-l@bioperl.org - General discussion
  40. http://bioperl.org/wiki/Mailing_lists - About the mailing lists
  41. =head2 Support
  42. Please direct usage questions or support issues to the mailing list:
  43. I<bioperl-l@bioperl.org>
  44. rather than to the module maintainer directly. Many experienced and
  45. reponsive experts will be able look at the problem and quickly
  46. address it. Please include a thorough description of the problem
  47. with code and data examples if at all possible.
  48. =head2 Reporting Bugs
  49. Report bugs to the Bioperl bug tracking system to help us keep track
  50. of the bugs and their resolution. Bug reports can be submitted via the
  51. web:
  52. https://github.com/bioperl/bioperl-live/issues
  53. =head1 AUTHOR - Hilmar Lapp
  54. Email hlapp at gmx.net
  55. =head1 CONTRIBUTORS
  56. Jason Stajich, jason-at-bioperl-dot-org
  57. Chris Fields, cjfields-at-uiuc-dot-edu
  58. =head1 APPENDIX
  59. The rest of the documentation details each of the object methods.
  60. Internal methods are usually preceded with a _
  61. =cut
  62. # Let the code begin...
  63. package Bio::Factory::FTLocationFactory;
  64. use vars qw($LOCREG);
  65. use strict;
  66. # Object preamble - inherits from Bio::Root::Root
  67. use Bio::Location::Simple;
  68. use Bio::Location::Split;
  69. use Bio::Location::Fuzzy;
  70. use base qw(Bio::Root::Root Bio::Factory::LocationFactoryI);
  71. BEGIN {
  72. # the below is an optimized regex obj. from J. Freidl's Mastering Reg Exp.
  73. $LOCREG = qr{
  74. (?>
  75. [^()]+
  76. |
  77. \(
  78. (??{$LOCREG})
  79. \)
  80. )*
  81. }x;
  82. }
  83. =head2 new
  84. Title : new
  85. Usage : my $obj = Bio::Factory::FTLocationFactory->new();
  86. Function: Builds a new Bio::Factory::FTLocationFactory object
  87. Returns : an instance of Bio::Factory::FTLocationFactory
  88. Args :
  89. =cut
  90. =head2 from_string
  91. Title : from_string
  92. Usage : $loc = $locfactory->from_string("100..200");
  93. Function: Parses the given string and returns a Bio::LocationI implementing
  94. object representing the location encoded by the string.
  95. This implementation parses the Genbank feature table
  96. encoding of locations.
  97. Example :
  98. Returns : A Bio::LocationI implementing object.
  99. Args : A string.
  100. =cut
  101. sub from_string {
  102. my ($self,$locstr,$op) = @_;
  103. my $loc;
  104. #$self->debug("$locstr\n");
  105. # $op for operator (error handling)
  106. # run on first pass only
  107. # Note : These location types are now deprecated in GenBank (Oct. 2006)
  108. if (!defined($op)) {
  109. # convert all (X.Y) to [X.Y]
  110. $locstr =~ s{\((\d+\.\d+)\)}{\[$1\]}g;
  111. # convert ABC123:(X..Y) to ABC123:[X..Y]
  112. # we should never see the above
  113. $locstr =~ s{:\((\d+\.{2}\d+)\)}{:\[$1\]}g;
  114. }
  115. if ($locstr =~ m{(.*?)\(($LOCREG)\)(.*)}o) { # any matching parentheses?
  116. my ($beg, $mid, $end) = ($1, $2, $3);
  117. my (@sublocs) = (split(q(,),$beg), $mid, split(q(,),$end));
  118. my @loc_objs;
  119. my $loc_obj;
  120. SUBLOCS:
  121. while (@sublocs) {
  122. my $subloc = shift @sublocs;
  123. next if !$subloc;
  124. my $oparg = ($subloc eq 'join' || $subloc eq 'bond' ||
  125. $subloc eq 'order' || $subloc eq 'complement') ? $subloc : undef;
  126. # has operator, requires further work (recurse)
  127. if ($oparg) {
  128. my $sub = shift @sublocs;
  129. # simple split operators (no recursive calls needed)
  130. if (($oparg eq 'join' || $oparg eq 'order' || $oparg eq 'bond' )
  131. && $sub !~ m{(?:join|order|bond)}) {
  132. my @splitlocs = split(q(,), $sub);
  133. $loc_obj = Bio::Location::Split->new(-verbose => 1,
  134. -splittype => $oparg);
  135. # Store strand values for later consistency check
  136. my @subloc_strands;
  137. my @s_objs;
  138. foreach my $splitloc (@splitlocs) {
  139. next unless $splitloc;
  140. my $sobj;
  141. if ($splitloc =~ m{\(($LOCREG)\)}) {
  142. my $comploc = $1;
  143. $sobj = $self->_parse_location($comploc);
  144. $sobj->strand(-1);
  145. push @subloc_strands, -1;
  146. } else {
  147. $sobj = $self->_parse_location($splitloc);
  148. push @subloc_strands, 1;
  149. }
  150. push @s_objs, $sobj;
  151. }
  152. # Sublocations strand values consistency check to set
  153. # Guide Strand and sublocations adding order
  154. if (scalar @s_objs > 0) {
  155. my $identical = 0;
  156. my $first_value = $subloc_strands[0];
  157. foreach my $strand (@subloc_strands) {
  158. $identical++ if ($strand == $first_value);
  159. }
  160. if ($identical == scalar @subloc_strands) {
  161. # Set guide_strand if all sublocations have the same strand
  162. $loc_obj->guide_strand($first_value);
  163. # Reverse sublocation order for negative strand locations, e.g.:
  164. # Common (CAA24672.1):
  165. # join(complement(4918..5163),complement(2691..4571))
  166. # Trans-splicing (NP_958375.1):
  167. # join(32737..32825,complement(174205..174384),complement(69520..71506))
  168. if ($first_value == -1) {
  169. @s_objs = reverse @s_objs;
  170. }
  171. }
  172. else {
  173. # Mixed strand values
  174. $loc_obj->guide_strand(undef);
  175. }
  176. # Add sublocations
  177. foreach my $s_obj (@s_objs) {
  178. $loc_obj->add_sub_Location($s_obj);
  179. }
  180. }
  181. } else {
  182. $loc_obj = $self->from_string($sub, $oparg);
  183. # reinsure the operator is set correctly for this level
  184. # unless it is complement
  185. $loc_obj->splittype($oparg) unless $oparg eq 'complement';
  186. }
  187. }
  188. # no operator, simple or fuzzy
  189. else {
  190. $loc_obj = $self->from_string($subloc,1);
  191. }
  192. if ($op && $op eq 'complement') {
  193. $loc_obj->strand(-1);
  194. }
  195. # For Split-type $loc_obj, if guide strand is set (meaning consistent strand for
  196. # all sublocs) and guide strand is the same than the last location from @loc_objs,
  197. # then recover the sublocations and add them to @loc_objs. This way,
  198. # "join(10..20,join(30..40,50..60))" becomes "join(10..20,30..40,50..60)"
  199. my $guide_strand = ($loc_obj->isa('Bio::Location::SplitLocationI')) ? ($loc_obj->guide_strand || 0) : 0;
  200. my $last_strand = (scalar @loc_objs > 0) ? $loc_objs[-1]->strand : 0;
  201. if ( $guide_strand != 0
  202. and $guide_strand == $last_strand
  203. and $oparg eq $op # join(,join()) OK, order(join()) NOT OK
  204. ) {
  205. my @subloc_objs = $loc_obj->sub_Location(0);
  206. foreach my $subloc_obj (@subloc_objs) {
  207. push @loc_objs, $subloc_obj;
  208. }
  209. }
  210. else {
  211. push @loc_objs, $loc_obj;
  212. }
  213. }
  214. my $ct = @loc_objs;
  215. if ($op && !($op eq 'join' || $op eq 'order' || $op eq 'bond')
  216. && $ct > 1 ) {
  217. $self->throw("Bad operator $op: had multiple locations ".
  218. scalar(@loc_objs).", should be SplitLocationI");
  219. }
  220. if ($ct > 1) {
  221. $loc = Bio::Location::Split->new();
  222. $loc->add_sub_Location(shift @loc_objs) while (@loc_objs);
  223. return $loc;
  224. } else {
  225. $loc = shift @loc_objs;
  226. return $loc;
  227. }
  228. } else { # simple location(s)
  229. $loc = $self->_parse_location($locstr);
  230. $loc->strand(-1) if ($op && $op eq 'complement');
  231. }
  232. return $loc;
  233. }
  234. =head2 _parse_location
  235. Title : _parse_location
  236. Usage : $loc = $locfactory->_parse_location( $loc_string)
  237. Function: Parses the given location string and returns a location object
  238. with start() and end() and strand() set appropriately.
  239. Note that this method is private.
  240. Returns : A Bio::LocationI implementing object or undef on failure
  241. Args : location string
  242. =cut
  243. sub _parse_location {
  244. my ($self, $locstr) = @_;
  245. my ($loc, $seqid);
  246. #$self->debug( "Location parse, processing $locstr\n");
  247. # 'remote' location?
  248. if($locstr =~ m{^(\S+):(.*)$}o) {
  249. # yes; memorize remote ID and strip from location string
  250. $seqid = $1;
  251. $locstr = $2;
  252. }
  253. # split into start and end
  254. my ($start, $end) = split(/\.\./, $locstr);
  255. # remove enclosing parentheses if any; note that because of parentheses
  256. # possibly surrounding the entire location the parentheses around start
  257. # and/or may be asymmetrical
  258. # Note: these are from X.Y fuzzy locations, which are deprecated!
  259. $start =~ s/(?:^\[+|\]+$)//g if $start;
  260. $end =~ s/(?:^\[+|\]+$)//g if $end;
  261. # Is this a simple (exact) or a fuzzy location? Simples have exact start
  262. # and end, or is between two adjacent bases. Everything else is fuzzy.
  263. my $loctype = ".."; # exact with start and end as default
  264. $loctype = '?' if ( ($locstr =~ /\?/) && ($locstr !~ /\?\d+/) );
  265. my $locclass = "Bio::Location::Simple";
  266. if(! defined($end)) {
  267. if($locstr =~ /(\d+)([\.\^])(\d+)/) {
  268. $start = $1;
  269. $end = $3;
  270. $loctype = $2;
  271. $locclass = "Bio::Location::Fuzzy"
  272. unless (abs($end-$start) <= 1) && ($loctype eq "^");
  273. } else {
  274. $end = $start;
  275. }
  276. }
  277. # start_num and end_num are for the numeric only versions of
  278. # start and end so they can be compared
  279. # in a few lines
  280. my ($start_num, $end_num) = ($start,$end);
  281. if ( ($start =~ /[\>\<\?\.\^]/) || ($end =~ /[\>\<\?\.\^]/) ) {
  282. $locclass = 'Bio::Location::Fuzzy';
  283. if($start =~ /(\d+)/) {
  284. ($start_num) = $1;
  285. } else {
  286. $start_num = 0
  287. }
  288. if ($end =~ /(\d+)/) {
  289. ($end_num) = $1;
  290. } else { $end_num = 0 }
  291. }
  292. my $strand = 1;
  293. if( $start_num > $end_num && $loctype ne '?') {
  294. ($start,$end,$strand) = ($end,$start,-1);
  295. }
  296. # instantiate location and initialize
  297. $loc = $locclass->new(-verbose => $self->verbose,
  298. -start => $start,
  299. -end => $end,
  300. -strand => $strand,
  301. -location_type => $loctype);
  302. # set remote ID if remote location
  303. if($seqid) {
  304. $loc->is_remote(1);
  305. $loc->seq_id($seqid);
  306. }
  307. # done (hopefully)
  308. return $loc;
  309. }
  310. 1;