PageRenderTime 50ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/fls_fastq_screen/perlmodules/perl5/modules/BioPerl-1.6.923/share/perl/5.14.2/Bio/Tools/IUPAC.pm

https://gitlab.com/pooja043/Globus_Docker_3
Perl | 560 lines | 420 code | 118 blank | 22 comment | 35 complexity | 6039b57e287d31bf5f701a9ab6094e3e MD5 | raw file
  1. #
  2. # BioPerl module for IUPAC
  3. #
  4. # Please direct questions and support issues to <bioperl-l@bioperl.org>
  5. #
  6. # Cared for by Aaron Mackey <amackey@virginia.edu>
  7. #
  8. # Copyright Aaron Mackey
  9. #
  10. # You may distribute this module under the same terms as perl itself
  11. # POD documentation - main docs before the code
  12. =head1 NAME
  13. Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
  14. an ambiguous IUPAC sequence
  15. =head1 SYNOPSIS
  16. use Bio::PrimarySeq;
  17. use Bio::Tools::IUPAC;
  18. # Get the IUPAC code for proteins
  19. my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
  20. # Create a sequence with degenerate residues
  21. my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
  22. # Create all possible non-degenerate sequences
  23. my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
  24. while ($uniqueseq = $iupac->next_seq()) {
  25. # process the unique Bio::Seq object.
  26. }
  27. # Get a regular expression that matches all possible sequences
  28. my $regexp = $iupac->regexp();
  29. =head1 DESCRIPTION
  30. Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
  31. following the IUPAC conventions. Non-standard characters have the meaning
  32. described below:
  33. IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
  34. Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
  35. ---------------------------------------------------------------
  36. Symbol Meaning Nucleic Acid
  37. ---------------------------------------------------------------
  38. A A Adenine
  39. C C Cytosine
  40. G G Guanine
  41. T T Thymine
  42. U U Uracil
  43. M A or C aMino
  44. R A or G puRine
  45. W A or T Weak
  46. S C or G Strong
  47. Y C or T pYrimidine
  48. K G or T Keto
  49. V A or C or G not T (closest unused char after T)
  50. H A or C or T not G (closest unused char after G)
  51. D A or G or T not C (closest unused char after C)
  52. B C or G or T not A (closest unused char after A)
  53. X G or A or T or C Unknown (very rarely used)
  54. N G or A or T or C Unknown (commonly used)
  55. IUPAC-IUP AMINO ACID SYMBOLS:
  56. Biochem J. 1984 Apr 15; 219(2): 345-373
  57. Eur J Biochem. 1993 Apr 1; 213(1): 2
  58. ------------------------------------------
  59. Symbol Meaning
  60. ------------------------------------------
  61. A Alanine
  62. B Aspartic Acid, Asparagine
  63. C Cysteine
  64. D Aspartic Acid
  65. E Glutamic Acid
  66. F Phenylalanine
  67. G Glycine
  68. H Histidine
  69. I Isoleucine
  70. J Isoleucine/Leucine
  71. K Lysine
  72. L Leucine
  73. M Methionine
  74. N Asparagine
  75. O Pyrrolysine
  76. P Proline
  77. Q Glutamine
  78. R Arginine
  79. S Serine
  80. T Threonine
  81. U Selenocysteine
  82. V Valine
  83. W Tryptophan
  84. X Unknown
  85. Y Tyrosine
  86. Z Glutamic Acid, Glutamine
  87. * Terminator
  88. There are a few things Bio::Tools::IUPAC can do for you:
  89. =over
  90. =item *
  91. report the IUPAC mapping between ambiguous and non-ambiguous residues
  92. =item *
  93. produce a stream of all possible corresponding unambiguous Bio::Seq objects given
  94. an ambiguous sequence object
  95. =item *
  96. convert an ambiguous sequence object to a corresponding regular expression
  97. =back
  98. =head1 FEEDBACK
  99. =head2 Mailing Lists
  100. User feedback is an integral part of the evolution of this and other
  101. Bioperl modules. Send your comments and suggestions preferably to one
  102. of the Bioperl mailing lists. Your participation is much appreciated.
  103. bioperl-l@bioperl.org - General discussion
  104. http://bioperl.org/wiki/Mailing_lists - About the mailing lists
  105. =head2 Support
  106. Please direct usage questions or support issues to the mailing list:
  107. I<bioperl-l@bioperl.org>
  108. rather than to the module maintainer directly. Many experienced and
  109. reponsive experts will be able look at the problem and quickly
  110. address it. Please include a thorough description of the problem
  111. with code and data examples if at all possible.
  112. =head2 Reporting Bugs
  113. Report bugs to the Bioperl bug tracking system to help us keep track
  114. the bugs and their resolution. Bug reports can be submitted via the
  115. web:
  116. https://redmine.open-bio.org/projects/bioperl/
  117. =head1 AUTHOR - Aaron Mackey
  118. Email amackey-at-virginia.edu
  119. =head1 APPENDIX
  120. The rest of the documentation details each of the object
  121. methods. Internal methods are usually preceded with a _
  122. =cut
  123. package Bio::Tools::IUPAC;
  124. use strict;
  125. use base qw(Bio::Root::Root);
  126. use vars qw(%IUB %IUB_AMB %REV_IUB %IUP %IUP_AMB $AUTOLOAD);
  127. BEGIN {
  128. # Ambiguous nucleic residues are matched to unambiguous residues
  129. %IUB = (
  130. A => [qw(A)],
  131. C => [qw(C)],
  132. G => [qw(G)],
  133. T => [qw(T)],
  134. U => [qw(U)],
  135. M => [qw(A C)],
  136. R => [qw(A G)],
  137. S => [qw(C G)],
  138. W => [qw(A T)],
  139. Y => [qw(C T)],
  140. K => [qw(G T)],
  141. V => [qw(A C G)],
  142. H => [qw(A C T)],
  143. D => [qw(A G T)],
  144. B => [qw(C G T)],
  145. N => [qw(A C G T)],
  146. X => [qw(A C G T)],
  147. );
  148. # Same as %IUB but ambiguous residues are matched to ambiguous residues only
  149. %IUB_AMB = (
  150. M => [qw(M)],
  151. R => [qw(R)],
  152. W => [qw(W)],
  153. S => [qw(S)],
  154. Y => [qw(Y)],
  155. K => [qw(K)],
  156. V => [qw(M R S V)],
  157. H => [qw(H M W Y)],
  158. D => [qw(D K R W)],
  159. B => [qw(B K S Y)],
  160. N => [qw(B D H K M N R S V W Y)],
  161. );
  162. # The inverse of %IUB
  163. %REV_IUB = (
  164. A => 'A',
  165. T => 'T',
  166. U => 'U',
  167. C => 'C',
  168. G => 'G',
  169. AC => 'M',
  170. AG => 'R',
  171. AT => 'W',
  172. CG => 'S',
  173. CT => 'Y',
  174. GT => 'K',
  175. ACG => 'V',
  176. ACT => 'H',
  177. AGT => 'D',
  178. CGT => 'B',
  179. ACGT => 'N',
  180. N => 'N'
  181. );
  182. # Same thing with proteins now
  183. %IUP = (
  184. A => [qw(A)],
  185. B => [qw(D N)],
  186. C => [qw(C)],
  187. D => [qw(D)],
  188. E => [qw(E)],
  189. F => [qw(F)],
  190. G => [qw(G)],
  191. H => [qw(H)],
  192. I => [qw(I)],
  193. J => [qw(I L)],
  194. K => [qw(K)],
  195. L => [qw(L)],
  196. M => [qw(M)],
  197. N => [qw(N)],
  198. O => [qw(O)],
  199. P => [qw(P)],
  200. Q => [qw(Q)],
  201. R => [qw(R)],
  202. S => [qw(S)],
  203. T => [qw(T)],
  204. U => [qw(U)],
  205. V => [qw(V)],
  206. W => [qw(W)],
  207. X => [qw(X)],
  208. Y => [qw(Y)],
  209. Z => [qw(E Q)],
  210. '*' => [qw(*)],
  211. );
  212. %IUP_AMB = (
  213. B => [qw(B)],
  214. J => [qw(J)],
  215. Z => [qw(Z)],
  216. );
  217. }
  218. =head2 new
  219. Title : new
  220. Usage : Bio::Tools::IUPAC->new($seq);
  221. Function: Create a new IUPAC object, which acts as a sequence stream (akin to
  222. SeqIO)
  223. Args : an ambiguously coded sequence object that has a specified 'alphabet'
  224. Returns : a Bio::Tools::IUPAC object.
  225. =cut
  226. sub new {
  227. my ($class,@args) = @_;
  228. my $self = $class->SUPER::new(@args);
  229. my ($seq) = $self->_rearrange([qw(SEQ)],@args);
  230. if ( (not defined $seq) && @args && ref($args[0]) ) {
  231. # parameter not passed as named parameter?
  232. $seq = $args[0];
  233. }
  234. if (defined $seq) {
  235. if (not $seq->isa('Bio::PrimarySeqI')) {
  236. $self->throw('Must supply a sequence object');
  237. }
  238. if (length $seq->seq == 0) {
  239. $self->throw('Sequence had zero-length');
  240. }
  241. $self->{'_seq'} = $seq;
  242. }
  243. return $self;
  244. }
  245. sub _initialize {
  246. my ($self) = @_;
  247. my %iupac = $self->iupac;
  248. $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
  249. $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
  250. $self->{'_string'}->[0] = -1;
  251. }
  252. =head2 next_seq
  253. Title : next_seq
  254. Usage : $iupac->next_seq();
  255. Function: returns the next unique sequence object
  256. Args : none.
  257. Returns : a Bio::Seq object
  258. =cut
  259. sub next_seq {
  260. my ($self) = @_;
  261. if (not exists $self->{'_string'}) {
  262. $self->_initialize();
  263. }
  264. for my $i ( 0 .. $#{$self->{'_string'}} ) {
  265. next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
  266. if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
  267. if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
  268. return;
  269. } else {
  270. $self->{'_string'}->[$i] = 0;
  271. next;
  272. }
  273. } else {
  274. $self->{'_string'}->[$i]++;
  275. my $j = -1;
  276. my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
  277. my $desc = $self->{'_seq'}->desc() || '';
  278. $self->{'_num'}++;
  279. 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
  280. $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
  281. $self->{'_num'} =~ s/,//g;
  282. # Return a fresh sequence object
  283. return Bio::PrimarySeq->new(-seq => $seqstr, -desc => $desc);
  284. }
  285. }
  286. }
  287. =head2 iupac
  288. Title : iupac
  289. Usage : my %symbols = $iupac->iupac;
  290. Function: Returns a hash of symbols -> symbol components of the right type
  291. for the given sequence, i.e. it is the same as iupac_iup() if
  292. Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
  293. sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
  294. Args : none
  295. Returns : Hash
  296. =cut
  297. sub iupac {
  298. my ($self) = @_;
  299. my $alphabet = lc( $self->{'_seq'}->alphabet() );
  300. if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
  301. return %IUB; # nucleic
  302. } elsif ( $alphabet eq 'protein' ) {
  303. return %IUP; # proteic
  304. } else {
  305. $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
  306. }
  307. }
  308. =head2 iupac_amb
  309. Title : iupac_amb
  310. Usage : my %symbols = $iupac->iupac_amb;
  311. Function: Same as iupac() but only contains a mapping between ambiguous residues
  312. and the ambiguous residues they map to. For example, the key 'N' has
  313. the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
  314. i.e. it matches all other ambiguous residues.
  315. Args : none
  316. Returns : Hash
  317. =cut
  318. sub iupac_amb {
  319. my ($self) = @_;
  320. my $alphabet = lc( $self->{'_seq'}->alphabet() );
  321. if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
  322. return %IUB_AMB; # nucleic
  323. } elsif ( $alphabet eq 'protein' ) {
  324. return %IUP_AMB; # proteic
  325. } else {
  326. $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
  327. }
  328. }
  329. =head2 iupac_iup
  330. Title : iupac_iup
  331. Usage : my %aasymbols = $iupac->iupac_iup;
  332. Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
  333. Args : none
  334. Returns : Hash
  335. =cut
  336. sub iupac_iup {
  337. return %IUP;
  338. }
  339. =head2 iupac_iup_amb
  340. Title : iupac_iup_amb
  341. Usage : my %aasymbols = $iupac->iupac_iup_amb;
  342. Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
  343. Args : none
  344. Returns : Hash
  345. =cut
  346. sub iupac_iup_amb {
  347. return %IUP_AMB;
  348. }
  349. =head2 iupac_iub
  350. Title : iupac_iub
  351. Usage : my %dnasymbols = $iupac->iupac_iub;
  352. Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
  353. Args : none
  354. Returns : Hash
  355. =cut
  356. sub iupac_iub {
  357. return %IUB;
  358. }
  359. =head2 iupac_iub_amb
  360. Title : iupac_iub_amb
  361. Usage : my %dnasymbols = $iupac->iupac_iub;
  362. Function: Returns a hash of DNA symbols -> ambiguous symbol components
  363. Args : none
  364. Returns : Hash
  365. =cut
  366. sub iupac_iub_amb {
  367. return %IUB_AMB;
  368. }
  369. =head2 iupac_rev_iub
  370. Title : iupac_rev_iub
  371. Usage : my %dnasymbols = $iupac->iupac_rev_iub;
  372. Function: Returns a hash of nucleotide combinations -> IUPAC code
  373. (a reverse of the iupac_iub hash).
  374. Args : none
  375. Returns : Hash
  376. =cut
  377. sub iupac_rev_iub {
  378. return %REV_IUB;
  379. }
  380. =head2 count
  381. Title : count
  382. Usage : my $total = $iupac->count();
  383. Function: Calculates the number of unique, unambiguous sequences that
  384. this ambiguous sequence could generate
  385. Args : none
  386. Return : int
  387. =cut
  388. sub count {
  389. my ($self) = @_;
  390. if (not exists $self->{'_string'}) {
  391. $self->_initialize();
  392. }
  393. my $count = 1;
  394. $count *= scalar(@$_) for (@{$self->{'_alpha'}});
  395. return $count;
  396. }
  397. =head2 regexp
  398. Title : regexp
  399. Usage : my $re = $iupac->regexp();
  400. Function: Converts the ambiguous sequence into a regular expression that
  401. matches all of the corresponding ambiguous and non-ambiguous sequences.
  402. You can further manipulate the resulting regular expression with the
  403. Bio::Tools::SeqPattern module. After you are done building your
  404. regular expression, you might want to compile it and make it case-
  405. insensitive:
  406. $re = qr/$re/i;
  407. Args : 1 to match RNA: T and U characters will match interchangeably
  408. Return : regular expression
  409. =cut
  410. sub regexp {
  411. my ($self, $match_rna) = @_;
  412. my $re;
  413. my $seq = $self->{'_seq'}->seq;
  414. my %iupac = $self->iupac;
  415. my %iupac_amb = $self->iupac_amb;
  416. for my $pos (0 .. length($seq)-1) {
  417. my $res = substr $seq, $pos, 1;
  418. my $iupacs = $iupac{$res};
  419. my $iupacs_amb = $iupac_amb{$res} || [];
  420. if (not defined $iupacs) {
  421. $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
  422. " Offending character was '$res'.\n");
  423. }
  424. my $part = join '', (@$iupacs, @$iupacs_amb);
  425. if ($match_rna) {
  426. $part =~ s/T/TU/i || $part =~ s/U/TU/i;
  427. }
  428. if (length $part > 1) {
  429. $part = '['.$part.']';
  430. }
  431. $re .= $part;
  432. }
  433. return $re;
  434. }
  435. sub AUTOLOAD {
  436. my $self = shift @_;
  437. my $method = $AUTOLOAD;
  438. $method =~ s/.*:://;
  439. return $self->{'_seq'}->$method(@_)
  440. unless $method eq 'DESTROY';
  441. }
  442. 1;