PageRenderTime 72ms CodeModel.GetById 44ms RepoModel.GetById 1ms app.codeStats 0ms

/notify_user/pymodules/python2.7/lib/python/biopython-1.65-py2.7-linux-x86_64.egg/BioSQL/BioSeq.py

https://gitlab.com/pooja043/Globus_Docker_4
Python | 576 lines | 524 code | 5 blank | 47 comment | 4 complexity | 13782261a22e124afe747131967b81c4 MD5 | raw file
  1. # Copyright 2002 by Andrew Dalke. All rights reserved.
  2. # Revisions 2007-2009 copyright by Peter Cock. All rights reserved.
  3. # Revisions 2008-2009 copyright by Cymon J. Cox. All rights reserved.
  4. # This code is part of the Biopython distribution and governed by its
  5. # license. Please see the LICENSE file that should have been included
  6. # as part of this package.
  7. #
  8. # Note that BioSQL (including the database schema and scripts) is
  9. # available and licensed separately. Please consult www.biosql.org
  10. """Implementations of Biopython-like Seq objects on top of BioSQL.
  11. This allows retrival of items stored in a BioSQL database using
  12. a biopython-like SeqRecord and Seq interface.
  13. Note: Currently we do not support recording per-letter-annotations
  14. (like quality scores) in BioSQL.
  15. """
  16. from Bio._py3k import unicode
  17. from Bio import Alphabet
  18. from Bio.Seq import Seq, UnknownSeq
  19. from Bio.SeqRecord import SeqRecord, _RestrictedDict
  20. from Bio import SeqFeature
  21. class DBSeq(Seq):
  22. """BioSQL equivalent of the Biopython Seq object."""
  23. def __init__(self, primary_id, adaptor, alphabet, start, length):
  24. """Create a new DBSeq object referring to a BioSQL entry.
  25. You wouldn't normally create a DBSeq object yourself, this is done
  26. for you when retreiving a DBSeqRecord object from the database.
  27. """
  28. self.primary_id = primary_id
  29. self.adaptor = adaptor
  30. self.alphabet = alphabet
  31. self._length = length
  32. self.start = start
  33. def __len__(self):
  34. return self._length
  35. def __getitem__(self, index): # Seq API requirement
  36. # Note since Python 2.0, __getslice__ is deprecated
  37. # and __getitem__ is used instead.
  38. # See http://docs.python.org/ref/sequence-methods.html
  39. if isinstance(index, int):
  40. # Return a single letter as a string
  41. i = index
  42. if i < 0:
  43. if -i > self._length:
  44. raise IndexError(i)
  45. i = i + self._length
  46. elif i >= self._length:
  47. raise IndexError(i)
  48. return self.adaptor.get_subseq_as_string(self.primary_id,
  49. self.start + i,
  50. self.start + i + 1)
  51. if not isinstance(index, slice):
  52. raise ValueError("Unexpected index type")
  53. # Return the (sub)sequence as another DBSeq or Seq object
  54. # (see the Seq obect's __getitem__ method)
  55. if index.start is None:
  56. i = 0
  57. else:
  58. i = index.start
  59. if i < 0:
  60. # Map to equavilent positive index
  61. if -i > self._length:
  62. raise IndexError(i)
  63. i = i + self._length
  64. elif i >= self._length:
  65. # Trivial case, should return empty string!
  66. i = self._length
  67. if index.stop is None:
  68. j = self._length
  69. else:
  70. j = index.stop
  71. if j < 0:
  72. # Map to equavilent positive index
  73. if -j > self._length:
  74. raise IndexError(j)
  75. j = j + self._length
  76. elif j >= self._length:
  77. j = self._length
  78. if i >= j:
  79. # Trivial case, empty string.
  80. return Seq("", self.alphabet)
  81. elif index.step is None or index.step == 1:
  82. # Easy case - can return a DBSeq with the start and end adjusted
  83. return self.__class__(self.primary_id, self.adaptor, self.alphabet,
  84. self.start + i, j - i)
  85. else:
  86. # Tricky. Will have to create a Seq object because of the stride
  87. full = self.adaptor.get_subseq_as_string(self.primary_id,
  88. self.start + i,
  89. self.start + j)
  90. return Seq(full[::index.step], self.alphabet)
  91. def tostring(self):
  92. """Returns the full sequence as a python string (DEPRECATED).
  93. You are now encouraged to use str(my_seq) instead of
  94. my_seq.tostring()."""
  95. import warnings
  96. warnings.warn("This method is obsolete; please use str(my_seq) "
  97. "instead of my_seq.tostring().",
  98. PendingDeprecationWarning)
  99. return self.adaptor.get_subseq_as_string(self.primary_id,
  100. self.start,
  101. self.start + self._length)
  102. def __str__(self):
  103. """Returns the full sequence as a python string."""
  104. return self.adaptor.get_subseq_as_string(self.primary_id,
  105. self.start,
  106. self.start + self._length)
  107. data = property(tostring, doc="Sequence as string (DEPRECATED)")
  108. def toseq(self):
  109. """Returns the full sequence as a Seq object."""
  110. # Note - the method name copies that of the MutableSeq object
  111. return Seq(str(self), self.alphabet)
  112. def __add__(self, other):
  113. # Let the Seq object deal with the alphabet issues etc
  114. return self.toseq() + other
  115. def __radd__(self, other):
  116. # Let the Seq object deal with the alphabet issues etc
  117. return other + self.toseq()
  118. def _retrieve_seq(adaptor, primary_id):
  119. # The database schema ensures there will be only one matching
  120. # row in the table.
  121. # If an UnknownSeq was recorded, seq will be NULL,
  122. # but length will be populated. This means length(seq)
  123. # will return None.
  124. seqs = adaptor.execute_and_fetchall(
  125. "SELECT alphabet, length, length(seq) FROM biosequence"
  126. " WHERE bioentry_id = %s", (primary_id,))
  127. if not seqs:
  128. return
  129. assert len(seqs) == 1
  130. moltype, given_length, length = seqs[0]
  131. try:
  132. length = int(length)
  133. given_length = int(length)
  134. assert length == given_length
  135. have_seq = True
  136. except TypeError:
  137. assert length is None
  138. seqs = adaptor.execute_and_fetchall(
  139. "SELECT alphabet, length, seq FROM biosequence"
  140. " WHERE bioentry_id = %s", (primary_id,))
  141. assert len(seqs) == 1
  142. moltype, given_length, seq = seqs[0]
  143. assert seq is None or seq == ""
  144. length = int(given_length)
  145. have_seq = False
  146. del seq
  147. del given_length
  148. moltype = moltype.lower() # might be upper case in database
  149. # We have no way of knowing if these sequences will use IUPAC
  150. # alphabets, and we certainly can't assume they are unambiguous!
  151. if moltype == "dna":
  152. alphabet = Alphabet.generic_dna
  153. elif moltype == "rna":
  154. alphabet = Alphabet.generic_rna
  155. elif moltype == "protein":
  156. alphabet = Alphabet.generic_protein
  157. elif moltype == "unknown":
  158. # This is used in BioSQL/Loader.py and would happen
  159. # for any generic or nucleotide alphabets.
  160. alphabet = Alphabet.single_letter_alphabet
  161. else:
  162. raise AssertionError("Unknown moltype: %s" % moltype)
  163. if have_seq:
  164. return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
  165. else:
  166. return UnknownSeq(length, alphabet)
  167. def _retrieve_dbxrefs(adaptor, primary_id):
  168. """Retrieve the database cross references for the sequence."""
  169. _dbxrefs = []
  170. dbxrefs = adaptor.execute_and_fetchall(
  171. "SELECT dbname, accession, version"
  172. " FROM bioentry_dbxref join dbxref using (dbxref_id)"
  173. " WHERE bioentry_id = %s"
  174. " ORDER BY rank", (primary_id,))
  175. for dbname, accession, version in dbxrefs:
  176. if version and version != "0":
  177. v = "%s.%s" % (accession, version)
  178. else:
  179. v = accession
  180. _dbxrefs.append("%s:%s" % (dbname, v))
  181. return _dbxrefs
  182. def _retrieve_features(adaptor, primary_id):
  183. sql = "SELECT seqfeature_id, type.name, rank" \
  184. " FROM seqfeature join term type on (type_term_id = type.term_id)" \
  185. " WHERE bioentry_id = %s" \
  186. " ORDER BY rank"
  187. results = adaptor.execute_and_fetchall(sql, (primary_id,))
  188. seq_feature_list = []
  189. for seqfeature_id, seqfeature_type, seqfeature_rank in results:
  190. # Get qualifiers [except for db_xref which is stored separately]
  191. qvs = adaptor.execute_and_fetchall(
  192. "SELECT name, value"
  193. " FROM seqfeature_qualifier_value join term using (term_id)"
  194. " WHERE seqfeature_id = %s"
  195. " ORDER BY rank", (seqfeature_id,))
  196. qualifiers = {}
  197. for qv_name, qv_value in qvs:
  198. qualifiers.setdefault(qv_name, []).append(qv_value)
  199. # Get db_xrefs [special case of qualifiers]
  200. qvs = adaptor.execute_and_fetchall(
  201. "SELECT dbxref.dbname, dbxref.accession"
  202. " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
  203. " WHERE seqfeature_dbxref.seqfeature_id = %s"
  204. " ORDER BY rank", (seqfeature_id,))
  205. for qv_name, qv_value in qvs:
  206. value = "%s:%s" % (qv_name, qv_value)
  207. qualifiers.setdefault("db_xref", []).append(value)
  208. # Get locations
  209. results = adaptor.execute_and_fetchall(
  210. "SELECT location_id, start_pos, end_pos, strand"
  211. " FROM location"
  212. " WHERE seqfeature_id = %s"
  213. " ORDER BY rank", (seqfeature_id,))
  214. locations = []
  215. # convert to Python standard form
  216. # Convert strand = 0 to strand = None
  217. # re: comment in Loader.py:
  218. # Biopython uses None when we don't know strand information but
  219. # BioSQL requires something (non null) and sets this as zero
  220. # So we'll use the strand or 0 if Biopython spits out None
  221. for location_id, start, end, strand in results:
  222. if start:
  223. start -= 1
  224. if strand == 0:
  225. strand = None
  226. if strand not in (+1, -1, None):
  227. raise ValueError("Invalid strand %s found in database for "
  228. "seqfeature_id %s" % (strand, seqfeature_id))
  229. if end < start:
  230. import warnings
  231. from Bio import BiopythonWarning
  232. warnings.warn("Inverted location start/end (%i and %i) for "
  233. "seqfeature_id %s" % (start, end, seqfeature_id),
  234. BiopythonWarning)
  235. locations.append((location_id, start, end, strand))
  236. # Get possible remote reference information
  237. remote_results = adaptor.execute_and_fetchall(
  238. "SELECT location_id, dbname, accession, version"
  239. " FROM location join dbxref using (dbxref_id)"
  240. " WHERE seqfeature_id = %s", (seqfeature_id,))
  241. lookup = {}
  242. for location_id, dbname, accession, version in remote_results:
  243. if version and version != "0":
  244. v = "%s.%s" % (accession, version)
  245. else:
  246. v = accession
  247. # subfeature remote location db_ref are stored as a empty string when
  248. # not present
  249. if dbname == "":
  250. dbname = None
  251. lookup[location_id] = (dbname, v)
  252. feature = SeqFeature.SeqFeature(type=seqfeature_type)
  253. # Store the key as a private property
  254. feature._seqfeature_id = seqfeature_id
  255. feature.qualifiers = qualifiers
  256. if len(locations) == 0:
  257. pass
  258. elif len(locations) == 1:
  259. location_id, start, end, strand = locations[0]
  260. # See Bug 2677, we currently don't record the location_operator
  261. # For consistency with older versions Biopython, default to "".
  262. feature.location_operator = \
  263. _retrieve_location_qualifier_value(adaptor, location_id)
  264. dbname, version = lookup.get(location_id, (None, None))
  265. feature.location = SeqFeature.FeatureLocation(start, end)
  266. feature.strand = strand
  267. feature.ref_db = dbname
  268. feature.ref = version
  269. else:
  270. sub_features = feature.sub_features
  271. assert sub_features == []
  272. for location in locations:
  273. location_id, start, end, strand = location
  274. dbname, version = lookup.get(location_id, (None, None))
  275. subfeature = SeqFeature.SeqFeature()
  276. subfeature.type = seqfeature_type
  277. subfeature.location = SeqFeature.FeatureLocation(start, end)
  278. # subfeature.location_operator = \
  279. # _retrieve_location_qualifier_value(adaptor, location_id)
  280. subfeature.strand = strand
  281. subfeature.ref_db = dbname
  282. subfeature.ref = version
  283. sub_features.append(subfeature)
  284. # Locations are in order, but because of remote locations for
  285. # sub-features they are not necessarily in numerical order:
  286. strands = set(sf.strand for sf in sub_features)
  287. if len(strands) == 1 and -1 in strands:
  288. # Evil hack time for backwards compatibility
  289. # TODO - Check if BioPerl and (old) Biopython did the same,
  290. # we may have an existing incompatibility lurking here...
  291. locs = [f.location for f in sub_features[::-1]]
  292. else:
  293. # All forward, or mixed strands
  294. locs = [f.location for f in sub_features]
  295. feature.location = SeqFeature.CompoundLocation(
  296. locs, seqfeature_type)
  297. # TODO - See Bug 2677 - we don't yet record location_operator,
  298. # so for consistency with older versions of Biopython default
  299. # to assuming its a join.
  300. feature.location_operator = "join"
  301. seq_feature_list.append(feature)
  302. return seq_feature_list
  303. def _retrieve_location_qualifier_value(adaptor, location_id):
  304. value = adaptor.execute_and_fetch_col0(
  305. "SELECT value FROM location_qualifier_value"
  306. " WHERE location_id = %s", (location_id,))
  307. try:
  308. return value[0]
  309. except IndexError:
  310. return ""
  311. def _retrieve_annotations(adaptor, primary_id, taxon_id):
  312. annotations = {}
  313. annotations.update(_retrieve_qualifier_value(adaptor, primary_id))
  314. annotations.update(_retrieve_reference(adaptor, primary_id))
  315. annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id))
  316. annotations.update(_retrieve_comment(adaptor, primary_id))
  317. # Convert values into strings in cases of unicode from the database.
  318. # BioSQL could eventually be expanded to be unicode aware.
  319. str_anns = {}
  320. for key, val in annotations.items():
  321. if isinstance(val, list):
  322. val = [_make_unicode_into_string(x) for x in val]
  323. elif isinstance(val, unicode):
  324. val = str(val)
  325. str_anns[key] = val
  326. return str_anns
  327. def _make_unicode_into_string(text):
  328. if isinstance(text, unicode):
  329. return str(text)
  330. else:
  331. return text
  332. def _retrieve_qualifier_value(adaptor, primary_id):
  333. qvs = adaptor.execute_and_fetchall(
  334. "SELECT name, value"
  335. " FROM bioentry_qualifier_value JOIN term USING (term_id)"
  336. " WHERE bioentry_id = %s"
  337. " ORDER BY rank", (primary_id,))
  338. qualifiers = {}
  339. for name, value in qvs:
  340. if name == "keyword":
  341. name = "keywords"
  342. # See handling of "date" in Loader.py
  343. elif name == "date_changed":
  344. name = "date"
  345. elif name == "secondary_accession":
  346. name = "accessions"
  347. qualifiers.setdefault(name, []).append(value)
  348. return qualifiers
  349. def _retrieve_reference(adaptor, primary_id):
  350. # XXX dbxref_qualifier_value
  351. refs = adaptor.execute_and_fetchall(
  352. "SELECT start_pos, end_pos, "
  353. " location, title, authors,"
  354. " dbname, accession"
  355. " FROM bioentry_reference"
  356. " JOIN reference USING (reference_id)"
  357. " LEFT JOIN dbxref USING (dbxref_id)"
  358. " WHERE bioentry_id = %s"
  359. " ORDER BY rank", (primary_id,))
  360. references = []
  361. for start, end, location, title, authors, dbname, accession in refs:
  362. reference = SeqFeature.Reference()
  363. # If the start/end are missing, reference.location is an empty list
  364. if (start is not None) or (end is not None):
  365. if start is not None:
  366. start -= 1 # python counting
  367. reference.location = [SeqFeature.FeatureLocation(start, end)]
  368. # Don't replace the default "" with None.
  369. if authors:
  370. reference.authors = authors
  371. if title:
  372. reference.title = title
  373. reference.journal = location
  374. if dbname == 'PUBMED':
  375. reference.pubmed_id = accession
  376. elif dbname == 'MEDLINE':
  377. reference.medline_id = accession
  378. references.append(reference)
  379. if references:
  380. return {'references': references}
  381. else:
  382. return {}
  383. def _retrieve_taxon(adaptor, primary_id, taxon_id):
  384. a = {}
  385. common_names = adaptor.execute_and_fetch_col0(
  386. "SELECT name FROM taxon_name WHERE taxon_id = %s"
  387. " AND name_class = 'genbank common name'", (taxon_id,))
  388. if common_names:
  389. a['source'] = common_names[0]
  390. scientific_names = adaptor.execute_and_fetch_col0(
  391. "SELECT name FROM taxon_name WHERE taxon_id = %s"
  392. " AND name_class = 'scientific name'", (taxon_id,))
  393. if scientific_names:
  394. a['organism'] = scientific_names[0]
  395. ncbi_taxids = adaptor.execute_and_fetch_col0(
  396. "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,))
  397. if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0":
  398. a['ncbi_taxid'] = ncbi_taxids[0]
  399. # Old code used the left/right values in the taxon table to get the
  400. # taxonomy lineage in one SQL command. This was actually very slow,
  401. # and would fail if the (optional) left/right values were missing.
  402. #
  403. # The following code is based on a contribution from Eric Gibert, and
  404. # relies on the taxon table's parent_taxon_id field only (ignoring the
  405. # optional left/right values). This means that it has to make a
  406. # separate SQL query for each entry in the lineage, but it does still
  407. # appear to be *much* faster. See Bug 2494.
  408. taxonomy = []
  409. while taxon_id:
  410. name, rank, parent_taxon_id = adaptor.execute_one(
  411. "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id"
  412. " FROM taxon, taxon_name"
  413. " WHERE taxon.taxon_id=taxon_name.taxon_id"
  414. " AND taxon_name.name_class='scientific name'"
  415. " AND taxon.taxon_id = %s", (taxon_id,))
  416. if taxon_id == parent_taxon_id:
  417. # If the taxon table has been populated by the BioSQL script
  418. # load_ncbi_taxonomy.pl this is how top parent nodes are stored.
  419. # Personally, I would have used a NULL parent_taxon_id here.
  420. break
  421. if rank != "no rank":
  422. # For consistency with older versions of Biopython, we are only
  423. # interested in taxonomy entries with a stated rank.
  424. # Add this to the start of the lineage list.
  425. taxonomy.insert(0, name)
  426. taxon_id = parent_taxon_id
  427. if taxonomy:
  428. a['taxonomy'] = taxonomy
  429. return a
  430. def _retrieve_comment(adaptor, primary_id):
  431. qvs = adaptor.execute_and_fetchall(
  432. "SELECT comment_text FROM comment"
  433. " WHERE bioentry_id=%s"
  434. " ORDER BY rank", (primary_id,))
  435. comments = [comm[0] for comm in qvs]
  436. # Don't want to add an empty list...
  437. if comments:
  438. return {"comment": comments}
  439. else:
  440. return {}
  441. class DBSeqRecord(SeqRecord):
  442. """BioSQL equivalent of the Biopython SeqRecord object."""
  443. def __init__(self, adaptor, primary_id):
  444. self._adaptor = adaptor
  445. self._primary_id = primary_id
  446. (self._biodatabase_id, self._taxon_id, self.name,
  447. accession, version, self._identifier,
  448. self._division, self.description) = self._adaptor.execute_one(
  449. "SELECT biodatabase_id, taxon_id, name, accession, version,"
  450. " identifier, division, description"
  451. " FROM bioentry"
  452. " WHERE bioentry_id = %s", (self._primary_id,))
  453. if version and version != "0":
  454. self.id = "%s.%s" % (accession, version)
  455. else:
  456. self.id = accession
  457. # We don't yet record any per-letter-annotations in the
  458. # BioSQL database, but we should set this property up
  459. # for completeness (and the __str__ method).
  460. try:
  461. length = len(self.seq)
  462. except:
  463. # Could be no sequence in the database!
  464. length = 0
  465. self._per_letter_annotations = _RestrictedDict(length=length)
  466. def __get_seq(self):
  467. if not hasattr(self, "_seq"):
  468. self._seq = _retrieve_seq(self._adaptor, self._primary_id)
  469. return self._seq
  470. def __set_seq(self, seq):
  471. self._seq = seq
  472. def __del_seq(self):
  473. del self._seq
  474. seq = property(__get_seq, __set_seq, __del_seq, "Seq object")
  475. def __get_dbxrefs(self):
  476. if not hasattr(self, "_dbxrefs"):
  477. self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
  478. return self._dbxrefs
  479. def __set_dbxrefs(self, dbxrefs):
  480. self._dbxrefs = dbxrefs
  481. def __del_dbxrefs(self):
  482. del self._dbxrefs
  483. dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs,
  484. "Database cross references")
  485. def __get_features(self):
  486. if not hasattr(self, "_features"):
  487. self._features = _retrieve_features(self._adaptor,
  488. self._primary_id)
  489. return self._features
  490. def __set_features(self, features):
  491. self._features = features
  492. def __del_features(self):
  493. del self._features
  494. features = property(__get_features, __set_features, __del_features,
  495. "Features")
  496. def __get_annotations(self):
  497. if not hasattr(self, "_annotations"):
  498. self._annotations = _retrieve_annotations(self._adaptor,
  499. self._primary_id,
  500. self._taxon_id)
  501. if self._identifier:
  502. self._annotations["gi"] = self._identifier
  503. if self._division:
  504. self._annotations["data_file_division"] = self._division
  505. return self._annotations
  506. def __set_annotations(self, annotations):
  507. self._annotations = annotations
  508. def __del_annotations(self):
  509. del self._annotations
  510. annotations = property(__get_annotations, __set_annotations,
  511. __del_annotations, "Annotations")