PageRenderTime 66ms CodeModel.GetById 32ms RepoModel.GetById 0ms app.codeStats 0ms

/rest_apis/ensembl_remote_rest.py

https://github.com/kdaily/bcbb
Python | 226 lines | 208 code | 4 blank | 14 comment | 9 complexity | a4a0545ee8c22543a2796b556af4c527 MD5 | raw file
  1. """Provide a remote REST-like API interface to Ensembl.
  2. This provides a wrapping around screen-scraping of Ensembl
  3. with the goal of retrieving comparative genomics information.
  4. Usage:
  5. ensembl_remote_rest.py Organism Gene_id
  6. Example:
  7. ensembl_remote_rest.py Homo_sapiens ENSG00000173894
  8. """
  9. from __future__ import with_statement
  10. import sys
  11. import re
  12. import urllib2
  13. import os
  14. import time
  15. from BeautifulSoup import BeautifulSoup
  16. from Bio import SeqIO
  17. import newick
  18. import networkx
  19. #gene_ids = [("Homo_sapiens", "ENSG00000173894")]
  20. #gene_ids = [("Homo_sapiens", "ENSG00000168283")]
  21. #gene_ids = [("Homo_sapiens", "ENSG00000183741")]
  22. def main(organism, gene_id):
  23. write_fasta = False
  24. cache_dir = os.path.join(os.getcwd(), "cache")
  25. ensembl_rest = EnsemblComparaRest(cache_dir)
  26. orthologs = ensembl_rest.orthologs(organism, gene_id)
  27. compara_tree = ensembl_rest.compara_tree(organism, gene_id)
  28. compara_tree = '(' + compara_tree[:-1] + ');'
  29. tree_rec = newick.parse_tree(compara_tree.strip())
  30. d_vis = DistanceVisitor()
  31. tree_rec.dfs_traverse(d_vis)
  32. tree_proteins = [l.identifier for l in tree_rec.leaves]
  33. orthologs = [(organism, gene_id)] + orthologs
  34. out_recs = []
  35. root_id = None
  36. all_items = []
  37. for o_organism, o_id in orthologs:
  38. transcripts = ensembl_rest.transcripts(o_organism, o_id)
  39. tx, p = [(tx, p) for (tx, p) in transcripts if p in
  40. tree_proteins][0]
  41. cur_item = EnsemblComparaTranscript(o_organism, o_id, tx, p)
  42. if root_id is None:
  43. root_id = p
  44. cur_item.distance = networkx.dijkstra_path_length(d_vis.graph,
  45. "'%s'" % root_id, "'%s'" % p)
  46. #print o_organism, o_id, p
  47. cur_item.domains = ensembl_rest.protein_domains(o_organism, o_id,
  48. tx)
  49. cur_item.statistics = ensembl_rest.protein_stats(o_organism, o_id,
  50. tx)
  51. all_items.append(cur_item)
  52. if write_fasta:
  53. out_rec = ensembl_rest.protein_fasta(o_organism, o_id,
  54. tx)
  55. out_rec.id = o_id
  56. out_rec.description = o_organism
  57. out_recs.append(out_rec)
  58. if len(out_recs) > 0:
  59. with open("%s_%s_orthologs.txt" % (organism, gene_id), "w") as \
  60. out_handle:
  61. SeqIO.write(out_recs, out_handle, "fasta")
  62. analyze_comparative_set(all_items)
  63. def analyze_comparative_set(all_items):
  64. def distance_cmp(one, two):
  65. return cmp(one.distance, two.distance)
  66. all_items.sort(distance_cmp)
  67. for item in all_items:
  68. print item.organism, item.distance, item.domains, \
  69. item.statistics.get('Charge', '').strip()
  70. class EnsemblComparaTranscript:
  71. """Hold comparative information retrieved from Ensembl on a transcript.
  72. """
  73. def __init__(self, organism, g_id, t_id, p_id):
  74. self.organism = organism
  75. self.g_id = g_id
  76. self.t_id = t_id
  77. self.p_id = p_id
  78. self.distance = None
  79. self.domains = []
  80. self.statistics = {}
  81. class DistanceVisitor(newick.tree.TreeVisitor):
  82. def __init__(self):
  83. self.graph = networkx.Graph()
  84. def pre_visit_edge(self, src, b, l, dest):
  85. self.graph.add_edge(repr(src), repr(dest), l)
  86. class EnsemblComparaRest:
  87. """Provide a REST-like API interface to Ensembl.
  88. """
  89. def __init__(self, cache_dir):
  90. self._base_url = "http://www.ensembl.org"
  91. self._cache_dir = cache_dir
  92. if not(os.path.exists(cache_dir)):
  93. os.makedirs(cache_dir)
  94. def protein_stats(self, organism, gene_id, tx_id):
  95. """Retrieve dictionary of statistics for a gene transcript.
  96. """
  97. stats = {}
  98. with self._get_open_handle("Transcript", "ProteinSummary",
  99. organism, gene_id, tx_id) as in_handle:
  100. soup = BeautifulSoup(in_handle)
  101. stats_possibilities = soup.findAll("dl", "summary")
  102. for stats_check in stats_possibilities:
  103. stats_assert = stats_check.find("dt", text="Statistics")
  104. if stats_assert:
  105. stats_line = stats_check.find("p")
  106. for stats_part in stats_line:
  107. if stats_part.find(":") > 0:
  108. key, value = stats_part.split(":")
  109. stats[key] = value
  110. return stats
  111. def protein_domains(self, organism, gene_id, tx_id):
  112. """Retrieve characterized domains in a gene transcript.
  113. """
  114. domains = []
  115. with self._get_open_handle("Transcript", "Domains", organism,
  116. gene_id, tx_id) as in_handle:
  117. soup = BeautifulSoup(in_handle)
  118. domain_table = soup.find("table", "ss autocenter")
  119. if domain_table is not None:
  120. domain_links = domain_table.findAll("a", href =
  121. re.compile("interpro"))
  122. for domain_link in domain_links:
  123. domains.append(domain_link.string)
  124. domains = list(set(domains))
  125. return domains
  126. def protein_fasta(self, organism, gene_id, tx_id):
  127. """Retrieve the fasta sequence for a given gene and transcript.
  128. """
  129. final_url = "%s/%s/Transcript/Export?db=core;g=%s;output=fasta;t=%s;"\
  130. "st=peptide;_format=Text" % (self._base_url, organism,
  131. gene_id, tx_id)
  132. handle = self._safe_open(final_url)
  133. rec = SeqIO.read(handle, "fasta")
  134. handle.close()
  135. return rec
  136. def transcripts(self, organism, gene_id):
  137. """Retrieve a list of (transcript, protein) ids for the given gene_id.
  138. """
  139. txs = []
  140. ps = []
  141. valid_gene_starts = ["EN", "FB", "AA", "AG"]
  142. with self._get_open_handle("Gene", "Summary", organism,
  143. gene_id) as in_handle:
  144. soup = BeautifulSoup(in_handle)
  145. tx_info = soup.find("table", {"id" : "transcripts"})
  146. if tx_info is None:
  147. tx_info = soup.find(True, {"id" : "transcripts_text"})
  148. #print tx_info
  149. tx_links = tx_info.findAll("a",
  150. href = re.compile("Transcript/Summary"))
  151. for tx_link in tx_links:
  152. if tx_link.string and tx_link.string[:2] in valid_gene_starts:
  153. txs.append(tx_link.string)
  154. p_links = tx_info.findAll("a",
  155. href = re.compile("Transcript/ProteinSummary"))
  156. for p_link in p_links:
  157. if p_link.string:
  158. ps.append(p_link.string)
  159. assert len(txs) == len(ps), (organism, gene_id, txs, ps)
  160. return zip(txs, ps)
  161. def orthologs(self, organism, gene_id):
  162. """Retrieve a list of orthologs for the given gene ID.
  163. """
  164. orthologs = []
  165. with self._get_open_handle("Gene", "Compara_Ortholog",
  166. organism, gene_id) as in_handle:
  167. soup = BeautifulSoup(in_handle)
  168. orth_table = soup.find("table", "orthologues")
  169. orth_links = orth_table.findAll("a",
  170. href = re.compile("Gene/Summary"))
  171. for orth_link in orth_links:
  172. href_parts = [x for x in orth_link['href'].split('/') if x]
  173. orthologs.append((href_parts[0], orth_link.string))
  174. return orthologs
  175. def compara_tree(self, organism, gene_id):
  176. """Retrieve the comparative tree calculated by compara.
  177. """
  178. with self._get_open_handle("Component/Gene/Web/ComparaTree", "text",
  179. organism, gene_id) as in_handle:
  180. soup = BeautifulSoup(in_handle)
  181. tree_details = soup.find("pre")
  182. return tree_details.string
  183. def _get_open_handle(self, item_type, action, organism, gene_id,
  184. tx_id = None):
  185. full_url = "%s/%s/%s/%s?g=%s" % (self._base_url, organism,
  186. item_type, action, gene_id)
  187. if tx_id:
  188. full_url += ";t=%s" % (tx_id)
  189. url_parts = [p for p in full_url.split("/") if p]
  190. cache_file = os.path.join(self._cache_dir, "_".join(url_parts[1:]))
  191. if not os.path.exists(cache_file):
  192. #print full_url, cache_file
  193. in_handle = self._safe_open(full_url)
  194. with open(cache_file, 'w') as out_handle:
  195. out_handle.write(in_handle.read())
  196. in_handle.close()
  197. return open(cache_file, 'r')
  198. def _safe_open(self, url):
  199. while 1:
  200. try:
  201. in_handle = urllib2.urlopen(url)
  202. return in_handle
  203. except urllib2.URLError, msg:
  204. print msg
  205. time.sleep(5)
  206. if __name__ == "__main__":
  207. main(sys.argv[1], sys.argv[2])