ensembl_remote_rest.py

/rest_apis/ensembl_remote_rest.py

https://github.com/kdaily/bcbb · Python · 226 lines · 177 code · 17 blank · 32 comment · 39 complexity · a4a0545ee8c22543a2796b556af4c527 MD5 · raw file

"""Provide a remote REST-like API interface to Ensembl.

This provides a wrapping around screen-scraping of Ensembl
with the goal of retrieving comparative genomics information.

Usage:
    ensembl_remote_rest.py Organism Gene_id
Example:
    ensembl_remote_rest.py Homo_sapiens ENSG00000173894
"""
from __future__ import with_statement
import sys
import re
import urllib2
import os
import time

from BeautifulSoup import BeautifulSoup
from Bio import SeqIO
import newick
import networkx
    
#gene_ids = [("Homo_sapiens", "ENSG00000173894")]
#gene_ids = [("Homo_sapiens", "ENSG00000168283")]
#gene_ids = [("Homo_sapiens", "ENSG00000183741")]

def main(organism, gene_id):
    write_fasta = False
    cache_dir = os.path.join(os.getcwd(), "cache")
    ensembl_rest = EnsemblComparaRest(cache_dir)
    orthologs = ensembl_rest.orthologs(organism, gene_id)
    compara_tree = ensembl_rest.compara_tree(organism, gene_id)
    compara_tree = '(' + compara_tree[:-1] + ');'
    tree_rec = newick.parse_tree(compara_tree.strip())
    d_vis = DistanceVisitor()
    tree_rec.dfs_traverse(d_vis)
    tree_proteins = [l.identifier for l in tree_rec.leaves]
    orthologs = [(organism, gene_id)] + orthologs
    out_recs = []
    root_id = None
    all_items = []
    for o_organism, o_id in orthologs:
        transcripts = ensembl_rest.transcripts(o_organism, o_id)
        tx, p = [(tx, p) for (tx, p) in transcripts if p in
                tree_proteins][0]
        cur_item = EnsemblComparaTranscript(o_organism, o_id, tx, p)
        if root_id is None:
            root_id = p
        cur_item.distance = networkx.dijkstra_path_length(d_vis.graph,
                "'%s'" % root_id, "'%s'" % p)
        #print o_organism, o_id, p
        cur_item.domains = ensembl_rest.protein_domains(o_organism, o_id,
                tx)
        cur_item.statistics = ensembl_rest.protein_stats(o_organism, o_id,
                tx)
        all_items.append(cur_item)
        if write_fasta:
            out_rec = ensembl_rest.protein_fasta(o_organism, o_id,
                    tx)
            out_rec.id = o_id
            out_rec.description = o_organism
            out_recs.append(out_rec)
    if len(out_recs) > 0:
        with open("%s_%s_orthologs.txt" % (organism, gene_id), "w") as \
                out_handle:
            SeqIO.write(out_recs, out_handle, "fasta")
    analyze_comparative_set(all_items)

def analyze_comparative_set(all_items):
    def distance_cmp(one, two):
        return cmp(one.distance, two.distance)
    all_items.sort(distance_cmp)
    for item in all_items:
        print item.organism, item.distance, item.domains, \
                item.statistics.get('Charge', '').strip()

class EnsemblComparaTranscript:
    """Hold comparative information retrieved from Ensembl on a transcript.
    """
    def __init__(self, organism, g_id, t_id, p_id):
        self.organism = organism
        self.g_id = g_id
        self.t_id = t_id
        self.p_id = p_id
        self.distance = None
        self.domains = []
        self.statistics = {}

class DistanceVisitor(newick.tree.TreeVisitor):
    def __init__(self):
        self.graph = networkx.Graph()
        
    def pre_visit_edge(self, src, b, l, dest):
        self.graph.add_edge(repr(src), repr(dest), l)

class EnsemblComparaRest:
    """Provide a REST-like API interface to Ensembl.
    """
    def __init__(self, cache_dir):
        self._base_url = "http://www.ensembl.org"
        self._cache_dir = cache_dir
        if not(os.path.exists(cache_dir)):
            os.makedirs(cache_dir)

    def protein_stats(self, organism, gene_id, tx_id):
        """Retrieve dictionary of statistics for a gene transcript.
        """
        stats = {}
        with self._get_open_handle("Transcript", "ProteinSummary",
                organism, gene_id, tx_id) as in_handle:
            soup = BeautifulSoup(in_handle)
            stats_possibilities = soup.findAll("dl", "summary")
            for stats_check in stats_possibilities:
                stats_assert = stats_check.find("dt", text="Statistics")
                if stats_assert:
                    stats_line = stats_check.find("p")
                    for stats_part in stats_line:
                        if stats_part.find(":") > 0:
                            key, value = stats_part.split(":")
                            stats[key] = value
        return stats

    def protein_domains(self, organism, gene_id, tx_id):
        """Retrieve characterized domains in a gene transcript.
        """
        domains = []
        with self._get_open_handle("Transcript", "Domains", organism,
                gene_id, tx_id) as in_handle:
            soup = BeautifulSoup(in_handle)
            domain_table = soup.find("table", "ss autocenter")
            if domain_table is not None:
                domain_links = domain_table.findAll("a", href =
                        re.compile("interpro"))
                for domain_link in domain_links:
                    domains.append(domain_link.string)
        domains = list(set(domains))
        return domains

    def protein_fasta(self, organism, gene_id, tx_id):
        """Retrieve the fasta sequence for a given gene and transcript.
        """
        final_url = "%s/%s/Transcript/Export?db=core;g=%s;output=fasta;t=%s;"\
                "st=peptide;_format=Text" % (self._base_url, organism,
                        gene_id, tx_id)
        handle = self._safe_open(final_url)
        rec = SeqIO.read(handle, "fasta")
        handle.close()
        return rec

    def transcripts(self, organism, gene_id):
        """Retrieve a list of (transcript, protein) ids for the given gene_id.
        """
        txs = []
        ps = []
        valid_gene_starts = ["EN", "FB", "AA", "AG"]
        with self._get_open_handle("Gene", "Summary", organism,
                gene_id) as in_handle:
            soup = BeautifulSoup(in_handle)
            tx_info = soup.find("table", {"id" : "transcripts"})
            if tx_info is None:
                tx_info = soup.find(True, {"id" : "transcripts_text"})
            #print tx_info
            tx_links = tx_info.findAll("a", 
                    href = re.compile("Transcript/Summary"))
            for tx_link in tx_links:
                if tx_link.string and tx_link.string[:2] in valid_gene_starts:
                    txs.append(tx_link.string)
            p_links = tx_info.findAll("a", 
                    href = re.compile("Transcript/ProteinSummary"))
            for p_link in p_links:
                if p_link.string:
                    ps.append(p_link.string)
        assert len(txs) == len(ps), (organism, gene_id, txs, ps)
        return zip(txs, ps)

    def orthologs(self, organism, gene_id):
        """Retrieve a list of orthologs for the given gene ID.
        """
        orthologs = []
        with self._get_open_handle("Gene", "Compara_Ortholog",
                organism, gene_id) as in_handle:
            soup = BeautifulSoup(in_handle)
            orth_table = soup.find("table", "orthologues")
            orth_links = orth_table.findAll("a", 
                    href = re.compile("Gene/Summary"))
            for orth_link in orth_links:
                href_parts = [x for x in orth_link['href'].split('/') if x]
                orthologs.append((href_parts[0], orth_link.string))
        return orthologs

    def compara_tree(self, organism, gene_id):
        """Retrieve the comparative tree calculated by compara.
        """
        with self._get_open_handle("Component/Gene/Web/ComparaTree", "text",
                organism, gene_id) as in_handle:
            soup = BeautifulSoup(in_handle)
            tree_details = soup.find("pre")
            return tree_details.string

    def _get_open_handle(self, item_type, action, organism, gene_id,
                         tx_id = None):
        full_url = "%s/%s/%s/%s?g=%s" % (self._base_url, organism,
                item_type, action, gene_id)
        if tx_id:
            full_url += ";t=%s" % (tx_id)
        url_parts = [p for p in full_url.split("/") if p]
        cache_file = os.path.join(self._cache_dir, "_".join(url_parts[1:]))
        if not os.path.exists(cache_file):
            #print full_url, cache_file
            in_handle = self._safe_open(full_url)
            with open(cache_file, 'w') as out_handle:
                out_handle.write(in_handle.read())
            in_handle.close()
        return open(cache_file, 'r')

    def _safe_open(self, url):
        while 1:
            try:
                in_handle = urllib2.urlopen(url)
                return in_handle
            except urllib2.URLError, msg:
                print msg
                time.sleep(5)

if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
Tech Fingerprint

Alerts (9)

'def' Ensure functions have docstrings for documentation
27 69 70 93
Complexity hotspot; lines 165 to 166 (total complexity: 3)
165 166
Complexity hotspot; lines 186 to 187 (total complexity: 3)
186 187
'time.sleep(' Avoid blocking; use threading.Timer or asyncio.sleep for non-blocking delays
223