/exomate/scripts/annotatevep.py
Python | 330 lines | 321 code | 8 blank | 1 comment | 2 complexity | a1d51314c00ac651b136fd05826f2332 MD5 | raw file
- #!/usr/bin/env python
- import sys
- import os.path
- from io import StringIO
- import psycopg2
- from itertools import islice
- from sqlalchemy.engine.url import make_url
- from sqlalchemy import (Table, Column, Integer, SmallInteger, BigInteger,
- String, Boolean, Float, Date, DateTime, CHAR, Index, ForeignKey,
- UniqueConstraint, CheckConstraint)
- import sqlalchemy.types as sqltypes
- from sqt.io.fasta import IndexedFasta
- from exomate import database
- from exomate.variants import normalized_indel, transition_transversion
- from exomate.models import *
- from exomate.commandline import HelpfulArgumentParser
- from collections import namedtuple, defaultdict
- from time import gmtime, strftime
- from functools import lru_cache
- from itertools import count, zip_longest, islice
- from time import sleep
- import tempfile
- from subprocess import Popen, PIPE, STDOUT
- import threading
- import math
- import copy
- __author__ = "Christopher Schroeder, Johannes Koester"
- CSQ = namedtuple("CSQ", "allele gene feature feature_type consequence cdna_position cds_position protein_position amino_acids codons existing_variation aa_maf ea_maf exon intron motif_name motif_pos high_inf_pos motif_score_change distance clin_sig canonical symbol symbol_source sift polyphen gmaf biotype ensp domains ccds hgvsc hgvsp afr_maf amr_maf asn_maf eur_maf pubmed ")
- keys2 = ["gene", "feature", "feature_type", "consequence", "cdna_position_start", "cdna_position_stop", "cds_position_start", "cds_position_stop", "protein_position_start", "protein_position_stop", "ref_amino_acid", "alt_amino_acid", "ref_codon", "alt_codon", "existing_variation", "aa_maf", "ea_maf", "exon_number", "exon_count", "intron_number", "intron_count", "motif_score_change", "distance", "canonical", "symbol", "symbol_source", "sift_prediction_term", "sift_score", "polyphen_prediction_term", "polyphen_score", "biotype", "ensp", "domains", "ccds", "hgvsc", "hgvsp", "afr_maf", "amr_maf", "asn_maf", "eur_maf"]
- def VepReader(p, single=False):
- from exomate.variants import VARIANT
- VARIANT_DICT = dict(VARIANT.__dict__)
- def no_question(x, y):
- if x == '?' or x == '':
- x = None
- if y == '?' or x == '':
- y = None
- return x, y
- def split_strip(x, splitter, stripper='', no_question=no_question):
- if not x or x == '-':
- return None, None
- if len(x.split(splitter)) == 1:
- return no_question(x.rstrip(stripper), x.rstrip(stripper))
- return no_question(*x.rstrip(stripper).split(splitter))
- f = p.stdout
- for line in f:
- line = line.decode()
- if line.startswith("#"):
- continue
- v = {}
- v["chrom"], v["pos"], v["id"], v["ref"], v["alt"], v["qual"], v["filter"], info = line.split("\t")[0:8]
- v["pos"] = str((int)(v["pos"]) - 1) # back to 0-based
- csqs = info.split(";")[-1][4:].split(",")
- for csq in csqs:
- try:
- vep = CSQ(*csq.split("|"))
- except:
- print("error")
- print(csq)
- continue
- v.update({ x:(vep.__getattribute__(x) if vep.__getattribute__(x) else None) for x in vep._fields})
- v["ref_amino_acid"], v["alt_amino_acid"] = split_strip(v["amino_acids"], "/")
- v["ref_codon"], v["alt_codon"] = split_strip(v["codons"], ("/"))
- v["sift_prediction_term"], v["sift_score"] = split_strip(v["sift"], "(", ")")
- v["polyphen_prediction_term"], v["polyphen_score"] = split_strip(v["polyphen"], "(", ")")
- v["exon_number"], v["exon_count"] = split_strip(v["exon"], "/")
- v["intron_number"], v["intron_count"] = split_strip(v["intron"], "/")
- v["cdna_position_start"], v["cdna_position_stop"] = split_strip(v["cdna_position"], "-")
- v["cds_position_start"], v["cds_position_stop"] = split_strip(v["cds_position"], "-")
- v["protein_position_start"], v["protein_position_stop"] = split_strip(v["protein_position"], "-")
- v["aa_maf"], nothing = split_strip(v["aa_maf"], "&")
- v["asn_maf"], nothing = split_strip(v["asn_maf"], "&")
- v["eur_maf"], nothing = split_strip(v["eur_maf"], "&")
- v["amr_maf"], nothing = split_strip(v["amr_maf"], "&")
- v["ea_maf"], nothing = split_strip(v["ea_maf"], "&")
- v["afr_maf"], nothing = split_strip(v["afr_maf"], "&")
- v["exon_number"], nothing = split_strip(v["exon_number"], "-")
- # building bitvector of consequences
- bitvector = 0
- for c in v["consequence"].split("&"):
- if c.endswith('_variant'):
- c = c[:-8]
- if c == "3_prime_UTR": c = "THREE_PRIME_UTR"
- if c == "5_prime_UTR": c = "FIVE_PRIME_UTR"
- bitvector |= 2 ** VARIANT_DICT[c.upper()]
- v["consequence"] = bitvector
- a = float(v["aa_maf"] or 0)
- a = float(v["asn_maf"] or 0)
- a = float(v["eur_maf"] or 0)
- a = float(v["amr_maf"] or 0)
- a = float(v["afr_maf"] or 0)
- yield v
- if single: break
- def vep(session, p):
- print("load unannotated variants", file=sys.stderr)
- # with tempfile.NamedTemporaryFile(delete=True, mode='wt') as f:
- with open("pseudovep.vcf", "wt") as f:
- p.stdin.write("##fileformat=VCFv4.1\n".encode())
- p.stdin.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO\n"]).encode())
- # fetch all variants where at least one call exist, to avoid annotating variants without calls e.g. dbsnp variants
- results = session.execute("select v.chrom, v.pos + 1, v.id, v.ref, v.alt from (select distinct(variant_id) from calls WHERE variant_id > (select coalesce(max(variant_id)-1000,0) FROM annotations)) c join variants v on c.variant_id = v.id ORDER BY v.id")
- for x in results:
- line = "\t".join(str(i) for i in x) + "\t.\t.\t.\n"
- p.stdin.write((line).encode())
- p.stdin.close()
- p.wait()
- def importing(session, p, chunksize=5000):
- print("fetching existing genes", file=sys.stderr)
- existing_genes = set(x[0] for x in session.execute("SELECT accession FROM genes").fetchall())
- print("fetching existing transcripts", file=sys.stderr)
- existing_transcripts = set(x[0] for x in session.execute("SELECT accession FROM transcripts").fetchall())
- print("fetching existing annotations", file=sys.stderr)
- existing_annotations = set(tuple(x) for x in session.execute("SELECT v.chrom, v.pos, v.alt, t.accession FROM annotations a JOIN variants v ON v.id = a.variant_id JOIN transcripts t ON t.id = a.transcript_id WHERE v.id > (select coalesce(max(variant_id)-1000,0) FROM annotations) ORDER BY v.id").fetchall())
- vep = VepReader(p)
- fmt = lambda x: tuple(str(i) if i is not None else r"\N" for i in x)
- def grouper(iterable, n):
- it = iter(iterable)
- while True:
- chunk = tuple(map(copy.copy, islice(it, n)))
- if not chunk:
- return
- yield chunk
- print("processing the vep file", file=sys.stderr)
- for i, group in enumerate(grouper(vep, chunksize)):
- print(i * chunksize, "variants processed", file=sys.stderr)
- inserting_genes = []
- inserting_transcripts = []
- inserting_annotations = []
- inserting_tmp_annotations = []
- for record in group:
- if record["gene"] not in existing_genes:
- inserting_genes.append(fmt((record["gene"], record["symbol"])))
- existing_genes.add(record["gene"])
- if record["feature"] not in existing_transcripts:
- inserting_transcripts.append(fmt((record["feature"], record["gene"], record["biotype"], record["ensp"], 0)))
- existing_transcripts.add(record["feature"])
- if (record["chrom"], int(record["pos"]), record["alt"], record["feature"]) not in existing_annotations:
- inserting_annotations.append(fmt(tuple(record[k] for k in keys2)))
- inserting_tmp_annotations.append(fmt(tuple(record[k] for k in ["chrom", "pos", "alt", "ref", "feature"])))
- conn = session.get_bind().raw_connection()
- cursor = conn.cursor()
- # genes
- if inserting_genes: # only if new genes are existing
- print("building gene string", file=sys.stderr)
- insert_string = "\n".join(["\t".join(g) for g in inserting_genes]) + "\n"
-
- print("create temporary gene table", file=sys.stderr)
- cursor.execute("""
- CREATE TEMPORARY TABLE tmp_genes (
- accession VARCHAR,
- name VARCHAR
- )""")
- print("inserting into temporary gene table", file=sys.stderr)
- cursor.copy_from(StringIO(insert_string), "tmp_genes", columns=["accession", "name"])
- print("join temporary gene table", file=sys.stderr)
- cursor.execute('''
- INSERT INTO genes (accession, name)
- SELECT g1.accession, g1.name FROM tmp_genes g1 LEFT JOIN genes g2 ON
- g1.accession = g2.accession
- WHERE g2.id IS NULL;
- ''')
- cursor.execute("""DROP TABLE tmp_genes""")
- conn.commit()
- # transcripts
- if inserting_transcripts:
- print("building transcript string", file=sys.stderr)
- insert_string = "\n".join(["\t".join(t) for t in inserting_transcripts]) + "\n"
- print("create temporary transcript table", file=sys.stderr)
- cursor.execute("""
- CREATE TEMPORARY TABLE tmp_transcripts (
- accession VARCHAR,
- gene_accession VARCHAR,
- biotype VARCHAR,
- protein_accession VARCHAR,
- feature_count INTEGER
- )""")
- print("inserting into temporary transcript table", file=sys.stderr)
- cursor.copy_from(StringIO(insert_string), "tmp_transcripts", columns=["accession", "gene_accession, biotype, protein_accession, feature_count"])
- print("join temporary transcript table", file=sys.stderr)
- cursor.execute('''
- INSERT INTO transcripts (accession, gene_id, biotype, protein_accession, feature_count)
- SELECT t.accession, g.id, t.biotype, t.protein_accession, t.feature_count FROM tmp_transcripts t
- LEFT JOIN genes g ON g.accession = t.gene_accession
- LEFT JOIN transcripts t2 ON t.accession = t2.accession
- WHERE t2.id IS NULL;
- ''')
- cursor.execute("""DROP TABLE tmp_transcripts""")
- conn.commit()
- # annotations
- if inserting_tmp_annotations:
- print("building annotation string", file=sys.stderr)
- insert_string = "\n".join(["\t".join(a) for a in inserting_tmp_annotations]) + "\n"
- print("create temporary annotation table", file=sys.stderr)
- cursor.execute("""
- CREATE TEMPORARY TABLE tmp_annotations (
- id SERIAL,
- chrom VARCHAR,
- pos INTEGER,
- alt VARCHAR,
- ref VARCHAR,
- feature VARCHAR)
- """)
- print("inserting into temporary annotation table", file=sys.stderr)
- cursor.copy_from(StringIO(insert_string), "tmp_annotations", columns=["chrom", "pos", "alt", "ref", "feature"])
- print("fetching variant and transcript ids", file=sys.stderr)
- cursor.execute('''SELECT v.id, t.id
- FROM tmp_annotations a
- LEFT JOIN variants v ON
- v.chrom = a.chrom AND
- v.pos = a.pos AND
- v.alt = a.alt AND
- v.ref = a.ref
- LEFT JOIN transcripts t ON
- t.accession = a.feature
- ORDER BY a.id
- ''')
- variant_transcripts = cursor.fetchall()
- insert_string = []
- for ids, a in zip(variant_transcripts, inserting_annotations):
- if ids in existing_annotations: continue
- vid, tid = ids
- # if tid == None:
- # print(ids)
- # print(a)
- if vid == None or tid == None: continue
- insert_string.append("\t".join((str(vid),str(tid)) + a))
- if len(insert_string):
- print("inserting", len(insert_string), "annotations", file=sys.stderr)
- insert_string = "\n".join(insert_string) + "\n"
- cursor.copy_from(StringIO(insert_string), "annotations", columns=["variant_id", "transcript_id"] + keys2)
- else:
- print("variants already annotated", file=sys.stderr)
- cursor.execute("""DROP TABLE tmp_annotations""")
- conn.commit()
- sys.stderr.flush()
- def main():
- parser = HelpfulArgumentParser(description=__doc__)
- parser.add_argument("--vep-cache", "-c", help="The path to the vep-cache")
- parser.add_argument("--threads", "-t", default=1, help="invoke vep with that many threads")
- parser.add_argument("--html", default="output_vep.html", help="the vep output html file, must end with .html or .htm")
- # parser.add_argument("--clear-existing", action='store_true', default=False, help="Overwrite existing feature entries instead of skipping them")
- parser.add_argument("--url", "-u", default=None, help="Database configuration URL. If this is missing, the configuration is read from the exomate configuration file")
- parser.add_argument("--verbose", "-v", action='store_true', default=False, help="show SQL statements (enables SQLalchemy echoing)")
- args = parser.parse_args()
- Session = database.get_session(url=args.url)
- session = Session()
- print("connected to:", session.bind.engine, file=sys.stderr)
- if args.verbose:
- session.bind.engine.echo = True
- print("open subprocess", file=sys.stderr)
- print("threads", args.threads)
- command = "vep --force_overwrite --offline --fasta /vol/ref/vep/homo_sapiens/74/Homo_sapiens.GRCh37.74.dna.primary_assembly.fa --vcf --dir %(vep_cache)s --fork %(fork)s --hgvs --everything -o STDOUT --stats_file %(output_html)s" % {'vep_cache': args.vep_cache, 'fork': args.threads, "output_html": args.html}
- p = Popen(command, shell=True, stdout=PIPE, stdin=PIPE, stderr=sys.stderr)
- t = threading.Thread(target=vep, args=(session, p))
- t.start()
- importing(session, p)
- t.join()
- session.commit()
- session.close()
- if __name__ == '__main__':
- main()