annotatevep.py | searchcode

/exomate/scripts/annotatevep.py

https://bitbucket.org/marcelm/exomate · Python · 330 lines · 249 code · 70 blank · 11 comment · 56 complexity · a1d51314c00ac651b136fd05826f2332 MD5 · raw file

#!/usr/bin/env python

import sys
import os.path
from io import StringIO
import psycopg2
from itertools import islice
from sqlalchemy.engine.url import make_url
from sqlalchemy import (Table, Column, Integer, SmallInteger, BigInteger,
	String, Boolean, Float, Date, DateTime, CHAR, Index, ForeignKey,
	UniqueConstraint, CheckConstraint)
import sqlalchemy.types as sqltypes
from sqt.io.fasta import IndexedFasta
from exomate import database
from exomate.variants import normalized_indel, transition_transversion
from exomate.models import *
from exomate.commandline import HelpfulArgumentParser
from collections import namedtuple, defaultdict
from time import gmtime, strftime
from functools import lru_cache
from itertools import count, zip_longest, islice
from time import sleep
import tempfile
from subprocess import Popen, PIPE, STDOUT
import threading
import math
import copy

__author__ = "Christopher Schroeder, Johannes Koester"

CSQ = namedtuple("CSQ", "allele gene feature feature_type consequence cdna_position cds_position protein_position amino_acids codons existing_variation aa_maf ea_maf exon intron motif_name motif_pos high_inf_pos motif_score_change distance clin_sig canonical symbol symbol_source sift polyphen gmaf biotype ensp domains ccds hgvsc hgvsp afr_maf amr_maf asn_maf eur_maf pubmed ")

keys2 = ["gene", "feature", "feature_type", "consequence", "cdna_position_start", "cdna_position_stop", "cds_position_start", "cds_position_stop", "protein_position_start", "protein_position_stop", "ref_amino_acid", "alt_amino_acid", "ref_codon", "alt_codon", "existing_variation", "aa_maf", "ea_maf", "exon_number", "exon_count", "intron_number", "intron_count", "motif_score_change", "distance", "canonical", "symbol", "symbol_source", "sift_prediction_term", "sift_score", "polyphen_prediction_term", "polyphen_score", "biotype", "ensp", "domains", "ccds", "hgvsc", "hgvsp", "afr_maf", "amr_maf", "asn_maf", "eur_maf"]


def VepReader(p, single=False):
	from exomate.variants import VARIANT

	VARIANT_DICT = dict(VARIANT.__dict__)

	def no_question(x, y):
		if x == '?' or x == '':
			x = None
		if y == '?' or x == '':
			y = None
		return x, y

	def split_strip(x, splitter, stripper='', no_question=no_question):
		if not x or x == '-':
			return None, None
		if len(x.split(splitter)) == 1:
			return no_question(x.rstrip(stripper), x.rstrip(stripper))
		return no_question(*x.rstrip(stripper).split(splitter))

	f = p.stdout
	for line in f:
		line = line.decode()
		if line.startswith("#"):
			continue

		v = {}
		v["chrom"], v["pos"], v["id"], v["ref"], v["alt"], v["qual"], v["filter"], info = line.split("\t")[0:8]

		v["pos"] = str((int)(v["pos"]) - 1) # back to 0-based

		csqs = info.split(";")[-1][4:].split(",")
		for csq in csqs:
			try:
				vep = CSQ(*csq.split("|"))
			except:
				print("error")
				print(csq)
				continue
			v.update({ x:(vep.__getattribute__(x) if vep.__getattribute__(x) else None) for x in vep._fields})
			v["ref_amino_acid"], v["alt_amino_acid"] = split_strip(v["amino_acids"], "/")
			v["ref_codon"], v["alt_codon"] = split_strip(v["codons"], ("/"))
			v["sift_prediction_term"], v["sift_score"] = split_strip(v["sift"], "(", ")")
			v["polyphen_prediction_term"], v["polyphen_score"] = split_strip(v["polyphen"], "(", ")")
			v["exon_number"], v["exon_count"] = split_strip(v["exon"], "/")
			v["intron_number"], v["intron_count"] = split_strip(v["intron"], "/")
			v["cdna_position_start"], v["cdna_position_stop"] = split_strip(v["cdna_position"], "-")
			v["cds_position_start"], v["cds_position_stop"] = split_strip(v["cds_position"], "-")
			v["protein_position_start"], v["protein_position_stop"] = split_strip(v["protein_position"], "-")
			v["aa_maf"], nothing = split_strip(v["aa_maf"], "&")
			v["asn_maf"], nothing = split_strip(v["asn_maf"], "&")
			v["eur_maf"], nothing = split_strip(v["eur_maf"], "&")
			v["amr_maf"], nothing = split_strip(v["amr_maf"], "&")
			v["ea_maf"], nothing = split_strip(v["ea_maf"], "&")
			v["afr_maf"], nothing = split_strip(v["afr_maf"], "&")

			v["exon_number"], nothing = split_strip(v["exon_number"], "-")

			# building bitvector of consequences
			bitvector = 0
			for c in v["consequence"].split("&"):
				if c.endswith('_variant'):
					c = c[:-8]
				if c == "3_prime_UTR": c = "THREE_PRIME_UTR"
				if c == "5_prime_UTR": c = "FIVE_PRIME_UTR"
				bitvector |= 2 ** VARIANT_DICT[c.upper()]


			v["consequence"] = bitvector

			a = float(v["aa_maf"] or 0)
			a = float(v["asn_maf"] or 0)
			a = float(v["eur_maf"] or 0)
			a = float(v["amr_maf"] or 0)
			a = float(v["afr_maf"] or 0)

			yield v
			if single: break


def vep(session, p):
	print("load unannotated variants", file=sys.stderr)

#	with tempfile.NamedTemporaryFile(delete=True, mode='wt') as f:
	with open("pseudovep.vcf", "wt") as f:
		p.stdin.write("##fileformat=VCFv4.1\n".encode())
		p.stdin.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO\n"]).encode())

		# fetch all variants where at least one call exist, to avoid annotating variants without calls e.g. dbsnp variants
		results = session.execute("select v.chrom, v.pos + 1, v.id, v.ref, v.alt from (select distinct(variant_id) from calls WHERE variant_id > (select coalesce(max(variant_id)-1000,0) FROM annotations)) c join variants v on c.variant_id = v.id ORDER BY v.id")


		for x in results:
			line = "\t".join(str(i) for i in x) + "\t.\t.\t.\n"
			p.stdin.write((line).encode())
		p.stdin.close()
		p.wait()


def importing(session, p, chunksize=5000):
	print("fetching existing genes", file=sys.stderr)
	existing_genes = set(x[0] for x in session.execute("SELECT accession FROM genes").fetchall())
	print("fetching existing transcripts", file=sys.stderr)
	existing_transcripts = set(x[0] for x in session.execute("SELECT accession FROM transcripts").fetchall())
	print("fetching existing annotations", file=sys.stderr)
	existing_annotations = set(tuple(x) for x in session.execute("SELECT v.chrom, v.pos, v.alt, t.accession FROM annotations a JOIN variants v ON v.id = a.variant_id JOIN transcripts t ON t.id = a.transcript_id WHERE v.id > (select coalesce(max(variant_id)-1000,0) FROM annotations) ORDER BY v.id").fetchall())

	vep = VepReader(p)

	fmt = lambda x: tuple(str(i) if i is not None else r"\N" for i in x)

	def grouper(iterable, n):
		it = iter(iterable)
		while True:
			chunk = tuple(map(copy.copy, islice(it, n)))
			if not chunk:
				return
			yield chunk

	print("processing the vep file", file=sys.stderr)


	for i, group in enumerate(grouper(vep, chunksize)):
		print(i * chunksize, "variants processed", file=sys.stderr)

		inserting_genes = []
		inserting_transcripts = []
		inserting_annotations = []
		inserting_tmp_annotations = []

		for record in group:
			if record["gene"] not in existing_genes:
				inserting_genes.append(fmt((record["gene"], record["symbol"])))
				existing_genes.add(record["gene"])

			if record["feature"] not in existing_transcripts:
				inserting_transcripts.append(fmt((record["feature"], record["gene"], record["biotype"], record["ensp"], 0)))
				existing_transcripts.add(record["feature"])

			if (record["chrom"], int(record["pos"]), record["alt"], record["feature"]) not in existing_annotations:
				inserting_annotations.append(fmt(tuple(record[k] for k in keys2)))
				inserting_tmp_annotations.append(fmt(tuple(record[k] for k in ["chrom", "pos", "alt", "ref", "feature"])))

		conn = session.get_bind().raw_connection()
		cursor = conn.cursor()

		# genes
		if inserting_genes: # only if new genes are existing
			print("building gene string", file=sys.stderr)
			insert_string = "\n".join(["\t".join(g) for g in inserting_genes]) + "\n"

	
			print("create temporary gene table", file=sys.stderr)
			cursor.execute("""
			CREATE TEMPORARY TABLE tmp_genes (
				accession VARCHAR,
				name VARCHAR
			)""")

			print("inserting into temporary gene table", file=sys.stderr)
			cursor.copy_from(StringIO(insert_string), "tmp_genes", columns=["accession", "name"])

			print("join temporary gene table", file=sys.stderr)
			cursor.execute('''
				INSERT INTO genes (accession, name)
				SELECT g1.accession, g1.name FROM tmp_genes g1 LEFT JOIN genes g2 ON
				g1.accession = g2.accession
				WHERE g2.id IS NULL;
				''')

			cursor.execute("""DROP TABLE tmp_genes""")
			conn.commit()

		# transcripts
		if inserting_transcripts:
			print("building transcript string", file=sys.stderr)
			insert_string = "\n".join(["\t".join(t) for t in inserting_transcripts]) + "\n"

			print("create temporary transcript table", file=sys.stderr)
			cursor.execute("""
			CREATE TEMPORARY TABLE tmp_transcripts (
				accession VARCHAR,
				gene_accession VARCHAR,
				biotype VARCHAR,
				protein_accession VARCHAR,
				feature_count INTEGER
			)""")

			print("inserting into temporary transcript table", file=sys.stderr)
			cursor.copy_from(StringIO(insert_string), "tmp_transcripts", columns=["accession", "gene_accession, biotype, protein_accession, feature_count"])

			print("join temporary transcript table", file=sys.stderr)

			cursor.execute('''
				INSERT INTO transcripts (accession, gene_id, biotype, protein_accession, feature_count)
				SELECT t.accession, g.id, t.biotype, t.protein_accession, t.feature_count FROM tmp_transcripts t
				LEFT JOIN genes g ON g.accession = t.gene_accession
				LEFT JOIN transcripts t2 ON t.accession = t2.accession
				WHERE t2.id IS NULL;
				''')


			cursor.execute("""DROP TABLE tmp_transcripts""")
			conn.commit()


		# annotations
		if inserting_tmp_annotations:
			print("building annotation string", file=sys.stderr)
			insert_string = "\n".join(["\t".join(a) for a in inserting_tmp_annotations]) + "\n"

			print("create temporary annotation table", file=sys.stderr)
			cursor.execute("""
			CREATE TEMPORARY TABLE tmp_annotations (
				id SERIAL,
				chrom VARCHAR,
				pos INTEGER,
				alt VARCHAR,
				ref VARCHAR,
				feature VARCHAR)
			""")

			print("inserting into temporary annotation table", file=sys.stderr)
			cursor.copy_from(StringIO(insert_string), "tmp_annotations", columns=["chrom", "pos", "alt", "ref", "feature"])

			print("fetching variant and transcript ids", file=sys.stderr)
			cursor.execute('''SELECT v.id, t.id
				FROM tmp_annotations a
				LEFT JOIN variants v ON
					v.chrom = a.chrom AND
					v.pos = a.pos AND
					v.alt = a.alt AND
					v.ref = a.ref
				LEFT JOIN transcripts t ON
					t.accession = a.feature
				ORDER BY a.id
				''')
			variant_transcripts = cursor.fetchall()

			insert_string = []
			for ids, a in zip(variant_transcripts, inserting_annotations):
				if ids in existing_annotations: continue
				vid, tid = ids
	#			if tid == None:
	#				print(ids)
	#				print(a)
				if vid == None or tid == None: continue
				insert_string.append("\t".join((str(vid),str(tid)) + a))


			if len(insert_string):
				print("inserting", len(insert_string), "annotations", file=sys.stderr)
				insert_string = "\n".join(insert_string) + "\n"
				cursor.copy_from(StringIO(insert_string), "annotations", columns=["variant_id", "transcript_id"] + keys2)
			else:
				print("variants already annotated", file=sys.stderr)

			cursor.execute("""DROP TABLE tmp_annotations""")
			conn.commit()
		sys.stderr.flush()



def main():
	parser = HelpfulArgumentParser(description=__doc__)
	parser.add_argument("--vep-cache", "-c", help="The path to the vep-cache")
	parser.add_argument("--threads", "-t", default=1, help="invoke vep with that many threads")
	parser.add_argument("--html", default="output_vep.html", help="the vep output html file, must end with .html or .htm")
#	parser.add_argument("--clear-existing", action='store_true', default=False, help="Overwrite existing feature entries instead of skipping them")
	parser.add_argument("--url", "-u", default=None, help="Database configuration URL. If this is missing, the configuration is read from the exomate configuration file")
	parser.add_argument("--verbose", "-v", action='store_true', default=False, help="show SQL statements (enables SQLalchemy echoing)")


	args = parser.parse_args()

	Session = database.get_session(url=args.url)
	session = Session()
	print("connected to:", session.bind.engine, file=sys.stderr)
	if args.verbose:
		session.bind.engine.echo = True


	print("open subprocess", file=sys.stderr)
	print("threads", args.threads)
	command = "vep --force_overwrite --offline --fasta /vol/ref/vep/homo_sapiens/74/Homo_sapiens.GRCh37.74.dna.primary_assembly.fa --vcf --dir %(vep_cache)s --fork %(fork)s --hgvs --everything -o STDOUT --stats_file %(output_html)s" % {'vep_cache': args.vep_cache, 'fork': args.threads, "output_html": args.html}
	p = Popen(command, shell=True, stdout=PIPE, stdin=PIPE, stderr=sys.stderr)

	t = threading.Thread(target=vep, args=(session, p))
	t.start()
	importing(session, p)
	t.join()
	session.commit()
	session.close()

if __name__ == '__main__':
	main()
Tech Fingerprint

Alerts (46)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
16
'def' Ensure functions have docstrings for documentation
36 41 48 115 134 146 298
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
70
'print(' Use logging module for better control and configurability
71 72 116 135 137 139 154 158 183 187 194 197 210 213 223 226 243 246 257 260 279 280 286 290 312 317 318
Complexity hotspot; lines 105 to 109 (total complexity: 5)
105 106 107 108 109
'lambda' Avoid complex 'lambda' functions; prefer named functions for clarity and debugging
144
'== None' Use 'is' for None comparisons (e.g., x is None)
278 281
'open(' Use 'with open()' to ensure Files are properly closed
320
'threading.Thread(' Use concurrent.futures for easier thread management
322