
/exomate/scripts/annotatevep.py

https://bitbucket.org/marcelm/exomate

#!/usr/bin/env python
import sys
import os.path
from io import StringIO
import psycopg2
from sqlalchemy.engine.url import make_url
from sqlalchemy import (Table, Column, Integer, SmallInteger, BigInteger,
    String, Boolean, Float, Date, DateTime, CHAR, Index, ForeignKey,
    UniqueConstraint, CheckConstraint)
import sqlalchemy.types as sqltypes
from sqt.io.fasta import IndexedFasta
from exomate import database
from exomate.variants import normalized_indel, transition_transversion
from exomate.models import *
from exomate.commandline import HelpfulArgumentParser
from collections import namedtuple, defaultdict
from time import gmtime, strftime
from functools import lru_cache
from itertools import count, zip_longest, islice
from time import sleep
import tempfile
from subprocess import Popen, PIPE, STDOUT
import threading
import math
import copy

__author__ = "Christopher Schroeder, Johannes Koester"

# Fields of a single VEP CSQ entry, in the order VEP writes them into the INFO column.
CSQ = namedtuple("CSQ", "allele gene feature feature_type consequence cdna_position cds_position protein_position amino_acids codons existing_variation aa_maf ea_maf exon intron motif_name motif_pos high_inf_pos motif_score_change distance clin_sig canonical symbol symbol_source sift polyphen gmaf biotype ensp domains ccds hgvsc hgvsp afr_maf amr_maf asn_maf eur_maf pubmed")

# Column order used when bulk-inserting rows into the annotations table.
keys2 = ["gene", "feature", "feature_type", "consequence", "cdna_position_start", "cdna_position_stop", "cds_position_start", "cds_position_stop", "protein_position_start", "protein_position_stop", "ref_amino_acid", "alt_amino_acid", "ref_codon", "alt_codon", "existing_variation", "aa_maf", "ea_maf", "exon_number", "exon_count", "intron_number", "intron_count", "motif_score_change", "distance", "canonical", "symbol", "symbol_source", "sift_prediction_term", "sift_score", "polyphen_prediction_term", "polyphen_score", "biotype", "ensp", "domains", "ccds", "hgvsc", "hgvsp", "afr_maf", "amr_maf", "asn_maf", "eur_maf"]
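

# VepReader: parses VEP-annotated VCF lines read from the subprocess' stdout and
# yields one dict per CSQ entry, with the position converted back to 0-based and
# VEP's compound fields (amino acids, codons, SIFT/PolyPhen, exon/intron, MAFs)
# split into separate columns.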
def VepReader(p, single=False):
    from exomate.variants import VARIANT
    VARIANT_DICT = dict(VARIANT.__dict__)

    def no_question(x, y):
        # map '?' and empty strings (unknown values) to None
        if x == '?' or x == '':
            x = None
        if y == '?' or y == '':
            y = None
        return x, y

    def split_strip(x, splitter, stripper='', no_question=no_question):
        # split a compound VEP field such as "A/T" or "tolerated(0.12)" into two values
        if not x or x == '-':
            return None, None
        if len(x.split(splitter)) == 1:
            return no_question(x.rstrip(stripper), x.rstrip(stripper))
        return no_question(*x.rstrip(stripper).split(splitter))

    f = p.stdout
    for line in f:
        line = line.decode()
        if line.startswith("#"):
            continue
        v = {}
        v["chrom"], v["pos"], v["id"], v["ref"], v["alt"], v["qual"], v["filter"], info = line.split("\t")[0:8]
        v["pos"] = str(int(v["pos"]) - 1)  # back to 0-based
        # the CSQ entries are the last INFO field: "CSQ=entry1,entry2,..."
        csqs = info.split(";")[-1][4:].split(",")
        for csq in csqs:
            try:
                vep = CSQ(*csq.split("|"))
            except Exception:
                # wrong number of fields in this CSQ entry; report and skip it
                print("error", file=sys.stderr)
                print(csq, file=sys.stderr)
                continue
            v.update({x: (getattr(vep, x) if getattr(vep, x) else None) for x in vep._fields})
            v["ref_amino_acid"], v["alt_amino_acid"] = split_strip(v["amino_acids"], "/")
            v["ref_codon"], v["alt_codon"] = split_strip(v["codons"], "/")
            v["sift_prediction_term"], v["sift_score"] = split_strip(v["sift"], "(", ")")
            v["polyphen_prediction_term"], v["polyphen_score"] = split_strip(v["polyphen"], "(", ")")
            v["exon_number"], v["exon_count"] = split_strip(v["exon"], "/")
            v["intron_number"], v["intron_count"] = split_strip(v["intron"], "/")
            v["cdna_position_start"], v["cdna_position_stop"] = split_strip(v["cdna_position"], "-")
            v["cds_position_start"], v["cds_position_stop"] = split_strip(v["cds_position"], "-")
            v["protein_position_start"], v["protein_position_stop"] = split_strip(v["protein_position"], "-")
            # take only the first value when MAF fields contain '&'-separated lists
            v["aa_maf"], _ = split_strip(v["aa_maf"], "&")
            v["asn_maf"], _ = split_strip(v["asn_maf"], "&")
            v["eur_maf"], _ = split_strip(v["eur_maf"], "&")
            v["amr_maf"], _ = split_strip(v["amr_maf"], "&")
            v["ea_maf"], _ = split_strip(v["ea_maf"], "&")
            v["afr_maf"], _ = split_strip(v["afr_maf"], "&")
            v["exon_number"], _ = split_strip(v["exon_number"], "-")
            # build a bitvector of consequence flags
            bitvector = 0
            for c in v["consequence"].split("&"):
                if c.endswith('_variant'):
                    c = c[:-8]
                if c == "3_prime_UTR":
                    c = "THREE_PRIME_UTR"
                if c == "5_prime_UTR":
                    c = "FIVE_PRIME_UTR"
                bitvector |= 2 ** VARIANT_DICT[c.upper()]
            v["consequence"] = bitvector
            # sanity check: the population MAF fields must be parseable as floats
            for key in ("aa_maf", "asn_maf", "eur_maf", "amr_maf", "afr_maf"):
                float(v[key] or 0)
            yield v
        if single:
            break
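

# vep(): feeds all not-yet-annotated variants from the database into the VEP
# subprocess' stdin as a minimal VCF stream; meant to run in a separate thread
# while importing() consumes the annotated output on the other end of the pipe.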
def vep(session, p):
    print("load unannotated variants", file=sys.stderr)
    # with tempfile.NamedTemporaryFile(delete=True, mode='wt') as f:
    with open("pseudovep.vcf", "wt") as f:
        # note: f itself is not written to; the VCF stream goes directly to VEP's stdin
        p.stdin.write("##fileformat=VCFv4.1\n".encode())
        p.stdin.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO\n"]).encode())
        # fetch all variants for which at least one call exists, to avoid annotating
        # variants without calls (e.g. dbSNP variants)
        results = session.execute("select v.chrom, v.pos + 1, v.id, v.ref, v.alt from (select distinct(variant_id) from calls WHERE variant_id > (select coalesce(max(variant_id)-1000,0) FROM annotations)) c join variants v on c.variant_id = v.id ORDER BY v.id")
        for x in results:
            line = "\t".join(str(i) for i in x) + "\t.\t.\t.\n"
            p.stdin.write(line.encode())
    p.stdin.close()
    p.wait()
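

# importing(): reads annotated records from VepReader in chunks and bulk-loads new
# genes, transcripts and annotations into PostgreSQL via temporary tables and COPY
# (cursor.copy_from), skipping records that already exist in the database.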
def importing(session, p, chunksize=5000):
    print("fetching existing genes", file=sys.stderr)
    existing_genes = set(x[0] for x in session.execute("SELECT accession FROM genes").fetchall())
    print("fetching existing transcripts", file=sys.stderr)
    existing_transcripts = set(x[0] for x in session.execute("SELECT accession FROM transcripts").fetchall())
    print("fetching existing annotations", file=sys.stderr)
    existing_annotations = set(tuple(x) for x in session.execute("SELECT v.chrom, v.pos, v.alt, t.accession FROM annotations a JOIN variants v ON v.id = a.variant_id JOIN transcripts t ON t.id = a.transcript_id WHERE v.id > (select coalesce(max(variant_id)-1000,0) FROM annotations) ORDER BY v.id").fetchall())
    vep = VepReader(p)
    # format a tuple for COPY: None becomes \N (PostgreSQL NULL), everything else is stringified
    fmt = lambda x: tuple(str(i) if i is not None else r"\N" for i in x)

    def grouper(iterable, n):
        # yield tuples of up to n records; copy each record because VepReader
        # reuses and mutates the same dict between yields
        it = iter(iterable)
        while True:
            chunk = tuple(map(copy.copy, islice(it, n)))
            if not chunk:
                return
            yield chunk
  123. print("processing the vep file", file=sys.stderr)
  124. for i, group in enumerate(grouper(vep, chunksize)):
  125. print(i * chunksize, "variants processed", file=sys.stderr)
  126. inserting_genes = []
  127. inserting_transcripts = []
  128. inserting_annotations = []
  129. inserting_tmp_annotations = []
  130. for record in group:
  131. if record["gene"] not in existing_genes:
  132. inserting_genes.append(fmt((record["gene"], record["symbol"])))
  133. existing_genes.add(record["gene"])
  134. if record["feature"] not in existing_transcripts:
  135. inserting_transcripts.append(fmt((record["feature"], record["gene"], record["biotype"], record["ensp"], 0)))
  136. existing_transcripts.add(record["feature"])
  137. if (record["chrom"], int(record["pos"]), record["alt"], record["feature"]) not in existing_annotations:
  138. inserting_annotations.append(fmt(tuple(record[k] for k in keys2)))
  139. inserting_tmp_annotations.append(fmt(tuple(record[k] for k in ["chrom", "pos", "alt", "ref", "feature"])))
        conn = session.get_bind().raw_connection()
        cursor = conn.cursor()
        # genes
        if inserting_genes:  # only if there are new genes
            print("building gene string", file=sys.stderr)
            insert_string = "\n".join(["\t".join(g) for g in inserting_genes]) + "\n"
            print("create temporary gene table", file=sys.stderr)
            cursor.execute("""
                CREATE TEMPORARY TABLE tmp_genes (
                    accession VARCHAR,
                    name VARCHAR
                )""")
            print("inserting into temporary gene table", file=sys.stderr)
            cursor.copy_from(StringIO(insert_string), "tmp_genes", columns=["accession", "name"])
            print("join temporary gene table", file=sys.stderr)
            cursor.execute('''
                INSERT INTO genes (accession, name)
                SELECT g1.accession, g1.name FROM tmp_genes g1 LEFT JOIN genes g2 ON
                    g1.accession = g2.accession
                WHERE g2.id IS NULL;
                ''')
            cursor.execute("""DROP TABLE tmp_genes""")
            conn.commit()
        # transcripts
        if inserting_transcripts:
            print("building transcript string", file=sys.stderr)
            insert_string = "\n".join(["\t".join(t) for t in inserting_transcripts]) + "\n"
            print("create temporary transcript table", file=sys.stderr)
            cursor.execute("""
                CREATE TEMPORARY TABLE tmp_transcripts (
                    accession VARCHAR,
                    gene_accession VARCHAR,
                    biotype VARCHAR,
                    protein_accession VARCHAR,
                    feature_count INTEGER
                )""")
            print("inserting into temporary transcript table", file=sys.stderr)
            cursor.copy_from(StringIO(insert_string), "tmp_transcripts", columns=["accession", "gene_accession", "biotype", "protein_accession", "feature_count"])
            print("join temporary transcript table", file=sys.stderr)
            cursor.execute('''
                INSERT INTO transcripts (accession, gene_id, biotype, protein_accession, feature_count)
                SELECT t.accession, g.id, t.biotype, t.protein_accession, t.feature_count FROM tmp_transcripts t
                LEFT JOIN genes g ON g.accession = t.gene_accession
                LEFT JOIN transcripts t2 ON t.accession = t2.accession
                WHERE t2.id IS NULL;
                ''')
            cursor.execute("""DROP TABLE tmp_transcripts""")
            conn.commit()
        # annotations
        if inserting_tmp_annotations:
            print("building annotation string", file=sys.stderr)
            insert_string = "\n".join(["\t".join(a) for a in inserting_tmp_annotations]) + "\n"
            print("create temporary annotation table", file=sys.stderr)
            cursor.execute("""
                CREATE TEMPORARY TABLE tmp_annotations (
                    id SERIAL,
                    chrom VARCHAR,
                    pos INTEGER,
                    alt VARCHAR,
                    ref VARCHAR,
                    feature VARCHAR)
                """)
            print("inserting into temporary annotation table", file=sys.stderr)
            cursor.copy_from(StringIO(insert_string), "tmp_annotations", columns=["chrom", "pos", "alt", "ref", "feature"])
            print("fetching variant and transcript ids", file=sys.stderr)
            cursor.execute('''SELECT v.id, t.id
                FROM tmp_annotations a
                LEFT JOIN variants v ON
                    v.chrom = a.chrom AND
                    v.pos = a.pos AND
                    v.alt = a.alt AND
                    v.ref = a.ref
                LEFT JOIN transcripts t ON
                    t.accession = a.feature
                ORDER BY a.id
                ''')
            variant_transcripts = cursor.fetchall()
            # pair the resolved ids with the prepared annotation rows; the ORDER BY a.id
            # above keeps both lists in the same order
            insert_string = []
            for ids, a in zip(variant_transcripts, inserting_annotations):
                if ids in existing_annotations:
                    continue
                vid, tid = ids
                # if tid is None:
                #     print(ids)
                #     print(a)
                if vid is None or tid is None:
                    continue
                insert_string.append("\t".join((str(vid), str(tid)) + a))
            if len(insert_string):
                print("inserting", len(insert_string), "annotations", file=sys.stderr)
                insert_string = "\n".join(insert_string) + "\n"
                cursor.copy_from(StringIO(insert_string), "annotations", columns=["variant_id", "transcript_id"] + keys2)
            else:
                print("variants already annotated", file=sys.stderr)
            cursor.execute("""DROP TABLE tmp_annotations""")
            conn.commit()
        sys.stderr.flush()
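

# main(): parses the command line, starts the VEP subprocess, then runs the writer
# (vep) in a background thread while importing() reads and stores the results.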
def main():
    parser = HelpfulArgumentParser(description=__doc__)
    parser.add_argument("--vep-cache", "-c", help="The path to the vep-cache")
    parser.add_argument("--threads", "-t", type=int, default=1, help="invoke vep with that many threads")
    parser.add_argument("--html", default="output_vep.html", help="the vep output html file, must end with .html or .htm")
    # parser.add_argument("--clear-existing", action='store_true', default=False, help="Overwrite existing feature entries instead of skipping them")
    parser.add_argument("--url", "-u", default=None, help="Database configuration URL. If this is missing, the configuration is read from the exomate configuration file")
    parser.add_argument("--verbose", "-v", action='store_true', default=False, help="show SQL statements (enables SQLAlchemy echoing)")
    args = parser.parse_args()

    Session = database.get_session(url=args.url)
    session = Session()
    print("connected to:", session.bind.engine, file=sys.stderr)
    if args.verbose:
        session.bind.engine.echo = True

    print("open subprocess", file=sys.stderr)
    print("threads", args.threads, file=sys.stderr)
    command = "vep --force_overwrite --offline --fasta /vol/ref/vep/homo_sapiens/74/Homo_sapiens.GRCh37.74.dna.primary_assembly.fa --vcf --dir %(vep_cache)s --fork %(fork)s --hgvs --everything -o STDOUT --stats_file %(output_html)s" % {'vep_cache': args.vep_cache, 'fork': args.threads, "output_html": args.html}
    p = Popen(command, shell=True, stdout=PIPE, stdin=PIPE, stderr=sys.stderr)
    # write variants to VEP in a separate thread while importing() reads its output here
    t = threading.Thread(target=vep, args=(session, p))
    t.start()
    importing(session, p)
    t.join()
    session.commit()
    session.close()


if __name__ == '__main__':
    main()
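
# Example invocation (paths below are placeholders, not part of the repository):
#   python annotatevep.py --vep-cache /path/to/vep_cache --threads 4 --html output_vep.html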