PageRenderTime 60ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 1ms

/exomate/views/mutations.py

https://bitbucket.org/marcelm/exomate
Python | 288 lines | 237 code | 31 blank | 20 comment | 33 complexity | b024f6f03045b406fbeebc4dd33257b6 MD5 | raw file
  1. """
  2. The Mutations page.
  3. """
  4. import os
  5. from collections import defaultdict, namedtuple, Counter
  6. from flask import request, render_template
  7. from sqlalchemy import or_, and_, func, between, distinct
  8. from exomate.shortener import shorten
  9. from exomate.application import app
  10. from exomate.database import get_session
  11. from exomate.models import *
  12. from exomate.variants import VARIANT, VARIANT_OUT_NAMES, EXON_VARIANTS
  13. from exomate.views.common import Row, NO_FILTER, get_incomplete_annotation_warning, parse_aux_filter_parameters, get_aux_filter, parse_mutation_query_parameters
  14. session = get_session()
  15. #@shorten("mutations")
  16. @app.route('/mutations', methods=['GET', 'POST'])
  17. def mutations():
  18. if request.method != 'POST':
  19. diseases = session.query(Disease).all()
  20. samples = session.query(Sample).join(Disease).order_by(Disease.name, Sample.accession).all()
  21. sample_groups = session.query(SampleGroup).all()
  22. known_sources = session.query(KnownSource).all()
  23. bedfiles = [x for (x,) in session.query(distinct(BedAnnotation.accession)).all()]
  24. warning = get_incomplete_annotation_warning()
  25. return render_template('mutations-query.html', diseases=diseases,
  26. samples=samples,
  27. known_sources=known_sources,
  28. warning=warning,
  29. sample_groups=sample_groups,
  30. bedfiles = bedfiles,
  31. variant=VARIANT_OUT_NAMES,
  32. exon_variants=EXON_VARIANTS)
  33. query = parse_mutation_query_parameters(request.form)
  34. # TODO this can be simplified a lot by using SQLalchemy properly
  35. #
  36. # get all affected samples
  37. affected_samples = session.query(Sample).filter(Sample.accession.in_(query.affected_sample_names)).all()
  38. if not affected_samples:
  39. return render_template('error.html', message="No samples specified")
  40. samples = dict((s.accession, s) for s in affected_samples)
  41. # get selected sample ids by the selected accessions
  42. sample_ids = set(s.id for s in affected_samples)
  43. # extend unaffected sample ids by the samples of the selected diseases
  44. filter_sample_ids_ext = set(s.id for s in query.filter_samples)
  45. if query.filter_diseases:
  46. filter_sample_ids_ext |= set(s.id for s in session.query(Sample).\
  47. filter(Sample.disease_id.in_(d.id for d in query.filter_diseases)).all())
  48. # make sure the filter samples set doesn't contain a selected sample and save the intersection for later output
  49. samples_unaffected_intersection_ids = filter_sample_ids_ext & sample_ids
  50. samples_unaffected_intersection_names = set([r for (r, ) in session.query(Sample.accession).filter(Sample.id.in_(samples_unaffected_intersection_ids)).all()]) if samples_unaffected_intersection_ids else []
  51. # the real filter samples (minus selected samples)
  52. filter_sample_ids_ext = filter_sample_ids_ext - sample_ids
  53. affected_filter = False
  54. if query.affected_sample_names:
  55. affected_filter = Sample.accession.in_(query.affected_sample_names)
  56. # building filter, to filter unaffected samples
  57. unaffected_samples = session.query(Sample).join(Library).join(Unit).join(UnitStats)
  58. #unaffected_filter = False
  59. #if filter_sample_ids_ext:
  60. unaffected_filter = Sample.id.in_(filter_sample_ids_ext)
  61. unaffected_samples = unaffected_samples.filter(unaffected_filter).all()
  62. affected_filter &= Call.qual >= query.affected_qual
  63. # show only homozygous or heterozygous?
  64. if not (query.show_homozygous and query.show_heterozygous):
  65. if query.show_homozygous == query.show_heterozygous: #both unchecked
  66. return render_template('error.html', message="At least heterozygous or homozygous must be checked")
  67. affected_filter &= Call.is_heterozygous == query.show_heterozygous
  68. # additional quality filters
  69. aux_filter_query = parse_aux_filter_parameters(request.form)
  70. aux_filter = get_aux_filter(aux_filter_query)
  71. # maf filter
  72. if query.min_eur_maf > 0.0:
  73. affected_filter &= Annotation.eur_maf >= query.min_eur_maf
  74. if query.max_eur_maf < 1.0:
  75. affected_filter &= Annotation.eur_maf >= query.max_eur_maf
  76. if query.min_amr_maf > 0.0:
  77. affected_filter &= Annotation.amr_maf >= query.min_amr_maf
  78. if query.max_amr_maf < 1.0:
  79. affected_filter &= Annotation.amr_maf >= query.max_amr_maf
  80. if query.min_afr_maf > 0.0:
  81. affected_filter &= Annotation.afr_maf >= query.min_afr_maf
  82. if query.max_afr_maf < 1.0:
  83. affected_filter &= Annotation.afr_maf >= query.max_afr_maf
  84. if query.min_asn_maf > 0.0:
  85. affected_filter &= Annotation.asn_maf >= query.min_asn_maf
  86. if query.max_asn_maf < 1.0:
  87. affected_filter &= Annotation.asn_maf >= query.max_asn_maf
  88. genesused = []
  89. if query.all_genes:
  90. transcripts = None
  91. gene_filter = NO_FILTER
  92. notfound = []
  93. else:
  94. gene_names = query.gene_names
  95. genesused = [g for g in session.query(Gene).filter(Gene.name.in_(gene_names)).all()]
  96. notfound = list(set(gene_names) - set([g.name for g in genesused]))
  97. transcripts = session.query(Transcript).join(Gene).filter(Gene.name.in_(gene_names)).all()
  98. gene_filter = Gene.name.in_(gene_names)
  99. mutations = affected_mutations(
  100. affected_filter,
  101. unaffected_filter,
  102. aux_filter,
  103. query.unaffected_qual,
  104. query.consequences_bits,
  105. query.known_sources,
  106. query.splice_dist,
  107. query.show_dbsnp_clinical,
  108. query.show_dbsnp_precious,
  109. query.show_dbsnp_locus_specific,
  110. query.show_coding_variants,
  111. query.show_intron_variants,
  112. query.show_utr_variants,
  113. query.show_protein_products_only,
  114. query.ignore_heterozygous,
  115. query.bedfile_accession).filter(gene_filter).all()
  116. rows = defaultdict(Row)
  117. samples_per_gene = defaultdict(set) # gene_name -> sample, used as counter, to detect common genes in different samples
  118. variant_per_gene = defaultdict(set)
  119. # grouping (call/annotation)s without bedannoation
  120. CallAnnotation = namedtuple("CallAnnotation", "call annotation bed")
  121. for call, annotation, bed in mutations:
  122. gene = annotation.transcript.gene
  123. unit = call.sample.units[0]
  124. variant = call.variant
  125. sample_accession = unit.library.sample.accession
  126. samples_per_gene[gene.name].add(sample_accession)
  127. # the key to group the entries
  128. key = (annotation.transcript.gene.name, annotation.variant.id)
  129. rows[key].add(CallAnnotation(call, annotation, bed))
  130. variant_per_gene[gene.name].add(variant.id)
  131. sorted_rows = sorted([row for row in rows.values() if len(samples_per_gene[row.gene.name]) >= query.min_samples_per_gene and len(row.samples) >= query.min_samples_per_variant and len(variant_per_gene[row.gene.name]) >= query.min_variants_per_gene], key=lambda row: row.gene.name)
  132. warning = get_incomplete_annotation_warning()
  133. return render_template(
  134. 'mutations-result.html',
  135. rows = sorted_rows,
  136. notfound=notfound,
  137. genesused=genesused,
  138. transcripts=transcripts,
  139. unaffected_samples=unaffected_samples,
  140. warning=warning,
  141. common_genes=samples_per_gene,
  142. samples_unaffected_intersection_names=samples_unaffected_intersection_names,
  143. query=query,
  144. mutation_stats=mutation_stats(mutations, affected_samples))
  145. def affected_mutations(
  146. affected_filter,
  147. unaffected_filter,
  148. aux_filter,
  149. unaffected_qual,
  150. consequences,
  151. known_sources='all',
  152. splice_dist=10,
  153. show_dbsnp_clinical=True,
  154. show_dbsnp_precious=True,
  155. show_dbsnp_locus_specific=True,
  156. show_coding_variants=True,
  157. show_intron_variants=False,
  158. show_utr_variants=False,
  159. show_protein_products_only=False,
  160. ignore_heterozygous=True,
  161. bedaccession=None
  162. ):
  163. """
  164. Return a query that finds all variants of 'affected' units and subtracts
  165. both uninteresting ones from other units and also known variants.
  166. known_sources -- a list of KnownSource.id values (which may be empty).
  167. The known variants from that source are subtracted from the result, but
  168. filtered according to tags in dbSNP (see KnownVariant in models.py). If
  169. this has the special value 'all', then all known variants are filtered.
  170. affected_filter -- expression that is used to select all interesting
  171. calls. The expression can access Call, Unit, Library, Sample,
  172. Disease and Patient.
  173. unaffected_filter -- expression that is used to filter out all uninteresting
  174. calls from other units.
  175. Also, synonymous variants are filtered unless they are close to a splice site.
  176. """
  177. # Find all calls from samples that we are interested in
  178. calls = session.query(Call).join(Sample).join(Patient).\
  179. join(Disease).filter(affected_filter)
  180. # Find all sample_ids of those calls that should be filtered out.
  181. # TODO this was 10 times faster with unit_ids, is this still the case
  182. # with sample_ids?
  183. #
  184. # It is 10 times faster to determine this list beforehand via a separate
  185. # query than to add a subquery to the final SQL statement.
  186. # The original took 49s at best. This version takes 4s at best, but
  187. # sometimes 35s.
  188. unaffected_samples = session.query(Sample).\
  189. join(Patient).join(Disease).\
  190. filter(unaffected_filter).all()
  191. unaffected_sample_ids = [ sample.id for sample in unaffected_samples ]
  192. # If unaffected samples are given, subtract their variants
  193. if unaffected_sample_ids:
  194. unaffected_calls = session.query(Call).\
  195. filter(Call.qual > unaffected_qual).\
  196. filter(Call.sample_id.in_(unaffected_sample_ids)).subquery()
  197. equal_variant = unaffected_calls.c.variant_id == Call.variant_id
  198. if not ignore_heterozygous:
  199. equal_variant &= unaffected_calls.c.is_heterozygous == Call.is_heterozygous
  200. calls = calls.outerjoin(unaffected_calls, equal_variant).\
  201. filter(unaffected_calls.c.variant_id == None)
  202. # Subtract known variants. Variants that are known,
  203. # but for which one of the dbsnp_tags is set
  204. # are kept.
  205. # (no allele frequency here, only relevant for recessive)
  206. if known_sources:
  207. conditions = [KnownVariant.variant_id == None]
  208. if show_dbsnp_clinical:
  209. conditions.append(and_(KnownVariant.clinical, KnownVariant.clinical_significance != 2))
  210. if show_dbsnp_precious:
  211. conditions.append(KnownVariant.precious)
  212. if show_dbsnp_locus_specific:
  213. conditions.append(KnownVariant.locus_specific_db)
  214. join_condition = KnownVariant.variant_id == Call.variant_id
  215. if known_sources != 'all':
  216. join_condition = and_(join_condition, KnownVariant.source_id.in_(ks.id for ks in known_sources))
  217. calls = calls.outerjoin(KnownVariant, join_condition).filter(or_(*conditions))
  218. # Select variants based on their annotation.
  219. result = session.query(Call, Annotation).select_from(calls.subquery()).\
  220. join(Variant).join(Annotation).filter(aux_filter).filter(Annotation.consequence.op('&')(consequences)>0).join(Transcript).join(Gene)
  221. #pprint(session.execute(Explain(result)).fetchall())
  222. result = session.query(Call, Annotation, BedAnnotation).select_from(calls.subquery()).\
  223. join(Variant).join(Annotation).filter(aux_filter).filter(Annotation.consequence.op('&')(consequences)>0).join(Transcript).join(Gene).\
  224. outerjoin(BedAnnotation, and_(BedAnnotation.accession == bedaccession, BedAnnotation.chrom == Variant.chrom, between(Variant.pos, BedAnnotation.start, BedAnnotation.stop)))
  225. if show_protein_products_only:
  226. result = result.filter(Transcript.protein_accession != None)
  227. return result
  228. def mutation_stats(mutations, affected_samples):
  229. total_library_size = sum(unit.stats.bases for sample in affected_samples for unit in sample.units if unit.stats)
  230. if not total_library_size:
  231. return []
  232. mutation_counts = Counter(annotation.transcript for call, annotation, bed in mutations)
  233. mutations_per_kilobase = {transcript: count / (transcript.length / 1e3) for transcript, count in list(mutation_counts.items())}
  234. mutations_per_kilobase_per_million = {transcript: mpk / (total_library_size / 1e6) for transcript, mpk in list(mutations_per_kilobase.items())}
  235. # return sorted([(transcript, mutation_counts[transcript], mutations_per_kilobase_per_million[transcript]) for transcript in mutations_per_kilobase_per_million], key=lambda item: item[0].name)
  236. return sorted([(transcript, mutation_counts[transcript], mutations_per_kilobase_per_million[transcript]) for transcript in mutations_per_kilobase_per_million], key=lambda item: item[0].accession)