/exomate/views/mutations.py
Python | 288 lines | 237 code | 31 blank | 20 comment | 33 complexity | b024f6f03045b406fbeebc4dd33257b6 MD5 | raw file
- """
- The Mutations page.
- """
- import os
- from collections import defaultdict, namedtuple, Counter
- from flask import request, render_template
- from sqlalchemy import or_, and_, func, between, distinct
- from exomate.shortener import shorten
- from exomate.application import app
- from exomate.database import get_session
- from exomate.models import *
- from exomate.variants import VARIANT, VARIANT_OUT_NAMES, EXON_VARIANTS
- from exomate.views.common import Row, NO_FILTER, get_incomplete_annotation_warning, parse_aux_filter_parameters, get_aux_filter, parse_mutation_query_parameters
- session = get_session()
- #@shorten("mutations")
- @app.route('/mutations', methods=['GET', 'POST'])
- def mutations():
- if request.method != 'POST':
- diseases = session.query(Disease).all()
- samples = session.query(Sample).join(Disease).order_by(Disease.name, Sample.accession).all()
- sample_groups = session.query(SampleGroup).all()
- known_sources = session.query(KnownSource).all()
- bedfiles = [x for (x,) in session.query(distinct(BedAnnotation.accession)).all()]
- warning = get_incomplete_annotation_warning()
- return render_template('mutations-query.html', diseases=diseases,
- samples=samples,
- known_sources=known_sources,
- warning=warning,
- sample_groups=sample_groups,
- bedfiles = bedfiles,
- variant=VARIANT_OUT_NAMES,
- exon_variants=EXON_VARIANTS)
- query = parse_mutation_query_parameters(request.form)
- # TODO this can be simplified a lot by using SQLalchemy properly
- #
- # get all affected samples
- affected_samples = session.query(Sample).filter(Sample.accession.in_(query.affected_sample_names)).all()
- if not affected_samples:
- return render_template('error.html', message="No samples specified")
- samples = dict((s.accession, s) for s in affected_samples)
- # get selected sample ids by the selected accessions
- sample_ids = set(s.id for s in affected_samples)
- # extend unaffected sample ids by the samples of the selected diseases
- filter_sample_ids_ext = set(s.id for s in query.filter_samples)
- if query.filter_diseases:
- filter_sample_ids_ext |= set(s.id for s in session.query(Sample).\
- filter(Sample.disease_id.in_(d.id for d in query.filter_diseases)).all())
- # make sure the filter samples set doesn't contain a selected sample and save the intersection for later output
- samples_unaffected_intersection_ids = filter_sample_ids_ext & sample_ids
- samples_unaffected_intersection_names = set([r for (r, ) in session.query(Sample.accession).filter(Sample.id.in_(samples_unaffected_intersection_ids)).all()]) if samples_unaffected_intersection_ids else []
- # the real filter samples (minus selected samples)
- filter_sample_ids_ext = filter_sample_ids_ext - sample_ids
- affected_filter = False
- if query.affected_sample_names:
- affected_filter = Sample.accession.in_(query.affected_sample_names)
- # building filter, to filter unaffected samples
- unaffected_samples = session.query(Sample).join(Library).join(Unit).join(UnitStats)
- #unaffected_filter = False
- #if filter_sample_ids_ext:
- unaffected_filter = Sample.id.in_(filter_sample_ids_ext)
- unaffected_samples = unaffected_samples.filter(unaffected_filter).all()
- affected_filter &= Call.qual >= query.affected_qual
- # show only homozygous or heterozygous?
- if not (query.show_homozygous and query.show_heterozygous):
- if query.show_homozygous == query.show_heterozygous: #both unchecked
- return render_template('error.html', message="At least heterozygous or homozygous must be checked")
- affected_filter &= Call.is_heterozygous == query.show_heterozygous
- # additional quality filters
- aux_filter_query = parse_aux_filter_parameters(request.form)
- aux_filter = get_aux_filter(aux_filter_query)
- # maf filter
- if query.min_eur_maf > 0.0:
- affected_filter &= Annotation.eur_maf >= query.min_eur_maf
- if query.max_eur_maf < 1.0:
- affected_filter &= Annotation.eur_maf >= query.max_eur_maf
- if query.min_amr_maf > 0.0:
- affected_filter &= Annotation.amr_maf >= query.min_amr_maf
- if query.max_amr_maf < 1.0:
- affected_filter &= Annotation.amr_maf >= query.max_amr_maf
- if query.min_afr_maf > 0.0:
- affected_filter &= Annotation.afr_maf >= query.min_afr_maf
- if query.max_afr_maf < 1.0:
- affected_filter &= Annotation.afr_maf >= query.max_afr_maf
- if query.min_asn_maf > 0.0:
- affected_filter &= Annotation.asn_maf >= query.min_asn_maf
- if query.max_asn_maf < 1.0:
- affected_filter &= Annotation.asn_maf >= query.max_asn_maf
- genesused = []
- if query.all_genes:
- transcripts = None
- gene_filter = NO_FILTER
- notfound = []
- else:
- gene_names = query.gene_names
- genesused = [g for g in session.query(Gene).filter(Gene.name.in_(gene_names)).all()]
- notfound = list(set(gene_names) - set([g.name for g in genesused]))
- transcripts = session.query(Transcript).join(Gene).filter(Gene.name.in_(gene_names)).all()
- gene_filter = Gene.name.in_(gene_names)
- mutations = affected_mutations(
- affected_filter,
- unaffected_filter,
- aux_filter,
- query.unaffected_qual,
- query.consequences_bits,
- query.known_sources,
- query.splice_dist,
- query.show_dbsnp_clinical,
- query.show_dbsnp_precious,
- query.show_dbsnp_locus_specific,
- query.show_coding_variants,
- query.show_intron_variants,
- query.show_utr_variants,
- query.show_protein_products_only,
- query.ignore_heterozygous,
- query.bedfile_accession).filter(gene_filter).all()
- rows = defaultdict(Row)
- samples_per_gene = defaultdict(set) # gene_name -> sample, used as counter, to detect common genes in different samples
- variant_per_gene = defaultdict(set)
- # grouping (call/annotation)s without bedannoation
- CallAnnotation = namedtuple("CallAnnotation", "call annotation bed")
- for call, annotation, bed in mutations:
- gene = annotation.transcript.gene
- unit = call.sample.units[0]
- variant = call.variant
- sample_accession = unit.library.sample.accession
- samples_per_gene[gene.name].add(sample_accession)
- # the key to group the entries
- key = (annotation.transcript.gene.name, annotation.variant.id)
- rows[key].add(CallAnnotation(call, annotation, bed))
- variant_per_gene[gene.name].add(variant.id)
- sorted_rows = sorted([row for row in rows.values() if len(samples_per_gene[row.gene.name]) >= query.min_samples_per_gene and len(row.samples) >= query.min_samples_per_variant and len(variant_per_gene[row.gene.name]) >= query.min_variants_per_gene], key=lambda row: row.gene.name)
- warning = get_incomplete_annotation_warning()
- return render_template(
- 'mutations-result.html',
- rows = sorted_rows,
- notfound=notfound,
- genesused=genesused,
- transcripts=transcripts,
- unaffected_samples=unaffected_samples,
- warning=warning,
- common_genes=samples_per_gene,
- samples_unaffected_intersection_names=samples_unaffected_intersection_names,
- query=query,
- mutation_stats=mutation_stats(mutations, affected_samples))
- def affected_mutations(
- affected_filter,
- unaffected_filter,
- aux_filter,
- unaffected_qual,
- consequences,
- known_sources='all',
- splice_dist=10,
- show_dbsnp_clinical=True,
- show_dbsnp_precious=True,
- show_dbsnp_locus_specific=True,
- show_coding_variants=True,
- show_intron_variants=False,
- show_utr_variants=False,
- show_protein_products_only=False,
- ignore_heterozygous=True,
- bedaccession=None
- ):
- """
- Return a query that finds all variants of 'affected' units and subtracts
- both uninteresting ones from other units and also known variants.
- known_sources -- a list of KnownSource.id values (which may be empty).
- The known variants from that source are subtracted from the result, but
- filtered according to tags in dbSNP (see KnownVariant in models.py). If
- this has the special value 'all', then all known variants are filtered.
- affected_filter -- expression that is used to select all interesting
- calls. The expression can access Call, Unit, Library, Sample,
- Disease and Patient.
- unaffected_filter -- expression that is used to filter out all uninteresting
- calls from other units.
- Also, synonymous variants are filtered unless they are close to a splice site.
- """
- # Find all calls from samples that we are interested in
- calls = session.query(Call).join(Sample).join(Patient).\
- join(Disease).filter(affected_filter)
- # Find all sample_ids of those calls that should be filtered out.
- # TODO this was 10 times faster with unit_ids, is this still the case
- # with sample_ids?
- #
- # It is 10 times faster to determine this list beforehand via a separate
- # query than to add a subquery to the final SQL statement.
- # The original took 49s at best. This version takes 4s at best, but
- # sometimes 35s.
- unaffected_samples = session.query(Sample).\
- join(Patient).join(Disease).\
- filter(unaffected_filter).all()
- unaffected_sample_ids = [ sample.id for sample in unaffected_samples ]
- # If unaffected samples are given, subtract their variants
- if unaffected_sample_ids:
- unaffected_calls = session.query(Call).\
- filter(Call.qual > unaffected_qual).\
- filter(Call.sample_id.in_(unaffected_sample_ids)).subquery()
- equal_variant = unaffected_calls.c.variant_id == Call.variant_id
- if not ignore_heterozygous:
- equal_variant &= unaffected_calls.c.is_heterozygous == Call.is_heterozygous
- calls = calls.outerjoin(unaffected_calls, equal_variant).\
- filter(unaffected_calls.c.variant_id == None)
- # Subtract known variants. Variants that are known,
- # but for which one of the dbsnp_tags is set
- # are kept.
- # (no allele frequency here, only relevant for recessive)
- if known_sources:
- conditions = [KnownVariant.variant_id == None]
- if show_dbsnp_clinical:
- conditions.append(and_(KnownVariant.clinical, KnownVariant.clinical_significance != 2))
- if show_dbsnp_precious:
- conditions.append(KnownVariant.precious)
- if show_dbsnp_locus_specific:
- conditions.append(KnownVariant.locus_specific_db)
- join_condition = KnownVariant.variant_id == Call.variant_id
- if known_sources != 'all':
- join_condition = and_(join_condition, KnownVariant.source_id.in_(ks.id for ks in known_sources))
- calls = calls.outerjoin(KnownVariant, join_condition).filter(or_(*conditions))
- # Select variants based on their annotation.
- result = session.query(Call, Annotation).select_from(calls.subquery()).\
- join(Variant).join(Annotation).filter(aux_filter).filter(Annotation.consequence.op('&')(consequences)>0).join(Transcript).join(Gene)
- #pprint(session.execute(Explain(result)).fetchall())
- result = session.query(Call, Annotation, BedAnnotation).select_from(calls.subquery()).\
- join(Variant).join(Annotation).filter(aux_filter).filter(Annotation.consequence.op('&')(consequences)>0).join(Transcript).join(Gene).\
- outerjoin(BedAnnotation, and_(BedAnnotation.accession == bedaccession, BedAnnotation.chrom == Variant.chrom, between(Variant.pos, BedAnnotation.start, BedAnnotation.stop)))
- if show_protein_products_only:
- result = result.filter(Transcript.protein_accession != None)
- return result
- def mutation_stats(mutations, affected_samples):
- total_library_size = sum(unit.stats.bases for sample in affected_samples for unit in sample.units if unit.stats)
- if not total_library_size:
- return []
- mutation_counts = Counter(annotation.transcript for call, annotation, bed in mutations)
- mutations_per_kilobase = {transcript: count / (transcript.length / 1e3) for transcript, count in list(mutation_counts.items())}
- mutations_per_kilobase_per_million = {transcript: mpk / (total_library_size / 1e6) for transcript, mpk in list(mutations_per_kilobase.items())}
- # return sorted([(transcript, mutation_counts[transcript], mutations_per_kilobase_per_million[transcript]) for transcript in mutations_per_kilobase_per_million], key=lambda item: item[0].name)
- return sorted([(transcript, mutation_counts[transcript], mutations_per_kilobase_per_million[transcript]) for transcript in mutations_per_kilobase_per_million], key=lambda item: item[0].accession)