indel_analysis.py - This Python script takes a BAM file as …

/tools/indels/indel_analysis.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 227 lines · 173 code · 7 blank · 47 comment · 73 complexity · 3fb3a2ec9975bf037f9e32060f9cd7c6 MD5 · raw file

#!/usr/bin/env python

"""
Given an input sam file, provides analysis of the indels..

usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
   -i, --input=i: The sam file to analyze
   -t, --threshold=t: The deletion frequency threshold
   -I, --out_ins=I: The interval output file showing insertions
   -D, --out_del=D: The interval output file showing deletions
"""

import re, sys
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse


def stop_err( msg ):
    sys.stderr.write( '%s\n' % msg )
    sys.exit()

def add_to_mis_matches( mis_matches, pos, bases ):
    """
    Adds the bases and counts to the mis_matches dict
    """
    for j, base in enumerate( bases ):
        try:
            mis_matches[ pos + j ][ base ] += 1
        except KeyError:
            try:
                mis_matches[ pos + j ][ base ] = 1
            except KeyError:
                mis_matches[ pos + j ] = { base: 1 }

def __main__():
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    # prep output files
    out_ins = open( options.out_ins, 'wb' )
    out_del = open( options.out_del, 'wb' )
    # patterns
    pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
    # for tracking occurences at each pos of ref
    mis_matches = {}
    indels = {}
    multi_indel_lines = 0
    # go through all lines in input file
    for i,line in enumerate( open( options.input, 'rb' ) ):
        if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
            split_line = line.split( '\t' )
            chrom = split_line[2].strip()
            pos = int( split_line[3].strip() )
            cigar = split_line[5].strip()
            bases = split_line[9].strip()
            # if not an indel or match, exit
            if chrom == '*':
                continue
            # find matches like 3M2D7M or 7M3I10M
            match = {}
            m = pat.match( cigar )
            # unprocessable CIGAR
            if not m:
                m = pat_multi.match( cigar )
                # skip this line if no match
                if not m:
                    continue
                # account for multiple indels or operations we don't process
                else:
                    multi_indel_lines += 1
            # get matching parts for the indel or full match if matching
            else:
                if not mis_matches.has_key( chrom ):
                    mis_matches[ chrom ] = {}
                    indels[ chrom ] = { 'D': {}, 'I': {} }
                parts = m.groupdict()
                if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
                    match = parts
            # see if matches meet filter requirements
            if match:
                # match/mismatch
                if parts[ 'match_width' ]:
                    add_to_mis_matches( mis_matches[ chrom ], pos, bases )
                # indel
                else:
                    # pieces of CIGAR string
                    left = int( match[ 'lmatch' ] )
                    middle = int( match[ 'ins_del_width' ] )
                    right = int( match[ 'rmatch' ] )
                    left_bases = bases[ : left ]
                    if match[ 'ins_del' ] == 'I':
                        middle_bases = bases[ left : left + middle ]
                    else:
                        middle_bases = ''
                    right_bases = bases[ -right : ]
                    start = pos + left
                    # add data to ref_pos dict for match/mismatch bases on left and on right
                    add_to_mis_matches( mis_matches[ chrom ], pos, left_bases )
                    if match[ 'ins_del' ] == 'I':
                        add_to_mis_matches( mis_matches[ chrom ], start, right_bases )
                    else:
                        add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases )
                    # for insertions, count instances of particular inserted bases
                    if match[ 'ins_del' ] == 'I':
                        if indels[ chrom ][ 'I' ].has_key( start ):
                            try:
                                indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1
                            except KeyError:
                                indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1
                        else:
                            indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 }
                    # for deletions, count number of deletions bases
                    else:
                        if indels[ chrom ][ 'D' ].has_key( start ):
                            try:
                                indels[ chrom ][ 'D' ][ start ][ middle ] += 1
                            except KeyError:
                                indels[ chrom ][ 'D' ][ start ][ middle ] = 1
                        else:
                            indels[ chrom ][ 'D' ][ start ] = { middle: 1 }
    # compute deletion frequencies and insertion frequencies for checking against threshold
    freqs = {}
    ins_freqs = {}
    chroms = mis_matches.keys()
    chroms.sort()
    for chrom in chroms:
        freqs[ chrom ] = {}
        ins_freqs[ chrom ] = {}
        poses = mis_matches[ chrom ].keys()
        poses.extend( indels[ chrom ][ 'D' ].keys() )
        poses.extend( indels[ chrom ][ 'I' ].keys() )
        poses = list( set( poses ) )
        for pos in poses:
            # all reads touching this particular position
            freqs[ chrom ][ pos ] = {}
            sum_counts = 0.0
            sum_counts_end = 0.0
            # get basic counts (match/mismatch)
            try:
                sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) )
            except KeyError:
                pass
            try:
                sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
            except KeyError:
                pass
            # add deletions also touching this position
            try:
                sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
            except KeyError:
                pass
            try:
                sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
            except KeyError:
                pass
            freqs[ chrom ][ pos ][ 'total' ] = sum_counts
            # calculate actual frequencies
            # deletions
            # frequencies for deletions
            try:
                for d in indels[ chrom ][ 'D' ][ pos ].keys():
                    freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
            except KeyError:
                pass
            # frequencies for matches/mismatches
            try:
                for base in mis_matches[ chrom ][ pos ].keys():
                    try:
                        prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
                        freqs[ chrom ][ pos ][ base ] = prop
                    except ZeroDivisionError:
                        freqs[ chrom ][ pos ][ base ] = 0.0
            except KeyError:
                pass
            # insertions
            try:
                for bases in indels[ chrom ][ 'I' ][ pos ].keys():
                    prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
                    try:
                        prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
                    except ZeroDivisionError:
                        prop_end = 0.0
                    try:
                        ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ]
                    except KeyError:
                        ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] }
            except KeyError, e:
                pass
    # output to files if meet threshold requirement
    threshold = float( options.threshold )
    #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
    #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
    for chrom in chroms:
        # deletions file
        poses = indels[ chrom ][ 'D' ].keys()
        poses.sort()
        for pos in poses:
            start = pos
            dels = indels[ chrom ][ 'D' ][ start ].keys()
            dels.sort()
            for d in dels:
                end = start + d
                prop = freqs[ chrom ][ start ][ d ]
                if prop > threshold :
                    out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
        # insertions file
        poses = indels[ chrom ][ 'I' ].keys()
        poses.sort()
        for pos in poses:
            start = pos
            end = pos + 1
            ins_bases = indels[ chrom ][ 'I' ][ start ].keys()
            ins_bases.sort()
            for bases in ins_bases:
                prop_start = ins_freqs[ chrom ][ start ][ bases ][0]
                prop_end = ins_freqs[ chrom ][ start ][ bases ][1]
                if prop_start > threshold or prop_end > threshold:
                    out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) )
    # close out files
    out_del.close()
    out_ins.close()
    # if skipped lines because of more than one indel, output message
    if multi_indel_lines > 0:
        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )

if __name__=="__main__": __main__()
Summary ✨

This Python script takes a BAM file as input and outputs two files: one with deletions, and another with insertions. It uses the samtools command-line tool to parse the BAM file and extract information about indels. The output files are in tab-separated value (TSV) format and contain columns for chromosome, start position, end position, number of reads supporting the indel, and percentage of total reads that support it.
Tech Fingerprint

Alerts (7)

'def' Ensure functions have docstrings for documentation
19
'open(' Use 'with open()' to ensure Files are properly closed
40 41 50
Complexity hotspot; lines 78 to 79 (total complexity: 5)
78 79
'list(' Avoid unnecessary list conversions; use generators where possible
133