indel_sam2interval.py - This Python script takes a SAM file…

/tools/indels/indel_sam2interval.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 161 lines · 119 code · 12 blank · 30 comment · 55 complexity · 01984c112db9dfd2aaa4374cbb4a8b35 MD5 · raw file


#!/usr/bin/env python

"""
Allows user to filter out non-indels from SAM.

usage: %prog [options]
   -i, --input=i: The input SAM file
   -u, --include_base=u: Whether or not to include the base for insertions
   -c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown
   -o, --int_out=o: The interval output file for the converted SAM file
   -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file
   -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file
"""

import re, sys
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse


def stop_err( msg ):
    sys.stderr.write( '%s\n' % msg )
    sys.exit()

def numeric_sort( text1, text2 ):
    """
    For two items containing space-separated text, compares equivalent pieces
    numerically if both numeric or as text otherwise
    """
    pieces1 = text1.split()
    pieces2 = text2.split()
    if len( pieces1 ) == 0:
        return 1
    if len( pieces2 ) == 0:
        return -1
    for i, pc1 in enumerate( pieces1 ):
        if i == len( pieces2 ):
            return 1
        if not pieces2[i].isdigit():
            if pc1.isdigit():
                return -1
            else:
                if pc1 > pieces2[i]:
                    return 1
                elif pc1 < pieces2[i]:
                    return -1
        else:
            if not pc1.isdigit():
                return 1
            else:
                if int( pc1 ) > int( pieces2[i] ):
                    return 1
                elif int( pc1 ) < int( pieces2[i] ):
                    return -1
    if i < len( pieces2 ) - 1:
        return -1
    return 0

def __main__():
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )

    # open up output files
    output = open( options.int_out, 'wb' )
    if options.bed_ins_out != 'None':
        output_bed_ins = open( options.bed_ins_out, 'wb' )
    else:
        output_bed_ins = None
    if options.bed_del_out != 'None':
        output_bed_del = open( options.bed_del_out, 'wb' )
    else:
        output_bed_del = None

    # the pattern to match, assuming just one indel per cigar string
    pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )

    # go through all lines in input file
    out_data = {}
    multi_indel_lines = 0
    for line in open( options.input, 'rb' ):
        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
            split_line = line.split( '\t' )
            if split_line < 12:
                continue
            # grab relevant pieces
            cigar = split_line[5].strip()
            pos = int( split_line[3] )
            chr = split_line[2]
            base_string = split_line[9]
            # parse cigar string
            m = pat_indel.match( cigar )
            if not m:
                m = pat_multi.match( cigar )
                # skip this line if no match
                if not m:
                    continue
                # account for multiple indels or operations we don't process
                else:
                    multi_indel_lines += 1
                continue
            else:
                match = m.groupdict()
            left = int( match[ 'lmatch' ] )
            middle = int( match[ 'ins_del_width' ] )
            middle_type = match[ 'ins_del' ]
            bases = base_string[ left : left + middle ]
            # calculate start and end positions, and output to insertion or deletion file
            start = left + pos
            if middle_type == 'D':
                end = start + middle
                data = [ chr, start, end, 'D' ]
                if options.include_base == "true":
                    data.append( '-' )
            else:
                end = start + 1
                data = [ chr, start, end, 'I' ]
                if options.include_base == "true":
                    data.append( bases )
            location = '\t'.join( [ '%s' % d for d in data ] )
            try:
                out_data[ location ] += 1
            except KeyError:
                out_data[ location ] = 1
    # output to interval file
    # get all locations and sort
    locations = out_data.keys()
    locations.sort( numeric_sort )
    last_line = ''
    # output each location, either with counts or each occurrence
    for loc in locations:
        sp_loc = loc.split( '\t' )
        cur_line = '\t'.join( sp_loc[:3] )
        if options.collapse == 'true':
            output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
            if output_bed_del and sp_loc[3] == 'D':
                output_bed_del.write( '%s\n' % cur_line )
            if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
                output_bed_ins.write( '%s\n' % cur_line )
                last_line = cur_line
        else:
            for i in range( out_data[ loc ] ):
                output.write( '%s\n' % loc )
                if output_bed_del or output_bed_ins:
                    if output_bed_del and sp_loc[3] == 'D':
                        output_bed_del.write( '%s\n' % cur_line )
                    if output_bed_ins and sp_loc[3] == 'I':
                        output_bed_ins.write( '%s\n' % cur_line )

    # cleanup, close files
    if output_bed_ins:
        output_bed_ins.close()
    if output_bed_del:
        output_bed_del.close()
    output.close()

    # if skipped lines because of more than one indel, output message
    if multi_indel_lines > 0:
        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )

if __name__=="__main__": __main__()

Summary ✨

This Python script takes a SAM file as input and outputs two files: an interval file with counts of insertions and deletions, and a BED file with insertions only or deletions only. The script ignores alignments that contain more than one indel.

Tech Fingerprint

Alerts (9)

'def' Ensure functions have docstrings for documentation
21
'open(' Use 'with open()' to ensure Files are properly closed
64 66 70 81
Complexity hotspot; lines 81 to 82 (total complexity: 4)
81 82
Complexity hotspot; lines 144 to 145 (total complexity: 4)
144 145