PageRenderTime 46ms CodeModel.GetById 6ms app.highlight 34ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/indels/indel_sam2interval.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 161 lines | 125 code | 9 blank | 27 comment | 32 complexity | 01984c112db9dfd2aaa4374cbb4a8b35 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Allows user to filter out non-indels from SAM.
  5
  6usage: %prog [options]
  7   -i, --input=i: The input SAM file
  8   -u, --include_base=u: Whether or not to include the base for insertions
  9   -c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown
 10   -o, --int_out=o: The interval output file for the converted SAM file
 11   -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file
 12   -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file
 13"""
 14
 15import re, sys
 16from galaxy import eggs
 17import pkg_resources; pkg_resources.require( "bx-python" )
 18from bx.cookbook import doc_optparse
 19
 20
 21def stop_err( msg ):
 22    sys.stderr.write( '%s\n' % msg )
 23    sys.exit()
 24
 25def numeric_sort( text1, text2 ):
 26    """
 27    For two items containing space-separated text, compares equivalent pieces
 28    numerically if both numeric or as text otherwise
 29    """
 30    pieces1 = text1.split()
 31    pieces2 = text2.split()
 32    if len( pieces1 ) == 0:
 33        return 1
 34    if len( pieces2 ) == 0:
 35        return -1
 36    for i, pc1 in enumerate( pieces1 ):
 37        if i == len( pieces2 ):
 38            return 1
 39        if not pieces2[i].isdigit():
 40            if pc1.isdigit():
 41                return -1
 42            else:
 43                if pc1 > pieces2[i]:
 44                    return 1
 45                elif pc1 < pieces2[i]:
 46                    return -1
 47        else:
 48            if not pc1.isdigit():
 49                return 1
 50            else:
 51                if int( pc1 ) > int( pieces2[i] ):
 52                    return 1
 53                elif int( pc1 ) < int( pieces2[i] ):
 54                    return -1
 55    if i < len( pieces2 ) - 1:
 56        return -1
 57    return 0
 58
 59def __main__():
 60    #Parse Command Line
 61    options, args = doc_optparse.parse( __doc__ )
 62
 63    # open up output files
 64    output = open( options.int_out, 'wb' )
 65    if options.bed_ins_out != 'None':
 66        output_bed_ins = open( options.bed_ins_out, 'wb' )
 67    else:
 68        output_bed_ins = None
 69    if options.bed_del_out != 'None':
 70        output_bed_del = open( options.bed_del_out, 'wb' )
 71    else:
 72        output_bed_del = None
 73
 74    # the pattern to match, assuming just one indel per cigar string
 75    pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
 76    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
 77
 78    # go through all lines in input file
 79    out_data = {}
 80    multi_indel_lines = 0
 81    for line in open( options.input, 'rb' ):
 82        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
 83            split_line = line.split( '\t' )
 84            if split_line < 12:
 85                continue
 86            # grab relevant pieces
 87            cigar = split_line[5].strip()
 88            pos = int( split_line[3] )
 89            chr = split_line[2]
 90            base_string = split_line[9]
 91            # parse cigar string
 92            m = pat_indel.match( cigar )
 93            if not m:
 94                m = pat_multi.match( cigar )
 95                # skip this line if no match
 96                if not m:
 97                    continue
 98                # account for multiple indels or operations we don't process
 99                else:
100                    multi_indel_lines += 1
101                continue
102            else:
103                match = m.groupdict()
104            left = int( match[ 'lmatch' ] )
105            middle = int( match[ 'ins_del_width' ] )
106            middle_type = match[ 'ins_del' ]
107            bases = base_string[ left : left + middle ]
108            # calculate start and end positions, and output to insertion or deletion file
109            start = left + pos
110            if middle_type == 'D':
111                end = start + middle
112                data = [ chr, start, end, 'D' ]
113                if options.include_base == "true":
114                    data.append( '-' )
115            else:
116                end = start + 1
117                data = [ chr, start, end, 'I' ]
118                if options.include_base == "true":
119                    data.append( bases )
120            location = '\t'.join( [ '%s' % d for d in data ] )
121            try:
122                out_data[ location ] += 1
123            except KeyError:
124                out_data[ location ] = 1
125    # output to interval file
126    # get all locations and sort
127    locations = out_data.keys()
128    locations.sort( numeric_sort )
129    last_line = ''
130    # output each location, either with counts or each occurrence
131    for loc in locations:
132        sp_loc = loc.split( '\t' )
133        cur_line = '\t'.join( sp_loc[:3] )
134        if options.collapse == 'true':
135            output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
136            if output_bed_del and sp_loc[3] == 'D':
137                output_bed_del.write( '%s\n' % cur_line )
138            if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
139                output_bed_ins.write( '%s\n' % cur_line )
140                last_line = cur_line
141        else:
142            for i in range( out_data[ loc ] ):
143                output.write( '%s\n' % loc )
144                if output_bed_del or output_bed_ins:
145                    if output_bed_del and sp_loc[3] == 'D':
146                        output_bed_del.write( '%s\n' % cur_line )
147                    if output_bed_ins and sp_loc[3] == 'I':
148                        output_bed_ins.write( '%s\n' % cur_line )
149
150    # cleanup, close files
151    if output_bed_ins:
152        output_bed_ins.close()
153    if output_bed_del:
154        output_bed_del.close()
155    output.close()
156
157    # if skipped lines because of more than one indel, output message
158    if multi_indel_lines > 0:
159        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
160
161if __name__=="__main__": __main__()