PageRenderTime 63ms CodeModel.GetById 11ms app.highlight 48ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/indels/indel_analysis.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 227 lines | 187 code | 7 blank | 33 comment | 38 complexity | 3fb3a2ec9975bf037f9e32060f9cd7c6 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Given an input sam file, provides analysis of the indels..
  5
  6usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
  7   -i, --input=i: The sam file to analyze
  8   -t, --threshold=t: The deletion frequency threshold
  9   -I, --out_ins=I: The interval output file showing insertions
 10   -D, --out_del=D: The interval output file showing deletions
 11"""
 12
 13import re, sys
 14from galaxy import eggs
 15import pkg_resources; pkg_resources.require( "bx-python" )
 16from bx.cookbook import doc_optparse
 17
 18
 19def stop_err( msg ):
 20    sys.stderr.write( '%s\n' % msg )
 21    sys.exit()
 22
 23def add_to_mis_matches( mis_matches, pos, bases ):
 24    """
 25    Adds the bases and counts to the mis_matches dict
 26    """
 27    for j, base in enumerate( bases ):
 28        try:
 29            mis_matches[ pos + j ][ base ] += 1
 30        except KeyError:
 31            try:
 32                mis_matches[ pos + j ][ base ] = 1
 33            except KeyError:
 34                mis_matches[ pos + j ] = { base: 1 }
 35
 36def __main__():
 37    #Parse Command Line
 38    options, args = doc_optparse.parse( __doc__ )
 39    # prep output files
 40    out_ins = open( options.out_ins, 'wb' )
 41    out_del = open( options.out_del, 'wb' )
 42    # patterns
 43    pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
 44    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
 45    # for tracking occurences at each pos of ref
 46    mis_matches = {}
 47    indels = {}
 48    multi_indel_lines = 0
 49    # go through all lines in input file
 50    for i,line in enumerate( open( options.input, 'rb' ) ):
 51        if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
 52            split_line = line.split( '\t' )
 53            chrom = split_line[2].strip()
 54            pos = int( split_line[3].strip() )
 55            cigar = split_line[5].strip()
 56            bases = split_line[9].strip()
 57            # if not an indel or match, exit
 58            if chrom == '*':
 59                continue
 60            # find matches like 3M2D7M or 7M3I10M
 61            match = {}
 62            m = pat.match( cigar )
 63            # unprocessable CIGAR
 64            if not m:
 65                m = pat_multi.match( cigar )
 66                # skip this line if no match
 67                if not m:
 68                    continue
 69                # account for multiple indels or operations we don't process
 70                else:
 71                    multi_indel_lines += 1
 72            # get matching parts for the indel or full match if matching
 73            else:
 74                if not mis_matches.has_key( chrom ):
 75                    mis_matches[ chrom ] = {}
 76                    indels[ chrom ] = { 'D': {}, 'I': {} }
 77                parts = m.groupdict()
 78                if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
 79                    match = parts
 80            # see if matches meet filter requirements
 81            if match:
 82                # match/mismatch
 83                if parts[ 'match_width' ]:
 84                    add_to_mis_matches( mis_matches[ chrom ], pos, bases )
 85                # indel
 86                else:
 87                    # pieces of CIGAR string
 88                    left = int( match[ 'lmatch' ] )
 89                    middle = int( match[ 'ins_del_width' ] )
 90                    right = int( match[ 'rmatch' ] )
 91                    left_bases = bases[ : left ]
 92                    if match[ 'ins_del' ] == 'I':
 93                        middle_bases = bases[ left : left + middle ]
 94                    else:
 95                        middle_bases = ''
 96                    right_bases = bases[ -right : ]
 97                    start = pos + left
 98                    # add data to ref_pos dict for match/mismatch bases on left and on right
 99                    add_to_mis_matches( mis_matches[ chrom ], pos, left_bases )
100                    if match[ 'ins_del' ] == 'I':
101                        add_to_mis_matches( mis_matches[ chrom ], start, right_bases )
102                    else:
103                        add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases )
104                    # for insertions, count instances of particular inserted bases
105                    if match[ 'ins_del' ] == 'I':
106                        if indels[ chrom ][ 'I' ].has_key( start ):
107                            try:
108                                indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1
109                            except KeyError:
110                                indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1
111                        else:
112                            indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 }
113                    # for deletions, count number of deletions bases
114                    else:
115                        if indels[ chrom ][ 'D' ].has_key( start ):
116                            try:
117                                indels[ chrom ][ 'D' ][ start ][ middle ] += 1
118                            except KeyError:
119                                indels[ chrom ][ 'D' ][ start ][ middle ] = 1
120                        else:
121                            indels[ chrom ][ 'D' ][ start ] = { middle: 1 }
122    # compute deletion frequencies and insertion frequencies for checking against threshold
123    freqs = {}
124    ins_freqs = {}
125    chroms = mis_matches.keys()
126    chroms.sort()
127    for chrom in chroms:
128        freqs[ chrom ] = {}
129        ins_freqs[ chrom ] = {}
130        poses = mis_matches[ chrom ].keys()
131        poses.extend( indels[ chrom ][ 'D' ].keys() )
132        poses.extend( indels[ chrom ][ 'I' ].keys() )
133        poses = list( set( poses ) )
134        for pos in poses:
135            # all reads touching this particular position
136            freqs[ chrom ][ pos ] = {}
137            sum_counts = 0.0
138            sum_counts_end = 0.0
139            # get basic counts (match/mismatch)
140            try:
141                sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) )
142            except KeyError:
143                pass
144            try:
145                sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
146            except KeyError:
147                pass
148            # add deletions also touching this position
149            try:
150                sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
151            except KeyError:
152                pass
153            try:
154                sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
155            except KeyError:
156                pass
157            freqs[ chrom ][ pos ][ 'total' ] = sum_counts
158            # calculate actual frequencies
159            # deletions
160            # frequencies for deletions
161            try:
162                for d in indels[ chrom ][ 'D' ][ pos ].keys():
163                    freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
164            except KeyError:
165                pass
166            # frequencies for matches/mismatches
167            try:
168                for base in mis_matches[ chrom ][ pos ].keys():
169                    try:
170                        prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
171                        freqs[ chrom ][ pos ][ base ] = prop
172                    except ZeroDivisionError:
173                        freqs[ chrom ][ pos ][ base ] = 0.0
174            except KeyError:
175                pass
176            # insertions
177            try:
178                for bases in indels[ chrom ][ 'I' ][ pos ].keys():
179                    prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
180                    try:
181                        prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
182                    except ZeroDivisionError:
183                        prop_end = 0.0
184                    try:
185                        ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ]
186                    except KeyError:
187                        ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] }
188            except KeyError, e:
189                pass
190    # output to files if meet threshold requirement
191    threshold = float( options.threshold )
192    #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
193    #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
194    for chrom in chroms:
195        # deletions file
196        poses = indels[ chrom ][ 'D' ].keys()
197        poses.sort()
198        for pos in poses:
199            start = pos
200            dels = indels[ chrom ][ 'D' ][ start ].keys()
201            dels.sort()
202            for d in dels:
203                end = start + d
204                prop = freqs[ chrom ][ start ][ d ]
205                if prop > threshold :
206                    out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
207        # insertions file
208        poses = indels[ chrom ][ 'I' ].keys()
209        poses.sort()
210        for pos in poses:
211            start = pos
212            end = pos + 1
213            ins_bases = indels[ chrom ][ 'I' ][ start ].keys()
214            ins_bases.sort()
215            for bases in ins_bases:
216                prop_start = ins_freqs[ chrom ][ start ][ bases ][0]
217                prop_end = ins_freqs[ chrom ][ start ][ bases ][1]
218                if prop_start > threshold or prop_end > threshold:
219                    out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) )
220    # close out files
221    out_del.close()
222    out_ins.close()
223    # if skipped lines because of more than one indel, output message
224    if multi_indel_lines > 0:
225        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
226
227if __name__=="__main__": __main__()