PageRenderTime 33ms CodeModel.GetById 14ms app.highlight 16ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/regVariation/getIndelRates_3way.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 249 lines | 221 code | 25 blank | 3 comment | 72 complexity | c4f2f0d5ad6752264faa2638d35856cf MD5 | raw file
  1#!/usr/bin/env python
  2#Guruprasad Ananda
  3
  4from galaxy import eggs
  5import pkg_resources
  6pkg_resources.require( "bx-python" )
  7
  8import sys, os, tempfile
  9import traceback
 10import fileinput
 11from warnings import warn
 12
 13from galaxy.tools.util.galaxyops import *
 14from bx.intervals.io import *
 15
 16from bx.intervals.operations import quicksect
 17
 18def stop_err(msg):
 19    sys.stderr.write(msg)
 20    sys.exit()
 21    
 22def counter(node, start, end, sort_col):
 23    global full, blk_len, blk_list
 24    if node.start < start:
 25        if node.right:
 26            counter(node.right, start, end, sort_col)
 27    elif start <= node.start <= end and start <= node.end <= end:
 28        full += 1
 29        if node.other[0] not in blk_list:
 30            blk_list.append(node.other[0])
 31            blk_len += int(node.other[sort_col+2])
 32        if node.left and node.left.maxend > start:
 33            counter(node.left, start, end, sort_col)
 34        if node.right: 
 35            counter(node.right, start, end, sort_col)
 36    elif node.start > end:
 37        if node.left: 
 38            counter(node.left, start, end, sort_col)
 39            
 40
 41infile = sys.argv[1]  
 42fout = open(sys.argv[2],'w')
 43int_file = sys.argv[3]
 44if int_file != "None": #User has specified an interval file
 45    try:
 46        fint = open(int_file, 'r')
 47        dbkey_i = sys.argv[4]
 48        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
 49    except:
 50        stop_err("Unable to open input Interval file")
 51        
 52def main():
 53
 54    for i, line in enumerate( file ( infile )):
 55        line = line.rstrip('\r\n')
 56        if len( line )>0 and not line.startswith( '#' ):
 57            elems = line.split( '\t' )
 58            break
 59        if i == 30:
 60            break # Hopefully we'll never get here...
 61        
 62    if len( elems ) != 18:
 63        stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
 64    
 65    for i, line in enumerate( file ( infile )):
 66        line = line.rstrip('\r\n')
 67        elems = line.split('\t')
 68        try:
 69            assert int(elems[0])
 70            assert len(elems) == 18
 71            if int_file != "None":
 72                if dbkey_i not in elems[3] and  dbkey_i not in elems[8] and dbkey_i not in elems[13]:
 73                    stop_err("The species build corresponding to your interval file is not present in the Indel file.") 
 74                if dbkey_i in elems[3]:
 75                    sort_col = 4
 76                elif dbkey_i in elems[8]:
 77                    sort_col = 9
 78                elif dbkey_i in elems[13]:
 79                    sort_col = 14
 80            else:
 81                species = []
 82                species.append( elems[3].split('.')[0] )
 83                species.append( elems[8].split('.')[0] )
 84                species.append( elems[13].split('.')[0] )
 85                sort_col = 0    #Based on block numbers
 86            break
 87        except:
 88            continue
 89        
 90        
 91    fin = open(infile, 'r')
 92    skipped = 0
 93    
 94    if int_file == "None":
 95        sorted_infile = tempfile.NamedTemporaryFile()
 96        cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
 97        try:
 98            os.system(cmdline)
 99        except:
100            stop_err("Encountered error while sorting the input file.")
101        print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2])
102        prev_bnum = -1
103        sorted_infile.seek(0)
104        for line in sorted_infile.readlines():
105            line = line.rstrip('\r\n')
106            elems = line.split('\t')
107            try:
108                assert int(elems[0])
109                assert len(elems) == 18
110                new_bnum = int(elems[0])
111                if new_bnum != prev_bnum:
112                    if prev_bnum != -1:
113                        irate = []
114                        drate = []
115                        for i,elem in enumerate(inserts):
116                            try:
117                                irate.append(str("%.2e" %(inserts[i]/blen[i])))
118                            except:
119                                irate.append('0')
120                            try:
121                                drate.append(str("%.2e" %(deletes[i]/blen[i])))
122                            except:
123                                drate.append('0')
124                        print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
125                    inserts = [0.0, 0.0, 0.0]
126                    deletes = [0.0, 0.0, 0.0]
127                    blen = []
128                    blen.append( int(elems[6]) )
129                    blen.append( int(elems[11]) )
130                    blen.append( int(elems[16]) )
131                line_sp = elems[1].split('.')[0]
132                sp_ind = species.index(line_sp)
133                if elems[1].endswith('insert'):
134                    inserts[sp_ind] += 1
135                elif elems[1].endswith('delete'):
136                    deletes[sp_ind] += 1
137                prev_bnum = new_bnum 
138            except Exception, ei:
139                #print >>sys.stderr, ei
140                continue
141        irate = []
142        drate = []
143        for i,elem in enumerate(inserts):
144            try:
145                irate.append(str("%.2e" %(inserts[i]/blen[i])))
146            except:
147                irate.append('0')
148            try:
149                drate.append(str("%.2e" %(deletes[i]/blen[i])))
150            except:
151                drate.append('0')
152        print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
153        sys.exit()
154    
155    
156    inf = open(infile, 'r')
157    start_met = False
158    end_met = False
159    sp_file = tempfile.NamedTemporaryFile()
160    for n, line in enumerate(inf):
161        line = line.rstrip('\r\n')
162        elems = line.split('\t')
163        try:
164            assert int(elems[0])
165            assert len(elems) == 18
166            if dbkey_i not in elems[1]: 
167                if not(start_met):   
168                    continue
169                else:
170                    sp_end = n
171                    break
172            else:
173                print >>sp_file, line
174                if not(start_met):
175                    start_met = True
176                    sp_start = n
177        except:
178            continue
179    
180    try:
181        assert sp_end
182    except:
183        sp_end = n+1
184    
185    sp_file.seek(0)
186    win = NiceReaderWrapper( fileinput.FileInput( int_file ),
187                                chrom_col=chr_col_i,
188                                start_col=start_col_i,
189                                end_col=end_col_i,
190                                strand_col=strand_col_i,
191                                fix_strand=True)
192    
193    indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
194                                chrom_col=1,
195                                start_col=sort_col,
196                                end_col=sort_col+1,
197                                strand_col=-1,
198                                fix_strand=True)
199    
200    indelTree = quicksect.IntervalTree()
201    for item in indel:
202        if type( item ) is GenomicInterval:
203            indelTree.insert( item, indel.linenum, item.fields )
204    result=[]
205    
206    global full, blk_len, blk_list
207    for interval in win:
208        if type( interval ) is Header:
209            pass
210        if type( interval ) is Comment:
211            pass
212        elif type( interval ) == GenomicInterval:
213            chrom = interval.chrom
214            start = int(interval.start)
215            end = int(interval.end)
216            if start > end: 
217                warn( "Interval start after end!" )
218            ins_chr = "%s.%s_insert" %(dbkey_i,chrom)
219            del_chr = "%s.%s_delete" %(dbkey_i,chrom)
220            irate = 0
221            drate = 0
222            if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
223                pass    
224            else:
225                if ins_chr in indelTree.chroms:
226                    full = 0.0
227                    blk_len = 0
228                    blk_list = []
229                    root = indelTree.chroms[ins_chr]    #root node for the chrom insertion tree
230                    counter(root, start, end, sort_col)
231                    if blk_len:
232                        irate = full/blk_len
233                
234                if del_chr in indelTree.chroms:
235                    full = 0.0
236                    blk_len = 0
237                    blk_list = []
238                    root = indelTree.chroms[del_chr]    #root node for the chrom insertion tree
239                    counter(root, start, end, sort_col)
240                    if blk_len:
241                        drate = full/blk_len
242                
243            interval.fields.append(str("%.2e" %irate))
244            interval.fields.append(str("%.2e" %drate))
245            print >>fout, "\t".join(interval.fields)
246            fout.flush()
247
248if __name__ == "__main__":
249    main()