/tools/regVariation/getIndelRates_3way.py
Python | 249 lines | 221 code | 25 blank | 3 comment | 72 complexity | c4f2f0d5ad6752264faa2638d35856cf MD5 | raw file
1#!/usr/bin/env python 2#Guruprasad Ananda 3 4from galaxy import eggs 5import pkg_resources 6pkg_resources.require( "bx-python" ) 7 8import sys, os, tempfile 9import traceback 10import fileinput 11from warnings import warn 12 13from galaxy.tools.util.galaxyops import * 14from bx.intervals.io import * 15 16from bx.intervals.operations import quicksect 17 18def stop_err(msg): 19 sys.stderr.write(msg) 20 sys.exit() 21 22def counter(node, start, end, sort_col): 23 global full, blk_len, blk_list 24 if node.start < start: 25 if node.right: 26 counter(node.right, start, end, sort_col) 27 elif start <= node.start <= end and start <= node.end <= end: 28 full += 1 29 if node.other[0] not in blk_list: 30 blk_list.append(node.other[0]) 31 blk_len += int(node.other[sort_col+2]) 32 if node.left and node.left.maxend > start: 33 counter(node.left, start, end, sort_col) 34 if node.right: 35 counter(node.right, start, end, sort_col) 36 elif node.start > end: 37 if node.left: 38 counter(node.left, start, end, sort_col) 39 40 41infile = sys.argv[1] 42fout = open(sys.argv[2],'w') 43int_file = sys.argv[3] 44if int_file != "None": #User has specified an interval file 45 try: 46 fint = open(int_file, 'r') 47 dbkey_i = sys.argv[4] 48 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) 49 except: 50 stop_err("Unable to open input Interval file") 51 52def main(): 53 54 for i, line in enumerate( file ( infile )): 55 line = line.rstrip('\r\n') 56 if len( line )>0 and not line.startswith( '#' ): 57 elems = line.split( '\t' ) 58 break 59 if i == 30: 60 break # Hopefully we'll never get here... 61 62 if len( elems ) != 18: 63 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." ) 64 65 for i, line in enumerate( file ( infile )): 66 line = line.rstrip('\r\n') 67 elems = line.split('\t') 68 try: 69 assert int(elems[0]) 70 assert len(elems) == 18 71 if int_file != "None": 72 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]: 73 stop_err("The species build corresponding to your interval file is not present in the Indel file.") 74 if dbkey_i in elems[3]: 75 sort_col = 4 76 elif dbkey_i in elems[8]: 77 sort_col = 9 78 elif dbkey_i in elems[13]: 79 sort_col = 14 80 else: 81 species = [] 82 species.append( elems[3].split('.')[0] ) 83 species.append( elems[8].split('.')[0] ) 84 species.append( elems[13].split('.')[0] ) 85 sort_col = 0 #Based on block numbers 86 break 87 except: 88 continue 89 90 91 fin = open(infile, 'r') 92 skipped = 0 93 94 if int_file == "None": 95 sorted_infile = tempfile.NamedTemporaryFile() 96 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile 97 try: 98 os.system(cmdline) 99 except: 100 stop_err("Encountered error while sorting the input file.") 101 print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2]) 102 prev_bnum = -1 103 sorted_infile.seek(0) 104 for line in sorted_infile.readlines(): 105 line = line.rstrip('\r\n') 106 elems = line.split('\t') 107 try: 108 assert int(elems[0]) 109 assert len(elems) == 18 110 new_bnum = int(elems[0]) 111 if new_bnum != prev_bnum: 112 if prev_bnum != -1: 113 irate = [] 114 drate = [] 115 for i,elem in enumerate(inserts): 116 try: 117 irate.append(str("%.2e" %(inserts[i]/blen[i]))) 118 except: 119 irate.append('0') 120 try: 121 drate.append(str("%.2e" %(deletes[i]/blen[i]))) 122 except: 123 drate.append('0') 124 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) 125 inserts = [0.0, 0.0, 0.0] 126 deletes = [0.0, 0.0, 0.0] 127 blen = [] 128 blen.append( int(elems[6]) ) 129 blen.append( int(elems[11]) ) 130 blen.append( int(elems[16]) ) 131 line_sp = elems[1].split('.')[0] 132 sp_ind = species.index(line_sp) 133 if elems[1].endswith('insert'): 134 inserts[sp_ind] += 1 135 elif elems[1].endswith('delete'): 136 deletes[sp_ind] += 1 137 prev_bnum = new_bnum 138 except Exception, ei: 139 #print >>sys.stderr, ei 140 continue 141 irate = [] 142 drate = [] 143 for i,elem in enumerate(inserts): 144 try: 145 irate.append(str("%.2e" %(inserts[i]/blen[i]))) 146 except: 147 irate.append('0') 148 try: 149 drate.append(str("%.2e" %(deletes[i]/blen[i]))) 150 except: 151 drate.append('0') 152 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) 153 sys.exit() 154 155 156 inf = open(infile, 'r') 157 start_met = False 158 end_met = False 159 sp_file = tempfile.NamedTemporaryFile() 160 for n, line in enumerate(inf): 161 line = line.rstrip('\r\n') 162 elems = line.split('\t') 163 try: 164 assert int(elems[0]) 165 assert len(elems) == 18 166 if dbkey_i not in elems[1]: 167 if not(start_met): 168 continue 169 else: 170 sp_end = n 171 break 172 else: 173 print >>sp_file, line 174 if not(start_met): 175 start_met = True 176 sp_start = n 177 except: 178 continue 179 180 try: 181 assert sp_end 182 except: 183 sp_end = n+1 184 185 sp_file.seek(0) 186 win = NiceReaderWrapper( fileinput.FileInput( int_file ), 187 chrom_col=chr_col_i, 188 start_col=start_col_i, 189 end_col=end_col_i, 190 strand_col=strand_col_i, 191 fix_strand=True) 192 193 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), 194 chrom_col=1, 195 start_col=sort_col, 196 end_col=sort_col+1, 197 strand_col=-1, 198 fix_strand=True) 199 200 indelTree = quicksect.IntervalTree() 201 for item in indel: 202 if type( item ) is GenomicInterval: 203 indelTree.insert( item, indel.linenum, item.fields ) 204 result=[] 205 206 global full, blk_len, blk_list 207 for interval in win: 208 if type( interval ) is Header: 209 pass 210 if type( interval ) is Comment: 211 pass 212 elif type( interval ) == GenomicInterval: 213 chrom = interval.chrom 214 start = int(interval.start) 215 end = int(interval.end) 216 if start > end: 217 warn( "Interval start after end!" ) 218 ins_chr = "%s.%s_insert" %(dbkey_i,chrom) 219 del_chr = "%s.%s_delete" %(dbkey_i,chrom) 220 irate = 0 221 drate = 0 222 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms: 223 pass 224 else: 225 if ins_chr in indelTree.chroms: 226 full = 0.0 227 blk_len = 0 228 blk_list = [] 229 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree 230 counter(root, start, end, sort_col) 231 if blk_len: 232 irate = full/blk_len 233 234 if del_chr in indelTree.chroms: 235 full = 0.0 236 blk_len = 0 237 blk_list = [] 238 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree 239 counter(root, start, end, sort_col) 240 if blk_len: 241 drate = full/blk_len 242 243 interval.fields.append(str("%.2e" %irate)) 244 interval.fields.append(str("%.2e" %drate)) 245 print >>fout, "\t".join(interval.fields) 246 fout.flush() 247 248if __name__ == "__main__": 249 main()