PageRenderTime 47ms CodeModel.GetById 2ms app.highlight 40ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/regVariation/microsats_mutability.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 489 lines | 484 code | 2 blank | 3 comment | 10 complexity | 2266701224b961d46a7cef7392d6e42f MD5 | raw file
  1#!/usr/bin/env python
  2#Guruprasad Ananda
  3"""
  4This tool computes microsatellite mutability for the orthologous microsatellites fetched from  'Extract Orthologous Microsatellites from pair-wise alignments' tool.
  5"""
  6from galaxy import eggs
  7import sys, string, re, commands, tempfile, os, fileinput
  8from galaxy.tools.util.galaxyops import *
  9from bx.intervals.io import *
 10from bx.intervals.operations import quicksect
 11
 12fout = open(sys.argv[2],'w')
 13p_group = int(sys.argv[3])        #primary "group-by" feature
 14p_bin_size = int(sys.argv[4])
 15s_group = int(sys.argv[5])        #sub-group by feature
 16s_bin_size = int(sys.argv[6])
 17mono_threshold = 9
 18non_mono_threshold = 4
 19p_group_cols = [p_group, p_group+7]
 20s_group_cols = [s_group, s_group+7]
 21num_generations = int(sys.argv[7])
 22region = sys.argv[8] 
 23int_file = sys.argv[9]
 24if int_file != "None": #User has specified an interval file
 25    try:
 26        fint = open(int_file, 'r')
 27        dbkey_i = sys.argv[10]
 28        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
 29    except:
 30        stop_err("Unable to open input Interval file")
 31    
 32def stop_err(msg):
 33    sys.stderr.write(msg)
 34    sys.exit()
 35
 36def reverse_complement(text):
 37    DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
 38    comp = [ch for ch in text.translate(DNA_COMP)]
 39    comp.reverse()
 40    return "".join(comp)
 41
 42def get_unique_elems(elems):
 43    seen=set()
 44    return[x for x in elems if x not in seen and not seen.add(x)]
 45
 46def get_binned_lists(uniqlist, binsize):
 47    binnedlist=[]
 48    uniqlist.sort()
 49    start = int(uniqlist[0])
 50    bin_ind=0
 51    l_ind=0
 52    binnedlist.append([])
 53    while l_ind < len(uniqlist):
 54        elem = int(uniqlist[l_ind])
 55        if elem in range(start,start+binsize):
 56            binnedlist[bin_ind].append(elem)
 57        else:
 58            start += binsize
 59            bin_ind += 1
 60            binnedlist.append([])
 61            binnedlist[bin_ind].append(elem)
 62        l_ind += 1
 63    return binnedlist
 64
 65def fetch_weight(H,C,t):
 66    if (H-(C-H)) < t:
 67        return 2.0
 68    else:
 69        return 1.0
 70
 71def mutabilityEstimator(repeats1,repeats2,thresholds):
 72    mut_num = 0.0    #Mutability Numerator
 73    mut_den = 0.0    #Mutability denominator
 74    for ind,H in enumerate(repeats1):
 75        C = repeats2[ind]
 76        t = thresholds[ind]
 77        w = fetch_weight(H,C,t)
 78        mut_num += ((H-C)*(H-C)*w)
 79        mut_den += w
 80    return [mut_num, mut_den]
 81
 82def output_writer(blk, blk_lines):
 83    global winspecies, speciesind
 84    all_elems_1=[]
 85    all_elems_2=[]
 86    all_s_elems_1=[]
 87    all_s_elems_2=[]
 88    for bline in blk_lines:
 89        if not(bline):
 90            continue
 91        items = bline.split('\t')
 92        seq1 = items[1]
 93        start1 = items[2]
 94        end1 = items[3]
 95        seq2 = items[8]
 96        start2 = items[9]
 97        end2 = items[10] 
 98        if p_group_cols[0] == 6:
 99            items[p_group_cols[0]] = int(items[p_group_cols[0]])
100            items[p_group_cols[1]] = int(items[p_group_cols[1]])
101        if s_group_cols[0] == 6:
102            items[s_group_cols[0]] = int(items[s_group_cols[0]])
103            items[s_group_cols[1]] = int(items[s_group_cols[1]])
104        all_elems_1.append(items[p_group_cols[0]])    #primary col elements for species 1
105        all_elems_2.append(items[p_group_cols[1]])    #primary col elements for species 2
106        if s_group_cols[0] != -1:    #sub-group is not None
107            all_s_elems_1.append(items[s_group_cols[0]])    #secondary col elements for species 1
108            all_s_elems_2.append(items[s_group_cols[1]])    #secondary col elements for species 2
109    uniq_elems_1 = get_unique_elems(all_elems_1)
110    uniq_elems_2 = get_unique_elems(all_elems_2)
111    if s_group_cols[0] != -1:
112        uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
113        uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
114    mut1={}
115    mut2={}
116    count1 = {}
117    count2 = {}
118    """
119    if p_group_cols[0] == 7:    #i.e. the option chosen is group-by unit(AG, GTC, etc)
120        uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y)))
121    """
122    if p_group_cols[0] == 6:    #i.e. the option chosen is group-by repeat number.
123        uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size)
124        uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size)
125        
126    if s_group_cols[0] == 6:    #i.e. the option chosen is subgroup-by repeat number.
127        uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size)
128        uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size)
129
130    for pitem1 in uniq_elems_1:
131        #repeats1 = []
132        #repeats2 = []
133        thresholds = []
134        if s_group_cols[0] != -1:    #Sub-group by feature is not None
135            for sitem1 in uniq_s_elems_1:
136                repeats1 = []
137                repeats2 = []
138                if type(sitem1) == type(''):
139                    sitem1 = sitem1.strip()
140                for bline in blk_lines:
141                    belems = bline.split('\t')
142                    if type(pitem1) == list:
143                        if p_group_cols[0] == 6:
144                            belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
145                        if belems[p_group_cols[0]] in pitem1:
146                            if belems[s_group_cols[0]]==sitem1:
147                                repeats1.append(int(belems[6]))
148                                repeats2.append(int(belems[13]))
149                                if belems[4] == 'mononucleotide':
150                                    thresholds.append(mono_threshold)
151                                else:
152                                    thresholds.append(non_mono_threshold)
153                                mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
154                                if region == 'align':
155                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
156                                else:    
157                                    if winspecies == 1:
158                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
159                                    elif winspecies == 2:
160                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
161                    else:
162                        if type(sitem1) == list:
163                            if s_group_cols[0] == 6:
164                                belems[s_group_cols[0]] = int(belems[s_group_cols[0]])
165                            if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1:
166                                repeats1.append(int(belems[6]))
167                                repeats2.append(int(belems[13]))
168                                if belems[4] == 'mononucleotide':
169                                    thresholds.append(mono_threshold)
170                                else:
171                                    thresholds.append(non_mono_threshold)
172                                mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
173                                if region == 'align':
174                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
175                                else:    
176                                    if winspecies == 1:
177                                        count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1)
178                                    elif winspecies == 2:
179                                        count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2)
180                        else:
181                            if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1:
182                                repeats1.append(int(belems[6]))
183                                repeats2.append(int(belems[13]))
184                                if belems[4] == 'mononucleotide':
185                                    thresholds.append(mono_threshold)
186                                else:
187                                    thresholds.append(non_mono_threshold)
188                                mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
189                                if region == 'align':
190                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
191                                else:    
192                                    if winspecies == 1:
193                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
194                                    elif winspecies == 2:
195                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
196        else:   #Sub-group by feature is None
197            for bline in blk_lines:
198                belems = bline.split('\t')
199                if type(pitem1) == list:
200                    #print >>sys.stderr, "item: " + str(item1)
201                    if p_group_cols[0] == 6:
202                        belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
203                    if belems[p_group_cols[0]] in pitem1:
204                        repeats1.append(int(belems[6]))
205                        repeats2.append(int(belems[13]))
206                        if belems[4] == 'mononucleotide':
207                            thresholds.append(mono_threshold)
208                        else:
209                            thresholds.append(non_mono_threshold)
210                else:
211                    if belems[p_group_cols[0]]==pitem1:
212                        repeats1.append(int(belems[6]))
213                        repeats2.append(int(belems[13]))
214                        if belems[4] == 'mononucleotide':
215                            thresholds.append(mono_threshold)
216                        else:
217                            thresholds.append(non_mono_threshold)
218            mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
219            if region == 'align':
220                count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2))
221            else:            
222                if winspecies == 1:
223                    count1[str(pitem1)]=sum(repeats1)
224                elif winspecies == 2:
225                    count1[str(pitem1)]=sum(repeats2)
226                
227    for pitem2 in uniq_elems_2:
228        #repeats1 = []
229        #repeats2 = []
230        thresholds = []
231        if s_group_cols[0] != -1:    #Sub-group by feature is not None
232            for sitem2 in uniq_s_elems_2:
233                repeats1 = []
234                repeats2 = []
235                if type(sitem2)==type(''):
236                    sitem2 = sitem2.strip()
237                for bline in blk_lines:
238                    belems = bline.split('\t')
239                    if type(pitem2) == list:
240                        if p_group_cols[0] == 6:
241                            belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
242                        if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2:
243                            repeats2.append(int(belems[13]))
244                            repeats1.append(int(belems[6]))
245                            if belems[4] == 'mononucleotide':
246                                thresholds.append(mono_threshold)
247                            else:
248                                thresholds.append(non_mono_threshold)
249                            mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
250                            #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2)
251                            if region == 'align':
252                                count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
253                            else: 
254                                if winspecies == 1:
255                                    count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
256                                elif winspecies == 2:
257                                    count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
258                    else:
259                        if type(sitem2) == list:
260                            if s_group_cols[0] == 6:
261                                belems[s_group_cols[1]] = int(belems[s_group_cols[1]])
262                            if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2:
263                                repeats2.append(int(belems[13]))
264                                repeats1.append(int(belems[6]))
265                                if belems[4] == 'mononucleotide':
266                                    thresholds.append(mono_threshold)
267                                else:
268                                    thresholds.append(non_mono_threshold)
269                                mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
270                                if region == 'align':
271                                    count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
272                                else: 
273                                    if winspecies == 1:
274                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
275                                    elif winspecies == 2:
276                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
277                        else:
278                            if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2:
279                                repeats1.append(int(belems[13]))
280                                repeats2.append(int(belems[6]))
281                                if belems[4] == 'mononucleotide':
282                                    thresholds.append(mono_threshold)
283                                else:
284                                    thresholds.append(non_mono_threshold)
285                                mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
286                                if region == 'align':
287                                    count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
288                                else: 
289                                    if winspecies == 1:
290                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
291                                    elif winspecies == 2:
292                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
293        else:   #Sub-group by feature is None
294            for bline in blk_lines:
295                belems = bline.split('\t')
296                if type(pitem2) == list:
297                    if p_group_cols[0] == 6:
298                        belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
299                    if belems[p_group_cols[1]] in pitem2:
300                        repeats2.append(int(belems[13]))
301                        repeats1.append(int(belems[6]))
302                        if belems[4] == 'mononucleotide':
303                            thresholds.append(mono_threshold)
304                        else:
305                            thresholds.append(non_mono_threshold)
306                else:
307                    if belems[p_group_cols[1]]==pitem2:
308                        repeats2.append(int(belems[13]))
309                        repeats1.append(int(belems[6]))
310                        if belems[4] == 'mononucleotide':
311                            thresholds.append(mono_threshold)
312                        else:
313                            thresholds.append(non_mono_threshold)
314            mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
315            if region == 'align':
316                count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2))
317            else:
318                if winspecies == 1:
319                    count2["%s" %(pitem2)]=sum(repeats2)
320                elif winspecies == 2:
321                    count2["%s" %(pitem2)]=sum(repeats1)
322    for key in mut1.keys():
323        if key in mut2.keys():
324            mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1])
325            count = count1[key]
326            del mut2[key]
327        else:
328            unit_found = False
329            if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too
330                if p_group_cols[0] == 7:
331                    this,other = 0,1
332                else:
333                    this,other = 1,0
334                groups1 = key.split('\t')
335                mutn = mut1[key][0]
336                mutd = mut1[key][1]
337                count = 0
338                for key2 in mut2.keys():
339                    groups2 = key2.split('\t')
340                    if groups1[other] == groups2[other]:
341                        if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2:
342                            #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1])
343                            mutn += mut2[key2][0]
344                            mutd += mut2[key2][1]
345                            count += int(count2[key2])
346                            unit_found = True
347                            del mut2[key2]
348                            #break
349            if unit_found:
350                mut = mutn/mutd
351            else:
352                mut = mut1[key][0]/mut1[key][1]
353                count = count1[key]
354        mut = "%.2e" %(mut/num_generations)
355        if region == 'align':
356            print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count)
357        elif region == 'win':
358            fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count))
359            fout.flush()
360            
361    #catch any remaining repeats, for instance if the orthologous position contained different repeat units
362    for remaining_key in mut2.keys():
363        mut = mut2[remaining_key][0]/mut2[remaining_key][1]
364        mut = "%.2e" %(mut/num_generations)
365        count = count2[remaining_key]
366        if region == 'align':
367            print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
368        elif region == 'win':
369            fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count))
370            fout.flush()
371            #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
372
373def counter(node, start, end, report_func):
374    if start <= node.start < end and start < node.end <= end:
375        report_func(node) 
376        if node.right:
377            counter(node.right, start, end, report_func)
378        if node.left:
379            counter(node.left, start, end, report_func)
380    elif node.start < start and node.right:
381        counter(node.right, start, end, report_func)
382    elif node.start >= end and node.left and node.left.maxend > start:
383        counter(node.left, start, end, report_func)
384            
385        
386def main():
387    infile = sys.argv[1]
388    
389    for i, line in enumerate( file ( infile )):
390        line = line.rstrip('\r\n')
391        if len( line )>0 and not line.startswith( '#' ):
392            elems = line.split( '\t' )
393            break
394        if i == 30:
395            break # Hopefully we'll never get here...
396    
397    if len( elems ) != 15:
398        stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )
399    global winspecies, speciesind
400    if region == 'win':
401        if dbkey_i in elems[1]:
402            winspecies = 1
403            speciesind = 1 
404        elif dbkey_i in elems[8]:
405            winspecies = 2
406            speciesind = 8
407        else:
408            stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") 
409        
410    fin = open(infile, 'r')
411    skipped = 0
412    blk=0
413    win=0
414    linestr=""
415    
416    if region == 'win':
417        
418        msats = NiceReaderWrapper( fileinput.FileInput( infile ),
419                                chrom_col = speciesind,
420                                start_col = speciesind+1,
421                                end_col = speciesind+2,
422                                strand_col = -1,
423                                fix_strand = True)
424        msatTree = quicksect.IntervalTree()
425        for item in msats:
426            if type( item ) is GenomicInterval:
427                msatTree.insert( item, msats.linenum, item.fields )
428        
429        for iline in fint:
430            try:
431                iline = iline.rstrip('\r\n')
432                if not(iline) or iline == "":
433                    continue
434                ielems = iline.strip("\r\n").split('\t')
435                ichr = ielems[chr_col_i]
436                istart = int(ielems[start_col_i])
437                iend = int(ielems[end_col_i])
438                isrc = "%s.%s" %(dbkey_i,ichr)
439                if isrc not in msatTree.chroms:
440                    continue
441                result = []
442                root = msatTree.chroms[isrc]    #root node for the chrom
443                counter(root, istart, iend, lambda node: result.append( node ))
444                if not(result):
445                    continue
446                tmpfile1 = tempfile.NamedTemporaryFile('wb+')
447                for node in result:
448                    tmpfile1.write("%s\n" % "\t".join( node.other ))
449                
450                tmpfile1.seek(0)
451                output_writer(iline, tmpfile1.readlines())
452            except:
453                skipped+=1
454        if skipped:
455            print "Skipped %d intervals as invalid." %(skipped)
456    elif region == 'align':
457        if s_group_cols[0] != -1:
458            print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount"
459        else:
460            print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount"
461        prev_bnum = -1
462        try:
463            for line in fin:
464                line = line.strip("\r\n")
465                if not(line) or line == "":
466                    continue
467                elems = line.split('\t')
468                try:
469                    assert int(elems[0])
470                    assert len(elems) == 15
471                except:
472                    continue
473                new_bnum = int(elems[0])
474                if new_bnum != prev_bnum:
475                    if prev_bnum != -1:
476                        output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
477                    linestr = line + "\n"
478                else:
479                    linestr += line
480                    linestr += "\n"
481                prev_bnum = new_bnum
482            output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
483        except Exception, ea:
484            print >>sys.stderr, ea
485            skipped += 1
486        if skipped:
487            print "Skipped %d lines as invalid." %(skipped)
488if __name__ == "__main__":
489    main()