PageRenderTime 43ms CodeModel.GetById 23ms app.highlight 17ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/metag_tools/mapping_to_ucsc.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 204 lines | 190 code | 9 blank | 5 comment | 15 complexity | 8638c06e8a7a51e875b4e8b4bc45e807 MD5 | raw file
  1#!/usr/bin/env python
  2
  3from galaxy import eggs
  4import sys, tempfile, os
  5
  6assert sys.version_info[:2] >= (2.4)
  7
  8def stop_err(msg):
  9    sys.stderr.write(msg)
 10    sys.exit()
 11    
 12def main():
 13
 14    out_fname = sys.argv[1]
 15    in_fname = sys.argv[2]
 16    chr_col = int(sys.argv[3])-1
 17    coord_col = int(sys.argv[4])-1
 18    track_type = sys.argv[5]
 19    if track_type == 'coverage' or track_type == 'both': 
 20        coverage_col = int(sys.argv[6])-1
 21        cname = sys.argv[7]
 22        cdescription = sys.argv[8]
 23        ccolor = sys.argv[9].replace('-',',')
 24        cvisibility = sys.argv[10]
 25    if track_type == 'snp' or track_type == 'both':
 26        if track_type == 'both':
 27            j = 5
 28        else:
 29            j = 0 
 30        #sname = sys.argv[7+j]
 31        sdescription = sys.argv[6+j]
 32        svisibility = sys.argv[7+j]
 33        #ref_col = int(sys.argv[10+j])-1
 34        read_col = int(sys.argv[8+j])-1
 35    
 36
 37    # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically)
 38    sorted_infile = tempfile.NamedTemporaryFile()
 39    try:
 40        os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname))
 41    except Exception, exc:
 42        stop_err( 'Initialization error -> %s' %str(exc) )
 43
 44    #generate chr list
 45    sorted_infile.seek(0)
 46    chr_vals = []
 47    for line in file( sorted_infile.name ):
 48        line = line.strip()
 49        if not(line):
 50            continue
 51        try:
 52            fields = line.split('\t')
 53            chr = fields[chr_col]
 54            if chr not in chr_vals:
 55                chr_vals.append(chr)
 56        except:
 57            pass
 58    if not(chr_vals):   
 59        stop_err("Skipped all lines as invalid.")
 60        
 61    if track_type == 'coverage' or track_type == 'both':
 62        if track_type == 'coverage':
 63            fout = open( out_fname, "w" )
 64        else:
 65            fout = tempfile.NamedTemporaryFile()
 66        fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
 67                      % ( cname, cdescription, ccolor, cvisibility ))
 68    if track_type == 'snp' or track_type == 'both':
 69        fout_a = tempfile.NamedTemporaryFile()
 70        fout_t = tempfile.NamedTemporaryFile()
 71        fout_g = tempfile.NamedTemporaryFile()
 72        fout_c = tempfile.NamedTemporaryFile()
 73        fout_ref = tempfile.NamedTemporaryFile()
 74        
 75        fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
 76                      % ( "Track A", sdescription, '255,0,0', svisibility ))
 77        fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
 78                      % ( "Track T", sdescription, '0,255,0', svisibility ))
 79        fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
 80                      % ( "Track G", sdescription, '0,0,255', svisibility ))
 81        fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
 82                      % ( "Track C", sdescription, '255,0,255', svisibility ))
 83        
 84        
 85    sorted_infile.seek(0)
 86    for line in file( sorted_infile.name ):
 87        line = line.strip()
 88        if not(line):
 89            continue
 90        try:
 91            fields = line.split('\t')
 92            chr = fields[chr_col]
 93            start = int(fields[coord_col])
 94            assert start > 0
 95        except:
 96            continue
 97        try:
 98            ind = chr_vals.index(chr)    #encountered chr for the 1st time
 99            del chr_vals[ind]
100            prev_start = ''
101            header = "variableStep chrom=%s\n" %(chr)
102            if track_type == 'coverage' or track_type == 'both':
103                coverage = int(fields[coverage_col])
104                line1 = "%s\t%s\n" %(start,coverage)
105                fout.write("%s%s" %(header,line1))
106            if track_type == 'snp' or track_type == 'both':
107                a = t = g = c = 0
108                fout_a.write("%s" %(header))
109                fout_t.write("%s" %(header))
110                fout_g.write("%s" %(header))
111                fout_c.write("%s" %(header))
112                try:
113                    #ref_nt = fields[ref_col].capitalize()
114                    read_nt = fields[read_col].capitalize()
115                    try:
116                        nt_ind = ['A','T','G','C'].index(read_nt)
117                        if nt_ind == 0:
118                            a+=1
119                        elif nt_ind == 1:
120                            t+=1
121                        elif nt_ind == 2:
122                            g+=1
123                        else:
124                            c+=1
125                    except ValueError:
126                        pass
127                except:
128                    pass
129            prev_start = start
130        except ValueError:
131            if start != prev_start:
132                if track_type == 'coverage' or track_type == 'both':
133                    coverage = int(fields[coverage_col])
134                    fout.write("%s\t%s\n" %(start,coverage)) 
135                if track_type == 'snp' or track_type == 'both':
136                    if a:
137                        fout_a.write("%s\t%s\n" %(prev_start,a))
138                    if t:
139                        fout_t.write("%s\t%s\n" %(prev_start,t))
140                    if g:
141                        fout_g.write("%s\t%s\n" %(prev_start,g))
142                    if c:
143                        fout_c.write("%s\t%s\n" %(prev_start,c))
144                    a = t = g = c = 0
145                    try:
146                        #ref_nt = fields[ref_col].capitalize()
147                        read_nt = fields[read_col].capitalize()
148                        try:
149                            nt_ind = ['A','T','G','C'].index(read_nt)
150                            if nt_ind == 0:
151                                a+=1
152                            elif nt_ind == 1:
153                                t+=1
154                            elif nt_ind == 2:
155                                g+=1
156                            else:
157                                c+=1
158                        except ValueError:
159                            pass
160                    except:
161                        pass
162                prev_start = start
163            else:
164                if track_type == 'snp' or track_type == 'both':
165                    try:
166                        #ref_nt = fields[ref_col].capitalize()
167                        read_nt = fields[read_col].capitalize()
168                        try:
169                            nt_ind = ['A','T','G','C'].index(read_nt)
170                            if nt_ind == 0:
171                                a+=1
172                            elif nt_ind == 1:
173                                t+=1
174                            elif nt_ind == 2:
175                                g+=1
176                            else:
177                                c+=1
178                        except ValueError:
179                            pass
180                    except:
181                        pass
182    
183    if track_type == 'snp' or track_type == 'both':
184        if a:
185            fout_a.write("%s\t%s\n" %(prev_start,a))
186        if t:
187            fout_t.write("%s\t%s\n" %(prev_start,t))
188        if g:
189            fout_g.write("%s\t%s\n" %(prev_start,g))
190        if c:
191            fout_c.write("%s\t%s\n" %(prev_start,c))
192            
193        fout_a.seek(0)
194        fout_g.seek(0)
195        fout_t.seek(0)
196        fout_c.seek(0)    
197    
198    if track_type == 'snp':
199        os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
200    elif track_type == 'both':
201        fout.seek(0)
202        os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
203if __name__ == "__main__":
204    main()