PageRenderTime 35ms CodeModel.GetById 13ms app.highlight 17ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/rgenetics/rgTDT.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 264 lines | 235 code | 10 blank | 19 comment | 19 complexity | c2393724fc3494aff4dece918effc022 MD5 | raw file
  1#!/usr/local/bin/python
  2# hack to run and process a plink tdt
  3# expects args as
  4# bfilepath outname jobname outformat (wig,xls)
  5# ross lazarus
  6# for wig files, we need annotation so look for map file or complain
  7
  8"""
  9Parameters for wiggle track definition lines
 10All options are placed in a single line separated by spaces:
 11
 12  track type=wiggle_0 name=track_label description=center_label \
 13        visibility=display_mode color=r,g,b altColor=r,g,b \
 14        priority=priority autoScale=on|off \
 15        gridDefault=on|off maxHeightPixels=max:default:min \
 16        graphType=bar|points viewLimits=lower:upper \
 17        yLineMark=real-value yLineOnOff=on|off \
 18        windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
 19"""
 20
 21import sys,math,shutil,subprocess,os,time,tempfile,shutil,string
 22from os.path import abspath
 23from optparse import OptionParser
 24from rgutils import timenow, plinke
 25myversion = 'v0.003 January 2010'
 26verbose = False
 27
 28
 29
 30def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
 31    """
 32    score must be scaled to 0-1000
 33
 34    Want to make some wig tracks from each analysis
 35    Best n -log10(p). Make top hit the window.
 36    we use our tab output which has
 37    rs	chrom	offset	ADD_stat	ADD_p	ADD_log10p
 38    rs3094315	1	792429	1.151	0.2528	0.597223
 39
 40    """
 41
 42    def is_number(s):
 43        try:
 44            float(s)
 45            return True
 46        except ValueError:
 47            return False
 48    header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
 49    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
 50    halfwidth=100
 51    resfpath = os.path.join(twd,resf)
 52    resf = open(resfpath,'r')
 53    resfl = resf.readlines() # dumb but convenient for millions of rows
 54    resfl = [x.split() for x in resfl]
 55    headl = resfl[0]
 56    resfl = resfl[1:]
 57    headl = [x.strip().upper() for x in headl]
 58    headIndex = dict(zip(headl,range(0,len(headl))))
 59    # s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR'
 60    chrpos = headIndex.get('CHROM',None)
 61    rspos = headIndex.get('RS',None)
 62    offspos = headIndex.get('OFFSET',None)
 63    ppos = headIndex.get('-LOG10TDTP',None)
 64    wewant = [chrpos,rspos,offspos,ppos]
 65    if None in wewant: # missing something
 66       logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex)
 67       return
 68    resfl = [x for x in resfl if x[ppos] > '']
 69    resfl = [(float(x[ppos]),x) for x in resfl] # decorate
 70    resfl.sort()
 71    resfl.reverse() # using -log10 so larger is better
 72    resfl = resfl[:topn] # truncate
 73    pvals = [x[0] for x in resfl] # need to scale
 74    resfl = [x[1] for x in resfl] # drop decoration
 75    maxp = max(pvals) # need to scale
 76    minp = min(pvals)
 77    prange = abs(maxp-minp) + 0.5 # fudge
 78    scalefact = 1000.0/prange
 79    logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
 80    for i,row in enumerate(resfl):
 81        row[ppos] = '%d' % (int(scalefact*pvals[i]))
 82        resfl[i] = row # replace
 83    outf = file(outfname,'w')
 84    outf.write(header)
 85    outres = [] # need to resort into chrom offset order
 86    for i,lrow in enumerate(resfl):
 87        chrom,snp,offset,p, = [lrow[x] for x in wewant]
 88        gff = ('chr%s' % chrom,'rgTDT','variation','%d' % (int(offset)-halfwidth),
 89               '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
 90        outres.append(gff)
 91    outres = [(x[0],int(x[3]),x) for x in outres] # decorate
 92    outres.sort() # into chrom offset
 93    outres=[x[2] for x in outres] # undecorate
 94    outres = ['\t'.join(x) for x in outres]
 95    outf.write('\n'.join(outres))
 96    outf.write('\n')
 97    outf.close()
 98
 99
100
101def xformTDT(infname='',resf='',outfname='',name='foo',mapf='/usr/local/galaxy/data/rg/lped/x.bim'):
102    """munge a plink .tdt file into either a ucsc track or an xls file
103  CHR         SNP  A1:A2      T:U_TDT       OR_TDT    CHISQ_TDT        P_TDT
104   0   MitoT217C    2:3          0:0           NA           NA           NA
105   0   MitoG228A    1:4          0:0           NA           NA           NA
106   0   MitoT250C    2:3          0:0           NA           NA           NA
107    map file has
108    1       rs4378174       0       003980745
109    1       rs10796404      0       005465256
110    1       rs2697965       0       014023092
111
112   grrr!
113   Changed in 1.01 to
114   [rerla@hg fresh]$ head database/job_working_directory/445/rgTDT.tdt
115     CHR         SNP           BP  A1  A2      T      U           OR        CHISQ            P
116   1  rs12562034       758311   1   3     71     79       0.8987       0.4267       0.5136
117   1   rs3934834       995669   4   2     98    129       0.7597        4.233      0.03963
118
119
120    """
121    if verbose:
122        print 'Rgenetics Galaxy Tools, rgTDT.py.xformTDT got resf=%s, outtype=%s, outfname=%s' % (resf,outtype,outfname)
123    wewantcols = ['SNP','CHR','BP','A1','A2','T','U','OR','CHISQ','P']
124    res = []
125    s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' # header
126    res.append(s)
127    rsdict = {}
128    if not mapf:
129        sys.stderr.write('rgTDT called but no map file provided - cannot determine locations')
130        sys.exit(1)
131    map = file(mapf,'r')
132    for l in map: # plink map
133        ll = l.strip().split()
134        if len(ll) >= 3:
135            rs=ll[1].strip()
136            chrom = ll[0]
137            if chrom.lower() == 'x':
138               chrom = '23'
139            if chrom.lower() == 'y':
140               chrom = '24'
141            if chrom.lower() == 'mito':
142               chrom = '25'
143            offset = ll[3]
144            rsdict[rs] = (chrom,offset)
145    f = open(resf,'r')
146    headl = f.next().strip()
147    headl = headl.split()
148    wewant = [headl.index(x) for x in wewantcols]
149    llen = len(headl)
150    lnum = anum = 0
151    for l in f:
152        lnum += 1
153        ll = l.strip().split()
154        if len(ll) >= llen: # valid line
155            snp,chrom,offset,a1,a2,t,u,orat,chisq,p = [ll[x] for x in wewant]
156            if chisq == 'NA' or p == 'NA' or orat == 'NA':
157                continue # can't use these lines - gg gets unhappy
158            snp = snp.strip()
159            lp = '0.0'
160            fp = '1.0'
161            fakeorat = '1.0'
162            if p.upper().strip() <> 'NA':
163                try:
164                   fp = float(p)
165                   if fp <> 0:
166                       lp = '%6f' % -math.log10(fp)
167                       fp = '%6f' % fp
168                except:
169                  pass
170            else:
171                p = '1.0'
172            if orat.upper().strip() <> 'NA':
173                try:
174                   fakeorat = orat
175                   if float(orat) < 1.0:
176                      fakeorat = '%6f' % (1.0/float(orat)) # invert so large values big
177                except:
178                   pass
179            else:
180                orat = '1.0'
181            outl = '\t'.join([snp,chrom,offset,a1,a2,t,u,chisq,p,lp,fakeorat,orat])
182            res.append(outl)
183    f = file(outfname,'w')
184    res.append('')
185    f.write('\n'.join(res))
186    f.close()
187
188
189if __name__ == "__main__":
190    """ called as
191    <command interpreter="python">
192        rgTDT.py -i '$infile.extra_files_path/$infile.metadata.base_name' -o '$title' -f '$outformat' -r '$out_file1' -l '$logf' -x '${GALAXY_DATA_INDEX_DIR}/rg/bin/pl$
193
194    </command>
195
196    """
197    u = """ called in xml as
198        <command interpreter="python2.4">
199        rgTDT.py -i $i -o $out_prefix -f $outformat -r $out_file1 -l $logf
200    </command>
201    """
202    if len(sys.argv) < 6:
203       s = '## Error rgTDT.py needs 5 command line params - got %s \n' % (sys.argv)
204       if verbose:
205            print >> sys.stdout, s
206       sys.exit(0)
207    parser = OptionParser(usage=u, version="%prog 0.01")
208    a = parser.add_option
209    a("-i","--infile",dest="bfname")
210    a("-o","--oprefix",dest="oprefix")
211    a("-f","--formatOut",dest="outformat")
212    a("-r","--results",dest="outfname")
213    a("-l","--logfile",dest="logf")
214    a("-d","--du",dest="uId")
215    a("-e","--email",dest="uEmail")
216    a("-g","--gff",dest="gffout",default="")
217    (options,args) = parser.parse_args()
218    killme = string.punctuation + string.whitespace
219    trantab = string.maketrans(killme,'_'*len(killme))
220    title = options.oprefix
221    title = title.translate(trantab)
222    map_file = '%s.bim' % (options.bfname) #
223    me = sys.argv[0]
224    alogf = options.logf # absolute paths
225    od = os.path.split(alogf)[0]
226    try:
227      os.path.makedirs(od)
228    except:
229      pass
230    aoutf = options.outfname # absolute paths
231    od = os.path.split(aoutf)[0]
232    try:
233      os.path.makedirs(od)
234    except:
235      pass
236    vcl = [plinke,'--noweb', '--bfile',options.bfname,'--out',title,'--mind','0.5','--tdt']
237    logme = []
238    if verbose:
239        s = 'Rgenetics %s http://rgenetics.org Galaxy Tools rgTDT.py started %s\n' % (myversion,timenow())
240        print >> sys.stdout,s
241        logme.append(s)
242        s ='rgTDT.py: bfname=%s, logf=%s, argv = %s\n' % (options.bfname,alogf, sys.argv)
243        print >> sys.stdout,s
244        logme.append(s)
245        s = 'rgTDT.py: vcl=%s\n' % (' '.join(vcl))
246        print >> sys.stdout,s
247        logme.append(s)
248    twd = tempfile.mkdtemp(suffix='rgTDT') # make sure plink doesn't spew log file into the root!
249    tname = os.path.join(twd,title)
250    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd)
251    retval = p.wait()
252    shutil.copy('%s.log' % tname,alogf)
253    sto = file(alogf,'a')
254    sto.write('\n'.join(logme))
255    resf = '%s.tdt' % tname # plink output is here we hope
256    xformTDT(options.bfname,resf,aoutf,title,map_file) # leaves the desired summary file
257    gffout = options.gffout
258    if gffout > '':
259        makeGFF(resf=aoutf,outfname=gffout,logf=sto,twd='.',name='rgTDT_Top_Table',description=title,topn=1000)
260    shutil.rmtree(twd)
261    sto.close()
262
263
264