PageRenderTime 45ms CodeModel.GetById 13ms app.highlight 27ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/rgenetics/rgGLM.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 287 lines | 259 code | 14 blank | 14 comment | 44 complexity | 3375d217617263dfcab7b93ad5f32586 MD5 | raw file
  1#!/usr/local/bin/python
  2"""
  3# added most of the available options for linear models
  4# june 2009 rml
  5# hack to run and process a plink quantitative trait
  6#
  7
  8This is a wrapper for Shaun Purcell's Plink linear/logistic models for
  9traits, covariates and genotypes.
 10
 11It requires some judgement to interpret the findings
 12We need some better visualizations - manhattan plots are good.
 13svg with rs numbers for top 1%?
 14
 15toptable tools - truncate a gg file down to some low percentile
 16
 17intersect with other tables - eg gene expression regressions on snps
 18
 19
 20
 21"""
 22
 23import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands
 24from rgutils import plinke
 25
 26def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
 27    """
 28    score must be scaled to 0-1000
 29    
 30    Want to make some wig tracks from each analysis
 31    Best n -log10(p). Make top hit the window.
 32    we use our tab output which has
 33    rs	chrom	offset	ADD_stat	ADD_p	ADD_log10p
 34    rs3094315	1	792429	1.151	0.2528	0.597223
 35
 36    """
 37
 38    def is_number(s):
 39        try:
 40            float(s)
 41            return True
 42        except ValueError:
 43            return False
 44    header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)          
 45    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
 46    halfwidth=100
 47    resfpath = os.path.join(twd,resf)
 48    resf = open(resfpath,'r')
 49    resfl = resf.readlines() # dumb but convenient for millions of rows
 50    resfl = [x.split() for x in resfl]
 51    headl = resfl[0]
 52    resfl = resfl[1:]
 53    headl = [x.strip().upper() for x in headl]
 54    headIndex = dict(zip(headl,range(0,len(headl))))
 55    chrpos = headIndex.get('CHROM',None)
 56    rspos = headIndex.get('RS',None)
 57    offspos = headIndex.get('OFFSET',None)
 58    ppos = headIndex.get('ADD_LOG10P',None)
 59    wewant = [chrpos,rspos,offspos,ppos]
 60    if None in wewant: # missing something
 61       logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex)
 62       return
 63    resfl = [x for x in resfl if x[ppos] > '']
 64    resfl = [(float(x[ppos]),x) for x in resfl] # decorate
 65    resfl.sort()
 66    resfl.reverse() # using -log10 so larger is better
 67    resfl = resfl[:topn] # truncate
 68    pvals = [x[0] for x in resfl] # need to scale
 69    resfl = [x[1] for x in resfl] # drop decoration
 70    if len(pvals) == 0:
 71        logf.write('### no pvalues found in resfl - %s' % (resfl[:3]))
 72        sys.exit(1)
 73    maxp = max(pvals) # need to scale
 74    minp = min(pvals)
 75    prange = abs(maxp-minp) + 0.5 # fudge
 76    scalefact = 1000.0/prange
 77    logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
 78    for i,row in enumerate(resfl):
 79        row[ppos] = '%d' % (int(scalefact*pvals[i]))
 80        resfl[i] = row # replace
 81    outf = file(outfname,'w')
 82    outf.write(header)
 83    outres = [] # need to resort into chrom offset order
 84    for i,lrow in enumerate(resfl):
 85        chrom,snp,offset,p, = [lrow[x] for x in wewant]
 86        gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth),
 87               '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
 88        outres.append(gff)
 89    outres = [(x[0],int(x[3]),x) for x in outres] # decorate
 90    outres.sort() # into chrom offset
 91    outres=[x[2] for x in outres] # undecorate
 92    outres = ['\t'.join(x) for x in outres]    
 93    outf.write('\n'.join(outres))
 94    outf.write('\n')
 95    outf.close()
 96
 97
 98
 99def xformQassoc(resf='',outfname='',logf=None,twd='.'):
100    """	plink.assoc.linear to gg file
101from the docs
102The output per each SNP might look something like:
103
104    CHR        SNP      BP  A1       TEST   NMISS       OR      STAT         P
105      5   rs000001   10001   A        ADD     664   0.7806    -1.942   0.05216
106      5   rs000001   10001   A     DOMDEV     664   0.9395   -0.3562    0.7217
107      5   rs000001   10001   A       COV1     664   0.9723   -0.7894    0.4299
108      5   rs000001   10001   A       COV2     664    1.159    0.5132    0.6078
109      5   rs000001   10001   A   GENO_2DF     664       NA     5.059    0.0797   
110    need to transform into gg columns for each distinct test
111    or bed for tracks?
112    
113    """
114    logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname))
115    resdict = {}
116    rsdict = {}
117    markerlist = []
118    # plink is "clever" - will run logistic if only 2 categories such as gender
119    resfs = resf.split('.')
120    if resfs[-1] == 'logistic':
121        resfs[-1] = 'linear'
122    else:
123        resfs[-1] = 'logistic'
124    altresf = '.'.join(resfs)
125
126    altresfpath = os.path.join(twd,altresf)
127    resfpath = os.path.join(twd,resf)
128    try:
129        resf = open(resfpath,'r')
130    except:
131        try:
132            resf = open(altresfpath,'r')
133        except:
134            print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath)
135            sys.exit(1)
136    for lnum,row in enumerate(resf):
137        if lnum == 0:
138            headl = row.split()
139            headl = [x.strip().upper() for x in headl]
140            headIndex = dict(zip(headl,range(0,len(headl))))
141            chrpos = headIndex.get('CHR',None)
142            rspos = headIndex.get('SNP',None)
143            offspos = headIndex.get('BP',None)
144            nmisspos = headIndex.get('NMISS',None)
145            testpos = headIndex.get('TEST',None)
146            ppos = headIndex.get('P',None)
147            coeffpos = headIndex.get('OR',None)
148            if not coeffpos:
149                coeffpos = headIndex.get('BETA',None)
150            apos = headIndex.get('A1',None)
151            statpos = headIndex.get('STAT',None)
152            wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos]
153            if None in wewant: # missing something
154               logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex)
155               return
156            llen = len(headl)        
157        else: # no Nones!
158            ll = row.split()
159            if len(ll) >= llen: # valid line
160                chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant]
161                snp = snp.strip()
162                if p <> 'NA' :
163                  try:
164                    ffp = float(p)
165                    if ffp <> 0:
166                       lp =  -math.log10(ffp)
167                  except:
168                    lp = 0.0
169                  resdict.setdefault(test,{})
170                  resdict[test][snp] = (stat,p,'%f' % lp)
171                  if rsdict.get(snp,None) == None:
172                      rsdict[snp] = (chrom,offset)
173                      markerlist.append(snp)
174    # now have various tests indexed by rs
175    tk = resdict.keys()
176    tk.sort() # tests
177    ohead = ['rs','chrom','offset']
178    for t in tk: # add headers
179        ohead.append('%s_stat' % t)
180        ohead.append('%s_p' % t)
181        ohead.append('%s_log10p' % t)
182    oheads = '\t'.join(ohead)
183    res = [oheads,]
184    for snp in markerlist: # retain original order
185        chrom,offset = rsdict[snp]
186        outl = [snp,chrom,offset]
187        for t in tk:
188            outl += resdict[t][snp] # add stat,p for this test
189        outs = '\t'.join(outl)
190        res.append(outs)
191    f = file(outfname,'w')
192    res.append('')
193    f.write('\n'.join(res))
194    f.close()
195
196                
197if __name__ == "__main__":
198    """
199
200    <command interpreter="python">   
201        rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name'
202        "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name'
203        '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout'
204    </command>
205    """
206    topn = 1000
207    killme = string.punctuation+string.whitespace
208    trantab = string.maketrans(killme,'_'*len(killme))
209    if len(sys.argv) < 17:
210       s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv)
211       sys.stderr.write(s) # print >>,s would probably also work?
212       sys.exit(0)
213    blurb = 'rgGLM.py called with %s' % sys.argv
214    print >> sys.stdout,blurb
215    bfname = sys.argv[1]
216    phename = sys.argv[2]
217    title = sys.argv[3]
218    title.translate(trantab)
219    predvar = sys.argv[4]
220    covar = sys.argv[5].strip()
221    outfname = sys.argv[6] 
222    logfname = sys.argv[7]
223    op = os.path.split(logfname)[0]
224    try: # for test - needs this done
225        os.makedirs(op)
226    except:
227        pass    
228    basename = sys.argv[8].translate(trantab)
229    inter = sys.argv[9] == '1'
230    cond = sys.argv[10].strip()
231    if cond == 'None':
232        cond = ''
233    gender = sys.argv[11] == '1'
234    mind = sys.argv[12]
235    geno = sys.argv[13]
236    maf = sys.argv[14]
237    logistic = sys.argv[15].strip()=='1'
238    gffout = sys.argv[16]
239    me = sys.argv[0]
240    phepath = '%s.pphe' % phename
241    twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root!
242    tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log
243    vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno',
244           phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno,
245           '--maf %s' % maf]
246    if logistic:
247        vcl.append('--logistic')
248        resf = '%s.assoc.logistic' % basename # plink output is here we hope
249    else:
250        vcl.append('--linear')
251        resf = '%s.assoc.linear' % basename # plink output is here we hope
252    resf = os.path.join(twd,resf)
253    if gender:
254        vcl.append('--sex')
255    if inter:
256        vcl.append('--interaction')
257    if covar > 'None':
258        vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates
259    tcfile = None
260    if len(cond) > 0: # plink wants these in a file..
261        dummy,tcfile = tempfile.mkstemp(suffix='condlist') #
262        f = open(tcfile,'w')
263        cl = cond.split()
264        f.write('\n'.join(cl))
265        f.write('\n')
266        f.close()
267        vcl.append('--condition-list %s' % tcfile)
268    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd)
269    retval = p.wait()
270    if tcfile:
271        os.unlink(tcfile)
272    plinklog = file(tplog,'r').read()
273    logf = file(logfname,'w')
274    logf.write(blurb)
275    logf.write('\n')
276    logf.write('vcl=%s\n' % vcl)
277    xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file
278    makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn)
279    logf.write('\n')
280    logf.write(plinklog)
281    logf.close()
282    #shutil.rmtree(twd) # clean up
283
284
285
286
287