/tools/rgenetics/rgGLM.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 287 lines · 239 code · 17 blank · 31 comment · 46 complexity · 3375d217617263dfcab7b93ad5f32586 MD5 · raw file

  1. #!/usr/local/bin/python
  2. """
  3. # added most of the available options for linear models
  4. # june 2009 rml
  5. # hack to run and process a plink quantitative trait
  6. #
  7. This is a wrapper for Shaun Purcell's Plink linear/logistic models for
  8. traits, covariates and genotypes.
  9. It requires some judgement to interpret the findings
  10. We need some better visualizations - manhattan plots are good.
  11. svg with rs numbers for top 1%?
  12. toptable tools - truncate a gg file down to some low percentile
  13. intersect with other tables - eg gene expression regressions on snps
  14. """
  15. import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands
  16. from rgutils import plinke
  17. def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
  18. """
  19. score must be scaled to 0-1000
  20. Want to make some wig tracks from each analysis
  21. Best n -log10(p). Make top hit the window.
  22. we use our tab output which has
  23. rs chrom offset ADD_stat ADD_p ADD_log10p
  24. rs3094315 1 792429 1.151 0.2528 0.597223
  25. """
  26. def is_number(s):
  27. try:
  28. float(s)
  29. return True
  30. except ValueError:
  31. return False
  32. header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
  33. column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
  34. halfwidth=100
  35. resfpath = os.path.join(twd,resf)
  36. resf = open(resfpath,'r')
  37. resfl = resf.readlines() # dumb but convenient for millions of rows
  38. resfl = [x.split() for x in resfl]
  39. headl = resfl[0]
  40. resfl = resfl[1:]
  41. headl = [x.strip().upper() for x in headl]
  42. headIndex = dict(zip(headl,range(0,len(headl))))
  43. chrpos = headIndex.get('CHROM',None)
  44. rspos = headIndex.get('RS',None)
  45. offspos = headIndex.get('OFFSET',None)
  46. ppos = headIndex.get('ADD_LOG10P',None)
  47. wewant = [chrpos,rspos,offspos,ppos]
  48. if None in wewant: # missing something
  49. logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex)
  50. return
  51. resfl = [x for x in resfl if x[ppos] > '']
  52. resfl = [(float(x[ppos]),x) for x in resfl] # decorate
  53. resfl.sort()
  54. resfl.reverse() # using -log10 so larger is better
  55. resfl = resfl[:topn] # truncate
  56. pvals = [x[0] for x in resfl] # need to scale
  57. resfl = [x[1] for x in resfl] # drop decoration
  58. if len(pvals) == 0:
  59. logf.write('### no pvalues found in resfl - %s' % (resfl[:3]))
  60. sys.exit(1)
  61. maxp = max(pvals) # need to scale
  62. minp = min(pvals)
  63. prange = abs(maxp-minp) + 0.5 # fudge
  64. scalefact = 1000.0/prange
  65. logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
  66. for i,row in enumerate(resfl):
  67. row[ppos] = '%d' % (int(scalefact*pvals[i]))
  68. resfl[i] = row # replace
  69. outf = file(outfname,'w')
  70. outf.write(header)
  71. outres = [] # need to resort into chrom offset order
  72. for i,lrow in enumerate(resfl):
  73. chrom,snp,offset,p, = [lrow[x] for x in wewant]
  74. gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth),
  75. '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
  76. outres.append(gff)
  77. outres = [(x[0],int(x[3]),x) for x in outres] # decorate
  78. outres.sort() # into chrom offset
  79. outres=[x[2] for x in outres] # undecorate
  80. outres = ['\t'.join(x) for x in outres]
  81. outf.write('\n'.join(outres))
  82. outf.write('\n')
  83. outf.close()
  84. def xformQassoc(resf='',outfname='',logf=None,twd='.'):
  85. """ plink.assoc.linear to gg file
  86. from the docs
  87. The output per each SNP might look something like:
  88. CHR SNP BP A1 TEST NMISS OR STAT P
  89. 5 rs000001 10001 A ADD 664 0.7806 -1.942 0.05216
  90. 5 rs000001 10001 A DOMDEV 664 0.9395 -0.3562 0.7217
  91. 5 rs000001 10001 A COV1 664 0.9723 -0.7894 0.4299
  92. 5 rs000001 10001 A COV2 664 1.159 0.5132 0.6078
  93. 5 rs000001 10001 A GENO_2DF 664 NA 5.059 0.0797
  94. need to transform into gg columns for each distinct test
  95. or bed for tracks?
  96. """
  97. logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname))
  98. resdict = {}
  99. rsdict = {}
  100. markerlist = []
  101. # plink is "clever" - will run logistic if only 2 categories such as gender
  102. resfs = resf.split('.')
  103. if resfs[-1] == 'logistic':
  104. resfs[-1] = 'linear'
  105. else:
  106. resfs[-1] = 'logistic'
  107. altresf = '.'.join(resfs)
  108. altresfpath = os.path.join(twd,altresf)
  109. resfpath = os.path.join(twd,resf)
  110. try:
  111. resf = open(resfpath,'r')
  112. except:
  113. try:
  114. resf = open(altresfpath,'r')
  115. except:
  116. print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath)
  117. sys.exit(1)
  118. for lnum,row in enumerate(resf):
  119. if lnum == 0:
  120. headl = row.split()
  121. headl = [x.strip().upper() for x in headl]
  122. headIndex = dict(zip(headl,range(0,len(headl))))
  123. chrpos = headIndex.get('CHR',None)
  124. rspos = headIndex.get('SNP',None)
  125. offspos = headIndex.get('BP',None)
  126. nmisspos = headIndex.get('NMISS',None)
  127. testpos = headIndex.get('TEST',None)
  128. ppos = headIndex.get('P',None)
  129. coeffpos = headIndex.get('OR',None)
  130. if not coeffpos:
  131. coeffpos = headIndex.get('BETA',None)
  132. apos = headIndex.get('A1',None)
  133. statpos = headIndex.get('STAT',None)
  134. wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos]
  135. if None in wewant: # missing something
  136. logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex)
  137. return
  138. llen = len(headl)
  139. else: # no Nones!
  140. ll = row.split()
  141. if len(ll) >= llen: # valid line
  142. chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant]
  143. snp = snp.strip()
  144. if p <> 'NA' :
  145. try:
  146. ffp = float(p)
  147. if ffp <> 0:
  148. lp = -math.log10(ffp)
  149. except:
  150. lp = 0.0
  151. resdict.setdefault(test,{})
  152. resdict[test][snp] = (stat,p,'%f' % lp)
  153. if rsdict.get(snp,None) == None:
  154. rsdict[snp] = (chrom,offset)
  155. markerlist.append(snp)
  156. # now have various tests indexed by rs
  157. tk = resdict.keys()
  158. tk.sort() # tests
  159. ohead = ['rs','chrom','offset']
  160. for t in tk: # add headers
  161. ohead.append('%s_stat' % t)
  162. ohead.append('%s_p' % t)
  163. ohead.append('%s_log10p' % t)
  164. oheads = '\t'.join(ohead)
  165. res = [oheads,]
  166. for snp in markerlist: # retain original order
  167. chrom,offset = rsdict[snp]
  168. outl = [snp,chrom,offset]
  169. for t in tk:
  170. outl += resdict[t][snp] # add stat,p for this test
  171. outs = '\t'.join(outl)
  172. res.append(outs)
  173. f = file(outfname,'w')
  174. res.append('')
  175. f.write('\n'.join(res))
  176. f.close()
  177. if __name__ == "__main__":
  178. """
  179. <command interpreter="python">
  180. rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name'
  181. "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name'
  182. '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout'
  183. </command>
  184. """
  185. topn = 1000
  186. killme = string.punctuation+string.whitespace
  187. trantab = string.maketrans(killme,'_'*len(killme))
  188. if len(sys.argv) < 17:
  189. s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv)
  190. sys.stderr.write(s) # print >>,s would probably also work?
  191. sys.exit(0)
  192. blurb = 'rgGLM.py called with %s' % sys.argv
  193. print >> sys.stdout,blurb
  194. bfname = sys.argv[1]
  195. phename = sys.argv[2]
  196. title = sys.argv[3]
  197. title.translate(trantab)
  198. predvar = sys.argv[4]
  199. covar = sys.argv[5].strip()
  200. outfname = sys.argv[6]
  201. logfname = sys.argv[7]
  202. op = os.path.split(logfname)[0]
  203. try: # for test - needs this done
  204. os.makedirs(op)
  205. except:
  206. pass
  207. basename = sys.argv[8].translate(trantab)
  208. inter = sys.argv[9] == '1'
  209. cond = sys.argv[10].strip()
  210. if cond == 'None':
  211. cond = ''
  212. gender = sys.argv[11] == '1'
  213. mind = sys.argv[12]
  214. geno = sys.argv[13]
  215. maf = sys.argv[14]
  216. logistic = sys.argv[15].strip()=='1'
  217. gffout = sys.argv[16]
  218. me = sys.argv[0]
  219. phepath = '%s.pphe' % phename
  220. twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root!
  221. tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log
  222. vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno',
  223. phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno,
  224. '--maf %s' % maf]
  225. if logistic:
  226. vcl.append('--logistic')
  227. resf = '%s.assoc.logistic' % basename # plink output is here we hope
  228. else:
  229. vcl.append('--linear')
  230. resf = '%s.assoc.linear' % basename # plink output is here we hope
  231. resf = os.path.join(twd,resf)
  232. if gender:
  233. vcl.append('--sex')
  234. if inter:
  235. vcl.append('--interaction')
  236. if covar > 'None':
  237. vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates
  238. tcfile = None
  239. if len(cond) > 0: # plink wants these in a file..
  240. dummy,tcfile = tempfile.mkstemp(suffix='condlist') #
  241. f = open(tcfile,'w')
  242. cl = cond.split()
  243. f.write('\n'.join(cl))
  244. f.write('\n')
  245. f.close()
  246. vcl.append('--condition-list %s' % tcfile)
  247. p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd)
  248. retval = p.wait()
  249. if tcfile:
  250. os.unlink(tcfile)
  251. plinklog = file(tplog,'r').read()
  252. logf = file(logfname,'w')
  253. logf.write(blurb)
  254. logf.write('\n')
  255. logf.write('vcl=%s\n' % vcl)
  256. xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file
  257. makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn)
  258. logf.write('\n')
  259. logf.write(plinklog)
  260. logf.close()
  261. #shutil.rmtree(twd) # clean up