/tools/rgenetics/rgTDT.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 264 lines · 201 code · 14 blank · 49 comment · 45 complexity · c2393724fc3494aff4dece918effc022 MD5 · raw file

  1. #!/usr/local/bin/python
  2. # hack to run and process a plink tdt
  3. # expects args as
  4. # bfilepath outname jobname outformat (wig,xls)
  5. # ross lazarus
  6. # for wig files, we need annotation so look for map file or complain
  7. """
  8. Parameters for wiggle track definition lines
  9. All options are placed in a single line separated by spaces:
  10. track type=wiggle_0 name=track_label description=center_label \
  11. visibility=display_mode color=r,g,b altColor=r,g,b \
  12. priority=priority autoScale=on|off \
  13. gridDefault=on|off maxHeightPixels=max:default:min \
  14. graphType=bar|points viewLimits=lower:upper \
  15. yLineMark=real-value yLineOnOff=on|off \
  16. windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
  17. """
  18. import sys,math,shutil,subprocess,os,time,tempfile,shutil,string
  19. from os.path import abspath
  20. from optparse import OptionParser
  21. from rgutils import timenow, plinke
  22. myversion = 'v0.003 January 2010'
  23. verbose = False
  24. def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
  25. """
  26. score must be scaled to 0-1000
  27. Want to make some wig tracks from each analysis
  28. Best n -log10(p). Make top hit the window.
  29. we use our tab output which has
  30. rs chrom offset ADD_stat ADD_p ADD_log10p
  31. rs3094315 1 792429 1.151 0.2528 0.597223
  32. """
  33. def is_number(s):
  34. try:
  35. float(s)
  36. return True
  37. except ValueError:
  38. return False
  39. header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
  40. column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
  41. halfwidth=100
  42. resfpath = os.path.join(twd,resf)
  43. resf = open(resfpath,'r')
  44. resfl = resf.readlines() # dumb but convenient for millions of rows
  45. resfl = [x.split() for x in resfl]
  46. headl = resfl[0]
  47. resfl = resfl[1:]
  48. headl = [x.strip().upper() for x in headl]
  49. headIndex = dict(zip(headl,range(0,len(headl))))
  50. # s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR'
  51. chrpos = headIndex.get('CHROM',None)
  52. rspos = headIndex.get('RS',None)
  53. offspos = headIndex.get('OFFSET',None)
  54. ppos = headIndex.get('-LOG10TDTP',None)
  55. wewant = [chrpos,rspos,offspos,ppos]
  56. if None in wewant: # missing something
  57. logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex)
  58. return
  59. resfl = [x for x in resfl if x[ppos] > '']
  60. resfl = [(float(x[ppos]),x) for x in resfl] # decorate
  61. resfl.sort()
  62. resfl.reverse() # using -log10 so larger is better
  63. resfl = resfl[:topn] # truncate
  64. pvals = [x[0] for x in resfl] # need to scale
  65. resfl = [x[1] for x in resfl] # drop decoration
  66. maxp = max(pvals) # need to scale
  67. minp = min(pvals)
  68. prange = abs(maxp-minp) + 0.5 # fudge
  69. scalefact = 1000.0/prange
  70. logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
  71. for i,row in enumerate(resfl):
  72. row[ppos] = '%d' % (int(scalefact*pvals[i]))
  73. resfl[i] = row # replace
  74. outf = file(outfname,'w')
  75. outf.write(header)
  76. outres = [] # need to resort into chrom offset order
  77. for i,lrow in enumerate(resfl):
  78. chrom,snp,offset,p, = [lrow[x] for x in wewant]
  79. gff = ('chr%s' % chrom,'rgTDT','variation','%d' % (int(offset)-halfwidth),
  80. '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
  81. outres.append(gff)
  82. outres = [(x[0],int(x[3]),x) for x in outres] # decorate
  83. outres.sort() # into chrom offset
  84. outres=[x[2] for x in outres] # undecorate
  85. outres = ['\t'.join(x) for x in outres]
  86. outf.write('\n'.join(outres))
  87. outf.write('\n')
  88. outf.close()
  89. def xformTDT(infname='',resf='',outfname='',name='foo',mapf='/usr/local/galaxy/data/rg/lped/x.bim'):
  90. """munge a plink .tdt file into either a ucsc track or an xls file
  91. CHR SNP A1:A2 T:U_TDT OR_TDT CHISQ_TDT P_TDT
  92. 0 MitoT217C 2:3 0:0 NA NA NA
  93. 0 MitoG228A 1:4 0:0 NA NA NA
  94. 0 MitoT250C 2:3 0:0 NA NA NA
  95. map file has
  96. 1 rs4378174 0 003980745
  97. 1 rs10796404 0 005465256
  98. 1 rs2697965 0 014023092
  99. grrr!
  100. Changed in 1.01 to
  101. [rerla@hg fresh]$ head database/job_working_directory/445/rgTDT.tdt
  102. CHR SNP BP A1 A2 T U OR CHISQ P
  103. 1 rs12562034 758311 1 3 71 79 0.8987 0.4267 0.5136
  104. 1 rs3934834 995669 4 2 98 129 0.7597 4.233 0.03963
  105. """
  106. if verbose:
  107. print 'Rgenetics Galaxy Tools, rgTDT.py.xformTDT got resf=%s, outtype=%s, outfname=%s' % (resf,outtype,outfname)
  108. wewantcols = ['SNP','CHR','BP','A1','A2','T','U','OR','CHISQ','P']
  109. res = []
  110. s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' # header
  111. res.append(s)
  112. rsdict = {}
  113. if not mapf:
  114. sys.stderr.write('rgTDT called but no map file provided - cannot determine locations')
  115. sys.exit(1)
  116. map = file(mapf,'r')
  117. for l in map: # plink map
  118. ll = l.strip().split()
  119. if len(ll) >= 3:
  120. rs=ll[1].strip()
  121. chrom = ll[0]
  122. if chrom.lower() == 'x':
  123. chrom = '23'
  124. if chrom.lower() == 'y':
  125. chrom = '24'
  126. if chrom.lower() == 'mito':
  127. chrom = '25'
  128. offset = ll[3]
  129. rsdict[rs] = (chrom,offset)
  130. f = open(resf,'r')
  131. headl = f.next().strip()
  132. headl = headl.split()
  133. wewant = [headl.index(x) for x in wewantcols]
  134. llen = len(headl)
  135. lnum = anum = 0
  136. for l in f:
  137. lnum += 1
  138. ll = l.strip().split()
  139. if len(ll) >= llen: # valid line
  140. snp,chrom,offset,a1,a2,t,u,orat,chisq,p = [ll[x] for x in wewant]
  141. if chisq == 'NA' or p == 'NA' or orat == 'NA':
  142. continue # can't use these lines - gg gets unhappy
  143. snp = snp.strip()
  144. lp = '0.0'
  145. fp = '1.0'
  146. fakeorat = '1.0'
  147. if p.upper().strip() <> 'NA':
  148. try:
  149. fp = float(p)
  150. if fp <> 0:
  151. lp = '%6f' % -math.log10(fp)
  152. fp = '%6f' % fp
  153. except:
  154. pass
  155. else:
  156. p = '1.0'
  157. if orat.upper().strip() <> 'NA':
  158. try:
  159. fakeorat = orat
  160. if float(orat) < 1.0:
  161. fakeorat = '%6f' % (1.0/float(orat)) # invert so large values big
  162. except:
  163. pass
  164. else:
  165. orat = '1.0'
  166. outl = '\t'.join([snp,chrom,offset,a1,a2,t,u,chisq,p,lp,fakeorat,orat])
  167. res.append(outl)
  168. f = file(outfname,'w')
  169. res.append('')
  170. f.write('\n'.join(res))
  171. f.close()
  172. if __name__ == "__main__":
  173. """ called as
  174. <command interpreter="python">
  175. rgTDT.py -i '$infile.extra_files_path/$infile.metadata.base_name' -o '$title' -f '$outformat' -r '$out_file1' -l '$logf' -x '${GALAXY_DATA_INDEX_DIR}/rg/bin/pl$
  176. </command>
  177. """
  178. u = """ called in xml as
  179. <command interpreter="python2.4">
  180. rgTDT.py -i $i -o $out_prefix -f $outformat -r $out_file1 -l $logf
  181. </command>
  182. """
  183. if len(sys.argv) < 6:
  184. s = '## Error rgTDT.py needs 5 command line params - got %s \n' % (sys.argv)
  185. if verbose:
  186. print >> sys.stdout, s
  187. sys.exit(0)
  188. parser = OptionParser(usage=u, version="%prog 0.01")
  189. a = parser.add_option
  190. a("-i","--infile",dest="bfname")
  191. a("-o","--oprefix",dest="oprefix")
  192. a("-f","--formatOut",dest="outformat")
  193. a("-r","--results",dest="outfname")
  194. a("-l","--logfile",dest="logf")
  195. a("-d","--du",dest="uId")
  196. a("-e","--email",dest="uEmail")
  197. a("-g","--gff",dest="gffout",default="")
  198. (options,args) = parser.parse_args()
  199. killme = string.punctuation + string.whitespace
  200. trantab = string.maketrans(killme,'_'*len(killme))
  201. title = options.oprefix
  202. title = title.translate(trantab)
  203. map_file = '%s.bim' % (options.bfname) #
  204. me = sys.argv[0]
  205. alogf = options.logf # absolute paths
  206. od = os.path.split(alogf)[0]
  207. try:
  208. os.path.makedirs(od)
  209. except:
  210. pass
  211. aoutf = options.outfname # absolute paths
  212. od = os.path.split(aoutf)[0]
  213. try:
  214. os.path.makedirs(od)
  215. except:
  216. pass
  217. vcl = [plinke,'--noweb', '--bfile',options.bfname,'--out',title,'--mind','0.5','--tdt']
  218. logme = []
  219. if verbose:
  220. s = 'Rgenetics %s http://rgenetics.org Galaxy Tools rgTDT.py started %s\n' % (myversion,timenow())
  221. print >> sys.stdout,s
  222. logme.append(s)
  223. s ='rgTDT.py: bfname=%s, logf=%s, argv = %s\n' % (options.bfname,alogf, sys.argv)
  224. print >> sys.stdout,s
  225. logme.append(s)
  226. s = 'rgTDT.py: vcl=%s\n' % (' '.join(vcl))
  227. print >> sys.stdout,s
  228. logme.append(s)
  229. twd = tempfile.mkdtemp(suffix='rgTDT') # make sure plink doesn't spew log file into the root!
  230. tname = os.path.join(twd,title)
  231. p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd)
  232. retval = p.wait()
  233. shutil.copy('%s.log' % tname,alogf)
  234. sto = file(alogf,'a')
  235. sto.write('\n'.join(logme))
  236. resf = '%s.tdt' % tname # plink output is here we hope
  237. xformTDT(options.bfname,resf,aoutf,title,map_file) # leaves the desired summary file
  238. gffout = options.gffout
  239. if gffout > '':
  240. makeGFF(resf=aoutf,outfname=gffout,logf=sto,twd='.',name='rgTDT_Top_Table',description=title,topn=1000)
  241. shutil.rmtree(twd)
  242. sto.close()