/tools/rgenetics/rgHaploView.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 513 lines · 467 code · 11 blank · 35 comment · 86 complexity · 6924fdd9691208666a71c9611fd38829 MD5 · raw file

  1. """
  2. released under the terms of the LGPL
  3. copyright ross lazarus August 2007
  4. for the rgenetics project
  5. Special galaxy tool for the camp2007 data
  6. Allows grabbing genotypes from an arbitrary region and estimating
  7. ld using haploview
  8. stoopid haploview won't allow control of dest directory for plots - always end
  9. up where the data came from - need to futz to get it where it belongs
  10. Needs a mongo results file in the location hardwired below or could be passed in as
  11. a library parameter - but this file must have a very specific structure
  12. rs chrom offset float1...floatn
  13. """
  14. import sys, array, os, string, tempfile, shutil, subprocess, glob
  15. from rgutils import galhtmlprefix
  16. progname = os.path.split(sys.argv[0])[1]
  17. javabin = 'java'
  18. #hvbin = '/usr/local/bin/Haploview.jar'
  19. #hvbin = '/home/universe/linux-i686/haploview/Haploview.jar'
  20. # get this from tool as a parameter - can use
  21. atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
  22. class NullDevice:
  23. """ a dev/null for ignoring output
  24. """
  25. def write(self, s):
  26. pass
  27. class ldPlot:
  28. def __init__(self, argv=[]):
  29. """
  30. setup
  31. """
  32. self.args=argv
  33. self.parseArgs(argv=self.args)
  34. self.setupRegions()
  35. def parseArgs(self,argv=[]):
  36. """
  37. """
  38. ts = '%s%s' % (string.punctuation,string.whitespace)
  39. ptran = string.maketrans(ts,'_'*len(ts))
  40. ### Figure out what genomic region we are interested in
  41. self.region = argv[1]
  42. self.orslist = argv[2].replace('X',' ').lower() # galaxy replaces newlines with XX - go figure
  43. self.title = argv[3].translate(ptran)
  44. # for outputs
  45. self.outfile = argv[4]
  46. self.logfn = 'Log_%s.txt' % (self.title)
  47. self.histextra = argv[5]
  48. self.base_name = argv[6]
  49. self.pedFileBase = os.path.join(self.histextra,self.base_name)
  50. print 'pedfilebase=%s' % self.pedFileBase
  51. self.minMaf=argv[7]
  52. if self.minMaf:
  53. try:
  54. self.minMaf = float(self.minMaf)
  55. except:
  56. self.minMaf = 0.0
  57. self.maxDist=argv[8] or None
  58. self.ldType=argv[9] or 'RSQ'
  59. self.hiRes = (argv[10].lower() == 'hi')
  60. self.memSize= argv[11] or '1000'
  61. self.memSize = int(self.memSize)
  62. self.outfpath = argv[12]
  63. self.infotrack = False # note that otherwise this breaks haploview in headless mode
  64. #infotrack = argv[13] == 'info'
  65. # this fails in headless mode as at april 2010 with haploview 4.2
  66. self.tagr2 = argv[14] or '0.8'
  67. hmpanels = argv[15] # eg "['CEU','YRI']"
  68. if hmpanels:
  69. hmpanels = hmpanels.replace('[','')
  70. hmpanels = hmpanels.replace(']','')
  71. hmpanels = hmpanels.replace("'",'')
  72. hmpanels = hmpanels.split(',')
  73. self.hmpanels = hmpanels
  74. self.hvbin = argv[16] # added rml june 2008
  75. self.bindir = os.path.split(self.hvbin)[0]
  76. # jan 2010 - always assume utes are on path to avoid platform problems
  77. self.pdfjoin = 'pdfjoin' # os.path.join(bindir,'pdfjoin')
  78. self.pdfnup = 'pdfnup' # os.path.join(bindir,'pdfnup')
  79. self.mogrify = 'mogrify' # os.path.join(bindir,'mogrify')
  80. self.convert = 'convert' # os.path.join(bindir,'convert')
  81. self.log_file = os.path.join(self.outfpath,self.logfn)
  82. self.MAP_FILE = '%s.map' % self.pedFileBase
  83. self.DATA_FILE = '%s.ped' % self.pedFileBase
  84. try:
  85. os.makedirs(self.outfpath)
  86. s = '## made new path %s\n' % self.outfpath
  87. except:
  88. pass
  89. self.lf = file(self.log_file,'w')
  90. s = 'PATH=%s\n' % os.environ.get('PATH','?')
  91. self.lf.write(s)
  92. def getRs(self):
  93. if self.region > '':
  94. useRs = []
  95. useRsdict={}
  96. try: # TODO make a regexp?
  97. c,rest = self.region.split(':')
  98. chromosome = c.replace('chr','')
  99. rest = rest.replace(',','') # remove commas
  100. spos,epos = rest.split('-')
  101. spos = int(spos)
  102. epos = int(epos)
  103. s = '## %s parsing chrom %s from %d to %d\n' % (progname,chromosome,spos,epos)
  104. self.lf.write(s)
  105. self.lf.write('\n')
  106. print >> sys.stdout, s
  107. except:
  108. s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,self.region)
  109. print >> sys.stdout, s
  110. self.lf.write(s)
  111. self.lf.write('\n')
  112. self.lf.close()
  113. sys.exit(1)
  114. else:
  115. useRs = self.orslist.split() # galaxy replaces newlines with XX - go figure
  116. useRsdict = dict(zip(useRs,useRs))
  117. return useRs, useRsdict
  118. def setupRegions(self):
  119. """
  120. This turns out to be complex because we allow the user
  121. flexibility - paste a list of rs or give a region.
  122. In most cases, some subset has to be generated correctly before running Haploview
  123. """
  124. chromosome = ''
  125. spos = epos = -9
  126. rslist = []
  127. rsdict = {}
  128. useRs,useRsdict = self.getRs()
  129. self.useTemp = False
  130. try:
  131. dfile = open(self.DATA_FILE, 'r')
  132. except: # bad input file name?
  133. s = '##! RGeno unable to open file %s\n' % (self.DATA_FILE)
  134. self.lf.write(s)
  135. self.lf.write('\n')
  136. self.lf.close()
  137. print >> sys.stdout, s
  138. raise
  139. sys.exit(1)
  140. try:
  141. mfile = open(self.MAP_FILE, 'r')
  142. except: # bad input file name?
  143. s = '##! RGeno unable to open file %s' % (self.MAP_FILE)
  144. lf.write(s)
  145. lf.write('\n')
  146. lf.close()
  147. print >> sys.stdout, s
  148. raise
  149. sys.exit(1)
  150. if len(useRs) > 0 or spos <> -9 : # subset region
  151. self.useTemp = True
  152. ### Figure out which markers are in this region
  153. markers = []
  154. snpcols = {}
  155. chroms = {}
  156. minpos = 2**32
  157. maxpos = 0
  158. for lnum,row in enumerate(mfile):
  159. line = row.strip()
  160. if not line: continue
  161. chrom, snp, genpos, abspos = line.split()
  162. try:
  163. ic = int(chrom)
  164. except:
  165. ic = None
  166. if ic and ic <= 23:
  167. try:
  168. abspos = int(abspos)
  169. if abspos > maxpos:
  170. maxpos = abspos
  171. if abspos < minpos:
  172. minpos = abspos
  173. except:
  174. abspos = epos + 999999999 # so next test fails
  175. if useRsdict.get(snp,None) or (spos <> -9 and chrom == chromosome and (spos <= abspos <= epos)):
  176. if chromosome == '':
  177. chromosome = chrom
  178. chroms.setdefault(chrom,chrom)
  179. markers.append((chrom,abspos,snp)) # decorate for sort into genomic
  180. snpcols[snp] = lnum # so we know which col to find genos for this marker
  181. markers.sort()
  182. rslist = [x[2] for x in markers] # drop decoration
  183. rsdict = dict(zip(rslist,rslist))
  184. if len(rslist) == 0:
  185. s = '##! %s: Found no rs numbers matching %s' % (progname,self.args[1:3])
  186. self.lf.write(s)
  187. self.lf.write('\n')
  188. self.lf.close()
  189. print >> sys.stdout, s
  190. sys.exit(1)
  191. if spos == -9:
  192. spos = minpos
  193. epos = maxpos
  194. s = '## %s looking for %d rs (%s)' % (progname,len(rslist),rslist[:5])
  195. self.lf.write(s)
  196. print >> sys.stdout, s
  197. wewant = [(6+(2*snpcols[x])) for x in rslist] #
  198. # column indices of first geno of each marker pair to get the markers into genomic
  199. ### ... and then parse the rest of the ped file to pull out
  200. ### the genotypes for all subjects for those markers
  201. # /usr/local/galaxy/data/rg/1/lped/
  202. self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
  203. self.tempMap = file(self.tempMapName,'w')
  204. self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
  205. self.tempPed = file(self.tempPedName,'w')
  206. self.pngpath = '%s.LD.PNG' % self.tempPedName
  207. map = ['%s\t%s' % (x[2],x[1]) for x in markers] # snp,abspos in genomic order for haploview
  208. self.tempMap.write('%s\n' % '\n'.join(map))
  209. self.tempMap.close()
  210. nrows = 0
  211. for line in dfile:
  212. line = line.strip()
  213. if not line:
  214. continue
  215. fields = line.split()
  216. preamble = fields[:6]
  217. g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
  218. g = ' '.join(g)
  219. g = g.split() # we'll get there
  220. g = [atrandic.get(x,'0') for x in g] # numeric alleles...
  221. self.tempPed.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
  222. nrows += 1
  223. self.tempPed.close()
  224. s = '## %s: wrote %d markers, %d subjects for region %s\n' % (progname,len(rslist),nrows,self.region)
  225. self.lf.write(s)
  226. self.lf.write('\n')
  227. print >> sys.stdout,s
  228. else: # even if using all, must set up haploview info file instead of map
  229. markers = []
  230. chroms = {}
  231. spos = sys.maxint
  232. epos = -spos
  233. for lnum,row in enumerate(mfile):
  234. line = row.strip()
  235. if not line: continue
  236. chrom, snp, genpos, abspos = line.split()
  237. try:
  238. ic = int(chrom)
  239. except:
  240. ic = None
  241. if ic and ic <= 23:
  242. if chromosome == '':
  243. chromosome = chrom
  244. chroms.setdefault(chrom,chrom)
  245. try:
  246. p = int(abspos)
  247. if p < spos and p <> 0:
  248. spos = p
  249. if p > epos and p <> 0:
  250. epos = p
  251. except:
  252. pass
  253. markers.append('%s %s' % (snp,abspos)) # no sort - pass
  254. # now have spos and epos for hapmap if hmpanels
  255. self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
  256. self.tempMap = file(self.tempMapName,'w')
  257. self.tempMap.write('\n'.join(markers))
  258. self.tempMap.close()
  259. self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
  260. try: # will fail on winblows!
  261. os.symlink(self.DATA_FILE,self.tempPedName)
  262. except:
  263. shutil.copy(self.DATA_FILE,self.tempPedName) # wasteful but..
  264. self.nchroms = len(chroms) # if > 1 can't really do this safely
  265. dfile.close()
  266. mfile.close()
  267. self.spos = spos
  268. self.epos = epos
  269. self.chromosome = chromosome
  270. if self.nchroms > 1:
  271. s = '## warning - multiple chromosomes found in your map file - %s\n' % ','.join(chroms.keys())
  272. self.lf.write(s)
  273. print >> sys.stdout,s
  274. sys.exit(1)
  275. def run(self,vcl):
  276. """
  277. """
  278. p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
  279. retval = p.wait()
  280. self.lf.write('## executing %s returned %d\n' % (vcl,retval))
  281. def plotHmPanels(self,ste):
  282. """
  283. """
  284. sp = '%d' % (self.spos/1000.) # hapmap wants kb
  285. ep = '%d' % (self.epos/1000.)
  286. fnum=0
  287. for panel in self.hmpanels:
  288. if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :)
  289. ptran = panel.strip()
  290. ptran = ptran.replace('+','_')
  291. fnum += 1 # preserve an order or else we get sorted
  292. vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,
  293. '-chromosome',self.chromosome, '-panel',panel.strip(),
  294. '-hapmapDownload','-startpos',sp,'-endpos',ep,
  295. '-ldcolorscheme',self.ldType]
  296. if self.minMaf:
  297. vcl += ['-minMaf','%f' % self.minMaf]
  298. if self.maxDist:
  299. vcl += ['-maxDistance',self.maxDist]
  300. if self.hiRes:
  301. vcl.append('-png')
  302. else:
  303. vcl.append('-compressedpng')
  304. if self.infotrack:
  305. vcl.append('-infoTrack')
  306. p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf)
  307. retval = p.wait()
  308. inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel)
  309. inpng = inpng.replace(' ','') # mysterious spaces!
  310. outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome)
  311. # hack for stupid chb+jpt
  312. outpng = outpng.replace(' ','')
  313. tmppng = '%s.tmp.png' % self.title
  314. tmppng = tmppng.replace(' ','')
  315. outpng = os.path.split(outpng)[-1]
  316. vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
  317. self.run(' '.join(vcl))
  318. s = "text 10,300 'HapMap %s'" % ptran.strip()
  319. vcl = [self.convert, '-pointsize 25','-fill maroon',
  320. '-draw "%s"' % s, tmppng, outpng]
  321. self.run(' '.join(vcl))
  322. try:
  323. os.remove(os.path.join(self.outfpath,tmppng))
  324. except:
  325. pass
  326. def doPlots(self):
  327. """
  328. """
  329. DATA_FILE = self.tempPedName # for haploview
  330. INFO_FILE = self.tempMapName
  331. fblog,blog = tempfile.mkstemp()
  332. ste = open(blog,'w') # to catch the blather
  333. # if no need to rewrite - set up names for haploview call
  334. vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,'-pairwiseTagging',
  335. '-pedfile',DATA_FILE,'-info',INFO_FILE,'-tagrsqcounts',
  336. '-tagrsqcutoff',self.tagr2, '-ldcolorscheme',self.ldType]
  337. if self.minMaf:
  338. vcl += ['-minMaf','%f' % self.minMaf]
  339. if self.maxDist:
  340. vcl += ['-maxDistance',self.maxDist]
  341. if self.hiRes:
  342. vcl.append('-png')
  343. else:
  344. vcl.append('-compressedpng')
  345. if self.nchroms == 1:
  346. vcl += ['-chromosome',self.chromosome]
  347. if self.infotrack:
  348. vcl.append('-infoTrack')
  349. self.run(' '.join(vcl))
  350. vcl = [self.mogrify, '-resize 800x400!', '*.PNG']
  351. self.run(' '.join(vcl))
  352. inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle
  353. inpng = inpng.replace(' ','')
  354. inpng = os.path.split(inpng)[-1]
  355. tmppng = '%s.tmp.png' % self.title
  356. tmppng = tmppng.replace(' ','')
  357. outpng = '1_%s.png' % self.title
  358. outpng = outpng.replace(' ','')
  359. outpng = os.path.split(outpng)[-1]
  360. vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
  361. self.run(' '.join(vcl))
  362. s = "text 10,300 '%s'" % self.title[:40]
  363. vcl = [self.convert, '-pointsize 25','-fill maroon',
  364. '-draw "%s"' % s, tmppng, outpng]
  365. self.run(' '.join(vcl))
  366. try:
  367. os.remove(os.path.join(self.outfpath,tmppng))
  368. except:
  369. pass # label all the plots then delete all the .PNG files before munging
  370. fnum=1
  371. if self.hmpanels:
  372. self.plotHmPanels(ste)
  373. nimages = len(glob.glob(os.path.join(self.outfpath,'*.png'))) # rely on HaploView shouting - PNG @!
  374. self.lf.write('### nimages=%d\n' % nimages)
  375. if nimages > 0: # haploview may fail?
  376. vcl = '%s -format pdf -resize 800x400! *.png' % self.mogrify
  377. self.run(vcl)
  378. vcl = '%s *.pdf --fitpaper true --outfile alljoin.pdf' % self.pdfjoin
  379. self.run(vcl)
  380. vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (self.pdfnup,nimages)
  381. self.run(vcl)
  382. vcl = '%s -resize x300 allnup.pdf allnup.png' % (self.convert)
  383. self.run(vcl)
  384. ste.close() # temp file used to catch haploview blather
  385. hblather = open(blog,'r').readlines() # to catch the blather
  386. os.unlink(blog)
  387. if len(hblather) > 0:
  388. self.lf.write('## In addition, Haploview complained:')
  389. self.lf.write(''.join(hblather))
  390. self.lf.write('\n')
  391. self.lf.close()
  392. def writeHtml(self):
  393. """
  394. """
  395. flist = glob.glob(os.path.join(self.outfpath, '*'))
  396. flist.sort()
  397. ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace
  398. ftran = string.maketrans(ts,'_'*len(ts))
  399. outf = file(self.outfile,'w')
  400. outf.write(galhtmlprefix % progname)
  401. s = '<h4>rgenetics for Galaxy %s, wrapping HaploView</h4>' % (progname)
  402. outf.write(s)
  403. mainthumb = 'allnup.png'
  404. mainpdf = 'allnup.pdf'
  405. if os.path.exists(os.path.join(self.outfpath,mainpdf)):
  406. if not os.path.exists(os.path.join(self.outfpath,mainthumb)):
  407. outf.write('<table><tr><td colspan="3"><a href="%s">Main combined LD plot</a></td></tr></table>\n' % (mainpdf))
  408. else:
  409. outf.write('<table><tr><td><a href="%s"><img src="%s" title="Main combined LD image" hspace="10" align="middle">' % (mainpdf,mainthumb))
  410. outf.write('</td><td>Click the thumbnail at left to download the main combined LD image <a href=%s>%s</a></td></tr></table>\n' % (mainpdf,mainpdf))
  411. else:
  412. outf.write('(No main image was generated - this usually means a Haploview error connecting to Hapmap site - please try later)<br/>\n')
  413. outf.write('<br><div><hr><ul>\n')
  414. for i, data in enumerate( flist ):
  415. dn = os.path.split(data)[-1]
  416. if dn[:3] <> 'all':
  417. continue
  418. newdn = dn.translate(ftran)
  419. if dn <> newdn:
  420. os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
  421. dn = newdn
  422. dnlabel = dn
  423. ext = dn.split('.')[-1]
  424. if dn == 'allnup.pdf':
  425. dnlabel = 'All pdf plots on a single page'
  426. elif dn == 'alljoin.pdf':
  427. dnlabel = 'All pdf plots, each on a separate page'
  428. outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
  429. for i, data in enumerate( flist ):
  430. dn = os.path.split(data)[-1]
  431. if dn[:3] == 'all':
  432. continue
  433. newdn = dn.translate(ftran)
  434. if dn <> newdn:
  435. os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
  436. dn = newdn
  437. dnlabel = dn
  438. ext = dn.split('.')[-1]
  439. if dn == 'allnup.pdf':
  440. dnlabel = 'All pdf plots on a single page'
  441. elif dn == 'alljoin.pdf':
  442. dnlabel = 'All pdf plots, each on a separate page'
  443. elif ext == 'info':
  444. dnlabel = '%s map data for Haploview input' % self.title
  445. elif ext == 'ped':
  446. dnlabel = '%s genotype data for Haploview input' % self.title
  447. elif dn.find('CEU') <> -1 or dn.find('YRI') <> -1 or dn.find('CHB_JPT') <> -1: # is hapmap
  448. dnlabel = 'Hapmap data'
  449. if ext == 'TAGS' or ext == 'TESTS' or ext == 'CHAPS':
  450. dnlabel = dnlabel + ' Tagger output'
  451. outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
  452. outf.write('</ol><br>')
  453. outf.write("</div><div><hr>Job Log follows below (see %s)<pre>" % self.logfn)
  454. s = file(self.log_file,'r').readlines()
  455. s = '\n'.join(s)
  456. outf.write('%s</pre><hr></div>' % s)
  457. outf.write('</body></html>')
  458. outf.close()
  459. if self.useTemp:
  460. try:
  461. os.unlink(self.tempMapName)
  462. os.unlink(self.tempPedName)
  463. except:
  464. pass
  465. if __name__ == "__main__":
  466. """ ### Sanity check the arguments
  467. <command interpreter="python">
  468. rgHaploView.py "$ucsc_region" "$rslist" "$title" "$out_file1"
  469. "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"
  470. "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$out_file1.files_path"
  471. "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/rg/bin/haploview.jar
  472. </command>
  473. remember to figure out chromosome and complain if > 1?
  474. and use the -chromosome <1-22,X,Y> parameter to haploview
  475. skipcheck?
  476. """
  477. progname = os.path.split(sys.argv[0])[-1]
  478. if len(sys.argv) < 16:
  479. s = '##!%s: Expected 16 params in sys.argv, got %d (%s)' % (progname,len(sys.argv), sys.argv)
  480. print s
  481. sys.exit(1)
  482. ld = ldPlot(argv = sys.argv)
  483. ld.doPlots()
  484. ld.writeHtml()