PageRenderTime 50ms CodeModel.GetById 9ms app.highlight 35ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/rgenetics/rgHaploView.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 513 lines | 482 code | 5 blank | 26 comment | 45 complexity | 6924fdd9691208666a71c9611fd38829 MD5 | raw file
  1"""
  2released under the terms of the LGPL
  3copyright ross lazarus August 2007
  4for the rgenetics project
  5
  6Special galaxy tool for the camp2007 data
  7Allows grabbing genotypes from an arbitrary region and estimating
  8ld using haploview
  9
 10stoopid haploview won't allow control of dest directory for plots - always end
 11up where the data came from - need to futz to get it where it belongs
 12
 13Needs a mongo results file in the location hardwired below or could be passed in as
 14a library parameter - but this file must have a very specific structure
 15rs chrom offset float1...floatn
 16
 17
 18"""
 19
 20
 21import sys, array, os, string, tempfile, shutil, subprocess, glob
 22from rgutils import galhtmlprefix
 23
 24progname = os.path.split(sys.argv[0])[1]
 25
 26javabin = 'java'
 27#hvbin = '/usr/local/bin/Haploview.jar'
 28#hvbin = '/home/universe/linux-i686/haploview/Haploview.jar'
 29# get this from tool as a parameter - can use 
 30
 31
 32
 33atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
 34
 35class NullDevice:
 36    """ a dev/null for ignoring output
 37    """
 38    def write(self, s):
 39        pass
 40
 41class ldPlot:
 42    
 43    def __init__(self, argv=[]):
 44        """
 45        setup
 46        """
 47        self.args=argv
 48        self.parseArgs(argv=self.args)
 49        self.setupRegions()
 50                
 51    def parseArgs(self,argv=[]):
 52        """
 53        """
 54        ts = '%s%s' % (string.punctuation,string.whitespace)
 55        ptran =  string.maketrans(ts,'_'*len(ts))
 56        ### Figure out what genomic region we are interested in
 57        self.region = argv[1]
 58        self.orslist = argv[2].replace('X',' ').lower() # galaxy replaces newlines with XX - go figure
 59        self.title = argv[3].translate(ptran)
 60        # for outputs
 61        self.outfile = argv[4]
 62        self.logfn = 'Log_%s.txt' % (self.title)
 63        self.histextra = argv[5]
 64        self.base_name = argv[6]
 65        self.pedFileBase = os.path.join(self.histextra,self.base_name)
 66        print 'pedfilebase=%s' % self.pedFileBase
 67        self.minMaf=argv[7]
 68        if self.minMaf:
 69            try:
 70                self.minMaf = float(self.minMaf)
 71            except:
 72                self.minMaf = 0.0
 73        self.maxDist=argv[8] or None
 74        self.ldType=argv[9] or 'RSQ'
 75        self.hiRes = (argv[10].lower() == 'hi')
 76        self.memSize= argv[11] or '1000'
 77        self.memSize = int(self.memSize)
 78        self.outfpath = argv[12]
 79        self.infotrack = False # note that otherwise this breaks haploview in headless mode 
 80        #infotrack = argv[13] == 'info'
 81        # this fails in headless mode as at april 2010 with haploview 4.2
 82        self.tagr2 = argv[14] or '0.8'
 83        hmpanels = argv[15] # eg "['CEU','YRI']"
 84        if hmpanels:
 85           hmpanels = hmpanels.replace('[','')
 86           hmpanels = hmpanels.replace(']','')
 87           hmpanels = hmpanels.replace("'",'')
 88           hmpanels = hmpanels.split(',')
 89        self.hmpanels = hmpanels
 90        self.hvbin = argv[16] # added rml june 2008
 91        self.bindir = os.path.split(self.hvbin)[0]
 92        # jan 2010 - always assume utes are on path to avoid platform problems
 93        self.pdfjoin = 'pdfjoin' # os.path.join(bindir,'pdfjoin')
 94        self.pdfnup = 'pdfnup' # os.path.join(bindir,'pdfnup')
 95        self.mogrify = 'mogrify' # os.path.join(bindir,'mogrify')
 96        self.convert = 'convert' # os.path.join(bindir,'convert')
 97        self.log_file = os.path.join(self.outfpath,self.logfn)
 98        self.MAP_FILE = '%s.map' % self.pedFileBase
 99        self.DATA_FILE = '%s.ped' % self.pedFileBase
100        try:
101            os.makedirs(self.outfpath)
102            s = '## made new path %s\n' % self.outfpath
103        except:
104            pass
105        self.lf = file(self.log_file,'w')
106        s = 'PATH=%s\n' % os.environ.get('PATH','?')
107        self.lf.write(s)
108
109    def getRs(self):
110        if self.region > '':
111            useRs = []
112            useRsdict={}
113            try: # TODO make a regexp?
114                c,rest = self.region.split(':')
115                chromosome = c.replace('chr','')
116                rest = rest.replace(',','') # remove commas
117                spos,epos = rest.split('-')
118                spos = int(spos)
119                epos = int(epos)
120                s = '## %s parsing chrom %s from %d to %d\n' % (progname,chromosome,spos,epos)
121                self.lf.write(s)
122                self.lf.write('\n')
123                print >> sys.stdout, s
124            except:
125                s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,self.region)
126                print >> sys.stdout, s
127                self.lf.write(s)
128                self.lf.write('\n')
129                self.lf.close()
130                sys.exit(1)
131        else:
132            useRs = self.orslist.split() # galaxy replaces newlines with XX - go figure
133            useRsdict = dict(zip(useRs,useRs))
134        return useRs, useRsdict
135
136    
137    def setupRegions(self):
138        """
139        This turns out to be complex because we allow the user
140        flexibility - paste a list of rs or give a region.
141        In most cases, some subset has to be generated correctly before running Haploview
142        """
143        chromosome = ''
144        spos = epos = -9
145        rslist = []
146        rsdict = {}
147        useRs,useRsdict = self.getRs()
148        self.useTemp = False
149        try:
150            dfile = open(self.DATA_FILE, 'r')
151        except: # bad input file name?
152            s = '##! RGeno unable to open file %s\n' % (self.DATA_FILE)
153            self.lf.write(s)
154            self.lf.write('\n')
155            self.lf.close()
156            print >> sys.stdout, s
157            raise
158            sys.exit(1)
159        try:
160            mfile = open(self.MAP_FILE, 'r')
161        except: # bad input file name?
162            s = '##! RGeno unable to open file %s' % (self.MAP_FILE)
163            lf.write(s)
164            lf.write('\n')
165            lf.close()
166            print >> sys.stdout, s
167            raise
168            sys.exit(1)
169        if len(useRs) > 0 or spos <> -9 : # subset region
170            self.useTemp = True
171            ### Figure out which markers are in this region
172            markers = []
173            snpcols = {}
174            chroms = {}
175            minpos = 2**32
176            maxpos = 0
177            for lnum,row in enumerate(mfile):
178                line = row.strip()
179                if not line: continue
180                chrom, snp, genpos, abspos = line.split()
181                try:
182                    ic = int(chrom)
183                except:
184                    ic = None
185                if ic and ic <= 23:
186                    try:
187                        abspos = int(abspos)
188                        if abspos > maxpos:
189                            maxpos = abspos
190                        if abspos < minpos:
191                            minpos = abspos
192                    except:
193                        abspos = epos + 999999999 # so next test fails
194                if useRsdict.get(snp,None) or (spos <> -9 and chrom == chromosome and (spos <= abspos <= epos)):
195                    if chromosome == '':
196                        chromosome = chrom
197                    chroms.setdefault(chrom,chrom)
198                    markers.append((chrom,abspos,snp)) # decorate for sort into genomic
199                    snpcols[snp] = lnum # so we know which col to find genos for this marker
200            markers.sort()
201            rslist = [x[2] for x in markers] # drop decoration
202            rsdict = dict(zip(rslist,rslist))
203            if len(rslist) == 0:
204                s = '##! %s: Found no rs numbers matching %s' % (progname,self.args[1:3])
205                self.lf.write(s)
206                self.lf.write('\n')
207                self.lf.close()
208                print >> sys.stdout, s
209                sys.exit(1)
210            if spos == -9:
211                spos = minpos
212                epos = maxpos
213            s = '## %s looking for %d rs (%s)' % (progname,len(rslist),rslist[:5])
214            self.lf.write(s)
215            print >> sys.stdout, s
216            wewant = [(6+(2*snpcols[x])) for x in rslist] #
217            # column indices of first geno of each marker pair to get the markers into genomic
218            ### ... and then parse the rest of the ped file to pull out
219            ### the genotypes for all subjects for those markers
220            # /usr/local/galaxy/data/rg/1/lped/
221            self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
222            self.tempMap = file(self.tempMapName,'w')
223            self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
224            self.tempPed = file(self.tempPedName,'w')
225            self.pngpath = '%s.LD.PNG' % self.tempPedName
226            map = ['%s\t%s' % (x[2],x[1]) for x in markers] # snp,abspos in genomic order for haploview
227            self.tempMap.write('%s\n' % '\n'.join(map))
228            self.tempMap.close()
229            nrows = 0
230            for line in dfile:
231                line = line.strip()
232                if not line:
233                    continue
234                fields = line.split()
235                preamble = fields[:6]
236                g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
237                g = ' '.join(g)
238                g = g.split() # we'll get there
239                g = [atrandic.get(x,'0') for x in g] # numeric alleles...
240                self.tempPed.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
241                nrows += 1
242            self.tempPed.close()
243            s = '## %s: wrote %d markers, %d subjects for region %s\n' % (progname,len(rslist),nrows,self.region)
244            self.lf.write(s)
245            self.lf.write('\n')
246            print >> sys.stdout,s
247        else: # even if using all, must set up haploview info file instead of map
248            markers = []
249            chroms = {}
250            spos = sys.maxint
251            epos = -spos
252            for lnum,row in enumerate(mfile):
253              line = row.strip()
254              if not line: continue
255              chrom, snp, genpos, abspos = line.split()
256              try:
257                ic = int(chrom)
258              except:
259                ic = None
260              if ic and ic <= 23:
261                if chromosome == '':
262                    chromosome = chrom
263                chroms.setdefault(chrom,chrom)
264                try:
265                    p = int(abspos)
266                    if p < spos and p <> 0:
267                        spos = p
268                    if p > epos and p <> 0:
269                        epos = p
270                except:
271                    pass
272                markers.append('%s %s' % (snp,abspos)) # no sort - pass
273            # now have spos and epos for hapmap if hmpanels
274            self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
275            self.tempMap = file(self.tempMapName,'w')
276            self.tempMap.write('\n'.join(markers))
277            self.tempMap.close()
278            self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
279            try: # will fail on winblows!
280                os.symlink(self.DATA_FILE,self.tempPedName)
281            except:
282                shutil.copy(self.DATA_FILE,self.tempPedName) # wasteful but..
283        self.nchroms = len(chroms) # if > 1 can't really do this safely
284        dfile.close()
285        mfile.close()
286        self.spos = spos
287        self.epos = epos
288        self.chromosome = chromosome
289        if self.nchroms > 1:
290            s = '## warning - multiple chromosomes found in your map file - %s\n' % ','.join(chroms.keys())
291            self.lf.write(s)
292            print >> sys.stdout,s
293            sys.exit(1)
294
295    def run(self,vcl):
296        """
297        """
298        p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
299        retval = p.wait()
300        self.lf.write('## executing %s returned %d\n' % (vcl,retval))
301
302    def plotHmPanels(self,ste):
303        """
304        """
305        sp = '%d' % (self.spos/1000.) # hapmap wants kb
306        ep = '%d' % (self.epos/1000.)
307        fnum=0
308        for panel in self.hmpanels:
309            if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :)
310                ptran = panel.strip()
311                ptran = ptran.replace('+','_')
312                fnum += 1 # preserve an order or else we get sorted
313                vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,
314                  '-chromosome',self.chromosome, '-panel',panel.strip(),
315                  '-hapmapDownload','-startpos',sp,'-endpos',ep,
316                  '-ldcolorscheme',self.ldType]
317                if self.minMaf:
318                    vcl += ['-minMaf','%f' % self.minMaf]
319                if self.maxDist:
320                    vcl += ['-maxDistance',self.maxDist]
321                if self.hiRes:
322                    vcl.append('-png')
323                else:
324                    vcl.append('-compressedpng')
325                if self.infotrack:
326                    vcl.append('-infoTrack')
327                p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf)
328                retval = p.wait()
329                inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel)
330                inpng = inpng.replace(' ','') # mysterious spaces!
331                outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome)
332                # hack for stupid chb+jpt
333                outpng = outpng.replace(' ','')
334                tmppng = '%s.tmp.png' % self.title
335                tmppng = tmppng.replace(' ','')
336                outpng = os.path.split(outpng)[-1]
337                vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
338                self.run(' '.join(vcl))
339                s = "text 10,300 'HapMap %s'" % ptran.strip()
340                vcl = [self.convert, '-pointsize 25','-fill maroon',
341                      '-draw "%s"' % s, tmppng, outpng]
342                self.run(' '.join(vcl))
343                try:
344                    os.remove(os.path.join(self.outfpath,tmppng))
345                except:
346                    pass
347
348    def doPlots(self):
349        """
350        """    
351        DATA_FILE = self.tempPedName # for haploview
352        INFO_FILE = self.tempMapName
353        fblog,blog = tempfile.mkstemp()
354        ste = open(blog,'w') # to catch the blather
355        # if no need to rewrite - set up names for haploview call
356        vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,'-pairwiseTagging',
357               '-pedfile',DATA_FILE,'-info',INFO_FILE,'-tagrsqcounts',
358               '-tagrsqcutoff',self.tagr2, '-ldcolorscheme',self.ldType]
359        if self.minMaf:
360            vcl += ['-minMaf','%f' % self.minMaf]
361        if self.maxDist:
362            vcl += ['-maxDistance',self.maxDist]
363        if self.hiRes:
364            vcl.append('-png')
365        else:
366            vcl.append('-compressedpng')
367        if self.nchroms == 1:
368            vcl += ['-chromosome',self.chromosome]
369        if self.infotrack:
370            vcl.append('-infoTrack')
371        self.run(' '.join(vcl))
372        vcl = [self.mogrify, '-resize 800x400!', '*.PNG']
373        self.run(' '.join(vcl))
374        inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle
375        inpng = inpng.replace(' ','')
376        inpng = os.path.split(inpng)[-1]
377        tmppng = '%s.tmp.png' % self.title
378        tmppng = tmppng.replace(' ','')
379        outpng = '1_%s.png' % self.title
380        outpng = outpng.replace(' ','')
381        outpng = os.path.split(outpng)[-1]
382        vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
383        self.run(' '.join(vcl))
384        s = "text 10,300 '%s'" % self.title[:40]
385        vcl = [self.convert, '-pointsize 25','-fill maroon',
386              '-draw "%s"' % s, tmppng, outpng]
387        self.run(' '.join(vcl))
388        try:
389            os.remove(os.path.join(self.outfpath,tmppng))
390        except:
391            pass    # label all the plots then delete all the .PNG files before munging
392        fnum=1
393        if self.hmpanels:
394            self.plotHmPanels(ste)
395        nimages = len(glob.glob(os.path.join(self.outfpath,'*.png'))) # rely on HaploView shouting - PNG @!
396        self.lf.write('### nimages=%d\n' % nimages)
397        if nimages > 0: # haploview may fail?
398            vcl = '%s -format pdf -resize 800x400! *.png' % self.mogrify
399            self.run(vcl)
400            vcl = '%s *.pdf --fitpaper true --outfile alljoin.pdf' % self.pdfjoin
401            self.run(vcl)
402            vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (self.pdfnup,nimages)
403            self.run(vcl)
404            vcl = '%s -resize x300 allnup.pdf allnup.png' % (self.convert)
405            self.run(vcl)
406        ste.close() # temp file used to catch haploview blather
407        hblather = open(blog,'r').readlines() # to catch the blather    
408        os.unlink(blog)
409        if len(hblather) > 0:
410           self.lf.write('## In addition, Haploview complained:')
411           self.lf.write(''.join(hblather))
412           self.lf.write('\n')
413        self.lf.close()
414        
415    def writeHtml(self):
416        """
417        """
418        flist = glob.glob(os.path.join(self.outfpath, '*'))
419        flist.sort()
420        ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace
421        ftran =  string.maketrans(ts,'_'*len(ts))
422        outf = file(self.outfile,'w')
423        outf.write(galhtmlprefix % progname)
424        s = '<h4>rgenetics for Galaxy %s, wrapping HaploView</h4>' % (progname)
425        outf.write(s)
426        mainthumb = 'allnup.png'
427        mainpdf = 'allnup.pdf'
428        if os.path.exists(os.path.join(self.outfpath,mainpdf)):
429            if not os.path.exists(os.path.join(self.outfpath,mainthumb)):
430                outf.write('<table><tr><td colspan="3"><a href="%s">Main combined LD plot</a></td></tr></table>\n' % (mainpdf))
431            else:
432                outf.write('<table><tr><td><a href="%s"><img src="%s" title="Main combined LD image" hspace="10" align="middle">' % (mainpdf,mainthumb))
433                outf.write('</td><td>Click the thumbnail at left to download the main combined LD image <a href=%s>%s</a></td></tr></table>\n' % (mainpdf,mainpdf))
434        else:
435            outf.write('(No main image was generated - this usually means a Haploview error connecting to Hapmap site - please try later)<br/>\n')
436        outf.write('<br><div><hr><ul>\n')
437        for i, data in enumerate( flist ):
438            dn = os.path.split(data)[-1]
439            if dn[:3] <> 'all':
440                continue
441            newdn = dn.translate(ftran)
442            if dn <> newdn:
443                os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
444                dn = newdn
445            dnlabel = dn
446            ext = dn.split('.')[-1]
447            if dn == 'allnup.pdf':
448                dnlabel = 'All pdf plots on a single page'
449            elif dn == 'alljoin.pdf':
450                dnlabel = 'All pdf plots, each on a separate page'
451            outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
452        for i, data in enumerate( flist ):
453            dn = os.path.split(data)[-1]
454            if dn[:3] == 'all':
455                continue
456            newdn = dn.translate(ftran)
457            if dn <> newdn:
458                os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
459                dn = newdn
460            dnlabel = dn
461            ext = dn.split('.')[-1]
462            if dn == 'allnup.pdf':
463                dnlabel = 'All pdf plots on a single page'
464            elif dn == 'alljoin.pdf':
465                dnlabel = 'All pdf plots, each on a separate page'
466            elif ext == 'info':
467                dnlabel = '%s map data for Haploview input' % self.title
468            elif ext == 'ped':
469                dnlabel = '%s genotype data for Haploview input' % self.title
470            elif dn.find('CEU') <> -1 or dn.find('YRI') <> -1 or dn.find('CHB_JPT') <> -1: # is hapmap
471                dnlabel = 'Hapmap data'
472            if ext == 'TAGS' or ext == 'TESTS' or ext == 'CHAPS':
473                dnlabel = dnlabel + ' Tagger output'
474            outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
475        outf.write('</ol><br>')
476        outf.write("</div><div><hr>Job Log follows below (see %s)<pre>" % self.logfn)
477        s = file(self.log_file,'r').readlines()
478        s = '\n'.join(s)
479        outf.write('%s</pre><hr></div>' % s)
480        outf.write('</body></html>')
481        outf.close()
482        if self.useTemp:
483            try:
484                os.unlink(self.tempMapName)
485                os.unlink(self.tempPedName)
486            except:
487                pass
488        
489if __name__ == "__main__":
490    """  ### Sanity check the arguments
491
492    <command interpreter="python">
493    rgHaploView.py "$ucsc_region" "$rslist" "$title" "$out_file1"
494    "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"
495    "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$out_file1.files_path"
496    "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/rg/bin/haploview.jar
497    </command>
498
499    remember to figure out chromosome and complain if > 1?
500    and use the -chromosome <1-22,X,Y> parameter to haploview
501    skipcheck?
502    """
503    progname = os.path.split(sys.argv[0])[-1]
504    if len(sys.argv) < 16:
505        s = '##!%s: Expected 16 params in sys.argv, got %d (%s)' % (progname,len(sys.argv), sys.argv)
506        print s
507        sys.exit(1)
508    ld = ldPlot(argv = sys.argv)
509    ld.doPlots()
510    ld.writeHtml()
511
512
513