PageRenderTime 99ms CodeModel.GetById 15ms app.highlight 74ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/picard/picard_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 734 lines | 709 code | 8 blank | 17 comment | 33 complexity | 7a56cf80605c53b2976692d444bd3498 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3Originally written by Kelly Vincent
  4pretty output and additional picard wrappers by Ross Lazarus for rgenetics
  5Runs all available wrapped Picard tools.
  6usage: picard_wrapper.py [options]
  7code Ross wrote licensed under the LGPL
  8see http://www.gnu.org/copyleft/lesser.html
  9"""
 10
 11import optparse, os, sys, subprocess, tempfile, shutil, time, logging
 12
 13galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
 14<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 15<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 16<head>
 17<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 18<meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
 19<title></title>
 20<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
 21</head>
 22<body>
 23<div class="document">
 24"""
 25galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
 26galhtmlpostfix = """</div></body></html>\n"""
 27
 28
 29def stop_err( msg ):
 30    sys.stderr.write( '%s\n' % msg )
 31    sys.exit()
 32    
 33
 34def timenow():
 35    """return current time as a string
 36    """
 37    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
 38
 39
 40class PicardBase():
 41    """
 42    simple base class with some utilities for Picard
 43    adapted and merged with Kelly Vincent's code april 2011 Ross
 44    lots of changes...
 45    """
 46    
 47    def __init__(self, opts=None,arg0=None):
 48        """ common stuff needed at init for a picard tool
 49        """
 50        assert opts <> None, 'PicardBase needs opts at init'
 51        self.opts = opts
 52        if self.opts.outdir == None:
 53             self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
 54        assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
 55        self.picname = self.baseName(opts.jar)
 56        if self.picname.startswith('picard'):
 57            self.picname = opts.picard_cmd # special case for some tools like replaceheader?
 58        self.progname = self.baseName(arg0)
 59        self.version = '0.002'
 60        self.delme = [] # list of files to destroy
 61        self.title = opts.title
 62        self.inputfile = opts.input
 63        try:
 64            os.makedirs(opts.outdir)
 65        except:
 66            pass
 67        try:
 68            os.makedirs(opts.tmpdir)
 69        except:
 70            pass
 71        self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
 72        self.metricsOut =  os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
 73        self.setLogging(logfname=self.log_filename)
 74 
 75    def baseName(self,name=None):
 76        return os.path.splitext(os.path.basename(name))[0]
 77
 78    def setLogging(self,logfname="picard_wrapper.log"):
 79        """setup a logger
 80        """
 81        logging.basicConfig(level=logging.INFO,
 82                    filename=logfname,
 83                    filemode='a')
 84
 85
 86    def readLarge(self,fname=None):
 87        """ read a potentially huge file.
 88        """
 89        try:
 90            # get stderr, allowing for case where it's very large
 91            tmp = open( fname, 'rb' )
 92            s = ''
 93            buffsize = 1048576
 94            try:
 95                while True:
 96                    more = tmp.read( buffsize )
 97                    if len(more) > 0:
 98                        s += more
 99                    else:
100                        break
101            except OverflowError:
102                pass
103            tmp.close()
104        except Exception, e:
105            stop_err( 'Error : %s' % str( e ) )   
106        return s
107    
108    def runCL(self,cl=None,output_dir=None):
109        """ construct and run a command line
110        we have galaxy's temp path as opt.temp_dir so don't really need isolation
111        sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
112        """
113        assert cl <> None, 'PicardBase runCL needs a command line as cl'
114        if output_dir == None:
115            output_dir = self.opts.outdir
116        if type(cl) == type([]):
117            cl = ' '.join(cl)
118        fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
119        tlf = open(templog,'wb')
120        fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
121        tef = open(temperr,'wb')
122        process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
123        rval = process.wait()
124        tlf.close()
125        tef.close()
126        stderrs = self.readLarge(temperr)
127        stdouts = self.readLarge(templog)        
128        if len(stderrs) > 0:
129            s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
130        else:
131            s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
132        logging.info(s)
133        os.unlink(templog) # always
134        os.unlink(temperr) # always
135        return s, stdouts # sometimes this is an output
136    
137    def runPic(self, jar, cl):
138        """
139        cl should be everything after the jar file name in the command
140        """
141        runme = ['java -Xmx%s' % self.opts.maxjheap]
142        runme.append('-jar %s' % jar)
143        runme += cl
144        s,stdout = self.runCL(cl=runme, output_dir=self.opts.outdir)
145        return stdout
146
147    def samToBam(self,infile=None,outdir=None):
148        """
149        use samtools view to convert sam to bam
150        """
151        fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
152        cl = ['samtools view -h -b -S -o ',tempbam,infile]
153        tlog,stdouts = self.runCL(cl,outdir)
154        return tlog,tempbam
155
156    #def bamToSam(self,infile=None,outdir=None):
157    #    """
158    #    use samtools view to convert bam to sam
159    #    """
160    #    fd,tempsam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.sam')
161    #    cl = ['samtools view -h -o ',tempsam,infile]
162    #    tlog,stdouts = self.runCL(cl,outdir)
163    #    return tlog,tempsam
164
165    def sortSam(self, infile=None,outfile=None,outdir=None):
166        """
167        """
168        print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
169        cl = ['samtools sort',infile,outfile]
170        tlog,stdouts = self.runCL(cl,outdir)
171        return tlog
172
173    def cleanup(self):
174        for fname in self.delme:
175            try:
176                os.unlink(fname)
177            except:
178                pass
179                    
180    def prettyPicout(self,transpose,maxrows):
181        """organize picard outpouts into a report html page
182        """
183        res = []
184        try:
185            r = open(self.metricsOut,'r').readlines()
186        except:
187            r = []        
188        if len(r) > 0:
189            res.append('<b>Picard on line resources</b><ul>\n')
190            res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
191            res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
192            if transpose:
193                res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')       
194            else:
195                res.append('<b>Picard output</b><hr/>\n')  
196            res.append('<table cellpadding="3" >\n')
197            dat = []
198            heads = []
199            lastr = len(r) - 1
200            # special case for estimate library complexity hist
201            thist = False
202            for i,row in enumerate(r):
203                if row.strip() > '':
204                    srow = row.split('\t')
205                    if row.startswith('#'):
206                        heads.append(row.strip()) # want strings
207                    else:
208                        dat.append(srow) # want lists
209                    if row.startswith('## HISTOGRAM'):
210                        thist = True
211            if len(heads) > 0:
212                hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
213                res += hres
214                heads = []
215            if len(dat) > 0:
216                if transpose and not thist:
217                    tdat = map(None,*dat) # transpose an arbitrary list of lists
218                    tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)] 
219                else:
220                    tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
221                    tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
222                res += tdat
223                dat = []
224            res.append('</table>\n')   
225        return res
226
227    def fixPicardOutputs(self,transpose,maxloglines):
228        """
229        picard produces long hard to read tab header files
230        make them available but present them transposed for readability
231        """
232        logging.shutdown()
233        self.cleanup() # remove temp files stored in delme
234        rstyle="""<style type="text/css">
235        tr.d0 td {background-color: oldlace; color: black;}
236        tr.d1 td {background-color: aliceblue; color: black;}
237        </style>"""    
238        res = [rstyle,]
239        res.append(galhtmlprefix % self.progname)   
240        res.append(galhtmlattr % (self.picname,timenow()))
241        flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')] 
242        pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
243        if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
244            for p in pdflist:
245                imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf
246                res.append('<table cellpadding="10"><tr><td>\n')
247                res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref)) 
248                res.append('</tr></td></table>\n')   
249        if len(flist) > 0:
250            res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
251            res.append('<table>\n')
252            for i,f in enumerate(flist):
253                fn = os.path.split(f)[-1]
254                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
255            res.append('</table><p/>\n') 
256        pres = self.prettyPicout(transpose,maxloglines)
257        if len(pres) > 0:
258            res += pres
259        l = open(self.log_filename,'r').readlines()
260        llen = len(l)
261        if llen > 0: 
262            res.append('<b>Picard Tool Run Log</b><hr/>\n') 
263            rlog = ['<pre>',]
264            if llen > maxloglines:
265                n = min(50,int(maxloglines/2))
266                rlog += l[:n]
267                rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
268                rlog += l[-n:]
269            else:
270                rlog += l
271            rlog.append('</pre>')
272            if llen > maxloglines:
273                rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
274            res += rlog
275        else:
276            res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
277        res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n') 
278        res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')   
279        res.append(galhtmlpostfix) 
280        outf = open(self.opts.htmlout,'w')
281        outf.write(''.join(res))   
282        outf.write('\n')
283        outf.close()
284
285    def makePicInterval(self,inbed=None,outf=None):
286        """
287        picard wants bait and target files to have the same header length as the incoming bam/sam 
288        a meaningful (ie accurate) representation will fail because of this - so this hack
289        it would be far better to be able to supply the original bed untouched
290        """
291        assert inbed <> None
292        bed = open(inbed,'r').readlines()
293        thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
294        if self.opts.datatype == 'sam':
295            cl = ['samtools view -H -S',self.opts.input,'>',thead]
296        else:
297            cl = ['samtools view -H',self.opts.input,'>',thead]
298        self.runCL(cl=cl,output_dir=self.opts.outdir)
299        head = open(thead,'r').readlines()
300        s = '## got %d rows of header\n' % (len(head))
301        logging.info(s)
302        o = open(outf,'w')
303        o.write(''.join(head))
304        o.write(''.join(bed))
305        o.close()
306        return outf
307
308    def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
309        """
310        interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
311        Do the work of removing all the error sequences
312        pysam is cool
313        infile = pysam.Samfile( "-", "r" )
314        outfile = pysam.Samfile( "-", "w", template = infile )
315        for s in infile: outfile.write(s)
316
317        errors from ValidateSameFile.jar look like
318        WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
319        ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
320        ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
321
322        """
323        assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
324        assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
325        removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
326        remDict = dict(zip(removeNames,range(len(removeNames))))
327        infile = pysam.Samfile(insam,'rb')
328        info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
329        if len(removeNames) > 0:
330            outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
331            i = 0
332            j = 0
333            for row in infile:
334                dropme = remDict.get(row.qname,None) # keep if None
335                if not dropme:
336                    outfile.write(row)
337                    j += 1
338                else: # discard
339                    i += 1
340            info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
341            outfile.close()
342            infile.close()
343        else: # we really want a nullop or a simple pointer copy
344            infile.close()
345            if newsam:
346                shutil.copy(insam,newsam)
347        logging.info(info)
348                
349
350
351def __main__():
352    doFix = False # tools returning htmlfile don't need this
353    doTranspose = True # default
354    maxloglines = 100 # default 
355    #Parse Command Line
356    op = optparse.OptionParser()
357    # All tools
358    op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
359    op.add_option('-e', '--inputext', default=None)
360    op.add_option('-o', '--output', default=None)
361    op.add_option('-n', '--title', default="Pick a Picard Tool")
362    op.add_option('-t', '--htmlout', default=None)
363    op.add_option('-d', '--outdir', default=None)
364    op.add_option('-x', '--maxjheap', default='4g')
365    op.add_option('-b', '--bisulphite', default='false')
366    op.add_option('-s', '--sortorder', default='query')     
367    op.add_option('','--tmpdir', default='/tmp')
368    op.add_option('-j','--jar',default='')    
369    op.add_option('','--picard-cmd',default=None)    
370    # Many tools
371    op.add_option( '', '--output-format', dest='output_format', help='Output format' )
372    op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
373    op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
374    # CreateSequenceDictionary
375    op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
376    op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
377    op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
378    op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
379    # MarkDuplicates
380    op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
381    op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
382    # CollectInsertSizeMetrics
383    op.add_option('', '--taillimit', default="0")
384    op.add_option('', '--histwidth', default="0")
385    op.add_option('', '--minpct', default="0.01")
386    # CollectAlignmentSummaryMetrics
387    op.add_option('', '--maxinsert', default="20")
388    op.add_option('', '--adaptors', action='append', type="string")
389    # FixMateInformation and validate
390    # CollectGcBiasMetrics
391    op.add_option('', '--windowsize', default='100')
392    op.add_option('', '--mingenomefrac', default='0.00001')    
393    # AddOrReplaceReadGroups
394    op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
395    op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
396    op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
397    op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
398    op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
399    op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
400    op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
401    op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
402    # ReorderSam
403    op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
404    op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
405    # ReplaceSamHeader
406    op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
407
408    op.add_option('','--assumesorted', default='true') 
409    op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
410    #estimatelibrarycomplexity
411    op.add_option('','--minid', default="5")
412    op.add_option('','--maxdiff', default="0.03")
413    op.add_option('','--minmeanq', default="20")
414    #hsmetrics
415    op.add_option('','--baitbed', default=None)
416    op.add_option('','--targetbed', default=None)
417    #validate
418    op.add_option('','--ignoreflags', action='append', type="string")
419    op.add_option('','--maxerrors', default=None)
420    op.add_option('','--datatype', default=None)
421    op.add_option('','--bamout', default=None)
422    op.add_option('','--samout', default=None)
423
424    opts, args = op.parse_args()
425    opts.sortme = opts.assumesorted == 'false'
426    assert opts.input <> None
427    # need to add
428    # instance that does all the work
429    pic = PicardBase(opts,sys.argv[0])
430
431    tmp_dir = opts.outdir
432    haveTempout = False # we use this where sam output is an option
433
434    # set ref and dict files to use (create if necessary)
435    ref_file_name = opts.ref
436    if opts.ref_file <> None:
437        csd = 'CreateSequenceDictionary'
438        realjarpath = os.path.split(opts.jar)[0]
439        jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
440        tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
441        ref_file_name = '%s.fasta' % tmp_ref_name
442        # build dict
443        dict_file_name = '%s.dict' % tmp_ref_name
444        os.symlink( opts.ref_file, ref_file_name )
445        cl = ['REFERENCE=%s' % ref_file_name]
446        cl.append('OUTPUT=%s' % dict_file_name)
447        cl.append('URI=%s' % os.path.basename( opts.ref_file ))
448        cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
449        if opts.species_name:
450            cl.append('SPECIES=%s' % opts.species_name)
451        if opts.build_name:
452            cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
453        pic.delme.append(dict_file_name)
454        pic.delme.append(ref_file_name)
455        pic.delme.append(tmp_ref_name)
456        s = pic.runPic(jarpath, cl)
457        # run relevant command(s)
458
459    # define temporary output
460    # if output is sam, it must have that extension, otherwise bam will be produced
461    # specify sam or bam file with extension
462    if opts.output_format == 'sam':
463        suff = '.sam'
464    else:
465        suff = ''
466    tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
467
468    cl = ['VALIDATION_STRINGENCY=LENIENT',]
469
470    if pic.picname == 'AddOrReplaceReadGroups':
471        # sort order to match Galaxy's default
472        cl.append('SORT_ORDER=coordinate')
473        # input
474        cl.append('INPUT=%s' % opts.input)
475        # outputs
476        cl.append('OUTPUT=%s' % tempout)
477        # required read groups
478        cl.append('RGLB="%s"' % opts.rg_library)
479        cl.append('RGPL="%s"' % opts.rg_platform)
480        cl.append('RGPU="%s"' % opts.rg_plat_unit)
481        cl.append('RGSM="%s"' % opts.rg_sample)
482        if opts.rg_id:
483            cl.append('RGID="%s"' % opts.rg_id)
484        # optional read groups
485        if opts.rg_seq_center:
486            cl.append('RGCN="%s"' % opts.rg_seq_center)
487        if opts.rg_desc:
488            cl.append('RGDS="%s"' % opts.rg_desc)
489        pic.runPic(opts.jar, cl)
490        haveTempout = True
491
492    elif pic.picname == 'BamIndexStats':
493        tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
494        tmp_bam_name = '%s.bam' % tmp_name
495        tmp_bai_name = '%s.bai' % tmp_bam_name
496        os.symlink( opts.input, tmp_bam_name )
497        os.symlink( opts.bai_file, tmp_bai_name )
498        cl.append('INPUT=%s' % ( tmp_bam_name ))
499        pic.delme.append(tmp_bam_name)
500        pic.delme.append(tmp_bai_name)
501        pic.delme.append(tmp_name)
502        s = pic.runPic( opts.jar, cl )
503        f = open(pic.metricsOut,'a')
504        f.write(s) # got this on stdout from runCl
505        f.write('\n')
506        f.close()
507        doTranspose = False # but not transposed
508
509    elif pic.picname == 'EstimateLibraryComplexity':
510        cl.append('I=%s' % opts.input)
511        cl.append('O=%s' % pic.metricsOut)
512        if float(opts.minid) > 0:
513            cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
514        if float(opts.maxdiff) > 0.0:
515            cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
516        if float(opts.minmeanq) > 0:
517            cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
518        if opts.readregex > '':
519            cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
520        if float(opts.optdupdist) > 0:
521            cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
522        pic.runPic(opts.jar,cl)
523
524    elif pic.picname == 'CollectAlignmentSummaryMetrics':
525        # Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
526        # why? Dunno Seems to work without complaining if the .bai file is AWOL....
527        fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
528        try:
529            os.symlink(ref_file_name,fakefasta)
530        except:
531            s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
532            info = s
533            shutil.copy(ref_file_name,fakefasta)
534        pic.delme.append(fakefasta)
535        cl.append('ASSUME_SORTED=%s' % opts.assumesorted)
536        adaptorseqs = ''.join([' ADAPTER_SEQUENCE=%s' % x for x in opts.adaptors])
537        cl.append(adaptorseqs)
538        cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
539        cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
540        cl.append('OUTPUT=%s' % pic.metricsOut)
541        cl.append('R=%s' % fakefasta)
542        cl.append('TMP_DIR=%s' % opts.tmpdir)
543        if not opts.assumesorted.lower() == 'true': # we need to sort input
544            fakeinput = '%s.sorted' % opts.input
545            s = pic.sortSam(opts.input, fakeinput, opts.outdir)
546            pic.delme.append(fakeinput)
547            cl.append('INPUT=%s' % fakeinput)
548        else:
549            cl.append('INPUT=%s' % os.path.abspath(opts.input)) 
550        pic.runPic(opts.jar,cl)
551       
552        
553    elif pic.picname == 'CollectGcBiasMetrics':
554        assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
555        # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
556        # why? Dunno 
557        fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
558        try:
559            os.symlink(ref_file_name,fakefasta)
560        except:
561            s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
562            info = s
563            shutil.copy(ref_file_name,fakefasta)
564        pic.delme.append(fakefasta)
565        x = 'rgPicardGCBiasMetrics'
566        pdfname = '%s.pdf' % x
567        jpgname = '%s.jpg' % x
568        tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
569        temppdf = os.path.join(opts.outdir,pdfname)
570        cl.append('R=%s' % fakefasta)
571        cl.append('WINDOW_SIZE=%s' % opts.windowsize)
572        cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
573        cl.append('INPUT=%s' % opts.input)
574        cl.append('OUTPUT=%s' % tempout)
575        cl.append('TMP_DIR=%s' % opts.tmpdir)
576        cl.append('CHART_OUTPUT=%s' % temppdf)
577        cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
578        pic.runPic(opts.jar,cl)
579        if os.path.isfile(temppdf):
580            cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
581            s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir)
582        else:
583            s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
584        lf = open(pic.log_filename,'a')
585        lf.write(s)
586        lf.write('\n')
587        lf.close()
588        
589    elif pic.picname == 'CollectInsertSizeMetrics':
590        isPDF = 'InsertSizeHist.pdf'
591        pdfpath = os.path.join(opts.outdir,isPDF)
592        histpdf = 'InsertSizeHist.pdf'
593        cl.append('I=%s' % opts.input)
594        cl.append('O=%s' % pic.metricsOut)
595        cl.append('HISTOGRAM_FILE=%s' % histpdf)
596        if opts.taillimit <> '0':
597            cl.append('TAIL_LIMIT=%s' % opts.taillimit)
598        if  opts.histwidth <> '0':
599            cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
600        if float( opts.minpct) > 0.0:
601            cl.append('MINIMUM_PCT=%s' % opts.minpct)
602        pic.runPic(opts.jar,cl)   
603        if os.path.exists(pdfpath): # automake thumbnail - will be added to html 
604            cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
605            s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir)
606        else:
607            s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
608            s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
609            s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
610            s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
611            stdouts = ''
612        logging.info(s)
613        if len(stdouts) > 0:
614           logging.info(stdouts)
615        
616    elif pic.picname == 'MarkDuplicates':
617        # assume sorted even if header says otherwise
618        cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
619        # input
620        cl.append('INPUT=%s' % opts.input)
621        # outputs
622        cl.append('OUTPUT=%s' % opts.output) 
623        cl.append('METRICS_FILE=%s' % pic.metricsOut )
624        # remove or mark duplicates
625        cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
626        # the regular expression to be used to parse reads in incoming SAM file
627        cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
628        # maximum offset between two duplicate clusters
629        cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
630        pic.runPic(opts.jar, cl)
631
632    elif pic.picname == 'FixMateInformation':
633        cl.append('I=%s' % opts.input)
634        cl.append('O=%s' % tempout)
635        cl.append('SORT_ORDER=%s' % opts.sortorder)
636        pic.runPic(opts.jar,cl)
637        haveTempout = True
638        
639    elif pic.picname == 'ReorderSam':
640        # input
641        cl.append('INPUT=%s' % opts.input)
642        # output
643        cl.append('OUTPUT=%s' % tempout)
644        # reference
645        cl.append('REFERENCE=%s' % ref_file_name)
646        # incomplete dict concordance
647        if opts.allow_inc_dict_concord == 'true':
648            cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
649        # contig length discordance
650        if opts.allow_contig_len_discord == 'true':
651            cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
652        pic.runPic(opts.jar, cl)
653        haveTempout = True
654
655    elif pic.picname == 'ReplaceSamHeader':
656        cl.append('INPUT=%s' % opts.input)
657        cl.append('OUTPUT=%s' % tempout)
658        cl.append('HEADER=%s' % opts.header_file)
659        pic.runPic(opts.jar, cl)
660        haveTempout = True
661
662    elif pic.picname == 'CalculateHsMetrics':
663        maxloglines = 100
664        baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
665        targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
666        baitf = pic.makePicInterval(opts.baitbed,baitfname)
667        if opts.targetbed == opts.baitbed: # same file sometimes
668            targetf = baitf
669        else:
670            targetf = pic.makePicInterval(opts.targetbed,targetfname)   
671        cl.append('BAIT_INTERVALS=%s' % baitf)
672        cl.append('TARGET_INTERVALS=%s' % targetf)
673        cl.append('INPUT=%s' % os.path.abspath(opts.input))
674        cl.append('OUTPUT=%s' % pic.metricsOut)
675        cl.append('TMP_DIR=%s' % opts.tmpdir)
676        pic.runPic(opts.jar,cl)
677           
678    elif pic.picname == 'ValidateSamFile':
679        import pysam
680        doTranspose = False
681        sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
682        stf = open(pic.log_filename,'w')
683        tlog = None
684        if opts.datatype == 'sam': # need to work with a bam 
685            tlog,tempbam = pic.samToBam(opts.input,opts.outdir)
686            try:
687                tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
688            except:
689                print '## exception on sorting sam file %s' % opts.input
690        else: # is already bam
691            try:
692                tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
693            except: # bug - [bam_sort_core] not being ignored - TODO fixme
694                print '## exception on sorting bam file %s' % opts.input
695        if tlog:
696            print '##tlog=',tlog
697            stf.write(tlog)
698            stf.write('\n')
699        sortedfile = '%s.bam' % sortedfile # samtools does that      
700        cl.append('O=%s' % pic.metricsOut)
701        cl.append('TMP_DIR=%s' % opts.tmpdir)
702        cl.append('I=%s' % sortedfile)
703        opts.maxerrors = '99999999'
704        cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
705        if opts.ignoreflags[0] <> 'None': # picard error values to ignore
706            igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
707            cl.append(' '.join(igs))
708        if opts.bisulphite.lower() <> 'false':
709            cl.append('IS_BISULFITE_SEQUENCED=true')
710        if opts.ref <> None or opts.ref_file <> None:
711            cl.append('R=%s' %  ref_file_name)
712        pic.runPic(opts.jar,cl)
713        if opts.datatype == 'sam':
714            pic.delme.append(tempbam)
715        newsam = opts.output
716        outformat = 'bam'
717        pe = open(pic.metricsOut,'r').readlines()
718        pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
719        pic.delme.append(sortedfile) # not wanted
720        stf.close()
721        pic.cleanup()
722    else:
723        print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
724        sys.exit(1)
725    if haveTempout:
726        # Some Picard tools produced a potentially intermediate bam file. 
727        # Either just move to final location or create sam
728        shutil.move(tempout, os.path.abspath(opts.output))
729
730    if opts.htmlout <> None or doFix: # return a pretty html page
731        pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
732
733if __name__=="__main__": __main__()
734