/tools/picard/picard_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 734 lines · 587 code · 47 blank · 100 comment · 115 complexity · 7a56cf80605c53b2976692d444bd3498 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Originally written by Kelly Vincent
  4. pretty output and additional picard wrappers by Ross Lazarus for rgenetics
  5. Runs all available wrapped Picard tools.
  6. usage: picard_wrapper.py [options]
  7. code Ross wrote licensed under the LGPL
  8. see http://www.gnu.org/copyleft/lesser.html
  9. """
  10. import optparse, os, sys, subprocess, tempfile, shutil, time, logging
  11. galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
  12. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  13. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  14. <head>
  15. <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  16. <meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
  17. <title></title>
  18. <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
  19. </head>
  20. <body>
  21. <div class="document">
  22. """
  23. galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
  24. galhtmlpostfix = """</div></body></html>\n"""
  25. def stop_err( msg ):
  26. sys.stderr.write( '%s\n' % msg )
  27. sys.exit()
  28. def timenow():
  29. """return current time as a string
  30. """
  31. return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
  32. class PicardBase():
  33. """
  34. simple base class with some utilities for Picard
  35. adapted and merged with Kelly Vincent's code april 2011 Ross
  36. lots of changes...
  37. """
  38. def __init__(self, opts=None,arg0=None):
  39. """ common stuff needed at init for a picard tool
  40. """
  41. assert opts <> None, 'PicardBase needs opts at init'
  42. self.opts = opts
  43. if self.opts.outdir == None:
  44. self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
  45. assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
  46. self.picname = self.baseName(opts.jar)
  47. if self.picname.startswith('picard'):
  48. self.picname = opts.picard_cmd # special case for some tools like replaceheader?
  49. self.progname = self.baseName(arg0)
  50. self.version = '0.002'
  51. self.delme = [] # list of files to destroy
  52. self.title = opts.title
  53. self.inputfile = opts.input
  54. try:
  55. os.makedirs(opts.outdir)
  56. except:
  57. pass
  58. try:
  59. os.makedirs(opts.tmpdir)
  60. except:
  61. pass
  62. self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
  63. self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
  64. self.setLogging(logfname=self.log_filename)
  65. def baseName(self,name=None):
  66. return os.path.splitext(os.path.basename(name))[0]
  67. def setLogging(self,logfname="picard_wrapper.log"):
  68. """setup a logger
  69. """
  70. logging.basicConfig(level=logging.INFO,
  71. filename=logfname,
  72. filemode='a')
  73. def readLarge(self,fname=None):
  74. """ read a potentially huge file.
  75. """
  76. try:
  77. # get stderr, allowing for case where it's very large
  78. tmp = open( fname, 'rb' )
  79. s = ''
  80. buffsize = 1048576
  81. try:
  82. while True:
  83. more = tmp.read( buffsize )
  84. if len(more) > 0:
  85. s += more
  86. else:
  87. break
  88. except OverflowError:
  89. pass
  90. tmp.close()
  91. except Exception, e:
  92. stop_err( 'Error : %s' % str( e ) )
  93. return s
  94. def runCL(self,cl=None,output_dir=None):
  95. """ construct and run a command line
  96. we have galaxy's temp path as opt.temp_dir so don't really need isolation
  97. sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
  98. """
  99. assert cl <> None, 'PicardBase runCL needs a command line as cl'
  100. if output_dir == None:
  101. output_dir = self.opts.outdir
  102. if type(cl) == type([]):
  103. cl = ' '.join(cl)
  104. fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
  105. tlf = open(templog,'wb')
  106. fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
  107. tef = open(temperr,'wb')
  108. process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
  109. rval = process.wait()
  110. tlf.close()
  111. tef.close()
  112. stderrs = self.readLarge(temperr)
  113. stdouts = self.readLarge(templog)
  114. if len(stderrs) > 0:
  115. s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
  116. else:
  117. s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
  118. logging.info(s)
  119. os.unlink(templog) # always
  120. os.unlink(temperr) # always
  121. return s, stdouts # sometimes this is an output
  122. def runPic(self, jar, cl):
  123. """
  124. cl should be everything after the jar file name in the command
  125. """
  126. runme = ['java -Xmx%s' % self.opts.maxjheap]
  127. runme.append('-jar %s' % jar)
  128. runme += cl
  129. s,stdout = self.runCL(cl=runme, output_dir=self.opts.outdir)
  130. return stdout
  131. def samToBam(self,infile=None,outdir=None):
  132. """
  133. use samtools view to convert sam to bam
  134. """
  135. fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
  136. cl = ['samtools view -h -b -S -o ',tempbam,infile]
  137. tlog,stdouts = self.runCL(cl,outdir)
  138. return tlog,tempbam
  139. #def bamToSam(self,infile=None,outdir=None):
  140. # """
  141. # use samtools view to convert bam to sam
  142. # """
  143. # fd,tempsam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.sam')
  144. # cl = ['samtools view -h -o ',tempsam,infile]
  145. # tlog,stdouts = self.runCL(cl,outdir)
  146. # return tlog,tempsam
  147. def sortSam(self, infile=None,outfile=None,outdir=None):
  148. """
  149. """
  150. print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
  151. cl = ['samtools sort',infile,outfile]
  152. tlog,stdouts = self.runCL(cl,outdir)
  153. return tlog
  154. def cleanup(self):
  155. for fname in self.delme:
  156. try:
  157. os.unlink(fname)
  158. except:
  159. pass
  160. def prettyPicout(self,transpose,maxrows):
  161. """organize picard outpouts into a report html page
  162. """
  163. res = []
  164. try:
  165. r = open(self.metricsOut,'r').readlines()
  166. except:
  167. r = []
  168. if len(r) > 0:
  169. res.append('<b>Picard on line resources</b><ul>\n')
  170. res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
  171. res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
  172. if transpose:
  173. res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
  174. else:
  175. res.append('<b>Picard output</b><hr/>\n')
  176. res.append('<table cellpadding="3" >\n')
  177. dat = []
  178. heads = []
  179. lastr = len(r) - 1
  180. # special case for estimate library complexity hist
  181. thist = False
  182. for i,row in enumerate(r):
  183. if row.strip() > '':
  184. srow = row.split('\t')
  185. if row.startswith('#'):
  186. heads.append(row.strip()) # want strings
  187. else:
  188. dat.append(srow) # want lists
  189. if row.startswith('## HISTOGRAM'):
  190. thist = True
  191. if len(heads) > 0:
  192. hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
  193. res += hres
  194. heads = []
  195. if len(dat) > 0:
  196. if transpose and not thist:
  197. tdat = map(None,*dat) # transpose an arbitrary list of lists
  198. tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
  199. else:
  200. tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
  201. tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
  202. res += tdat
  203. dat = []
  204. res.append('</table>\n')
  205. return res
  206. def fixPicardOutputs(self,transpose,maxloglines):
  207. """
  208. picard produces long hard to read tab header files
  209. make them available but present them transposed for readability
  210. """
  211. logging.shutdown()
  212. self.cleanup() # remove temp files stored in delme
  213. rstyle="""<style type="text/css">
  214. tr.d0 td {background-color: oldlace; color: black;}
  215. tr.d1 td {background-color: aliceblue; color: black;}
  216. </style>"""
  217. res = [rstyle,]
  218. res.append(galhtmlprefix % self.progname)
  219. res.append(galhtmlattr % (self.picname,timenow()))
  220. flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
  221. pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
  222. if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
  223. for p in pdflist:
  224. imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf
  225. res.append('<table cellpadding="10"><tr><td>\n')
  226. res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
  227. res.append('</tr></td></table>\n')
  228. if len(flist) > 0:
  229. res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
  230. res.append('<table>\n')
  231. for i,f in enumerate(flist):
  232. fn = os.path.split(f)[-1]
  233. res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
  234. res.append('</table><p/>\n')
  235. pres = self.prettyPicout(transpose,maxloglines)
  236. if len(pres) > 0:
  237. res += pres
  238. l = open(self.log_filename,'r').readlines()
  239. llen = len(l)
  240. if llen > 0:
  241. res.append('<b>Picard Tool Run Log</b><hr/>\n')
  242. rlog = ['<pre>',]
  243. if llen > maxloglines:
  244. n = min(50,int(maxloglines/2))
  245. rlog += l[:n]
  246. rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
  247. rlog += l[-n:]
  248. else:
  249. rlog += l
  250. rlog.append('</pre>')
  251. if llen > maxloglines:
  252. rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
  253. res += rlog
  254. else:
  255. res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
  256. res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
  257. res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
  258. res.append(galhtmlpostfix)
  259. outf = open(self.opts.htmlout,'w')
  260. outf.write(''.join(res))
  261. outf.write('\n')
  262. outf.close()
  263. def makePicInterval(self,inbed=None,outf=None):
  264. """
  265. picard wants bait and target files to have the same header length as the incoming bam/sam
  266. a meaningful (ie accurate) representation will fail because of this - so this hack
  267. it would be far better to be able to supply the original bed untouched
  268. """
  269. assert inbed <> None
  270. bed = open(inbed,'r').readlines()
  271. thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
  272. if self.opts.datatype == 'sam':
  273. cl = ['samtools view -H -S',self.opts.input,'>',thead]
  274. else:
  275. cl = ['samtools view -H',self.opts.input,'>',thead]
  276. self.runCL(cl=cl,output_dir=self.opts.outdir)
  277. head = open(thead,'r').readlines()
  278. s = '## got %d rows of header\n' % (len(head))
  279. logging.info(s)
  280. o = open(outf,'w')
  281. o.write(''.join(head))
  282. o.write(''.join(bed))
  283. o.close()
  284. return outf
  285. def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
  286. """
  287. interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
  288. Do the work of removing all the error sequences
  289. pysam is cool
  290. infile = pysam.Samfile( "-", "r" )
  291. outfile = pysam.Samfile( "-", "w", template = infile )
  292. for s in infile: outfile.write(s)
  293. errors from ValidateSameFile.jar look like
  294. WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
  295. ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
  296. ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
  297. """
  298. assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
  299. assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
  300. removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
  301. remDict = dict(zip(removeNames,range(len(removeNames))))
  302. infile = pysam.Samfile(insam,'rb')
  303. info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
  304. if len(removeNames) > 0:
  305. outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
  306. i = 0
  307. j = 0
  308. for row in infile:
  309. dropme = remDict.get(row.qname,None) # keep if None
  310. if not dropme:
  311. outfile.write(row)
  312. j += 1
  313. else: # discard
  314. i += 1
  315. info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
  316. outfile.close()
  317. infile.close()
  318. else: # we really want a nullop or a simple pointer copy
  319. infile.close()
  320. if newsam:
  321. shutil.copy(insam,newsam)
  322. logging.info(info)
  323. def __main__():
  324. doFix = False # tools returning htmlfile don't need this
  325. doTranspose = True # default
  326. maxloglines = 100 # default
  327. #Parse Command Line
  328. op = optparse.OptionParser()
  329. # All tools
  330. op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
  331. op.add_option('-e', '--inputext', default=None)
  332. op.add_option('-o', '--output', default=None)
  333. op.add_option('-n', '--title', default="Pick a Picard Tool")
  334. op.add_option('-t', '--htmlout', default=None)
  335. op.add_option('-d', '--outdir', default=None)
  336. op.add_option('-x', '--maxjheap', default='4g')
  337. op.add_option('-b', '--bisulphite', default='false')
  338. op.add_option('-s', '--sortorder', default='query')
  339. op.add_option('','--tmpdir', default='/tmp')
  340. op.add_option('-j','--jar',default='')
  341. op.add_option('','--picard-cmd',default=None)
  342. # Many tools
  343. op.add_option( '', '--output-format', dest='output_format', help='Output format' )
  344. op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
  345. op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
  346. # CreateSequenceDictionary
  347. op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
  348. op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
  349. op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
  350. op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
  351. # MarkDuplicates
  352. op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
  353. op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
  354. # CollectInsertSizeMetrics
  355. op.add_option('', '--taillimit', default="0")
  356. op.add_option('', '--histwidth', default="0")
  357. op.add_option('', '--minpct', default="0.01")
  358. # CollectAlignmentSummaryMetrics
  359. op.add_option('', '--maxinsert', default="20")
  360. op.add_option('', '--adaptors', action='append', type="string")
  361. # FixMateInformation and validate
  362. # CollectGcBiasMetrics
  363. op.add_option('', '--windowsize', default='100')
  364. op.add_option('', '--mingenomefrac', default='0.00001')
  365. # AddOrReplaceReadGroups
  366. op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
  367. op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
  368. op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
  369. op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
  370. op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
  371. op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
  372. op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
  373. op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
  374. # ReorderSam
  375. op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
  376. op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
  377. # ReplaceSamHeader
  378. op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
  379. op.add_option('','--assumesorted', default='true')
  380. op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
  381. #estimatelibrarycomplexity
  382. op.add_option('','--minid', default="5")
  383. op.add_option('','--maxdiff', default="0.03")
  384. op.add_option('','--minmeanq', default="20")
  385. #hsmetrics
  386. op.add_option('','--baitbed', default=None)
  387. op.add_option('','--targetbed', default=None)
  388. #validate
  389. op.add_option('','--ignoreflags', action='append', type="string")
  390. op.add_option('','--maxerrors', default=None)
  391. op.add_option('','--datatype', default=None)
  392. op.add_option('','--bamout', default=None)
  393. op.add_option('','--samout', default=None)
  394. opts, args = op.parse_args()
  395. opts.sortme = opts.assumesorted == 'false'
  396. assert opts.input <> None
  397. # need to add
  398. # instance that does all the work
  399. pic = PicardBase(opts,sys.argv[0])
  400. tmp_dir = opts.outdir
  401. haveTempout = False # we use this where sam output is an option
  402. # set ref and dict files to use (create if necessary)
  403. ref_file_name = opts.ref
  404. if opts.ref_file <> None:
  405. csd = 'CreateSequenceDictionary'
  406. realjarpath = os.path.split(opts.jar)[0]
  407. jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
  408. tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
  409. ref_file_name = '%s.fasta' % tmp_ref_name
  410. # build dict
  411. dict_file_name = '%s.dict' % tmp_ref_name
  412. os.symlink( opts.ref_file, ref_file_name )
  413. cl = ['REFERENCE=%s' % ref_file_name]
  414. cl.append('OUTPUT=%s' % dict_file_name)
  415. cl.append('URI=%s' % os.path.basename( opts.ref_file ))
  416. cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
  417. if opts.species_name:
  418. cl.append('SPECIES=%s' % opts.species_name)
  419. if opts.build_name:
  420. cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
  421. pic.delme.append(dict_file_name)
  422. pic.delme.append(ref_file_name)
  423. pic.delme.append(tmp_ref_name)
  424. s = pic.runPic(jarpath, cl)
  425. # run relevant command(s)
  426. # define temporary output
  427. # if output is sam, it must have that extension, otherwise bam will be produced
  428. # specify sam or bam file with extension
  429. if opts.output_format == 'sam':
  430. suff = '.sam'
  431. else:
  432. suff = ''
  433. tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
  434. cl = ['VALIDATION_STRINGENCY=LENIENT',]
  435. if pic.picname == 'AddOrReplaceReadGroups':
  436. # sort order to match Galaxy's default
  437. cl.append('SORT_ORDER=coordinate')
  438. # input
  439. cl.append('INPUT=%s' % opts.input)
  440. # outputs
  441. cl.append('OUTPUT=%s' % tempout)
  442. # required read groups
  443. cl.append('RGLB="%s"' % opts.rg_library)
  444. cl.append('RGPL="%s"' % opts.rg_platform)
  445. cl.append('RGPU="%s"' % opts.rg_plat_unit)
  446. cl.append('RGSM="%s"' % opts.rg_sample)
  447. if opts.rg_id:
  448. cl.append('RGID="%s"' % opts.rg_id)
  449. # optional read groups
  450. if opts.rg_seq_center:
  451. cl.append('RGCN="%s"' % opts.rg_seq_center)
  452. if opts.rg_desc:
  453. cl.append('RGDS="%s"' % opts.rg_desc)
  454. pic.runPic(opts.jar, cl)
  455. haveTempout = True
  456. elif pic.picname == 'BamIndexStats':
  457. tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
  458. tmp_bam_name = '%s.bam' % tmp_name
  459. tmp_bai_name = '%s.bai' % tmp_bam_name
  460. os.symlink( opts.input, tmp_bam_name )
  461. os.symlink( opts.bai_file, tmp_bai_name )
  462. cl.append('INPUT=%s' % ( tmp_bam_name ))
  463. pic.delme.append(tmp_bam_name)
  464. pic.delme.append(tmp_bai_name)
  465. pic.delme.append(tmp_name)
  466. s = pic.runPic( opts.jar, cl )
  467. f = open(pic.metricsOut,'a')
  468. f.write(s) # got this on stdout from runCl
  469. f.write('\n')
  470. f.close()
  471. doTranspose = False # but not transposed
  472. elif pic.picname == 'EstimateLibraryComplexity':
  473. cl.append('I=%s' % opts.input)
  474. cl.append('O=%s' % pic.metricsOut)
  475. if float(opts.minid) > 0:
  476. cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
  477. if float(opts.maxdiff) > 0.0:
  478. cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
  479. if float(opts.minmeanq) > 0:
  480. cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
  481. if opts.readregex > '':
  482. cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
  483. if float(opts.optdupdist) > 0:
  484. cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
  485. pic.runPic(opts.jar,cl)
  486. elif pic.picname == 'CollectAlignmentSummaryMetrics':
  487. # Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
  488. # why? Dunno Seems to work without complaining if the .bai file is AWOL....
  489. fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
  490. try:
  491. os.symlink(ref_file_name,fakefasta)
  492. except:
  493. s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
  494. info = s
  495. shutil.copy(ref_file_name,fakefasta)
  496. pic.delme.append(fakefasta)
  497. cl.append('ASSUME_SORTED=%s' % opts.assumesorted)
  498. adaptorseqs = ''.join([' ADAPTER_SEQUENCE=%s' % x for x in opts.adaptors])
  499. cl.append(adaptorseqs)
  500. cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
  501. cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
  502. cl.append('OUTPUT=%s' % pic.metricsOut)
  503. cl.append('R=%s' % fakefasta)
  504. cl.append('TMP_DIR=%s' % opts.tmpdir)
  505. if not opts.assumesorted.lower() == 'true': # we need to sort input
  506. fakeinput = '%s.sorted' % opts.input
  507. s = pic.sortSam(opts.input, fakeinput, opts.outdir)
  508. pic.delme.append(fakeinput)
  509. cl.append('INPUT=%s' % fakeinput)
  510. else:
  511. cl.append('INPUT=%s' % os.path.abspath(opts.input))
  512. pic.runPic(opts.jar,cl)
  513. elif pic.picname == 'CollectGcBiasMetrics':
  514. assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
  515. # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
  516. # why? Dunno
  517. fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
  518. try:
  519. os.symlink(ref_file_name,fakefasta)
  520. except:
  521. s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
  522. info = s
  523. shutil.copy(ref_file_name,fakefasta)
  524. pic.delme.append(fakefasta)
  525. x = 'rgPicardGCBiasMetrics'
  526. pdfname = '%s.pdf' % x
  527. jpgname = '%s.jpg' % x
  528. tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
  529. temppdf = os.path.join(opts.outdir,pdfname)
  530. cl.append('R=%s' % fakefasta)
  531. cl.append('WINDOW_SIZE=%s' % opts.windowsize)
  532. cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
  533. cl.append('INPUT=%s' % opts.input)
  534. cl.append('OUTPUT=%s' % tempout)
  535. cl.append('TMP_DIR=%s' % opts.tmpdir)
  536. cl.append('CHART_OUTPUT=%s' % temppdf)
  537. cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
  538. pic.runPic(opts.jar,cl)
  539. if os.path.isfile(temppdf):
  540. cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
  541. s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir)
  542. else:
  543. s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
  544. lf = open(pic.log_filename,'a')
  545. lf.write(s)
  546. lf.write('\n')
  547. lf.close()
  548. elif pic.picname == 'CollectInsertSizeMetrics':
  549. isPDF = 'InsertSizeHist.pdf'
  550. pdfpath = os.path.join(opts.outdir,isPDF)
  551. histpdf = 'InsertSizeHist.pdf'
  552. cl.append('I=%s' % opts.input)
  553. cl.append('O=%s' % pic.metricsOut)
  554. cl.append('HISTOGRAM_FILE=%s' % histpdf)
  555. if opts.taillimit <> '0':
  556. cl.append('TAIL_LIMIT=%s' % opts.taillimit)
  557. if opts.histwidth <> '0':
  558. cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
  559. if float( opts.minpct) > 0.0:
  560. cl.append('MINIMUM_PCT=%s' % opts.minpct)
  561. pic.runPic(opts.jar,cl)
  562. if os.path.exists(pdfpath): # automake thumbnail - will be added to html
  563. cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
  564. s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir)
  565. else:
  566. s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
  567. s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
  568. s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
  569. s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
  570. stdouts = ''
  571. logging.info(s)
  572. if len(stdouts) > 0:
  573. logging.info(stdouts)
  574. elif pic.picname == 'MarkDuplicates':
  575. # assume sorted even if header says otherwise
  576. cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
  577. # input
  578. cl.append('INPUT=%s' % opts.input)
  579. # outputs
  580. cl.append('OUTPUT=%s' % opts.output)
  581. cl.append('METRICS_FILE=%s' % pic.metricsOut )
  582. # remove or mark duplicates
  583. cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
  584. # the regular expression to be used to parse reads in incoming SAM file
  585. cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
  586. # maximum offset between two duplicate clusters
  587. cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
  588. pic.runPic(opts.jar, cl)
  589. elif pic.picname == 'FixMateInformation':
  590. cl.append('I=%s' % opts.input)
  591. cl.append('O=%s' % tempout)
  592. cl.append('SORT_ORDER=%s' % opts.sortorder)
  593. pic.runPic(opts.jar,cl)
  594. haveTempout = True
  595. elif pic.picname == 'ReorderSam':
  596. # input
  597. cl.append('INPUT=%s' % opts.input)
  598. # output
  599. cl.append('OUTPUT=%s' % tempout)
  600. # reference
  601. cl.append('REFERENCE=%s' % ref_file_name)
  602. # incomplete dict concordance
  603. if opts.allow_inc_dict_concord == 'true':
  604. cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
  605. # contig length discordance
  606. if opts.allow_contig_len_discord == 'true':
  607. cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
  608. pic.runPic(opts.jar, cl)
  609. haveTempout = True
  610. elif pic.picname == 'ReplaceSamHeader':
  611. cl.append('INPUT=%s' % opts.input)
  612. cl.append('OUTPUT=%s' % tempout)
  613. cl.append('HEADER=%s' % opts.header_file)
  614. pic.runPic(opts.jar, cl)
  615. haveTempout = True
  616. elif pic.picname == 'CalculateHsMetrics':
  617. maxloglines = 100
  618. baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
  619. targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
  620. baitf = pic.makePicInterval(opts.baitbed,baitfname)
  621. if opts.targetbed == opts.baitbed: # same file sometimes
  622. targetf = baitf
  623. else:
  624. targetf = pic.makePicInterval(opts.targetbed,targetfname)
  625. cl.append('BAIT_INTERVALS=%s' % baitf)
  626. cl.append('TARGET_INTERVALS=%s' % targetf)
  627. cl.append('INPUT=%s' % os.path.abspath(opts.input))
  628. cl.append('OUTPUT=%s' % pic.metricsOut)
  629. cl.append('TMP_DIR=%s' % opts.tmpdir)
  630. pic.runPic(opts.jar,cl)
  631. elif pic.picname == 'ValidateSamFile':
  632. import pysam
  633. doTranspose = False
  634. sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
  635. stf = open(pic.log_filename,'w')
  636. tlog = None
  637. if opts.datatype == 'sam': # need to work with a bam
  638. tlog,tempbam = pic.samToBam(opts.input,opts.outdir)
  639. try:
  640. tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
  641. except:
  642. print '## exception on sorting sam file %s' % opts.input
  643. else: # is already bam
  644. try:
  645. tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
  646. except: # bug - [bam_sort_core] not being ignored - TODO fixme
  647. print '## exception on sorting bam file %s' % opts.input
  648. if tlog:
  649. print '##tlog=',tlog
  650. stf.write(tlog)
  651. stf.write('\n')
  652. sortedfile = '%s.bam' % sortedfile # samtools does that
  653. cl.append('O=%s' % pic.metricsOut)
  654. cl.append('TMP_DIR=%s' % opts.tmpdir)
  655. cl.append('I=%s' % sortedfile)
  656. opts.maxerrors = '99999999'
  657. cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
  658. if opts.ignoreflags[0] <> 'None': # picard error values to ignore
  659. igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
  660. cl.append(' '.join(igs))
  661. if opts.bisulphite.lower() <> 'false':
  662. cl.append('IS_BISULFITE_SEQUENCED=true')
  663. if opts.ref <> None or opts.ref_file <> None:
  664. cl.append('R=%s' % ref_file_name)
  665. pic.runPic(opts.jar,cl)
  666. if opts.datatype == 'sam':
  667. pic.delme.append(tempbam)
  668. newsam = opts.output
  669. outformat = 'bam'
  670. pe = open(pic.metricsOut,'r').readlines()
  671. pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
  672. pic.delme.append(sortedfile) # not wanted
  673. stf.close()
  674. pic.cleanup()
  675. else:
  676. print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
  677. sys.exit(1)
  678. if haveTempout:
  679. # Some Picard tools produced a potentially intermediate bam file.
  680. # Either just move to final location or create sam
  681. shutil.move(tempout, os.path.abspath(opts.output))
  682. if opts.htmlout <> None or doFix: # return a pretty html page
  683. pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
  684. if __name__=="__main__": __main__()