/tools/sr_mapping/bfast_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 344 lines · 295 code · 12 blank · 37 comment · 84 complexity · 6e7c87ad530a3d87cbd06c67dd9b6779 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Runs BFAST on single-end or paired-end data.
  4. TODO: more documentation
  5. TODO:
  6. - auto-detect gzip or bz2
  7. - split options (?)
  8. - queue lengths (?)
  9. - assumes reference always has been indexed
  10. - main and secondary indexes
  11. - scoring matrix file ?
  12. - read group file ?
  13. usage: bfast_wrapper.py [options]
  14. -r, --ref=r: The reference genome to use or index
  15. -f, --fastq=f: The fastq file to use for the mapping
  16. -F, --output=u: The file to save the output (SAM format)
  17. -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
  18. -p, --params=p: Parameter setting to use (pre_set or full)
  19. -n, --numThreads=n: The number of threads to use
  20. -A, --space=A: The encoding space (0: base 1: color)
  21. -o, --offsets=o: The offsets for 'match'
  22. -l, --loadAllIndexes=l: Load all indexes into memory
  23. -k, --keySize=k: truncate key size in 'match'
  24. -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored
  25. -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded
  26. -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse)
  27. -t, --timing=t: output timing information to stderr
  28. -u, --ungapped=u: performed ungapped local alignment
  29. -U, --unconstrained=U: performed local alignment without mask constraints
  30. -O, --offset=O: the number of bases before and after each hit to consider in local alignment
  31. -q, --avgMismatchQuality=q: average mismatch quality
  32. -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all)
  33. -P, --disallowPairing=P: do not choose alignments based on pairing
  34. -R, --reverse=R: paired end reads are given on reverse strands
  35. -z, --random=z: output a random best scoring alignment
  36. -D, --dbkey=D: Dbkey for reference genome
  37. -H, --suppressHeader=H: Suppress the sam header
  38. """
  39. import optparse, os, shutil, subprocess, sys, tempfile
  40. def stop_err( msg ):
  41. sys.stderr.write( '%s\n' % msg )
  42. sys.exit()
  43. def __main__():
  44. #Parse Command Line
  45. parser = optparse.OptionParser()
  46. parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
  47. parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' )
  48. parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' )
  49. parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' )
  50. parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' )
  51. parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' )
  52. parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timming information to stderr' )
  53. parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' )
  54. parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' )
  55. parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' )
  56. parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' )
  57. parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' )
  58. parser.add_option( '--indexExonsFileName', dest='indexExonsFileName', default="", help='The exons file to use for the indexing' )
  59. parser.add_option( '-o', '--offsets', dest='offsets', default="", help='The offsets for \'match\'' )
  60. parser.add_option( '-k', '--keySize', dest='keySize', type="int", default="-1", help='truncate key size in \'match\'' )
  61. parser.add_option( '-K', '--maxKeyMatches', dest='maxKeyMatches', type="int", default="-1", help='the maximum number of matches to allow before a key is ignored' )
  62. parser.add_option( '-M', '--maxNumMatches', dest='maxNumMatches', type="int", default="-1", help='the maximum number of matches to allow bfore the read is discarded' )
  63. parser.add_option( '-w', '--whichStrand', dest='whichStrand', type="choice", default='0', choices=('0','1','2'), help='the strands to consider (0: both 1: forward 2: reverse)' )
  64. parser.add_option( '--scoringMatrixFileName', dest='scoringMatrixFileName', help='Scoring Matrix file used to score the alignments' )
  65. parser.add_option( '-u', '--ungapped', dest='ungapped', action="store_true", default=False, help='performed ungapped local alignment' )
  66. parser.add_option( '-U', '--unconstrained', dest='unconstrained', action="store_true", default=False, help='performed local alignment without mask constraints' )
  67. parser.add_option( '-O', '--offset', dest='offset', type="int", default="0", help='the number of bases before and after each hit to consider in local alignment' )
  68. parser.add_option( '-q', '--avgMismatchQuality', type="int", default="-1", dest='avgMismatchQuality', help='average mismatch quality' )
  69. parser.add_option( '-a', '--algorithm', dest='algorithm', default='0', type="choice", choices=('0','1','2','3','4' ), help='post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all' )
  70. parser.add_option( '--unpaired', dest='unpaired', action="store_true", default=False, help='do not choose alignments based on pairing' )
  71. parser.add_option( '--reverseStrand', dest='reverseStrand', action="store_true", default=False, help='paired end reads are given on reverse strands' )
  72. parser.add_option( '--pairedEndInfer', dest='pairedEndInfer', action="store_true", default=False, help='break ties when one end of a paired end read by estimating the insert size distribution' )
  73. parser.add_option( '--randomBest', dest='randomBest', action="store_true", default=False, help='output a random best scoring alignment' )
  74. (options, args) = parser.parse_args()
  75. # output version # of tool
  76. try:
  77. tmp = tempfile.NamedTemporaryFile().name
  78. tmp_stdout = open( tmp, 'wb' )
  79. proc = subprocess.Popen( args='bfast 2>&1', shell=True, stdout=tmp_stdout )
  80. tmp_stdout.close()
  81. returncode = proc.wait()
  82. stdout = None
  83. for line in open( tmp_stdout.name, 'rb' ):
  84. if line.lower().find( 'version' ) >= 0:
  85. stdout = line.strip()
  86. break
  87. if stdout:
  88. sys.stdout.write( '%s\n' % stdout )
  89. else:
  90. raise Exception
  91. except:
  92. sys.stdout.write( 'Could not determine BFAST version\n' )
  93. buffsize = 1048576
  94. # make temp directory for bfast, requires trailing slash
  95. tmp_dir = '%s/' % tempfile.mkdtemp()
  96. #'generic' options used in all bfast commands here
  97. if options.timing:
  98. all_cmd_options = "-t"
  99. else:
  100. all_cmd_options = ""
  101. try:
  102. if options.buildIndex:
  103. reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
  104. #build bfast indexes
  105. os.symlink( options.ref, reference_filepath )
  106. #bfast fast2brg
  107. try:
  108. nuc_space = [ "0" ]
  109. if options.space == "1":
  110. #color space localalign appears to require nuc space brg
  111. nuc_space.append( "1" )
  112. for space in nuc_space:
  113. cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % ( reference_filepath, space, all_cmd_options )
  114. tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
  115. tmp_stderr = open( tmp, 'wb' )
  116. proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
  117. returncode = proc.wait()
  118. tmp_stderr.close()
  119. # get stderr, allowing for case where it's very large
  120. tmp_stderr = open( tmp, 'rb' )
  121. stderr = ''
  122. try:
  123. while True:
  124. stderr += tmp_stderr.read( buffsize )
  125. if not stderr or len( stderr ) % buffsize != 0:
  126. break
  127. except OverflowError:
  128. pass
  129. tmp_stderr.close()
  130. if returncode != 0:
  131. raise Exception, stderr
  132. except Exception, e:
  133. raise Exception, 'Error in \'bfast fasta2brg\'.\n' + str( e )
  134. #bfast index
  135. try:
  136. all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads )
  137. if options.indexRepeatMasker:
  138. all_index_cmds += " -R"
  139. if options.indexContigOptions:
  140. index_contig_options = map( int, options.indexContigOptions.split( ',' ) )
  141. if index_contig_options[0] >= 0:
  142. all_index_cmds += ' -s "%s"' % index_contig_options[0]
  143. if index_contig_options[1] >= 0:
  144. all_index_cmds += ' -S "%s"' % index_contig_options[1]
  145. if index_contig_options[2] >= 0:
  146. all_index_cmds += ' -e "%s"' % index_contig_options[2]
  147. if index_contig_options[3] >= 0:
  148. all_index_cmds += ' -E "%s"' % index_contig_options[3]
  149. elif options.indexExonsFileName:
  150. all_index_cmds += ' -x "%s"' % options.indexExonsFileName
  151. index_count = 1
  152. for mask, hash_width in [ mask.split( ':' ) for mask in options.indexMask.split( ',' ) ]:
  153. cmd = '%s -m "%s" -w "%s" -i "%i"' % ( all_index_cmds, mask, hash_width, index_count )
  154. tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
  155. tmp_stderr = open( tmp, 'wb' )
  156. proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
  157. returncode = proc.wait()
  158. tmp_stderr.close()
  159. # get stderr, allowing for case where it's very large
  160. tmp_stderr = open( tmp, 'rb' )
  161. stderr = ''
  162. try:
  163. while True:
  164. stderr += tmp_stderr.read( buffsize )
  165. if not stderr or len( stderr ) % buffsize != 0:
  166. break
  167. except OverflowError:
  168. pass
  169. tmp_stderr.close()
  170. if returncode != 0:
  171. raise Exception, stderr
  172. index_count += 1
  173. except Exception, e:
  174. raise Exception, 'Error in \'bfast index\'.\n' + str( e )
  175. else:
  176. reference_filepath = options.ref
  177. assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
  178. # set up aligning and generate aligning command options
  179. # set up temp output files
  180. tmp_bmf = tempfile.NamedTemporaryFile( dir=tmp_dir )
  181. tmp_bmf_name = tmp_bmf.name
  182. tmp_bmf.close()
  183. tmp_baf = tempfile.NamedTemporaryFile( dir=tmp_dir )
  184. tmp_baf_name = tmp_baf.name
  185. tmp_baf.close()
  186. bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options )
  187. bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options )
  188. bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options )
  189. if options.offsets:
  190. bfast_match_cmd += ' -o "%s"' % options.offsets
  191. if options.keySize >= 0:
  192. bfast_match_cmd += ' -k "%s"' % options.keySize
  193. if options.maxKeyMatches >= 0:
  194. bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches
  195. if options.maxNumMatches >= 0:
  196. bfast_match_cmd += ' -M "%s"' % options.maxNumMatches
  197. bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches
  198. if options.scoringMatrixFileName:
  199. bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName
  200. bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName
  201. if options.ungapped:
  202. bfast_localalign_cmd += ' -u'
  203. if options.unconstrained:
  204. bfast_localalign_cmd += ' -U'
  205. if options.avgMismatchQuality >= 0:
  206. bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality
  207. bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality
  208. if options.algorithm == 3:
  209. if options.pairedEndInfer:
  210. bfast_postprocess_cmd += ' -P'
  211. if options.randomBest:
  212. bfast_postprocess_cmd += ' -z'
  213. if options.unpaired:
  214. bfast_postprocess_cmd += ' -U'
  215. if options.reverseStrand:
  216. bfast_postprocess_cmd += ' -R'
  217. #instead of using temp files, should we stream through pipes?
  218. bfast_match_cmd += " > %s" % tmp_bmf_name
  219. bfast_localalign_cmd += " > %s" % tmp_baf_name
  220. bfast_postprocess_cmd += " > %s" % options.output
  221. # need to nest try-except in try-finally to handle 2.4
  222. try:
  223. # bfast 'match'
  224. try:
  225. tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
  226. tmp_stderr = open( tmp, 'wb' )
  227. proc = subprocess.Popen( args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
  228. returncode = proc.wait()
  229. tmp_stderr.close()
  230. # get stderr, allowing for case where it's very large
  231. tmp_stderr = open( tmp, 'rb' )
  232. stderr = ''
  233. try:
  234. while True:
  235. stderr += tmp_stderr.read( buffsize )
  236. if not stderr or len( stderr ) % buffsize != 0:
  237. break
  238. except OverflowError:
  239. pass
  240. tmp_stderr.close()
  241. if returncode != 0:
  242. raise Exception, stderr
  243. except Exception, e:
  244. raise Exception, 'Error in \'bfast match\'. \n' + str( e )
  245. # bfast 'localalign'
  246. try:
  247. tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
  248. tmp_stderr = open( tmp, 'wb' )
  249. proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
  250. returncode = proc.wait()
  251. tmp_stderr.close()
  252. # get stderr, allowing for case where it's very large
  253. tmp_stderr = open( tmp, 'rb' )
  254. stderr = ''
  255. try:
  256. while True:
  257. stderr += tmp_stderr.read( buffsize )
  258. if not stderr or len( stderr ) % buffsize != 0:
  259. break
  260. except OverflowError:
  261. pass
  262. tmp_stderr.close()
  263. if returncode != 0:
  264. raise Exception, stderr
  265. except Exception, e:
  266. raise Exception, 'Error in \'bfast localalign\'. \n' + str( e )
  267. # bfast 'postprocess'
  268. try:
  269. tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
  270. tmp_stderr = open( tmp, 'wb' )
  271. proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
  272. returncode = proc.wait()
  273. tmp_stderr.close()
  274. # get stderr, allowing for case where it's very large
  275. tmp_stderr = open( tmp, 'rb' )
  276. stderr = ''
  277. try:
  278. while True:
  279. stderr += tmp_stderr.read( buffsize )
  280. if not stderr or len( stderr ) % buffsize != 0:
  281. break
  282. except OverflowError:
  283. pass
  284. tmp_stderr.close()
  285. if returncode != 0:
  286. raise Exception, stderr
  287. except Exception, e:
  288. raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e )
  289. # remove header if necessary
  290. if options.suppressHeader:
  291. tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
  292. tmp_out_name = tmp_out.name
  293. tmp_out.close()
  294. try:
  295. shutil.move( options.output, tmp_out_name )
  296. except Exception, e:
  297. raise Exception, 'Error moving output file before removing headers. \n' + str( e )
  298. fout = file( options.output, 'w' )
  299. for line in file( tmp_out.name, 'r' ):
  300. if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]:
  301. fout.write( line )
  302. fout.close()
  303. # check that there are results in the output file
  304. if os.path.getsize( options.output ) > 0:
  305. if "0" == options.space:
  306. sys.stdout.write( 'BFAST run on Base Space data' )
  307. else:
  308. sys.stdout.write( 'BFAST run on Color Space data' )
  309. else:
  310. raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
  311. except Exception, e:
  312. stop_err( 'The alignment failed.\n' + str( e ) )
  313. finally:
  314. # clean up temp dir
  315. if os.path.exists( tmp_dir ):
  316. shutil.rmtree( tmp_dir )
  317. if __name__=="__main__": __main__()