PageRenderTime 134ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/sr_mapping/bowtie_wrapper.py

https://bitbucket.org/h_morita_dbcls/galaxy-central
Python | 401 lines | 373 code | 1 blank | 27 comment | 45 complexity | cb6b3cf7978ca017242865976460aa2b MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. Runs Bowtie on single-end or paired-end data.
  4. For use with Bowtie v. 0.12.3
  5. usage: bowtie_wrapper.py [options]
  6. -t, --threads=t: The number of threads to run
  7. -i, --input1=i: The (forward or single-end) reads file in Sanger FASTQ format
  8. -I, --input2=I: The reverse reads file in Sanger FASTQ format
  9. -o, --output=o: The output file
  10. -4, --dataType=4: The type of data (SOLiD or Solexa)
  11. -2, --paired=2: Whether the data is single- or paired-end
  12. -g, --genomeSource=g: The type of reference provided
  13. -r, --ref=r: The reference genome to use or index
  14. -s, --skip=s: Skip the first n reads
  15. -a, --alignLimit=a: Only align the first n reads
  16. -T, --trimH=T: Trim n bases from high-quality (left) end of each read before alignment
  17. -L, --trimL=L: Trim n bases from low-quality (right) end of each read before alignment
  18. -m, --mismatchSeed=m: Maximum number of mismatches permitted in the seed
  19. -M, --mismatchQual=M: Maximum permitted total of quality values at mismatched read positions
  20. -l, --seedLen=l: Seed length
  21. -n, --rounding=n: Whether or not to round to the nearest 10 and saturating at 30
  22. -P, --maqSoapAlign=P: Choose MAQ- or SOAP-like alignment policy
  23. -w, --tryHard=: Whether or not to try as hard as possible to find valid alignments when they exist
  24. -v, --valAlign=v: Report up to n valid arguments per read
  25. -V, --allValAligns=V: Whether or not to report all valid alignments per read
  26. -G, --suppressAlign=G: Suppress all alignments for a read if more than n reportable alignments exist
  27. -b, --best=b: Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions
  28. -B, --maxBacktracks=B: Maximum number of backtracks permitted when aligning a read
  29. -R, --strata=R: Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable
  30. -j, --minInsert=j: Minimum insert size for valid paired-end alignments
  31. -J, --maxInsert=J: Maximum insert size for valid paired-end alignments
  32. -O, --mateOrient=O: The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand
  33. -A, --maxAlignAttempt=A: Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate
  34. -f, --forwardAlign=f: Whether or not to attempt to align the forward reference strand
  35. -E, --reverseAlign=E: Whether or not to attempt to align the reverse-complement reference strand
  36. -F, --offrate=F: Override the offrate of the index to n
  37. -8, --snpphred=8: SNP penalty on Phred scale
  38. -6, --snpfrac=6: Fraction of sites expected to be SNP sites
  39. -7, --keepends=7: Keep extreme-end nucleotides and qualities
  40. -S, --seed=S: Seed for pseudo-random number generator
  41. -C, --params=C: Whether to use default or specified parameters
  42. -u, --iautoB=u: Automatic or specified behavior
  43. -K, --ipacked=K: Whether or not to use a packed representation for DNA strings
  44. -Q, --ibmax=Q: Maximum number of suffixes allowed in a block
  45. -Y, --ibmaxdivn=Y: Maximum number of suffixes allowed in a block as a fraction of the length of the reference
  46. -D, --idcv=D: The period for the difference-cover sample
  47. -U, --inodc=U: Whether or not to disable the use of the difference-cover sample
  48. -y, --inoref=y: Whether or not to build the part of the reference index used only in paired-end alignment
  49. -z, --ioffrate=z: How many rows get marked during annotation of some or all of the Burrows-Wheeler rows
  50. -W, --iftab=W: The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query
  51. -X, --intoa=X: Whether or not to convert Ns in the reference sequence to As
  52. -N, --iendian=N: Endianness to use when serializing integers to the index file
  53. -Z, --iseed=Z: Seed for the pseudorandom number generator
  54. -c, --icutoff=c: Number of first bases of the reference sequence to index
  55. -x, --indexSettings=x: Whether or not indexing options are to be set
  56. -H, --suppressHeader=H: Suppress header
  57. """
  58. import optparse, os, shutil, subprocess, sys, tempfile
  59. def stop_err( msg ):
  60. sys.stderr.write( '%s\n' % msg )
  61. sys.exit()
  62. def __main__():
  63. #Parse Command Line
  64. parser = optparse.OptionParser()
  65. parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to run' )
  66. parser.add_option( '-4', '--dataType', dest='dataType', help='The type of data (SOLiD or Solexa)' )
  67. parser.add_option( '-i', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
  68. parser.add_option( '-I', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
  69. parser.add_option( '-o', '--output', dest='output', help='The output file' )
  70. parser.add_option( '-2', '--paired', dest='paired', help='Whether the data is single- or paired-end' )
  71. parser.add_option( '-g', '--genomeSource', dest='genomeSource', help='The type of reference provided' )
  72. parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
  73. parser.add_option( '-s', '--skip', dest='skip', help='Skip the first n reads' )
  74. parser.add_option( '-a', '--alignLimit', dest='alignLimit', help='Only align the first n reads' )
  75. parser.add_option( '-T', '--trimH', dest='trimH', help='Trim n bases from high-quality (left) end of each read before alignment' )
  76. parser.add_option( '-L', '--trimL', dest='trimL', help='Trim n bases from low-quality (right) end of each read before alignment' )
  77. parser.add_option( '-m', '--mismatchSeed', dest='mismatchSeed', help='Maximum number of mismatches permitted in the seed' )
  78. parser.add_option( '-M', '--mismatchQual', dest='mismatchQual', help='Maximum permitted total of quality values at mismatched read positions' )
  79. parser.add_option( '-l', '--seedLen', dest='seedLen', help='Seed length' )
  80. parser.add_option( '-n', '--rounding', dest='rounding', help='Whether or not to round to the nearest 10 and saturating at 30' )
  81. parser.add_option( '-P', '--maqSoapAlign', dest='maqSoapAlign', help='Choose MAQ- or SOAP-like alignment policy' )
  82. parser.add_option( '-w', '--tryHard', dest='tryHard', help='Whether or not to try as hard as possible to find valid alignments when they exist' )
  83. parser.add_option( '-v', '--valAlign', dest='valAlign', help='Report up to n valid arguments per read' )
  84. parser.add_option( '-V', '--allValAligns', dest='allValAligns', help='Whether or not to report all valid alignments per read' )
  85. parser.add_option( '-G', '--suppressAlign', dest='suppressAlign', help='Suppress all alignments for a read if more than n reportable alignments exist' )
  86. parser.add_option( '-b', '--best', dest='best', help="Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions" )
  87. parser.add_option( '-B', '--maxBacktracks', dest='maxBacktracks', help='Maximum number of backtracks permitted when aligning a read' )
  88. parser.add_option( '-R', '--strata', dest='strata', help='Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable' )
  89. parser.add_option( '-j', '--minInsert', dest='minInsert', help='Minimum insert size for valid paired-end alignments' )
  90. parser.add_option( '-J', '--maxInsert', dest='maxInsert', help='Maximum insert size for valid paired-end alignments' )
  91. parser.add_option( '-O', '--mateOrient', dest='mateOrient', help='The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand' )
  92. parser.add_option( '-A', '--maxAlignAttempt', dest='maxAlignAttempt', help='Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate' )
  93. parser.add_option( '-f', '--forwardAlign', dest='forwardAlign', help='Whether or not to attempt to align the forward reference strand' )
  94. parser.add_option( '-E', '--reverseAlign', dest='reverseAlign', help='Whether or not to attempt to align the reverse-complement reference strand' )
  95. parser.add_option( '-F', '--offrate', dest='offrate', help='Override the offrate of the index to n' )
  96. parser.add_option( '-S', '--seed', dest='seed', help='Seed for pseudo-random number generator' )
  97. parser.add_option( '-8', '--snpphred', dest='snpphred', help='SNP penalty on Phred scale' )
  98. parser.add_option( '-6', '--snpfrac', dest='snpfrac', help='Fraction of sites expected to be SNP sites' )
  99. parser.add_option( '-7', '--keepends', dest='keepends', help='Keep extreme-end nucleotides and qualities' )
  100. parser.add_option( '-C', '--params', dest='params', help='Whether to use default or specified parameters' )
  101. parser.add_option( '-u', '--iautoB', dest='iautoB', help='Automatic or specified behavior' )
  102. parser.add_option( '-K', '--ipacked', dest='ipacked', help='Whether or not to use a packed representation for DNA strings' )
  103. parser.add_option( '-Q', '--ibmax', dest='ibmax', help='Maximum number of suffixes allowed in a block' )
  104. parser.add_option( '-Y', '--ibmaxdivn', dest='ibmaxdivn', help='Maximum number of suffixes allowed in a block as a fraction of the length of the reference' )
  105. parser.add_option( '-D', '--idcv', dest='idcv', help='The period for the difference-cover sample' )
  106. parser.add_option( '-U', '--inodc', dest='inodc', help='Whether or not to disable the use of the difference-cover sample' )
  107. parser.add_option( '-y', '--inoref', dest='inoref', help='Whether or not to build the part of the reference index used only in paired-end alignment' )
  108. parser.add_option( '-z', '--ioffrate', dest='ioffrate', help='How many rows get marked during annotation of some or all of the Burrows-Wheeler rows' )
  109. parser.add_option( '-W', '--iftab', dest='iftab', help='The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query' )
  110. parser.add_option( '-X', '--intoa', dest='intoa', help='Whether or not to convert Ns in the reference sequence to As' )
  111. parser.add_option( '-N', '--iendian', dest='iendian', help='Endianness to use when serializing integers to the index file' )
  112. parser.add_option( '-Z', '--iseed', dest='iseed', help='Seed for the pseudorandom number generator' )
  113. parser.add_option( '-c', '--icutoff', dest='icutoff', help='Number of first bases of the reference sequence to index' )
  114. parser.add_option( '-x', '--indexSettings', dest='index_settings', help='Whether or not indexing options are to be set' )
  115. parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
  116. (options, args) = parser.parse_args()
  117. stdout = ''
  118. # make temp directory for placement of indices and copy reference file there if necessary
  119. tmp_index_dir = tempfile.mkdtemp()
  120. # get type of data (solid or solexa)
  121. if options.dataType == 'solid':
  122. colorspace = '-C'
  123. else:
  124. colorspace = ''
  125. # index if necessary
  126. if options.genomeSource == 'history':
  127. # set up commands
  128. if options.index_settings =='indexPreSet':
  129. indexing_cmds = '%s' % colorspace
  130. else:
  131. try:
  132. if options.iautoB != 'None' and options.iautoB == 'set':
  133. iautoB = '--noauto'
  134. else:
  135. iautoB = ''
  136. if options. ipacked != 'None' and options.ipacked == 'packed':
  137. ipacked = '--packed'
  138. else:
  139. ipacked = ''
  140. if options.ibmax != 'None' and int( options.ibmax ) >= 1:
  141. ibmax = '--bmax %s' % options.ibmax
  142. else:
  143. ibmax = ''
  144. if options.ibmaxdivn != 'None' and int( options.ibmaxdivn ) >= 0:
  145. ibmaxdivn = '--bmaxdivn %s' % options.ibmaxdivn
  146. else:
  147. ibmaxdivn = ''
  148. if options.idcv != 'None' and int( options.idcv ) > 0:
  149. idcv = '--dcv %s' % options.idcv
  150. else:
  151. idcv = ''
  152. if options.inodc != 'None' and options.inodc == 'nodc':
  153. inodc = '--nodc'
  154. else:
  155. inodc = ''
  156. if options.inoref != 'None' and options.inoref == 'noref':
  157. inoref = '--noref'
  158. else:
  159. inoref = ''
  160. if options.iftab != 'None' and int( options.iftab ) >= 0:
  161. iftab = '--ftabchars %s' % options.iftab
  162. else:
  163. iftab = ''
  164. if options.intoa != 'None' and options.intoa == 'yes':
  165. intoa = '--ntoa'
  166. else:
  167. intoa = ''
  168. if options.iendian != 'None' and options.iendian == 'big':
  169. iendian = '--big'
  170. else:
  171. iendian = '--little'
  172. if options.iseed != 'None' and int( options.iseed ) > 0:
  173. iseed = '--seed %s' % options.iseed
  174. else:
  175. iseed = ''
  176. if options.icutoff != 'None' and int( options.icutoff ) > 0:
  177. icutoff = '--cutoff %s' % options.icutoff
  178. else:
  179. icutoff = ''
  180. indexing_cmds = '%s %s %s %s %s %s %s --offrate %s %s %s %s %s %s %s' % \
  181. ( iautoB, ipacked, ibmax, ibmaxdivn, idcv, inodc,
  182. inoref, options.ioffrate, iftab, intoa, iendian,
  183. iseed, icutoff, colorspace )
  184. except ValueError, e:
  185. # clean up temp dir
  186. if os.path.exists( tmp_index_dir ):
  187. shutil.rmtree( tmp_index_dir )
  188. stop_err( 'Something is wrong with the indexing parameters and the indexing and alignment could not be run\n' + str( e ) )
  189. ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
  190. ref_file_name = ref_file.name
  191. ref_file.close()
  192. os.symlink( options.ref, ref_file_name )
  193. cmd1 = 'bowtie-build %s -f %s %s' % ( indexing_cmds, ref_file_name, ref_file_name )
  194. try:
  195. tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
  196. tmp_stderr = open( tmp, 'wb' )
  197. proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
  198. returncode = proc.wait()
  199. tmp_stderr.close()
  200. # get stderr, allowing for case where it's very large
  201. tmp_stderr = open( tmp, 'rb' )
  202. stderr = ''
  203. buffsize = 1048576
  204. try:
  205. while True:
  206. stderr += tmp_stderr.read( buffsize )
  207. if not stderr or len( stderr ) % buffsize != 0:
  208. break
  209. except OverflowError:
  210. pass
  211. tmp_stderr.close()
  212. if returncode != 0:
  213. raise Exception, stderr
  214. except Exception, e:
  215. # clean up temp dir
  216. if os.path.exists( tmp_index_dir ):
  217. shutil.rmtree( tmp_index_dir )
  218. stop_err( 'Error indexing reference sequence\n' + str( e ) )
  219. stdout += 'File indexed. '
  220. else:
  221. ref_file_name = options.ref
  222. # set up aligning and generate aligning command options
  223. # automatically set threads in both cases
  224. if options.suppressHeader == 'true':
  225. suppressHeader = '--sam-nohead'
  226. else:
  227. suppressHeader = ''
  228. if options.maxInsert != 'None' and int( options.maxInsert ) > 0:
  229. maxInsert = '-X %s' % options.maxInsert
  230. else:
  231. maxInsert = ''
  232. if options.mateOrient != 'None':
  233. mateOrient = '--%s' % options.mateOrient
  234. else:
  235. mateOrient = ''
  236. if options.params == 'preSet':
  237. aligning_cmds = '%s %s -p %s -S %s -q %s ' % \
  238. ( maxInsert, mateOrient, options.threads, suppressHeader, colorspace )
  239. else:
  240. try:
  241. if options.skip != 'None' and int( options.skip ) > 0:
  242. skip = '-s %s' % options.skip
  243. else:
  244. skip = ''
  245. if options.alignLimit != 'None' and int( options.alignLimit ) >= 0:
  246. alignLimit = '-u %s' % options.alignLimit
  247. else:
  248. alignLimit = ''
  249. if options.trimH != 'None' and int( options.trimH ) > 0:
  250. trimH = '-5 %s' % options.trimH
  251. else:
  252. trimH = ''
  253. if options.trimL != 'None' and int( options.trimL ) > 0:
  254. trimL = '-3 %s' % options.trimL
  255. else:
  256. trimL = ''
  257. if options. mismatchSeed != 'None' and (options.mismatchSeed == '0' or options.mismatchSeed == '1' \
  258. or options.mismatchSeed == '2' or options.mismatchSeed == '3'):
  259. mismatchSeed = '-n %s' % options.mismatchSeed
  260. else:
  261. mismatchSeed = ''
  262. if options.mismatchQual != 'None' and int( options.mismatchQual ) >= 0:
  263. mismatchQual = '-e %s' % options.mismatchQual
  264. else:
  265. mismatchQual = ''
  266. if options.seedLen != 'None' and int( options.seedLen ) >= 5:
  267. seedLen = '-l %s' % options.seedLen
  268. else:
  269. seedLen = ''
  270. if options.rounding == 'noRound':
  271. rounding = '--nomaqround'
  272. else:
  273. rounding = ''
  274. if options.maqSoapAlign != '-1':
  275. maqSoapAlign = '-v %s' % options.maqSoapAlign
  276. else:
  277. maqSoapAlign = ''
  278. if options.minInsert != 'None' and int( options.minInsert ) > 0:
  279. minInsert = '-I %s' % options.minInsert
  280. else:
  281. minInsert = ''
  282. if options.maxAlignAttempt != 'None' and int( options.maxAlignAttempt ) >= 0:
  283. maxAlignAttempt = '--pairtries %s' % options.maxAlignAttempt
  284. else:
  285. maxAlignAttempt = ''
  286. if options.forwardAlign == 'noForward':
  287. forwardAlign = '--nofw'
  288. else:
  289. forwardAlign = ''
  290. if options.reverseAlign == 'noReverse':
  291. reverseAlign = '--norc'
  292. else:
  293. reverseAlign = ''
  294. if options.maxBacktracks != 'None' and int( options.maxBacktracks ) > 0 and \
  295. ( options.mismatchSeed == '2' or options.mismatchSeed == '3' ):
  296. maxBacktracks = '--maxbts %s' % options.maxBacktracks
  297. else:
  298. maxBacktracks = ''
  299. if options.tryHard == 'doTryHard':
  300. tryHard = '-y'
  301. else:
  302. tryHard = ''
  303. if options.valAlign != 'None' and int( options.valAlign ) >= 0:
  304. valAlign = '-k %s' % options.valAlign
  305. else:
  306. valAlign = ''
  307. if options.allValAligns == 'doAllValAligns':
  308. allValAligns = '-a'
  309. else:
  310. allValAligns = ''
  311. if options.suppressAlign != 'None' and int( options.suppressAlign ) >= 0:
  312. suppressAlign = '-m %s' % options.suppressAlign
  313. else:
  314. suppressAlign = ''
  315. if options.best == 'doBest':
  316. best = '--best'
  317. else:
  318. best = ''
  319. if options.strata == 'doStrata':
  320. strata = '--strata'
  321. else:
  322. strata = ''
  323. if options.offrate != 'None' and int( options.offrate ) >= 0:
  324. offrate = '-o %s' % options.offrate
  325. else:
  326. offrate = ''
  327. if options.seed != 'None' and int( options.seed ) >= 0:
  328. seed = '--seed %s' % options.seed
  329. else:
  330. seed = ''
  331. if options.snpphred != 'None' and int( options.snpphred ) >= 0:
  332. snpphred = '--snpphred %s' % options.snpphred
  333. else:
  334. snpphred = ''
  335. if options.snpfrac != 'None' and float( options.snpfrac ) >= 0:
  336. snpfrac = '--snpfrac %s' % options.snpfrac
  337. else:
  338. snpfrac = ''
  339. if options.keepends != 'None' and options.keepends == 'doKeepends':
  340. keepends = '--col-keepends'
  341. else:
  342. keepends = ''
  343. aligning_cmds = '%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s ' \
  344. '%s %s %s %s %s %s %s %s %s %s %s -p %s -S %s -q' % \
  345. ( skip, alignLimit, trimH, trimL, mismatchSeed, mismatchQual,
  346. seedLen, rounding, maqSoapAlign, minInsert, maxInsert,
  347. mateOrient, maxAlignAttempt, forwardAlign, reverseAlign,
  348. maxBacktracks, tryHard, valAlign, allValAligns, suppressAlign,
  349. best, strata, offrate, seed, colorspace, snpphred, snpfrac,
  350. keepends, options.threads, suppressHeader )
  351. except ValueError, e:
  352. # clean up temp dir
  353. if os.path.exists( tmp_index_dir ):
  354. shutil.rmtree( tmp_index_dir )
  355. stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
  356. try:
  357. # have to nest try-except in try-finally to handle 2.4
  358. try:
  359. # prepare actual aligning commands
  360. if options.paired == 'paired':
  361. cmd2 = 'bowtie %s %s -1 %s -2 %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.input2, options.output )
  362. else:
  363. cmd2 = 'bowtie %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.output )
  364. # align
  365. tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
  366. tmp_stderr = open( tmp, 'wb' )
  367. proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
  368. returncode = proc.wait()
  369. tmp_stderr.close()
  370. # get stderr, allowing for case where it's very large
  371. tmp_stderr = open( tmp, 'rb' )
  372. stderr = ''
  373. buffsize = 1048576
  374. try:
  375. while True:
  376. stderr += tmp_stderr.read( buffsize )
  377. if not stderr or len( stderr ) % buffsize != 0:
  378. break
  379. except OverflowError:
  380. pass
  381. tmp_stderr.close()
  382. if returncode != 0:
  383. raise Exception, stderr
  384. # check that there are results in the output file
  385. if os.path.getsize( options.output ) == 0:
  386. raise Exception, 'The output file is empty, there may be an error with your input file or settings.' + '\nextra: ' + str(extra)
  387. except Exception, e:
  388. stop_err( 'Error aligning sequence. ' + str( e ) )
  389. finally:
  390. # clean up temp dir
  391. if os.path.exists( tmp_index_dir ):
  392. shutil.rmtree( tmp_index_dir )
  393. stdout += 'Sequence file aligned.\n'
  394. sys.stdout.write( stdout )
  395. if __name__=="__main__": __main__()