/tools/sr_mapping/bowtie_wrapper.py
Python | 401 lines | 373 code | 1 blank | 27 comment | 59 complexity | cb6b3cf7978ca017242865976460aa2b MD5 | raw file
1#!/usr/bin/env python 2 3""" 4Runs Bowtie on single-end or paired-end data. 5For use with Bowtie v. 0.12.3 6 7usage: bowtie_wrapper.py [options] 8 -t, --threads=t: The number of threads to run 9 -i, --input1=i: The (forward or single-end) reads file in Sanger FASTQ format 10 -I, --input2=I: The reverse reads file in Sanger FASTQ format 11 -o, --output=o: The output file 12 -4, --dataType=4: The type of data (SOLiD or Solexa) 13 -2, --paired=2: Whether the data is single- or paired-end 14 -g, --genomeSource=g: The type of reference provided 15 -r, --ref=r: The reference genome to use or index 16 -s, --skip=s: Skip the first n reads 17 -a, --alignLimit=a: Only align the first n reads 18 -T, --trimH=T: Trim n bases from high-quality (left) end of each read before alignment 19 -L, --trimL=L: Trim n bases from low-quality (right) end of each read before alignment 20 -m, --mismatchSeed=m: Maximum number of mismatches permitted in the seed 21 -M, --mismatchQual=M: Maximum permitted total of quality values at mismatched read positions 22 -l, --seedLen=l: Seed length 23 -n, --rounding=n: Whether or not to round to the nearest 10 and saturating at 30 24 -P, --maqSoapAlign=P: Choose MAQ- or SOAP-like alignment policy 25 -w, --tryHard=: Whether or not to try as hard as possible to find valid alignments when they exist 26 -v, --valAlign=v: Report up to n valid arguments per read 27 -V, --allValAligns=V: Whether or not to report all valid alignments per read 28 -G, --suppressAlign=G: Suppress all alignments for a read if more than n reportable alignments exist 29 -b, --best=b: Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions 30 -B, --maxBacktracks=B: Maximum number of backtracks permitted when aligning a read 31 -R, --strata=R: Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable 32 -j, --minInsert=j: Minimum insert size for valid paired-end alignments 33 -J, --maxInsert=J: Maximum insert size for valid paired-end alignments 34 -O, --mateOrient=O: The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand 35 -A, --maxAlignAttempt=A: Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate 36 -f, --forwardAlign=f: Whether or not to attempt to align the forward reference strand 37 -E, --reverseAlign=E: Whether or not to attempt to align the reverse-complement reference strand 38 -F, --offrate=F: Override the offrate of the index to n 39 -8, --snpphred=8: SNP penalty on Phred scale 40 -6, --snpfrac=6: Fraction of sites expected to be SNP sites 41 -7, --keepends=7: Keep extreme-end nucleotides and qualities 42 -S, --seed=S: Seed for pseudo-random number generator 43 -C, --params=C: Whether to use default or specified parameters 44 -u, --iautoB=u: Automatic or specified behavior 45 -K, --ipacked=K: Whether or not to use a packed representation for DNA strings 46 -Q, --ibmax=Q: Maximum number of suffixes allowed in a block 47 -Y, --ibmaxdivn=Y: Maximum number of suffixes allowed in a block as a fraction of the length of the reference 48 -D, --idcv=D: The period for the difference-cover sample 49 -U, --inodc=U: Whether or not to disable the use of the difference-cover sample 50 -y, --inoref=y: Whether or not to build the part of the reference index used only in paired-end alignment 51 -z, --ioffrate=z: How many rows get marked during annotation of some or all of the Burrows-Wheeler rows 52 -W, --iftab=W: The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query 53 -X, --intoa=X: Whether or not to convert Ns in the reference sequence to As 54 -N, --iendian=N: Endianness to use when serializing integers to the index file 55 -Z, --iseed=Z: Seed for the pseudorandom number generator 56 -c, --icutoff=c: Number of first bases of the reference sequence to index 57 -x, --indexSettings=x: Whether or not indexing options are to be set 58 -H, --suppressHeader=H: Suppress header 59""" 60 61import optparse, os, shutil, subprocess, sys, tempfile 62 63def stop_err( msg ): 64 sys.stderr.write( '%s\n' % msg ) 65 sys.exit() 66 67def __main__(): 68 #Parse Command Line 69 parser = optparse.OptionParser() 70 parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to run' ) 71 parser.add_option( '-4', '--dataType', dest='dataType', help='The type of data (SOLiD or Solexa)' ) 72 parser.add_option( '-i', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' ) 73 parser.add_option( '-I', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' ) 74 parser.add_option( '-o', '--output', dest='output', help='The output file' ) 75 parser.add_option( '-2', '--paired', dest='paired', help='Whether the data is single- or paired-end' ) 76 parser.add_option( '-g', '--genomeSource', dest='genomeSource', help='The type of reference provided' ) 77 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' ) 78 parser.add_option( '-s', '--skip', dest='skip', help='Skip the first n reads' ) 79 parser.add_option( '-a', '--alignLimit', dest='alignLimit', help='Only align the first n reads' ) 80 parser.add_option( '-T', '--trimH', dest='trimH', help='Trim n bases from high-quality (left) end of each read before alignment' ) 81 parser.add_option( '-L', '--trimL', dest='trimL', help='Trim n bases from low-quality (right) end of each read before alignment' ) 82 parser.add_option( '-m', '--mismatchSeed', dest='mismatchSeed', help='Maximum number of mismatches permitted in the seed' ) 83 parser.add_option( '-M', '--mismatchQual', dest='mismatchQual', help='Maximum permitted total of quality values at mismatched read positions' ) 84 parser.add_option( '-l', '--seedLen', dest='seedLen', help='Seed length' ) 85 parser.add_option( '-n', '--rounding', dest='rounding', help='Whether or not to round to the nearest 10 and saturating at 30' ) 86 parser.add_option( '-P', '--maqSoapAlign', dest='maqSoapAlign', help='Choose MAQ- or SOAP-like alignment policy' ) 87 parser.add_option( '-w', '--tryHard', dest='tryHard', help='Whether or not to try as hard as possible to find valid alignments when they exist' ) 88 parser.add_option( '-v', '--valAlign', dest='valAlign', help='Report up to n valid arguments per read' ) 89 parser.add_option( '-V', '--allValAligns', dest='allValAligns', help='Whether or not to report all valid alignments per read' ) 90 parser.add_option( '-G', '--suppressAlign', dest='suppressAlign', help='Suppress all alignments for a read if more than n reportable alignments exist' ) 91 parser.add_option( '-b', '--best', dest='best', help="Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions" ) 92 parser.add_option( '-B', '--maxBacktracks', dest='maxBacktracks', help='Maximum number of backtracks permitted when aligning a read' ) 93 parser.add_option( '-R', '--strata', dest='strata', help='Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable' ) 94 parser.add_option( '-j', '--minInsert', dest='minInsert', help='Minimum insert size for valid paired-end alignments' ) 95 parser.add_option( '-J', '--maxInsert', dest='maxInsert', help='Maximum insert size for valid paired-end alignments' ) 96 parser.add_option( '-O', '--mateOrient', dest='mateOrient', help='The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand' ) 97 parser.add_option( '-A', '--maxAlignAttempt', dest='maxAlignAttempt', help='Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate' ) 98 parser.add_option( '-f', '--forwardAlign', dest='forwardAlign', help='Whether or not to attempt to align the forward reference strand' ) 99 parser.add_option( '-E', '--reverseAlign', dest='reverseAlign', help='Whether or not to attempt to align the reverse-complement reference strand' ) 100 parser.add_option( '-F', '--offrate', dest='offrate', help='Override the offrate of the index to n' ) 101 parser.add_option( '-S', '--seed', dest='seed', help='Seed for pseudo-random number generator' ) 102 parser.add_option( '-8', '--snpphred', dest='snpphred', help='SNP penalty on Phred scale' ) 103 parser.add_option( '-6', '--snpfrac', dest='snpfrac', help='Fraction of sites expected to be SNP sites' ) 104 parser.add_option( '-7', '--keepends', dest='keepends', help='Keep extreme-end nucleotides and qualities' ) 105 parser.add_option( '-C', '--params', dest='params', help='Whether to use default or specified parameters' ) 106 parser.add_option( '-u', '--iautoB', dest='iautoB', help='Automatic or specified behavior' ) 107 parser.add_option( '-K', '--ipacked', dest='ipacked', help='Whether or not to use a packed representation for DNA strings' ) 108 parser.add_option( '-Q', '--ibmax', dest='ibmax', help='Maximum number of suffixes allowed in a block' ) 109 parser.add_option( '-Y', '--ibmaxdivn', dest='ibmaxdivn', help='Maximum number of suffixes allowed in a block as a fraction of the length of the reference' ) 110 parser.add_option( '-D', '--idcv', dest='idcv', help='The period for the difference-cover sample' ) 111 parser.add_option( '-U', '--inodc', dest='inodc', help='Whether or not to disable the use of the difference-cover sample' ) 112 parser.add_option( '-y', '--inoref', dest='inoref', help='Whether or not to build the part of the reference index used only in paired-end alignment' ) 113 parser.add_option( '-z', '--ioffrate', dest='ioffrate', help='How many rows get marked during annotation of some or all of the Burrows-Wheeler rows' ) 114 parser.add_option( '-W', '--iftab', dest='iftab', help='The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query' ) 115 parser.add_option( '-X', '--intoa', dest='intoa', help='Whether or not to convert Ns in the reference sequence to As' ) 116 parser.add_option( '-N', '--iendian', dest='iendian', help='Endianness to use when serializing integers to the index file' ) 117 parser.add_option( '-Z', '--iseed', dest='iseed', help='Seed for the pseudorandom number generator' ) 118 parser.add_option( '-c', '--icutoff', dest='icutoff', help='Number of first bases of the reference sequence to index' ) 119 parser.add_option( '-x', '--indexSettings', dest='index_settings', help='Whether or not indexing options are to be set' ) 120 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' ) 121 (options, args) = parser.parse_args() 122 stdout = '' 123 # make temp directory for placement of indices and copy reference file there if necessary 124 tmp_index_dir = tempfile.mkdtemp() 125 # get type of data (solid or solexa) 126 if options.dataType == 'solid': 127 colorspace = '-C' 128 else: 129 colorspace = '' 130 # index if necessary 131 if options.genomeSource == 'history': 132 # set up commands 133 if options.index_settings =='indexPreSet': 134 indexing_cmds = '%s' % colorspace 135 else: 136 try: 137 if options.iautoB != 'None' and options.iautoB == 'set': 138 iautoB = '--noauto' 139 else: 140 iautoB = '' 141 if options. ipacked != 'None' and options.ipacked == 'packed': 142 ipacked = '--packed' 143 else: 144 ipacked = '' 145 if options.ibmax != 'None' and int( options.ibmax ) >= 1: 146 ibmax = '--bmax %s' % options.ibmax 147 else: 148 ibmax = '' 149 if options.ibmaxdivn != 'None' and int( options.ibmaxdivn ) >= 0: 150 ibmaxdivn = '--bmaxdivn %s' % options.ibmaxdivn 151 else: 152 ibmaxdivn = '' 153 if options.idcv != 'None' and int( options.idcv ) > 0: 154 idcv = '--dcv %s' % options.idcv 155 else: 156 idcv = '' 157 if options.inodc != 'None' and options.inodc == 'nodc': 158 inodc = '--nodc' 159 else: 160 inodc = '' 161 if options.inoref != 'None' and options.inoref == 'noref': 162 inoref = '--noref' 163 else: 164 inoref = '' 165 if options.iftab != 'None' and int( options.iftab ) >= 0: 166 iftab = '--ftabchars %s' % options.iftab 167 else: 168 iftab = '' 169 if options.intoa != 'None' and options.intoa == 'yes': 170 intoa = '--ntoa' 171 else: 172 intoa = '' 173 if options.iendian != 'None' and options.iendian == 'big': 174 iendian = '--big' 175 else: 176 iendian = '--little' 177 if options.iseed != 'None' and int( options.iseed ) > 0: 178 iseed = '--seed %s' % options.iseed 179 else: 180 iseed = '' 181 if options.icutoff != 'None' and int( options.icutoff ) > 0: 182 icutoff = '--cutoff %s' % options.icutoff 183 else: 184 icutoff = '' 185 indexing_cmds = '%s %s %s %s %s %s %s --offrate %s %s %s %s %s %s %s' % \ 186 ( iautoB, ipacked, ibmax, ibmaxdivn, idcv, inodc, 187 inoref, options.ioffrate, iftab, intoa, iendian, 188 iseed, icutoff, colorspace ) 189 except ValueError, e: 190 # clean up temp dir 191 if os.path.exists( tmp_index_dir ): 192 shutil.rmtree( tmp_index_dir ) 193 stop_err( 'Something is wrong with the indexing parameters and the indexing and alignment could not be run\n' + str( e ) ) 194 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) 195 ref_file_name = ref_file.name 196 ref_file.close() 197 os.symlink( options.ref, ref_file_name ) 198 cmd1 = 'bowtie-build %s -f %s %s' % ( indexing_cmds, ref_file_name, ref_file_name ) 199 try: 200 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name 201 tmp_stderr = open( tmp, 'wb' ) 202 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) 203 returncode = proc.wait() 204 tmp_stderr.close() 205 # get stderr, allowing for case where it's very large 206 tmp_stderr = open( tmp, 'rb' ) 207 stderr = '' 208 buffsize = 1048576 209 try: 210 while True: 211 stderr += tmp_stderr.read( buffsize ) 212 if not stderr or len( stderr ) % buffsize != 0: 213 break 214 except OverflowError: 215 pass 216 tmp_stderr.close() 217 if returncode != 0: 218 raise Exception, stderr 219 except Exception, e: 220 # clean up temp dir 221 if os.path.exists( tmp_index_dir ): 222 shutil.rmtree( tmp_index_dir ) 223 stop_err( 'Error indexing reference sequence\n' + str( e ) ) 224 stdout += 'File indexed. ' 225 else: 226 ref_file_name = options.ref 227 # set up aligning and generate aligning command options 228 # automatically set threads in both cases 229 if options.suppressHeader == 'true': 230 suppressHeader = '--sam-nohead' 231 else: 232 suppressHeader = '' 233 if options.maxInsert != 'None' and int( options.maxInsert ) > 0: 234 maxInsert = '-X %s' % options.maxInsert 235 else: 236 maxInsert = '' 237 if options.mateOrient != 'None': 238 mateOrient = '--%s' % options.mateOrient 239 else: 240 mateOrient = '' 241 if options.params == 'preSet': 242 aligning_cmds = '%s %s -p %s -S %s -q %s ' % \ 243 ( maxInsert, mateOrient, options.threads, suppressHeader, colorspace ) 244 else: 245 try: 246 if options.skip != 'None' and int( options.skip ) > 0: 247 skip = '-s %s' % options.skip 248 else: 249 skip = '' 250 if options.alignLimit != 'None' and int( options.alignLimit ) >= 0: 251 alignLimit = '-u %s' % options.alignLimit 252 else: 253 alignLimit = '' 254 if options.trimH != 'None' and int( options.trimH ) > 0: 255 trimH = '-5 %s' % options.trimH 256 else: 257 trimH = '' 258 if options.trimL != 'None' and int( options.trimL ) > 0: 259 trimL = '-3 %s' % options.trimL 260 else: 261 trimL = '' 262 if options. mismatchSeed != 'None' and (options.mismatchSeed == '0' or options.mismatchSeed == '1' \ 263 or options.mismatchSeed == '2' or options.mismatchSeed == '3'): 264 mismatchSeed = '-n %s' % options.mismatchSeed 265 else: 266 mismatchSeed = '' 267 if options.mismatchQual != 'None' and int( options.mismatchQual ) >= 0: 268 mismatchQual = '-e %s' % options.mismatchQual 269 else: 270 mismatchQual = '' 271 if options.seedLen != 'None' and int( options.seedLen ) >= 5: 272 seedLen = '-l %s' % options.seedLen 273 else: 274 seedLen = '' 275 if options.rounding == 'noRound': 276 rounding = '--nomaqround' 277 else: 278 rounding = '' 279 if options.maqSoapAlign != '-1': 280 maqSoapAlign = '-v %s' % options.maqSoapAlign 281 else: 282 maqSoapAlign = '' 283 if options.minInsert != 'None' and int( options.minInsert ) > 0: 284 minInsert = '-I %s' % options.minInsert 285 else: 286 minInsert = '' 287 if options.maxAlignAttempt != 'None' and int( options.maxAlignAttempt ) >= 0: 288 maxAlignAttempt = '--pairtries %s' % options.maxAlignAttempt 289 else: 290 maxAlignAttempt = '' 291 if options.forwardAlign == 'noForward': 292 forwardAlign = '--nofw' 293 else: 294 forwardAlign = '' 295 if options.reverseAlign == 'noReverse': 296 reverseAlign = '--norc' 297 else: 298 reverseAlign = '' 299 if options.maxBacktracks != 'None' and int( options.maxBacktracks ) > 0 and \ 300 ( options.mismatchSeed == '2' or options.mismatchSeed == '3' ): 301 maxBacktracks = '--maxbts %s' % options.maxBacktracks 302 else: 303 maxBacktracks = '' 304 if options.tryHard == 'doTryHard': 305 tryHard = '-y' 306 else: 307 tryHard = '' 308 if options.valAlign != 'None' and int( options.valAlign ) >= 0: 309 valAlign = '-k %s' % options.valAlign 310 else: 311 valAlign = '' 312 if options.allValAligns == 'doAllValAligns': 313 allValAligns = '-a' 314 else: 315 allValAligns = '' 316 if options.suppressAlign != 'None' and int( options.suppressAlign ) >= 0: 317 suppressAlign = '-m %s' % options.suppressAlign 318 else: 319 suppressAlign = '' 320 if options.best == 'doBest': 321 best = '--best' 322 else: 323 best = '' 324 if options.strata == 'doStrata': 325 strata = '--strata' 326 else: 327 strata = '' 328 if options.offrate != 'None' and int( options.offrate ) >= 0: 329 offrate = '-o %s' % options.offrate 330 else: 331 offrate = '' 332 if options.seed != 'None' and int( options.seed ) >= 0: 333 seed = '--seed %s' % options.seed 334 else: 335 seed = '' 336 if options.snpphred != 'None' and int( options.snpphred ) >= 0: 337 snpphred = '--snpphred %s' % options.snpphred 338 else: 339 snpphred = '' 340 if options.snpfrac != 'None' and float( options.snpfrac ) >= 0: 341 snpfrac = '--snpfrac %s' % options.snpfrac 342 else: 343 snpfrac = '' 344 if options.keepends != 'None' and options.keepends == 'doKeepends': 345 keepends = '--col-keepends' 346 else: 347 keepends = '' 348 aligning_cmds = '%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s ' \ 349 '%s %s %s %s %s %s %s %s %s %s %s -p %s -S %s -q' % \ 350 ( skip, alignLimit, trimH, trimL, mismatchSeed, mismatchQual, 351 seedLen, rounding, maqSoapAlign, minInsert, maxInsert, 352 mateOrient, maxAlignAttempt, forwardAlign, reverseAlign, 353 maxBacktracks, tryHard, valAlign, allValAligns, suppressAlign, 354 best, strata, offrate, seed, colorspace, snpphred, snpfrac, 355 keepends, options.threads, suppressHeader ) 356 except ValueError, e: 357 # clean up temp dir 358 if os.path.exists( tmp_index_dir ): 359 shutil.rmtree( tmp_index_dir ) 360 stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) ) 361 try: 362 # have to nest try-except in try-finally to handle 2.4 363 try: 364 # prepare actual aligning commands 365 if options.paired == 'paired': 366 cmd2 = 'bowtie %s %s -1 %s -2 %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.input2, options.output ) 367 else: 368 cmd2 = 'bowtie %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.output ) 369 # align 370 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name 371 tmp_stderr = open( tmp, 'wb' ) 372 proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) 373 returncode = proc.wait() 374 tmp_stderr.close() 375 # get stderr, allowing for case where it's very large 376 tmp_stderr = open( tmp, 'rb' ) 377 stderr = '' 378 buffsize = 1048576 379 try: 380 while True: 381 stderr += tmp_stderr.read( buffsize ) 382 if not stderr or len( stderr ) % buffsize != 0: 383 break 384 except OverflowError: 385 pass 386 tmp_stderr.close() 387 if returncode != 0: 388 raise Exception, stderr 389 # check that there are results in the output file 390 if os.path.getsize( options.output ) == 0: 391 raise Exception, 'The output file is empty, there may be an error with your input file or settings.' + '\nextra: ' + str(extra) 392 except Exception, e: 393 stop_err( 'Error aligning sequence. ' + str( e ) ) 394 finally: 395 # clean up temp dir 396 if os.path.exists( tmp_index_dir ): 397 shutil.rmtree( tmp_index_dir ) 398 stdout += 'Sequence file aligned.\n' 399 sys.stdout.write( stdout ) 400 401if __name__=="__main__": __main__()