PageRenderTime 246ms CodeModel.GetById 9ms app.highlight 148ms RepoModel.GetById 63ms app.codeStats 0ms

/tools/sr_mapping/bowtie_wrapper.py

https://bitbucket.org/h_morita_dbcls/galaxy-central
Python | 401 lines | 373 code | 1 blank | 27 comment | 59 complexity | cb6b3cf7978ca017242865976460aa2b MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Runs Bowtie on single-end or paired-end data.
  5For use with Bowtie v. 0.12.3
  6
  7usage: bowtie_wrapper.py [options]
  8    -t, --threads=t: The number of threads to run
  9    -i, --input1=i: The (forward or single-end) reads file in Sanger FASTQ format
 10    -I, --input2=I: The reverse reads file in Sanger FASTQ format
 11    -o, --output=o: The output file
 12    -4, --dataType=4: The type of data (SOLiD or Solexa)
 13    -2, --paired=2: Whether the data is single- or paired-end
 14    -g, --genomeSource=g: The type of reference provided
 15    -r, --ref=r: The reference genome to use or index
 16    -s, --skip=s: Skip the first n reads
 17    -a, --alignLimit=a: Only align the first n reads
 18    -T, --trimH=T: Trim n bases from high-quality (left) end of each read before alignment
 19    -L, --trimL=L: Trim n bases from low-quality (right) end of each read before alignment
 20    -m, --mismatchSeed=m: Maximum number of mismatches permitted in the seed
 21    -M, --mismatchQual=M: Maximum permitted total of quality values at mismatched read positions
 22    -l, --seedLen=l: Seed length
 23    -n, --rounding=n: Whether or not to round to the nearest 10 and saturating at 30
 24    -P, --maqSoapAlign=P: Choose MAQ- or SOAP-like alignment policy
 25    -w, --tryHard=: Whether or not to try as hard as possible to find valid alignments when they exist
 26    -v, --valAlign=v: Report up to n valid arguments per read
 27    -V, --allValAligns=V: Whether or not to report all valid alignments per read
 28    -G, --suppressAlign=G: Suppress all alignments for a read if more than n reportable alignments exist
 29    -b, --best=b: Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions
 30    -B, --maxBacktracks=B: Maximum number of backtracks permitted when aligning a read
 31    -R, --strata=R: Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable
 32    -j, --minInsert=j: Minimum insert size for valid paired-end alignments
 33    -J, --maxInsert=J: Maximum insert size for valid paired-end alignments
 34    -O, --mateOrient=O: The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand
 35    -A, --maxAlignAttempt=A: Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate
 36    -f, --forwardAlign=f: Whether or not to attempt to align the forward reference strand
 37    -E, --reverseAlign=E: Whether or not to attempt to align the reverse-complement reference strand
 38    -F, --offrate=F: Override the offrate of the index to n
 39    -8, --snpphred=8: SNP penalty on Phred scale
 40    -6, --snpfrac=6: Fraction of sites expected to be SNP sites
 41    -7, --keepends=7: Keep extreme-end nucleotides and qualities
 42    -S, --seed=S: Seed for pseudo-random number generator
 43    -C, --params=C: Whether to use default or specified parameters
 44    -u, --iautoB=u: Automatic or specified behavior
 45    -K, --ipacked=K: Whether or not to use a packed representation for DNA strings
 46    -Q, --ibmax=Q: Maximum number of suffixes allowed in a block
 47    -Y, --ibmaxdivn=Y: Maximum number of suffixes allowed in a block as a fraction of the length of the reference
 48    -D, --idcv=D: The period for the difference-cover sample
 49    -U, --inodc=U: Whether or not to disable the use of the difference-cover sample
 50    -y, --inoref=y: Whether or not to build the part of the reference index used only in paired-end alignment
 51    -z, --ioffrate=z: How many rows get marked during annotation of some or all of the Burrows-Wheeler rows
 52    -W, --iftab=W: The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query
 53    -X, --intoa=X: Whether or not to convert Ns in the reference sequence to As
 54    -N, --iendian=N: Endianness to use when serializing integers to the index file
 55    -Z, --iseed=Z: Seed for the pseudorandom number generator
 56    -c, --icutoff=c: Number of first bases of the reference sequence to index
 57    -x, --indexSettings=x: Whether or not indexing options are to be set
 58    -H, --suppressHeader=H: Suppress header
 59"""
 60
 61import optparse, os, shutil, subprocess, sys, tempfile
 62
 63def stop_err( msg ):
 64    sys.stderr.write( '%s\n' % msg )
 65    sys.exit()
 66
 67def __main__():
 68    #Parse Command Line
 69    parser = optparse.OptionParser()
 70    parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to run' )
 71    parser.add_option( '-4', '--dataType', dest='dataType', help='The type of data (SOLiD or Solexa)' )
 72    parser.add_option( '-i', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
 73    parser.add_option( '-I', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
 74    parser.add_option( '-o', '--output', dest='output', help='The output file' )
 75    parser.add_option( '-2', '--paired', dest='paired', help='Whether the data is single- or paired-end' )
 76    parser.add_option( '-g', '--genomeSource', dest='genomeSource', help='The type of reference provided' )
 77    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
 78    parser.add_option( '-s', '--skip', dest='skip', help='Skip the first n reads' )
 79    parser.add_option( '-a', '--alignLimit', dest='alignLimit', help='Only align the first n reads' )
 80    parser.add_option( '-T', '--trimH', dest='trimH', help='Trim n bases from high-quality (left) end of each read before alignment' )
 81    parser.add_option( '-L', '--trimL', dest='trimL', help='Trim n bases from low-quality (right) end of each read before alignment' )
 82    parser.add_option( '-m', '--mismatchSeed', dest='mismatchSeed', help='Maximum number of mismatches permitted in the seed' )
 83    parser.add_option( '-M', '--mismatchQual', dest='mismatchQual', help='Maximum permitted total of quality values at mismatched read positions' )
 84    parser.add_option( '-l', '--seedLen', dest='seedLen', help='Seed length' )
 85    parser.add_option( '-n', '--rounding', dest='rounding', help='Whether or not to round to the nearest 10 and saturating at 30' )
 86    parser.add_option( '-P', '--maqSoapAlign', dest='maqSoapAlign', help='Choose MAQ- or SOAP-like alignment policy' )
 87    parser.add_option( '-w', '--tryHard', dest='tryHard', help='Whether or not to try as hard as possible to find valid alignments when they exist' )
 88    parser.add_option( '-v', '--valAlign', dest='valAlign', help='Report up to n valid arguments per read' )
 89    parser.add_option( '-V', '--allValAligns', dest='allValAligns', help='Whether or not to report all valid alignments per read' )
 90    parser.add_option( '-G', '--suppressAlign', dest='suppressAlign', help='Suppress all alignments for a read if more than n reportable alignments exist' )
 91    parser.add_option( '-b', '--best', dest='best', help="Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions" )
 92    parser.add_option( '-B', '--maxBacktracks', dest='maxBacktracks', help='Maximum number of backtracks permitted when aligning a read' )
 93    parser.add_option( '-R', '--strata', dest='strata', help='Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable' )
 94    parser.add_option( '-j', '--minInsert', dest='minInsert', help='Minimum insert size for valid paired-end alignments' )
 95    parser.add_option( '-J', '--maxInsert', dest='maxInsert', help='Maximum insert size for valid paired-end alignments' )
 96    parser.add_option( '-O', '--mateOrient', dest='mateOrient', help='The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand' )
 97    parser.add_option( '-A', '--maxAlignAttempt', dest='maxAlignAttempt', help='Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate' )
 98    parser.add_option( '-f', '--forwardAlign', dest='forwardAlign', help='Whether or not to attempt to align the forward reference strand' )
 99    parser.add_option( '-E', '--reverseAlign', dest='reverseAlign', help='Whether or not to attempt to align the reverse-complement reference strand' )
100    parser.add_option( '-F', '--offrate', dest='offrate', help='Override the offrate of the index to n' )
101    parser.add_option( '-S', '--seed', dest='seed', help='Seed for pseudo-random number generator' )
102    parser.add_option( '-8', '--snpphred', dest='snpphred', help='SNP penalty on Phred scale' )
103    parser.add_option( '-6', '--snpfrac', dest='snpfrac', help='Fraction of sites expected to be SNP sites' )
104    parser.add_option( '-7', '--keepends', dest='keepends', help='Keep extreme-end nucleotides and qualities' )
105    parser.add_option( '-C', '--params', dest='params', help='Whether to use default or specified parameters' )
106    parser.add_option( '-u', '--iautoB', dest='iautoB', help='Automatic or specified behavior' )
107    parser.add_option( '-K', '--ipacked', dest='ipacked', help='Whether or not to use a packed representation for DNA strings' )
108    parser.add_option( '-Q', '--ibmax', dest='ibmax', help='Maximum number of suffixes allowed in a block' )
109    parser.add_option( '-Y', '--ibmaxdivn', dest='ibmaxdivn', help='Maximum number of suffixes allowed in a block as a fraction of the length of the reference' )
110    parser.add_option( '-D', '--idcv', dest='idcv', help='The period for the difference-cover sample' )
111    parser.add_option( '-U', '--inodc', dest='inodc', help='Whether or not to disable the use of the difference-cover sample' )
112    parser.add_option( '-y', '--inoref', dest='inoref', help='Whether or not to build the part of the reference index used only in paired-end alignment' )
113    parser.add_option( '-z', '--ioffrate', dest='ioffrate', help='How many rows get marked during annotation of some or all of the Burrows-Wheeler rows' )
114    parser.add_option( '-W', '--iftab', dest='iftab', help='The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query' )
115    parser.add_option( '-X', '--intoa', dest='intoa', help='Whether or not to convert Ns in the reference sequence to As' )
116    parser.add_option( '-N', '--iendian', dest='iendian', help='Endianness to use when serializing integers to the index file' )
117    parser.add_option( '-Z', '--iseed', dest='iseed', help='Seed for the pseudorandom number generator' )
118    parser.add_option( '-c', '--icutoff', dest='icutoff', help='Number of first bases of the reference sequence to index' )
119    parser.add_option( '-x', '--indexSettings', dest='index_settings', help='Whether or not indexing options are to be set' )
120    parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
121    (options, args) = parser.parse_args()
122    stdout = ''
123    # make temp directory for placement of indices and copy reference file there if necessary
124    tmp_index_dir = tempfile.mkdtemp()
125    # get type of data (solid or solexa)
126    if options.dataType == 'solid':
127        colorspace = '-C'
128    else:
129        colorspace = ''
130    # index if necessary
131    if options.genomeSource == 'history':
132        # set up commands
133        if options.index_settings =='indexPreSet':
134            indexing_cmds = '%s' % colorspace
135        else:
136            try:
137                if options.iautoB != 'None' and options.iautoB == 'set':
138                    iautoB = '--noauto'
139                else:
140                    iautoB = ''
141                if options. ipacked != 'None' and options.ipacked == 'packed':
142                    ipacked = '--packed'
143                else:
144                    ipacked = ''
145                if options.ibmax != 'None' and int( options.ibmax ) >= 1:
146                    ibmax = '--bmax %s' % options.ibmax 
147                else:
148                    ibmax = ''
149                if options.ibmaxdivn != 'None' and int( options.ibmaxdivn ) >= 0:
150                    ibmaxdivn = '--bmaxdivn %s' % options.ibmaxdivn
151                else:
152                    ibmaxdivn = ''
153                if options.idcv != 'None' and int( options.idcv ) > 0:
154                    idcv = '--dcv %s' % options.idcv
155                else:
156                    idcv = ''
157                if options.inodc != 'None' and options.inodc == 'nodc':
158                    inodc = '--nodc'
159                else:
160                    inodc = ''
161                if options.inoref != 'None' and options.inoref == 'noref':
162                    inoref = '--noref'
163                else:
164                    inoref = ''
165                if options.iftab != 'None' and int( options.iftab ) >= 0:
166                    iftab = '--ftabchars %s' % options.iftab
167                else:
168                    iftab = ''
169                if options.intoa != 'None' and options.intoa == 'yes':
170                    intoa = '--ntoa'
171                else:
172                    intoa = ''
173                if options.iendian != 'None' and options.iendian == 'big':
174                    iendian = '--big'
175                else:
176                    iendian = '--little'
177                if options.iseed != 'None' and int( options.iseed ) > 0:
178                    iseed = '--seed %s' % options.iseed
179                else:
180                    iseed = ''
181                if options.icutoff != 'None' and int( options.icutoff ) > 0:
182                    icutoff = '--cutoff %s' % options.icutoff
183                else:
184                    icutoff = ''
185                indexing_cmds = '%s %s %s %s %s %s %s --offrate %s %s %s %s %s %s %s' % \
186                                ( iautoB, ipacked, ibmax, ibmaxdivn, idcv, inodc, 
187                                  inoref, options.ioffrate, iftab, intoa, iendian, 
188                                  iseed, icutoff, colorspace )
189            except ValueError, e:
190                # clean up temp dir
191                if os.path.exists( tmp_index_dir ):
192                    shutil.rmtree( tmp_index_dir )
193                stop_err( 'Something is wrong with the indexing parameters and the indexing and alignment could not be run\n' + str( e ) )
194        ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
195        ref_file_name = ref_file.name
196        ref_file.close()
197        os.symlink( options.ref, ref_file_name )
198        cmd1 = 'bowtie-build %s -f %s %s' % ( indexing_cmds, ref_file_name, ref_file_name )
199        try:
200            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
201            tmp_stderr = open( tmp, 'wb' )
202            proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
203            returncode = proc.wait()
204            tmp_stderr.close()
205            # get stderr, allowing for case where it's very large
206            tmp_stderr = open( tmp, 'rb' )
207            stderr = ''
208            buffsize = 1048576
209            try:
210                while True:
211                    stderr += tmp_stderr.read( buffsize )
212                    if not stderr or len( stderr ) % buffsize != 0:
213                        break
214            except OverflowError:
215                pass
216            tmp_stderr.close()
217            if returncode != 0:
218                raise Exception, stderr
219        except Exception, e:
220            # clean up temp dir
221            if os.path.exists( tmp_index_dir ):
222                shutil.rmtree( tmp_index_dir )
223            stop_err( 'Error indexing reference sequence\n' + str( e ) )
224        stdout += 'File indexed. '
225    else:
226        ref_file_name = options.ref
227    # set up aligning and generate aligning command options
228    # automatically set threads in both cases
229    if options.suppressHeader == 'true':
230        suppressHeader = '--sam-nohead'
231    else:
232        suppressHeader = ''
233    if options.maxInsert != 'None' and int( options.maxInsert ) > 0:
234        maxInsert = '-X %s' % options.maxInsert
235    else:
236        maxInsert = ''
237    if options.mateOrient != 'None':
238        mateOrient = '--%s' % options.mateOrient
239    else:
240        mateOrient = ''
241    if options.params == 'preSet':
242        aligning_cmds = '%s %s -p %s -S %s -q %s ' % \
243                ( maxInsert, mateOrient, options.threads, suppressHeader, colorspace )
244    else:
245        try:
246            if options.skip != 'None' and int( options.skip ) > 0:
247                skip = '-s %s' % options.skip
248            else:
249                skip = ''
250            if options.alignLimit != 'None' and int( options.alignLimit ) >= 0:
251                alignLimit = '-u %s' % options.alignLimit
252            else:
253                alignLimit = ''
254            if options.trimH != 'None' and int( options.trimH ) > 0:
255                trimH = '-5 %s' % options.trimH
256            else:
257                trimH = ''
258            if options.trimL != 'None' and int( options.trimL ) > 0:
259                trimL = '-3 %s' % options.trimL
260            else:
261                trimL = ''
262            if options. mismatchSeed != 'None' and (options.mismatchSeed == '0' or options.mismatchSeed == '1' \
263                        or options.mismatchSeed == '2' or options.mismatchSeed == '3'):
264                mismatchSeed = '-n %s' % options.mismatchSeed
265            else:
266                mismatchSeed = ''
267            if options.mismatchQual != 'None' and int( options.mismatchQual ) >= 0:
268                mismatchQual = '-e %s' % options.mismatchQual
269            else:
270                mismatchQual = ''
271            if options.seedLen != 'None' and int( options.seedLen ) >= 5:
272                seedLen = '-l %s' % options.seedLen
273            else:
274                seedLen = ''
275            if options.rounding == 'noRound':
276                rounding = '--nomaqround'
277            else:
278                rounding = ''
279            if options.maqSoapAlign != '-1':
280                maqSoapAlign = '-v %s' % options.maqSoapAlign
281            else:
282                maqSoapAlign = ''
283            if options.minInsert != 'None' and int( options.minInsert ) > 0:
284                minInsert = '-I %s' % options.minInsert
285            else:
286                minInsert = ''
287            if options.maxAlignAttempt != 'None' and int( options.maxAlignAttempt ) >= 0:
288                maxAlignAttempt = '--pairtries %s' % options.maxAlignAttempt
289            else:
290                maxAlignAttempt = ''
291            if options.forwardAlign == 'noForward':
292                forwardAlign = '--nofw'
293            else:
294                forwardAlign = ''
295            if options.reverseAlign == 'noReverse':
296                reverseAlign = '--norc'
297            else:
298                reverseAlign = ''
299            if options.maxBacktracks != 'None' and int( options.maxBacktracks ) > 0 and \
300                    ( options.mismatchSeed == '2' or options.mismatchSeed == '3' ):
301                maxBacktracks = '--maxbts %s' % options.maxBacktracks
302            else:
303                maxBacktracks = ''
304            if options.tryHard == 'doTryHard':
305                tryHard = '-y'
306            else:
307                tryHard = ''
308            if options.valAlign != 'None' and int( options.valAlign ) >= 0:
309                valAlign = '-k %s' % options.valAlign
310            else:
311                valAlign = ''
312            if options.allValAligns == 'doAllValAligns':
313                allValAligns = '-a'
314            else:
315                allValAligns = ''
316            if options.suppressAlign != 'None' and int( options.suppressAlign ) >= 0:
317                suppressAlign = '-m %s' % options.suppressAlign
318            else:
319                suppressAlign = ''
320            if options.best == 'doBest':
321                best = '--best'
322            else:
323                best = ''
324            if options.strata == 'doStrata':
325                strata = '--strata'
326            else:
327                strata = ''
328            if options.offrate != 'None' and int( options.offrate ) >= 0:
329                offrate = '-o %s' % options.offrate
330            else:
331                offrate = ''
332            if options.seed != 'None' and int( options.seed ) >= 0:
333                seed = '--seed %s' % options.seed
334            else:
335                seed = ''
336            if options.snpphred != 'None' and int( options.snpphred ) >= 0:
337                snpphred = '--snpphred %s' % options.snpphred
338            else:
339                snpphred = ''
340                if options.snpfrac != 'None' and float( options.snpfrac ) >= 0:
341                    snpfrac = '--snpfrac %s' % options.snpfrac
342                else:
343                    snpfrac = ''
344            if options.keepends != 'None' and options.keepends == 'doKeepends':
345                keepends = '--col-keepends'
346            else:
347                keepends = ''
348            aligning_cmds = '%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s ' \
349                            '%s %s %s %s %s %s %s %s %s %s %s -p %s -S %s -q' % \
350                            ( skip, alignLimit, trimH, trimL, mismatchSeed, mismatchQual, 
351                              seedLen, rounding, maqSoapAlign, minInsert, maxInsert, 
352                              mateOrient, maxAlignAttempt, forwardAlign, reverseAlign, 
353                              maxBacktracks, tryHard, valAlign, allValAligns, suppressAlign, 
354                              best, strata, offrate, seed, colorspace, snpphred, snpfrac, 
355                              keepends, options.threads, suppressHeader )
356        except ValueError, e:
357            # clean up temp dir
358            if os.path.exists( tmp_index_dir ):
359                shutil.rmtree( tmp_index_dir )
360            stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
361    try:
362        # have to nest try-except in try-finally to handle 2.4
363        try:
364            # prepare actual aligning commands
365            if options.paired == 'paired':
366                cmd2 = 'bowtie %s %s -1 %s -2 %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.input2, options.output )
367            else:
368                cmd2 = 'bowtie %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.output )
369            # align
370            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
371            tmp_stderr = open( tmp, 'wb' )
372            proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
373            returncode = proc.wait()
374            tmp_stderr.close()
375            # get stderr, allowing for case where it's very large
376            tmp_stderr = open( tmp, 'rb' )
377            stderr = ''
378            buffsize = 1048576
379            try:
380                while True:
381                    stderr += tmp_stderr.read( buffsize )
382                    if not stderr or len( stderr ) % buffsize != 0:
383                        break
384            except OverflowError:
385                pass
386            tmp_stderr.close()
387            if returncode != 0:
388                raise Exception, stderr
389            # check that there are results in the output file
390            if os.path.getsize( options.output ) == 0:
391                raise Exception, 'The output file is empty, there may be an error with your input file or settings.' + '\nextra: ' + str(extra)
392        except Exception, e:
393            stop_err( 'Error aligning sequence. ' + str( e ) )
394    finally:
395        # clean up temp dir
396        if os.path.exists( tmp_index_dir ):
397            shutil.rmtree( tmp_index_dir )
398    stdout += 'Sequence file aligned.\n'
399    sys.stdout.write( stdout )
400
401if __name__=="__main__": __main__()