PageRenderTime 49ms CodeModel.GetById 19ms app.highlight 25ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/sr_mapping/bfast_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 344 lines | 321 code | 1 blank | 22 comment | 19 complexity | 6e7c87ad530a3d87cbd06c67dd9b6779 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Runs BFAST on single-end or paired-end data.
  5TODO: more documentation
  6
  7TODO: 
  8    - auto-detect gzip or bz2
  9    - split options (?)
 10    - queue lengths (?)
 11    - assumes reference always has been indexed
 12    - main and secondary indexes
 13    - scoring matrix file ?
 14    - read group file ?
 15
 16usage: bfast_wrapper.py [options]
 17    -r, --ref=r: The reference genome to use or index
 18    -f, --fastq=f: The fastq file to use for the mapping
 19    -F, --output=u: The file to save the output (SAM format)
 20    -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
 21    -p, --params=p: Parameter setting to use (pre_set or full)
 22    -n, --numThreads=n: The number of threads to use
 23    -A, --space=A: The encoding space (0: base 1: color)
 24    -o, --offsets=o: The offsets for 'match'
 25    -l, --loadAllIndexes=l: Load all indexes into memory
 26    -k, --keySize=k: truncate key size in 'match'
 27    -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored
 28    -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded
 29    -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse)
 30    -t, --timing=t: output timing information to stderr
 31    -u, --ungapped=u: performed ungapped local alignment
 32    -U, --unconstrained=U: performed local alignment without mask constraints
 33    -O, --offset=O: the number of bases before and after each hit to consider in local alignment
 34    -q, --avgMismatchQuality=q: average mismatch quality
 35    -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all)
 36    -P, --disallowPairing=P: do not choose alignments based on pairing
 37    -R, --reverse=R: paired end reads are given on reverse strands
 38    -z, --random=z: output a random best scoring alignment
 39    -D, --dbkey=D: Dbkey for reference genome
 40    -H, --suppressHeader=H: Suppress the sam header
 41"""
 42
 43import optparse, os, shutil, subprocess, sys, tempfile
 44
 45def stop_err( msg ):
 46    sys.stderr.write( '%s\n' % msg )
 47    sys.exit()
 48
 49def __main__():
 50    #Parse Command Line
 51    parser = optparse.OptionParser()
 52    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
 53    parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' )
 54    parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' )
 55    parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' )
 56    parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' )
 57    parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' )
 58    parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timming information to stderr' )
 59    parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' )
 60    parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' )
 61    parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' )
 62    parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' )
 63    parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' )
 64    parser.add_option( '--indexExonsFileName', dest='indexExonsFileName', default="", help='The exons file to use for the indexing' )
 65    
 66    parser.add_option( '-o', '--offsets', dest='offsets', default="", help='The offsets for \'match\'' )
 67    parser.add_option( '-k', '--keySize', dest='keySize', type="int", default="-1", help='truncate key size in \'match\'' )
 68    parser.add_option( '-K', '--maxKeyMatches', dest='maxKeyMatches', type="int", default="-1", help='the maximum number of matches to allow before a key is ignored' )
 69    parser.add_option( '-M', '--maxNumMatches', dest='maxNumMatches', type="int", default="-1", help='the maximum number of matches to allow bfore the read is discarded' )
 70    parser.add_option( '-w', '--whichStrand', dest='whichStrand', type="choice", default='0', choices=('0','1','2'), help='the strands to consider (0: both 1: forward 2: reverse)' )
 71    
 72    parser.add_option( '--scoringMatrixFileName', dest='scoringMatrixFileName', help='Scoring Matrix file used to score the alignments' )
 73    parser.add_option( '-u', '--ungapped', dest='ungapped', action="store_true", default=False, help='performed ungapped local alignment' )
 74    parser.add_option( '-U', '--unconstrained', dest='unconstrained', action="store_true", default=False, help='performed local alignment without mask constraints' )
 75    parser.add_option( '-O', '--offset', dest='offset', type="int", default="0", help='the number of bases before and after each hit to consider in local alignment' )
 76    parser.add_option( '-q', '--avgMismatchQuality', type="int", default="-1", dest='avgMismatchQuality', help='average mismatch quality' )
 77    
 78    parser.add_option( '-a', '--algorithm', dest='algorithm', default='0', type="choice", choices=('0','1','2','3','4' ), help='post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all' )
 79    parser.add_option( '--unpaired', dest='unpaired', action="store_true", default=False, help='do not choose alignments based on pairing' )
 80    parser.add_option( '--reverseStrand', dest='reverseStrand', action="store_true", default=False, help='paired end reads are given on reverse strands' )
 81    parser.add_option( '--pairedEndInfer', dest='pairedEndInfer', action="store_true", default=False, help='break ties when one end of a paired end read by estimating the insert size distribution' )
 82    parser.add_option( '--randomBest', dest='randomBest', action="store_true", default=False, help='output a random best scoring alignment' )
 83    
 84    (options, args) = parser.parse_args()
 85
 86    # output version # of tool
 87    try:
 88        tmp = tempfile.NamedTemporaryFile().name
 89        tmp_stdout = open( tmp, 'wb' )
 90        proc = subprocess.Popen( args='bfast 2>&1', shell=True, stdout=tmp_stdout )
 91        tmp_stdout.close()
 92        returncode = proc.wait()
 93        stdout = None
 94        for line in open( tmp_stdout.name, 'rb' ):
 95            if line.lower().find( 'version' ) >= 0:
 96                stdout = line.strip()
 97                break
 98        if stdout:
 99            sys.stdout.write( '%s\n' % stdout )
100        else:
101            raise Exception
102    except:
103        sys.stdout.write( 'Could not determine BFAST version\n' )
104
105    buffsize = 1048576
106
107    # make temp directory for bfast, requires trailing slash
108    tmp_dir = '%s/' % tempfile.mkdtemp()
109    
110    #'generic' options used in all bfast commands here
111    if options.timing:
112        all_cmd_options = "-t"
113    else:
114        all_cmd_options = ""
115    
116    try:
117        if options.buildIndex:
118            reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
119            #build bfast indexes
120            os.symlink( options.ref, reference_filepath )
121            
122            #bfast fast2brg
123            try:
124                nuc_space = [ "0" ]
125                if options.space == "1":
126                    #color space localalign appears to require nuc space brg
127                    nuc_space.append( "1" )
128                for space in nuc_space:
129                    cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % ( reference_filepath, space, all_cmd_options )
130                    tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
131                    tmp_stderr = open( tmp, 'wb' )
132                    proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
133                    returncode = proc.wait()
134                    tmp_stderr.close()
135                    # get stderr, allowing for case where it's very large
136                    tmp_stderr = open( tmp, 'rb' )
137                    stderr = ''
138                    try:
139                        while True:
140                            stderr += tmp_stderr.read( buffsize )
141                            if not stderr or len( stderr ) % buffsize != 0:
142                                break
143                    except OverflowError:
144                        pass
145                    tmp_stderr.close()
146                    if returncode != 0:
147                        raise Exception, stderr
148            except Exception, e:
149                raise Exception, 'Error in \'bfast fasta2brg\'.\n' + str( e )
150            
151            #bfast index
152            try:
153                all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads )
154                
155                if options.indexRepeatMasker:
156                    all_index_cmds += " -R"
157                
158                if options.indexContigOptions:
159                    index_contig_options = map( int, options.indexContigOptions.split( ',' ) )
160                    if index_contig_options[0] >= 0:
161                        all_index_cmds += ' -s "%s"' % index_contig_options[0]
162                    if index_contig_options[1] >= 0:
163                        all_index_cmds += ' -S "%s"' % index_contig_options[1]
164                    if index_contig_options[2] >= 0:
165                        all_index_cmds += ' -e "%s"' % index_contig_options[2]
166                    if index_contig_options[3] >= 0:
167                        all_index_cmds += ' -E "%s"' % index_contig_options[3]
168                elif options.indexExonsFileName:
169                    all_index_cmds += ' -x "%s"' % options.indexExonsFileName
170                
171                index_count = 1
172                for mask, hash_width in [ mask.split( ':' ) for mask in options.indexMask.split( ',' ) ]:
173                    cmd = '%s -m "%s" -w "%s" -i "%i"' % ( all_index_cmds, mask, hash_width, index_count )
174                    tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
175                    tmp_stderr = open( tmp, 'wb' )
176                    proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
177                    returncode = proc.wait()
178                    tmp_stderr.close()
179                    # get stderr, allowing for case where it's very large
180                    tmp_stderr = open( tmp, 'rb' )
181                    stderr = ''
182                    try:
183                        while True:
184                            stderr += tmp_stderr.read( buffsize )
185                            if not stderr or len( stderr ) % buffsize != 0:
186                                break
187                    except OverflowError:
188                        pass
189                    tmp_stderr.close()
190                    if returncode != 0:
191                        raise Exception, stderr
192                    index_count += 1
193            except Exception, e:
194                raise Exception, 'Error in \'bfast index\'.\n' + str( e )
195            
196        else:
197            reference_filepath = options.ref
198        assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
199        
200        # set up aligning and generate aligning command options
201        # set up temp output files
202        tmp_bmf = tempfile.NamedTemporaryFile( dir=tmp_dir )
203        tmp_bmf_name = tmp_bmf.name
204        tmp_bmf.close()
205        tmp_baf = tempfile.NamedTemporaryFile( dir=tmp_dir )
206        tmp_baf_name = tmp_baf.name
207        tmp_baf.close()
208        
209        bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options )
210        bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options )
211        bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options )
212        
213        if options.offsets:
214            bfast_match_cmd += ' -o "%s"' % options.offsets
215        if options.keySize >= 0:
216            bfast_match_cmd += ' -k "%s"' % options.keySize
217        if options.maxKeyMatches >= 0:
218            bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches
219        if options.maxNumMatches >= 0:
220            bfast_match_cmd += ' -M "%s"' % options.maxNumMatches
221            bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches
222        if options.scoringMatrixFileName:
223            bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName
224            bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName
225        if options.ungapped:
226            bfast_localalign_cmd += ' -u'
227        if options.unconstrained:
228            bfast_localalign_cmd += ' -U'
229        if options.avgMismatchQuality >= 0:
230            bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality
231            bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality
232        if options.algorithm == 3:
233            if options.pairedEndInfer:
234                bfast_postprocess_cmd += ' -P'
235            if options.randomBest:
236                bfast_postprocess_cmd += ' -z'
237        if options.unpaired:
238            bfast_postprocess_cmd += ' -U'
239        if options.reverseStrand:
240            bfast_postprocess_cmd += ' -R'
241        
242        #instead of using temp files, should we stream through pipes?
243        bfast_match_cmd += " > %s" % tmp_bmf_name
244        bfast_localalign_cmd += " > %s" % tmp_baf_name
245        bfast_postprocess_cmd += " > %s" % options.output
246        
247        # need to nest try-except in try-finally to handle 2.4
248        try:
249            # bfast 'match'
250            try:
251                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
252                tmp_stderr = open( tmp, 'wb' )
253                proc = subprocess.Popen( args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
254                returncode = proc.wait()
255                tmp_stderr.close()
256                # get stderr, allowing for case where it's very large
257                tmp_stderr = open( tmp, 'rb' )
258                stderr = ''
259                try:
260                    while True:
261                        stderr += tmp_stderr.read( buffsize )
262                        if not stderr or len( stderr ) % buffsize != 0:
263                            break
264                except OverflowError:
265                    pass
266                tmp_stderr.close()
267                if returncode != 0:
268                    raise Exception, stderr
269            except Exception, e:
270                raise Exception, 'Error in \'bfast match\'. \n' + str( e )
271            # bfast 'localalign'
272            try:
273                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
274                tmp_stderr = open( tmp, 'wb' )
275                proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
276                returncode = proc.wait()
277                tmp_stderr.close()
278                # get stderr, allowing for case where it's very large
279                tmp_stderr = open( tmp, 'rb' )
280                stderr = ''
281                try:
282                    while True:
283                        stderr += tmp_stderr.read( buffsize )
284                        if not stderr or len( stderr ) % buffsize != 0:
285                            break
286                except OverflowError:
287                    pass
288                tmp_stderr.close()
289                if returncode != 0:
290                    raise Exception, stderr
291            except Exception, e:
292                raise Exception, 'Error in \'bfast localalign\'. \n' + str( e )
293            # bfast 'postprocess'
294            try:
295                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
296                tmp_stderr = open( tmp, 'wb' )
297                proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
298                returncode = proc.wait()
299                tmp_stderr.close()
300                # get stderr, allowing for case where it's very large
301                tmp_stderr = open( tmp, 'rb' )
302                stderr = ''
303                try:
304                    while True:
305                        stderr += tmp_stderr.read( buffsize )
306                        if not stderr or len( stderr ) % buffsize != 0:
307                            break
308                except OverflowError:
309                    pass
310                tmp_stderr.close()
311                if returncode != 0:
312                    raise Exception, stderr
313            except Exception, e:
314                raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e )
315            # remove header if necessary
316            if options.suppressHeader:
317                tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
318                tmp_out_name = tmp_out.name
319                tmp_out.close()
320                try:
321                    shutil.move( options.output, tmp_out_name )
322                except Exception, e:
323                    raise Exception, 'Error moving output file before removing headers. \n' + str( e )
324                fout = file( options.output, 'w' )
325                for line in file( tmp_out.name, 'r' ):
326                    if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]:
327                        fout.write( line )
328                fout.close()
329            # check that there are results in the output file
330            if os.path.getsize( options.output ) > 0:
331                if "0" == options.space:
332                    sys.stdout.write( 'BFAST run on Base Space data' )
333                else:
334                    sys.stdout.write( 'BFAST run on Color Space data' )
335            else:
336                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
337        except Exception, e:
338            stop_err( 'The alignment failed.\n' + str( e ) )
339    finally:
340        # clean up temp dir
341        if os.path.exists( tmp_dir ):
342            shutil.rmtree( tmp_dir )
343
344if __name__=="__main__": __main__()