PageRenderTime 49ms CodeModel.GetById 36ms app.highlight 10ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/metag_tools/megablast_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 129 lines | 96 code | 10 blank | 23 comment | 2 complexity | 15c4a4055989d8199dfd6879ede7996a MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3run megablast for metagenomics data
  4
  5usage: %prog [options]
  6   -d, --db_build=d: The database to use
  7   -i, --input=i: Input FASTQ candidate file
  8   -w, --word_size=w: Size of best perfect match
  9   -c, --identity_cutoff=c: Report hits at or above this identity
 10   -e, --eval_cutoff=e: Expectation value cutoff
 11   -f, --filter_query=f: Filter out low complexity regions
 12   -x, --index_dir=x: Data index directory
 13   -o, --output=o: Output file
 14   
 15usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
 16"""
 17
 18# This version (April 26, 2012) replaces megablast with blast+ blastn
 19# There is now no need to augment NCBI-formatted databases and these can be
 20# directly downloaded from NCBI ftp site
 21
 22import os, subprocess, sys, tempfile
 23from galaxy import eggs
 24import pkg_resources; pkg_resources.require( "bx-python" )
 25from bx.cookbook import doc_optparse
 26
 27assert sys.version_info[:2] >= ( 2, 4 )
 28
 29def stop_err( msg ):
 30    sys.stderr.write( "%s\n" % msg )
 31    sys.exit()
 32
 33def __main__():
 34    #Parse Command Line
 35    options, args = doc_optparse.parse( __doc__ )
 36    query_filename = options.input.strip() # -query
 37    output_filename = options.output.strip() # -out
 38    mega_word_size = options.word_size        # -word_size
 39    mega_iden_cutoff = options.identity_cutoff      # -perc_identity
 40    mega_evalue_cutoff = options.eval_cutoff      # -evalue
 41    mega_temp_output = tempfile.NamedTemporaryFile().name
 42    GALAXY_DATA_INDEX_DIR = options.index_dir
 43    DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
 44
 45    # megablast parameters
 46    try:
 47        int( mega_word_size )    
 48    except:
 49        stop_err( 'Invalid value for word size' )
 50    try:
 51        float( mega_iden_cutoff )
 52    except:
 53        stop_err( 'Invalid value for identity cut-off' )
 54    try:
 55        float( mega_evalue_cutoff )
 56    except:
 57        stop_err( 'Invalid value for Expectation value' )
 58
 59    if not os.path.exists( os.path.split( options.db_build )[0] ):
 60        stop_err( 'Cannot locate the target database directory. Please check your location file.' )
 61
 62    try:
 63        threads = int( os.environ['GALAXY_SLOTS'])
 64    except Exception:
 65        threads = 8
 66    # arguments for megablast
 67    megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \
 68        % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) 
 69
 70    print megablast_command
 71
 72    tmp = tempfile.NamedTemporaryFile().name
 73    try:
 74        tmp_stderr = open( tmp, 'wb' )
 75        proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
 76        returncode = proc.wait()
 77        tmp_stderr.close()
 78        # get stderr, allowing for case where it's very large
 79        tmp_stderr = open( tmp, 'rb' )
 80        stderr = ''
 81        buffsize = 1048576
 82        try:
 83            while True:
 84                stderr += tmp_stderr.read( buffsize )
 85                if not stderr or len( stderr ) % buffsize != 0:
 86                    break
 87        except OverflowError:
 88            pass
 89        tmp_stderr.close()
 90        if returncode != 0:
 91            raise Exception, stderr
 92        if os.path.exists( tmp ):
 93            os.unlink( tmp )
 94    except Exception, e:
 95        if os.path.exists( mega_temp_output ):
 96            os.unlink( mega_temp_output )
 97        if os.path.exists( tmp ):
 98            os.unlink( tmp )
 99        stop_err( 'Cannot execute megaablast. ' + str( e ) )
100
101    output = open( output_filename, 'w' )
102    invalid_lines = 0
103    for i, line in enumerate( file( mega_temp_output ) ):
104        line = line.rstrip( '\r\n' )
105        fields = line.split()
106        try:
107            # convert the last column (bit-score as this is causing problem in filter tool) to float
108            fields[-1] = float( fields[-1] )
109            new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] )
110        except:
111            new_line = line
112            invalid_lines += 1
113        output.write( "%s\n" % new_line )
114    output.close()
115
116    if os.path.exists( mega_temp_output ):
117        os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
118
119    if invalid_lines:
120        print "Unable to parse %d lines. Keep the default format." % invalid_lines
121
122    # megablast generates a file called error.log, if empty, delete it, if not, show the contents
123    if os.path.exists( './error.log' ):
124        for i, line in enumerate( file( './error.log' ) ):
125            line = line.rstrip( '\r\n' )
126            print line
127        os.remove( './error.log' )
128
129if __name__ == "__main__" : __main__()