/tools/metag_tools/megablast_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 129 lines · 89 code · 15 blank · 25 comment · 24 complexity · 15c4a4055989d8199dfd6879ede7996a MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. run megablast for metagenomics data
  4. usage: %prog [options]
  5. -d, --db_build=d: The database to use
  6. -i, --input=i: Input FASTQ candidate file
  7. -w, --word_size=w: Size of best perfect match
  8. -c, --identity_cutoff=c: Report hits at or above this identity
  9. -e, --eval_cutoff=e: Expectation value cutoff
  10. -f, --filter_query=f: Filter out low complexity regions
  11. -x, --index_dir=x: Data index directory
  12. -o, --output=o: Output file
  13. usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
  14. """
  15. # This version (April 26, 2012) replaces megablast with blast+ blastn
  16. # There is now no need to augment NCBI-formatted databases and these can be
  17. # directly downloaded from NCBI ftp site
  18. import os, subprocess, sys, tempfile
  19. from galaxy import eggs
  20. import pkg_resources; pkg_resources.require( "bx-python" )
  21. from bx.cookbook import doc_optparse
  22. assert sys.version_info[:2] >= ( 2, 4 )
  23. def stop_err( msg ):
  24. sys.stderr.write( "%s\n" % msg )
  25. sys.exit()
  26. def __main__():
  27. #Parse Command Line
  28. options, args = doc_optparse.parse( __doc__ )
  29. query_filename = options.input.strip() # -query
  30. output_filename = options.output.strip() # -out
  31. mega_word_size = options.word_size # -word_size
  32. mega_iden_cutoff = options.identity_cutoff # -perc_identity
  33. mega_evalue_cutoff = options.eval_cutoff # -evalue
  34. mega_temp_output = tempfile.NamedTemporaryFile().name
  35. GALAXY_DATA_INDEX_DIR = options.index_dir
  36. DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
  37. # megablast parameters
  38. try:
  39. int( mega_word_size )
  40. except:
  41. stop_err( 'Invalid value for word size' )
  42. try:
  43. float( mega_iden_cutoff )
  44. except:
  45. stop_err( 'Invalid value for identity cut-off' )
  46. try:
  47. float( mega_evalue_cutoff )
  48. except:
  49. stop_err( 'Invalid value for Expectation value' )
  50. if not os.path.exists( os.path.split( options.db_build )[0] ):
  51. stop_err( 'Cannot locate the target database directory. Please check your location file.' )
  52. try:
  53. threads = int( os.environ['GALAXY_SLOTS'])
  54. except Exception:
  55. threads = 8
  56. # arguments for megablast
  57. megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \
  58. % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query )
  59. print megablast_command
  60. tmp = tempfile.NamedTemporaryFile().name
  61. try:
  62. tmp_stderr = open( tmp, 'wb' )
  63. proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
  64. returncode = proc.wait()
  65. tmp_stderr.close()
  66. # get stderr, allowing for case where it's very large
  67. tmp_stderr = open( tmp, 'rb' )
  68. stderr = ''
  69. buffsize = 1048576
  70. try:
  71. while True:
  72. stderr += tmp_stderr.read( buffsize )
  73. if not stderr or len( stderr ) % buffsize != 0:
  74. break
  75. except OverflowError:
  76. pass
  77. tmp_stderr.close()
  78. if returncode != 0:
  79. raise Exception, stderr
  80. if os.path.exists( tmp ):
  81. os.unlink( tmp )
  82. except Exception, e:
  83. if os.path.exists( mega_temp_output ):
  84. os.unlink( mega_temp_output )
  85. if os.path.exists( tmp ):
  86. os.unlink( tmp )
  87. stop_err( 'Cannot execute megaablast. ' + str( e ) )
  88. output = open( output_filename, 'w' )
  89. invalid_lines = 0
  90. for i, line in enumerate( file( mega_temp_output ) ):
  91. line = line.rstrip( '\r\n' )
  92. fields = line.split()
  93. try:
  94. # convert the last column (bit-score as this is causing problem in filter tool) to float
  95. fields[-1] = float( fields[-1] )
  96. new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] )
  97. except:
  98. new_line = line
  99. invalid_lines += 1
  100. output.write( "%s\n" % new_line )
  101. output.close()
  102. if os.path.exists( mega_temp_output ):
  103. os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
  104. if invalid_lines:
  105. print "Unable to parse %d lines. Keep the default format." % invalid_lines
  106. # megablast generates a file called error.log, if empty, delete it, if not, show the contents
  107. if os.path.exists( './error.log' ):
  108. for i, line in enumerate( file( './error.log' ) ):
  109. line = line.rstrip( '\r\n' )
  110. print line
  111. os.remove( './error.log' )
  112. if __name__ == "__main__" : __main__()