#!/usr/bin/env python
"""
Runs Lastz.
Written for Lastz v. 1.01.88.

usage: lastz_wrapper.py [options]
    --ref_name: The reference name to change all output matches to
    --ref_source: Whether the reference is cached or from the history
    --source_select: Whether to use a pre-set or cached reference file
    --input1: The name of the reference file if using history or reference base name if using cached
    --input2: The reads file to align
    --ref_sequences: The number of sequences in the reference file if using one from history
    --pre_set_options: Which of the pre-set options to use, if using pre-sets
    --strand: Which strand of the read to search, if specifying all parameters
    --seed: Seeding settings, if specifying all parameters
    --gfextend: Whether to perform gap-free extension of seed hits to HSPs (high scoring segment pairs), if specifying all parameters
    --chain: Whether to perform chaining of HSPs, if specifying all parameters
    --transition: Number of transitions to allow in each seed hit, if specifying all parameters
    --O: Gap opening penalty, if specifying all parameters
    --E: Gap extension penalty, if specifying all parameters
    --X: X-drop threshold, if specifying all parameters
    --Y: Y-drop threshold, if specifying all parameters
    --K: Threshold for HSPs, if specifying all parameters
    --L: Threshold for gapped alignments, if specifying all parameters
    --entropy: Whether to involve entropy when filtering HSPs, if specifying all parameters
    --identity_min: Minimum identity (don't report matches under this identity)
    --identity_max: Maximum identity (don't report matches above this identity)
    --coverage: The minimum coverage value (don't report matches covering less than this)
    --unmask: Whether to convert lowercase bases to uppercase
    --out_format: The format of the output file (sam, diffs, or tabular (general))
    --output: The name of the output file
    --lastzSeqsFileDir: Directory of the local lastz_seqs.loc file
"""
import optparse, os, subprocess, shutil, sys, tempfile, threading, time
from Queue import Queue

from galaxy import eggs
import pkg_resources
pkg_resources.require( 'bx-python' )
from bx.seq.twobit import TwoBitFile
from bx.seq.fasta import FastaReader
from galaxy.util.bunch import Bunch

STOP_SIGNAL = object()
# Number of worker threads that run lastz commands in parallel, and the
# maximum number of jobs the queue holds before puts block.
WORKERS = 4
SLOTS = 128
def stop_err( msg ):
    sys.stderr.write( "%s" % msg )
    sys.exit( 1 )

def stop_queues( lastz, combine_data ):
    # This method should only be called if an error has been encountered.
    # Send STOP_SIGNAL to all worker threads.
    for t in lastz.threads:
        lastz.put( STOP_SIGNAL, True )
    combine_data.put( STOP_SIGNAL, True )
class BaseQueue( object ):
    def __init__( self, num_threads, slots=-1 ):
        # Initialize the queue and worker threads
        self.queue = Queue( slots )
        self.threads = []
        for i in range( num_threads ):
            worker = threading.Thread( target=self.run_next )
            worker.start()
            self.threads.append( worker )
    def run_next( self ):
        # Run the next job, waiting until one is available if necessary
        while True:
            job = self.queue.get()
            if job is STOP_SIGNAL:
                return self.shutdown()
            self.run_job( job )
            time.sleep( 1 )
    def run_job( self, job ):
        stop_err( 'Not Implemented' )
    def put( self, job, block=False ):
        # Add a job to the queue
        self.queue.put( job, block )
    def shutdown( self ):
        return
class LastzJobQueue( BaseQueue ):
    """
    A queue that runs commands in parallel.  Blocking is done so the queue will
    not consume much memory.
    """
    def run_job( self, job ):
        # Execute the job's command, reading stderr before waiting so a full
        # stderr pipe cannot deadlock the child process.
        proc = subprocess.Popen( args=job.command, shell=True, stderr=subprocess.PIPE )
        stderr = proc.stderr.read()
        proc.wait()
        if stderr:
            stop_queues( self, job.combine_data_queue )
            stop_err( stderr )
        job.combine_data_queue.put( job )
class CombineDataQueue( BaseQueue ):
    """
    A queue that concatenates files in serial.  Blocking is not done since this
    queue is not expected to grow larger than the command queue.
    """
    def __init__( self, output_filename, num_threads=1 ):
        BaseQueue.__init__( self, num_threads )
        self.CHUNK_SIZE = 2**20 # 1 MiB
        self.output_file = open( output_filename, 'wb' )
    def run_job( self, job ):
        # Append the job's output file to the combined output, then remove
        # the job's temporary files.
        in_file = open( job.output, 'rb' )
        while True:
            chunk = in_file.read( self.CHUNK_SIZE )
            if not chunk:
                in_file.close()
                break
            self.output_file.write( chunk )
        for file_name in job.cleanup:
            os.remove( file_name )
    def shutdown( self ):
        self.output_file.close()
        return
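# A minimal sketch of how the two queues cooperate (the command string and
# file names here are hypothetical; __main__ below builds the real ones).
# Jobs are Bunch objects carrying a shell command, an output path, and
# paths to clean up after the output has been concatenated:
#
#   jobs = LastzJobQueue( WORKERS, slots=SLOTS )
#   results = CombineDataQueue( 'combined.out' )
#   job = Bunch( command='lastz ... > part.out', output='part.out',
#                cleanup=[ 'part.out' ], combine_data_queue=results )
#   jobs.put( job, block=True )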
def __main__():
    # Parse the command line.
    parser = optparse.OptionParser()
    parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
    parser.add_option( '', '--ref_source', dest='ref_source', help='Whether the reference is cached or from the history' )
    parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
    parser.add_option( '', '--source_select', dest='source_select', help='Whether to use a pre-set or cached reference file' )
    parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
    parser.add_option( '', '--input2', dest='input2', help='The reads file to align' )
    parser.add_option( '', '--pre_set_options', dest='pre_set_options', help='Which of the pre-set options to use, if using pre-sets' )
    parser.add_option( '', '--strand', dest='strand', help='Which strand of the read to search, if specifying all parameters' )
    parser.add_option( '', '--seed', dest='seed', help='Seeding settings, if specifying all parameters' )
    parser.add_option( '', '--transition', dest='transition', help='Number of transitions to allow in each seed hit, if specifying all parameters' )
    parser.add_option( '', '--gfextend', dest='gfextend', help='Whether to perform gap-free extension of seed hits to HSPs (high scoring segment pairs), if specifying all parameters' )
    parser.add_option( '', '--chain', dest='chain', help='Whether to perform chaining of HSPs, if specifying all parameters' )
    parser.add_option( '', '--O', dest='O', help='Gap opening penalty, if specifying all parameters' )
    parser.add_option( '', '--E', dest='E', help='Gap extension penalty, if specifying all parameters' )
    parser.add_option( '', '--X', dest='X', help='X-drop threshold, if specifying all parameters' )
    parser.add_option( '', '--Y', dest='Y', help='Y-drop threshold, if specifying all parameters' )
    parser.add_option( '', '--K', dest='K', help='Threshold for HSPs, if specifying all parameters' )
    parser.add_option( '', '--L', dest='L', help='Threshold for gapped alignments, if specifying all parameters' )
    parser.add_option( '', '--entropy', dest='entropy', help='Whether to involve entropy when filtering HSPs, if specifying all parameters' )
    parser.add_option( '', '--identity_min', dest='identity_min', help="Minimum identity (don't report matches under this identity)" )
    parser.add_option( '', '--identity_max', dest='identity_max', help="Maximum identity (don't report matches above this identity)" )
    parser.add_option( '', '--coverage', dest='coverage', help="The minimum coverage value (don't report matches covering less than this)" )
    parser.add_option( '', '--unmask', dest='unmask', help='Whether to convert lowercase bases to uppercase' )
    parser.add_option( '', '--out_format', dest='format', help='The format of the output file (sam, diffs, or tabular (general))' )
    parser.add_option( '', '--output', dest='output', help='The output file' )
    parser.add_option( '', '--lastzSeqsFileDir', dest='lastzSeqsFileDir', help='Directory of the local lastz_seqs.loc file' )
    ( options, args ) = parser.parse_args()
    # Output the version of the lastz binary being used.
    try:
        tmp = tempfile.NamedTemporaryFile().name
        tmp_stdout = open( tmp, 'wb' )
        proc = subprocess.Popen( args='lastz -v', shell=True, stdout=tmp_stdout )
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = None
        for line in open( tmp_stdout.name, 'rb' ):
            if line.lower().find( 'version' ) >= 0:
                stdout = line.strip()
                break
        if stdout:
            sys.stdout.write( '%s\n' % stdout )
        else:
            raise Exception
    except:
        sys.stdout.write( 'Could not determine Lastz version\n' )
    if options.unmask == 'yes':
        unmask = '[unmask]'
    else:
        unmask = ''
    if options.ref_name:
        ref_name = '[nickname=%s]' % options.ref_name
    else:
        ref_name = ''
    if options.source_select == 'pre_set':
        # Use one of the commonly-used preset option sets.
        set_options = '--%s' % options.pre_set_options
    else:
        # Build the option string from the user-specified parameters.
        set_options = '--%s --%s --gapped --strand=%s --seed=%s --%s O=%s E=%s X=%s Y=%s K=%s L=%s --%s' % \
            ( options.gfextend, options.chain, options.strand, options.seed, options.transition,
              options.O, options.E, options.X, options.Y, options.K, options.L, options.entropy )
    # Specify input2, adding the [fullnames] modifier if the output format is diffs.
    if options.format == 'diffs':
        input2 = '%s[fullnames]' % options.input2
    else:
        input2 = options.input2
    if options.format == 'tabular':
        # Change the output format to general and select the field names for
        # tabular output via lastz's 'general-:fields' syntax.
        format = 'general-'
        tabular_fields = ':score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle'
    elif options.format == 'sam':
        # SAM headers are currently ALWAYS suppressed (the 'sam-' variant).
        format = 'sam-'
        tabular_fields = ''
    else:
        format = options.format
        tabular_fields = ''
    # Set up our queues.
    lastz_job_queue = LastzJobQueue( WORKERS, slots=SLOTS )
    combine_data_queue = CombineDataQueue( options.output )
    if options.ref_source == 'history':
        # Reference is a fasta dataset from the history, so split the job across
        # the number of sequences in the dataset ( this could be a HUGE number ).
        try:
            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
            error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
            ref_sequences = int( options.ref_sequences )
            if ref_sequences < 1:
                stop_queues( lastz_job_queue, combine_data_queue )
                stop_err( error_msg )
        except:
            stop_queues( lastz_job_queue, combine_data_queue )
            stop_err( error_msg )
        seqs = 0
        fasta_reader = FastaReader( open( options.input1 ) )
        while True:
            # Read the next sequence from the reference dataset.
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz.
            tmp_in_fd, tmp_in_name = tempfile.mkstemp( suffix='.in' )
            tmp_in = os.fdopen( tmp_in_fd, 'wb' )
            # Write the current sequence to the temporary input file.
            tmp_in.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in.close()
            # Create a 2nd temporary file to contain the output from lastz execution on the current sequence.
            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
            os.close( tmp_out_fd )
            # Generate the command line for calling lastz on the current sequence.
            command = 'lastz %s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s > %s' % \
                ( tmp_in_name, unmask, ref_name, input2, set_options, options.identity_min,
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Create a job object.
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [ tmp_in_name, tmp_out_name ]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue.  Execution
            # will wait at this point if the queue is full.
            lastz_job_queue.put( job, block=True )
        # Make sure the value of sequences in the metadata is the same as the
        # number of sequences read from the dataset ( this may not be necessary ).
        if ref_sequences != seqs:
            stop_queues( lastz_job_queue, combine_data_queue )
            stop_err( "The value of metadata.sequences (%d) differs from the number of sequences read from the reference (%d)." % ( ref_sequences, seqs ) )
    else:
        # Reference is a locally cached 2bit file, so split the job across the
        # chroms in the 2bit file.
        tbf = TwoBitFile( open( options.input1, 'r' ) )
        for chrom in tbf.keys():
            # Create a temporary file to contain the output from lastz execution on the current chrom.
            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
            os.close( tmp_out_fd )
            command = 'lastz %s/%s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s >> %s' % \
                ( options.input1, chrom, unmask, ref_name, input2, set_options, options.identity_min,
                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
            # Create a job object.
            job = Bunch()
            job.command = command
            job.output = tmp_out_name
            job.cleanup = [ tmp_out_name ]
            job.combine_data_queue = combine_data_queue
            # Add another job to the lastz_job_queue.  Execution
            # will wait at this point if the queue is full.
            lastz_job_queue.put( job, block=True )
    # Stop the lastz_job_queue.
    for t in lastz_job_queue.threads:
        lastz_job_queue.put( STOP_SIGNAL, True )
    # Although all jobs have been submitted to the lastz_job_queue, we can't shut
    # down the combine_data_queue until all jobs have also been submitted to its
    # queue.  We know that has happened once only the main thread and the
    # combine_data_queue's single worker thread remain active.
    while threading.activeCount() > 2:
        time.sleep( 1 )
    # Now it's safe to stop the combine_data_queue.
    combine_data_queue.put( STOP_SIGNAL )

if __name__ == "__main__":
    __main__()