/tools/ngs_rna/tophat_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 238 lines · 201 code · 15 blank · 22 comment · 54 complexity · a8e837aead14af8a38c01068e04a2c5a MD5 · raw file

  1. #!/usr/bin/env python
  2. import optparse, os, shutil, subprocess, sys, tempfile, fileinput
  3. def stop_err( msg ):
  4. sys.stderr.write( "%s\n" % msg )
  5. sys.exit()
  6. def __main__():
  7. #Parse Command Line
  8. parser = optparse.OptionParser()
  9. parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
  10. parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' )
  11. parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; formate is BED.' )
  12. parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; formate is BAM.' )
  13. parser.add_option( '', '--own-file', dest='own_file', help='' )
  14. parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
  15. parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \
  16. For, example, for paired end runs with fragments selected at 300bp, \
  17. where each end is 50bp, you should set -r to be 200. There is no default, \
  18. and this parameter is required for paired end runs.')
  19. parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' )
  20. parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length',
  21. help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' )
  22. parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' )
  23. parser.add_option( '-i', '--min-intron-length', dest='min_intron_length',
  24. help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' )
  25. parser.add_option( '-I', '--max-intron-length', dest='max_intron_length',
  26. help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
  27. parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
  28. parser.add_option( '', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping' )
  29. parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
  30. parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
  31. parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' )
  32. parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' )
  33. parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' )
  34. parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' )
  35. # Options for supplying own junctions
  36. parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \
  37. TopHat will use the exon records in this file to build \
  38. a set of known splice junctions for each gene, and will \
  39. attempt to align reads to these junctions even if they \
  40. would not normally be covered by the initial mapping.')
  41. parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \
  42. specified one per line, in a tab-delimited format. Records \
  43. look like: <chrom> <left> <right> <+/-> left and right are \
  44. zero-based coordinates, and specify the last character of the \
  45. left sequenced to be spliced to the first character of the right \
  46. sequence, inclusive.')
  47. parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \
  48. supplied GFF file. (ignored without -G)")
  49. parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.")
  50. # Types of search.
  51. parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
  52. parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)')
  53. parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' )
  54. parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.')
  55. parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' )
  56. parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' )
  57. parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' )
  58. parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' )
  59. parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' )
  60. parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' )
  61. parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' )
  62. parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' )
  63. # Wrapper options.
  64. parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
  65. parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
  66. parser.add_option( '', '--single-paired', dest='single_paired', help='' )
  67. parser.add_option( '', '--settings', dest='settings', help='' )
  68. (options, args) = parser.parse_args()
  69. # output version # of tool
  70. try:
  71. tmp = tempfile.NamedTemporaryFile().name
  72. tmp_stdout = open( tmp, 'wb' )
  73. proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout )
  74. tmp_stdout.close()
  75. returncode = proc.wait()
  76. stdout = open( tmp_stdout.name, 'rb' ).readline().strip()
  77. if stdout:
  78. sys.stdout.write( '%s\n' % stdout )
  79. else:
  80. raise Exception
  81. except:
  82. sys.stdout.write( 'Could not determine Tophat version\n' )
  83. # Color or base space
  84. space = ''
  85. if options.color_space:
  86. space = '-C'
  87. # Creat bowtie index if necessary.
  88. tmp_index_dir = tempfile.mkdtemp()
  89. if options.own_file:
  90. index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) )
  91. try:
  92. os.link( options.own_file, index_path + '.fa' )
  93. except:
  94. # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension
  95. pass
  96. cmd_index = 'bowtie-build %s -f %s %s' % ( space, options.own_file, index_path )
  97. try:
  98. tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
  99. tmp_stderr = open( tmp, 'wb' )
  100. proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
  101. returncode = proc.wait()
  102. tmp_stderr.close()
  103. # get stderr, allowing for case where it's very large
  104. tmp_stderr = open( tmp, 'rb' )
  105. stderr = ''
  106. buffsize = 1048576
  107. try:
  108. while True:
  109. stderr += tmp_stderr.read( buffsize )
  110. if not stderr or len( stderr ) % buffsize != 0:
  111. break
  112. except OverflowError:
  113. pass
  114. tmp_stderr.close()
  115. if returncode != 0:
  116. raise Exception, stderr
  117. except Exception, e:
  118. if os.path.exists( tmp_index_dir ):
  119. shutil.rmtree( tmp_index_dir )
  120. stop_err( 'Error indexing reference sequence\n' + str( e ) )
  121. else:
  122. index_path = options.index_path
  123. # Build tophat command.
  124. cmd = 'tophat %s %s %s'
  125. reads = options.input1
  126. if options.input2:
  127. reads += ' ' + options.input2
  128. opts = '-p %s %s' % ( options.num_threads, space )
  129. if options.single_paired == 'paired':
  130. opts += ' -r %s' % options.mate_inner_dist
  131. if options.settings == 'preSet':
  132. cmd = cmd % ( opts, index_path, reads )
  133. else:
  134. try:
  135. if int( options.min_anchor_length ) >= 3:
  136. opts += ' -a %s' % options.min_anchor_length
  137. else:
  138. raise Exception, 'Minimum anchor length must be 3 or greater'
  139. opts += ' -m %s' % options.splice_mismatches
  140. opts += ' -i %s' % options.min_intron_length
  141. opts += ' -I %s' % options.max_intron_length
  142. opts += ' -g %s' % options.max_multihits
  143. # Custom junctions options.
  144. if options.gene_model_annotations:
  145. opts += ' -G %s' % options.gene_model_annotations
  146. if options.raw_juncs:
  147. opts += ' -j %s' % options.raw_juncs
  148. if options.no_novel_juncs:
  149. opts += ' --no-novel-juncs'
  150. if options.library_type:
  151. opts += ' --library-type %s' % options.library_type
  152. if options.no_novel_indels:
  153. opts += ' --no-novel-indels'
  154. else:
  155. if options.max_insertion_length:
  156. opts += ' --max-insertion-length %i' % int( options.max_insertion_length )
  157. if options.max_deletion_length:
  158. opts += ' --max-deletion-length %i' % int( options.max_deletion_length )
  159. # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1)
  160. # need to warn user of this fact
  161. #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" )
  162. # Search type options.
  163. if options.coverage_search:
  164. opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron )
  165. else:
  166. opts += ' --no-coverage-search'
  167. if options.closure_search:
  168. opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron )
  169. else:
  170. opts += ' --no-closure-search'
  171. if options.microexon_search:
  172. opts += ' --microexon-search'
  173. if options.single_paired == 'paired':
  174. opts += ' --mate-std-dev %s' % options.mate_std_dev
  175. if options.initial_read_mismatches:
  176. opts += ' --initial-read-mismatches %d' % int( options.initial_read_mismatches )
  177. if options.seg_mismatches:
  178. opts += ' --segment-mismatches %d' % int( options.seg_mismatches )
  179. if options.seg_length:
  180. opts += ' --segment-length %d' % int( options.seg_length )
  181. if options.min_segment_intron:
  182. opts += ' --min-segment-intron %d' % int( options.min_segment_intron )
  183. if options.max_segment_intron:
  184. opts += ' --max-segment-intron %d' % int( options.max_segment_intron )
  185. cmd = cmd % ( opts, index_path, reads )
  186. except Exception, e:
  187. # Clean up temp dirs
  188. if os.path.exists( tmp_index_dir ):
  189. shutil.rmtree( tmp_index_dir )
  190. stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
  191. #print cmd
  192. # Run
  193. try:
  194. tmp_out = tempfile.NamedTemporaryFile().name
  195. tmp_stdout = open( tmp_out, 'wb' )
  196. tmp_err = tempfile.NamedTemporaryFile().name
  197. tmp_stderr = open( tmp_err, 'wb' )
  198. print cmd
  199. proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
  200. returncode = proc.wait()
  201. tmp_stderr.close()
  202. # get stderr, allowing for case where it's very large
  203. tmp_stderr = open( tmp_err, 'rb' )
  204. stderr = ''
  205. buffsize = 1048576
  206. try:
  207. while True:
  208. stderr += tmp_stderr.read( buffsize )
  209. if not stderr or len( stderr ) % buffsize != 0:
  210. break
  211. except OverflowError:
  212. pass
  213. tmp_stdout.close()
  214. tmp_stderr.close()
  215. if returncode != 0:
  216. raise Exception, stderr
  217. # TODO: look for errors in program output.
  218. except Exception, e:
  219. stop_err( 'Error in tophat:\n' + str( e ) )
  220. # Clean up temp dirs
  221. if os.path.exists( tmp_index_dir ):
  222. shutil.rmtree( tmp_index_dir )
  223. if __name__=="__main__": __main__()