PageRenderTime 87ms CodeModel.GetById 59ms app.highlight 23ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/ngs_rna/tophat_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 238 lines | 232 code | 3 blank | 3 comment | 33 complexity | a8e837aead14af8a38c01068e04a2c5a MD5 | raw file
  1#!/usr/bin/env python
  2
  3import optparse, os, shutil, subprocess, sys, tempfile, fileinput
  4
  5def stop_err( msg ):
  6    sys.stderr.write( "%s\n" % msg )
  7    sys.exit()
  8
  9def __main__():
 10    #Parse Command Line
 11    parser = optparse.OptionParser()
 12    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
 13    parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' )
 14    parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; formate is BED.' )
 15    parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; formate is BAM.' )
 16    parser.add_option( '', '--own-file', dest='own_file', help='' )
 17    parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
 18    parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \
 19                                                                                For, example, for paired end runs with fragments selected at 300bp, \
 20                                                                                where each end is 50bp, you should set -r to be 200. There is no default, \
 21                                                                                and this parameter is required for paired end runs.')
 22    parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' )
 23    parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length', 
 24                        help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' )
 25    parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' )
 26    parser.add_option( '-i', '--min-intron-length', dest='min_intron_length', 
 27                        help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' )
 28    parser.add_option( '-I', '--max-intron-length', dest='max_intron_length', 
 29                        help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
 30    parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
 31    parser.add_option( '', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping' )
 32    parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
 33    parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
 34    parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' )
 35    parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' )
 36    parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' )
 37    parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' )
 38
 39    # Options for supplying own junctions
 40    parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \
 41                                                                           TopHat will use the exon records in this file to build \
 42                                                                           a set of known splice junctions for each gene, and will \
 43                                                                           attempt to align reads to these junctions even if they \
 44                                                                           would not normally be covered by the initial mapping.')
 45    parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \
 46                                                                    specified one per line, in a tab-delimited format. Records \
 47                                                                    look like: <chrom> <left> <right> <+/-> left and right are \
 48                                                                    zero-based coordinates, and specify the last character of the \
 49                                                                    left sequenced to be spliced to the first character of the right \
 50                                                                    sequence, inclusive.')
 51    parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \
 52                                                                                            supplied GFF file. (ignored without -G)")
 53    parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.")
 54    # Types of search.
 55    parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
 56    parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)')
 57    parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' )
 58    parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.')
 59    parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' )
 60    parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' )
 61    parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' )
 62    parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' )
 63    parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' )
 64    parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' )
 65    parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' )
 66    parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' )
 67
 68    # Wrapper options.
 69    parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
 70    parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
 71    parser.add_option( '', '--single-paired', dest='single_paired', help='' )
 72    parser.add_option( '', '--settings', dest='settings', help='' )
 73
 74    (options, args) = parser.parse_args()
 75
 76    # output version # of tool
 77    try:
 78        tmp = tempfile.NamedTemporaryFile().name
 79        tmp_stdout = open( tmp, 'wb' )
 80        proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout )
 81        tmp_stdout.close()
 82        returncode = proc.wait()
 83        stdout = open( tmp_stdout.name, 'rb' ).readline().strip()
 84        if stdout:
 85            sys.stdout.write( '%s\n' % stdout )
 86        else:
 87            raise Exception
 88    except:
 89        sys.stdout.write( 'Could not determine Tophat version\n' )
 90
 91    # Color or base space
 92    space = ''
 93    if options.color_space:
 94        space = '-C'
 95
 96    # Creat bowtie index if necessary.
 97    tmp_index_dir = tempfile.mkdtemp()
 98    if options.own_file:
 99        index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) )
100        try:
101            os.link( options.own_file, index_path + '.fa' )
102        except:
103            # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension
104            pass
105        cmd_index = 'bowtie-build %s -f %s %s' % ( space, options.own_file, index_path )
106        try:
107            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
108            tmp_stderr = open( tmp, 'wb' )
109            proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
110            returncode = proc.wait()
111            tmp_stderr.close()
112            # get stderr, allowing for case where it's very large
113            tmp_stderr = open( tmp, 'rb' )
114            stderr = ''
115            buffsize = 1048576
116            try:
117                while True:
118                    stderr += tmp_stderr.read( buffsize )
119                    if not stderr or len( stderr ) % buffsize != 0:
120                        break
121            except OverflowError:
122                pass
123            tmp_stderr.close()
124            if returncode != 0:
125                raise Exception, stderr
126        except Exception, e:
127            if os.path.exists( tmp_index_dir ):
128                shutil.rmtree( tmp_index_dir )
129            stop_err( 'Error indexing reference sequence\n' + str( e ) )
130    else:
131        index_path = options.index_path
132
133    # Build tophat command.
134    cmd = 'tophat %s %s %s'
135    reads = options.input1
136    if options.input2:
137        reads += ' ' + options.input2
138    opts = '-p %s %s' % ( options.num_threads, space )
139    if options.single_paired == 'paired':
140        opts += ' -r %s' % options.mate_inner_dist
141    if options.settings == 'preSet':
142        cmd = cmd % ( opts, index_path, reads )
143    else:
144        try:
145            if int( options.min_anchor_length ) >= 3:
146                opts += ' -a %s' % options.min_anchor_length
147            else:
148                raise Exception, 'Minimum anchor length must be 3 or greater'
149            opts += ' -m %s' % options.splice_mismatches
150            opts += ' -i %s' % options.min_intron_length
151            opts += ' -I %s' % options.max_intron_length
152            opts += ' -g %s' % options.max_multihits
153            # Custom junctions options.
154            if options.gene_model_annotations:
155                opts += ' -G %s' % options.gene_model_annotations
156            if options.raw_juncs:
157                opts += ' -j %s' % options.raw_juncs
158            if options.no_novel_juncs:
159                opts += ' --no-novel-juncs'
160            if options.library_type:
161                opts += ' --library-type %s' % options.library_type
162            if options.no_novel_indels:
163                opts += ' --no-novel-indels'
164            else:
165                if options.max_insertion_length:
166                    opts += ' --max-insertion-length %i' % int( options.max_insertion_length )
167                if options.max_deletion_length:
168                    opts += ' --max-deletion-length %i' % int( options.max_deletion_length )
169                # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1)
170                # need to warn user of this fact
171                #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" )
172
173            # Search type options.
174            if options.coverage_search:
175                opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron )
176            else:
177                opts += ' --no-coverage-search'
178            if options.closure_search:
179                opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s'  % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron ) 
180            else:
181                opts += ' --no-closure-search'
182            if options.microexon_search:
183                opts += ' --microexon-search'
184            if options.single_paired == 'paired':
185                opts += ' --mate-std-dev %s' % options.mate_std_dev
186            if options.initial_read_mismatches:
187                opts += ' --initial-read-mismatches %d' % int( options.initial_read_mismatches )
188            if options.seg_mismatches:
189                opts += ' --segment-mismatches %d' % int( options.seg_mismatches )
190            if options.seg_length:
191                opts += ' --segment-length %d' % int( options.seg_length )
192            if options.min_segment_intron:
193                opts += ' --min-segment-intron %d' % int( options.min_segment_intron )
194            if options.max_segment_intron:
195                opts += ' --max-segment-intron %d' % int( options.max_segment_intron )
196            cmd = cmd % ( opts, index_path, reads )
197        except Exception, e:
198            # Clean up temp dirs
199            if os.path.exists( tmp_index_dir ):
200                shutil.rmtree( tmp_index_dir )
201            stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
202    #print cmd
203
204    # Run
205    try:
206        tmp_out = tempfile.NamedTemporaryFile().name
207        tmp_stdout = open( tmp_out, 'wb' )
208        tmp_err = tempfile.NamedTemporaryFile().name
209        tmp_stderr = open( tmp_err, 'wb' )
210        print cmd
211        proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
212        returncode = proc.wait()
213        tmp_stderr.close()
214        # get stderr, allowing for case where it's very large
215        tmp_stderr = open( tmp_err, 'rb' )
216        stderr = ''
217        buffsize = 1048576
218        try:
219            while True:
220                stderr += tmp_stderr.read( buffsize )
221                if not stderr or len( stderr ) % buffsize != 0:
222                    break
223        except OverflowError:
224            pass
225        tmp_stdout.close()
226        tmp_stderr.close()
227        if returncode != 0:
228            raise Exception, stderr
229            
230        # TODO: look for errors in program output.
231    except Exception, e:
232        stop_err( 'Error in tophat:\n' + str( e ) ) 
233
234    # Clean up temp dirs
235    if os.path.exists( tmp_index_dir ):
236        shutil.rmtree( tmp_index_dir )
237
238if __name__=="__main__": __main__()