PageRenderTime 24ms CodeModel.GetById 2ms app.highlight 18ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/ngs_rna/cuffdiff_wrapper.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 233 lines | 210 code | 14 blank | 9 comment | 12 complexity | 0ca110ffa7e3d79a223c8e9cc0d0d807 MD5 | raw file
  1#!/usr/bin/env python
  2
  3import optparse, os, shutil, subprocess, sys, tempfile
  4
  5def group_callback( option, op_str, value, parser ):
  6    groups = []
  7    flist = []
  8    for arg in parser.rargs:
  9        arg = arg.strip()
 10        if arg[0] is "-":
 11            break
 12        elif arg[0] is ",":
 13            groups.append(flist)
 14            flist = []
 15        else:
 16            flist.append(arg)
 17    groups.append(flist)
 18
 19    setattr(parser.values, option.dest, groups)
 20    
 21def label_callback( option, op_str, value, parser ):
 22    labels = []
 23    for arg in parser.rargs:
 24        arg = arg.strip()
 25        if arg[0] is "-":
 26            break
 27        else:
 28            labels.append(arg)
 29
 30    setattr(parser.values, option.dest, labels)
 31
 32def stop_err( msg ):
 33    sys.stderr.write( "%s\n" % msg )
 34    sys.exit()
 35    
 36# Copied from sam_to_bam.py:
 37def check_seq_file( dbkey, cached_seqs_pointer_file ):
 38    seq_path = ''
 39    for line in open( cached_seqs_pointer_file ):
 40        line = line.rstrip( '\r\n' )
 41        if line and not line.startswith( '#' ) and line.startswith( 'index' ):
 42            fields = line.split( '\t' )
 43            if len( fields ) < 3:
 44                continue
 45            if fields[1] == dbkey:
 46                seq_path = fields[2].strip()
 47                break
 48    return seq_path
 49
 50def __main__():
 51    #Parse Command Line
 52    parser = optparse.OptionParser()
 53    
 54    # Cuffdiff options.
 55    parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
 56    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
 57    parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
 58                                                                                For, example, for paired end runs with fragments selected at 300bp, \
 59                                                                                where each end is 50bp, you should set -r to be 200. The default is 45bp.')
 60    parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' )
 61    parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' )
 62
 63    # Advanced Options:	
 64    parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' )
 65    parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' )
 66    
 67    # Wrapper / Galaxy options.
 68    parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" )
 69    parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.')
 70    parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
 71    parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
 72
 73    # Label options
 74    parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.")
 75    
 76	# Normalization options.
 77    parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" )
 78
 79    # Bias correction options.
 80    parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.')
 81    parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
 82    parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
 83    parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
 84
 85    # Outputs.
 86    parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" )
 87    parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" )
 88    parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" )
 89    parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" )
 90    parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" )
 91    parser.add_option( "--genes_exp_output", dest="genes_exp_output" )
 92    parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" )
 93    parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" )
 94    parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" )
 95    parser.add_option( "--cds_diff_output", dest="cds_diff_output" )
 96    parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" )
 97    
 98    (options, args) = parser.parse_args()
 99    
100    # output version # of tool
101    try:
102        tmp = tempfile.NamedTemporaryFile().name
103        tmp_stdout = open( tmp, 'wb' )
104        proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout )
105        tmp_stdout.close()
106        returncode = proc.wait()
107        stdout = None
108        for line in open( tmp_stdout.name, 'rb' ):
109            if line.lower().find( 'cuffdiff v' ) >= 0:
110                stdout = line.strip()
111                break
112        if stdout:
113            sys.stdout.write( '%s\n' % stdout )
114        else:
115            raise Exception
116    except:
117        sys.stdout.write( 'Could not determine Cuffdiff version\n' )
118
119    # Make temp directory for output.
120    tmp_output_dir = tempfile.mkdtemp()
121    
122    # If doing bias correction, set/link to sequence file.
123    if options.do_bias_correction:
124        cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
125        if not os.path.exists( cached_seqs_pointer_file ):
126            stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
127        # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
128        # and the equCab2.fa file will contain fasta sequences.
129        seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
130        if options.ref_file != 'None':
131            # Create symbolic link to ref_file so that index will be created in working directory.
132            seq_path = os.path.join( tmp_output_dir, "ref.fa" )
133            os.symlink( options.ref_file, seq_path  )
134    
135    # Build command.
136    
137    # Base; always use quiet mode to avoid problems with storing log output.
138    cmd = "cuffdiff --no-update-check -q"
139    
140    # Add options.
141    if options.inner_dist_std_dev:
142        cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) )
143    if options.num_threads:
144        cmd += ( " -p %i" % int ( options.num_threads ) )
145    if options.inner_mean_dist:
146        cmd += ( " -m %i" % int ( options.inner_mean_dist ) )
147    if options.min_alignment_count:
148        cmd += ( " -c %i" % int ( options.min_alignment_count ) )
149    if options.FDR:
150        cmd += ( " --FDR %f" % float( options.FDR ) )
151    if options.num_importance_samples:
152        cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
153    if options.max_mle_iterations:
154        cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
155    if options.do_normalization:
156        cmd += ( " -N" )
157    if options.do_bias_correction:
158        cmd += ( " -b %s" % seq_path )
159            
160    # Add inputs.
161    # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2
162    if options.groups:
163        cmd += " --labels "
164        for label in options.labels:
165            cmd += label + ","
166        cmd = cmd[:-1]
167
168        cmd += " " + options.inputA + " "
169
170        for group in options.groups:
171            for filename in group:
172                cmd += filename + ","
173            cmd = cmd[:-1] + " "
174    else: 
175        cmd += " " + options.inputA + " " + options.input1 + " " + options.input2
176        
177    # Debugging.
178    print cmd
179
180    # Run command.
181    try:
182        tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
183        tmp_stderr = open( tmp_name, 'wb' )
184        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_output_dir, stderr=tmp_stderr.fileno() )
185        returncode = proc.wait()
186        tmp_stderr.close()
187        
188        # Get stderr, allowing for case where it's very large.
189        tmp_stderr = open( tmp_name, 'rb' )
190        stderr = ''
191        buffsize = 1048576
192        try:
193            while True:
194                stderr += tmp_stderr.read( buffsize )
195                if not stderr or len( stderr ) % buffsize != 0:
196                    break
197        except OverflowError:
198            pass
199        tmp_stderr.close()
200        
201        # Error checking.
202        if returncode != 0:
203            raise Exception, stderr
204            
205        # check that there are results in the output file
206        if len( open( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0:
207            raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
208    except Exception, e:
209        stop_err( 'Error running cuffdiff. ' + str( e ) )
210
211        
212    # Copy output files from tmp directory to specified files.
213    try:
214        try:
215            shutil.copyfile( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output )
216            shutil.copyfile( os.path.join( tmp_output_dir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output )
217            shutil.copyfile( os.path.join( tmp_output_dir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output )
218            shutil.copyfile( os.path.join( tmp_output_dir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output )
219            shutil.copyfile( os.path.join( tmp_output_dir, "isoform_exp.diff" ), options.isoforms_exp_output )
220            shutil.copyfile( os.path.join( tmp_output_dir, "gene_exp.diff" ), options.genes_exp_output )
221            shutil.copyfile( os.path.join( tmp_output_dir, "tss_group_exp.diff" ), options.tss_groups_exp_output )
222            shutil.copyfile( os.path.join( tmp_output_dir, "splicing.diff" ), options.splicing_diff_output )
223            shutil.copyfile( os.path.join( tmp_output_dir, "cds.diff" ), options.cds_diff_output )
224            shutil.copyfile( os.path.join( tmp_output_dir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output )
225            shutil.copyfile( os.path.join( tmp_output_dir, "promoters.diff" ), options.promoters_diff_output )    
226        except Exception, e:
227            stop_err( 'Error in cuffdiff:\n' + str( e ) ) 
228    finally:
229        # Clean up temp dirs
230        if os.path.exists( tmp_output_dir ):
231            shutil.rmtree( tmp_output_dir )
232
233if __name__=="__main__": __main__()