PageRenderTime 77ms CodeModel.GetById 55ms app.highlight 18ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/ngs_simulation/ngs_simulation.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 280 lines | 266 code | 3 blank | 11 comment | 5 complexity | 62277c41e2e5ef606145b376fbd802e0 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Runs Ben's simulation.
  5
  6usage: %prog [options]
  7   -i, --input=i: Input genome (FASTA format)
  8   -g, --genome=g: If built-in, the genome being used
  9   -l, --read_len=l: Read length
 10   -c, --avg_coverage=c: Average coverage
 11   -e, --error_rate=e: Error rate (0-1)
 12   -n, --num_sims=n: Number of simulations to run
 13   -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1)
 14   -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1)
 15   -p, --output_png=p: Plot output
 16   -s, --summary_out=s: Whether or not to output a file with summary of all simulations
 17   -m, --output_summary=m: File name for output summary of all simulations
 18   -f, --new_file_path=f: Directory for summary output files
 19
 20"""
 21# removed output of all simulation results on request (not working)
 22#   -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds)
 23#   -o, --output=o: Base name for summary output for each run
 24
 25from rpy import *
 26import os
 27import random, sys, tempfile
 28from galaxy import eggs
 29import pkg_resources; pkg_resources.require( "bx-python" )
 30from bx.cookbook import doc_optparse
 31
 32def stop_err( msg ):
 33    sys.stderr.write( '%s\n' % msg )
 34    sys.exit()
 35
 36def __main__():
 37    #Parse Command Line
 38    options, args = doc_optparse.parse( __doc__ )
 39    # validate parameters
 40    error = ''
 41    try:
 42        read_len = int( options.read_len )
 43        if read_len <= 0:
 44            raise Exception, ' greater than 0'
 45    except TypeError, e:
 46        error = ': %s' % str( e )
 47    if error:
 48        stop_err( 'Make sure your number of reads is an integer value%s' % error )
 49    error = ''
 50    try:
 51        avg_coverage = int( options.avg_coverage )
 52        if avg_coverage <= 0:
 53            raise Exception, ' greater than 0'
 54    except Exception, e:
 55        error = ': %s' % str( e )
 56    if error:
 57        stop_err( 'Make sure your average coverage is an integer value%s' % error )
 58    error = ''
 59    try:
 60        error_rate = float( options.error_rate )
 61        if error_rate >= 1.0:
 62            error_rate = 10 ** ( -error_rate / 10.0 )
 63        elif error_rate < 0:
 64            raise Exception, ' between 0 and 1'
 65    except Exception, e:
 66        error = ': %s' % str( e )
 67    if error:
 68        stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error )
 69    try:
 70        num_sims = int( options.num_sims )
 71    except TypeError, e:
 72        stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) )
 73    if options.polymorphism != 'None':
 74        polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ]
 75    else:
 76        stop_err( 'Select at least one polymorphism value to use' )
 77    if options.detection_thresh != 'None':
 78        detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ]
 79    else:
 80        stop_err( 'Select at least one detection threshold to use' )
 81
 82    # mutation dictionaries
 83    hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary
 84    mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary
 85
 86    # read fasta file to seq string
 87    all_lines = open( options.input, 'rb' ).readlines()
 88    seq = ''
 89    for line in all_lines:
 90        line = line.rstrip() 
 91        if line.startswith('>'):
 92            pass
 93        else:
 94            seq += line.upper()
 95    seq_len = len( seq )
 96
 97    # output file name template
 98# removed output of all simulation results on request (not working)
 99#    if options.sim_results == "true":
100#        out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' )
101#    else:
102#        out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
103    out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
104    print 'out_name_template:', out_name_template
105
106    # set up output files
107    outputs = {}
108    i = 1
109    for p in polymorphisms:
110        outputs[ p ] = {}
111        for d in detection_threshes:
112            outputs[ p ][ d ] = out_name_template % i
113            i += 1
114
115    # run sims
116    for polymorphism in polymorphisms:
117        for detection_thresh in detection_threshes:
118            output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' )
119            output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len )
120            sim_count = 0
121            while sim_count < num_sims:
122                # randomly pick heteroplasmic base index
123                hbase = random.choice( range( 0, seq_len ) )
124                #hbase = seq_len/2#random.randrange( 0, seq_len )
125                # create 2D quasispecies list
126                qspec = map( lambda x: [], [0] * seq_len )
127                # simulate read indices and assign to quasispecies
128                i = 0
129                while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage)
130                    start = random.choice( range( 0, seq_len ) )
131                    #start = seq_len/2#random.randrange( 0, seq_len ) # assign read start
132                    if random.random() < 0.5: # positive sense read
133                        end = start + read_len # assign read end
134                        if end > seq_len: # overshooting origin
135                            read = range( start, seq_len ) + range( 0, ( end - seq_len ) )
136                        else: # regular read
137                            read = range( start, end )
138                    else: # negative sense read
139                        end = start - read_len # assign read end
140                        if end < -1: # overshooting origin
141                            read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 )
142                        else: # regular read
143                            read = range( start, end, -1 )
144                    # assign read to quasispecies list by index
145                    for j in read:
146                        if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het
147                            ref = hp_dict[ seq[ j ] ]
148                        else: # ref is the verbatim reference nucleotide (all positions)
149                            ref = seq[ j ]
150                        if random.random() < error_rate: # base in read is misread with p = err
151                            qspec[ j ].append( mt_dict[ ref ] )
152                        else: # otherwise we carry ref through to the end
153                            qspec[ j ].append(ref)
154                    # last but not least
155                    i += 1
156                bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later
157                for i, nuc in enumerate( seq ):
158                    cov = len( qspec[ i ] )
159                    bases[ 'A' ] = qspec[ i ].count( 'A' )
160                    bases[ 'C' ] = qspec[ i ].count( 'C' )
161                    bases[ 'G' ] = qspec[ i ].count( 'G' )
162                    bases[ 'T' ] = qspec[ i ].count( 'T' )
163                    # calculate max NON-REF deviation
164                    del bases[ nuc ]
165                    maxdev = float( max( bases.values() ) ) / cov
166                    # deal with non-het sites
167                    if i != hbase:
168                        if maxdev >= detection_thresh: # greater than detection threshold = false positive
169                            fpos += 1
170                    # deal with het sites
171                    if i == hbase:
172                        hnuc = hp_dict[ nuc ] # let's recover het variant
173                        if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative
174                            fneg += 1
175                        del bases[ hnuc ] # ignore het variant
176                        maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site
177                        if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible)
178                            fpos += 1
179                # output error sums and genome size to summary file
180                output.write( '%d\t%d\n' % ( fpos, fneg ) )
181                sim_count += 1
182            # close output up
183            output.close()
184
185    # Parameters (heteroplasmy, error threshold, colours)
186    r( '''
187    het=c(%s)
188    err=c(%s)
189    grade = (0:32)/32
190    hues = rev(gray(grade))
191    ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) )
192
193    # Suppress warnings
194    r( 'options(warn=-1)' )
195
196    # Create allsum (for FP) and allneg (for FN) objects
197    r( 'allsum <- data.frame()' )
198    for polymorphism in polymorphisms:
199        for detection_thresh in detection_threshes:
200            output = outputs[ polymorphism ][ detection_thresh ]
201            cmd = '''
202                  ngsum = read.delim('%s', header=T)
203                  ngsum$fprate <- ngsum$FP/%s
204                  ngsum$hetcol <- %s
205                  ngsum$errcol <- %s
206                  allsum <- rbind(allsum, ngsum)
207                  ''' % ( output, seq_len, polymorphism, detection_thresh )
208            r( cmd )
209
210    if os.path.getsize( output ) == 0:
211        for p in outputs.keys():
212            for d in outputs[ p ].keys():
213                sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n')
214
215    if options.summary_out == "true":
216        r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary )
217
218    # Summary objects (these could be printed)
219    r( '''
220    tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean)
221    tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean)
222    cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t')
223    write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t')
224    cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t')
225    write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t')
226    ''' % tuple( [ options.output_summary ] * 4 ) )
227
228    # Setup graphs
229    #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10)
230    r( '''
231    png('%s', width=800, height=500, units='px', res=250)
232    layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10))
233    ''' % options.output_png )
234
235    # Main title
236    genome = ''
237    if options.genome:
238        genome = '%s: ' % options.genome
239    r( '''
240    par(mar=c(0,0,0,0))
241    plot(1, type='n', axes=F, xlab='', ylab='')
242    text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7)
243    ''' % ( genome, options.num_sims, options.avg_coverage ) )
244
245    # False positive boxplot
246    r( '''
247    par(mar=c(5,4,2,2), las=1, cex=0.35)
248    boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85)
249    title(main='False Positives', xlab='false positive rate', ylab='')
250    ''' )
251
252    # False negative heatmap (note zlim command!)
253    num_polys = len( polymorphisms )
254    num_dets = len( detection_threshes )
255    r( '''
256    par(mar=c(5,4,2,1), las=1, cex=0.35)
257    image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1)
258    axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i')
259    axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85)
260    title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold')
261    ''' % ( num_polys, num_dets, num_polys, num_dets ) )
262
263    # Scale alongside
264    r( '''
265    par(mar=c(2,2,2,3), las=1)
266    image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85)
267    title(main='Key', cex=0.35)
268    mtext('false negative rate', side=1, cex=0.35)
269    ''' )
270
271    # Close graphics
272    r( '''
273    layout(1)
274    dev.off()
275    ''' )
276
277    # Tidy up
278#    r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' )
279
280if __name__ == "__main__" : __main__()