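ngs_simulation.py - This Python script simulates sequencing reads over a genome and uses R to summarize and plot the resulting false positives and false negatives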

/tools/ngs_simulation/ngs_simulation.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 280 lines · 266 code · 3 blank · 11 comment · 5 complexity · 62277c41e2e5ef606145b376fbd802e0 MD5 · raw file


#!/usr/bin/env python

"""
Runs Ben's simulation.

usage: %prog [options]
   -i, --input=i: Input genome (FASTA format)
   -g, --genome=g: If built-in, the genome being used
   -l, --read_len=l: Read length
   -c, --avg_coverage=c: Average coverage
   -e, --error_rate=e: Error rate (0-1)
   -n, --num_sims=n: Number of simulations to run
   -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1)
   -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1)
   -p, --output_png=p: Plot output
   -s, --summary_out=s: Whether or not to output a file with summary of all simulations
   -m, --output_summary=m: File name for output summary of all simulations
   -f, --new_file_path=f: Directory for summary output files

"""
# removed output of all simulation results on request (not working)
#   -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds)
#   -o, --output=o: Base name for summary output for each run

from rpy import *
import os
import random, sys, tempfile
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse

def stop_err( msg ):
    """
    Report a fatal error and terminate the script.

    Writes msg followed by a newline to standard error, then raises
    SystemExit with exit status 1 so that the calling process can tell
    the run failed (a bare sys.exit() would report status 0, i.e. success).
    """
    sys.stderr.write( '%s\n' % msg )
    sys.exit( 1 )

def _fraction( count, total ):
    """Return count / total as a float, or 0.0 when total is zero (position without coverage)."""
    if not total:
        return 0.0
    return float( count ) / total


def _read_fasta_sequence( path ):
    """Return the upper-cased concatenation of every non-header ('>') line of the FASTA file at path."""
    parts = []
    with open( path, 'rb' ) as handle:
        for line in handle:
            line = line.rstrip()
            if not line.startswith( '>' ):
                parts.append( line.upper() )
    return ''.join( parts )


def _simulate_once( seq, hp_dict, mt_dict, read_len, num_reads, polymorphism, error_rate, detection_thresh ):
    """
    Run one simulation over seq and return the tuple ( false_positives, false_negatives ).

    One heteroplasmic position is picked at random. num_reads reads of
    read_len bases are then placed at random starts in either orientation,
    wrapping around the ends of seq. At the heteroplasmic position a read
    carries the hp_dict variant with probability polymorphism; every base is
    replaced by its mt_dict misread with probability error_rate. A position
    is counted as a false positive when a base that is neither the reference
    nor (at the heteroplasmic position) the variant reaches detection_thresh,
    and as a false negative when the variant at the heteroplasmic position
    stays below detection_thresh.
    """
    seq_len = len( seq )
    # randomly pick heteroplasmic base index
    hbase = random.randrange( seq_len )
    # one list of observed base calls per reference position
    qspec = [ [] for _ in range( seq_len ) ]
    reads_done = 0
    while reads_done < num_reads:
        start = random.randrange( seq_len )
        if random.random() < 0.5: # positive sense read
            end = start + read_len
            if end > seq_len: # overshooting origin
                read = list( range( start, seq_len ) ) + list( range( 0, end - seq_len ) )
            else: # regular read
                read = range( start, end )
        else: # negative sense read
            end = start - read_len
            if end < -1: # overshooting origin
                read = list( range( start, -1, -1 ) ) + list( range( seq_len - 1, seq_len + end, -1 ) )
            else: # regular read
                read = range( start, end, -1 )
        # assign read to quasispecies list by index
        for j in read:
            # the polymorphism draw is only made at the heteroplasmic position
            if j == hbase and random.random() < polymorphism:
                ref = hp_dict[ seq[ j ] ]
            else:
                ref = seq[ j ]
            if random.random() < error_rate: # base in read is misread
                qspec[ j ].append( mt_dict[ ref ] )
            else:
                qspec[ j ].append( ref )
        reads_done += 1
    fpos, fneg = 0, 0
    for pos, nuc in enumerate( seq ):
        calls = qspec[ pos ]
        cov = len( calls )
        bases = dict( ( base, calls.count( base ) ) for base in 'ACGT' )
        # only non-reference bases matter; 'N' is never a key, so pop with a default
        bases.pop( nuc, None )
        if pos == hbase:
            hnuc = hp_dict[ nuc ] # the expected heteroplasmic variant
            if _fraction( bases.get( hnuc, 0 ), cov ) < detection_thresh: # variant missed = false negative
                fneg += 1
            bases.pop( hnuc, None ) # ignore het variant when looking for false positives
        if _fraction( max( bases.values() ), cov ) >= detection_thresh: # unexpected base called = false positive
            fpos += 1
    return fpos, fneg


def __main__():
    """
    Command-line entry point: validate options, run the simulations and plot the results.

    Options are parsed from the module docstring by doc_optparse. For every
    combination of minor allele frequency and detection threshold a
    tab-separated file of per-simulation false positive / false negative
    counts is written next to a temporary file name; those files are then
    loaded into R (through rpy's r) to write the summary tables to
    options.output_summary and the figure to options.output_png.

    Invalid option values are reported through stop_err, which terminates
    the script.
    """
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    # validate parameters
    try:
        read_len = int( options.read_len )
        if read_len <= 0:
            raise Exception( ' greater than 0' )
    except Exception as e:
        # int() raises ValueError on malformed text, so catch broadly like the checks below
        stop_err( 'Make sure your read length is an integer value: %s' % str( e ) )
    try:
        avg_coverage = int( options.avg_coverage )
        if avg_coverage <= 0:
            raise Exception( ' greater than 0' )
    except Exception as e:
        stop_err( 'Make sure your average coverage is an integer value: %s' % str( e ) )
    try:
        error_rate = float( options.error_rate )
        if error_rate >= 1.0:
            # values of 1 or more are taken as a quality score and converted to a probability
            error_rate = 10 ** ( -error_rate / 10.0 )
        elif error_rate < 0:
            raise Exception( ' between 0 and 1' )
    except Exception as e:
        stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % ( ': %s' % str( e ) ) )
    try:
        num_sims = int( options.num_sims )
    except ( TypeError, ValueError ) as e:
        stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) )
    if options.polymorphism != 'None':
        polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ]
    else:
        stop_err( 'Select at least one polymorphism value to use' )
    if options.detection_thresh != 'None':
        detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ]
    else:
        stop_err( 'Select at least one detection threshold to use' )

    # mutation dictionaries
    hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary
    mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary

    # read fasta file to seq string
    seq = _read_fasta_sequence( options.input )
    seq_len = len( seq )
    if seq_len == 0:
        stop_err( 'The input genome contains no sequence' )

    # output file name template
    # (output of all simulation results was removed on request: not working)
    out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
    print( 'out_name_template: %s' % out_name_template )

    # set up output files: one numbered file per (polymorphism, threshold) pair
    outputs = {}
    file_index = 1
    for p in polymorphisms:
        outputs[ p ] = {}
        for d in detection_threshes:
            outputs[ p ][ d ] = out_name_template % file_index
            file_index += 1

    # number of reads per simulation (approximates coverage); constant across runs
    num_reads = avg_coverage * ( seq_len // read_len )

    # run sims
    for polymorphism in polymorphisms:
        for detection_thresh in detection_threshes:
            with open( outputs[ polymorphism ][ detection_thresh ], 'wb' ) as sim_file:
                sim_file.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len )
                for _ in range( num_sims ):
                    fpos, fneg = _simulate_once( seq, hp_dict, mt_dict, read_len, num_reads,
                                                 polymorphism, error_rate, detection_thresh )
                    # output error sums to summary file
                    sim_file.write( '%d\t%d\n' % ( fpos, fneg ) )

    # Parameters (heteroplasmy, error threshold, colours)
    r( '''
    het=c(%s)
    err=c(%s)
    grade = (0:32)/32
    hues = rev(gray(grade))
    ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) )

    # Suppress warnings
    r( 'options(warn=-1)' )

    # Create allsum (for FP) and allneg (for FN) objects
    r( 'allsum <- data.frame()' )
    for polymorphism in polymorphisms:
        for detection_thresh in detection_threshes:
            output = outputs[ polymorphism ][ detection_thresh ]
            cmd = '''
                  ngsum = read.delim('%s', header=T)
                  ngsum$fprate <- ngsum$FP/%s
                  ngsum$hetcol <- %s
                  ngsum$errcol <- %s
                  allsum <- rbind(allsum, ngsum)
                  ''' % ( output, seq_len, polymorphism, detection_thresh )
            r( cmd )

    # if the last results file is empty, report the size of every results file on stderr
    if os.path.getsize( output ) == 0:
        for p in outputs.keys():
            for d in outputs[ p ].keys():
                sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n')

    if options.summary_out == "true":
        # NOTE(review): ngsum holds only the last combination loaded above; confirm whether allsum was intended
        r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary )

    # Summary objects (these could be printed)
    r( '''
    tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean)
    tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean)
    cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t')
    write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t')
    cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t')
    write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t')
    ''' % tuple( [ options.output_summary ] * 4 ) )

    # Setup graphs
    #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10)
    r( '''
    png('%s', width=800, height=500, units='px', res=250)
    layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10))
    ''' % options.output_png )

    # Main title
    genome = ''
    if options.genome:
        genome = '%s: ' % options.genome
    r( '''
    par(mar=c(0,0,0,0))
    plot(1, type='n', axes=F, xlab='', ylab='')
    text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7)
    ''' % ( genome, options.num_sims, options.avg_coverage ) )

    # False positive boxplot
    r( '''
    par(mar=c(5,4,2,2), las=1, cex=0.35)
    boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85)
    title(main='False Positives', xlab='false positive rate', ylab='')
    ''' )

    # False negative heatmap (note zlim command!)
    num_polys = len( polymorphisms )
    num_dets = len( detection_threshes )
    r( '''
    par(mar=c(5,4,2,1), las=1, cex=0.35)
    image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1)
    axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i')
    axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85)
    title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold')
    ''' % ( num_polys, num_dets, num_polys, num_dets ) )

    # Scale alongside
    r( '''
    par(mar=c(2,2,2,3), las=1)
    image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85)
    title(main='Key', cex=0.35)
    mtext('false negative rate', side=1, cex=0.35)
    ''' )

    # Close graphics
    r( '''
    layout(1)
    dev.off()
    ''' )

    # Tidy up
#    r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' )

# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()

Summary ✨

This Python script simulates next-generation sequencing of a genome in order to estimate how often a minor allele (heteroplasmy) is missed or falsely called. It takes several command-line arguments, including the path to an input genome in FASTA format, the read length, the average coverage, the per-base error rate, the number of simulations to run, one or more minor allele frequencies, and one or more detection thresholds. For each combination of allele frequency and detection threshold, the Python code places random reads in both orientations along the genome (wrapping around its ends), introduces the variant at one randomly chosen position and random misreads elsewhere, and records the number of false positives and false negatives per simulation. It then uses the rpy library to execute R code that writes summary tables and generates a PNG figure, including a boxplot showing the distribution of the false positive rate for each detection threshold and a heatmap showing the false negative rate for each combination of minor allele frequency and detection threshold. The figure also includes a key explaining the gray scale used in the heatmap.

Tech Fingerprint

Alerts (9)

Complexity hotspot; lines 16 to 18 (total complexity: 4)
16 17 18
'import *' Avoid to prevent namespace pollution; import specific names or use aliases
25
'def' Ensure functions have docstrings for documentation
32
'open(' Use 'with open()' to ensure Files are properly closed
87 118
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
164 175