/tools/ngs_simulation/ngs_simulation.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 280 lines · 266 code · 3 blank · 11 comment · 5 complexity · 62277c41e2e5ef606145b376fbd802e0 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Runs Ben's simulation.
  4. usage: %prog [options]
  5. -i, --input=i: Input genome (FASTA format)
  6. -g, --genome=g: If built-in, the genome being used
  7. -l, --read_len=l: Read length
  8. -c, --avg_coverage=c: Average coverage
  9. -e, --error_rate=e: Error rate (0-1)
  10. -n, --num_sims=n: Number of simulations to run
  11. -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1)
  12. -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1)
  13. -p, --output_png=p: Plot output
  14. -s, --summary_out=s: Whether or not to output a file with summary of all simulations
  15. -m, --output_summary=m: File name for output summary of all simulations
  16. -f, --new_file_path=f: Directory for summary output files
  17. """
  18. # removed output of all simulation results on request (not working)
  19. # -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds)
  20. # -o, --output=o: Base name for summary output for each run
  21. from rpy import *
  22. import os
  23. import random, sys, tempfile
  24. from galaxy import eggs
  25. import pkg_resources; pkg_resources.require( "bx-python" )
  26. from bx.cookbook import doc_optparse
  27. def stop_err( msg ):
  28. sys.stderr.write( '%s\n' % msg )
  29. sys.exit()
  30. def __main__():
  31. #Parse Command Line
  32. options, args = doc_optparse.parse( __doc__ )
  33. # validate parameters
  34. error = ''
  35. try:
  36. read_len = int( options.read_len )
  37. if read_len <= 0:
  38. raise Exception, ' greater than 0'
  39. except TypeError, e:
  40. error = ': %s' % str( e )
  41. if error:
  42. stop_err( 'Make sure your number of reads is an integer value%s' % error )
  43. error = ''
  44. try:
  45. avg_coverage = int( options.avg_coverage )
  46. if avg_coverage <= 0:
  47. raise Exception, ' greater than 0'
  48. except Exception, e:
  49. error = ': %s' % str( e )
  50. if error:
  51. stop_err( 'Make sure your average coverage is an integer value%s' % error )
  52. error = ''
  53. try:
  54. error_rate = float( options.error_rate )
  55. if error_rate >= 1.0:
  56. error_rate = 10 ** ( -error_rate / 10.0 )
  57. elif error_rate < 0:
  58. raise Exception, ' between 0 and 1'
  59. except Exception, e:
  60. error = ': %s' % str( e )
  61. if error:
  62. stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error )
  63. try:
  64. num_sims = int( options.num_sims )
  65. except TypeError, e:
  66. stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) )
  67. if options.polymorphism != 'None':
  68. polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ]
  69. else:
  70. stop_err( 'Select at least one polymorphism value to use' )
  71. if options.detection_thresh != 'None':
  72. detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ]
  73. else:
  74. stop_err( 'Select at least one detection threshold to use' )
  75. # mutation dictionaries
  76. hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary
  77. mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary
  78. # read fasta file to seq string
  79. all_lines = open( options.input, 'rb' ).readlines()
  80. seq = ''
  81. for line in all_lines:
  82. line = line.rstrip()
  83. if line.startswith('>'):
  84. pass
  85. else:
  86. seq += line.upper()
  87. seq_len = len( seq )
  88. # output file name template
  89. # removed output of all simulation results on request (not working)
  90. # if options.sim_results == "true":
  91. # out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' )
  92. # else:
  93. # out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
  94. out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
  95. print 'out_name_template:', out_name_template
  96. # set up output files
  97. outputs = {}
  98. i = 1
  99. for p in polymorphisms:
  100. outputs[ p ] = {}
  101. for d in detection_threshes:
  102. outputs[ p ][ d ] = out_name_template % i
  103. i += 1
  104. # run sims
  105. for polymorphism in polymorphisms:
  106. for detection_thresh in detection_threshes:
  107. output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' )
  108. output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len )
  109. sim_count = 0
  110. while sim_count < num_sims:
  111. # randomly pick heteroplasmic base index
  112. hbase = random.choice( range( 0, seq_len ) )
  113. #hbase = seq_len/2#random.randrange( 0, seq_len )
  114. # create 2D quasispecies list
  115. qspec = map( lambda x: [], [0] * seq_len )
  116. # simulate read indices and assign to quasispecies
  117. i = 0
  118. while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage)
  119. start = random.choice( range( 0, seq_len ) )
  120. #start = seq_len/2#random.randrange( 0, seq_len ) # assign read start
  121. if random.random() < 0.5: # positive sense read
  122. end = start + read_len # assign read end
  123. if end > seq_len: # overshooting origin
  124. read = range( start, seq_len ) + range( 0, ( end - seq_len ) )
  125. else: # regular read
  126. read = range( start, end )
  127. else: # negative sense read
  128. end = start - read_len # assign read end
  129. if end < -1: # overshooting origin
  130. read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 )
  131. else: # regular read
  132. read = range( start, end, -1 )
  133. # assign read to quasispecies list by index
  134. for j in read:
  135. if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het
  136. ref = hp_dict[ seq[ j ] ]
  137. else: # ref is the verbatim reference nucleotide (all positions)
  138. ref = seq[ j ]
  139. if random.random() < error_rate: # base in read is misread with p = err
  140. qspec[ j ].append( mt_dict[ ref ] )
  141. else: # otherwise we carry ref through to the end
  142. qspec[ j ].append(ref)
  143. # last but not least
  144. i += 1
  145. bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later
  146. for i, nuc in enumerate( seq ):
  147. cov = len( qspec[ i ] )
  148. bases[ 'A' ] = qspec[ i ].count( 'A' )
  149. bases[ 'C' ] = qspec[ i ].count( 'C' )
  150. bases[ 'G' ] = qspec[ i ].count( 'G' )
  151. bases[ 'T' ] = qspec[ i ].count( 'T' )
  152. # calculate max NON-REF deviation
  153. del bases[ nuc ]
  154. maxdev = float( max( bases.values() ) ) / cov
  155. # deal with non-het sites
  156. if i != hbase:
  157. if maxdev >= detection_thresh: # greater than detection threshold = false positive
  158. fpos += 1
  159. # deal with het sites
  160. if i == hbase:
  161. hnuc = hp_dict[ nuc ] # let's recover het variant
  162. if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative
  163. fneg += 1
  164. del bases[ hnuc ] # ignore het variant
  165. maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site
  166. if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible)
  167. fpos += 1
  168. # output error sums and genome size to summary file
  169. output.write( '%d\t%d\n' % ( fpos, fneg ) )
  170. sim_count += 1
  171. # close output up
  172. output.close()
  173. # Parameters (heteroplasmy, error threshold, colours)
  174. r( '''
  175. het=c(%s)
  176. err=c(%s)
  177. grade = (0:32)/32
  178. hues = rev(gray(grade))
  179. ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) )
  180. # Suppress warnings
  181. r( 'options(warn=-1)' )
  182. # Create allsum (for FP) and allneg (for FN) objects
  183. r( 'allsum <- data.frame()' )
  184. for polymorphism in polymorphisms:
  185. for detection_thresh in detection_threshes:
  186. output = outputs[ polymorphism ][ detection_thresh ]
  187. cmd = '''
  188. ngsum = read.delim('%s', header=T)
  189. ngsum$fprate <- ngsum$FP/%s
  190. ngsum$hetcol <- %s
  191. ngsum$errcol <- %s
  192. allsum <- rbind(allsum, ngsum)
  193. ''' % ( output, seq_len, polymorphism, detection_thresh )
  194. r( cmd )
  195. if os.path.getsize( output ) == 0:
  196. for p in outputs.keys():
  197. for d in outputs[ p ].keys():
  198. sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n')
  199. if options.summary_out == "true":
  200. r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary )
  201. # Summary objects (these could be printed)
  202. r( '''
  203. tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean)
  204. tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean)
  205. cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t')
  206. write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t')
  207. cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t')
  208. write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t')
  209. ''' % tuple( [ options.output_summary ] * 4 ) )
  210. # Setup graphs
  211. #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10)
  212. r( '''
  213. png('%s', width=800, height=500, units='px', res=250)
  214. layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10))
  215. ''' % options.output_png )
  216. # Main title
  217. genome = ''
  218. if options.genome:
  219. genome = '%s: ' % options.genome
  220. r( '''
  221. par(mar=c(0,0,0,0))
  222. plot(1, type='n', axes=F, xlab='', ylab='')
  223. text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7)
  224. ''' % ( genome, options.num_sims, options.avg_coverage ) )
  225. # False positive boxplot
  226. r( '''
  227. par(mar=c(5,4,2,2), las=1, cex=0.35)
  228. boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85)
  229. title(main='False Positives', xlab='false positive rate', ylab='')
  230. ''' )
  231. # False negative heatmap (note zlim command!)
  232. num_polys = len( polymorphisms )
  233. num_dets = len( detection_threshes )
  234. r( '''
  235. par(mar=c(5,4,2,1), las=1, cex=0.35)
  236. image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1)
  237. axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i')
  238. axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85)
  239. title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold')
  240. ''' % ( num_polys, num_dets, num_polys, num_dets ) )
  241. # Scale alongside
  242. r( '''
  243. par(mar=c(2,2,2,3), las=1)
  244. image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85)
  245. title(main='Key', cex=0.35)
  246. mtext('false negative rate', side=1, cex=0.35)
  247. ''' )
  248. # Close graphics
  249. r( '''
  250. layout(1)
  251. dev.off()
  252. ''' )
  253. # Tidy up
  254. # r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' )
  255. if __name__ == "__main__" : __main__()