PageRenderTime 43ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/ilmn_pacbio/cov_model.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 238 lines | 168 code | 25 blank | 45 comment | 37 complexity | 44d457d3fe1b2026c599e6afa62baf53 MD5 | raw file
  1. #!/usr/bin/env python
  2. from optparse import OptionParser, SUPPRESS_HELP
  3. import os, random, quake
  4. ############################################################
  5. # cov_model.py
  6. #
  7. # Given a file of kmer counts, reports the cutoff to use
  8. # to separate trusted/untrusted kmers.
  9. ############################################################
  10. ############################################################
  11. # main
  12. ############################################################
  13. def main():
  14. usage = 'usage: %prog [options] <counts file>'
  15. parser = OptionParser(usage)
  16. parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]')
  17. parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]')
  18. parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]')
  19. # help='Model kmer coverage as a function of GC content of kmers [default: %default]'
  20. parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP)
  21. (options, args) = parser.parse_args()
  22. if len(args) != 1:
  23. parser.error('Must provide kmers counts file')
  24. else:
  25. ctsf = args[0]
  26. if options.count_kmers:
  27. model_cutoff(ctsf, options.ratio)
  28. print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
  29. else:
  30. if options.model_gc:
  31. model_q_gc_cutoffs(ctsf, 25000, options.ratio)
  32. else:
  33. model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample)
  34. print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
  35. ############################################################
  36. # model_cutoff
  37. #
  38. # Make a histogram of kmers to give to R to learn the cutoff
  39. ############################################################
  40. def model_cutoff(ctsf, ratio):
  41. # make kmer histogram
  42. cov_max = 0
  43. for line in open(ctsf):
  44. cov = int(line.split()[1])
  45. if cov > cov_max:
  46. cov_max = cov
  47. kmer_hist = [0]*cov_max
  48. for line in open(ctsf):
  49. cov = int(line.split()[1])
  50. kmer_hist[cov-1] += 1
  51. cov_out = open('kmers.hist', 'w')
  52. for cov in range(0,cov_max):
  53. if kmer_hist[cov]:
  54. print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov])
  55. cov_out.close()
  56. os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir))
  57. ############################################################
  58. # model_q_cutoff
  59. #
  60. # Sample kmers to give to R to learn the cutoff
  61. # 'div100' is necessary when the number of kmers is too
  62. # large for random.sample, so we only consider every 100th
  63. # kmer.
  64. ############################################################
  65. def model_q_cutoff(ctsf, sample, ratio, no_sample=False):
  66. if not no_sample:
  67. # count number of kmer coverages
  68. num_covs = 0
  69. for line in open(ctsf):
  70. num_covs += 1
  71. # choose random kmer coverages
  72. div100 = False
  73. if sample >= num_covs:
  74. rand_covs = range(num_covs)
  75. else:
  76. if num_covs > 1000000000:
  77. div100 = True
  78. rand_covs = random.sample(xrange(num_covs/100), sample)
  79. else:
  80. rand_covs = random.sample(xrange(num_covs), sample)
  81. rand_covs.sort()
  82. # print to file
  83. out = open('kmers.txt', 'w')
  84. kmer_i = 0
  85. rand_i = 0
  86. for line in open(ctsf):
  87. if div100:
  88. if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]:
  89. print >> out, line.split()[1]
  90. rand_i += 1
  91. if rand_i >= sample:
  92. break
  93. else:
  94. if kmer_i == rand_covs[rand_i]:
  95. print >> out, line.split()[1]
  96. rand_i += 1
  97. if rand_i >= sample:
  98. break
  99. kmer_i += 1
  100. out.close()
  101. os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir))
  102. ############################################################
  103. # model_q_gc_cutoffs
  104. #
  105. # Sample kmers to give to R to learn the cutoff for each
  106. # GC value
  107. ############################################################
  108. def model_q_gc_cutoffs(ctsf, sample, ratio):
  109. # count number of kmer coverages at each at
  110. k = len(open(ctsf).readline().split()[0])
  111. num_covs_at = [0]*(k+1)
  112. for line in open(ctsf):
  113. kmer = line.split()[0]
  114. num_covs_at[count_at(kmer)] += 1
  115. # for each AT bin
  116. at_cutoffs = []
  117. for at in range(1,k):
  118. # sample covs
  119. if sample >= num_covs_at[at]:
  120. rand_covs = range(num_covs_at[at])
  121. else:
  122. rand_covs = random.sample(xrange(num_covs_at[at]), sample)
  123. rand_covs.sort()
  124. # print to file
  125. out = open('kmers.txt', 'w')
  126. kmer_i = 0
  127. rand_i = 0
  128. for line in open(ctsf):
  129. (kmer,cov) = line.split()
  130. if count_at(kmer) == at:
  131. if kmer_i == rand_covs[rand_i]:
  132. print >> out, cov
  133. rand_i += 1
  134. if rand_i >= sample:
  135. break
  136. kmer_i += 1
  137. out.close()
  138. os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
  139. at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
  140. if at in [1,k-1]: # setting extremes to next closests
  141. at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
  142. os.system('mv kmers.txt kmers.at%d.txt' % at)
  143. os.system('mv cutoff.txt cutoff.at%d.txt' % at)
  144. out = open('cutoffs.gc.txt','w')
  145. print >> out, '\n'.join(at_cutoffs)
  146. out.close()
  147. ############################################################
  148. # model_q_gc_cutoffs_bigmem
  149. #
  150. # Sample kmers to give to R to learn the cutoff for each
  151. # GC value
  152. ############################################################
  153. def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio):
  154. # input coverages
  155. k = 0
  156. for line in open(ctsf):
  157. (kmer,cov) = line.split()
  158. if k == 0:
  159. k = len(kmer)
  160. at_covs = ['']*(k+1)
  161. else:
  162. at = count_at(kmer)
  163. if at_covs[at]:
  164. at_covs[at].append(cov)
  165. else:
  166. at_covs[at] = [cov]
  167. for at in range(1,k):
  168. print '%d %d' % (at,len(at_covs[at]))
  169. # for each AT bin
  170. at_cutoffs = []
  171. for at in range(1,k):
  172. # sample covs
  173. if sample >= len(at_covs[at]):
  174. rand_covs = at_covs[at]
  175. else:
  176. rand_covs = random.sample(at_covs[at], sample)
  177. # print to file
  178. out = open('kmers.txt', 'w')
  179. for rc in rand_covs:
  180. print >> out, rc
  181. out.close()
  182. os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
  183. at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
  184. if at in [1,k-1]: # setting extremes to next closests
  185. at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
  186. os.system('mv kmers.txt kmers.at%d.txt' % at)
  187. os.system('mv cutoff.txt cutoff.at%d.txt' % at)
  188. out = open('cutoffs.gc.txt','w')
  189. print >> out, '\n'.join(at_cutoffs)
  190. out.close()
  191. ############################################################
  192. # count_at
  193. #
  194. # Count A's and T's in the given sequence
  195. ############################################################
  196. def count_at(seq):
  197. return len([nt for nt in seq if nt in ['A','T']])
  198. ############################################################
  199. # __main__
  200. ############################################################
  201. if __name__ == '__main__':
  202. main()