PageRenderTime 36ms CodeModel.GetById 20ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/ilmn_pacbio/cov_model.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 238 lines | 168 code | 25 blank | 45 comment | 41 complexity | 44d457d3fe1b2026c599e6afa62baf53 MD5 | raw file
  1#!/usr/bin/env python
  2from optparse import OptionParser, SUPPRESS_HELP
  3import os, random, quake
  4
  5############################################################
  6# cov_model.py
  7#
  8# Given a file of kmer counts, reports the cutoff to use
  9# to separate trusted/untrusted kmers.
 10############################################################
 11
 12############################################################
 13# main
 14############################################################
 15def main():
 16    usage = 'usage: %prog [options] <counts file>'
 17    parser = OptionParser(usage)
 18    parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]')
 19    parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]')
 20    parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]')
 21    # help='Model kmer coverage as a function of GC content of kmers [default: %default]'
 22    parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP)
 23    (options, args) = parser.parse_args()
 24
 25    if len(args) != 1:
 26        parser.error('Must provide kmers counts file')
 27    else:
 28        ctsf = args[0]
 29
 30    if options.count_kmers:
 31        model_cutoff(ctsf, options.ratio)
 32        print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
 33        
 34    else:
 35        if options.model_gc:
 36            model_q_gc_cutoffs(ctsf, 25000, options.ratio)
 37        else:
 38            model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample)
 39            print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
 40
 41
 42############################################################
 43# model_cutoff
 44#
 45# Make a histogram of kmers to give to R to learn the cutoff
 46############################################################
 47def model_cutoff(ctsf, ratio):
 48    # make kmer histogram
 49    cov_max = 0
 50    for line in open(ctsf):
 51        cov = int(line.split()[1])
 52        if cov > cov_max:
 53            cov_max = cov
 54
 55    kmer_hist = [0]*cov_max
 56    for line in open(ctsf):
 57        cov = int(line.split()[1])
 58        kmer_hist[cov-1] += 1
 59
 60    cov_out = open('kmers.hist', 'w')
 61    for cov in range(0,cov_max):
 62        if kmer_hist[cov]:
 63            print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov])
 64    cov_out.close()
 65
 66    os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir))
 67
 68
 69############################################################
 70# model_q_cutoff
 71#
 72# Sample kmers to give to R to learn the cutoff
 73# 'div100' is necessary when the number of kmers is too 
 74# large for random.sample, so we only consider every 100th
 75# kmer.
 76############################################################
 77def model_q_cutoff(ctsf, sample, ratio, no_sample=False):
 78    if not no_sample:
 79        # count number of kmer coverages
 80        num_covs = 0
 81        for line in open(ctsf):
 82            num_covs += 1
 83
 84        # choose random kmer coverages
 85        div100 = False
 86        if sample >= num_covs:
 87            rand_covs = range(num_covs)
 88        else:
 89            if num_covs > 1000000000:
 90                div100 = True
 91                rand_covs = random.sample(xrange(num_covs/100), sample)
 92            else:
 93                rand_covs = random.sample(xrange(num_covs), sample)
 94        rand_covs.sort()
 95
 96        # print to file
 97        out = open('kmers.txt', 'w')
 98        kmer_i = 0
 99        rand_i = 0
100        for line in open(ctsf):
101            if div100:
102                if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]:
103                    print >> out, line.split()[1]
104                    rand_i += 1
105                    if rand_i >= sample:
106                        break
107            else:
108                if kmer_i == rand_covs[rand_i]:
109                    print >> out, line.split()[1]
110                    rand_i += 1
111                    if rand_i >= sample:
112                        break
113            kmer_i += 1
114        out.close()
115
116    os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir))
117
118
119############################################################
120# model_q_gc_cutoffs
121#
122# Sample kmers to give to R to learn the cutoff for each
123# GC value
124############################################################
125def model_q_gc_cutoffs(ctsf, sample, ratio):
126    # count number of kmer coverages at each at
127    k = len(open(ctsf).readline().split()[0])
128    num_covs_at = [0]*(k+1)
129    for line in open(ctsf):
130        kmer = line.split()[0]
131        num_covs_at[count_at(kmer)] += 1
132
133    # for each AT bin
134    at_cutoffs = []
135    for at in range(1,k):
136        # sample covs
137        if sample >= num_covs_at[at]:
138            rand_covs = range(num_covs_at[at])
139        else:
140            rand_covs = random.sample(xrange(num_covs_at[at]), sample)
141        rand_covs.sort()
142
143        # print to file
144        out = open('kmers.txt', 'w')
145        kmer_i = 0
146        rand_i = 0
147        for line in open(ctsf):
148            (kmer,cov) = line.split()
149            if count_at(kmer) == at:
150                if kmer_i == rand_covs[rand_i]:
151                    print >> out, cov
152                    rand_i += 1
153                    if rand_i >= sample:
154                        break
155                kmer_i += 1
156        out.close()
157        
158        os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
159
160        at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
161        if at in [1,k-1]:   # setting extremes to next closests
162            at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
163
164        os.system('mv kmers.txt kmers.at%d.txt' % at)
165        os.system('mv cutoff.txt cutoff.at%d.txt' % at)
166
167    out = open('cutoffs.gc.txt','w')
168    print >> out, '\n'.join(at_cutoffs)
169    out.close()
170
171
172############################################################
173# model_q_gc_cutoffs_bigmem
174#
175# Sample kmers to give to R to learn the cutoff for each
176# GC value
177############################################################
178def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio):
179    # input coverages
180    k = 0
181    for line in open(ctsf):
182        (kmer,cov) = line.split()
183        if k == 0:
184            k = len(kmer)
185            at_covs = ['']*(k+1)
186        else:
187            at = count_at(kmer)
188            if at_covs[at]:
189                at_covs[at].append(cov)
190            else:
191                at_covs[at] = [cov]
192
193    for at in range(1,k):
194        print '%d %d' % (at,len(at_covs[at]))
195
196    # for each AT bin
197    at_cutoffs = []
198    for at in range(1,k):
199        # sample covs
200        if sample >= len(at_covs[at]):
201            rand_covs = at_covs[at]
202        else:
203            rand_covs = random.sample(at_covs[at], sample)
204
205        # print to file
206        out = open('kmers.txt', 'w')
207        for rc in rand_covs:
208            print >> out, rc
209        out.close()
210
211        os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
212
213        at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
214        if at in [1,k-1]:   # setting extremes to next closests
215            at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
216
217        os.system('mv kmers.txt kmers.at%d.txt' % at)
218        os.system('mv cutoff.txt cutoff.at%d.txt' % at)
219
220    out = open('cutoffs.gc.txt','w')
221    print >> out, '\n'.join(at_cutoffs)
222    out.close()
223        
224    
225############################################################
226# count_at
227#
228# Count A's and T's in the given sequence
229############################################################
230def count_at(seq):
231    return len([nt for nt in seq if nt in ['A','T']])
232
233
234############################################################
235# __main__
236############################################################
237if __name__ == '__main__':
238    main()