cov_model.py - This is a Python script that performs variou…

/tools/ilmn_pacbio/cov_model.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 238 lines · 146 code · 36 blank · 56 comment · 48 complexity · 44d457d3fe1b2026c599e6afa62baf53 MD5 · raw file


#!/usr/bin/env python
from optparse import OptionParser, SUPPRESS_HELP
import os, random, quake

############################################################
# cov_model.py
#
# Given a file of kmer counts, reports the cutoff to use
# to separate trusted/untrusted kmers.
############################################################

############################################################
# main
############################################################
def main():
    usage = 'usage: %prog [options] <counts file>'
    parser = OptionParser(usage)
    parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]')
    parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]')
    parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]')
    # help='Model kmer coverage as a function of GC content of kmers [default: %default]'
    parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide kmers counts file')
    else:
        ctsf = args[0]

    if options.count_kmers:
        model_cutoff(ctsf, options.ratio)
        print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
        
    else:
        if options.model_gc:
            model_q_gc_cutoffs(ctsf, 25000, options.ratio)
        else:
            model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample)
            print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()


############################################################
# model_cutoff
#
# Make a histogram of kmers to give to R to learn the cutoff
############################################################
def model_cutoff(ctsf, ratio):
    # make kmer histogram
    cov_max = 0
    for line in open(ctsf):
        cov = int(line.split()[1])
        if cov > cov_max:
            cov_max = cov

    kmer_hist = [0]*cov_max
    for line in open(ctsf):
        cov = int(line.split()[1])
        kmer_hist[cov-1] += 1

    cov_out = open('kmers.hist', 'w')
    for cov in range(0,cov_max):
        if kmer_hist[cov]:
            print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov])
    cov_out.close()

    os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir))


############################################################
# model_q_cutoff
#
# Sample kmers to give to R to learn the cutoff
# 'div100' is necessary when the number of kmers is too 
# large for random.sample, so we only consider every 100th
# kmer.
############################################################
def model_q_cutoff(ctsf, sample, ratio, no_sample=False):
    if not no_sample:
        # count number of kmer coverages
        num_covs = 0
        for line in open(ctsf):
            num_covs += 1

        # choose random kmer coverages
        div100 = False
        if sample >= num_covs:
            rand_covs = range(num_covs)
        else:
            if num_covs > 1000000000:
                div100 = True
                rand_covs = random.sample(xrange(num_covs/100), sample)
            else:
                rand_covs = random.sample(xrange(num_covs), sample)
        rand_covs.sort()

        # print to file
        out = open('kmers.txt', 'w')
        kmer_i = 0
        rand_i = 0
        for line in open(ctsf):
            if div100:
                if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]:
                    print >> out, line.split()[1]
                    rand_i += 1
                    if rand_i >= sample:
                        break
            else:
                if kmer_i == rand_covs[rand_i]:
                    print >> out, line.split()[1]
                    rand_i += 1
                    if rand_i >= sample:
                        break
            kmer_i += 1
        out.close()

    os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir))


############################################################
# model_q_gc_cutoffs
#
# Sample kmers to give to R to learn the cutoff for each
# GC value
############################################################
def model_q_gc_cutoffs(ctsf, sample, ratio):
    # count number of kmer coverages at each at
    k = len(open(ctsf).readline().split()[0])
    num_covs_at = [0]*(k+1)
    for line in open(ctsf):
        kmer = line.split()[0]
        num_covs_at[count_at(kmer)] += 1

    # for each AT bin
    at_cutoffs = []
    for at in range(1,k):
        # sample covs
        if sample >= num_covs_at[at]:
            rand_covs = range(num_covs_at[at])
        else:
            rand_covs = random.sample(xrange(num_covs_at[at]), sample)
        rand_covs.sort()

        # print to file
        out = open('kmers.txt', 'w')
        kmer_i = 0
        rand_i = 0
        for line in open(ctsf):
            (kmer,cov) = line.split()
            if count_at(kmer) == at:
                if kmer_i == rand_covs[rand_i]:
                    print >> out, cov
                    rand_i += 1
                    if rand_i >= sample:
                        break
                kmer_i += 1
        out.close()
        
        os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))

        at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
        if at in [1,k-1]:   # setting extremes to next closests
            at_cutoffs.append( open('cutoff.txt').readline().rstrip() )

        os.system('mv kmers.txt kmers.at%d.txt' % at)
        os.system('mv cutoff.txt cutoff.at%d.txt' % at)

    out = open('cutoffs.gc.txt','w')
    print >> out, '\n'.join(at_cutoffs)
    out.close()


############################################################
# model_q_gc_cutoffs_bigmem
#
# Sample kmers to give to R to learn the cutoff for each
# GC value
############################################################
def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio):
    # input coverages
    k = 0
    for line in open(ctsf):
        (kmer,cov) = line.split()
        if k == 0:
            k = len(kmer)
            at_covs = ['']*(k+1)
        else:
            at = count_at(kmer)
            if at_covs[at]:
                at_covs[at].append(cov)
            else:
                at_covs[at] = [cov]

    for at in range(1,k):
        print '%d %d' % (at,len(at_covs[at]))

    # for each AT bin
    at_cutoffs = []
    for at in range(1,k):
        # sample covs
        if sample >= len(at_covs[at]):
            rand_covs = at_covs[at]
        else:
            rand_covs = random.sample(at_covs[at], sample)

        # print to file
        out = open('kmers.txt', 'w')
        for rc in rand_covs:
            print >> out, rc
        out.close()

        os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))

        at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
        if at in [1,k-1]:   # setting extremes to next closests
            at_cutoffs.append( open('cutoff.txt').readline().rstrip() )

        os.system('mv kmers.txt kmers.at%d.txt' % at)
        os.system('mv cutoff.txt cutoff.at%d.txt' % at)

    out = open('cutoffs.gc.txt','w')
    print >> out, '\n'.join(at_cutoffs)
    out.close()
        
    
############################################################
# count_at
#
# Count A's and T's in the given sequence
############################################################
def count_at(seq):
    return len([nt for nt in seq if nt in ['A','T']])


############################################################
# __main__
############################################################
if __name__ == '__main__':
    main()

Summary ✨

This is a Python script that performs various tasks related to analyzing and processing DNA sequences. It can be used to count the number of A’s and T’s in a given sequence, as well as to calculate the GC content of a DNA sequence. Additionally, it can be used to generate a histogram of the GC content of a DNA sequence.

Tech Fingerprint

Alerts (29)

'def' Ensure functions have docstrings for documentation
15 47 77 125 178 230
'open(' Use 'with open()' to ensure Files are properly closed
32 39 50 56 60 81 97 100 127 129 144 147 160 162 167 181 206 213 215 220
Complexity hotspot; lines 100 to 102 (total complexity: 4)
100 101 102