/tools/regVariation/featureCounter.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python
#!/usr/bin/env python
# Guruprasad Ananda
"""
Calculate count and coverage of one query file on another, appending four columns to each
interval of the first dataset: bases covered, fraction of bases covered, number of completely
present features, and number of partially present/overlapping features.

usage: %prog bed_file_1 bed_file_2 out_file
    -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file
    -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file
"""
from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
import sys, traceback, fileinput
from warnings import warn
from bx.intervals.io import *
from bx.cookbook import doc_optparse
from bx.intervals.operations import quicksect
from galaxy.tools.util.galaxyops import *

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()
def counter(node, start, end):
    # Recursively walk the quicksect interval tree rooted at 'node', updating the
    # module-level 'full' and 'partial' counters for secondary features that overlap
    # the query interval [start, end).
    global full, partial
    if node.start <= start and node.maxend > start:
        if node.end >= end or (node.start == start and end > node.end > start):
            full += 1
        elif end > node.end > start:
            partial += 1
        if node.left and node.left.maxend > start:
            counter(node.left, start, end)
        if node.right:
            counter(node.right, start, end)
    elif start < node.start < end:
        if node.end <= end:
            full += 1
        else:
            partial += 1
        if node.left and node.left.maxend > start:
            counter(node.left, start, end)
        if node.right:
            counter(node.right, start, end)
    else:
        if node.left:
            counter(node.left, start, end)
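
# A quick illustration of the classification above, using hypothetical coordinates:
# for a query interval [100, 200), a secondary feature [120, 180) lies entirely inside
# the query and increments 'full', while a feature [150, 250) extends past the query
# end and increments 'partial'.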
def count_coverage( readers, comments=True ):
    primary = readers[0]
    secondary = readers[1]
    secondary_copy = readers[2]

    # Build an interval tree of the secondary dataset for the full/partial feature counts.
    rightTree = quicksect.IntervalTree()
    for item in secondary:
        if type( item ) is GenomicInterval:
            rightTree.insert( item, secondary.linenum, item.fields )

    # A second pass over the secondary dataset builds bitsets for per-base coverage.
    bitsets = secondary_copy.binned_bitsets()

    global full, partial

    for interval in primary:
        if type( interval ) is Header:
            yield interval
        elif type( interval ) is Comment and comments:
            yield interval
        elif type( interval ) is GenomicInterval:
            chrom = interval.chrom
            start = int( interval.start )
            end = int( interval.end )
            full = 0
            partial = 0
            if chrom not in bitsets:
                bases_covered = 0
                percent = 0.0
            else:
                bases_covered = bitsets[ chrom ].count_range( start, end - start )
                if (end - start) == 0:
                    percent = 0
                else:
                    percent = float( bases_covered ) / float( end - start )
                if bases_covered:
                    root = rightTree.chroms[chrom]    # root node of the tree for this chrom
                    counter( root, start, end )
            interval.fields.append( str( bases_covered ) )
            interval.fields.append( str( percent ) )
            interval.fields.append( str( full ) )
            interval.fields.append( str( partial ) )
            yield interval
def main():
    options, args = doc_optparse.parse( __doc__ )

    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in1_fname, in2_fname, out_fname = args
    except:
        stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )

    g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )
    # Second reader over the same file: the first pass is exhausted filling the interval
    # tree, so the coverage bitsets are built from this copy.
    g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                                 chrom_col=chr_col_2,
                                 start_col=start_col_2,
                                 end_col=end_col_2,
                                 strand_col=strand_col_2,
                                 fix_strand=True )

    out_file = open( out_fname, "w" )
    try:
        for line in count_coverage( [g1, g2, g2_copy] ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError, exc:
        out_file.close()
        fail( str( exc ) )
    out_file.close()

    if g1.skipped > 0:
        print skipped( g1, filedesc=" of 1st dataset" )
    if g2.skipped > 0:
        print skipped( g2, filedesc=" of 2nd dataset" )
    elif g2_copy.skipped > 0:
        print skipped( g2_copy, filedesc=" of 2nd dataset" )

if __name__ == "__main__":
    main()