PageRenderTime 21ms CodeModel.GetById 2ms app.highlight 16ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/regVariation/featureCounter.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 148 lines | 121 code | 17 blank | 10 comment | 32 complexity | bb224ffabb429d6e5261587244a78801 MD5 | raw file
  1#!/usr/bin/env python
  2#Guruprasad Ananda
  3"""
  4Calculate count and coverage of one query on another, and append the Coverage and counts to
  5the last four columns as bases covered, percent coverage, number of completely present features, number of partially present/overlapping features.
  6
  7usage: %prog bed_file_1 bed_file_2 out_file
  8    -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file
  9    -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file
 10"""
 11from galaxy import eggs
 12import pkg_resources
 13pkg_resources.require( "bx-python" )
 14import sys, traceback, fileinput
 15from warnings import warn
 16from bx.intervals.io import *
 17from bx.cookbook import doc_optparse
 18from bx.intervals.operations import quicksect
 19from galaxy.tools.util.galaxyops import *
 20
 21assert sys.version_info[:2] >= ( 2, 4 )
 22
 23def stop_err(msg):
 24    sys.stderr.write(msg)
 25    sys.exit()
 26
 27def counter(node, start, end):
 28    global full, partial
 29    if node.start <= start and node.maxend > start:
 30        if node.end >= end or (node.start == start and end > node.end > start):
 31            full += 1
 32        elif end > node.end > start:
 33            partial += 1
 34        if node.left and node.left.maxend > start:
 35            counter(node.left, start, end)
 36        if node.right: 
 37            counter(node.right, start, end)
 38    elif start < node.start < end:
 39        if node.end <= end:
 40            full += 1
 41        else:
 42            partial += 1
 43        if node.left and node.left.maxend > start:
 44            counter(node.left, start, end)
 45        if node.right: 
 46            counter(node.right, start, end)
 47    else:
 48        if node.left: 
 49            counter(node.left, start, end)
 50
 51def count_coverage( readers, comments=True ):
 52    primary = readers[0]
 53    secondary = readers[1]
 54    secondary_copy = readers[2]
 55    
 56    rightTree = quicksect.IntervalTree()
 57    for item in secondary:
 58        if type( item ) is GenomicInterval:
 59            rightTree.insert( item, secondary.linenum, item.fields )
 60    
 61    bitsets = secondary_copy.binned_bitsets() 
 62        
 63    global full, partial
 64    
 65    for interval in primary:
 66        if type( interval ) is Header:
 67            yield interval
 68        if type( interval ) is Comment and comments:
 69            yield interval
 70        elif type( interval ) == GenomicInterval:
 71            chrom = interval.chrom
 72            start = int(interval.start)
 73            end = int(interval.end)
 74            full = 0
 75            partial = 0
 76            if chrom not in bitsets:
 77                bases_covered = 0
 78                percent = 0.0
 79                full = 0
 80                partial = 0
 81            else:
 82                bases_covered = bitsets[ chrom ].count_range( start, end-start )
 83                if (end - start) == 0:
 84                    percent = 0
 85                else: 
 86                    percent = float(bases_covered) / float(end - start)
 87                if bases_covered:
 88                    root = rightTree.chroms[chrom]    #root node for the chrom tree
 89                    counter(root, start, end)
 90            interval.fields.append(str(bases_covered))
 91            interval.fields.append(str(percent))
 92            interval.fields.append(str(full))
 93            interval.fields.append(str(partial))
 94            yield interval
 95    
 96def main():
 97    options, args = doc_optparse.parse( __doc__ )
 98    
 99    try:
100        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
101        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
102        in1_fname, in2_fname, out_fname = args
103    except:
104        stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
105    
106    g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ),
107                            chrom_col=chr_col_1,
108                            start_col=start_col_1,
109                            end_col=end_col_1,
110                            strand_col=strand_col_1,
111                            fix_strand=True )
112    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
113                            chrom_col=chr_col_2,
114                            start_col=start_col_2,
115                            end_col=end_col_2,
116                            strand_col=strand_col_2,
117                            fix_strand=True )
118    g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
119                                 chrom_col=chr_col_2,
120                                 start_col=start_col_2,
121                                 end_col=end_col_2,
122                                 strand_col=strand_col_2,
123                                 fix_strand=True )
124    
125
126    out_file = open( out_fname, "w" )
127
128    try:
129        for line in count_coverage([g1,g2,g2_copy]):
130            if type( line ) is GenomicInterval:
131                out_file.write( "%s\n" % "\t".join( line.fields ) )
132            else:
133                out_file.write( "%s\n" % line )
134    except ParseError, exc:
135        out_file.close()
136        fail( str( exc ) )
137
138    out_file.close()
139
140    if g1.skipped > 0:
141        print skipped( g1, filedesc=" of 1st dataset" )
142    if g2.skipped > 0:
143        print skipped( g2, filedesc=" of 2nd dataset" )
144    elif g2_copy.skipped > 0:
145        print skipped( g2_copy, filedesc=" of 2nd dataset" )
146        
147if __name__ == "__main__":
148    main()