flanking_features.py - By: Guruprasad Ananda

/tools/new_operations/flanking_features.py

https://bitbucket.org/ialbert/galaxy-genetrack · Python · 175 lines · 146 code · 14 blank · 15 comment · 70 complexity · 2a55b7ac3867a259f9209ec575467a2d MD5 · raw file


#!/usr/bin/env python
#By: Guruprasad Ananda
"""
Fetch closest up/downstream interval from features corresponding to every interval in primary

usage: %prog primary_file features_file out_file direction
    -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
    -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
"""
from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
import sys, traceback, fileinput
from warnings import warn
from bx.cookbook import doc_optparse
from galaxy.tools.util.galaxyops import *
from bx.intervals.io import *
from bx.intervals.operations import quicksect

assert sys.version_info[:2] >= ( 2, 4 )

def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
    #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
    #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
    #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
    if direction == 1: 
        if node.maxend < threshold_up:
            if node.end == node.maxend:
                report_func_up(node)
            elif node.right and node.left:
                if node.right.maxend == node.maxend:
                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
                elif node.left.maxend == node.maxend:
                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
            elif node.right and node.right.maxend == node.maxend:
                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
            elif node.left and node.left.maxend == node.maxend:
                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
        elif node.minend < threshold_up:
            if node.end < threshold_up:
                report_func_up(node)
            if node.left and node.right:
                if node.right.minend < threshold_up:
                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
                if node.left.minend < threshold_up:
                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
            elif node.left:
                if node.left.minend < threshold_up:
                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
            elif node.right:
                if node.right.minend < threshold_up:
                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
    elif direction == 0:
        if node.start > threshold_down:
            report_func_down(node)
            if node.left:
                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
        else:
            if node.right:
                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)

def proximal_region_finder(readers, region, comments=True):
    primary = readers[0]
    features = readers[1]
    either = False
    if region == 'Upstream':
        up, down = True, False
    elif region == 'Downstream':
        up, down = False, True
    else:
        up, down = True, True
        if region == 'Either':
            either = True
        
    # Read features into memory:
    rightTree = quicksect.IntervalTree()
    for item in features:
        if type( item ) is GenomicInterval:
            rightTree.insert( item, features.linenum, item.fields )
            
    for interval in primary:
        if type( interval ) is Header:
            yield interval
        if type( interval ) is Comment and comments:
            yield interval
        elif type( interval ) == GenomicInterval:
            chrom = interval.chrom
            start = int(interval.start)
            end = int(interval.end)
            strand = interval.strand
            if chrom not in rightTree.chroms:
                continue
            else:
                root = rightTree.chroms[chrom]    #root node for the chrom tree
                result_up = []
                result_down = []
                if (strand == '+' and up) or (strand == '-' and down): 
                    #upstream +ve strand and downstream -ve strand cases
                    get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
                    
                if (strand == '+' and down) or (strand == '-' and up):
                    #downstream +ve strand and upstream -ve strand case
                    get_closest_feature (root, 0, None, end, None, lambda node: result_down.append( node ))
                
                if result_up:
                    outfields = list(interval)
                    if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. 
                        ends = []
                        for n in result_up:
                            ends.append(n.end)
                        res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
                    else:
                        res_ind = 0
                    if not(either):
                        map(outfields.append, result_up[res_ind].other)
                        yield outfields
                
                if result_down:    
                    outfields = list(interval)
                    if not(either):
                        map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
                        yield outfields
                
                if either:
                    if result_up and result_down:
                        if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
                            map(outfields.append, result_up[res_ind].other)
                        else:
                            map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
                    elif result_up:
                        map(outfields.append, result_up[res_ind].other)
                    else:
                        map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
                    yield outfields
                    
                        
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )
    out_file = open( out_fname, "w" )
    try:
        for line in proximal_region_finder([g1,g2], direction):
            if type( line ) is list:
                out_file.write( "%s\n" % "\t".join( line ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError, exc:
        fail( "Invalid file format: %s" % str( exc ) )

    print "Direction: %s" %(direction)
    if g1.skipped > 0:
        print skipped( g1, filedesc=" of 1st dataset" )
    if g2.skipped > 0:
        print skipped( g2, filedesc=" of 2nd dataset" )

if __name__ == "__main__":
    main()

Tech Fingerprint

Alerts (21)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
16 17
'def' Ensure functions have docstrings for documentation
22 62 137
'type(' Use isinstance() for type checking instead of type()
78 82 84 86 161
Complexity hotspot; line 97 (total complexity: 4)
97
'lambda' Avoid complex 'lambda' functions; prefer named functions for clarity and debugging
99 103
Complexity hotspot; line 101 (total complexity: 4)
101
'list(' Avoid unnecessary list conversions; use generators where possible
106 119
Complexity hotspot; lines 124 to 126 (total complexity: 4)
124 125 126
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
143
'open(' Use 'with open()' to ensure Files are properly closed
158