PageRenderTime 68ms CodeModel.GetById 24ms app.highlight 38ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/new_operations/flanking_features.py

https://bitbucket.org/ialbert/galaxy-genetrack
Python | 175 lines | 146 code | 14 blank | 15 comment | 70 complexity | 2a55b7ac3867a259f9209ec575467a2d MD5 | raw file
  1#!/usr/bin/env python
  2#By: Guruprasad Ananda
  3"""
  4Fetch closest up/downstream interval from features corresponding to every interval in primary
  5
  6usage: %prog primary_file features_file out_file direction
  7    -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
  8    -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
  9"""
 10from galaxy import eggs
 11import pkg_resources
 12pkg_resources.require( "bx-python" )
 13import sys, traceback, fileinput
 14from warnings import warn
 15from bx.cookbook import doc_optparse
 16from galaxy.tools.util.galaxyops import *
 17from bx.intervals.io import *
 18from bx.intervals.operations import quicksect
 19
 20assert sys.version_info[:2] >= ( 2, 4 )
 21
 22def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
 23    #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
 24    #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
 25    #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
 26    if direction == 1: 
 27        if node.maxend < threshold_up:
 28            if node.end == node.maxend:
 29                report_func_up(node)
 30            elif node.right and node.left:
 31                if node.right.maxend == node.maxend:
 32                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 33                elif node.left.maxend == node.maxend:
 34                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 35            elif node.right and node.right.maxend == node.maxend:
 36                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 37            elif node.left and node.left.maxend == node.maxend:
 38                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 39        elif node.minend < threshold_up:
 40            if node.end < threshold_up:
 41                report_func_up(node)
 42            if node.left and node.right:
 43                if node.right.minend < threshold_up:
 44                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 45                if node.left.minend < threshold_up:
 46                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 47            elif node.left:
 48                if node.left.minend < threshold_up:
 49                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 50            elif node.right:
 51                if node.right.minend < threshold_up:
 52                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 53    elif direction == 0:
 54        if node.start > threshold_down:
 55            report_func_down(node)
 56            if node.left:
 57                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 58        else:
 59            if node.right:
 60                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 61
 62def proximal_region_finder(readers, region, comments=True):
 63    primary = readers[0]
 64    features = readers[1]
 65    either = False
 66    if region == 'Upstream':
 67        up, down = True, False
 68    elif region == 'Downstream':
 69        up, down = False, True
 70    else:
 71        up, down = True, True
 72        if region == 'Either':
 73            either = True
 74        
 75    # Read features into memory:
 76    rightTree = quicksect.IntervalTree()
 77    for item in features:
 78        if type( item ) is GenomicInterval:
 79            rightTree.insert( item, features.linenum, item.fields )
 80            
 81    for interval in primary:
 82        if type( interval ) is Header:
 83            yield interval
 84        if type( interval ) is Comment and comments:
 85            yield interval
 86        elif type( interval ) == GenomicInterval:
 87            chrom = interval.chrom
 88            start = int(interval.start)
 89            end = int(interval.end)
 90            strand = interval.strand
 91            if chrom not in rightTree.chroms:
 92                continue
 93            else:
 94                root = rightTree.chroms[chrom]    #root node for the chrom tree
 95                result_up = []
 96                result_down = []
 97                if (strand == '+' and up) or (strand == '-' and down): 
 98                    #upstream +ve strand and downstream -ve strand cases
 99                    get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
100                    
101                if (strand == '+' and down) or (strand == '-' and up):
102                    #downstream +ve strand and upstream -ve strand case
103                    get_closest_feature (root, 0, None, end, None, lambda node: result_down.append( node ))
104                
105                if result_up:
106                    outfields = list(interval)
107                    if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. 
108                        ends = []
109                        for n in result_up:
110                            ends.append(n.end)
111                        res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
112                    else:
113                        res_ind = 0
114                    if not(either):
115                        map(outfields.append, result_up[res_ind].other)
116                        yield outfields
117                
118                if result_down:    
119                    outfields = list(interval)
120                    if not(either):
121                        map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
122                        yield outfields
123                
124                if either:
125                    if result_up and result_down:
126                        if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
127                            map(outfields.append, result_up[res_ind].other)
128                        else:
129                            map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
130                    elif result_up:
131                        map(outfields.append, result_up[res_ind].other)
132                    else:
133                        map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
134                    yield outfields
135                    
136                        
137def main():
138    options, args = doc_optparse.parse( __doc__ )
139    try:
140        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
141        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
142        in_fname, in2_fname, out_fname, direction = args
143    except:
144        doc_optparse.exception()
145
146    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
147                            chrom_col=chr_col_1,
148                            start_col=start_col_1,
149                            end_col=end_col_1,
150                            strand_col=strand_col_1,
151                            fix_strand=True )
152    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
153                            chrom_col=chr_col_2,
154                            start_col=start_col_2,
155                            end_col=end_col_2,
156                            strand_col=strand_col_2,
157                            fix_strand=True )
158    out_file = open( out_fname, "w" )
159    try:
160        for line in proximal_region_finder([g1,g2], direction):
161            if type( line ) is list:
162                out_file.write( "%s\n" % "\t".join( line ) )
163            else:
164                out_file.write( "%s\n" % line )
165    except ParseError, exc:
166        fail( "Invalid file format: %s" % str( exc ) )
167
168    print "Direction: %s" %(direction)
169    if g1.skipped > 0:
170        print skipped( g1, filedesc=" of 1st dataset" )
171    if g2.skipped > 0:
172        print skipped( g2, filedesc=" of 2nd dataset" )
173
174if __name__ == "__main__":
175    main()