/tools/new_operations/flanking_features.py
Python | 175 lines | 146 code | 14 blank | 15 comment | 70 complexity | 2a55b7ac3867a259f9209ec575467a2d MD5 | raw file
1#!/usr/bin/env python
2#By: Guruprasad Ananda
3"""
4Fetch closest up/downstream interval from features corresponding to every interval in primary
5
6usage: %prog primary_file features_file out_file direction
7 -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
9"""
10from galaxy import eggs
11import pkg_resources
12pkg_resources.require( "bx-python" )
13import sys, traceback, fileinput
14from warnings import warn
15from bx.cookbook import doc_optparse
16from galaxy.tools.util.galaxyops import *
17from bx.intervals.io import *
18from bx.intervals.operations import quicksect
19
20assert sys.version_info[:2] >= ( 2, 4 )
21
22def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
23 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
24 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
25 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
26 if direction == 1:
27 if node.maxend < threshold_up:
28 if node.end == node.maxend:
29 report_func_up(node)
30 elif node.right and node.left:
31 if node.right.maxend == node.maxend:
32 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
33 elif node.left.maxend == node.maxend:
34 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
35 elif node.right and node.right.maxend == node.maxend:
36 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
37 elif node.left and node.left.maxend == node.maxend:
38 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
39 elif node.minend < threshold_up:
40 if node.end < threshold_up:
41 report_func_up(node)
42 if node.left and node.right:
43 if node.right.minend < threshold_up:
44 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
45 if node.left.minend < threshold_up:
46 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
47 elif node.left:
48 if node.left.minend < threshold_up:
49 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
50 elif node.right:
51 if node.right.minend < threshold_up:
52 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
53 elif direction == 0:
54 if node.start > threshold_down:
55 report_func_down(node)
56 if node.left:
57 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
58 else:
59 if node.right:
60 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
61
62def proximal_region_finder(readers, region, comments=True):
63 primary = readers[0]
64 features = readers[1]
65 either = False
66 if region == 'Upstream':
67 up, down = True, False
68 elif region == 'Downstream':
69 up, down = False, True
70 else:
71 up, down = True, True
72 if region == 'Either':
73 either = True
74
75 # Read features into memory:
76 rightTree = quicksect.IntervalTree()
77 for item in features:
78 if type( item ) is GenomicInterval:
79 rightTree.insert( item, features.linenum, item.fields )
80
81 for interval in primary:
82 if type( interval ) is Header:
83 yield interval
84 if type( interval ) is Comment and comments:
85 yield interval
86 elif type( interval ) == GenomicInterval:
87 chrom = interval.chrom
88 start = int(interval.start)
89 end = int(interval.end)
90 strand = interval.strand
91 if chrom not in rightTree.chroms:
92 continue
93 else:
94 root = rightTree.chroms[chrom] #root node for the chrom tree
95 result_up = []
96 result_down = []
97 if (strand == '+' and up) or (strand == '-' and down):
98 #upstream +ve strand and downstream -ve strand cases
99 get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
100
101 if (strand == '+' and down) or (strand == '-' and up):
102 #downstream +ve strand and upstream -ve strand case
103 get_closest_feature (root, 0, None, end, None, lambda node: result_down.append( node ))
104
105 if result_up:
106 outfields = list(interval)
107 if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval.
108 ends = []
109 for n in result_up:
110 ends.append(n.end)
111 res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
112 else:
113 res_ind = 0
114 if not(either):
115 map(outfields.append, result_up[res_ind].other)
116 yield outfields
117
118 if result_down:
119 outfields = list(interval)
120 if not(either):
121 map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
122 yield outfields
123
124 if either:
125 if result_up and result_down:
126 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
127 map(outfields.append, result_up[res_ind].other)
128 else:
129 map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
130 elif result_up:
131 map(outfields.append, result_up[res_ind].other)
132 else:
133 map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
134 yield outfields
135
136
137def main():
138 options, args = doc_optparse.parse( __doc__ )
139 try:
140 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
141 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
142 in_fname, in2_fname, out_fname, direction = args
143 except:
144 doc_optparse.exception()
145
146 g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
147 chrom_col=chr_col_1,
148 start_col=start_col_1,
149 end_col=end_col_1,
150 strand_col=strand_col_1,
151 fix_strand=True )
152 g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
153 chrom_col=chr_col_2,
154 start_col=start_col_2,
155 end_col=end_col_2,
156 strand_col=strand_col_2,
157 fix_strand=True )
158 out_file = open( out_fname, "w" )
159 try:
160 for line in proximal_region_finder([g1,g2], direction):
161 if type( line ) is list:
162 out_file.write( "%s\n" % "\t".join( line ) )
163 else:
164 out_file.write( "%s\n" % line )
165 except ParseError, exc:
166 fail( "Invalid file format: %s" % str( exc ) )
167
168 print "Direction: %s" %(direction)
169 if g1.skipped > 0:
170 print skipped( g1, filedesc=" of 1st dataset" )
171 if g2.skipped > 0:
172 print skipped( g2, filedesc=" of 2nd dataset" )
173
174if __name__ == "__main__":
175 main()