/tools/new_operations/flanking_features.py

https://bitbucket.org/ialbert/galaxy-genetrack · Python · 175 lines · 146 code · 14 blank · 15 comment · 70 complexity · 2a55b7ac3867a259f9209ec575467a2d MD5 · raw file

  1. #!/usr/bin/env python
  2. #By: Guruprasad Ananda
  3. """
  4. Fetch closest up/downstream interval from features corresponding to every interval in primary
  5. usage: %prog primary_file features_file out_file direction
  6. -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
  7. -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
  8. """
  9. from galaxy import eggs
  10. import pkg_resources
  11. pkg_resources.require( "bx-python" )
  12. import sys, traceback, fileinput
  13. from warnings import warn
  14. from bx.cookbook import doc_optparse
  15. from galaxy.tools.util.galaxyops import *
  16. from bx.intervals.io import *
  17. from bx.intervals.operations import quicksect
  18. assert sys.version_info[:2] >= ( 2, 4 )
  19. def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
  20. #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
  21. #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
  22. #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
  23. if direction == 1:
  24. if node.maxend < threshold_up:
  25. if node.end == node.maxend:
  26. report_func_up(node)
  27. elif node.right and node.left:
  28. if node.right.maxend == node.maxend:
  29. get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  30. elif node.left.maxend == node.maxend:
  31. get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  32. elif node.right and node.right.maxend == node.maxend:
  33. get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  34. elif node.left and node.left.maxend == node.maxend:
  35. get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  36. elif node.minend < threshold_up:
  37. if node.end < threshold_up:
  38. report_func_up(node)
  39. if node.left and node.right:
  40. if node.right.minend < threshold_up:
  41. get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  42. if node.left.minend < threshold_up:
  43. get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  44. elif node.left:
  45. if node.left.minend < threshold_up:
  46. get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  47. elif node.right:
  48. if node.right.minend < threshold_up:
  49. get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  50. elif direction == 0:
  51. if node.start > threshold_down:
  52. report_func_down(node)
  53. if node.left:
  54. get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  55. else:
  56. if node.right:
  57. get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
  58. def proximal_region_finder(readers, region, comments=True):
  59. primary = readers[0]
  60. features = readers[1]
  61. either = False
  62. if region == 'Upstream':
  63. up, down = True, False
  64. elif region == 'Downstream':
  65. up, down = False, True
  66. else:
  67. up, down = True, True
  68. if region == 'Either':
  69. either = True
  70. # Read features into memory:
  71. rightTree = quicksect.IntervalTree()
  72. for item in features:
  73. if type( item ) is GenomicInterval:
  74. rightTree.insert( item, features.linenum, item.fields )
  75. for interval in primary:
  76. if type( interval ) is Header:
  77. yield interval
  78. if type( interval ) is Comment and comments:
  79. yield interval
  80. elif type( interval ) == GenomicInterval:
  81. chrom = interval.chrom
  82. start = int(interval.start)
  83. end = int(interval.end)
  84. strand = interval.strand
  85. if chrom not in rightTree.chroms:
  86. continue
  87. else:
  88. root = rightTree.chroms[chrom] #root node for the chrom tree
  89. result_up = []
  90. result_down = []
  91. if (strand == '+' and up) or (strand == '-' and down):
  92. #upstream +ve strand and downstream -ve strand cases
  93. get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
  94. if (strand == '+' and down) or (strand == '-' and up):
  95. #downstream +ve strand and upstream -ve strand case
  96. get_closest_feature (root, 0, None, end, None, lambda node: result_down.append( node ))
  97. if result_up:
  98. outfields = list(interval)
  99. if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval.
  100. ends = []
  101. for n in result_up:
  102. ends.append(n.end)
  103. res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
  104. else:
  105. res_ind = 0
  106. if not(either):
  107. map(outfields.append, result_up[res_ind].other)
  108. yield outfields
  109. if result_down:
  110. outfields = list(interval)
  111. if not(either):
  112. map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
  113. yield outfields
  114. if either:
  115. if result_up and result_down:
  116. if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
  117. map(outfields.append, result_up[res_ind].other)
  118. else:
  119. map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
  120. elif result_up:
  121. map(outfields.append, result_up[res_ind].other)
  122. else:
  123. map(outfields.append, result_down[-1].other) #The last element of result_down will be the closest element to the given interval
  124. yield outfields
  125. def main():
  126. options, args = doc_optparse.parse( __doc__ )
  127. try:
  128. chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
  129. chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
  130. in_fname, in2_fname, out_fname, direction = args
  131. except:
  132. doc_optparse.exception()
  133. g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
  134. chrom_col=chr_col_1,
  135. start_col=start_col_1,
  136. end_col=end_col_1,
  137. strand_col=strand_col_1,
  138. fix_strand=True )
  139. g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
  140. chrom_col=chr_col_2,
  141. start_col=start_col_2,
  142. end_col=end_col_2,
  143. strand_col=strand_col_2,
  144. fix_strand=True )
  145. out_file = open( out_fname, "w" )
  146. try:
  147. for line in proximal_region_finder([g1,g2], direction):
  148. if type( line ) is list:
  149. out_file.write( "%s\n" % "\t".join( line ) )
  150. else:
  151. out_file.write( "%s\n" % line )
  152. except ParseError, exc:
  153. fail( "Invalid file format: %s" % str( exc ) )
  154. print "Direction: %s" %(direction)
  155. if g1.skipped > 0:
  156. print skipped( g1, filedesc=" of 1st dataset" )
  157. if g2.skipped > 0:
  158. print skipped( g2, filedesc=" of 2nd dataset" )
  159. if __name__ == "__main__":
  160. main()