/tools/new_operations/gops_cluster.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 132 lines · 101 code · 13 blank · 18 comment · 34 complexity · 4c056b62a5b4d01487f1260d90d3ba50 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Cluster regions of intervals.
  4. usage: %prog in_file out_file
  5. -1, --cols1=N,N,N,N: Columns for start, end, strand in file
  6. -d, --distance=N: Maximum distance between clustered intervals
  7. -v, --overlap=N: Minimum overlap require (negative distance)
  8. -m, --minregions=N: Minimum regions per cluster
  9. -o, --output=N: 1)merged 2)filtered 3)clustered 4) minimum 5) maximum
  10. """
  11. from galaxy import eggs
  12. import pkg_resources
  13. pkg_resources.require( "bx-python" )
  14. import sys, traceback, fileinput
  15. from warnings import warn
  16. from bx.intervals import *
  17. from bx.intervals.io import *
  18. from bx.intervals.operations.find_clusters import *
  19. from bx.cookbook import doc_optparse
  20. from galaxy.tools.util.galaxyops import *
  21. assert sys.version_info[:2] >= ( 2, 4 )
  22. def main():
  23. distance = 0
  24. minregions = 2
  25. output = 1
  26. upstream_pad = 0
  27. downstream_pad = 0
  28. options, args = doc_optparse.parse( __doc__ )
  29. try:
  30. chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
  31. if options.distance: distance = int( options.distance )
  32. if options.overlap: distance = -1 * int( options.overlap )
  33. if options.output: output = int( options.output )
  34. if options.minregions: minregions = int( options.minregions )
  35. in_fname, out_fname = args
  36. except:
  37. doc_optparse.exception()
  38. g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
  39. chrom_col=chr_col_1,
  40. start_col=start_col_1,
  41. end_col=end_col_1,
  42. strand_col=strand_col_1,
  43. fix_strand=True )
  44. # Get the cluster tree
  45. try:
  46. clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
  47. except ParseError, exc:
  48. fail( "Invalid file format: %s" % str( exc ) )
  49. f1 = open( in_fname, "r" )
  50. out_file = open( out_fname, "w" )
  51. # If "merge"
  52. if output == 1:
  53. fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col)+1)]
  54. for chrom, tree in clusters.items():
  55. for start, end, lines in tree.getregions():
  56. fields[g1.chrom_col] = chrom
  57. fields[g1.start_col] = str(start)
  58. fields[g1.end_col] = str(end)
  59. out_file.write( "%s\n" % "\t".join( fields ) )
  60. # If "filtered" we preserve order of file and comments, etc.
  61. if output == 2:
  62. linenums = dict()
  63. for chrom, tree in clusters.items():
  64. for linenum in tree.getlines():
  65. linenums[linenum] = 0
  66. linenum = -1
  67. f1.seek(0)
  68. for line in f1.readlines():
  69. linenum += 1
  70. if linenum in linenums or linenum in extra:
  71. out_file.write( "%s\n" % line.rstrip( "\n\r" ) )
  72. # If "clustered" we output original intervals, but near each other (i.e. clustered)
  73. if output == 3:
  74. linenums = list()
  75. f1.seek(0)
  76. fileLines = f1.readlines()
  77. for chrom, tree in clusters.items():
  78. for linenum in tree.getlines():
  79. out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )
  80. # If "minimum" we output the smallest interval in each cluster
  81. if output == 4 or output == 5:
  82. linenums = list()
  83. f1.seek(0)
  84. fileLines = f1.readlines()
  85. for chrom, tree in clusters.items():
  86. regions = tree.getregions()
  87. for start, end, lines in tree.getregions():
  88. outsize = -1
  89. outinterval = None
  90. for line in lines:
  91. # three nested for loops?
  92. # should only execute this code once per line
  93. fileline = fileLines[line].rstrip("\n\r")
  94. try:
  95. cluster_interval = GenomicInterval( g1, fileline.split("\t"),
  96. g1.chrom_col,
  97. g1.start_col,
  98. g1.end_col,
  99. g1.strand_col,
  100. g1.default_strand,
  101. g1.fix_strand )
  102. except Exception, exc:
  103. print >> sys.stderr, str( exc )
  104. f1.close()
  105. sys.exit()
  106. interval_size = cluster_interval.end - cluster_interval.start
  107. if outsize == -1 or \
  108. ( outsize > interval_size and output == 4 ) or \
  109. ( outsize < interval_size and output == 5 ) :
  110. outinterval = cluster_interval
  111. outsize = interval_size
  112. out_file.write( "%s\n" % outinterval )
  113. f1.close()
  114. out_file.close()
  115. if g1.skipped > 0:
  116. print skipped( g1, filedesc="" )
  117. if __name__ == "__main__":
  118. main()