/tools/regVariation/featureCounter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 148 lines · 121 code · 17 blank · 10 comment · 39 complexity · bb224ffabb429d6e5261587244a78801 MD5 · raw file

  1. #!/usr/bin/env python
  2. #Guruprasad Ananda
  3. """
  4. Calculate count and coverage of one query on another, and append the Coverage and counts to
  5. the last four columns as bases covered, percent coverage, number of completely present features, number of partially present/overlapping features.
  6. usage: %prog bed_file_1 bed_file_2 out_file
  7. -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file
  8. -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file
  9. """
  10. from galaxy import eggs
  11. import pkg_resources
  12. pkg_resources.require( "bx-python" )
  13. import sys, traceback, fileinput
  14. from warnings import warn
  15. from bx.intervals.io import *
  16. from bx.cookbook import doc_optparse
  17. from bx.intervals.operations import quicksect
  18. from galaxy.tools.util.galaxyops import *
  19. assert sys.version_info[:2] >= ( 2, 4 )
  20. def stop_err(msg):
  21. sys.stderr.write(msg)
  22. sys.exit()
  23. def counter(node, start, end):
  24. global full, partial
  25. if node.start <= start and node.maxend > start:
  26. if node.end >= end or (node.start == start and end > node.end > start):
  27. full += 1
  28. elif end > node.end > start:
  29. partial += 1
  30. if node.left and node.left.maxend > start:
  31. counter(node.left, start, end)
  32. if node.right:
  33. counter(node.right, start, end)
  34. elif start < node.start < end:
  35. if node.end <= end:
  36. full += 1
  37. else:
  38. partial += 1
  39. if node.left and node.left.maxend > start:
  40. counter(node.left, start, end)
  41. if node.right:
  42. counter(node.right, start, end)
  43. else:
  44. if node.left:
  45. counter(node.left, start, end)
  46. def count_coverage( readers, comments=True ):
  47. primary = readers[0]
  48. secondary = readers[1]
  49. secondary_copy = readers[2]
  50. rightTree = quicksect.IntervalTree()
  51. for item in secondary:
  52. if type( item ) is GenomicInterval:
  53. rightTree.insert( item, secondary.linenum, item.fields )
  54. bitsets = secondary_copy.binned_bitsets()
  55. global full, partial
  56. for interval in primary:
  57. if type( interval ) is Header:
  58. yield interval
  59. if type( interval ) is Comment and comments:
  60. yield interval
  61. elif type( interval ) == GenomicInterval:
  62. chrom = interval.chrom
  63. start = int(interval.start)
  64. end = int(interval.end)
  65. full = 0
  66. partial = 0
  67. if chrom not in bitsets:
  68. bases_covered = 0
  69. percent = 0.0
  70. full = 0
  71. partial = 0
  72. else:
  73. bases_covered = bitsets[ chrom ].count_range( start, end-start )
  74. if (end - start) == 0:
  75. percent = 0
  76. else:
  77. percent = float(bases_covered) / float(end - start)
  78. if bases_covered:
  79. root = rightTree.chroms[chrom] #root node for the chrom tree
  80. counter(root, start, end)
  81. interval.fields.append(str(bases_covered))
  82. interval.fields.append(str(percent))
  83. interval.fields.append(str(full))
  84. interval.fields.append(str(partial))
  85. yield interval
  86. def main():
  87. options, args = doc_optparse.parse( __doc__ )
  88. try:
  89. chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
  90. chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
  91. in1_fname, in2_fname, out_fname = args
  92. except:
  93. stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
  94. g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ),
  95. chrom_col=chr_col_1,
  96. start_col=start_col_1,
  97. end_col=end_col_1,
  98. strand_col=strand_col_1,
  99. fix_strand=True )
  100. g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
  101. chrom_col=chr_col_2,
  102. start_col=start_col_2,
  103. end_col=end_col_2,
  104. strand_col=strand_col_2,
  105. fix_strand=True )
  106. g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
  107. chrom_col=chr_col_2,
  108. start_col=start_col_2,
  109. end_col=end_col_2,
  110. strand_col=strand_col_2,
  111. fix_strand=True )
  112. out_file = open( out_fname, "w" )
  113. try:
  114. for line in count_coverage([g1,g2,g2_copy]):
  115. if type( line ) is GenomicInterval:
  116. out_file.write( "%s\n" % "\t".join( line.fields ) )
  117. else:
  118. out_file.write( "%s\n" % line )
  119. except ParseError, exc:
  120. out_file.close()
  121. fail( str( exc ) )
  122. out_file.close()
  123. if g1.skipped > 0:
  124. print skipped( g1, filedesc=" of 1st dataset" )
  125. if g2.skipped > 0:
  126. print skipped( g2, filedesc=" of 2nd dataset" )
  127. elif g2_copy.skipped > 0:
  128. print skipped( g2_copy, filedesc=" of 2nd dataset" )
  129. if __name__ == "__main__":
  130. main()