/tools/extract/phastOdds/get_scores_galaxy.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 147 lines · 121 code · 14 blank · 12 comment · 27 complexity · b68f17867c7dee9d8d7c8b512e4edc3c MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options]
  4. -p, --perCol: standardize to lod per column
  5. """
  6. from __future__ import division
  7. import sys
  8. from galaxy import eggs
  9. from numpy import *
  10. from tables import *
  11. import pkg_resources; pkg_resources.require( "bx-python" )
  12. from bx.cookbook import doc_optparse
  13. from bx import intervals
  14. # ignore wanrnings about NumArray flavor
  15. from warnings import filterwarnings
  16. from tables.exceptions import FlavorWarning
  17. filterwarnings("ignore", category=FlavorWarning)
  18. assert sys.version_info[:2] >= ( 2, 4 )
  19. def stop_err( msg ):
  20. sys.stderr.write(msg)
  21. sys.exit()
  22. def main():
  23. # Parse command line
  24. options, args = doc_optparse.parse( __doc__ )
  25. try:
  26. h5_fname = args[0]
  27. mapping_fname = args[1]
  28. in_fname = args[2]
  29. out_fname = args[3]
  30. chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] )
  31. per_col = bool( options.perCol )
  32. except Exception, e:
  33. doc_optparse.exception()
  34. if h5_fname == 'None.h5':
  35. stop_err( 'Invalid genome build, this tool currently only works with data from build hg17. Click the pencil icon in your history item to correct the build if appropriate.' )
  36. # Open the h5 file
  37. h5 = openFile( h5_fname, mode = "r" )
  38. # Load intervals and names for the subregions
  39. intersecters = {}
  40. for i, line in enumerate( file( mapping_fname ) ):
  41. line = line.rstrip( '\r\n' )
  42. if line and not line.startswith( '#' ):
  43. chr, start, end, name = line.split()[0:4]
  44. if not intersecters.has_key( chr ):
  45. intersecters[ chr ] = intervals.Intersecter()
  46. intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) )
  47. # Find the subregion containing each input interval
  48. skipped_lines = 0
  49. first_invalid_line = 0
  50. invalid_line = ''
  51. out_file = open( out_fname, "w" )
  52. warnings = []
  53. warning = ''
  54. for i, line in enumerate( file( in_fname ) ):
  55. line = line.rstrip( '\r\n' )
  56. if line.startswith( '#' ):
  57. if i == 0:
  58. out_file.write( "%s\tscore\n" % line )
  59. else:
  60. out_file.write( "%s\n" % line )
  61. fields = line.split( "\t" )
  62. try:
  63. chr = fields[ chrom_col ]
  64. start = int( fields[ start_col ] )
  65. end = int( fields[ end_col ] )
  66. except:
  67. warning = "Invalid value for chrom, start or end column."
  68. warnings.append( warning )
  69. skipped_lines += 1
  70. if not invalid_line:
  71. first_invalid_line = i + 1
  72. invalid_line = line
  73. continue
  74. # Find matching interval
  75. try:
  76. matches = intersecters[ chr ].find( start, end )
  77. except:
  78. warning = "'%s' is not a valid chrom value for the region. " %chr
  79. warnings.append( warning )
  80. skipped_lines += 1
  81. if not invalid_line:
  82. first_invalid_line = i + 1
  83. invalid_line = line
  84. continue
  85. if not len( matches ) == 1:
  86. warning = "Interval must match exactly one target region. "
  87. warnings.append( warning )
  88. skipped_lines += 1
  89. if not invalid_line:
  90. first_invalid_line = i + 1
  91. invalid_line = line
  92. continue
  93. region = matches[0]
  94. if not ( start >= region.start and end <= region.end ):
  95. warning = "Interval must fall entirely within region. "
  96. warnings.append( warning )
  97. skipped_lines += 1
  98. if not invalid_line:
  99. first_invalid_line = i + 1
  100. invalid_line = line
  101. continue
  102. region_name = region.value
  103. rel_start = start - region.start
  104. rel_end = end - region.start
  105. if not rel_start < rel_end:
  106. warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end )
  107. warnings.append( warning )
  108. skipped_lines += 1
  109. if not invalid_line:
  110. first_invalid_line = i + 1
  111. invalid_line = line
  112. continue
  113. s = h5.getNode( h5.root, "scores_" + region_name )
  114. c = h5.getNode( h5.root, "counts_" + region_name )
  115. score = s[rel_end-1]
  116. count = c[rel_end-1]
  117. if rel_start > 0:
  118. score -= s[rel_start-1]
  119. count -= c[rel_start-1]
  120. if per_col:
  121. score /= count
  122. fields.append( str( score ) )
  123. out_file.write( "%s\n" % "\t".join( fields ) )
  124. # Close the file handle
  125. h5.close()
  126. out_file.close()
  127. if warnings:
  128. warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings )
  129. warn_msg += warnings[0]
  130. print warn_msg
  131. if skipped_lines:
  132. print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
  133. if __name__ == "__main__": main()