/tools/stats/aggregate_scores_in_intervals.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 243 lines · 229 code · 6 blank · 8 comment · 25 complexity · 333c0af384d9cce18721527e504261d4 MD5 · raw file

  1. #!/usr/bin/env python
  2. # Greg Von Kuster
  3. """
  4. usage: %prog score_file interval_file chrom start stop [out_file] [options]
  5. -b, --binned: 'score_file' is actually a directory of binned array files
  6. -m, --mask=FILE: bed file containing regions not to consider valid
  7. -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
  8. """
  9. from __future__ import division
  10. from galaxy import eggs
  11. import pkg_resources
  12. pkg_resources.require( "bx-python" )
  13. pkg_resources.require( "lrucache" )
  14. try:
  15. pkg_resources.require( "python-lzo" )
  16. except:
  17. pass
  18. import psyco_full
  19. import sys
  20. import os, os.path
  21. from UserDict import DictMixin
  22. import bx.wiggle
  23. from bx.binned_array import BinnedArray, FileBinnedArray
  24. from bx.bitset import *
  25. from bx.bitset_builders import *
  26. from math import isnan
  27. from bx.cookbook import doc_optparse
  28. from galaxy.tools.exception_handling import *
  29. assert sys.version_info[:2] >= ( 2, 4 )
  30. import tempfile, struct
  31. class PositionalScoresOnDisk:
  32. fmt = 'f'
  33. fmt_size = struct.calcsize( fmt )
  34. default_value = float( 'nan' )
  35. def __init__( self ):
  36. self.file = tempfile.TemporaryFile( 'w+b' )
  37. self.length = 0
  38. def __getitem__( self, i ):
  39. if i < 0: i = self.length + i
  40. if i < 0 or i >= self.length: return self.default_value
  41. try:
  42. self.file.seek( i * self.fmt_size )
  43. return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
  44. except Exception, e:
  45. raise IndexError, e
  46. def __setitem__( self, i, value ):
  47. if i < 0: i = self.length + i
  48. if i < 0: raise IndexError, 'Negative assignment index out of range'
  49. if i >= self.length:
  50. self.file.seek( self.length * self.fmt_size )
  51. self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
  52. self.length = i + 1
  53. self.file.seek( i * self.fmt_size )
  54. self.file.write( struct.pack( self.fmt, value ) )
  55. def __len__( self ):
  56. return self.length
  57. def __repr__( self ):
  58. i = 0
  59. repr = "[ "
  60. for i in xrange( self.length ):
  61. repr = "%s %s," % ( repr, self[i] )
  62. return "%s ]" % ( repr )
  63. class FileBinnedArrayDir( DictMixin ):
  64. """
  65. Adapter that makes a directory of FileBinnedArray files look like
  66. a regular dict of BinnedArray objects.
  67. """
  68. def __init__( self, dir ):
  69. self.dir = dir
  70. self.cache = dict()
  71. def __getitem__( self, key ):
  72. value = None
  73. if key in self.cache:
  74. value = self.cache[key]
  75. else:
  76. fname = os.path.join( self.dir, "%s.ba" % key )
  77. if os.path.exists( fname ):
  78. value = FileBinnedArray( open( fname ) )
  79. self.cache[key] = value
  80. if value is None:
  81. raise KeyError( "File does not exist: " + fname )
  82. return value
  83. def stop_err(msg):
  84. sys.stderr.write(msg)
  85. sys.exit()
  86. def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
  87. """
  88. Read a wiggle file and return a dict of BinnedArray objects keyed
  89. by chromosome.
  90. """
  91. scores_by_chrom = dict()
  92. try:
  93. for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
  94. if chrom not in scores_by_chrom:
  95. if chrom_buffer_size:
  96. scores_by_chrom[chrom] = BinnedArray()
  97. chrom_buffer_size -= 1
  98. else:
  99. scores_by_chrom[chrom] = PositionalScoresOnDisk()
  100. scores_by_chrom[chrom][pos] = val
  101. except UCSCLimitException:
  102. # Wiggle data was truncated, at the very least need to warn the user.
  103. print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
  104. except IndexError:
  105. stop_err('Data error: one or more column data values is missing in "%s"' %fname)
  106. except ValueError:
  107. stop_err('Data error: invalid data type for one or more values in "%s".' %fname)
  108. return scores_by_chrom
  109. def load_scores_ba_dir( dir ):
  110. """
  111. Return a dict-like object (keyed by chromosome) that returns
  112. FileBinnedArray objects created from "key.ba" files in `dir`
  113. """
  114. return FileBinnedArrayDir( dir )
  115. def main():
  116. # Parse command line
  117. options, args = doc_optparse.parse( __doc__ )
  118. try:
  119. score_fname = args[0]
  120. interval_fname = args[1]
  121. chrom_col = args[2]
  122. start_col = args[3]
  123. stop_col = args[4]
  124. if len( args ) > 5:
  125. out_file = open( args[5], 'w' )
  126. else:
  127. out_file = sys.stdout
  128. binned = bool( options.binned )
  129. mask_fname = options.mask
  130. except:
  131. doc_optparse.exit()
  132. if score_fname == 'None':
  133. stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )
  134. try:
  135. chrom_col = int(chrom_col) - 1
  136. start_col = int(start_col) - 1
  137. stop_col = int(stop_col) - 1
  138. except:
  139. stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
  140. if chrom_col < 0 or start_col < 0 or stop_col < 0:
  141. stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
  142. if binned:
  143. scores_by_chrom = load_scores_ba_dir( score_fname )
  144. else:
  145. try:
  146. chrom_buffer = int( options.chrom_buffer )
  147. except:
  148. chrom_buffer = 3
  149. scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
  150. if mask_fname:
  151. masks = binned_bitsets_from_file( open( mask_fname ) )
  152. else:
  153. masks = None
  154. skipped_lines = 0
  155. first_invalid_line = 0
  156. invalid_line = ''
  157. for i, line in enumerate( open( interval_fname )):
  158. valid = True
  159. line = line.rstrip('\r\n')
  160. if line and not line.startswith( '#' ):
  161. fields = line.split()
  162. try:
  163. chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
  164. except:
  165. valid = False
  166. skipped_lines += 1
  167. if not invalid_line:
  168. first_invalid_line = i + 1
  169. invalid_line = line
  170. if valid:
  171. total = 0
  172. count = 0
  173. min_score = 100000000
  174. max_score = -100000000
  175. for j in range( start, stop ):
  176. if chrom in scores_by_chrom:
  177. try:
  178. # Skip if base is masked
  179. if masks and chrom in masks:
  180. if masks[chrom][j]:
  181. continue
  182. # Get the score, only count if not 'nan'
  183. score = scores_by_chrom[chrom][j]
  184. if not isnan( score ):
  185. total += score
  186. count += 1
  187. max_score = max( score, max_score )
  188. min_score = min( score, min_score )
  189. except:
  190. continue
  191. if count > 0:
  192. avg = total/count
  193. else:
  194. avg = "nan"
  195. min_score = "nan"
  196. max_score = "nan"
  197. # Build the resulting line of data
  198. out_line = []
  199. for k in range(0, len(fields)):
  200. out_line.append(fields[k])
  201. out_line.append(avg)
  202. out_line.append(min_score)
  203. out_line.append(max_score)
  204. print >> out_file, "\t".join( map( str, out_line ) )
  205. else:
  206. skipped_lines += 1
  207. if not invalid_line:
  208. first_invalid_line = i + 1
  209. invalid_line = line
  210. elif line.startswith( '#' ):
  211. # We'll save the original comments
  212. print >> out_file, line
  213. out_file.close()
  214. if skipped_lines > 0:
  215. print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
  216. if skipped_lines == i:
  217. print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
  218. if __name__ == "__main__": main()