PageRenderTime 58ms CodeModel.GetById 10ms app.highlight 42ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/stats/aggregate_scores_in_intervals.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 243 lines | 229 code | 6 blank | 8 comment | 33 complexity | 333c0af384d9cce18721527e504261d4 MD5 | raw file
  1#!/usr/bin/env python
  2# Greg Von Kuster
  3"""
  4usage: %prog score_file interval_file chrom start stop [out_file] [options] 
  5    -b, --binned: 'score_file' is actually a directory of binned array files
  6    -m, --mask=FILE: bed file containing regions not to consider valid
  7    -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
  8"""
  9
 10from __future__ import division
 11from galaxy import eggs
 12import pkg_resources 
 13pkg_resources.require( "bx-python" )
 14pkg_resources.require( "lrucache" )
 15try:
 16    pkg_resources.require( "python-lzo" )
 17except:
 18    pass
 19
 20import psyco_full
 21import sys
 22import os, os.path
 23from UserDict import DictMixin
 24import bx.wiggle
 25from bx.binned_array import BinnedArray, FileBinnedArray
 26from bx.bitset import *
 27from bx.bitset_builders import *
 28from math import isnan
 29from bx.cookbook import doc_optparse
 30from galaxy.tools.exception_handling import *
 31
 32assert sys.version_info[:2] >= ( 2, 4 )
 33
 34import tempfile, struct
 35class PositionalScoresOnDisk:
 36    fmt = 'f'
 37    fmt_size = struct.calcsize( fmt )
 38    default_value = float( 'nan' )
 39    
 40    def __init__( self ):
 41        self.file = tempfile.TemporaryFile( 'w+b' )
 42        self.length = 0
 43    def __getitem__( self, i ):
 44        if i < 0: i = self.length + i
 45        if i < 0 or i >= self.length: return self.default_value
 46        try:
 47            self.file.seek( i * self.fmt_size )
 48            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
 49        except Exception, e:
 50            raise IndexError, e
 51    def __setitem__( self, i, value ):
 52        if i < 0: i = self.length + i
 53        if i < 0: raise IndexError, 'Negative assignment index out of range'
 54        if i >= self.length:
 55            self.file.seek( self.length * self.fmt_size )
 56            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
 57            self.length = i + 1
 58        self.file.seek( i * self.fmt_size )
 59        self.file.write( struct.pack( self.fmt, value ) )
 60    def __len__( self ):
 61        return self.length
 62    def __repr__( self ):
 63        i = 0
 64        repr = "[ "
 65        for i in xrange( self.length ):
 66            repr = "%s %s," % ( repr, self[i] )
 67        return "%s ]" % ( repr )
 68
 69class FileBinnedArrayDir( DictMixin ):
 70    """
 71    Adapter that makes a directory of FileBinnedArray files look like
 72    a regular dict of BinnedArray objects. 
 73    """
 74    def __init__( self, dir ):
 75        self.dir = dir
 76        self.cache = dict()
 77    def __getitem__( self, key ):
 78        value = None
 79        if key in self.cache:
 80            value = self.cache[key]
 81        else:
 82            fname = os.path.join( self.dir, "%s.ba" % key )
 83            if os.path.exists( fname ):
 84                value = FileBinnedArray( open( fname ) )
 85                self.cache[key] = value
 86        if value is None:
 87            raise KeyError( "File does not exist: " + fname )
 88        return value
 89
 90def stop_err(msg):
 91    sys.stderr.write(msg)
 92    sys.exit()
 93    
 94def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
 95    """
 96    Read a wiggle file and return a dict of BinnedArray objects keyed 
 97    by chromosome.
 98    """ 
 99    scores_by_chrom = dict()
100    try:
101        for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
102            if chrom not in scores_by_chrom:
103                if chrom_buffer_size:
104                    scores_by_chrom[chrom] = BinnedArray()
105                    chrom_buffer_size -= 1
106                else:
107                    scores_by_chrom[chrom] = PositionalScoresOnDisk()
108            scores_by_chrom[chrom][pos] = val
109    except UCSCLimitException:
110        # Wiggle data was truncated, at the very least need to warn the user.
111        print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
112    except IndexError:
113        stop_err('Data error: one or more column data values is missing in "%s"' %fname)
114    except ValueError:
115        stop_err('Data error: invalid data type for one or more values in "%s".' %fname)
116    return scores_by_chrom
117
118def load_scores_ba_dir( dir ):
119    """
120    Return a dict-like object (keyed by chromosome) that returns 
121    FileBinnedArray objects created from "key.ba" files in `dir`
122    """
123    return FileBinnedArrayDir( dir )
124    
125def main():
126
127    # Parse command line
128    options, args = doc_optparse.parse( __doc__ )
129
130    try:
131        score_fname = args[0]
132        interval_fname = args[1]
133        chrom_col = args[2]
134        start_col = args[3]
135        stop_col = args[4]
136        if len( args ) > 5:
137            out_file = open( args[5], 'w' )
138        else:
139            out_file = sys.stdout
140        binned = bool( options.binned )
141        mask_fname = options.mask
142    except:
143        doc_optparse.exit()
144
145    if score_fname == 'None':
146        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18.  Click the pencil icon in your history item to set the genome build if appropriate.' )
147    
148    try:
149        chrom_col = int(chrom_col) - 1
150        start_col = int(start_col) - 1
151        stop_col = int(stop_col) - 1
152    except:
153        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
154
155    if chrom_col < 0 or start_col < 0 or stop_col < 0:
156        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
157        
158    if binned:
159        scores_by_chrom = load_scores_ba_dir( score_fname )
160    else:
161        try:
162            chrom_buffer = int( options.chrom_buffer )
163        except:
164            chrom_buffer = 3
165        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
166
167    if mask_fname:
168        masks = binned_bitsets_from_file( open( mask_fname ) )
169    else:
170        masks = None
171
172    skipped_lines = 0
173    first_invalid_line = 0
174    invalid_line = ''
175
176    for i, line in enumerate( open( interval_fname )):
177        valid = True
178        line = line.rstrip('\r\n')
179        if line and not line.startswith( '#' ):
180            fields = line.split()
181            
182            try:
183                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
184            except:
185                valid = False
186                skipped_lines += 1
187                if not invalid_line:
188                    first_invalid_line = i + 1
189                    invalid_line = line
190            if valid:
191                total = 0
192                count = 0
193                min_score = 100000000
194                max_score = -100000000
195                for j in range( start, stop ):
196                    if chrom in scores_by_chrom:
197                        try:
198                            # Skip if base is masked
199                            if masks and chrom in masks:
200                                if masks[chrom][j]:
201                                    continue
202                            # Get the score, only count if not 'nan'
203                            score = scores_by_chrom[chrom][j]
204                            if not isnan( score ):
205                                total += score
206                                count += 1
207                                max_score = max( score, max_score )
208                                min_score = min( score, min_score )
209                        except:
210                            continue
211                if count > 0:
212                    avg = total/count
213                else:
214                    avg = "nan"
215                    min_score = "nan"
216                    max_score = "nan"
217                
218                # Build the resulting line of data
219                out_line = []
220                for k in range(0, len(fields)):
221                    out_line.append(fields[k])
222                out_line.append(avg)
223                out_line.append(min_score)
224                out_line.append(max_score)
225                
226                print >> out_file, "\t".join( map( str, out_line ) )
227            else:
228                skipped_lines += 1
229                if not invalid_line:
230                    first_invalid_line = i + 1
231                    invalid_line = line
232        elif line.startswith( '#' ):
233            # We'll save the original comments
234            print >> out_file, line
235            
236    out_file.close()
237
238    if skipped_lines > 0:
239        print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
240        if skipped_lines == i:
241            print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
242
243if __name__ == "__main__": main()