/tools/stats/aggregate_scores_in_intervals.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 243 lines · 229 code · 6 blank · 8 comment · 25 complexity · 333c0af384d9cce18721527e504261d4 MD5 · raw file
#!/usr/bin/env python
# Greg Von Kuster
"""
usage: %prog score_file interval_file chrom_col start_col stop_col [out_file] [options]
-b, --binned: 'score_file' is actually a directory of binned array files
-m, --mask=FILE: bed file containing regions not to consider valid
-c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user-supplied score file
"""
from __future__ import division

from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
pkg_resources.require( "lrucache" )
try:
    pkg_resources.require( "python-lzo" )
except:
    pass
import psyco_full
import sys
import os, os.path
from UserDict import DictMixin
import bx.wiggle
from bx.binned_array import BinnedArray, FileBinnedArray
from bx.bitset import *
from bx.bitset_builders import *
from math import isnan
from bx.cookbook import doc_optparse
from galaxy.tools.exception_handling import *

assert sys.version_info[:2] >= ( 2, 4 )

import tempfile, struct
class PositionalScoresOnDisk:
    """File-backed list of per-position float scores; unset positions read back as NaN."""
    fmt = 'f'
    fmt_size = struct.calcsize( fmt )
    default_value = float( 'nan' )

    def __init__( self ):
        self.file = tempfile.TemporaryFile( 'w+b' )
        self.length = 0
    def __getitem__( self, i ):
        if i < 0: i = self.length + i
        if i < 0 or i >= self.length: return self.default_value
        try:
            self.file.seek( i * self.fmt_size )
            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
        except Exception, e:
            raise IndexError, e
    def __setitem__( self, i, value ):
        if i < 0: i = self.length + i
        if i < 0: raise IndexError, 'Negative assignment index out of range'
        if i >= self.length:
            # Pad the gap between the current end and the new index with NaN
            self.file.seek( self.length * self.fmt_size )
            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
            self.length = i + 1
        self.file.seek( i * self.fmt_size )
        self.file.write( struct.pack( self.fmt, value ) )
    def __len__( self ):
        return self.length
    def __repr__( self ):
        rval = "[ "
        for i in xrange( self.length ):
            rval = "%s %s," % ( rval, self[i] )
        return "%s ]" % rval
class FileBinnedArrayDir( DictMixin ):
    """
    Adapter that makes a directory of FileBinnedArray files look like
    a regular dict of BinnedArray objects.
    """
    def __init__( self, dir ):
        self.dir = dir
        self.cache = dict()
    def __getitem__( self, key ):
        value = None
        if key in self.cache:
            value = self.cache[key]
        else:
            fname = os.path.join( self.dir, "%s.ba" % key )
            if os.path.exists( fname ):
                value = FileBinnedArray( open( fname ) )
                self.cache[key] = value
        if value is None:
            raise KeyError( "File does not exist: " + fname )
        return value
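# Illustrative lookup sketch (directory name is hypothetical): asking for key "chr1"
# opens "<dir>/chr1.ba" once, caches the resulting FileBinnedArray, and raises KeyError
# if that file does not exist:
#   scores = FileBinnedArrayDir( "scores_ba_dir" )
#   scores["chr1"][1000]    # score at position 1000 on chr1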
def stop_err( msg ):
    sys.stderr.write( msg )
    sys.exit()

def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
    """
    Read a wiggle file and return a dict of scores keyed by chromosome. The
    first `chrom_buffer_size` chromosomes are held in memory as BinnedArray
    objects; any further chromosomes spill to disk as PositionalScoresOnDisk
    objects.
    """
    scores_by_chrom = dict()
    try:
        for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
            if chrom not in scores_by_chrom:
                if chrom_buffer_size:
                    scores_by_chrom[chrom] = BinnedArray()
                    chrom_buffer_size -= 1
                else:
                    scores_by_chrom[chrom] = PositionalScoresOnDisk()
            scores_by_chrom[chrom][pos] = val
    except UCSCLimitException:
        # Wiggle data was truncated, at the very least need to warn the user.
        print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
    except IndexError:
        stop_err( 'Data error: one or more column data values is missing in "%s"' % fname )
    except ValueError:
        stop_err( 'Data error: invalid data type for one or more values in "%s".' % fname )
    return scores_by_chrom

def load_scores_ba_dir( dir ):
    """
    Return a dict-like object (keyed by chromosome) that returns
    FileBinnedArray objects created from "key.ba" files in `dir`.
    """
    return FileBinnedArrayDir( dir )

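# Illustrative use of the loaders (file/directory names are hypothetical): both return a
# chromosome-keyed, position-indexable mapping, which is all main() relies on:
#   scores_by_chrom = load_scores_wiggle( "scores.wig", chrom_buffer_size=3 )
#   scores_by_chrom = load_scores_ba_dir( "scores_ba_dir" )
#   scores_by_chrom["chr1"][12345]    # float score, or NaN where no score was set
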
def main():
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )
    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len( args ) > 5:
            out_file = open( args[5], 'w' )
        else:
            out_file = sys.stdout
        binned = bool( options.binned )
        mask_fname = options.mask
    except:
        doc_optparse.exit()

    if score_fname == 'None':
        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )

    try:
        # Column numbers are given 1-based on the command line; convert to 0-based
        chrom_col = int( chrom_col ) - 1
        start_col = int( start_col ) - 1
        stop_col = int( stop_col ) - 1
    except:
        stop_err( 'Chrom, start & end columns not properly set, click the pencil icon in your history item to set these values.' )

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err( 'Chrom, start & end columns not properly set, click the pencil icon in your history item to set these values.' )

    if binned:
        scores_by_chrom = load_scores_ba_dir( score_fname )
    else:
        try:
            chrom_buffer = int( options.chrom_buffer )
        except:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )

    if mask_fname:
        masks = binned_bitsets_from_file( open( mask_fname ) )
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''

    for i, line in enumerate( open( interval_fname ) ):
        valid = True
        line = line.rstrip( '\r\n' )
        if line and not line.startswith( '#' ):
            fields = line.split()

            try:
                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
            except:
                valid = False
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
            if valid:
                total = 0
                count = 0
                min_score = 100000000
                max_score = -100000000
                for j in range( start, stop ):
                    if chrom in scores_by_chrom:
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isnan( score ):
                                total += score
                                count += 1
                                max_score = max( score, max_score )
                                min_score = min( score, min_score )
                        except:
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                # Build the resulting line of data
                out_line = []
                for k in range( 0, len( fields ) ):
                    out_line.append( fields[k] )
                out_line.append( avg )
                out_line.append( min_score )
                out_line.append( max_score )

                print >> out_file, "\t".join( map( str, out_line ) )
            else:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith( '#' ):
            # We'll save the original comments
            print >> out_file, line

    out_file.close()
    if skipped_lines > 0:
        print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
        if skipped_lines == i:
            print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'

if __name__ == "__main__": main()