PageRenderTime 36ms CodeModel.GetById 12ms app.highlight 20ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/extract/phastOdds/get_scores_galaxy.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 147 lines | 121 code | 14 blank | 12 comment | 31 complexity | b68f17867c7dee9d8d7c8b512e4edc3c MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options]
  5   -p, --perCol: standardize to lod per column
  6"""
  7
  8from __future__ import division
  9
 10import sys
 11from galaxy import eggs
 12from numpy import *
 13from tables import *
 14
 15import pkg_resources; pkg_resources.require( "bx-python" )
 16from bx.cookbook import doc_optparse
 17
 18from bx import intervals
 19
 20# ignore wanrnings about NumArray flavor
 21from warnings import filterwarnings
 22from tables.exceptions import FlavorWarning
 23filterwarnings("ignore", category=FlavorWarning)
 24
 25assert sys.version_info[:2] >= ( 2, 4 )
 26
 27def stop_err( msg ):
 28    sys.stderr.write(msg)
 29    sys.exit()
 30
 31def main():
 32    # Parse command line
 33    options, args = doc_optparse.parse( __doc__ )
 34    try:
 35        h5_fname = args[0]
 36        mapping_fname = args[1]
 37        in_fname = args[2]
 38        out_fname = args[3]
 39        chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] )
 40        per_col = bool( options.perCol )
 41    except Exception, e:
 42        doc_optparse.exception()
 43        
 44    if h5_fname == 'None.h5':
 45        stop_err( 'Invalid genome build, this tool currently only works with data from build hg17.  Click the pencil icon in your history item to correct the build if appropriate.' )
 46        
 47    # Open the h5 file
 48    h5 = openFile( h5_fname, mode = "r" )
 49    # Load intervals and names for the subregions
 50    intersecters = {}
 51    for i, line in enumerate( file( mapping_fname ) ):
 52        line = line.rstrip( '\r\n' )
 53        if line and not line.startswith( '#' ):
 54            chr, start, end, name = line.split()[0:4]
 55            if not intersecters.has_key( chr ): 
 56                intersecters[ chr ] = intervals.Intersecter()
 57            intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) )
 58
 59    # Find the subregion containing each input interval
 60    skipped_lines = 0
 61    first_invalid_line = 0
 62    invalid_line = ''
 63    out_file = open( out_fname, "w" )
 64    warnings = []
 65    warning = ''
 66    for i, line in enumerate( file( in_fname ) ):
 67        line = line.rstrip( '\r\n' )
 68        if line.startswith( '#' ):
 69            if i == 0:
 70                out_file.write( "%s\tscore\n" % line )
 71            else:
 72                out_file.write( "%s\n" % line )
 73        fields = line.split( "\t" )
 74        try:
 75            chr = fields[ chrom_col ]
 76            start = int( fields[ start_col ] )
 77            end = int( fields[ end_col ] )
 78        except:
 79            warning = "Invalid value for chrom, start or end column."
 80            warnings.append( warning )
 81            skipped_lines += 1
 82            if not invalid_line:
 83                first_invalid_line = i + 1
 84                invalid_line = line
 85            continue
 86        # Find matching interval
 87        try:
 88            matches = intersecters[ chr ].find( start, end )
 89        except:
 90            warning = "'%s' is not a valid chrom value for the region. " %chr
 91            warnings.append( warning )
 92            skipped_lines += 1
 93            if not invalid_line:
 94                first_invalid_line = i + 1
 95                invalid_line = line
 96            continue
 97        if not len( matches ) == 1:
 98            warning = "Interval must match exactly one target region. "
 99            warnings.append( warning )
100            skipped_lines += 1
101            if not invalid_line:
102                first_invalid_line = i + 1
103                invalid_line = line
104            continue
105        region = matches[0]
106        if not ( start >= region.start and end <= region.end ):
107            warning = "Interval must fall entirely within region. "
108            warnings.append( warning )
109            skipped_lines += 1
110            if not invalid_line:
111                first_invalid_line = i + 1
112                invalid_line = line
113            continue
114        region_name = region.value
115        rel_start = start - region.start
116        rel_end = end - region.start
117        if not rel_start < rel_end:
118            warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end )
119            warnings.append( warning )
120            skipped_lines += 1
121            if not invalid_line:
122                first_invalid_line = i + 1
123                invalid_line = line
124            continue
125        s = h5.getNode( h5.root, "scores_" + region_name )
126        c = h5.getNode( h5.root, "counts_" + region_name )
127        score = s[rel_end-1]
128        count = c[rel_end-1]
129        if rel_start > 0:
130            score -= s[rel_start-1]
131            count -= c[rel_start-1]
132        if per_col: 
133            score /= count
134        fields.append( str( score ) )
135        out_file.write( "%s\n" % "\t".join( fields ) )
136    # Close the file handle
137    h5.close()
138    out_file.close()
139
140    if warnings:
141        warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings )
142        warn_msg += warnings[0]
143        print warn_msg
144    if skipped_lines:
145        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
146
147if __name__ == "__main__": main()