/tools/extract/phastOdds/get_scores_galaxy.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 147 lines · 121 code · 14 blank · 12 comment · 27 complexity · b68f17867c7dee9d8d7c8b512e4edc3c MD5 · raw file
- #!/usr/bin/env python
- """
- usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options]
- -p, --perCol: standardize to lod per column
- """
- from __future__ import division
- import sys
- from galaxy import eggs
- from numpy import *
- from tables import *
- import pkg_resources; pkg_resources.require( "bx-python" )
- from bx.cookbook import doc_optparse
- from bx import intervals
- # ignore wanrnings about NumArray flavor
- from warnings import filterwarnings
- from tables.exceptions import FlavorWarning
- filterwarnings("ignore", category=FlavorWarning)
- assert sys.version_info[:2] >= ( 2, 4 )
- def stop_err( msg ):
- sys.stderr.write(msg)
- sys.exit()
- def main():
- # Parse command line
- options, args = doc_optparse.parse( __doc__ )
- try:
- h5_fname = args[0]
- mapping_fname = args[1]
- in_fname = args[2]
- out_fname = args[3]
- chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] )
- per_col = bool( options.perCol )
- except Exception, e:
- doc_optparse.exception()
-
- if h5_fname == 'None.h5':
- stop_err( 'Invalid genome build, this tool currently only works with data from build hg17. Click the pencil icon in your history item to correct the build if appropriate.' )
-
- # Open the h5 file
- h5 = openFile( h5_fname, mode = "r" )
- # Load intervals and names for the subregions
- intersecters = {}
- for i, line in enumerate( file( mapping_fname ) ):
- line = line.rstrip( '\r\n' )
- if line and not line.startswith( '#' ):
- chr, start, end, name = line.split()[0:4]
- if not intersecters.has_key( chr ):
- intersecters[ chr ] = intervals.Intersecter()
- intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) )
- # Find the subregion containing each input interval
- skipped_lines = 0
- first_invalid_line = 0
- invalid_line = ''
- out_file = open( out_fname, "w" )
- warnings = []
- warning = ''
- for i, line in enumerate( file( in_fname ) ):
- line = line.rstrip( '\r\n' )
- if line.startswith( '#' ):
- if i == 0:
- out_file.write( "%s\tscore\n" % line )
- else:
- out_file.write( "%s\n" % line )
- fields = line.split( "\t" )
- try:
- chr = fields[ chrom_col ]
- start = int( fields[ start_col ] )
- end = int( fields[ end_col ] )
- except:
- warning = "Invalid value for chrom, start or end column."
- warnings.append( warning )
- skipped_lines += 1
- if not invalid_line:
- first_invalid_line = i + 1
- invalid_line = line
- continue
- # Find matching interval
- try:
- matches = intersecters[ chr ].find( start, end )
- except:
- warning = "'%s' is not a valid chrom value for the region. " %chr
- warnings.append( warning )
- skipped_lines += 1
- if not invalid_line:
- first_invalid_line = i + 1
- invalid_line = line
- continue
- if not len( matches ) == 1:
- warning = "Interval must match exactly one target region. "
- warnings.append( warning )
- skipped_lines += 1
- if not invalid_line:
- first_invalid_line = i + 1
- invalid_line = line
- continue
- region = matches[0]
- if not ( start >= region.start and end <= region.end ):
- warning = "Interval must fall entirely within region. "
- warnings.append( warning )
- skipped_lines += 1
- if not invalid_line:
- first_invalid_line = i + 1
- invalid_line = line
- continue
- region_name = region.value
- rel_start = start - region.start
- rel_end = end - region.start
- if not rel_start < rel_end:
- warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end )
- warnings.append( warning )
- skipped_lines += 1
- if not invalid_line:
- first_invalid_line = i + 1
- invalid_line = line
- continue
- s = h5.getNode( h5.root, "scores_" + region_name )
- c = h5.getNode( h5.root, "counts_" + region_name )
- score = s[rel_end-1]
- count = c[rel_end-1]
- if rel_start > 0:
- score -= s[rel_start-1]
- count -= c[rel_start-1]
- if per_col:
- score /= count
- fields.append( str( score ) )
- out_file.write( "%s\n" % "\t".join( fields ) )
- # Close the file handle
- h5.close()
- out_file.close()
- if warnings:
- warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings )
- warn_msg += warnings[0]
- print warn_msg
- if skipped_lines:
- print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
- if __name__ == "__main__": main()