PageRenderTime 46ms CodeModel.GetById 14ms app.highlight 26ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/stats/gsummary.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 114 lines | 91 code | 15 blank | 8 comment | 39 complexity | a7ea785f992504f41d51b08cfdd64b4a MD5 | raw file
  1#!/usr/bin/env python
  2
  3import sys, re, tempfile
  4from rpy import *
  5# Older py compatibility
  6try:
  7    set()
  8except:
  9    from sets import Set as set
 10
 11assert sys.version_info[:2] >= ( 2, 4 )
 12
 13def stop_err( msg ):
 14    sys.stderr.write( msg )
 15    sys.exit()
 16
 17def S3_METHODS( all="key" ):
 18    Group_Math =  [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
 19        "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
 20        "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
 21        "cumsum", "cumprod", "cummax", "cummin", "c" ]
 22    Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
 23    if all is "key":
 24        return { 'Math' : Group_Math, 'Ops' : Group_Ops }
 25
 26def main():
 27    try:
 28        datafile = sys.argv[1]
 29        outfile_name = sys.argv[2]
 30        expression = sys.argv[3]
 31    except: 
 32        stop_err( 'Usage: python gsummary.py input_file ouput_file expression' )
 33
 34    math_allowed = S3_METHODS()[ 'Math' ]
 35    ops_allowed = S3_METHODS()[ 'Ops' ]
 36
 37    # Check for invalid expressions
 38    for word in re.compile( '[a-zA-Z]+' ).findall( expression ):
 39        if word and not word in math_allowed: 
 40            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" %( expression, word ) )
 41    symbols = set()
 42    for symbol in re.compile( '[^a-z0-9\s]+' ).findall( expression ):
 43        if symbol and not symbol in ops_allowed:
 44            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
 45        else:
 46            symbols.add( symbol )
 47    if len( symbols ) == 1 and ',' in symbols:
 48        # User may have entered a comma-separated list r_data_frame columns
 49        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )
 50
 51    # Find all column references in the expression
 52    cols = []
 53    for col in re.compile( 'c[0-9]+' ).findall( expression ):
 54        try:
 55            cols.append( int( col[1:] ) - 1 )
 56        except:
 57            pass
 58 
 59    tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
 60    # Write the R header row to the temporary file
 61    hdr_str = "\t".join( "c%s" % str( col+1 ) for col in cols )
 62    tmp_file.write( "%s\n" % hdr_str )
 63    skipped_lines = 0
 64    first_invalid_line = 0
 65    i = 0
 66    for i, line in enumerate( file( datafile ) ):
 67        line = line.rstrip( '\r\n' )
 68        if line and not line.startswith( '#' ):
 69            valid = True
 70            fields = line.split( '\t' )
 71            # Write the R data row to the temporary file
 72            for col in cols:
 73                try:
 74                    float( fields[ col ] )
 75                except:
 76                    skipped_lines += 1
 77                    if not first_invalid_line:
 78                        first_invalid_line = i + 1
 79                    valid = False
 80                    break
 81            if valid:
 82                data_str = "\t".join( fields[ col ] for col in cols )
 83                tmp_file.write( "%s\n" % data_str )
 84    tmp_file.flush()
 85
 86    if skipped_lines == i + 1:
 87        stop_err( "Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements." )
 88    else:
 89        # summary function and return labels
 90        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
 91        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
 92        headings_str = "\t".join( headings )
 93        
 94        set_default_mode( NO_CONVERSION )
 95        r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )
 96        
 97        outfile = open( outfile_name, 'w' )
 98
 99        for col in re.compile( 'c[0-9]+' ).findall( expression ):
100            r.assign( col, r[ "$" ]( r_data_frame, col ) )
101        try:
102            summary = summary_func( r( expression ) )
103        except RException, s:
104            outfile.close()
105            stop_err( "Computation resulted in the following error: %s" % str( s ) )
106        summary = summary.as_py( BASIC_CONVERSION )
107        outfile.write( "#%s\n" % headings_str )
108        outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary[ k ] ) for k in headings ] ) )
109        outfile.close()
110
111        if skipped_lines:
112            print "Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % ( skipped_lines, first_invalid_line )        
113
# Run the tool only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()