gsummary.py - This is a Python script that calculates summary statistics for an expression over columns of a tab-delimited file.

/tools/stats/gsummary.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 114 lines · 91 code · 15 blank · 8 comment · 31 complexity · a7ea785f992504f41d51b08cfdd64b4a MD5 · raw file


#!/usr/bin/env python

import sys, re, tempfile
from rpy import *
# Older py compatibility: fall back to sets.Set when the interpreter has no
# builtin set type.  Only the NameError raised by a missing builtin is
# expected here; anything else should surface instead of being swallowed.
try:
    set()
except NameError:
    from sets import Set as set

# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 ), "gsummary.py requires Python 2.4 or newer"

def stop_err( msg ):
    """Report a fatal error and terminate the script.

    msg -- text written to standard error exactly as given (no newline is
           appended).

    Raises SystemExit with status 1.
    """
    sys.stderr.write( msg )
    # A bare sys.exit() exits with status 0, which tells the calling process
    # that the run succeeded even though we are aborting on an error.
    sys.exit( 1 )

def S3_METHODS( all="key" ):
    """Return the function names and operator tokens accepted in an expression.

    all -- selector; only the value "key" (the default) is recognized.  The
           parameter name shadows the builtin all() but is kept so existing
           keyword callers continue to work.

    Returns a dict with two lists when all == "key":
        'Math' -- allowed function names
        'Ops'  -- allowed operator and punctuation tokens
    Returns None for any other selector value.
    """
    Group_Math =  [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
        "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
        "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
        "cumsum", "cumprod", "cummax", "cummin", "c" ]
    Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
    # Compare by value: "is" tests object identity and only matched the
    # default argument because the interpreter happens to intern the literal.
    if all == "key":
        return { 'Math' : Group_Math, 'Ops' : Group_Ops }
    return None

def _validate_expression( expression ):
    """Abort through stop_err() unless expression only uses whitelisted tokens.

    expression -- the user-supplied expression string.

    The expression is later handed to R for evaluation, so this whitelist is
    the only thing standing between user input and R's evaluator.
    """
    allowed = S3_METHODS()
    math_allowed = allowed[ 'Math' ]
    ops_allowed = allowed[ 'Ops' ]

    # Every alphabetic run must be an allowed name.  The "c" of a column
    # reference such as c3 passes because "c" is itself in the 'Math' list.
    for word in re.findall( r'[a-zA-Z]+', expression ):
        if word not in math_allowed:
            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" %( expression, word ) )

    # Every run of remaining non-blank characters must be exactly one allowed
    # token, so adjacent operators without a separating space (e.g. "))") are
    # rejected.  NOTE: upper-case letters also land in this class, so a name
    # containing a capital letter fails here even if it passed the check above.
    symbols = set()
    for symbol in re.findall( r'[^a-z0-9\s]+', expression ):
        if symbol not in ops_allowed:
            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
        symbols.add( symbol )
    if len( symbols ) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )

def _write_r_table( datafile, cols, tmp_file ):
    """Copy the usable rows of datafile into tmp_file as a tab-separated table.

    datafile -- path of the tab-delimited input file.
    cols     -- 0-based indexes of the columns to copy.
    tmp_file -- open, writable file object that receives a header row
                ("c<N>" per column, 1-based) followed by one row per line
                whose selected fields all parse as floats.

    Empty lines and lines starting with '#' are ignored.  Lines with a
    missing or non-numeric selected field are counted as skipped.

    Returns ( valid_rows, skipped_lines, first_invalid_line ) where
    first_invalid_line is the 1-based line number of the first skipped line,
    or 0 when nothing was skipped.
    """
    # Write the R header row to the temporary file
    tmp_file.write( "%s\n" % "\t".join( [ "c%d" % ( col + 1 ) for col in cols ] ) )
    valid_rows = 0
    skipped_lines = 0
    first_invalid_line = 0
    infile = open( datafile )
    try:
        for i, line in enumerate( infile ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            fields = line.split( '\t' )
            try:
                for col in cols:
                    float( fields[ col ] )
            except ( ValueError, IndexError ):
                # ValueError: field is not numeric; IndexError: line has too few fields
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = i + 1
                continue
            # Write the R data row to the temporary file
            tmp_file.write( "%s\n" % "\t".join( [ fields[ col ] for col in cols ] ) )
            valid_rows += 1
    finally:
        infile.close()
    # Make sure R sees the complete table when it reads the file by name
    tmp_file.flush()
    return valid_rows, skipped_lines, first_invalid_line

def main():
    """Compute summary statistics of an expression over columns of a tabular file.

    Command line: gsummary.py input_file output_file expression

    The expression may reference columns as c1, c2, ... (1-based) and may only
    use the names and operators returned by S3_METHODS().  The referenced
    columns are copied to a temporary file, loaded into R through rpy, and the
    expression is evaluated in R.  The output file receives a '#'-prefixed
    header line and one line holding sum, mean, stdev and the 0/25/50/75/100%
    quantiles, tab separated.

    All failures are reported through stop_err(), which terminates the script.
    The syntax is kept valid for Python 2.4 (no "with", no "except ... as").
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err( 'Usage: python gsummary.py input_file output_file expression' )

    # Check for invalid expressions
    _validate_expression( expression )

    # Find all column references in the expression
    column_names = re.findall( r'c[0-9]+', expression )
    cols = []
    for col in column_names:
        index = int( col[1:] ) - 1
        if index < 0:
            # c0 would otherwise become index -1 and silently select the last field
            stop_err( "Invalid column reference '%s' in expression '%s': columns are numbered starting at 1" % ( col, expression ) )
        cols.append( index )

    tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
    try:
        valid_rows, skipped_lines, first_invalid_line = _write_r_table( datafile, cols, tmp_file )

        # Counting usable rows (rather than comparing the skip count with the
        # last line index) also catches empty files and files where comment or
        # blank lines are mixed with nothing but invalid data lines.
        if not valid_rows:
            stop_err( "Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements." )

        # summary function and return labels
        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
        headings_str = "\t".join( headings )

        set_default_mode( NO_CONVERSION )
        r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )

        outfile = open( outfile_name, 'w' )
        try:
            # Expose each referenced column to R under its own name
            for col in column_names:
                r.assign( col, r[ "$" ]( r_data_frame, col ) )
            try:
                # SECURITY: evaluates user input in R; relies on _validate_expression() above
                summary = summary_func( r( expression ) )
            except RException:
                # sys.exc_info() avoids both "except X, s" and "except X as s",
                # neither of which parses on every supported interpreter
                stop_err( "Computation resulted in the following error: %s" % str( sys.exc_info()[1] ) )
            summary = summary.as_py( BASIC_CONVERSION )
            outfile.write( "#%s\n" % headings_str )
            outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary[ k ] ) for k in headings ] ) )
        finally:
            # Runs on success, on unexpected errors and on the SystemExit raised by stop_err()
            outfile.close()

        if skipped_lines:
            print( "Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % ( skipped_lines, first_invalid_line ) )
    finally:
        # Closing a NamedTemporaryFile also removes it from disk
        tmp_file.close()

# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Summary ✨

This is a Python script that calculates summary statistics (sum, mean, standard deviation, and the 0%, 25%, 50%, 75% and 100% quantiles) for a given expression, using R through the rpy bridge. It takes three arguments: an input file, an output file, and an expression. The script first checks that the expression is valid by checking that every term is a recognized math function and every symbol is a recognized operator. If any term or operator is not recognized, it writes an error message to standard error and exits. It then finds the column references (c1, c2, …) in the expression and writes a header row with those column names to a temporary file. Next it loops through each line in the input file, skipping any lines that start with ‘#’ (comments) or are empty. For each remaining line whose referenced columns all hold numeric values, it extracts those values and writes them to the temporary file as an R data row; lines with missing or non-numeric values are counted as skipped. After all lines have been processed, it reads the temporary file into an R data frame using read.table() (called as r.read_table() through rpy) and assigns each referenced column to an R variable of the same name using the $ operator. It then evaluates the expression in R, passes the result to a custom R function built from sum(), mean(), sd() and quantile() (not R’s summary()), and writes the results to the output file in a tab-delimited format with a header line naming each summary statistic. If any lines were skipped due to invalid values, it prints a message indicating how many lines were skipped and where they begin.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (13)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
4
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
8 31 56 75
'def' Ensure functions have docstrings for documentation
13 17 26
Complexity hotspot; lines 38 to 39 (total complexity: 3)
38 39
Complexity hotspot; lines 42 to 43 (total complexity: 3)
42 43
'open(' Use 'with open()' to ensure Files are properly closed
97