rcve.py - This Python script performs a linear regression a…

/tools/regVariation/rcve.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 143 lines · 116 code · 17 blank · 10 comment · 35 complexity · d65c61429aa4220abc57576d55d5ee9a MD5 · raw file


#!/usr/bin/env python

from galaxy import eggs

import sys, string
from rpy import *
import numpy

def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()

def sscombs(s):
    if len(s) == 1:
        return [s]
    else:
        ssc = sscombs(s[1:])
        return [s[0]] + [s[0]+comb for comb in ssc] + ssc


infile = sys.argv[1]
y_col = int(sys.argv[2])-1
x_cols = sys.argv[3].split(',')
outfile = sys.argv[4]

print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
fout = open(outfile,'w')

for i, line in enumerate( file ( infile )):
    line = line.rstrip('\r\n')
    if len( line )>0 and not line.startswith( '#' ):
        elems = line.split( '\t' )
        break 
    if i == 30:
        break # Hopefully we'll never get here...

if len( elems )<1:
    stop_err( "The data in your input dataset is either missing or not formatted properly." )

y_vals = []
x_vals = []

for k,col in enumerate(x_cols):
    x_cols[k] = int(col)-1
    x_vals.append([])
    """
    try:
        float( elems[x_cols[k]] )
    except:
        try:
            msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
        except:
            msg = "This operation cannot be performed on non-numeric data."
        stop_err( msg )
    """
NA = 'NA'
for ind,line in enumerate( file( infile )):
    if line and not line.startswith( '#' ):
        try:
            fields = line.split("\t")
            try:
                yval = float(fields[y_col])
            except Exception, ey:
                yval = r('NA')
                #print >>sys.stderr, "ey = %s" %ey
            y_vals.append(yval)
            for k,col in enumerate(x_cols):
                try:
                    xval = float(fields[col])
                except Exception, ex:
                    xval = r('NA')
                    #print >>sys.stderr, "ex = %s" %ex
                x_vals[k].append(xval)
        except:
            pass

x_vals1 = numpy.asarray(x_vals).transpose()
dat= r.list(x=array(x_vals1), y=y_vals)

set_default_mode(NO_CONVERSION)
try:
    full = r.lm(r("y ~ x"), data= r.na_exclude(dat))    #full model includes all the predictor variables specified by the user
except RException, rex:
    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
set_default_mode(BASIC_CONVERSION)

summary = r.summary(full)
fullr2 = summary.get('r.squared','NA')

if fullr2 == 'NA':
    stop_error("Error in linear regression")

if len(x_vals) < 10:
    s = ""
    for ch in range(len(x_vals)):
        s += str(ch)
else:
    stop_err("This tool only works with less than 10 predictors.")

print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
all_combos = sorted(sscombs(s), key=len)
all_combos.reverse()
for j,cols in enumerate(all_combos):
    #if len(cols) == len(s):    #Same as the full model above
    #    continue
    if len(cols) == 1:
        x_vals1 = x_vals[int(cols)]
    else:
        x_v = []
        for col in cols:
            x_v.append(x_vals[int(col)])
        x_vals1 = numpy.asarray(x_v).transpose()
    dat= r.list(x=array(x_vals1), y=y_vals)
    set_default_mode(NO_CONVERSION)
    red = r.lm(r("y ~ x"), data= dat)    #Reduced model
    set_default_mode(BASIC_CONVERSION)
    summary = r.summary(red)
    redr2 = summary.get('r.squared','NA')
    try:
        rcve = (float(fullr2)-float(redr2))/float(fullr2)
    except:
        rcve = 'NA'
    col_str = ""
    for col in cols:
        col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
    col_str.strip()
    rcve_col_str = ""
    for col in s:
        if col not in cols:
            rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
    rcve_col_str.strip()
    if len(cols) == len(s):    #full model
        rcve_col_str = "-"
        rcve = "-"
    try:
        redr2 = "%.4f" %(float(redr2))
    except:
        pass
    try:
        rcve = "%.4f" %(float(rcve))
    except:
        pass
    print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)

Summary ✨

This Python script performs a linear regression analysis on a dataset and outputs the R-squared value, RCVE (Reduced Columns Variance Explained), and the list of columns that contribute to the most variance in the response variable.

Tech Fingerprint

Alerts (11)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
6
'def' Ensure functions have docstrings for documentation
9 13
'open(' Use 'with open()' to ensure Files are properly closed
27
Complexity hotspot; lines 57 to 59 (total complexity: 4)
57 58 59
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
74 121 137 141