best_regression_subsets.py - This Python script uses R to p…

/tools/regVariation/best_regression_subsets.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 90 lines · 71 code · 18 blank · 1 comment · 19 complexity · 23a774896b797f177edb7b97479b92aa MD5 · raw file


#!/usr/bin/env python

from galaxy import eggs

import sys, string
from rpy import *
import numpy

def stop_err(msg):
    """Report a fatal error and terminate the script.

    Writes *msg* to standard error exactly as given (no newline is added)
    and then exits with status 1, so that a caller inspecting the exit code
    sees the run as failed rather than successful.

    Args:
        msg: Text to write to standard error.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would report status 0 (success) despite the error.
    sys.exit(1)

# Positional command-line arguments:
#   1: input file (tab-delimited)
#   2: response column number (1-based on the command line)
#   3: comma-separated predictor column numbers
#   4: output file for the summary table
#   5: output file for the plots
infile = sys.argv[1]
y_col = int(sys.argv[2]) - 1       # kept 0-based from here on
x_cols = sys.argv[3].split(',')    # still strings at this point
outfile, outfile2 = sys.argv[4], sys.argv[5]

# Echo the selected columns on standard output (response shown 1-based again).
sys.stdout.write("Predictor columns: %s; Response column: %d" % (x_cols, y_col + 1) + "\n")

# The summary table is written through this handle further down the script.
fout = open(outfile, 'w')

# Sanity check: look for the first non-blank, non-comment line within the
# first 31 lines of the input and split it into tab-separated fields.
# `elems` starts out empty so that an empty or comment-only input reaches the
# error message below instead of failing with a NameError.
elems = []
probe = open(infile)
try:
    for line_no, raw_line in enumerate(probe):
        candidate = raw_line.rstrip('\r\n')
        if candidate and not candidate.startswith('#'):
            elems = candidate.split('\t')
            break
        if line_no == 30:
            break  # give up after 31 lines without a data row
finally:
    # Close the handle explicitly instead of leaving it to garbage collection.
    probe.close()

if not elems:
    stop_err("The data in your input dataset is either missing or not formatted properly.")

def _parse_cell(fields, col, missing):
    """Return fields[col] as a float, or *missing* if it is absent or non-numeric."""
    try:
        return float(fields[col])
    except (ValueError, IndexError):
        # ValueError: the cell is not a number; IndexError: the row is too short.
        return missing


# Response values, one entry per data row.
y_vals = []

# Predictor column numbers arrive as 1-based strings; keep them as 0-based ints.
x_cols = [int(col) - 1 for col in x_cols]

# One list of values per predictor column, filled row by row below.
x_vals = [[] for _ in x_cols]

# R's NA marker is the same for every missing cell, so evaluate it only once.
na_value = r('NA')

data_handle = open(infile)
try:
    for line in data_handle:
        # Lines starting with '#' are skipped.  Blank lines are NOT skipped
        # (they still contain a newline), so they add a row of NA values.
        if line.startswith('#'):
            continue
        fields = line.split("\t")
        # Every kept row appends exactly one value to y_vals and to each
        # x_vals[k], so the columns always stay the same length.
        y_vals.append(_parse_cell(fields, y_col, na_value))
        for k, col in enumerate(x_cols):
            x_vals[k].append(_parse_cell(fields, col, na_value))
finally:
    data_handle.close()

# Build the R data structure and run the best-subsets search.

# NOTE(review): response_term is assigned here but not referenced anywhere
# else in this script.
response_term = ""

# x_vals holds one list per predictor column; transposing yields one row per
# input line and one column per predictor.
x_vals1 = numpy.asarray(x_vals).transpose()

# Bundle predictors and response under the names used by the formula below.
# NOTE(review): `array` is not defined in this file; presumably it is supplied
# by `from rpy import *` - confirm.
dat= r.list(x=array(x_vals1), y=y_vals)

# Load the R package "leaps" (presumably the provider of regsubsets - confirm).
r.library("leaps")
 
# Switch rpy's default conversion mode to NO_CONVERSION for the regsubsets
# call.  Presumably this keeps `leaps` as an R object so it can be handed back
# to r.summary() and r.plot() later - TODO confirm against the rpy manual.
set_default_mode(NO_CONVERSION)
try:
    leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat))
except RException, rex:
    # stop_err writes the message to stderr and exits, so the mode reset below
    # is only reached when the call succeeded.
    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
# Restore BASIC_CONVERSION for the remaining R calls.
set_default_mode(BASIC_CONVERSION)

# Summarise the regsubsets result; the fields used below are looked up by key
# ('outmat', 'rsq', 'adjr2', 'cp', 'bic').
summary = r.summary(leaps)

# Header label listing the predictor columns by 1-based number, e.g. "[c2 c3]".
# x_cols holds 0-based integers at this point.
pattern = "[" + " ".join(["c%d" % (col + 1) for col in x_cols]) + "]"

try:
    fout.write("#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic\n" % pattern)
    for ind, item in enumerate(summary['outmat']):
        # The number of '*' characters in the row's text form is reported as
        # the number of variables in that subset.
        fout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
            str(item).count('*'),
            item,
            summary['rsq'][ind],
            summary['adjr2'][ind],
            summary['cp'][ind],
            summary['bic'][ind],
        ))
finally:
    # Close (and thereby flush) the table now; nothing later in the script
    # writes to it, and this way it is on disk before the R plotting calls run.
    fout.close()


# Draw one best-subsets plot per selection criterion, in this order.
# Each entry is (value passed as `scale`, plot title).
criterion_plots = (
    ("Cp", "Best subsets using Cp Criterion"),
    ("r2", "Best subsets using R-sq Criterion"),
    ("adjr2", "Best subsets using Adjusted R-sq Criterion"),
    ("bic", "Best subsets using bic Criterion"),
)

# Open the PDF device on the second output file, plot, then close the device.
r.pdf(outfile2, 8, 8)
for plot_scale, plot_title in criterion_plots:
    r.plot(leaps, scale=plot_scale, main=plot_title)

r.dev_off()

Summary ✨

This Python script uses R (through the rpy bridge) and the leaps package to identify the best subsets of predictor variables for a linear regression of the response variable in a tab-delimited dataset. The script takes five command-line arguments: the input dataset, the column number of the response variable, a comma-separated list of the column numbers of the predictor variables, the output file for the summary statistics, and the output file for the plots. The script first reads the data from the input file, replacing missing or non-numeric values with NA, and then passes it to the leaps package's regsubsets() function with the formula y ~ x (it does not call R's lm() directly). It outputs the results to two files: a tab-delimited summary statistics file (number of variables, selected predictors, R-sq, adjusted R-sq, Cp and BIC for each subset) and a PDF plot file with one plot per criterion.

Tech Fingerprint

Alerts (7)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
6
'def' Ensure functions have docstrings for documentation
9
'open(' Use 'with open()' to ensure Files are properly closed
19
Complexity hotspot; lines 40 to 42 (total complexity: 4)
40 41 42
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
55