PageRenderTime 30ms CodeModel.GetById 16ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/regVariation/rcve.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 143 lines | 124 code | 11 blank | 8 comment | 19 complexity | d65c61429aa4220abc57576d55d5ee9a MD5 | raw file
  1#!/usr/bin/env python
  2
  3from galaxy import eggs
  4
  5import sys, string
  6from rpy import *
  7import numpy
  8
  9def stop_err(msg):
 10    sys.stderr.write(msg)
 11    sys.exit()
 12
 13def sscombs(s):
 14    if len(s) == 1:
 15        return [s]
 16    else:
 17        ssc = sscombs(s[1:])
 18        return [s[0]] + [s[0]+comb for comb in ssc] + ssc
 19
 20
 21infile = sys.argv[1]
 22y_col = int(sys.argv[2])-1
 23x_cols = sys.argv[3].split(',')
 24outfile = sys.argv[4]
 25
 26print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
 27fout = open(outfile,'w')
 28
 29for i, line in enumerate( file ( infile )):
 30    line = line.rstrip('\r\n')
 31    if len( line )>0 and not line.startswith( '#' ):
 32        elems = line.split( '\t' )
 33        break 
 34    if i == 30:
 35        break # Hopefully we'll never get here...
 36
 37if len( elems )<1:
 38    stop_err( "The data in your input dataset is either missing or not formatted properly." )
 39
 40y_vals = []
 41x_vals = []
 42
 43for k,col in enumerate(x_cols):
 44    x_cols[k] = int(col)-1
 45    x_vals.append([])
 46    """
 47    try:
 48        float( elems[x_cols[k]] )
 49    except:
 50        try:
 51            msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
 52        except:
 53            msg = "This operation cannot be performed on non-numeric data."
 54        stop_err( msg )
 55    """
 56NA = 'NA'
 57for ind,line in enumerate( file( infile )):
 58    if line and not line.startswith( '#' ):
 59        try:
 60            fields = line.split("\t")
 61            try:
 62                yval = float(fields[y_col])
 63            except Exception, ey:
 64                yval = r('NA')
 65                #print >>sys.stderr, "ey = %s" %ey
 66            y_vals.append(yval)
 67            for k,col in enumerate(x_cols):
 68                try:
 69                    xval = float(fields[col])
 70                except Exception, ex:
 71                    xval = r('NA')
 72                    #print >>sys.stderr, "ex = %s" %ex
 73                x_vals[k].append(xval)
 74        except:
 75            pass
 76
 77x_vals1 = numpy.asarray(x_vals).transpose()
 78dat= r.list(x=array(x_vals1), y=y_vals)
 79
 80set_default_mode(NO_CONVERSION)
 81try:
 82    full = r.lm(r("y ~ x"), data= r.na_exclude(dat))    #full model includes all the predictor variables specified by the user
 83except RException, rex:
 84    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
 85set_default_mode(BASIC_CONVERSION)
 86
 87summary = r.summary(full)
 88fullr2 = summary.get('r.squared','NA')
 89
 90if fullr2 == 'NA':
 91    stop_error("Error in linear regression")
 92
 93if len(x_vals) < 10:
 94    s = ""
 95    for ch in range(len(x_vals)):
 96        s += str(ch)
 97else:
 98    stop_err("This tool only works with less than 10 predictors.")
 99
100print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
101all_combos = sorted(sscombs(s), key=len)
102all_combos.reverse()
103for j,cols in enumerate(all_combos):
104    #if len(cols) == len(s):    #Same as the full model above
105    #    continue
106    if len(cols) == 1:
107        x_vals1 = x_vals[int(cols)]
108    else:
109        x_v = []
110        for col in cols:
111            x_v.append(x_vals[int(col)])
112        x_vals1 = numpy.asarray(x_v).transpose()
113    dat= r.list(x=array(x_vals1), y=y_vals)
114    set_default_mode(NO_CONVERSION)
115    red = r.lm(r("y ~ x"), data= dat)    #Reduced model
116    set_default_mode(BASIC_CONVERSION)
117    summary = r.summary(red)
118    redr2 = summary.get('r.squared','NA')
119    try:
120        rcve = (float(fullr2)-float(redr2))/float(fullr2)
121    except:
122        rcve = 'NA'
123    col_str = ""
124    for col in cols:
125        col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
126    col_str.strip()
127    rcve_col_str = ""
128    for col in s:
129        if col not in cols:
130            rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
131    rcve_col_str.strip()
132    if len(cols) == len(s):    #full model
133        rcve_col_str = "-"
134        rcve = "-"
135    try:
136        redr2 = "%.4f" %(float(redr2))
137    except:
138        pass
139    try:
140        rcve = "%.4f" %(float(rcve))
141    except:
142        pass
143    print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)