/tools/regVariation/rcve.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 143 lines · 116 code · 17 blank · 10 comment · 35 complexity · d65c61429aa4220abc57576d55d5ee9a MD5 · raw file

  1. #!/usr/bin/env python
  2. from galaxy import eggs
  3. import sys, string
  4. from rpy import *
  5. import numpy
  6. def stop_err(msg):
  7. sys.stderr.write(msg)
  8. sys.exit()
  9. def sscombs(s):
  10. if len(s) == 1:
  11. return [s]
  12. else:
  13. ssc = sscombs(s[1:])
  14. return [s[0]] + [s[0]+comb for comb in ssc] + ssc
  15. infile = sys.argv[1]
  16. y_col = int(sys.argv[2])-1
  17. x_cols = sys.argv[3].split(',')
  18. outfile = sys.argv[4]
  19. print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
  20. fout = open(outfile,'w')
  21. for i, line in enumerate( file ( infile )):
  22. line = line.rstrip('\r\n')
  23. if len( line )>0 and not line.startswith( '#' ):
  24. elems = line.split( '\t' )
  25. break
  26. if i == 30:
  27. break # Hopefully we'll never get here...
  28. if len( elems )<1:
  29. stop_err( "The data in your input dataset is either missing or not formatted properly." )
  30. y_vals = []
  31. x_vals = []
  32. for k,col in enumerate(x_cols):
  33. x_cols[k] = int(col)-1
  34. x_vals.append([])
  35. """
  36. try:
  37. float( elems[x_cols[k]] )
  38. except:
  39. try:
  40. msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
  41. except:
  42. msg = "This operation cannot be performed on non-numeric data."
  43. stop_err( msg )
  44. """
  45. NA = 'NA'
  46. for ind,line in enumerate( file( infile )):
  47. if line and not line.startswith( '#' ):
  48. try:
  49. fields = line.split("\t")
  50. try:
  51. yval = float(fields[y_col])
  52. except Exception, ey:
  53. yval = r('NA')
  54. #print >>sys.stderr, "ey = %s" %ey
  55. y_vals.append(yval)
  56. for k,col in enumerate(x_cols):
  57. try:
  58. xval = float(fields[col])
  59. except Exception, ex:
  60. xval = r('NA')
  61. #print >>sys.stderr, "ex = %s" %ex
  62. x_vals[k].append(xval)
  63. except:
  64. pass
  65. x_vals1 = numpy.asarray(x_vals).transpose()
  66. dat= r.list(x=array(x_vals1), y=y_vals)
  67. set_default_mode(NO_CONVERSION)
  68. try:
  69. full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user
  70. except RException, rex:
  71. stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
  72. set_default_mode(BASIC_CONVERSION)
  73. summary = r.summary(full)
  74. fullr2 = summary.get('r.squared','NA')
  75. if fullr2 == 'NA':
  76. stop_error("Error in linear regression")
  77. if len(x_vals) < 10:
  78. s = ""
  79. for ch in range(len(x_vals)):
  80. s += str(ch)
  81. else:
  82. stop_err("This tool only works with less than 10 predictors.")
  83. print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
  84. all_combos = sorted(sscombs(s), key=len)
  85. all_combos.reverse()
  86. for j,cols in enumerate(all_combos):
  87. #if len(cols) == len(s): #Same as the full model above
  88. # continue
  89. if len(cols) == 1:
  90. x_vals1 = x_vals[int(cols)]
  91. else:
  92. x_v = []
  93. for col in cols:
  94. x_v.append(x_vals[int(col)])
  95. x_vals1 = numpy.asarray(x_v).transpose()
  96. dat= r.list(x=array(x_vals1), y=y_vals)
  97. set_default_mode(NO_CONVERSION)
  98. red = r.lm(r("y ~ x"), data= dat) #Reduced model
  99. set_default_mode(BASIC_CONVERSION)
  100. summary = r.summary(red)
  101. redr2 = summary.get('r.squared','NA')
  102. try:
  103. rcve = (float(fullr2)-float(redr2))/float(fullr2)
  104. except:
  105. rcve = 'NA'
  106. col_str = ""
  107. for col in cols:
  108. col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
  109. col_str.strip()
  110. rcve_col_str = ""
  111. for col in s:
  112. if col not in cols:
  113. rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
  114. rcve_col_str.strip()
  115. if len(cols) == len(s): #full model
  116. rcve_col_str = "-"
  117. rcve = "-"
  118. try:
  119. redr2 = "%.4f" %(float(redr2))
  120. except:
  121. pass
  122. try:
  123. rcve = "%.4f" %(float(rcve))
  124. except:
  125. pass
  126. print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)