
/HLI_ExomeseqQC_optimized/pymodules/python2.7/lib/python/statsmodels-0.5.0-py2.7-linux-x86_64.egg/statsmodels/sandbox/regression/penalized.py

https://gitlab.com/pooja043/Globus_Docker_4
# -*- coding: utf-8 -*-
"""linear model with Theil prior probabilistic restrictions, generalized Ridge

Created on Tue Dec 20 00:10:10 2011
Author: Josef Perktold
License: BSD-3

open issues
* selection of smoothing factor, strength of prior, cross validation
* GLS: does this really work this way?
* None of the inherited results have been checked yet;
  I'm not sure whether any need to be adjusted or whether only the
  interpretation changes. One question is which results are based on the
  likelihood (residuals) and which are based on the "posterior", for example
  bse and cov_params.
* helper functions to construct priors?
* increasing penalization for ordered regressors, e.g. polynomials
* compare with random/mixed effects/coefficient models, e.g. estimated priors

There is something fishy with the result instance: some things, e.g.
normalized_cov_params, don't look like they update correctly as we search
over lambda -> some stale state again?

I added df_model to the result class using the hat matrix, but df_model is
defined in the model instance, not in the result instance -> not clear where
the refactoring should occur. df_resid doesn't get updated correctly.

Problem with the definition of df_model: it has 1 subtracted for the constant.
"""

import numpy as np

import statsmodels.base.model as base
from statsmodels.regression.linear_model import OLS, GLS, RegressionResults


def atleast_2dcols(x):
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    return x


class TheilGLS(GLS):
    '''GLS with probabilistic restrictions

    essentially Bayes with informative prior

    note: I'm making up the GLS part, might work only for OLS
    '''
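
    # Added note summarizing what fit() below computes: this is the
    # Theil-Goldberger "mixed estimation" form of a generalized Ridge
    # estimator.  The stochastic prior restriction is R beta ~ (q, Sigma_prior),
    # and with lambd scaling the prior precision the penalized normal equations
    # solved in fit() are
    #     (X'X + sigma2_e * lambd * R' Sigma_prior^{-1} R) beta_hat
    #         = X'y + sigma2_e * lambd * R' Sigma_prior^{-1} q
    # where sigma2_e is mse_resid from an unpenalized GLS fit.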

    def __init__(self, endog, exog, r_matrix, q_matrix=None, sigma_prior=None,
                 sigma=None):
        self.r_matrix = np.asarray(r_matrix)
        if q_matrix is None:
            #default: restriction values are zero
            q_matrix = np.zeros(self.r_matrix.shape[0])
        self.q_matrix = atleast_2dcols(q_matrix)
        if sigma_prior is None:
            sigma_prior = np.eye(self.r_matrix.shape[0])
        elif np.size(sigma_prior) == 1:
            sigma_prior = sigma_prior * np.eye(self.r_matrix.shape[0])  #no numerical shortcuts
        self.sigma_prior = sigma_prior
        self.sigma_prior_inv = np.linalg.pinv(sigma_prior)  #or inv
        super(self.__class__, self).__init__(endog, exog, sigma=sigma)

    def fit(self, lambd=1.):
        #this does a duplicate transformation, but I need resid, not wresid
        res_gls = GLS(self.endog, self.exog, sigma=self.sigma).fit()
        self.res_gls = res_gls
        sigma2_e = res_gls.mse_resid

        r_matrix = self.r_matrix
        q_matrix = self.q_matrix
        sigma_prior_inv = self.sigma_prior_inv
        x = self.wexog
        y = self.wendog[:, None]
        #why are sigma2_e and lambd multiplied, not their ratio?
        #larger lambd -> stronger prior (it's not the variance)
        #print 'lambd inside fit', lambd
        xpx = np.dot(x.T, x) + \
              sigma2_e * lambd * np.dot(r_matrix.T, np.dot(sigma_prior_inv, r_matrix))
        xpy = np.dot(x.T, y) + \
              sigma2_e * lambd * np.dot(r_matrix.T, np.dot(sigma_prior_inv, q_matrix))
        #xpy = xpy[:,None]

        xpxi = np.linalg.pinv(xpx)
        params = np.dot(xpxi, xpy)  #or solve
        params = np.squeeze(params)
        self.normalized_cov_params = xpxi  #why attach it to self, i.e. the model?

        lfit = TheilRegressionResults(self, params,
                                      normalized_cov_params=xpxi)
        lfit.penalization_factor = lambd
        return lfit

    def fit_minic(self):
        #this doesn't make sense, since the number of parameters stays unchanged
        #need leave-one-out, gcv; or some penalization for weak priors
        #added extra penalization for lambd

        def get_bic(lambd):
            #return self.fit(lambd).bic #+lambd #+ 1./lambd #added 1/lambd for checking
            #return self.fit(lambd).gcv()
            #return self.fit(lambd).cv()
            return self.fit(lambd).aicc()

        from scipy import optimize
        lambd = optimize.fmin(get_bic, 1.)
        return lambd
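
    # Usage sketch (added; assumes a TheilGLS instance ``mod`` as in the
    # examples in the __main__ block below): despite its name, fit_minic
    # currently minimizes aicc() over lambd with scipy.optimize.fmin, e.g.
    #     lambd_opt = mod.fit_minic()
    #     res = mod.fit(lambd_opt)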


#TODO:
#I need the hatmatrix in the model if I want to do iterative fitting, e.g. GCV
#move to model or use it from a results instance inside the model,
#  each call to fit returns a results instance


class TheilRegressionResults(RegressionResults):

    #cache
    def hatmatrix_diag(self):
        '''
        diag(X' xpxi X)

        where xpxi = (X'X + lambd * sigma_prior)^{-1}

        Notes
        -----
        uses wexog, so this includes weights or sigma - check this case

        not clear whether I need to multiply by sigmahalf, i.e.
        (W^{-0.5} X) (X' W X)^{-1} (W^{-0.5} X)'   or
        (W X) (X' W X)^{-1} (W X)'

        projection y_hat = H y, or in terms of the transformed variables
        (W^{-0.5} y)

        might be wrong for the WLS and GLS case
        '''
        xpxi = self.model.normalized_cov_params
        #something fishy with self.normalized_cov_params in the result, doesn't update
        #print self.model.wexog.shape, np.dot(xpxi, self.model.wexog.T).shape
        return (self.model.wexog * np.dot(xpxi, self.model.wexog.T).T).sum(1)

    def hatmatrix_trace(self):
        return self.hatmatrix_diag().sum()

    #this doesn't update df_resid
    @property  #needs to be a property or attribute (no call)
    def df_model(self):
        return self.hatmatrix_trace()

    #Note: mse_resid uses df_resid, not nobs - k_vars, which might differ if
    #      df_model, tr(H), is used
    #in the paper for gcv, ess/nobs is used instead of mse_resid

    def gcv(self):
        return self.mse_resid / (1. - self.hatmatrix_trace() / self.nobs)**2

    def cv(self):
        return ((self.resid / (1. - self.hatmatrix_diag()))**2).sum() / self.nobs

    def aicc(self):
        aic = np.log(self.mse_resid) + 1
        aic += 2 * (1. + self.hatmatrix_trace()) / (self.nobs - self.hatmatrix_trace() - 2)
        return aic
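
    # Added note restating the selection criteria above, with tr(H) =
    # hatmatrix_trace() playing the role of the effective number of parameters:
    #     gcv  = mse_resid / (1 - tr(H) / nobs)**2
    #     cv   = mean of (resid_i / (1 - h_ii))**2     (leave-one-out shortcut)
    #     aicc = log(mse_resid) + 1 + 2 * (1 + tr(H)) / (nobs - tr(H) - 2)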


#contrast/restriction matrices, temporary location

def coef_restriction_meandiff(n_coeffs, n_vars=None, position=0):

    reduced = np.eye(n_coeffs) - 1. / n_coeffs
    if n_vars is None:
        return reduced
    else:
        full = np.zeros((n_coeffs, n_vars))
        full[:, position:position+n_coeffs] = reduced
        return full
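
# Added illustration: for n_coeffs=3, coef_restriction_meandiff(3) is
# np.eye(3) - 1/3, i.e. each row contrasts one coefficient with the mean of
# all three:
#     [[ 2/3, -1/3, -1/3],
#      [-1/3,  2/3, -1/3],
#      [-1/3, -1/3,  2/3]]
# With q = 0 this prior shrinks the coefficients toward their common mean.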


def coef_restriction_diffbase(n_coeffs, n_vars=None, position=0, base_idx=0):

    reduced = -np.eye(n_coeffs)  #make all rows, drop one row later
    reduced[:, base_idx] = 1
    keep = range(n_coeffs)
    del keep[base_idx]
    reduced = np.take(reduced, keep, axis=0)

    if n_vars is None:
        return reduced
    else:
        full = np.zeros((n_coeffs-1, n_vars))
        full[:, position:position+n_coeffs] = reduced
        return full
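
# Added illustration: for n_coeffs=3, base_idx=0 the reduced matrix is
#     [[1, -1,  0],
#      [1,  0, -1]]
# i.e. each row is the difference between the base coefficient and one of the
# others, so with q = 0 the prior shrinks beta_1 and beta_2 toward beta_0.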


def next_odd(d):
    return d + (1 - d % 2)


def coef_restriction_diffseq(n_coeffs, degree=1, n_vars=None, position=0, base_idx=0):
    #check boundaries, returns "valid" ?

    if degree == 1:
        diff_coeffs = [-1, 1]
        n_points = 2
    elif degree > 1:
        from scipy import misc
        n_points = next_odd(degree + 1)  #next odd integer after degree+1
        diff_coeffs = misc.central_diff_weights(n_points, ndiv=degree)

    dff = np.concatenate((diff_coeffs, np.zeros(n_coeffs - len(diff_coeffs))))
    from scipy import linalg
    reduced = linalg.toeplitz(dff, np.zeros(n_coeffs - len(diff_coeffs) + 1)).T
    #reduced = np.kron(np.eye(n_coeffs-n_points), diff_coeffs)

    if n_vars is None:
        return reduced
    else:
        full = np.zeros((n_coeffs-1, n_vars))
        full[:, position:position+n_coeffs] = reduced
        return full
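
# Added illustration: for degree=1 and n_coeffs=4 the reduced matrix is the
# first-difference operator
#     [[-1,  1,  0,  0],
#      [ 0, -1,  1,  0],
#      [ 0,  0, -1,  1]]
# so with q = 0 the prior penalizes differences between neighboring
# coefficients, i.e. a smoothness prior on an ordered coefficient sequence.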


##
## R = np.c_[np.zeros((n_groups, k_vars-1)), np.eye(n_groups)]
## r = np.zeros(n_groups)
## R = np.c_[np.zeros((n_groups-1, k_vars)),
##           np.eye(n_groups-1) - 1./n_groups * np.ones((n_groups-1, n_groups-1))]


if __name__ == '__main__':

    import numpy as np
    import statsmodels.api as sm

    examples = [2]

    np.random.seed(765367)
    np.random.seed(97653679)
    nsample = 100
    x = np.linspace(0, 10, nsample)
    X = sm.add_constant(np.column_stack((x, x**2, (x/5.)**3)), prepend=True)
    beta = np.array([10, 1, 0.1, 0.5])
    y = np.dot(X, beta) + np.random.normal(size=nsample)

    res_ols = sm.OLS(y, X).fit()

    R = [[0, 0, 0, 1]]
    r = [0]  #, 0, 0, 0]
    lambd = 1  #1e-4
    mod = TheilGLS(y, X, r_matrix=R, q_matrix=r, sigma_prior=lambd)
    res = mod.fit()
    print res_ols.params
    print res.params

    #example 2
    #I need more flexible penalization in the example, the penalization should
    #get stronger for higher order terms
    #np.random.seed(1)
    nobs = 200
    k_vars = 10
    k_true = 6
    sig_e = 0.25  #0.5

    x = np.linspace(-2, 2, nobs)
    #X = sm.add_constant(np.column_stack((x, x**2, (x/5.)**3)), prepend=True)
    X = (x / x.max())[:, None]**np.arange(k_vars)
    beta = np.zeros(k_vars)
    beta[:k_true] = np.array([1, -2, 0.5, 1.5, -0.1, 0.1])[:k_true]
    y_true = np.dot(X, beta)
    y = y_true + sig_e * np.random.normal(size=nobs)

    res_ols = sm.OLS(y, X).fit()

    #R = np.c_[np.zeros((k_vars-4, 4)), np.eye(k_vars-4)]  #has two large true coefficients penalized
    not_penalized = 4
    R = np.c_[np.zeros((k_vars-not_penalized, not_penalized)), np.eye(k_vars-not_penalized)]
    #increasingly strong penalization
    R = np.c_[np.zeros((k_vars-not_penalized, not_penalized)),
              np.diag((1 + 2*np.arange(k_vars-not_penalized)))]
    r = np.zeros(k_vars - not_penalized)
##    R = -coef_restriction_diffseq(6, 1, n_vars=10, position=4)  #doesn't make sense for polynomials
##    R = np.vstack((R, np.zeros(R.shape[1])))
##    R[-1, -1] = 1
    r = np.zeros(R.shape[0])

    lambd = 2  #1e-4
    mod = TheilGLS(y, X, r_matrix=R, q_matrix=r, sigma_prior=lambd)
    res = mod.fit()
    print res_ols.params
    print res.params

    res_bic = mod.fit_minic()  #this will just return zero
    res = mod.fit(res_bic)
    print res_bic
    for lambd in np.linspace(0, 80, 21):
        res_l = mod.fit(lambd)
        #print lambd, res_l.params[-2:], res_l.bic, res_l.bic + 1./lambd, res.df_model
        print lambd, res_l.params[-2:], res_l.bic, res.df_model, np.trace(res.normalized_cov_params)

    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(beta, 'k-o', label='true')
    plt.plot(res_ols.params, '-o', label='ols')
    plt.plot(res.params, '-o', label='theil')
    plt.legend()
    plt.title('Polynomial fitting: estimated coefficients')

    plt.figure()
    plt.plot(y, 'o')
    plt.plot(y_true, 'k-', label='true')
    plt.plot(res_ols.fittedvalues, '-', label='ols')
    plt.plot(res.fittedvalues, '-', label='theil')
    plt.legend()
    plt.title('Polynomial fitting: fitted values')

    #plt.show()

    if 3 in examples:
        #example 3
        nobs = 600
        nobs_i = 20
        n_groups = nobs // nobs_i
        k_vars = 3

        from statsmodels.sandbox.panel.random_panel import PanelSample
        dgp = PanelSample(nobs, k_vars, n_groups)
        dgp.group_means = 2 + np.random.randn(n_groups)  #add random intercept
        print 'seed', dgp.seed
        y = dgp.generate_panel()
        X = np.column_stack((dgp.exog[:, 1:],
                             dgp.groups[:, None] == np.arange(n_groups)))
        res_ols = sm.OLS(y, X).fit()

        R = np.c_[np.zeros((n_groups, k_vars-1)), np.eye(n_groups)]
        r = np.zeros(n_groups)
        R = np.c_[np.zeros((n_groups-1, k_vars)),
                  np.eye(n_groups-1) - 1./n_groups * np.ones((n_groups-1, n_groups-1))]
        r = np.zeros(n_groups-1)
        R[:, k_vars-1] = -1

        lambd = 1  #1e-4
        mod = TheilGLS(y, X, r_matrix=R, q_matrix=r, sigma_prior=lambd)
        res = mod.fit()
        print res.params

        params_l = []
        for lambd in np.linspace(0, 20, 21):
            params_l.append(mod.fit(5.*lambd).params)
        params_l = np.array(params_l)

        plt.figure()
        plt.plot(params_l.T)
        plt.title('Panel Data with random intercept: shrinkage to being equal')
        plt.xlabel('parameter index')

        plt.figure()
        plt.plot(params_l[:, k_vars:])
        plt.title('Panel Data with random intercept: shrinkage to being equal')
        plt.xlabel('strength of prior')

        #plt.show()