PageRenderTime 56ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/stats/ols.py

http://github.com/wesm/pandas
Python | 1376 lines | 1218 code | 64 blank | 94 comment | 33 complexity | 7fc01ad8d8698f49e0779b65c697a0fc MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. Ordinary least squares regression
  3. """
  4. # pylint: disable-msg=W0201
  5. # flake8: noqa
  6. from pandas.compat import zip, range, StringIO
  7. from itertools import starmap
  8. from pandas import compat
  9. import numpy as np
  10. from pandas.core.api import DataFrame, Series, isnull
  11. from pandas.core.base import StringMixin
  12. from pandas.types.common import _ensure_float64
  13. from pandas.core.index import MultiIndex
  14. from pandas.core.panel import Panel
  15. from pandas.util.decorators import cache_readonly
  16. import pandas.stats.common as scom
  17. import pandas.stats.math as math
  18. import pandas.stats.moments as moments
  19. _FP_ERR = 1e-8
  20. class OLS(StringMixin):
  21. """
  22. Runs a full sample ordinary least squares regression.
  23. Parameters
  24. ----------
  25. y : Series
  26. x : Series, DataFrame, dict of Series
  27. intercept : bool
  28. True if you want an intercept.
  29. weights : array-like, optional
  30. 1d array of weights. If you supply 1/W then the variables are pre-
  31. multiplied by 1/sqrt(W). If no weights are supplied the default value
  32. is 1 and WLS reults are the same as OLS.
  33. nw_lags : None or int
  34. Number of Newey-West lags.
  35. nw_overlap : boolean, default False
  36. Assume data is overlapping when computing Newey-West estimator
  37. """
  38. _panel_model = False
  39. def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
  40. nw_overlap=False):
  41. import warnings
  42. warnings.warn("The pandas.stats.ols module is deprecated and will be "
  43. "removed in a future version. We refer to external packages "
  44. "like statsmodels, see some examples here: "
  45. "http://www.statsmodels.org/stable/regression.html",
  46. FutureWarning, stacklevel=4)
  47. try:
  48. import statsmodels.api as sm
  49. except ImportError:
  50. import scikits.statsmodels.api as sm
  51. self._x_orig = x
  52. self._y_orig = y
  53. self._weights_orig = weights
  54. self._intercept = intercept
  55. self._nw_lags = nw_lags
  56. self._nw_overlap = nw_overlap
  57. (self._y, self._x, self._weights, self._x_filtered,
  58. self._index, self._time_has_obs) = self._prepare_data()
  59. if self._weights is not None:
  60. self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
  61. self._y_trans = self._y * np.sqrt(self._weights)
  62. self.sm_ols = sm.WLS(self._y.get_values(),
  63. self._x.get_values(),
  64. weights=self._weights.values).fit()
  65. else:
  66. self._x_trans = self._x
  67. self._y_trans = self._y
  68. self.sm_ols = sm.OLS(self._y.get_values(),
  69. self._x.get_values()).fit()
  70. def _prepare_data(self):
  71. """
  72. Cleans the input for single OLS.
  73. Parameters
  74. ----------
  75. lhs: Series
  76. Dependent variable in the regression.
  77. rhs: dict, whose values are Series, DataFrame, or dict
  78. Explanatory variables of the regression.
  79. Returns
  80. -------
  81. Series, DataFrame
  82. Cleaned lhs and rhs
  83. """
  84. (filt_lhs, filt_rhs, filt_weights,
  85. pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig,
  86. self._weights_orig)
  87. if self._intercept:
  88. filt_rhs['intercept'] = 1.
  89. pre_filt_rhs['intercept'] = 1.
  90. if hasattr(filt_weights, 'to_dense'):
  91. filt_weights = filt_weights.to_dense()
  92. return (filt_lhs, filt_rhs, filt_weights,
  93. pre_filt_rhs, index, valid)
  94. @property
  95. def nobs(self):
  96. return self._nobs
  97. @property
  98. def _nobs(self):
  99. return len(self._y)
  100. @property
  101. def nw_lags(self):
  102. return self._nw_lags
  103. @property
  104. def x(self):
  105. """Returns the filtered x used in the regression."""
  106. return self._x
  107. @property
  108. def y(self):
  109. """Returns the filtered y used in the regression."""
  110. return self._y
  111. @cache_readonly
  112. def _beta_raw(self):
  113. """Runs the regression and returns the beta."""
  114. return self.sm_ols.params
  115. @cache_readonly
  116. def beta(self):
  117. """Returns the betas in Series form."""
  118. return Series(self._beta_raw, index=self._x.columns)
  119. @cache_readonly
  120. def _df_raw(self):
  121. """Returns the degrees of freedom."""
  122. return math.rank(self._x.values)
  123. @cache_readonly
  124. def df(self):
  125. """Returns the degrees of freedom.
  126. This equals the rank of the X matrix.
  127. """
  128. return self._df_raw
  129. @cache_readonly
  130. def _df_model_raw(self):
  131. """Returns the raw model degrees of freedom."""
  132. return self.sm_ols.df_model
  133. @cache_readonly
  134. def df_model(self):
  135. """Returns the degrees of freedom of the model."""
  136. return self._df_model_raw
  137. @cache_readonly
  138. def _df_resid_raw(self):
  139. """Returns the raw residual degrees of freedom."""
  140. return self.sm_ols.df_resid
  141. @cache_readonly
  142. def df_resid(self):
  143. """Returns the degrees of freedom of the residuals."""
  144. return self._df_resid_raw
  145. @cache_readonly
  146. def _f_stat_raw(self):
  147. """Returns the raw f-stat value."""
  148. from scipy.stats import f
  149. cols = self._x.columns
  150. if self._nw_lags is None:
  151. F = self._r2_raw / (self._r2_raw - self._r2_adj_raw)
  152. q = len(cols)
  153. if 'intercept' in cols:
  154. q -= 1
  155. shape = q, self.df_resid
  156. p_value = 1 - f.cdf(F, shape[0], shape[1])
  157. return F, shape, p_value
  158. k = len(cols)
  159. R = np.eye(k)
  160. r = np.zeros((k, 1))
  161. try:
  162. intercept = cols.get_loc('intercept')
  163. R = np.concatenate((R[0: intercept], R[intercept + 1:]))
  164. r = np.concatenate((r[0: intercept], r[intercept + 1:]))
  165. except KeyError:
  166. # no intercept
  167. pass
  168. return math.calc_F(R, r, self._beta_raw, self._var_beta_raw,
  169. self._nobs, self.df)
  170. @cache_readonly
  171. def f_stat(self):
  172. """Returns the f-stat value."""
  173. return f_stat_to_dict(self._f_stat_raw)
  174. def f_test(self, hypothesis):
  175. """Runs the F test, given a joint hypothesis. The hypothesis is
  176. represented by a collection of equations, in the form
  177. A*x_1+B*x_2=C
  178. You must provide the coefficients even if they're 1. No spaces.
  179. The equations can be passed as either a single string or a
  180. list of strings.
  181. Examples
  182. --------
  183. o = ols(...)
  184. o.f_test('1*x1+2*x2=0,1*x3=0')
  185. o.f_test(['1*x1+2*x2=0','1*x3=0'])
  186. """
  187. x_names = self._x.columns
  188. R = []
  189. r = []
  190. if isinstance(hypothesis, str):
  191. eqs = hypothesis.split(',')
  192. elif isinstance(hypothesis, list):
  193. eqs = hypothesis
  194. else: # pragma: no cover
  195. raise Exception('hypothesis must be either string or list')
  196. for equation in eqs:
  197. row = np.zeros(len(x_names))
  198. lhs, rhs = equation.split('=')
  199. for s in lhs.split('+'):
  200. ss = s.split('*')
  201. coeff = float(ss[0])
  202. x_name = ss[1]
  203. if x_name not in x_names:
  204. raise Exception('no coefficient named %s' % x_name)
  205. idx = x_names.get_loc(x_name)
  206. row[idx] = coeff
  207. rhs = float(rhs)
  208. R.append(row)
  209. r.append(rhs)
  210. R = np.array(R)
  211. q = len(r)
  212. r = np.array(r).reshape(q, 1)
  213. result = math.calc_F(R, r, self._beta_raw, self._var_beta_raw,
  214. self._nobs, self.df)
  215. return f_stat_to_dict(result)
  216. @cache_readonly
  217. def _p_value_raw(self):
  218. """Returns the raw p values."""
  219. from scipy.stats import t
  220. return 2 * t.sf(np.fabs(self._t_stat_raw),
  221. self._df_resid_raw)
  222. @cache_readonly
  223. def p_value(self):
  224. """Returns the p values."""
  225. return Series(self._p_value_raw, index=self.beta.index)
  226. @cache_readonly
  227. def _r2_raw(self):
  228. """Returns the raw r-squared values."""
  229. if self._use_centered_tss:
  230. return 1 - self.sm_ols.ssr / self.sm_ols.centered_tss
  231. else:
  232. return 1 - self.sm_ols.ssr / self.sm_ols.uncentered_tss
  233. @property
  234. def _use_centered_tss(self):
  235. # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR
  236. return self._intercept
  237. @cache_readonly
  238. def r2(self):
  239. """Returns the r-squared values."""
  240. return self._r2_raw
  241. @cache_readonly
  242. def _r2_adj_raw(self):
  243. """Returns the raw r-squared adjusted values."""
  244. return self.sm_ols.rsquared_adj
  245. @cache_readonly
  246. def r2_adj(self):
  247. """Returns the r-squared adjusted values."""
  248. return self._r2_adj_raw
  249. @cache_readonly
  250. def _resid_raw(self):
  251. """Returns the raw residuals."""
  252. return self.sm_ols.resid
  253. @cache_readonly
  254. def resid(self):
  255. """Returns the residuals."""
  256. return Series(self._resid_raw, index=self._x.index)
  257. @cache_readonly
  258. def _rmse_raw(self):
  259. """Returns the raw rmse values."""
  260. return np.sqrt(self.sm_ols.mse_resid)
  261. @cache_readonly
  262. def rmse(self):
  263. """Returns the rmse value."""
  264. return self._rmse_raw
  265. @cache_readonly
  266. def _std_err_raw(self):
  267. """Returns the raw standard err values."""
  268. return np.sqrt(np.diag(self._var_beta_raw))
  269. @cache_readonly
  270. def std_err(self):
  271. """Returns the standard err values of the betas."""
  272. return Series(self._std_err_raw, index=self.beta.index)
  273. @cache_readonly
  274. def _t_stat_raw(self):
  275. """Returns the raw t-stat value."""
  276. return self._beta_raw / self._std_err_raw
  277. @cache_readonly
  278. def t_stat(self):
  279. """Returns the t-stat values of the betas."""
  280. return Series(self._t_stat_raw, index=self.beta.index)
  281. @cache_readonly
  282. def _var_beta_raw(self):
  283. """
  284. Returns the raw covariance of beta.
  285. """
  286. x = self._x.values
  287. y = self._y.values
  288. xx = np.dot(x.T, x)
  289. if self._nw_lags is None:
  290. return math.inv(xx) * (self._rmse_raw ** 2)
  291. else:
  292. resid = y - np.dot(x, self._beta_raw)
  293. m = (x.T * resid).T
  294. xeps = math.newey_west(m, self._nw_lags, self._nobs, self._df_raw,
  295. self._nw_overlap)
  296. xx_inv = math.inv(xx)
  297. return np.dot(xx_inv, np.dot(xeps, xx_inv))
  298. @cache_readonly
  299. def var_beta(self):
  300. """Returns the variance-covariance matrix of beta."""
  301. return DataFrame(self._var_beta_raw, index=self.beta.index,
  302. columns=self.beta.index)
  303. @cache_readonly
  304. def _y_fitted_raw(self):
  305. """Returns the raw fitted y values."""
  306. if self._weights is None:
  307. X = self._x_filtered.values
  308. else:
  309. # XXX
  310. return self.sm_ols.fittedvalues
  311. b = self._beta_raw
  312. return np.dot(X, b)
  313. @cache_readonly
  314. def y_fitted(self):
  315. """Returns the fitted y values. This equals BX."""
  316. if self._weights is None:
  317. index = self._x_filtered.index
  318. orig_index = index
  319. else:
  320. index = self._y.index
  321. orig_index = self._y_orig.index
  322. result = Series(self._y_fitted_raw, index=index)
  323. return result.reindex(orig_index)
  324. @cache_readonly
  325. def _y_predict_raw(self):
  326. """Returns the raw predicted y values."""
  327. return self._y_fitted_raw
  328. @cache_readonly
  329. def y_predict(self):
  330. """Returns the predicted y values.
  331. For in-sample, this is same as y_fitted."""
  332. return self.y_fitted
  333. def predict(self, beta=None, x=None, fill_value=None,
  334. fill_method=None, axis=0):
  335. """
  336. Parameters
  337. ----------
  338. beta : Series
  339. x : Series or DataFrame
  340. fill_value : scalar or dict, default None
  341. fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
  342. axis : {0, 1}, default 0
  343. See DataFrame.fillna for more details
  344. Notes
  345. -----
  346. 1. If both fill_value and fill_method are None then NaNs are dropped
  347. (this is the default behavior)
  348. 2. An intercept will be automatically added to the new_y_values if
  349. the model was fitted using an intercept
  350. Returns
  351. -------
  352. Series of predicted values
  353. """
  354. if beta is None and x is None:
  355. return self.y_predict
  356. if beta is None:
  357. beta = self.beta
  358. else:
  359. beta = beta.reindex(self.beta.index)
  360. if isnull(beta).any():
  361. raise ValueError('Must supply betas for same variables')
  362. if x is None:
  363. x = self._x
  364. orig_x = x
  365. else:
  366. orig_x = x
  367. if fill_value is None and fill_method is None:
  368. x = x.dropna(how='any')
  369. else:
  370. x = x.fillna(value=fill_value, method=fill_method, axis=axis)
  371. if isinstance(x, Series):
  372. x = DataFrame({'x': x})
  373. if self._intercept:
  374. x['intercept'] = 1.
  375. x = x.reindex(columns=self._x.columns)
  376. rs = np.dot(x.values, beta.values)
  377. return Series(rs, x.index).reindex(orig_x.index)
  378. RESULT_FIELDS = ['r2', 'r2_adj', 'df', 'df_model', 'df_resid', 'rmse',
  379. 'f_stat', 'beta', 'std_err', 't_stat', 'p_value', 'nobs']
  380. @cache_readonly
  381. def _results(self):
  382. results = {}
  383. for result in self.RESULT_FIELDS:
  384. results[result] = getattr(self, result)
  385. return results
  386. @cache_readonly
  387. def _coef_table(self):
  388. buf = StringIO()
  389. buf.write('%14s %10s %10s %10s %10s %10s %10s\n' %
  390. ('Variable', 'Coef', 'Std Err', 't-stat',
  391. 'p-value', 'CI 2.5%', 'CI 97.5%'))
  392. buf.write(scom.banner(''))
  393. coef_template = '\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f'
  394. results = self._results
  395. beta = results['beta']
  396. for i, name in enumerate(beta.index):
  397. if i and not (i % 5):
  398. buf.write('\n' + scom.banner(''))
  399. std_err = results['std_err'][name]
  400. CI1 = beta[name] - 1.96 * std_err
  401. CI2 = beta[name] + 1.96 * std_err
  402. t_stat = results['t_stat'][name]
  403. p_value = results['p_value'][name]
  404. line = coef_template % (name,
  405. beta[name], std_err, t_stat, p_value, CI1, CI2)
  406. buf.write(line)
  407. if self.nw_lags is not None:
  408. buf.write('\n')
  409. buf.write('*** The calculations are Newey-West '
  410. 'adjusted with lags %5d\n' % self.nw_lags)
  411. return buf.getvalue()
  412. @cache_readonly
  413. def summary_as_matrix(self):
  414. """Returns the formatted results of the OLS as a DataFrame."""
  415. results = self._results
  416. beta = results['beta']
  417. data = {'beta': results['beta'],
  418. 't-stat': results['t_stat'],
  419. 'p-value': results['p_value'],
  420. 'std err': results['std_err']}
  421. return DataFrame(data, beta.index).T
  422. @cache_readonly
  423. def summary(self):
  424. """
  425. This returns the formatted result of the OLS computation
  426. """
  427. template = """
  428. %(bannerTop)s
  429. Formula: Y ~ %(formula)s
  430. Number of Observations: %(nobs)d
  431. Number of Degrees of Freedom: %(df)d
  432. R-squared: %(r2)10.4f
  433. Adj R-squared: %(r2_adj)10.4f
  434. Rmse: %(rmse)10.4f
  435. F-stat %(f_stat_shape)s: %(f_stat)10.4f, p-value: %(f_stat_p_value)10.4f
  436. Degrees of Freedom: model %(df_model)d, resid %(df_resid)d
  437. %(bannerCoef)s
  438. %(coef_table)s
  439. %(bannerEnd)s
  440. """
  441. coef_table = self._coef_table
  442. results = self._results
  443. f_stat = results['f_stat']
  444. bracketed = ['<%s>' % str(c) for c in results['beta'].index]
  445. formula = StringIO()
  446. formula.write(bracketed[0])
  447. tot = len(bracketed[0])
  448. line = 1
  449. for coef in bracketed[1:]:
  450. tot = tot + len(coef) + 3
  451. if tot // (68 * line):
  452. formula.write('\n' + ' ' * 12)
  453. line += 1
  454. formula.write(' + ' + coef)
  455. params = {
  456. 'bannerTop': scom.banner('Summary of Regression Analysis'),
  457. 'bannerCoef': scom.banner('Summary of Estimated Coefficients'),
  458. 'bannerEnd': scom.banner('End of Summary'),
  459. 'formula': formula.getvalue(),
  460. 'r2': results['r2'],
  461. 'r2_adj': results['r2_adj'],
  462. 'nobs': results['nobs'],
  463. 'df': results['df'],
  464. 'df_model': results['df_model'],
  465. 'df_resid': results['df_resid'],
  466. 'coef_table': coef_table,
  467. 'rmse': results['rmse'],
  468. 'f_stat': f_stat['f-stat'],
  469. 'f_stat_shape': '(%d, %d)' % (f_stat['DF X'], f_stat['DF Resid']),
  470. 'f_stat_p_value': f_stat['p-value'],
  471. }
  472. return template % params
  473. def __unicode__(self):
  474. return self.summary
  475. @cache_readonly
  476. def _time_obs_count(self):
  477. # XXX
  478. return self._time_has_obs.astype(int)
  479. @property
  480. def _total_times(self):
  481. return self._time_has_obs.sum()
  482. class MovingOLS(OLS):
  483. """
  484. Runs a rolling/expanding simple OLS.
  485. Parameters
  486. ----------
  487. y : Series
  488. x : Series, DataFrame, or dict of Series
  489. weights : array-like, optional
  490. 1d array of weights. If None, equivalent to an unweighted OLS.
  491. window_type : {'full sample', 'rolling', 'expanding'}
  492. Default expanding
  493. window : int
  494. size of window (for rolling/expanding OLS)
  495. min_periods : int
  496. Threshold of non-null data points to require.
  497. If None, defaults to size of window for window_type='rolling' and 1
  498. otherwise
  499. intercept : bool
  500. True if you want an intercept.
  501. nw_lags : None or int
  502. Number of Newey-West lags.
  503. nw_overlap : boolean, default False
  504. Assume data is overlapping when computing Newey-West estimator
  505. """
  506. def __init__(self, y, x, weights=None, window_type='expanding',
  507. window=None, min_periods=None, intercept=True,
  508. nw_lags=None, nw_overlap=False):
  509. self._args = dict(intercept=intercept, nw_lags=nw_lags,
  510. nw_overlap=nw_overlap)
  511. OLS.__init__(self, y=y, x=x, weights=weights, **self._args)
  512. self._set_window(window_type, window, min_periods)
  513. def _set_window(self, window_type, window, min_periods):
  514. self._window_type = scom._get_window_type(window_type)
  515. if self._is_rolling:
  516. if window is None:
  517. raise AssertionError("Must specify window.")
  518. if min_periods is None:
  519. min_periods = window
  520. else:
  521. window = len(self._x)
  522. if min_periods is None:
  523. min_periods = 1
  524. self._window = int(window)
  525. self._min_periods = min_periods
  526. #------------------------------------------------------------------------------
  527. # "Public" results
  528. @cache_readonly
  529. def beta(self):
  530. """Returns the betas in Series/DataFrame form."""
  531. return DataFrame(self._beta_raw,
  532. index=self._result_index,
  533. columns=self._x.columns)
  534. @cache_readonly
  535. def rank(self):
  536. return Series(self._rank_raw, index=self._result_index)
  537. @cache_readonly
  538. def df(self):
  539. """Returns the degrees of freedom."""
  540. return Series(self._df_raw, index=self._result_index)
  541. @cache_readonly
  542. def df_model(self):
  543. """Returns the model degrees of freedom."""
  544. return Series(self._df_model_raw, index=self._result_index)
  545. @cache_readonly
  546. def df_resid(self):
  547. """Returns the residual degrees of freedom."""
  548. return Series(self._df_resid_raw, index=self._result_index)
  549. @cache_readonly
  550. def f_stat(self):
  551. """Returns the f-stat value."""
  552. f_stat_dicts = dict((date, f_stat_to_dict(f_stat))
  553. for date, f_stat in zip(self.beta.index,
  554. self._f_stat_raw))
  555. return DataFrame(f_stat_dicts).T
  556. def f_test(self, hypothesis):
  557. raise NotImplementedError('must use full sample')
  558. @cache_readonly
  559. def forecast_mean(self):
  560. return Series(self._forecast_mean_raw, index=self._result_index)
  561. @cache_readonly
  562. def forecast_vol(self):
  563. return Series(self._forecast_vol_raw, index=self._result_index)
  564. @cache_readonly
  565. def p_value(self):
  566. """Returns the p values."""
  567. cols = self.beta.columns
  568. return DataFrame(self._p_value_raw, columns=cols,
  569. index=self._result_index)
  570. @cache_readonly
  571. def r2(self):
  572. """Returns the r-squared values."""
  573. return Series(self._r2_raw, index=self._result_index)
  574. @cache_readonly
  575. def resid(self):
  576. """Returns the residuals."""
  577. return Series(self._resid_raw[self._valid_obs_labels],
  578. index=self._result_index)
  579. @cache_readonly
  580. def r2_adj(self):
  581. """Returns the r-squared adjusted values."""
  582. index = self.r2.index
  583. return Series(self._r2_adj_raw, index=index)
  584. @cache_readonly
  585. def rmse(self):
  586. """Returns the rmse values."""
  587. return Series(self._rmse_raw, index=self._result_index)
  588. @cache_readonly
  589. def std_err(self):
  590. """Returns the standard err values."""
  591. return DataFrame(self._std_err_raw, columns=self.beta.columns,
  592. index=self._result_index)
  593. @cache_readonly
  594. def t_stat(self):
  595. """Returns the t-stat value."""
  596. return DataFrame(self._t_stat_raw, columns=self.beta.columns,
  597. index=self._result_index)
  598. @cache_readonly
  599. def var_beta(self):
  600. """Returns the covariance of beta."""
  601. result = {}
  602. result_index = self._result_index
  603. for i in range(len(self._var_beta_raw)):
  604. dm = DataFrame(self._var_beta_raw[i], columns=self.beta.columns,
  605. index=self.beta.columns)
  606. result[result_index[i]] = dm
  607. return Panel.from_dict(result, intersect=False)
  608. @cache_readonly
  609. def y_fitted(self):
  610. """Returns the fitted y values."""
  611. return Series(self._y_fitted_raw[self._valid_obs_labels],
  612. index=self._result_index)
  613. @cache_readonly
  614. def y_predict(self):
  615. """Returns the predicted y values."""
  616. return Series(self._y_predict_raw[self._valid_obs_labels],
  617. index=self._result_index)
  618. #------------------------------------------------------------------------------
  619. # "raw" attributes, calculations
  620. @property
  621. def _is_rolling(self):
  622. return self._window_type == 'rolling'
  623. @cache_readonly
  624. def _beta_raw(self):
  625. """Runs the regression and returns the beta."""
  626. beta, indices, mask = self._rolling_ols_call
  627. return beta[indices]
  628. @cache_readonly
  629. def _result_index(self):
  630. return self._index[self._valid_indices]
  631. @property
  632. def _valid_indices(self):
  633. return self._rolling_ols_call[1]
  634. @cache_readonly
  635. def _rolling_ols_call(self):
  636. return self._calc_betas(self._x_trans, self._y_trans)
  637. def _calc_betas(self, x, y):
  638. N = len(self._index)
  639. K = len(self._x.columns)
  640. betas = np.empty((N, K), dtype=float)
  641. betas[:] = np.NaN
  642. valid = self._time_has_obs
  643. enough = self._enough_obs
  644. window = self._window
  645. # Use transformed (demeaned) Y, X variables
  646. cum_xx = self._cum_xx(x)
  647. cum_xy = self._cum_xy(x, y)
  648. for i in range(N):
  649. if not valid[i] or not enough[i]:
  650. continue
  651. xx = cum_xx[i]
  652. xy = cum_xy[i]
  653. if self._is_rolling and i >= window:
  654. xx = xx - cum_xx[i - window]
  655. xy = xy - cum_xy[i - window]
  656. betas[i] = math.solve(xx, xy)
  657. mask = ~np.isnan(betas).any(axis=1)
  658. have_betas = np.arange(N)[mask]
  659. return betas, have_betas, mask
  660. def _rolling_rank(self):
  661. dates = self._index
  662. window = self._window
  663. ranks = np.empty(len(dates), dtype=float)
  664. ranks[:] = np.NaN
  665. for i, date in enumerate(dates):
  666. if self._is_rolling and i >= window:
  667. prior_date = dates[i - window + 1]
  668. else:
  669. prior_date = dates[0]
  670. x_slice = self._x.truncate(before=prior_date, after=date).values
  671. if len(x_slice) == 0:
  672. continue
  673. ranks[i] = math.rank(x_slice)
  674. return ranks
  675. def _cum_xx(self, x):
  676. dates = self._index
  677. K = len(x.columns)
  678. valid = self._time_has_obs
  679. cum_xx = []
  680. slicer = lambda df, dt: df.truncate(dt, dt).values
  681. if not self._panel_model:
  682. _get_index = x.index.get_loc
  683. def slicer(df, dt):
  684. i = _get_index(dt)
  685. return df.values[i:i + 1, :]
  686. last = np.zeros((K, K))
  687. for i, date in enumerate(dates):
  688. if not valid[i]:
  689. cum_xx.append(last)
  690. continue
  691. x_slice = slicer(x, date)
  692. xx = last = last + np.dot(x_slice.T, x_slice)
  693. cum_xx.append(xx)
  694. return cum_xx
  695. def _cum_xy(self, x, y):
  696. dates = self._index
  697. valid = self._time_has_obs
  698. cum_xy = []
  699. x_slicer = lambda df, dt: df.truncate(dt, dt).values
  700. if not self._panel_model:
  701. _get_index = x.index.get_loc
  702. def x_slicer(df, dt):
  703. i = _get_index(dt)
  704. return df.values[i:i + 1]
  705. _y_get_index = y.index.get_loc
  706. _values = y.values
  707. if isinstance(y.index, MultiIndex):
  708. def y_slicer(df, dt):
  709. loc = _y_get_index(dt)
  710. return _values[loc]
  711. else:
  712. def y_slicer(df, dt):
  713. i = _y_get_index(dt)
  714. return _values[i:i + 1]
  715. last = np.zeros(len(x.columns))
  716. for i, date in enumerate(dates):
  717. if not valid[i]:
  718. cum_xy.append(last)
  719. continue
  720. x_slice = x_slicer(x, date)
  721. y_slice = y_slicer(y, date)
  722. xy = last = last + np.dot(x_slice.T, y_slice)
  723. cum_xy.append(xy)
  724. return cum_xy
  725. @cache_readonly
  726. def _rank_raw(self):
  727. rank = self._rolling_rank()
  728. return rank[self._valid_indices]
  729. @cache_readonly
  730. def _df_raw(self):
  731. """Returns the degrees of freedom."""
  732. return self._rank_raw
  733. @cache_readonly
  734. def _df_model_raw(self):
  735. """Returns the raw model degrees of freedom."""
  736. return self._df_raw - 1
  737. @cache_readonly
  738. def _df_resid_raw(self):
  739. """Returns the raw residual degrees of freedom."""
  740. return self._nobs - self._df_raw
  741. @cache_readonly
  742. def _f_stat_raw(self):
  743. """Returns the raw f-stat value."""
  744. from scipy.stats import f
  745. items = self.beta.columns
  746. nobs = self._nobs
  747. df = self._df_raw
  748. df_resid = nobs - df
  749. # var_beta has not been newey-west adjusted
  750. if self._nw_lags is None:
  751. F = self._r2_raw / (self._r2_raw - self._r2_adj_raw)
  752. q = len(items)
  753. if 'intercept' in items:
  754. q -= 1
  755. def get_result_simple(Fst, d):
  756. return Fst, (q, d), 1 - f.cdf(Fst, q, d)
  757. # Compute the P-value for each pair
  758. result = starmap(get_result_simple, zip(F, df_resid))
  759. return list(result)
  760. K = len(items)
  761. R = np.eye(K)
  762. r = np.zeros((K, 1))
  763. try:
  764. intercept = items.get_loc('intercept')
  765. R = np.concatenate((R[0: intercept], R[intercept + 1:]))
  766. r = np.concatenate((r[0: intercept], r[intercept + 1:]))
  767. except KeyError:
  768. # no intercept
  769. pass
  770. def get_result(beta, vcov, n, d):
  771. return math.calc_F(R, r, beta, vcov, n, d)
  772. results = starmap(get_result,
  773. zip(self._beta_raw, self._var_beta_raw, nobs, df))
  774. return list(results)
  775. @cache_readonly
  776. def _p_value_raw(self):
  777. """Returns the raw p values."""
  778. from scipy.stats import t
  779. result = [2 * t.sf(a, b)
  780. for a, b in zip(np.fabs(self._t_stat_raw),
  781. self._df_resid_raw)]
  782. return np.array(result)
  783. @cache_readonly
  784. def _resid_stats(self):
  785. uncentered_sst = []
  786. sst = []
  787. sse = []
  788. Yreg = self._y
  789. Y = self._y_trans
  790. X = self._x_trans
  791. weights = self._weights
  792. dates = self._index
  793. window = self._window
  794. for n, index in enumerate(self._valid_indices):
  795. if self._is_rolling and index >= window:
  796. prior_date = dates[index - window + 1]
  797. else:
  798. prior_date = dates[0]
  799. date = dates[index]
  800. beta = self._beta_raw[n]
  801. X_slice = X.truncate(before=prior_date, after=date).values
  802. Y_slice = _y_converter(Y.truncate(before=prior_date, after=date))
  803. resid = Y_slice - np.dot(X_slice, beta)
  804. if weights is not None:
  805. Y_slice = _y_converter(Yreg.truncate(before=prior_date,
  806. after=date))
  807. weights_slice = weights.truncate(prior_date, date)
  808. demeaned = Y_slice - np.average(Y_slice, weights=weights_slice)
  809. SS_total = (weights_slice * demeaned ** 2).sum()
  810. else:
  811. SS_total = ((Y_slice - Y_slice.mean()) ** 2).sum()
  812. SS_err = (resid ** 2).sum()
  813. SST_uncentered = (Y_slice ** 2).sum()
  814. sse.append(SS_err)
  815. sst.append(SS_total)
  816. uncentered_sst.append(SST_uncentered)
  817. return {
  818. 'sse': np.array(sse),
  819. 'centered_tss': np.array(sst),
  820. 'uncentered_tss': np.array(uncentered_sst),
  821. }
  822. @cache_readonly
  823. def _rmse_raw(self):
  824. """Returns the raw rmse values."""
  825. return np.sqrt(self._resid_stats['sse'] / self._df_resid_raw)
  826. @cache_readonly
  827. def _r2_raw(self):
  828. rs = self._resid_stats
  829. if self._use_centered_tss:
  830. return 1 - rs['sse'] / rs['centered_tss']
  831. else:
  832. return 1 - rs['sse'] / rs['uncentered_tss']
  833. @cache_readonly
  834. def _r2_adj_raw(self):
  835. """Returns the raw r-squared adjusted values."""
  836. nobs = self._nobs
  837. factors = (nobs - 1) / (nobs - self._df_raw)
  838. return 1 - (1 - self._r2_raw) * factors
  839. @cache_readonly
  840. def _resid_raw(self):
  841. """Returns the raw residuals."""
  842. return (self._y.values - self._y_fitted_raw)
  843. @cache_readonly
  844. def _std_err_raw(self):
  845. """Returns the raw standard err values."""
  846. results = []
  847. for i in range(len(self._var_beta_raw)):
  848. results.append(np.sqrt(np.diag(self._var_beta_raw[i])))
  849. return np.array(results)
  850. @cache_readonly
  851. def _t_stat_raw(self):
  852. """Returns the raw t-stat value."""
  853. return self._beta_raw / self._std_err_raw
  854. @cache_readonly
  855. def _var_beta_raw(self):
  856. """Returns the raw covariance of beta."""
  857. x = self._x_trans
  858. y = self._y_trans
  859. dates = self._index
  860. nobs = self._nobs
  861. rmse = self._rmse_raw
  862. beta = self._beta_raw
  863. df = self._df_raw
  864. window = self._window
  865. cum_xx = self._cum_xx(self._x)
  866. results = []
  867. for n, i in enumerate(self._valid_indices):
  868. xx = cum_xx[i]
  869. date = dates[i]
  870. if self._is_rolling and i >= window:
  871. xx = xx - cum_xx[i - window]
  872. prior_date = dates[i - window + 1]
  873. else:
  874. prior_date = dates[0]
  875. x_slice = x.truncate(before=prior_date, after=date)
  876. y_slice = y.truncate(before=prior_date, after=date)
  877. xv = x_slice.values
  878. yv = np.asarray(y_slice)
  879. if self._nw_lags is None:
  880. result = math.inv(xx) * (rmse[n] ** 2)
  881. else:
  882. resid = yv - np.dot(xv, beta[n])
  883. m = (xv.T * resid).T
  884. xeps = math.newey_west(m, self._nw_lags, nobs[n], df[n],
  885. self._nw_overlap)
  886. xx_inv = math.inv(xx)
  887. result = np.dot(xx_inv, np.dot(xeps, xx_inv))
  888. results.append(result)
  889. return np.array(results)
  890. @cache_readonly
  891. def _forecast_mean_raw(self):
  892. """Returns the raw covariance of beta."""
  893. nobs = self._nobs
  894. window = self._window
  895. # x should be ones
  896. dummy = DataFrame(index=self._y.index)
  897. dummy['y'] = 1
  898. cum_xy = self._cum_xy(dummy, self._y)
  899. results = []
  900. for n, i in enumerate(self._valid_indices):
  901. sumy = cum_xy[i]
  902. if self._is_rolling and i >= window:
  903. sumy = sumy - cum_xy[i - window]
  904. results.append(sumy[0] / nobs[n])
  905. return np.array(results)
  906. @cache_readonly
  907. def _forecast_vol_raw(self):
  908. """Returns the raw covariance of beta."""
  909. beta = self._beta_raw
  910. window = self._window
  911. dates = self._index
  912. x = self._x
  913. results = []
  914. for n, i in enumerate(self._valid_indices):
  915. date = dates[i]
  916. if self._is_rolling and i >= window:
  917. prior_date = dates[i - window + 1]
  918. else:
  919. prior_date = dates[0]
  920. x_slice = x.truncate(prior_date, date).values
  921. x_demeaned = x_slice - x_slice.mean(0)
  922. x_cov = np.dot(x_demeaned.T, x_demeaned) / (len(x_slice) - 1)
  923. B = beta[n]
  924. result = np.dot(B, np.dot(x_cov, B))
  925. results.append(np.sqrt(result))
  926. return np.array(results)
  927. @cache_readonly
  928. def _y_fitted_raw(self):
  929. """Returns the raw fitted y values."""
  930. return (self._x.values * self._beta_matrix(lag=0)).sum(1)
  931. @cache_readonly
  932. def _y_predict_raw(self):
  933. """Returns the raw predicted y values."""
  934. return (self._x.values * self._beta_matrix(lag=1)).sum(1)
  935. @cache_readonly
  936. def _results(self):
  937. results = {}
  938. for result in self.RESULT_FIELDS:
  939. value = getattr(self, result)
  940. if isinstance(value, Series):
  941. value = value[self.beta.index[-1]]
  942. elif isinstance(value, DataFrame):
  943. value = value.xs(self.beta.index[-1])
  944. else: # pragma: no cover
  945. raise Exception('Problem retrieving %s' % result)
  946. results[result] = value
  947. return results
  948. @cache_readonly
  949. def _window_time_obs(self):
  950. window_obs = (Series(self._time_obs_count > 0)
  951. .rolling(self._window, min_periods=1)
  952. .sum()
  953. .values
  954. )
  955. window_obs[np.isnan(window_obs)] = 0
  956. return window_obs.astype(int)
  957. @cache_readonly
  958. def _nobs_raw(self):
  959. if self._is_rolling:
  960. window = self._window
  961. else:
  962. # expanding case
  963. window = len(self._index)
  964. result = Series(self._time_obs_count).rolling(
  965. window, min_periods=1).sum().values
  966. return result.astype(int)
  967. def _beta_matrix(self, lag=0):
  968. if lag < 0:
  969. raise AssertionError("'lag' must be greater than or equal to 0, "
  970. "input was {0}".format(lag))
  971. betas = self._beta_raw
  972. labels = np.arange(len(self._y)) - lag
  973. indexer = self._valid_obs_labels.searchsorted(labels, side='left')
  974. indexer[indexer == len(betas)] = len(betas) - 1
  975. beta_matrix = betas[indexer]
  976. beta_matrix[labels < self._valid_obs_labels[0]] = np.NaN
  977. return beta_matrix
  978. @cache_readonly
  979. def _valid_obs_labels(self):
  980. dates = self._index[self._valid_indices]
  981. return self._y.index.searchsorted(dates)
  982. @cache_readonly
  983. def _nobs(self):
  984. return self._nobs_raw[self._valid_indices]
  985. @property
  986. def nobs(self):
  987. return Series(self._nobs, index=self._result_index)
  988. @cache_readonly
  989. def _enough_obs(self):
  990. # XXX: what's the best way to determine where to start?
  991. return self._nobs_raw >= max(self._min_periods,
  992. len(self._x.columns) + 1)
  993. def _safe_update(d, other):
  994. """
  995. Combine dictionaries with non-overlapping keys
  996. """
  997. for k, v in compat.iteritems(other):
  998. if k in d:
  999. raise Exception('Duplicate regressor: %s' % k)
  1000. d[k] = v
  1001. def _filter_data(lhs, rhs, weights=None):
  1002. """
  1003. Cleans the input for single OLS.
  1004. Parameters
  1005. ----------
  1006. lhs : Series
  1007. Dependent variable in the regression.
  1008. rhs : dict, whose values are Series, DataFrame, or dict
  1009. Explanatory variables of the regression.
  1010. weights : array-like, optional
  1011. 1d array of weights. If None, equivalent to an unweighted OLS.
  1012. Returns
  1013. -------
  1014. Series, DataFrame
  1015. Cleaned lhs and rhs
  1016. """
  1017. if not isinstance(lhs, Series):
  1018. if len(lhs) != len(rhs):
  1019. raise AssertionError("length of lhs must equal length of rhs")
  1020. lhs = Series(lhs, index=rhs.index)
  1021. rhs = _combine_rhs(rhs)
  1022. lhs = DataFrame({'__y__': lhs}, dtype=float)
  1023. pre_filt_rhs = rhs.dropna(how='any')
  1024. combined = rhs.join(lhs, how='outer')
  1025. if weights is not None:
  1026. combined['__weights__'] = weights
  1027. valid = (combined.count(1) == len(combined.columns)).values
  1028. index = combined.index
  1029. combined = combined[valid]
  1030. if weights is not None:
  1031. filt_weights = combined.pop('__weights__')
  1032. else:
  1033. filt_weights = None
  1034. filt_lhs = combined.pop('__y__')
  1035. filt_rhs = combined
  1036. if hasattr(filt_weights, 'to_dense'):
  1037. filt_weights = filt_weights.to_dense()
  1038. return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
  1039. pre_filt_rhs.to_dense(), index, valid)
  1040. def _combine_rhs(rhs):
  1041. """
  1042. Glue input X variables together while checking for potential
  1043. duplicates
  1044. """
  1045. series = {}
  1046. if isinstance(rhs, Series):
  1047. series['x'] = rhs
  1048. elif isinstance(rhs, DataFrame):
  1049. series = rhs.copy()
  1050. elif isinstance(rhs, dict):
  1051. for name, value in compat.iteritems(rhs):
  1052. if isinstance(value, Series):
  1053. _safe_update(series, {name: value})
  1054. elif isinstance(value, (dict, DataFrame)):
  1055. _safe_update(series, value)
  1056. else: # pragma: no cover
  1057. raise Exception('Invalid RHS data type: %s' % type(value))
  1058. else: # pragma: no cover
  1059. raise Exception('Invalid RHS type: %s' % type(rhs))
  1060. if not isinstance(series, DataFrame):
  1061. series = DataFrame(series, dtype=float)
  1062. return series
  1063. # A little kludge so we can use this method for both
  1064. # MovingOLS and MovingPanelOLS
  1065. def _y_converter(y):
  1066. y = y.values.squeeze()
  1067. if y.ndim == 0: # pragma: no cover
  1068. return np.array([y])
  1069. else:
  1070. return y
  1071. def f_stat_to_dict(result):
  1072. f_stat, shape, p_value = result
  1073. result = {}
  1074. result['f-stat'] = f_stat
  1075. result['DF X'] = shape[0]
  1076. result['DF Resid'] = shape[1]
  1077. result['p-value'] = p_value
  1078. return result