PageRenderTime 44ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/stats/var.py

http://github.com/wesm/pandas
Python | 605 lines | 457 code | 51 blank | 97 comment | 19 complexity | 77f730283600318b2783e40a1e9cad9d MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. # flake8: noqa
  2. from __future__ import division
  3. from pandas.compat import range, lrange, zip, reduce
  4. from pandas import compat
  5. import numpy as np
  6. from pandas.core.base import StringMixin
  7. from pandas.util.decorators import cache_readonly
  8. from pandas.core.frame import DataFrame
  9. from pandas.core.panel import Panel
  10. from pandas.core.series import Series
  11. import pandas.stats.common as common
  12. from pandas.stats.math import inv
  13. from pandas.stats.ols import _combine_rhs
  14. class VAR(StringMixin):
  15. """
  16. Estimates VAR(p) regression on multivariate time series data
  17. presented in pandas data structures.
  18. Parameters
  19. ----------
  20. data : DataFrame or dict of Series
  21. p : lags to include
  22. """
  23. def __init__(self, data, p=1, intercept=True):
  24. import warnings
  25. warnings.warn("The pandas.stats.var module is deprecated and will be "
  26. "removed in a future version. We refer to external packages "
  27. "like statsmodels, see some examples here: "
  28. "http://www.statsmodels.org/stable/vector_ar.html#var",
  29. FutureWarning, stacklevel=4)
  30. try:
  31. import statsmodels.tsa.vector_ar.api as sm_var
  32. except ImportError:
  33. import scikits.statsmodels.tsa.var as sm_var
  34. self._data = DataFrame(_combine_rhs(data))
  35. self._p = p
  36. self._columns = self._data.columns
  37. self._index = self._data.index
  38. self._intercept = intercept
  39. @cache_readonly
  40. def aic(self):
  41. """Returns the Akaike information criterion."""
  42. return self._ic['aic']
  43. @cache_readonly
  44. def bic(self):
  45. """Returns the Bayesian information criterion."""
  46. return self._ic['bic']
  47. @cache_readonly
  48. def beta(self):
  49. """
  50. Returns a DataFrame, where each column x1 contains the betas
  51. calculated by regressing the x1 column of the VAR input with
  52. the lagged input.
  53. Returns
  54. -------
  55. DataFrame
  56. """
  57. d = dict([(key, value.beta)
  58. for (key, value) in compat.iteritems(self.ols_results)])
  59. return DataFrame(d)
  60. def forecast(self, h):
  61. """
  62. Returns a DataFrame containing the forecasts for 1, 2, ..., n time
  63. steps. Each column x1 contains the forecasts of the x1 column.
  64. Parameters
  65. ----------
  66. n: int
  67. Number of time steps ahead to forecast.
  68. Returns
  69. -------
  70. DataFrame
  71. """
  72. forecast = self._forecast_raw(h)[:, 0, :]
  73. return DataFrame(forecast, index=lrange(1, 1 + h),
  74. columns=self._columns)
  75. def forecast_cov(self, h):
  76. """
  77. Returns the covariance of the forecast residuals.
  78. Returns
  79. -------
  80. DataFrame
  81. """
  82. return [DataFrame(value, index=self._columns, columns=self._columns)
  83. for value in self._forecast_cov_raw(h)]
  84. def forecast_std_err(self, h):
  85. """
  86. Returns the standard errors of the forecast residuals.
  87. Returns
  88. -------
  89. DataFrame
  90. """
  91. return DataFrame(self._forecast_std_err_raw(h),
  92. index=lrange(1, 1 + h), columns=self._columns)
  93. @cache_readonly
  94. def granger_causality(self):
  95. """Returns the f-stats and p-values from the Granger Causality Test.
  96. If the data consists of columns x1, x2, x3, then we perform the
  97. following regressions:
  98. x1 ~ L(x2, x3)
  99. x1 ~ L(x1, x3)
  100. x1 ~ L(x1, x2)
  101. The f-stats of these results are placed in the 'x1' column of the
  102. returned DataFrame. We then repeat for x2, x3.
  103. Returns
  104. -------
  105. Dict, where 'f-stat' returns the DataFrame containing the f-stats,
  106. and 'p-value' returns the DataFrame containing the corresponding
  107. p-values of the f-stats.
  108. """
  109. from pandas.stats.api import ols
  110. from scipy.stats import f
  111. d = {}
  112. for col in self._columns:
  113. d[col] = {}
  114. for i in range(1, 1 + self._p):
  115. lagged_data = self._lagged_data[i].filter(
  116. self._columns - [col])
  117. for key, value in compat.iteritems(lagged_data):
  118. d[col][_make_param_name(i, key)] = value
  119. f_stat_dict = {}
  120. p_value_dict = {}
  121. for col, y in compat.iteritems(self._data):
  122. ssr_full = (self.resid[col] ** 2).sum()
  123. f_stats = []
  124. p_values = []
  125. for col2 in self._columns:
  126. result = ols(y=y, x=d[col2])
  127. resid = result.resid
  128. ssr_reduced = (resid ** 2).sum()
  129. M = self._p
  130. N = self._nobs
  131. K = self._k * self._p + 1
  132. f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
  133. f_stats.append(f_stat)
  134. p_value = f.sf(f_stat, M, N - K)
  135. p_values.append(p_value)
  136. f_stat_dict[col] = Series(f_stats, self._columns)
  137. p_value_dict[col] = Series(p_values, self._columns)
  138. f_stat_mat = DataFrame(f_stat_dict)
  139. p_value_mat = DataFrame(p_value_dict)
  140. return {
  141. 'f-stat': f_stat_mat,
  142. 'p-value': p_value_mat,
  143. }
  144. @cache_readonly
  145. def ols_results(self):
  146. """
  147. Returns the results of the regressions:
  148. x_1 ~ L(X)
  149. x_2 ~ L(X)
  150. ...
  151. x_k ~ L(X)
  152. where X = [x_1, x_2, ..., x_k]
  153. and L(X) represents the columns of X lagged 1, 2, ..., n lags
  154. (n is the user-provided number of lags).
  155. Returns
  156. -------
  157. dict
  158. """
  159. from pandas.stats.api import ols
  160. d = {}
  161. for i in range(1, 1 + self._p):
  162. for col, series in compat.iteritems(self._lagged_data[i]):
  163. d[_make_param_name(i, col)] = series
  164. result = dict([(col, ols(y=y, x=d, intercept=self._intercept))
  165. for col, y in compat.iteritems(self._data)])
  166. return result
  167. @cache_readonly
  168. def resid(self):
  169. """
  170. Returns the DataFrame containing the residuals of the VAR regressions.
  171. Each column x1 contains the residuals generated by regressing the x1
  172. column of the input against the lagged input.
  173. Returns
  174. -------
  175. DataFrame
  176. """
  177. d = dict([(col, series.resid)
  178. for (col, series) in compat.iteritems(self.ols_results)])
  179. return DataFrame(d, index=self._index)
  180. @cache_readonly
  181. def summary(self):
  182. template = """
  183. %(banner_top)s
  184. Number of Observations: %(nobs)d
  185. AIC: %(aic).3f
  186. BIC: %(bic).3f
  187. %(banner_coef)s
  188. %(coef_table)s
  189. %(banner_end)s
  190. """
  191. params = {
  192. 'banner_top': common.banner('Summary of VAR'),
  193. 'banner_coef': common.banner('Summary of Estimated Coefficients'),
  194. 'banner_end': common.banner('End of Summary'),
  195. 'coef_table': self.beta,
  196. 'aic': self.aic,
  197. 'bic': self.bic,
  198. 'nobs': self._nobs,
  199. }
  200. return template % params
  201. @cache_readonly
  202. def _alpha(self):
  203. """
  204. Returns array where the i-th element contains the intercept
  205. when regressing the i-th column of self._data with the lagged data.
  206. """
  207. if self._intercept:
  208. return self._beta_raw[-1]
  209. else:
  210. return np.zeros(self._k)
  211. @cache_readonly
  212. def _beta_raw(self):
  213. return np.array([list(self.beta[col].values()) for col in self._columns]).T
  214. def _trans_B(self, h):
  215. """
  216. Returns 0, 1, ..., (h-1)-th power of transpose of B as defined in
  217. equation (4) on p. 142 of the Stata 11 Time Series reference book.
  218. """
  219. result = [np.eye(1 + self._k * self._p)]
  220. row1 = np.zeros((1, 1 + self._k * self._p))
  221. row1[0, 0] = 1
  222. v = self._alpha.reshape((self._k, 1))
  223. row2 = np.hstack(tuple([v] + self._lag_betas))
  224. m = self._k * (self._p - 1)
  225. row3 = np.hstack((
  226. np.zeros((m, 1)),
  227. np.eye(m),
  228. np.zeros((m, self._k))
  229. ))
  230. trans_B = np.vstack((row1, row2, row3)).T
  231. result.append(trans_B)
  232. for i in range(2, h):
  233. result.append(np.dot(trans_B, result[i - 1]))
  234. return result
  235. @cache_readonly
  236. def _x(self):
  237. values = np.array([
  238. list(self._lagged_data[i][col].values())
  239. for i in range(1, 1 + self._p)
  240. for col in self._columns
  241. ]).T
  242. x = np.hstack((np.ones((len(values), 1)), values))[self._p:]
  243. return x
  244. @cache_readonly
  245. def _cov_beta(self):
  246. cov_resid = self._sigma
  247. x = self._x
  248. inv_cov_x = inv(np.dot(x.T, x))
  249. return np.kron(inv_cov_x, cov_resid)
  250. def _data_xs(self, i):
  251. """
  252. Returns the cross-section of the data at the given timestep.
  253. """
  254. return self._data.values[i]
  255. def _forecast_cov_raw(self, n):
  256. resid = self._forecast_cov_resid_raw(n)
  257. # beta = self._forecast_cov_beta_raw(n)
  258. # return [a + b for a, b in zip(resid, beta)]
  259. # TODO: ignore the beta forecast std err until it's verified
  260. return resid
  261. def _forecast_cov_beta_raw(self, n):
  262. """
  263. Returns the covariance of the beta errors for the forecast at
  264. 1, 2, ..., n timesteps.
  265. """
  266. p = self._p
  267. values = self._data.values
  268. T = len(values) - self._p - 1
  269. results = []
  270. for h in range(1, n + 1):
  271. psi = self._psi(h)
  272. trans_B = self._trans_B(h)
  273. sum = 0
  274. cov_beta = self._cov_beta
  275. for t in range(T + 1):
  276. index = t + p
  277. y = values.take(lrange(index, index - p, -1), axis=0).ravel()
  278. trans_Z = np.hstack(([1], y))
  279. trans_Z = trans_Z.reshape(1, len(trans_Z))
  280. sum2 = 0
  281. for i in range(h):
  282. ZB = np.dot(trans_Z, trans_B[h - 1 - i])
  283. prod = np.kron(ZB, psi[i])
  284. sum2 = sum2 + prod
  285. sum = sum + chain_dot(sum2, cov_beta, sum2.T)
  286. results.append(sum / (T + 1))
  287. return results
  288. def _forecast_cov_resid_raw(self, h):
  289. """
  290. Returns the covariance of the residual errors for the forecast at
  291. 1, 2, ..., h timesteps.
  292. """
  293. psi_values = self._psi(h)
  294. sum = 0
  295. result = []
  296. for i in range(h):
  297. psi = psi_values[i]
  298. sum = sum + chain_dot(psi, self._sigma, psi.T)
  299. result.append(sum)
  300. return result
  301. def _forecast_raw(self, h):
  302. """
  303. Returns the forecast at 1, 2, ..., h timesteps in the future.
  304. """
  305. k = self._k
  306. result = []
  307. for i in range(h):
  308. sum = self._alpha.reshape(1, k)
  309. for j in range(self._p):
  310. beta = self._lag_betas[j]
  311. idx = i - j
  312. if idx > 0:
  313. y = result[idx - 1]
  314. else:
  315. y = self._data_xs(idx - 1)
  316. sum = sum + np.dot(beta, y.T).T
  317. result.append(sum)
  318. return np.array(result)
  319. def _forecast_std_err_raw(self, h):
  320. """
  321. Returns the standard error of the forecasts
  322. at 1, 2, ..., n timesteps.
  323. """
  324. return np.array([np.sqrt(np.diag(value))
  325. for value in self._forecast_cov_raw(h)])
  326. @cache_readonly
  327. def _ic(self):
  328. """
  329. Returns the Akaike/Bayesian information criteria.
  330. """
  331. RSS = self._rss
  332. k = self._p * (self._k * self._p + 1)
  333. n = self._nobs * self._k
  334. return {'aic': 2 * k + n * np.log(RSS / n),
  335. 'bic': n * np.log(RSS / n) + k * np.log(n)}
  336. @cache_readonly
  337. def _k(self):
  338. return len(self._columns)
  339. @cache_readonly
  340. def _lag_betas(self):
  341. """
  342. Returns list of B_i, where B_i represents the (k, k) matrix
  343. with the j-th row containing the betas of regressing the j-th
  344. column of self._data with self._data lagged i time steps.
  345. First element is B_1, second element is B_2, etc.
  346. """
  347. k = self._k
  348. b = self._beta_raw
  349. return [b[k * i: k * (i + 1)].T for i in range(self._p)]
  350. @cache_readonly
  351. def _lagged_data(self):
  352. return dict([(i, self._data.shift(i))
  353. for i in range(1, 1 + self._p)])
  354. @cache_readonly
  355. def _nobs(self):
  356. return len(self._data) - self._p
  357. def _psi(self, h):
  358. """
  359. psi value used for calculating standard error.
  360. Returns [psi_0, psi_1, ..., psi_(h - 1)]
  361. """
  362. k = self._k
  363. result = [np.eye(k)]
  364. for i in range(1, h):
  365. result.append(sum(
  366. [np.dot(result[i - j], self._lag_betas[j - 1])
  367. for j in range(1, 1 + i)
  368. if j <= self._p]))
  369. return result
  370. @cache_readonly
  371. def _resid_raw(self):
  372. resid = np.array([self.ols_results[col]._resid_raw
  373. for col in self._columns])
  374. return resid
  375. @cache_readonly
  376. def _rss(self):
  377. """Returns the sum of the squares of the residuals."""
  378. return (self._resid_raw ** 2).sum()
  379. @cache_readonly
  380. def _sigma(self):
  381. """Returns covariance of resids."""
  382. k = self._k
  383. n = self._nobs
  384. resid = self._resid_raw
  385. return np.dot(resid, resid.T) / (n - k)
  386. def __unicode__(self):
  387. return self.summary
  388. def lag_select(data, max_lags=5, ic=None):
  389. """
  390. Select number of lags based on a variety of information criteria
  391. Parameters
  392. ----------
  393. data : DataFrame-like
  394. max_lags : int
  395. Maximum number of lags to evaluate
  396. ic : {None, 'aic', 'bic', ...}
  397. Choosing None will just display the results
  398. Returns
  399. -------
  400. None
  401. """
  402. pass
  403. class PanelVAR(VAR):
  404. """
  405. Performs Vector Autoregression on panel data.
  406. Parameters
  407. ----------
  408. data: Panel or dict of DataFrame
  409. lags: int
  410. """
  411. def __init__(self, data, lags, intercept=True):
  412. self._data = _prep_panel_data(data)
  413. self._p = lags
  414. self._intercept = intercept
  415. self._columns = self._data.items
  416. @cache_readonly
  417. def _nobs(self):
  418. """Returns the number of observations."""
  419. _, timesteps, entities = self._data.values.shape
  420. return (timesteps - self._p) * entities
  421. @cache_readonly
  422. def _rss(self):
  423. """Returns the sum of the squares of the residuals."""
  424. return (self.resid.values ** 2).sum()
  425. def forecast(self, h):
  426. """
  427. Returns the forecasts at 1, 2, ..., n timesteps in the future.
  428. """
  429. forecast = self._forecast_raw(h).T.swapaxes(1, 2)
  430. index = lrange(1, 1 + h)
  431. w = Panel(forecast, items=self._data.items, major_axis=index,
  432. minor_axis=self._data.minor_axis)
  433. return w
  434. @cache_readonly
  435. def resid(self):
  436. """
  437. Returns the DataFrame containing the residuals of the VAR regressions.
  438. Each column x1 contains the residuals generated by regressing the x1
  439. column of the input against the lagged input.
  440. Returns
  441. -------
  442. DataFrame
  443. """
  444. d = dict([(key, value.resid)
  445. for (key, value) in compat.iteritems(self.ols_results)])
  446. return Panel.fromDict(d)
  447. def _data_xs(self, i):
  448. return self._data.values[:, i, :].T
  449. @cache_readonly
  450. def _sigma(self):
  451. """Returns covariance of resids."""
  452. k = self._k
  453. resid = _drop_incomplete_rows(self.resid.toLong().values)
  454. n = len(resid)
  455. return np.dot(resid.T, resid) / (n - k)
  456. def _prep_panel_data(data):
  457. """Converts the given data into a Panel."""
  458. if isinstance(data, Panel):
  459. return data
  460. return Panel.fromDict(data)
  461. def _drop_incomplete_rows(array):
  462. mask = np.isfinite(array).all(1)
  463. indices = np.arange(len(array))[mask]
  464. return array.take(indices, 0)
  465. def _make_param_name(lag, name):
  466. return 'L%d.%s' % (lag, name)
  467. def chain_dot(*matrices):
  468. """
  469. Returns the dot product of the given matrices.
  470. Parameters
  471. ----------
  472. matrices: argument list of ndarray
  473. """
  474. return reduce(lambda x, y: np.dot(y, x), matrices[::-1])