PageRenderTime 46ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/stats/var.py

http://github.com/pydata/pandas
Python | 595 lines | 450 code | 49 blank | 96 comment | 19 complexity | c942bde78ba53e08ebdcd97466beadec MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. from __future__ import division
  2. from pandas.compat import range, lrange, zip, reduce
  3. from pandas import compat
  4. import numpy as np
  5. from pandas.core.base import StringMixin
  6. from pandas.util.decorators import cache_readonly
  7. from pandas.core.frame import DataFrame
  8. from pandas.core.panel import Panel
  9. from pandas.core.series import Series
  10. import pandas.stats.common as common
  11. from pandas.stats.math import inv
  12. from pandas.stats.ols import _combine_rhs
  13. class VAR(StringMixin):
  14. """
  15. Estimates VAR(p) regression on multivariate time series data
  16. presented in pandas data structures.
  17. Parameters
  18. ----------
  19. data : DataFrame or dict of Series
  20. p : lags to include
  21. """
  22. def __init__(self, data, p=1, intercept=True):
  23. try:
  24. import statsmodels.tsa.vector_ar.api as sm_var
  25. except ImportError:
  26. import scikits.statsmodels.tsa.var as sm_var
  27. self._data = DataFrame(_combine_rhs(data))
  28. self._p = p
  29. self._columns = self._data.columns
  30. self._index = self._data.index
  31. self._intercept = intercept
  32. @cache_readonly
  33. def aic(self):
  34. """Returns the Akaike information criterion."""
  35. return self._ic['aic']
  36. @cache_readonly
  37. def bic(self):
  38. """Returns the Bayesian information criterion."""
  39. return self._ic['bic']
  40. @cache_readonly
  41. def beta(self):
  42. """
  43. Returns a DataFrame, where each column x1 contains the betas
  44. calculated by regressing the x1 column of the VAR input with
  45. the lagged input.
  46. Returns
  47. -------
  48. DataFrame
  49. """
  50. d = dict([(key, value.beta)
  51. for (key, value) in compat.iteritems(self.ols_results)])
  52. return DataFrame(d)
  53. def forecast(self, h):
  54. """
  55. Returns a DataFrame containing the forecasts for 1, 2, ..., n time
  56. steps. Each column x1 contains the forecasts of the x1 column.
  57. Parameters
  58. ----------
  59. n: int
  60. Number of time steps ahead to forecast.
  61. Returns
  62. -------
  63. DataFrame
  64. """
  65. forecast = self._forecast_raw(h)[:, 0, :]
  66. return DataFrame(forecast, index=lrange(1, 1 + h),
  67. columns=self._columns)
  68. def forecast_cov(self, h):
  69. """
  70. Returns the covariance of the forecast residuals.
  71. Returns
  72. -------
  73. DataFrame
  74. """
  75. return [DataFrame(value, index=self._columns, columns=self._columns)
  76. for value in self._forecast_cov_raw(h)]
  77. def forecast_std_err(self, h):
  78. """
  79. Returns the standard errors of the forecast residuals.
  80. Returns
  81. -------
  82. DataFrame
  83. """
  84. return DataFrame(self._forecast_std_err_raw(h),
  85. index=lrange(1, 1 + h), columns=self._columns)
  86. @cache_readonly
  87. def granger_causality(self):
  88. """Returns the f-stats and p-values from the Granger Causality Test.
  89. If the data consists of columns x1, x2, x3, then we perform the
  90. following regressions:
  91. x1 ~ L(x2, x3)
  92. x1 ~ L(x1, x3)
  93. x1 ~ L(x1, x2)
  94. The f-stats of these results are placed in the 'x1' column of the
  95. returned DataFrame. We then repeat for x2, x3.
  96. Returns
  97. -------
  98. Dict, where 'f-stat' returns the DataFrame containing the f-stats,
  99. and 'p-value' returns the DataFrame containing the corresponding
  100. p-values of the f-stats.
  101. """
  102. from pandas.stats.api import ols
  103. from scipy.stats import f
  104. d = {}
  105. for col in self._columns:
  106. d[col] = {}
  107. for i in range(1, 1 + self._p):
  108. lagged_data = self._lagged_data[i].filter(
  109. self._columns - [col])
  110. for key, value in compat.iteritems(lagged_data):
  111. d[col][_make_param_name(i, key)] = value
  112. f_stat_dict = {}
  113. p_value_dict = {}
  114. for col, y in compat.iteritems(self._data):
  115. ssr_full = (self.resid[col] ** 2).sum()
  116. f_stats = []
  117. p_values = []
  118. for col2 in self._columns:
  119. result = ols(y=y, x=d[col2])
  120. resid = result.resid
  121. ssr_reduced = (resid ** 2).sum()
  122. M = self._p
  123. N = self._nobs
  124. K = self._k * self._p + 1
  125. f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
  126. f_stats.append(f_stat)
  127. p_value = f.sf(f_stat, M, N - K)
  128. p_values.append(p_value)
  129. f_stat_dict[col] = Series(f_stats, self._columns)
  130. p_value_dict[col] = Series(p_values, self._columns)
  131. f_stat_mat = DataFrame(f_stat_dict)
  132. p_value_mat = DataFrame(p_value_dict)
  133. return {
  134. 'f-stat': f_stat_mat,
  135. 'p-value': p_value_mat,
  136. }
  137. @cache_readonly
  138. def ols_results(self):
  139. """
  140. Returns the results of the regressions:
  141. x_1 ~ L(X)
  142. x_2 ~ L(X)
  143. ...
  144. x_k ~ L(X)
  145. where X = [x_1, x_2, ..., x_k]
  146. and L(X) represents the columns of X lagged 1, 2, ..., n lags
  147. (n is the user-provided number of lags).
  148. Returns
  149. -------
  150. dict
  151. """
  152. from pandas.stats.api import ols
  153. d = {}
  154. for i in range(1, 1 + self._p):
  155. for col, series in compat.iteritems(self._lagged_data[i]):
  156. d[_make_param_name(i, col)] = series
  157. result = dict([(col, ols(y=y, x=d, intercept=self._intercept))
  158. for col, y in compat.iteritems(self._data)])
  159. return result
  160. @cache_readonly
  161. def resid(self):
  162. """
  163. Returns the DataFrame containing the residuals of the VAR regressions.
  164. Each column x1 contains the residuals generated by regressing the x1
  165. column of the input against the lagged input.
  166. Returns
  167. -------
  168. DataFrame
  169. """
  170. d = dict([(col, series.resid)
  171. for (col, series) in compat.iteritems(self.ols_results)])
  172. return DataFrame(d, index=self._index)
  173. @cache_readonly
  174. def summary(self):
  175. template = """
  176. %(banner_top)s
  177. Number of Observations: %(nobs)d
  178. AIC: %(aic).3f
  179. BIC: %(bic).3f
  180. %(banner_coef)s
  181. %(coef_table)s
  182. %(banner_end)s
  183. """
  184. params = {
  185. 'banner_top': common.banner('Summary of VAR'),
  186. 'banner_coef': common.banner('Summary of Estimated Coefficients'),
  187. 'banner_end': common.banner('End of Summary'),
  188. 'coef_table': self.beta,
  189. 'aic': self.aic,
  190. 'bic': self.bic,
  191. 'nobs': self._nobs,
  192. }
  193. return template % params
  194. @cache_readonly
  195. def _alpha(self):
  196. """
  197. Returns array where the i-th element contains the intercept
  198. when regressing the i-th column of self._data with the lagged data.
  199. """
  200. if self._intercept:
  201. return self._beta_raw[-1]
  202. else:
  203. return np.zeros(self._k)
  204. @cache_readonly
  205. def _beta_raw(self):
  206. return np.array([list(self.beta[col].values()) for col in self._columns]).T
  207. def _trans_B(self, h):
  208. """
  209. Returns 0, 1, ..., (h-1)-th power of transpose of B as defined in
  210. equation (4) on p. 142 of the Stata 11 Time Series reference book.
  211. """
  212. result = [np.eye(1 + self._k * self._p)]
  213. row1 = np.zeros((1, 1 + self._k * self._p))
  214. row1[0, 0] = 1
  215. v = self._alpha.reshape((self._k, 1))
  216. row2 = np.hstack(tuple([v] + self._lag_betas))
  217. m = self._k * (self._p - 1)
  218. row3 = np.hstack((
  219. np.zeros((m, 1)),
  220. np.eye(m),
  221. np.zeros((m, self._k))
  222. ))
  223. trans_B = np.vstack((row1, row2, row3)).T
  224. result.append(trans_B)
  225. for i in range(2, h):
  226. result.append(np.dot(trans_B, result[i - 1]))
  227. return result
  228. @cache_readonly
  229. def _x(self):
  230. values = np.array([
  231. list(self._lagged_data[i][col].values())
  232. for i in range(1, 1 + self._p)
  233. for col in self._columns
  234. ]).T
  235. x = np.hstack((np.ones((len(values), 1)), values))[self._p:]
  236. return x
  237. @cache_readonly
  238. def _cov_beta(self):
  239. cov_resid = self._sigma
  240. x = self._x
  241. inv_cov_x = inv(np.dot(x.T, x))
  242. return np.kron(inv_cov_x, cov_resid)
  243. def _data_xs(self, i):
  244. """
  245. Returns the cross-section of the data at the given timestep.
  246. """
  247. return self._data.values[i]
  248. def _forecast_cov_raw(self, n):
  249. resid = self._forecast_cov_resid_raw(n)
  250. # beta = self._forecast_cov_beta_raw(n)
  251. # return [a + b for a, b in zip(resid, beta)]
  252. # TODO: ignore the beta forecast std err until it's verified
  253. return resid
  254. def _forecast_cov_beta_raw(self, n):
  255. """
  256. Returns the covariance of the beta errors for the forecast at
  257. 1, 2, ..., n timesteps.
  258. """
  259. p = self._p
  260. values = self._data.values
  261. T = len(values) - self._p - 1
  262. results = []
  263. for h in range(1, n + 1):
  264. psi = self._psi(h)
  265. trans_B = self._trans_B(h)
  266. sum = 0
  267. cov_beta = self._cov_beta
  268. for t in range(T + 1):
  269. index = t + p
  270. y = values.take(lrange(index, index - p, -1), axis=0).ravel()
  271. trans_Z = np.hstack(([1], y))
  272. trans_Z = trans_Z.reshape(1, len(trans_Z))
  273. sum2 = 0
  274. for i in range(h):
  275. ZB = np.dot(trans_Z, trans_B[h - 1 - i])
  276. prod = np.kron(ZB, psi[i])
  277. sum2 = sum2 + prod
  278. sum = sum + chain_dot(sum2, cov_beta, sum2.T)
  279. results.append(sum / (T + 1))
  280. return results
  281. def _forecast_cov_resid_raw(self, h):
  282. """
  283. Returns the covariance of the residual errors for the forecast at
  284. 1, 2, ..., h timesteps.
  285. """
  286. psi_values = self._psi(h)
  287. sum = 0
  288. result = []
  289. for i in range(h):
  290. psi = psi_values[i]
  291. sum = sum + chain_dot(psi, self._sigma, psi.T)
  292. result.append(sum)
  293. return result
  294. def _forecast_raw(self, h):
  295. """
  296. Returns the forecast at 1, 2, ..., h timesteps in the future.
  297. """
  298. k = self._k
  299. result = []
  300. for i in range(h):
  301. sum = self._alpha.reshape(1, k)
  302. for j in range(self._p):
  303. beta = self._lag_betas[j]
  304. idx = i - j
  305. if idx > 0:
  306. y = result[idx - 1]
  307. else:
  308. y = self._data_xs(idx - 1)
  309. sum = sum + np.dot(beta, y.T).T
  310. result.append(sum)
  311. return np.array(result)
  312. def _forecast_std_err_raw(self, h):
  313. """
  314. Returns the standard error of the forecasts
  315. at 1, 2, ..., n timesteps.
  316. """
  317. return np.array([np.sqrt(np.diag(value))
  318. for value in self._forecast_cov_raw(h)])
  319. @cache_readonly
  320. def _ic(self):
  321. """
  322. Returns the Akaike/Bayesian information criteria.
  323. """
  324. RSS = self._rss
  325. k = self._p * (self._k * self._p + 1)
  326. n = self._nobs * self._k
  327. return {'aic': 2 * k + n * np.log(RSS / n),
  328. 'bic': n * np.log(RSS / n) + k * np.log(n)}
  329. @cache_readonly
  330. def _k(self):
  331. return len(self._columns)
  332. @cache_readonly
  333. def _lag_betas(self):
  334. """
  335. Returns list of B_i, where B_i represents the (k, k) matrix
  336. with the j-th row containing the betas of regressing the j-th
  337. column of self._data with self._data lagged i time steps.
  338. First element is B_1, second element is B_2, etc.
  339. """
  340. k = self._k
  341. b = self._beta_raw
  342. return [b[k * i: k * (i + 1)].T for i in range(self._p)]
  343. @cache_readonly
  344. def _lagged_data(self):
  345. return dict([(i, self._data.shift(i))
  346. for i in range(1, 1 + self._p)])
  347. @cache_readonly
  348. def _nobs(self):
  349. return len(self._data) - self._p
  350. def _psi(self, h):
  351. """
  352. psi value used for calculating standard error.
  353. Returns [psi_0, psi_1, ..., psi_(h - 1)]
  354. """
  355. k = self._k
  356. result = [np.eye(k)]
  357. for i in range(1, h):
  358. result.append(sum(
  359. [np.dot(result[i - j], self._lag_betas[j - 1])
  360. for j in range(1, 1 + i)
  361. if j <= self._p]))
  362. return result
  363. @cache_readonly
  364. def _resid_raw(self):
  365. resid = np.array([self.ols_results[col]._resid_raw
  366. for col in self._columns])
  367. return resid
  368. @cache_readonly
  369. def _rss(self):
  370. """Returns the sum of the squares of the residuals."""
  371. return (self._resid_raw ** 2).sum()
  372. @cache_readonly
  373. def _sigma(self):
  374. """Returns covariance of resids."""
  375. k = self._k
  376. n = self._nobs
  377. resid = self._resid_raw
  378. return np.dot(resid, resid.T) / (n - k)
  379. def __unicode__(self):
  380. return self.summary
  381. def lag_select(data, max_lags=5, ic=None):
  382. """
  383. Select number of lags based on a variety of information criteria
  384. Parameters
  385. ----------
  386. data : DataFrame-like
  387. max_lags : int
  388. Maximum number of lags to evaluate
  389. ic : {None, 'aic', 'bic', ...}
  390. Choosing None will just display the results
  391. Returns
  392. -------
  393. None
  394. """
  395. pass
  396. class PanelVAR(VAR):
  397. """
  398. Performs Vector Autoregression on panel data.
  399. Parameters
  400. ----------
  401. data: Panel or dict of DataFrame
  402. lags: int
  403. """
  404. def __init__(self, data, lags, intercept=True):
  405. self._data = _prep_panel_data(data)
  406. self._p = lags
  407. self._intercept = intercept
  408. self._columns = self._data.items
  409. @cache_readonly
  410. def _nobs(self):
  411. """Returns the number of observations."""
  412. _, timesteps, entities = self._data.values.shape
  413. return (timesteps - self._p) * entities
  414. @cache_readonly
  415. def _rss(self):
  416. """Returns the sum of the squares of the residuals."""
  417. return (self.resid.values ** 2).sum()
  418. def forecast(self, h):
  419. """
  420. Returns the forecasts at 1, 2, ..., n timesteps in the future.
  421. """
  422. forecast = self._forecast_raw(h).T.swapaxes(1, 2)
  423. index = lrange(1, 1 + h)
  424. w = Panel(forecast, items=self._data.items, major_axis=index,
  425. minor_axis=self._data.minor_axis)
  426. return w
  427. @cache_readonly
  428. def resid(self):
  429. """
  430. Returns the DataFrame containing the residuals of the VAR regressions.
  431. Each column x1 contains the residuals generated by regressing the x1
  432. column of the input against the lagged input.
  433. Returns
  434. -------
  435. DataFrame
  436. """
  437. d = dict([(key, value.resid)
  438. for (key, value) in compat.iteritems(self.ols_results)])
  439. return Panel.fromDict(d)
  440. def _data_xs(self, i):
  441. return self._data.values[:, i, :].T
  442. @cache_readonly
  443. def _sigma(self):
  444. """Returns covariance of resids."""
  445. k = self._k
  446. resid = _drop_incomplete_rows(self.resid.toLong().values)
  447. n = len(resid)
  448. return np.dot(resid.T, resid) / (n - k)
  449. def _prep_panel_data(data):
  450. """Converts the given data into a Panel."""
  451. if isinstance(data, Panel):
  452. return data
  453. return Panel.fromDict(data)
  454. def _drop_incomplete_rows(array):
  455. mask = np.isfinite(array).all(1)
  456. indices = np.arange(len(array))[mask]
  457. return array.take(indices, 0)
  458. def _make_param_name(lag, name):
  459. return 'L%d.%s' % (lag, name)
  460. def chain_dot(*matrices):
  461. """
  462. Returns the dot product of the given matrices.
  463. Parameters
  464. ----------
  465. matrices: argument list of ndarray
  466. """
  467. return reduce(lambda x, y: np.dot(y, x), matrices[::-1])