/pandas/stats/var.py
Python | 605 lines | 457 code | 51 blank | 97 comment | 19 complexity | 77f730283600318b2783e40a1e9cad9d MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
- # flake8: noqa
- from __future__ import division
- from pandas.compat import range, lrange, zip, reduce
- from pandas import compat
- import numpy as np
- from pandas.core.base import StringMixin
- from pandas.util.decorators import cache_readonly
- from pandas.core.frame import DataFrame
- from pandas.core.panel import Panel
- from pandas.core.series import Series
- import pandas.stats.common as common
- from pandas.stats.math import inv
- from pandas.stats.ols import _combine_rhs
- class VAR(StringMixin):
- """
- Estimates VAR(p) regression on multivariate time series data
- presented in pandas data structures.
- Parameters
- ----------
- data : DataFrame or dict of Series
- p : lags to include
- """
- def __init__(self, data, p=1, intercept=True):
- import warnings
- warnings.warn("The pandas.stats.var module is deprecated and will be "
- "removed in a future version. We refer to external packages "
- "like statsmodels, see some examples here: "
- "http://www.statsmodels.org/stable/vector_ar.html#var",
- FutureWarning, stacklevel=4)
- try:
- import statsmodels.tsa.vector_ar.api as sm_var
- except ImportError:
- import scikits.statsmodels.tsa.var as sm_var
- self._data = DataFrame(_combine_rhs(data))
- self._p = p
- self._columns = self._data.columns
- self._index = self._data.index
- self._intercept = intercept
- @cache_readonly
- def aic(self):
- """Returns the Akaike information criterion."""
- return self._ic['aic']
- @cache_readonly
- def bic(self):
- """Returns the Bayesian information criterion."""
- return self._ic['bic']
- @cache_readonly
- def beta(self):
- """
- Returns a DataFrame, where each column x1 contains the betas
- calculated by regressing the x1 column of the VAR input with
- the lagged input.
- Returns
- -------
- DataFrame
- """
- d = dict([(key, value.beta)
- for (key, value) in compat.iteritems(self.ols_results)])
- return DataFrame(d)
- def forecast(self, h):
- """
- Returns a DataFrame containing the forecasts for 1, 2, ..., n time
- steps. Each column x1 contains the forecasts of the x1 column.
- Parameters
- ----------
- n: int
- Number of time steps ahead to forecast.
- Returns
- -------
- DataFrame
- """
- forecast = self._forecast_raw(h)[:, 0, :]
- return DataFrame(forecast, index=lrange(1, 1 + h),
- columns=self._columns)
- def forecast_cov(self, h):
- """
- Returns the covariance of the forecast residuals.
- Returns
- -------
- DataFrame
- """
- return [DataFrame(value, index=self._columns, columns=self._columns)
- for value in self._forecast_cov_raw(h)]
- def forecast_std_err(self, h):
- """
- Returns the standard errors of the forecast residuals.
- Returns
- -------
- DataFrame
- """
- return DataFrame(self._forecast_std_err_raw(h),
- index=lrange(1, 1 + h), columns=self._columns)
- @cache_readonly
- def granger_causality(self):
- """Returns the f-stats and p-values from the Granger Causality Test.
- If the data consists of columns x1, x2, x3, then we perform the
- following regressions:
- x1 ~ L(x2, x3)
- x1 ~ L(x1, x3)
- x1 ~ L(x1, x2)
- The f-stats of these results are placed in the 'x1' column of the
- returned DataFrame. We then repeat for x2, x3.
- Returns
- -------
- Dict, where 'f-stat' returns the DataFrame containing the f-stats,
- and 'p-value' returns the DataFrame containing the corresponding
- p-values of the f-stats.
- """
- from pandas.stats.api import ols
- from scipy.stats import f
- d = {}
- for col in self._columns:
- d[col] = {}
- for i in range(1, 1 + self._p):
- lagged_data = self._lagged_data[i].filter(
- self._columns - [col])
- for key, value in compat.iteritems(lagged_data):
- d[col][_make_param_name(i, key)] = value
- f_stat_dict = {}
- p_value_dict = {}
- for col, y in compat.iteritems(self._data):
- ssr_full = (self.resid[col] ** 2).sum()
- f_stats = []
- p_values = []
- for col2 in self._columns:
- result = ols(y=y, x=d[col2])
- resid = result.resid
- ssr_reduced = (resid ** 2).sum()
- M = self._p
- N = self._nobs
- K = self._k * self._p + 1
- f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
- f_stats.append(f_stat)
- p_value = f.sf(f_stat, M, N - K)
- p_values.append(p_value)
- f_stat_dict[col] = Series(f_stats, self._columns)
- p_value_dict[col] = Series(p_values, self._columns)
- f_stat_mat = DataFrame(f_stat_dict)
- p_value_mat = DataFrame(p_value_dict)
- return {
- 'f-stat': f_stat_mat,
- 'p-value': p_value_mat,
- }
- @cache_readonly
- def ols_results(self):
- """
- Returns the results of the regressions:
- x_1 ~ L(X)
- x_2 ~ L(X)
- ...
- x_k ~ L(X)
- where X = [x_1, x_2, ..., x_k]
- and L(X) represents the columns of X lagged 1, 2, ..., n lags
- (n is the user-provided number of lags).
- Returns
- -------
- dict
- """
- from pandas.stats.api import ols
- d = {}
- for i in range(1, 1 + self._p):
- for col, series in compat.iteritems(self._lagged_data[i]):
- d[_make_param_name(i, col)] = series
- result = dict([(col, ols(y=y, x=d, intercept=self._intercept))
- for col, y in compat.iteritems(self._data)])
- return result
- @cache_readonly
- def resid(self):
- """
- Returns the DataFrame containing the residuals of the VAR regressions.
- Each column x1 contains the residuals generated by regressing the x1
- column of the input against the lagged input.
- Returns
- -------
- DataFrame
- """
- d = dict([(col, series.resid)
- for (col, series) in compat.iteritems(self.ols_results)])
- return DataFrame(d, index=self._index)
- @cache_readonly
- def summary(self):
- template = """
- %(banner_top)s
- Number of Observations: %(nobs)d
- AIC: %(aic).3f
- BIC: %(bic).3f
- %(banner_coef)s
- %(coef_table)s
- %(banner_end)s
- """
- params = {
- 'banner_top': common.banner('Summary of VAR'),
- 'banner_coef': common.banner('Summary of Estimated Coefficients'),
- 'banner_end': common.banner('End of Summary'),
- 'coef_table': self.beta,
- 'aic': self.aic,
- 'bic': self.bic,
- 'nobs': self._nobs,
- }
- return template % params
- @cache_readonly
- def _alpha(self):
- """
- Returns array where the i-th element contains the intercept
- when regressing the i-th column of self._data with the lagged data.
- """
- if self._intercept:
- return self._beta_raw[-1]
- else:
- return np.zeros(self._k)
- @cache_readonly
- def _beta_raw(self):
- return np.array([list(self.beta[col].values()) for col in self._columns]).T
- def _trans_B(self, h):
- """
- Returns 0, 1, ..., (h-1)-th power of transpose of B as defined in
- equation (4) on p. 142 of the Stata 11 Time Series reference book.
- """
- result = [np.eye(1 + self._k * self._p)]
- row1 = np.zeros((1, 1 + self._k * self._p))
- row1[0, 0] = 1
- v = self._alpha.reshape((self._k, 1))
- row2 = np.hstack(tuple([v] + self._lag_betas))
- m = self._k * (self._p - 1)
- row3 = np.hstack((
- np.zeros((m, 1)),
- np.eye(m),
- np.zeros((m, self._k))
- ))
- trans_B = np.vstack((row1, row2, row3)).T
- result.append(trans_B)
- for i in range(2, h):
- result.append(np.dot(trans_B, result[i - 1]))
- return result
- @cache_readonly
- def _x(self):
- values = np.array([
- list(self._lagged_data[i][col].values())
- for i in range(1, 1 + self._p)
- for col in self._columns
- ]).T
- x = np.hstack((np.ones((len(values), 1)), values))[self._p:]
- return x
- @cache_readonly
- def _cov_beta(self):
- cov_resid = self._sigma
- x = self._x
- inv_cov_x = inv(np.dot(x.T, x))
- return np.kron(inv_cov_x, cov_resid)
- def _data_xs(self, i):
- """
- Returns the cross-section of the data at the given timestep.
- """
- return self._data.values[i]
- def _forecast_cov_raw(self, n):
- resid = self._forecast_cov_resid_raw(n)
- # beta = self._forecast_cov_beta_raw(n)
- # return [a + b for a, b in zip(resid, beta)]
- # TODO: ignore the beta forecast std err until it's verified
- return resid
- def _forecast_cov_beta_raw(self, n):
- """
- Returns the covariance of the beta errors for the forecast at
- 1, 2, ..., n timesteps.
- """
- p = self._p
- values = self._data.values
- T = len(values) - self._p - 1
- results = []
- for h in range(1, n + 1):
- psi = self._psi(h)
- trans_B = self._trans_B(h)
- sum = 0
- cov_beta = self._cov_beta
- for t in range(T + 1):
- index = t + p
- y = values.take(lrange(index, index - p, -1), axis=0).ravel()
- trans_Z = np.hstack(([1], y))
- trans_Z = trans_Z.reshape(1, len(trans_Z))
- sum2 = 0
- for i in range(h):
- ZB = np.dot(trans_Z, trans_B[h - 1 - i])
- prod = np.kron(ZB, psi[i])
- sum2 = sum2 + prod
- sum = sum + chain_dot(sum2, cov_beta, sum2.T)
- results.append(sum / (T + 1))
- return results
- def _forecast_cov_resid_raw(self, h):
- """
- Returns the covariance of the residual errors for the forecast at
- 1, 2, ..., h timesteps.
- """
- psi_values = self._psi(h)
- sum = 0
- result = []
- for i in range(h):
- psi = psi_values[i]
- sum = sum + chain_dot(psi, self._sigma, psi.T)
- result.append(sum)
- return result
- def _forecast_raw(self, h):
- """
- Returns the forecast at 1, 2, ..., h timesteps in the future.
- """
- k = self._k
- result = []
- for i in range(h):
- sum = self._alpha.reshape(1, k)
- for j in range(self._p):
- beta = self._lag_betas[j]
- idx = i - j
- if idx > 0:
- y = result[idx - 1]
- else:
- y = self._data_xs(idx - 1)
- sum = sum + np.dot(beta, y.T).T
- result.append(sum)
- return np.array(result)
- def _forecast_std_err_raw(self, h):
- """
- Returns the standard error of the forecasts
- at 1, 2, ..., n timesteps.
- """
- return np.array([np.sqrt(np.diag(value))
- for value in self._forecast_cov_raw(h)])
- @cache_readonly
- def _ic(self):
- """
- Returns the Akaike/Bayesian information criteria.
- """
- RSS = self._rss
- k = self._p * (self._k * self._p + 1)
- n = self._nobs * self._k
- return {'aic': 2 * k + n * np.log(RSS / n),
- 'bic': n * np.log(RSS / n) + k * np.log(n)}
- @cache_readonly
- def _k(self):
- return len(self._columns)
- @cache_readonly
- def _lag_betas(self):
- """
- Returns list of B_i, where B_i represents the (k, k) matrix
- with the j-th row containing the betas of regressing the j-th
- column of self._data with self._data lagged i time steps.
- First element is B_1, second element is B_2, etc.
- """
- k = self._k
- b = self._beta_raw
- return [b[k * i: k * (i + 1)].T for i in range(self._p)]
- @cache_readonly
- def _lagged_data(self):
- return dict([(i, self._data.shift(i))
- for i in range(1, 1 + self._p)])
- @cache_readonly
- def _nobs(self):
- return len(self._data) - self._p
- def _psi(self, h):
- """
- psi value used for calculating standard error.
- Returns [psi_0, psi_1, ..., psi_(h - 1)]
- """
- k = self._k
- result = [np.eye(k)]
- for i in range(1, h):
- result.append(sum(
- [np.dot(result[i - j], self._lag_betas[j - 1])
- for j in range(1, 1 + i)
- if j <= self._p]))
- return result
- @cache_readonly
- def _resid_raw(self):
- resid = np.array([self.ols_results[col]._resid_raw
- for col in self._columns])
- return resid
- @cache_readonly
- def _rss(self):
- """Returns the sum of the squares of the residuals."""
- return (self._resid_raw ** 2).sum()
- @cache_readonly
- def _sigma(self):
- """Returns covariance of resids."""
- k = self._k
- n = self._nobs
- resid = self._resid_raw
- return np.dot(resid, resid.T) / (n - k)
- def __unicode__(self):
- return self.summary
- def lag_select(data, max_lags=5, ic=None):
- """
- Select number of lags based on a variety of information criteria
- Parameters
- ----------
- data : DataFrame-like
- max_lags : int
- Maximum number of lags to evaluate
- ic : {None, 'aic', 'bic', ...}
- Choosing None will just display the results
- Returns
- -------
- None
- """
- pass
- class PanelVAR(VAR):
- """
- Performs Vector Autoregression on panel data.
- Parameters
- ----------
- data: Panel or dict of DataFrame
- lags: int
- """
- def __init__(self, data, lags, intercept=True):
- self._data = _prep_panel_data(data)
- self._p = lags
- self._intercept = intercept
- self._columns = self._data.items
- @cache_readonly
- def _nobs(self):
- """Returns the number of observations."""
- _, timesteps, entities = self._data.values.shape
- return (timesteps - self._p) * entities
- @cache_readonly
- def _rss(self):
- """Returns the sum of the squares of the residuals."""
- return (self.resid.values ** 2).sum()
- def forecast(self, h):
- """
- Returns the forecasts at 1, 2, ..., n timesteps in the future.
- """
- forecast = self._forecast_raw(h).T.swapaxes(1, 2)
- index = lrange(1, 1 + h)
- w = Panel(forecast, items=self._data.items, major_axis=index,
- minor_axis=self._data.minor_axis)
- return w
- @cache_readonly
- def resid(self):
- """
- Returns the DataFrame containing the residuals of the VAR regressions.
- Each column x1 contains the residuals generated by regressing the x1
- column of the input against the lagged input.
- Returns
- -------
- DataFrame
- """
- d = dict([(key, value.resid)
- for (key, value) in compat.iteritems(self.ols_results)])
- return Panel.fromDict(d)
- def _data_xs(self, i):
- return self._data.values[:, i, :].T
- @cache_readonly
- def _sigma(self):
- """Returns covariance of resids."""
- k = self._k
- resid = _drop_incomplete_rows(self.resid.toLong().values)
- n = len(resid)
- return np.dot(resid.T, resid) / (n - k)
- def _prep_panel_data(data):
- """Converts the given data into a Panel."""
- if isinstance(data, Panel):
- return data
- return Panel.fromDict(data)
- def _drop_incomplete_rows(array):
- mask = np.isfinite(array).all(1)
- indices = np.arange(len(array))[mask]
- return array.take(indices, 0)
- def _make_param_name(lag, name):
- return 'L%d.%s' % (lag, name)
- def chain_dot(*matrices):
- """
- Returns the dot product of the given matrices.
- Parameters
- ----------
- matrices: argument list of ndarray
- """
- return reduce(lambda x, y: np.dot(y, x), matrices[::-1])