PageRenderTime 71ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/statsmodels/tsa/ar_model.py

http://github.com/statsmodels/statsmodels
Python | 1923 lines | 1881 code | 19 blank | 23 comment | 7 complexity | 346b37557ff1e705265984996bbf06e9 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. # -*- coding: utf-8 -*-
  2. from __future__ import annotations
  3. from statsmodels.compat.pandas import (
  4. Appender,
  5. Substitution,
  6. call_cached_func,
  7. to_numpy,
  8. )
  9. from collections.abc import Iterable
  10. import datetime as dt
  11. from types import SimpleNamespace
  12. from typing import List, Tuple
  13. import warnings
  14. import numpy as np
  15. import pandas as pd
  16. from scipy.stats import gaussian_kde, norm
  17. import statsmodels.base.wrapper as wrap
  18. from statsmodels.iolib.summary import Summary
  19. from statsmodels.regression.linear_model import OLS
  20. from statsmodels.tools import eval_measures
  21. from statsmodels.tools.decorators import cache_readonly, cache_writable
  22. from statsmodels.tools.docstring import Docstring, remove_parameters
  23. from statsmodels.tools.sm_exceptions import SpecificationWarning
  24. from statsmodels.tools.validation import (
  25. array_like,
  26. bool_like,
  27. int_like,
  28. string_like,
  29. )
  30. from statsmodels.tsa.arima_process import arma2ma
  31. from statsmodels.tsa.base import tsa_model
  32. from statsmodels.tsa.base.prediction import PredictionResults
  33. from statsmodels.tsa.deterministic import (
  34. DeterministicProcess,
  35. Seasonality,
  36. TimeTrend,
  37. )
  38. from statsmodels.tsa.tsatools import freq_to_period, lagmat
  39. __all__ = ["AR", "AutoReg"]
  40. AR_DEPRECATION_WARN = """
  41. statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
  42. statsmodels.tsa.SARIMAX.
  43. AutoReg adds the ability to specify exogenous variables, include time trends,
  44. and add seasonal dummies. The AutoReg API differs from AR since the model is
  45. treated as immutable, and so the entire specification including the lag
  46. length must be specified when creating the model. This change is too
  47. substantial to incorporate into the existing AR api. The function
  48. ar_select_order performs lag length selection for AutoReg models.
  49. AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
  50. estimate ARX and related models using full MLE via the Kalman Filter.
  51. To silence this warning and continue using AR until it is removed, use:
  52. import warnings
  53. warnings.filterwarnings('ignore', 'statsmodels.tsa.ar_model.AR', FutureWarning)
  54. """
  55. REPEATED_FIT_ERROR = """
  56. Model has been fit using maxlag={0}, method={1}, ic={2}, trend={3}. These
  57. cannot be changed in subsequent calls to `fit`. Instead, use a new instance of
  58. AR.
  59. """
  60. def sumofsq(x, axis=0):
  61. """Helper function to calculate sum of squares along first axis"""
  62. return np.sum(x ** 2, axis=axis)
  63. def _get_period(data, index_freq):
  64. """Shared helper to get period from frequenc or raise"""
  65. if data.freq:
  66. return freq_to_period(index_freq)
  67. raise ValueError(
  68. "freq cannot be inferred from endog and model includes seasonal "
  69. "terms. The number of periods must be explicitly set when the "
  70. "endog's index does not contain a frequency."
  71. )
  72. class AutoReg(tsa_model.TimeSeriesModel):
  73. """
  74. Autoregressive AR-X(p) model
  75. Estimate an AR-X model using Conditional Maximum Likelihood (OLS).
  76. Parameters
  77. ----------
  78. endog : array_like
  79. A 1-d endogenous response variable. The dependent variable.
  80. lags : {None, int, list[int]}
  81. The number of lags to include in the model if an integer or the
  82. list of lag indices to include. For example, [1, 4] will only
  83. include lags 1 and 4 while lags=4 will include lags 1, 2, 3, and 4.
  84. None excludes all AR lags, and behave identically to 0.
  85. trend : {'n', 'c', 't', 'ct'}
  86. The trend to include in the model:
  87. * 'n' - No trend.
  88. * 'c' - Constant only.
  89. * 't' - Time trend only.
  90. * 'ct' - Constant and time trend.
  91. seasonal : bool
  92. Flag indicating whether to include seasonal dummies in the model. If
  93. seasonal is True and trend includes 'c', then the first period
  94. is excluded from the seasonal terms.
  95. exog : array_like, optional
  96. Exogenous variables to include in the model. Must have the same number
  97. of observations as endog and should be aligned so that endog[i] is
  98. regressed on exog[i].
  99. hold_back : {None, int}
  100. Initial observations to exclude from the estimation sample. If None,
  101. then hold_back is equal to the maximum lag in the model. Set to a
  102. non-zero value to produce comparable models with different lag
  103. length. For example, to compare the fit of a model with lags=3 and
  104. lags=1, set hold_back=3 which ensures that both models are estimated
  105. using observations 3,...,nobs. hold_back must be >= the maximum lag in
  106. the model.
  107. period : {None, int}
  108. The period of the data. Only used if seasonal is True. This parameter
  109. can be omitted if using a pandas object for endog that contains a
  110. recognized frequency.
  111. missing : str
  112. Available options are 'none', 'drop', and 'raise'. If 'none', no nan
  113. checking is done. If 'drop', any observations with nans are dropped.
  114. If 'raise', an error is raised. Default is 'none'.
  115. deterministic : DeterministicProcess
  116. A deterministic process. If provided, trend and seasonal are ignored.
  117. A warning is raised if trend is not "n" and seasonal is not False.
  118. old_names : bool
  119. Flag indicating whether to use the v0.11 names or the v0.12+ names.
  120. .. deprecated:: 0.13
  121. old_names is deprecated and will be removed after 0.14 is
  122. released. You must update any code reliant on the old variable
  123. names to use the new names.
  124. See Also
  125. --------
  126. statsmodels.tsa.statespace.sarimax.SARIMAX
  127. Estimation of SARIMAX models using exact likelihood and the
  128. Kalman Filter.
  129. Examples
  130. --------
  131. >>> import statsmodels.api as sm
  132. >>> from statsmodels.tsa.ar_model import AutoReg
  133. >>> data = sm.datasets.sunspots.load_pandas().data['SUNACTIVITY']
  134. >>> out = 'AIC: {0:0.3f}, HQIC: {1:0.3f}, BIC: {2:0.3f}'
  135. Start by fitting an unrestricted Seasonal AR model
  136. >>> res = AutoReg(data, lags = [1, 11, 12]).fit()
  137. >>> print(out.format(res.aic, res.hqic, res.bic))
  138. AIC: 5.945, HQIC: 5.970, BIC: 6.007
  139. An alternative used seasonal dummies
  140. >>> res = AutoReg(data, lags=1, seasonal=True, period=11).fit()
  141. >>> print(out.format(res.aic, res.hqic, res.bic))
  142. AIC: 6.017, HQIC: 6.080, BIC: 6.175
  143. Finally, both the seasonal AR structure and dummies can be included
  144. >>> res = AutoReg(data, lags=[1, 11, 12], seasonal=True, period=11).fit()
  145. >>> print(out.format(res.aic, res.hqic, res.bic))
  146. AIC: 5.884, HQIC: 5.959, BIC: 6.071
  147. """
  148. def __init__(
  149. self,
  150. endog,
  151. lags,
  152. trend="c",
  153. seasonal=False,
  154. exog=None,
  155. hold_back=None,
  156. period=None,
  157. missing="none",
  158. *,
  159. deterministic=None,
  160. old_names=False,
  161. ):
  162. super().__init__(endog, exog, None, None, missing=missing)
  163. self._trend = string_like(
  164. trend, "trend", options=("n", "c", "t", "ct"), optional=False
  165. )
  166. self._seasonal = bool_like(seasonal, "seasonal")
  167. self._period = int_like(period, "period", optional=True)
  168. if self._period is None and self._seasonal:
  169. self._period = _get_period(self.data, self._index_freq)
  170. terms = [TimeTrend.from_string(self._trend)]
  171. if seasonal:
  172. terms.append(Seasonality(self._period))
  173. if hasattr(self.data.orig_endog, "index"):
  174. index = self.data.orig_endog.index
  175. else:
  176. index = np.arange(self.data.endog.shape[0])
  177. self._user_deterministic = False
  178. if deterministic is not None:
  179. if not isinstance(deterministic, DeterministicProcess):
  180. raise TypeError("deterministic must be a DeterministicProcess")
  181. self._deterministics = deterministic
  182. self._user_deterministic = True
  183. else:
  184. self._deterministics = DeterministicProcess(
  185. index, additional_terms=terms
  186. )
  187. self._lags = lags
  188. self._exog_names = []
  189. self._k_ar = 0
  190. self._hold_back = int_like(hold_back, "hold_back", optional=True)
  191. self._old_names = bool_like(old_names, "old_names", optional=False)
  192. if deterministic is not None and (
  193. self._trend != "n" or self._seasonal
  194. ):
  195. warnings.warn(
  196. 'When using deterministic, trend must be "n" and '
  197. "seasonal must be False.",
  198. SpecificationWarning,
  199. )
  200. if self._old_names:
  201. warnings.warn(
  202. "old_names will be removed after the 0.14 release. You should "
  203. "stop setting this parameter and use the new names.",
  204. FutureWarning,
  205. )
  206. self._lags, self._hold_back = self._check_lags()
  207. self._setup_regressors()
  208. self.nobs = self._y.shape[0]
  209. self.data.xnames = self.exog_names
  210. @property
  211. def ar_lags(self):
  212. """The autoregressive lags included in the model"""
  213. lags = list(self._lags)
  214. return None if not lags else lags
  215. @property
  216. def hold_back(self):
  217. """The number of initial obs. excluded from the estimation sample."""
  218. return self._hold_back
  219. @property
  220. def trend(self):
  221. """The trend used in the model."""
  222. return self._trend
  223. @property
  224. def seasonal(self):
  225. """Flag indicating that the model contains a seasonal component."""
  226. return self._seasonal
  227. @property
  228. def deterministic(self):
  229. """The deterministic used to construct the model"""
  230. return self._deterministics if self._user_deterministic else None
  231. @property
  232. def period(self):
  233. """The period of the seasonal component."""
  234. return self._period
  235. @property
  236. def df_model(self):
  237. """The model degrees of freedom."""
  238. return self._x.shape[1]
  239. @property
  240. def exog_names(self):
  241. """Names of exogenous variables included in model"""
  242. return self._exog_names
  243. def initialize(self):
  244. """Initialize the model (no-op)."""
  245. pass
  246. def _check_lags(self) -> Tuple[List[int], int]:
  247. lags = self._lags
  248. if lags is None:
  249. lags = []
  250. self._maxlag = 0
  251. elif isinstance(lags, Iterable):
  252. lags = np.array(sorted([int_like(lag, "lags") for lag in lags]))
  253. if np.any(lags < 1) or np.unique(lags).shape[0] != lags.shape[0]:
  254. raise ValueError(
  255. "All values in lags must be positive and distinct."
  256. )
  257. self._maxlag = np.max(lags)
  258. else:
  259. self._maxlag = int_like(lags, "lags")
  260. if self._maxlag < 0:
  261. raise ValueError("lags must be a non-negative scalar.")
  262. lags = np.arange(1, self._maxlag + 1)
  263. hold_back = self._hold_back
  264. if hold_back is None:
  265. hold_back = self._maxlag
  266. if hold_back < self._maxlag:
  267. raise ValueError(
  268. "hold_back must be >= lags if lags is an int or"
  269. "max(lags) if lags is array_like."
  270. )
  271. return list(lags), int(hold_back)
  272. def _setup_regressors(self):
  273. maxlag = self._maxlag
  274. hold_back = self._hold_back
  275. exog_names = []
  276. endog_names = self.endog_names
  277. x, y = lagmat(self.endog, maxlag, original="sep")
  278. exog_names.extend(
  279. [endog_names + ".L{0}".format(lag) for lag in self._lags]
  280. )
  281. if len(self._lags) < maxlag:
  282. x = x[:, np.asarray(self._lags) - 1]
  283. self._k_ar = x.shape[1]
  284. deterministic = self._deterministics.in_sample()
  285. if deterministic.shape[1]:
  286. x = np.c_[to_numpy(deterministic), x]
  287. if self._old_names:
  288. deterministic_names = []
  289. if "c" in self._trend:
  290. deterministic_names.append("intercept")
  291. if "t" in self._trend:
  292. deterministic_names.append("trend")
  293. if self._seasonal:
  294. period = self._period
  295. names = ["seasonal.{0}".format(i) for i in range(period)]
  296. if "c" in self._trend:
  297. names = names[1:]
  298. deterministic_names.extend(names)
  299. else:
  300. deterministic_names = list(deterministic.columns)
  301. exog_names = deterministic_names + exog_names
  302. if self.exog is not None:
  303. x = np.c_[x, self.exog]
  304. exog_names.extend(self.data.param_names)
  305. y = y[hold_back:]
  306. x = x[hold_back:]
  307. if y.shape[0] < x.shape[1]:
  308. reg = x.shape[1]
  309. period = self._period
  310. trend = 0 if self._trend == "n" else len(self._trend)
  311. seas = 0 if not self._seasonal else period - ("c" in self._trend)
  312. lags = len(self._lags)
  313. nobs = y.shape[0]
  314. raise ValueError(
  315. "The model specification cannot be estimated. "
  316. f"The model contains {reg} regressors ({trend} trend, "
  317. f"{seas} seasonal, {lags} lags) but after adjustment "
  318. "for hold_back and creation of the lags, there "
  319. f"are only {nobs} data points available to estimate "
  320. "parameters."
  321. )
  322. self._y, self._x = y, x
  323. self._exog_names = exog_names
  324. def fit(self, cov_type="nonrobust", cov_kwds=None, use_t=False):
  325. """
  326. Estimate the model parameters.
  327. Parameters
  328. ----------
  329. cov_type : str
  330. The covariance estimator to use. The most common choices are listed
  331. below. Supports all covariance estimators that are available
  332. in ``OLS.fit``.
  333. * 'nonrobust' - The class OLS covariance estimator that assumes
  334. homoskedasticity.
  335. * 'HC0', 'HC1', 'HC2', 'HC3' - Variants of White's
  336. (or Eiker-Huber-White) covariance estimator. `HC0` is the
  337. standard implementation. The other make corrections to improve
  338. the finite sample performance of the heteroskedasticity robust
  339. covariance estimator.
  340. * 'HAC' - Heteroskedasticity-autocorrelation robust covariance
  341. estimation. Supports cov_kwds.
  342. - `maxlags` integer (required) : number of lags to use.
  343. - `kernel` callable or str (optional) : kernel
  344. currently available kernels are ['bartlett', 'uniform'],
  345. default is Bartlett.
  346. - `use_correction` bool (optional) : If true, use small sample
  347. correction.
  348. cov_kwds : dict, optional
  349. A dictionary of keyword arguments to pass to the covariance
  350. estimator. `nonrobust` and `HC#` do not support cov_kwds.
  351. use_t : bool, optional
  352. A flag indicating that inference should use the Student's t
  353. distribution that accounts for model degree of freedom. If False,
  354. uses the normal distribution. If None, defers the choice to
  355. the cov_type. It also removes degree of freedom corrections from
  356. the covariance estimator when cov_type is 'nonrobust'.
  357. Returns
  358. -------
  359. AutoRegResults
  360. Estimation results.
  361. See Also
  362. --------
  363. statsmodels.regression.linear_model.OLS
  364. Ordinary Least Squares estimation.
  365. statsmodels.regression.linear_model.RegressionResults
  366. See ``get_robustcov_results`` for a detailed list of available
  367. covariance estimators and options.
  368. Notes
  369. -----
  370. Use ``OLS`` to estimate model parameters and to estimate parameter
  371. covariance.
  372. """
  373. # TODO: Determine correction for degree-of-freedom
  374. # Special case parameterless model
  375. if self._x.shape[1] == 0:
  376. return AutoRegResultsWrapper(
  377. AutoRegResults(self, np.empty(0), np.empty((0, 0)))
  378. )
  379. ols_mod = OLS(self._y, self._x)
  380. ols_res = ols_mod.fit(
  381. cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t
  382. )
  383. cov_params = ols_res.cov_params()
  384. use_t = ols_res.use_t
  385. if cov_type == "nonrobust" and not use_t:
  386. nobs = self._y.shape[0]
  387. k = self._x.shape[1]
  388. scale = nobs / (nobs - k)
  389. cov_params /= scale
  390. res = AutoRegResults(
  391. self,
  392. ols_res.params,
  393. cov_params,
  394. ols_res.normalized_cov_params,
  395. use_t=use_t,
  396. )
  397. return AutoRegResultsWrapper(res)
  398. def _resid(self, params):
  399. params = array_like(params, "params", ndim=2)
  400. return self._y.squeeze() - (self._x @ params).squeeze()
  401. def loglike(self, params):
  402. """
  403. Log-likelihood of model.
  404. Parameters
  405. ----------
  406. params : ndarray
  407. The model parameters used to compute the log-likelihood.
  408. Returns
  409. -------
  410. float
  411. The log-likelihood value.
  412. """
  413. nobs = self.nobs
  414. resid = self._resid(params)
  415. ssr = resid @ resid
  416. llf = -(nobs / 2) * (np.log(2 * np.pi) + np.log(ssr / nobs) + 1)
  417. return llf
  418. def score(self, params):
  419. """
  420. Score vector of model.
  421. The gradient of logL with respect to each parameter.
  422. Parameters
  423. ----------
  424. params : ndarray
  425. The parameters to use when evaluating the Hessian.
  426. Returns
  427. -------
  428. ndarray
  429. The score vector evaluated at the parameters.
  430. """
  431. resid = self._resid(params)
  432. return self._x.T @ resid
  433. def information(self, params):
  434. """
  435. Fisher information matrix of model.
  436. Returns -1 * Hessian of the log-likelihood evaluated at params.
  437. Parameters
  438. ----------
  439. params : ndarray
  440. The model parameters.
  441. Returns
  442. -------
  443. ndarray
  444. The information matrix.
  445. """
  446. resid = self._resid(params)
  447. sigma2 = resid @ resid / self.nobs
  448. return (self._x.T @ self._x) * (1 / sigma2)
  449. def hessian(self, params):
  450. """
  451. The Hessian matrix of the model.
  452. Parameters
  453. ----------
  454. params : ndarray
  455. The parameters to use when evaluating the Hessian.
  456. Returns
  457. -------
  458. ndarray
  459. The hessian evaluated at the parameters.
  460. """
  461. return -self.information(params)
  462. def _setup_oos_forecast(self, add_forecasts, exog_oos):
  463. x = np.zeros((add_forecasts, self._x.shape[1]))
  464. oos_exog = self._deterministics.out_of_sample(steps=add_forecasts)
  465. n_deterministic = oos_exog.shape[1]
  466. x[:, :n_deterministic] = to_numpy(oos_exog)
  467. # skip the AR columns
  468. loc = n_deterministic + len(self._lags)
  469. if self.exog is not None:
  470. x[:, loc:] = exog_oos[:add_forecasts]
  471. return x
  472. def _wrap_prediction(self, prediction, start, end, pad):
  473. prediction = np.hstack([np.full(pad, np.nan), prediction])
  474. n_values = end - start + pad
  475. if not isinstance(self.data.orig_endog, (pd.Series, pd.DataFrame)):
  476. return prediction[-n_values:]
  477. index = self._index
  478. if end > self.endog.shape[0]:
  479. freq = getattr(index, "freq", None)
  480. if freq:
  481. if isinstance(index, pd.PeriodIndex):
  482. index = pd.period_range(index[0], freq=freq, periods=end)
  483. else:
  484. index = pd.date_range(index[0], freq=freq, periods=end)
  485. else:
  486. index = pd.RangeIndex(end)
  487. index = index[start - pad : end]
  488. prediction = prediction[-n_values:]
  489. return pd.Series(prediction, index=index)
  490. def _dynamic_predict(
  491. self, params, start, end, dynamic, num_oos, exog, exog_oos
  492. ):
  493. """
  494. :param params:
  495. :param start:
  496. :param end:
  497. :param dynamic:
  498. :param num_oos:
  499. :param exog:
  500. :param exog_oos:
  501. :return:
  502. """
  503. reg = []
  504. hold_back = self._hold_back
  505. adj = 0
  506. if start < hold_back:
  507. # Adjust start and dynamic
  508. adj = hold_back - start
  509. start += adj
  510. # New offset shifts, but must remain non-negative
  511. dynamic = max(dynamic - adj, 0)
  512. if (start - hold_back) <= self.nobs:
  513. # _x is missing hold_back observations, which is why
  514. # it is shifted by this amount
  515. is_loc = slice(start - hold_back, end + 1 - hold_back)
  516. x = self._x[is_loc]
  517. if exog is not None:
  518. x = x.copy()
  519. # Replace final columns
  520. x[:, -exog.shape[1] :] = exog[start : end + 1]
  521. reg.append(x)
  522. if num_oos > 0:
  523. reg.append(self._setup_oos_forecast(num_oos, exog_oos))
  524. reg = np.vstack(reg)
  525. det_col_idx = self._x.shape[1] - len(self._lags)
  526. det_col_idx -= 0 if self.exog is None else self.exog.shape[1]
  527. # Simple 1-step static forecasts for dynamic observations
  528. forecasts = np.empty(reg.shape[0])
  529. forecasts[:dynamic] = reg[:dynamic] @ params
  530. for h in range(dynamic, reg.shape[0]):
  531. # Fill in regressor matrix
  532. for j, lag in enumerate(self._lags):
  533. fcast_loc = h - lag
  534. if fcast_loc >= dynamic:
  535. val = forecasts[fcast_loc]
  536. else:
  537. # If before the start of the forecasts, use actual values
  538. val = self.endog[fcast_loc + start]
  539. reg[h, det_col_idx + j] = val
  540. forecasts[h] = reg[h : h + 1] @ params
  541. return self._wrap_prediction(forecasts, start, end + 1 + num_oos, adj)
  542. def _static_oos_predict(self, params, num_oos, exog_oos):
  543. new_x = self._setup_oos_forecast(num_oos, exog_oos)
  544. if self._maxlag == 0:
  545. return new_x @ params
  546. forecasts = np.empty(num_oos)
  547. nexog = 0 if self.exog is None else self.exog.shape[1]
  548. ar_offset = self._x.shape[1] - nexog - len(self._lags)
  549. for i in range(num_oos):
  550. for j, lag in enumerate(self._lags):
  551. loc = i - lag
  552. val = self._y[loc] if loc < 0 else forecasts[loc]
  553. new_x[i, ar_offset + j] = val
  554. forecasts[i] = new_x[i : i + 1] @ params
  555. return forecasts
  556. def _static_predict(self, params, start, end, num_oos, exog, exog_oos):
  557. """
  558. Path for static predictions
  559. Parameters
  560. ----------
  561. start : int
  562. Index of first observation
  563. end : int
  564. Index of last in-sample observation. Inclusive, so start:end+1
  565. in slice notation.
  566. num_oos : int
  567. Number of out-of-sample observations, so that the returned size is
  568. num_oos + (end - start + 1).
  569. exog : ndarray
  570. Array containing replacement exog values
  571. exog_oos : ndarray
  572. Containing forecast exog values
  573. """
  574. hold_back = self._hold_back
  575. nobs = self.endog.shape[0]
  576. x = np.empty((0, self._x.shape[1]))
  577. # Adjust start to reflect observations lost
  578. adj = max(0, hold_back - start)
  579. start += adj
  580. if start <= nobs:
  581. # Use existing regressors
  582. is_loc = slice(start - hold_back, end + 1 - hold_back)
  583. x = self._x[is_loc]
  584. if exog is not None:
  585. x = x.copy()
  586. # Replace final columns
  587. x[:, -exog.shape[1] :] = exog[start : end + 1]
  588. in_sample = x @ params
  589. if num_oos == 0: # No out of sample
  590. return self._wrap_prediction(in_sample, start, end + 1, adj)
  591. out_of_sample = self._static_oos_predict(params, num_oos, exog_oos)
  592. prediction = np.hstack((in_sample, out_of_sample))
  593. return self._wrap_prediction(prediction, start, end + 1 + num_oos, adj)
  594. def _prepare_prediction(self, params, exog, exog_oos, start, end):
  595. params = array_like(params, "params")
  596. if not isinstance(exog, pd.DataFrame):
  597. exog = array_like(exog, "exog", ndim=2, optional=True)
  598. if not isinstance(exog_oos, pd.DataFrame):
  599. exog_oos = array_like(exog_oos, "exog_oos", ndim=2, optional=True)
  600. start = 0 if start is None else start
  601. end = self._index[-1] if end is None else end
  602. start, end, num_oos, _ = self._get_prediction_index(start, end)
  603. return params, exog, exog_oos, start, end, num_oos
  604. def _parse_dynamic(self, dynamic, start):
  605. if isinstance(
  606. dynamic, (str, bytes, pd.Timestamp, dt.datetime, pd.Period)
  607. ):
  608. dynamic_loc, _, _ = self._get_index_loc(dynamic)
  609. # Adjust since relative to start
  610. dynamic_loc -= start
  611. elif dynamic is True:
  612. # if True, all forecasts are dynamic
  613. dynamic_loc = 0
  614. else:
  615. dynamic_loc = int(dynamic)
  616. # At this point dynamic is an offset relative to start
  617. # and it must be non-negative
  618. if dynamic_loc < 0:
  619. raise ValueError(
  620. "Dynamic prediction cannot begin prior to the "
  621. "first observation in the sample."
  622. )
  623. return dynamic_loc
  624. def predict(
  625. self,
  626. params,
  627. start=None,
  628. end=None,
  629. dynamic=False,
  630. exog=None,
  631. exog_oos=None,
  632. ):
  633. """
  634. In-sample prediction and out-of-sample forecasting.
  635. Parameters
  636. ----------
  637. params : array_like
  638. The fitted model parameters.
  639. start : int, str, or datetime, optional
  640. Zero-indexed observation number at which to start forecasting,
  641. i.e., the first forecast is start. Can also be a date string to
  642. parse or a datetime type. Default is the the zeroth observation.
  643. end : int, str, or datetime, optional
  644. Zero-indexed observation number at which to end forecasting, i.e.,
  645. the last forecast is end. Can also be a date string to
  646. parse or a datetime type. However, if the dates index does not
  647. have a fixed frequency, end must be an integer index if you
  648. want out-of-sample prediction. Default is the last observation in
  649. the sample. Unlike standard python slices, end is inclusive so
  650. that all the predictions [start, start+1, ..., end-1, end] are
  651. returned.
  652. dynamic : {bool, int, str, datetime, Timestamp}, optional
  653. Integer offset relative to `start` at which to begin dynamic
  654. prediction. Prior to this observation, true endogenous values
  655. will be used for prediction; starting with this observation and
  656. continuing through the end of prediction, forecasted endogenous
  657. values will be used instead. Datetime-like objects are not
  658. interpreted as offsets. They are instead used to find the index
  659. location of `dynamic` which is then used to to compute the offset.
  660. exog : array_like
  661. A replacement exogenous array. Must have the same shape as the
  662. exogenous data array used when the model was created.
  663. exog_oos : array_like
  664. An array containing out-of-sample values of the exogenous variable.
  665. Must has the same number of columns as the exog used when the
  666. model was created, and at least as many rows as the number of
  667. out-of-sample forecasts.
  668. Returns
  669. -------
  670. predictions : {ndarray, Series}
  671. Array of out of in-sample predictions and / or out-of-sample
  672. forecasts.
  673. """
  674. params, exog, exog_oos, start, end, num_oos = self._prepare_prediction(
  675. params, exog, exog_oos, start, end
  676. )
  677. if self.exog is None and (exog is not None or exog_oos is not None):
  678. raise ValueError(
  679. "exog and exog_oos cannot be used when the model "
  680. "does not contains exogenous regressors."
  681. )
  682. elif self.exog is not None:
  683. if exog is not None and exog.shape != self.exog.shape:
  684. msg = (
  685. "The shape of exog {0} must match the shape of the "
  686. "exog variable used to create the model {1}."
  687. )
  688. raise ValueError(msg.format(exog.shape, self.exog.shape))
  689. if (
  690. exog_oos is not None
  691. and exog_oos.shape[1] != self.exog.shape[1]
  692. ):
  693. msg = (
  694. "The number of columns in exog_oos ({0}) must match "
  695. "the number of columns in the exog variable used to "
  696. "create the model ({1})."
  697. )
  698. raise ValueError(
  699. msg.format(exog_oos.shape[1], self.exog.shape[1])
  700. )
  701. if num_oos > 0 and exog_oos is None:
  702. raise ValueError(
  703. "exog_oos must be provided when producing "
  704. "out-of-sample forecasts."
  705. )
  706. elif exog_oos is not None and num_oos > exog_oos.shape[0]:
  707. msg = (
  708. "start and end indicate that {0} out-of-sample "
  709. "predictions must be computed. exog_oos has {1} rows "
  710. "but must have at least {0}."
  711. )
  712. raise ValueError(msg.format(num_oos, exog_oos.shape[0]))
  713. if (isinstance(dynamic, bool) and not dynamic) or self._maxlag == 0:
  714. # If model has no lags, static and dynamic are identical
  715. return self._static_predict(
  716. params, start, end, num_oos, exog, exog_oos
  717. )
  718. dynamic = self._parse_dynamic(dynamic, start)
  719. return self._dynamic_predict(
  720. params, start, end, dynamic, num_oos, exog, exog_oos
  721. )
  722. class AR:
  723. """
  724. The AR class has been removed and replaced with AutoReg
  725. See Also
  726. --------
  727. AutoReg
  728. The replacement for AR that improved deterministic modeling
  729. """
  730. def __init__(self, *args, **kwargs):
  731. raise NotImplementedError(
  732. "AR has been removed from statsmodels and replaced with "
  733. "statsmodels.tsa.ar_model.AutoReg."
  734. )
  735. class ARResults:
  736. """
  737. Removed and replaced by AutoRegResults.
  738. See Also
  739. --------
  740. AutoReg
  741. """
  742. def __init__(self, *args, **kwargs):
  743. raise NotImplementedError(
  744. "AR and ARResults have been removed and replaced by "
  745. "AutoReg And AutoRegResults."
  746. )
  747. doc = Docstring(AutoReg.predict.__doc__)
  748. _predict_params = doc.extract_parameters(
  749. ["start", "end", "dynamic", "exog", "exog_oos"], 8
  750. )
  751. class AutoRegResults(tsa_model.TimeSeriesModelResults):
  752. """
  753. Class to hold results from fitting an AutoReg model.
  754. Parameters
  755. ----------
  756. model : AutoReg
  757. Reference to the model that is fit.
  758. params : ndarray
  759. The fitted parameters from the AR Model.
  760. cov_params : ndarray
  761. The estimated covariance matrix of the model parameters.
  762. normalized_cov_params : ndarray
  763. The array inv(dot(x.T,x)) where x contains the regressors in the
  764. model.
  765. scale : float, optional
  766. An estimate of the scale of the model.
  767. use_t : bool, optional
  768. Whether use_t was set in fit
  769. """
  770. _cache = {} # for scale setter
  771. def __init__(
  772. self,
  773. model,
  774. params,
  775. cov_params,
  776. normalized_cov_params=None,
  777. scale=1.0,
  778. use_t=False,
  779. ):
  780. super().__init__(model, params, normalized_cov_params, scale)
  781. self._cache = {}
  782. self._params = params
  783. self._nobs = model.nobs
  784. self._n_totobs = model.endog.shape[0]
  785. self._df_model = model.df_model
  786. self._ar_lags = model.ar_lags
  787. self._use_t = use_t
  788. if self._ar_lags is not None:
  789. self._max_lag = max(self._ar_lags)
  790. else:
  791. self._max_lag = 0
  792. self._hold_back = self.model.hold_back
  793. self.cov_params_default = cov_params
  794. def initialize(self, model, params, **kwargs):
  795. """
  796. Initialize (possibly re-initialize) a Results instance.
  797. Parameters
  798. ----------
  799. model : Model
  800. The model instance.
  801. params : ndarray
  802. The model parameters.
  803. **kwargs
  804. Any additional keyword arguments required to initialize the model.
  805. """
  806. self._params = params
  807. self.model = model
  808. @property
  809. def ar_lags(self):
  810. """The autoregressive lags included in the model"""
  811. return self._ar_lags
  812. @property
  813. def params(self):
  814. """The estimated parameters."""
  815. return self._params
  816. @property
  817. def df_model(self):
  818. """The degrees of freedom consumed by the model."""
  819. return self._df_model
  820. @property
  821. def df_resid(self):
  822. """The remaining degrees of freedom in the residuals."""
  823. return self.nobs - self._df_model
  824. @property
  825. def nobs(self):
  826. """
  827. The number of observations after adjusting for losses due to lags.
  828. """
  829. return self._nobs
  830. @cache_writable()
  831. def sigma2(self):
  832. return 1.0 / self.nobs * sumofsq(self.resid)
  833. @cache_writable() # for compatability with RegressionResults
  834. def scale(self):
  835. return self.sigma2
  836. @cache_readonly
  837. def bse(self): # allow user to specify?
  838. """
  839. The standard errors of the estimated parameters.
  840. If `method` is 'cmle', then the standard errors that are returned are
  841. the OLS standard errors of the coefficients. If the `method` is 'mle'
  842. then they are computed using the numerical Hessian.
  843. """
  844. return np.sqrt(np.diag(self.cov_params()))
  845. @cache_readonly
  846. def aic(self):
  847. r"""
  848. Akaike Information Criterion using Lutkepohl's definition.
  849. :math:`-2 llf + \ln(nobs) (1 + df_{model})`
  850. """
  851. # This is based on loglike with dropped constant terms ?
  852. # Lutkepohl
  853. # return np.log(self.sigma2) + 1./self.model.nobs * self.k_ar
  854. # Include constant as estimated free parameter and double the loss
  855. # Stata defintion
  856. # nobs = self.nobs
  857. # return -2 * self.llf/nobs + 2 * (self.k_ar+self.k_trend)/nobs
  858. return eval_measures.aic(self.llf, self.nobs, self.df_model + 1)
  859. @cache_readonly
  860. def hqic(self):
  861. r"""
  862. Hannan-Quinn Information Criterion using Lutkepohl's definition.
  863. :math:`-2 llf + 2 \ln(\ln(nobs)) (1 + df_{model})`
  864. """
  865. # Lutkepohl
  866. # return np.log(self.sigma2)+ 2 * np.log(np.log(nobs))/nobs * self.k_ar
  867. # R uses all estimated parameters rather than just lags
  868. # Stata
  869. # nobs = self.nobs
  870. # return -2 * self.llf/nobs + 2 * np.log(np.log(nobs))/nobs * \
  871. # (self.k_ar + self.k_trend)
  872. return eval_measures.hqic(self.llf, self.nobs, self.df_model + 1)
  873. @cache_readonly
  874. def fpe(self):
  875. r"""
  876. Final prediction error using Lütkepohl's definition.
  877. :math:`((nobs+df_{model})/(nobs-df_{model})) \sigma^2`
  878. """
  879. nobs = self.nobs
  880. df_model = self.df_model
  881. # Lutkepohl
  882. return self.sigma2 * ((nobs + df_model) / (nobs - df_model))
  883. @cache_readonly
  884. def aicc(self):
  885. r"""
  886. Akaike Information Criterion with small sample correction
  887. :math:`2.0 * df_{model} * nobs / (nobs - df_{model} - 1.0)`
  888. """
  889. return eval_measures.aicc(self.llf, self.nobs, self.df_model + 1)
  890. @cache_readonly
  891. def bic(self):
  892. r"""
  893. Bayes Information Criterion
  894. :math:`-2 llf + \ln(nobs) (1 + df_{model})`
  895. """
  896. # Lutkepohl
  897. # np.log(self.sigma2) + np.log(nobs)/nobs * self.k_ar
  898. # Include constant as est. free parameter
  899. # Stata
  900. # -2 * self.llf/nobs + np.log(nobs)/nobs * (self.k_ar + self.k_trend)
  901. return eval_measures.bic(self.llf, self.nobs, self.df_model + 1)
  902. @cache_readonly
  903. def resid(self):
  904. """
  905. The residuals of the model.
  906. """
  907. model = self.model
  908. endog = model.endog.squeeze()
  909. return endog[self._hold_back :] - self.fittedvalues
  910. def _lag_repr(self):
  911. """Returns poly repr of an AR, (1 -phi1 L -phi2 L^2-...)"""
  912. ar_lags = self._ar_lags if self._ar_lags is not None else []
  913. k_ar = len(ar_lags)
  914. ar_params = np.zeros(self._max_lag + 1)
  915. ar_params[0] = 1
  916. df_model = self._df_model
  917. exog = self.model.exog
  918. k_exog = exog.shape[1] if exog is not None else 0
  919. params = self._params[df_model - k_ar - k_exog : df_model - k_exog]
  920. for i, lag in enumerate(ar_lags):
  921. ar_params[lag] = -params[i]
  922. return ar_params
  923. @cache_readonly
  924. def roots(self):
  925. """
  926. The roots of the AR process.
  927. The roots are the solution to
  928. (1 - arparams[0]*z - arparams[1]*z**2 -...- arparams[p-1]*z**k_ar) = 0.
  929. Stability requires that the roots in modulus lie outside the unit
  930. circle.
  931. """
  932. # TODO: Specific to AR
  933. lag_repr = self._lag_repr()
  934. if lag_repr.shape[0] == 1:
  935. return np.empty(0)
  936. return np.roots(lag_repr) ** -1
  937. @cache_readonly
  938. def arfreq(self):
  939. r"""
  940. Returns the frequency of the AR roots.
  941. This is the solution, x, to z = abs(z)*exp(2j*np.pi*x) where z are the
  942. roots.
  943. """
  944. # TODO: Specific to AR
  945. z = self.roots
  946. return np.arctan2(z.imag, z.real) / (2 * np.pi)
  947. @cache_readonly
  948. def fittedvalues(self):
  949. """
  950. The in-sample predicted values of the fitted AR model.
  951. The `k_ar` initial values are computed via the Kalman Filter if the
  952. model is fit by `mle`.
  953. """
  954. return self.model.predict(self.params)[self._hold_back :]
  955. def test_serial_correlation(self, lags=None, model_df=None):
  956. """
  957. Ljung-Box test for residual serial correlation
  958. Parameters
  959. ----------
  960. lags : int
  961. The maximum number of lags to use in the test. Jointly tests that
  962. all autocorrelations up to and including lag j are zero for
  963. j = 1, 2, ..., lags. If None, uses lag=12*(nobs/100)^{1/4}.
  964. After 0.12 the number of lags will change to min(10, nobs // 5).
  965. model_df : int
  966. The model degree of freedom to use when adjusting computing the
  967. test statistic to account for parameter estimation. If None, uses
  968. the number of AR lags included in the model.
  969. Returns
  970. -------
  971. output : DataFrame
  972. DataFrame containing three columns: the test statistic, the
  973. p-value of the test, and the degree of freedom used in the test.
  974. Notes
  975. -----
  976. Null hypothesis is no serial correlation.
  977. The the test degree-of-freedom is 0 or negative once accounting for
  978. model_df, then the test statistic's p-value is missing.
  979. See Also
  980. --------
  981. statsmodels.stats.diagnostic.acorr_ljungbox
  982. Ljung-Box test for serial correlation.
  983. """
  984. # Deferred to prevent circular import
  985. from statsmodels.stats.diagnostic import acorr_ljungbox
  986. lags = int_like(lags, "lags", optional=True)
  987. model_df = int_like(model_df, "df_model", optional=True)
  988. model_df = self.df_model if model_df is None else model_df
  989. nobs_effective = self.resid.shape[0]
  990. if lags is None:
  991. lags = min(nobs_effective // 5, 10)
  992. test_stats = acorr_ljungbox(
  993. self.resid,
  994. lags=lags,
  995. boxpierce=False,
  996. model_df=model_df,
  997. return_df=False,
  998. )
  999. cols = ["Ljung-Box", "LB P-value", "DF"]
  1000. if lags == 1:
  1001. test_stats = [list(test_stats) + [max(0, 1 - model_df)]]
  1002. else:
  1003. df = np.clip(np.arange(1, lags + 1) - model_df, 0, np.inf).astype(
  1004. int
  1005. )
  1006. test_stats = list(test_stats) + [df]
  1007. test_stats = [
  1008. [test_stats[i][j] for i in range(3)] for j in range(lags)
  1009. ]
  1010. index = pd.RangeIndex(1, lags + 1, name="Lag")
  1011. return pd.DataFrame(test_stats, columns=cols, index=index)
  1012. def test_normality(self):
  1013. """
  1014. Test for normality of standardized residuals.
  1015. Returns
  1016. -------
  1017. Series
  1018. Series containing four values, the test statistic and its p-value,
  1019. the skewness and the kurtosis.
  1020. Notes
  1021. -----
  1022. Null hypothesis is normality.
  1023. See Also
  1024. --------
  1025. statsmodels.stats.stattools.jarque_bera
  1026. The Jarque-Bera test of normality.
  1027. """
  1028. # Deferred to prevent circular import
  1029. from statsmodels.stats.stattools import jarque_bera
  1030. index = ["Jarque-Bera", "P-value", "Skewness", "Kurtosis"]
  1031. return pd.Series(jarque_bera(self.resid), index=index)
  1032. def test_heteroskedasticity(self, lags=None):
  1033. """
  1034. ARCH-LM test of residual heteroskedasticity
  1035. Parameters
  1036. ----------
  1037. lags : int
  1038. The maximum number of lags to use in the test. Jointly tests that
  1039. all squared autocorrelations up to and including lag j are zero for
  1040. j = 1, 2, ..., lags. If None, uses lag=12*(nobs/100)^{1/4}.
  1041. Returns
  1042. -------
  1043. Series
  1044. Series containing the test statistic and its p-values.
  1045. See Also
  1046. --------
  1047. statsmodels.stats.diagnostic.het_arch
  1048. ARCH-LM test.
  1049. statsmodels.stats.diagnostic.acorr_lm
  1050. LM test for autocorrelation.
  1051. """
  1052. from statsmodels.stats.diagnostic import het_arch
  1053. lags = int_like(lags, "lags", optional=True)
  1054. nobs_effective = self.resid.shape[0]
  1055. if lags is None:
  1056. lags = min(nobs_effective // 5, 10)
  1057. out = []
  1058. for lag in range(1, lags + 1):
  1059. res = het_arch(self.resid, nlags=lag, autolag=None)
  1060. out.append([res[0], res[1], lag])
  1061. index = pd.RangeIndex(1, lags + 1, name="Lag")
  1062. cols = ["ARCH-LM", "P-value", "DF"]
  1063. return pd.DataFrame(out, columns=cols, index=index)
  1064. def diagnostic_summary(self):
  1065. """
  1066. Returns a summary containing standard model diagnostic tests
  1067. Returns
  1068. -------
  1069. Summary
  1070. A summary instance with panels for serial correlation tests,
  1071. normality tests and heteroskedasticity tests.
  1072. See Also
  1073. --------
  1074. test_serial_correlation
  1075. Test models residuals for serial correlation.
  1076. test_normality
  1077. Test models residuals for deviations from normality.
  1078. test_heteroskedasticity
  1079. Test models residuals for conditional heteroskedasticity.
  1080. """
  1081. from statsmodels.iolib.table import SimpleTable
  1082. spacer = SimpleTable([""])
  1083. smry = Summary()
  1084. sc = self.test_serial_correlation()
  1085. sc = sc.loc[sc.DF > 0]
  1086. values = [[i + 1] + row for i, row in enumerate(sc.values.tolist())]
  1087. data_fmts = ("%10d", "%10.3f", "%10.3f", "%10d")
  1088. if sc.shape[0]:
  1089. tab = SimpleTable(
  1090. values,
  1091. headers=["Lag"] + list(sc.columns),
  1092. title="Test of No Serial Correlation",
  1093. header_align="r",
  1094. data_fmts=data_fmts,
  1095. )
  1096. smry.tables.append(tab)
  1097. smry.tables.append(spacer)
  1098. jb = self.test_normality()
  1099. data_fmts = ("%10.3f", "%10.3f", "%10.3f", "%10.3f")
  1100. tab = SimpleTable(
  1101. [jb.values],
  1102. headers=list(jb.index),
  1103. title="Test of Normality",
  1104. header_align="r",
  1105. data_fmts=data_fmts,
  1106. )
  1107. smry.tables.append(tab)
  1108. smry.tables.append(spacer)
  1109. arch_lm = self.test_heteroskedasticity()
  1110. values = [
  1111. [i + 1] + row for i, row in enumerate(arch_lm.values.tolist())
  1112. ]
  1113. data_fmts = ("%10d", "%10.3f", "%10.3f", "%10d")
  1114. tab = SimpleTable(
  1115. values,
  1116. headers=["Lag"] + list(arch_lm.columns),
  1117. title="Test of Conditional Homoskedasticity",
  1118. header_align="r",
  1119. data_fmts=data_fmts,
  1120. )
  1121. smry.tables.append(tab)
  1122. return smry
  1123. @Appender(remove_parameters(AutoReg.predict.__doc__, "params"))
  1124. def predict(
  1125. self, start=None, end=None, dynamic=False, exog=None, exog_oos=None
  1126. ):
  1127. return self.model.predict(
  1128. self._params,
  1129. start=start,
  1130. end=end,
  1131. dynamic=dynamic,
  1132. exog=exog,
  1133. exog_oos=exog_oos,
  1134. )
  1135. def get_prediction(
  1136. self, start=None, end=None, dynamic=False, exog=None, exog_oos=None
  1137. ):
  1138. """
  1139. Predictions and prediction intervals
  1140. Parameters
  1141. ----------
  1142. start : int, str, or datetime, optional
  1143. Zero-indexed observation number at which to start forecasting,
  1144. i.e., the first forecast is start. Can also be a date string to
  1145. parse or a datetime type. Default is the the zeroth observation.
  1146. end : int, str, or datetime, optional
  1147. Zero-indexed observation number at which to end forecasting, i.e.,
  1148. the last forecast is end. Can also be a date string to
  1149. parse or a datetime type. However, if the dates index does not
  1150. have a fixed frequency, end must be an integer index if you
  1151. want out-of-sample prediction. Default is the last observation in
  1152. the sample. Unlike standard python slices, end is inclusive so
  1153. that all the predictions [start, start+1, ..., end-1, end] are
  1154. returned.
  1155. dynamic : {bool, int, str, datetime, Timestamp}, optional
  1156. Integer offset relative to `start` at which to begin dynamic
  1157. prediction. Prior to this observation, true endogenous values
  1158. will be used for prediction; starting with this observation and
  1159. continuing through the end of prediction, forecasted endogenous
  1160. values will be used instead. Datetime-like objects are not
  1161. interpreted as offsets. They are instead used to find the index
  1162. location of `dynamic` which is then used to to compute the offset.
  1163. exog : array_like
  1164. A replacement exogenous array. Must have the same shape as the
  1165. exogenous data array used when the model was created.
  1166. exog_oos : array_like
  1167. An array containing out-of-sample values of the exogenous variable.
  1168. Must has the same number of columns as the exog used when the
  1169. model was created, and at least as many rows as the number of
  1170. out-of-sample forecasts.
  1171. Returns
  1172. -------
  1173. PredictionResults
  1174. Prediction results with mean and prediction intervals
  1175. """
  1176. mean = self.predict(
  1177. start=start, end=end, dynamic=dynamic, exog=exog, exog_oos=exog_oos
  1178. )
  1179. mean_var = np.full_like(mean, self.sigma2)
  1180. mean_var[np.isnan(mean)] = np.nan
  1181. start = 0 if start is None else start
  1182. end = self.model._index[-1] if end is None else end
  1183. _, _, oos, _ = self.model._get_prediction_index(start, end)
  1184. if oos > 0:
  1185. ar_params = self._lag_repr()
  1186. ma = arma2ma(ar_params, np.ones(1), lags=oos)
  1187. mean_var[-oos:] = self.sigma2 * np.cumsum(ma ** 2)
  1188. if isinstance(mean, pd.Series):
  1189. mean_var = pd.Series(mean_var, index=mean.index)
  1190. return PredictionResults(mean, mean_var)
  1191. def forecast(self, steps=1, exog=None):
  1192. """
  1193. Out-of-sample forecasts
  1194. Parameters
  1195. ----------
  1196. steps : {int, str, datetime}, default 1
  1197. If an integer, the number of steps to forecast from the end of the
  1198. sample. Can also be a date string to parse or a datetime type.
  1199. However, if the dates index does not have a fixed frequency,
  1200. steps must be an integer.
  1201. exog : {ndarray, DataFrame}
  1202. Exogenous values to use out-of-sample. Must have same number of
  1203. columns as original exog data and at least `steps` rows
  1204. Returns
  1205. -------
  1206. array_like
  1207. Array of out of in-sample predictions and / or out-of-sample
  1208. forecasts.
  1209. See Also
  1210. --------
  1211. AutoRegResults.predict
  1212. In- and out-of-sample predictions
  1213. AutoRegResults.get_prediction
  1214. In- and out-of-sample predictions and confidence intervals
  1215. """
  1216. start = self.model.data.orig_endog.shape[0]
  1217. if isinstance(steps, (int, np.integer)):
  1218. end = start + steps - 1
  1219. else:
  1220. end = steps
  1221. return self.predict(start=start, end=end, dynamic=False, exog_oos=exog)
  1222. def _plot_predictions(
  1223. self,
  1224. predictions,
  1225. start,
  1226. end,
  1227. alpha,
  1228. in_sample,
  1229. fig,
  1230. figsize,
  1231. ):
  1232. """Shared helper for plotting predictions"""
  1233. from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
  1234. _import_mpl()
  1235. fig = create_mpl_fig(fig, figsize)
  1236. start = 0 if start is None else start
  1237. end = self.model._index[-1] if end is None else end
  1238. _, _, oos, _ = self.model._get_prediction_index(start, end)
  1239. ax = fig.add_subplot(111)
  1240. mean = predictions.predicted_mean
  1241. if not in_sample and oos:
  1242. if isinstance(mean, pd.Series):
  1243. mean = mean.iloc[-oos:]
  1244. elif not in_sample:
  1245. raise ValueError(
  1246. "in_sample is False but there are no"
  1247. "out-of-sample forecasts to plot."
  1248. )
  1249. ax.plot(mean, zorder=2)
  1250. if oos and alpha is not None:
  1251. ci = np.asarray(predictions.conf_int(alpha))
  1252. lower, upper = ci[-oos:, 0], ci[-oos:, 1]
  1253. label = "{0:.0%} confidence interval".format(1 - alpha)
  1254. x = ax.get_lines()[-1].get_xdata()
  1255. ax.fill_between(
  1256. x[-oos:],
  1257. lower,
  1258. upper,
  1259. color="gray",
  1260. alpha=0.5,
  1261. label=label,
  1262. zorder=1,
  1263. )
  1264. ax.legend(loc="best")
  1265. return fig
  1266. @Substitution(predict_params=_predict_params)
  1267. def plot_predict(
  1268. self,
  1269. start=None,
  1270. end=None,
  1271. dynamic=False,
  1272. exog=None,
  1273. exog_oos=None,
  1274. alpha=0.05,
  1275. in_sample=True,
  1276. fig=None,
  1277. figsize=None,
  1278. ):
  1279. """
  1280. Plot in- and out-of-sample predictions
  1281. Parameters
  1282. ----------
  1283. %(predict_params)s
  1284. alpha : {float, None}
  1285. The tail probability not covered by the confidence interval. Must
  1286. be in (0, 1). Confidence interval is constructed assuming normally
  1287. distributed shocks. If None, figure will not show the confidence
  1288. interval.
  1289. in_sample : bool
  1290. Flag indicating whether to include the in-sample period in the
  1291. plot.
  1292. fig : Figure
  1293. An existing figure handle. If not provided, a new figure is
  1294. created.
  1295. figsize: tuple[float, float]
  1296. Tuple containing the figure size values.
  1297. Returns
  1298. -------
  1299. Figure
  1300. Figure handle containing the plot.
  1301. """
  1302. predictions = self.get_prediction(
  1303. start=start, end=end, dynamic=dynamic, exog=exog, exog_oos=exog_oos
  1304. )
  1305. return self._plot_predictions(
  1306. predictions, start, end, alpha, in_sample, fig, figsize
  1307. )
  1308. def plot_diagnostics(self, lags=10, fig=None, figsize=None):
  1309. """
  1310. Diagnostic plots for standardized residuals
  1311. Parameters
  1312. ----------
  1313. lags : int, optional
  1314. Number of lags to include in the correlogram. Default is 10.
  1315. fig : Figure, optional
  1316. If given, subplots are created in this figure instead of in a new
  1317. figure. Note that the 2x2 grid will be created in the provided
  1318. figure using `fig.add_subplot()`.
  1319. figsize : tuple, optional
  1320. If a figure is created, this argument allows specifying a size.
  1321. The tuple is (width, height).
  1322. Notes
  1323. -----
  1324. Produces a 2x2 plot grid with the following plots (ordered clockwise
  1325. from top left):
  1326. 1. Standardized residuals over time
  1327. 2. Histogram plus estimated density of standardized residuals, along
  1328. with a Normal(0,1) density plotted for reference.
  1329. 3. Normal Q-Q plot, with Normal reference line.
  1330. 4. Correlogram
  1331. See Also
  1332. --------
  1333. statsmodels.graphics.gofplots.qqplot
  1334. statsmodels.graphics.tsaplots.plot_acf
  1335. """
  1336. from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
  1337. _import_mpl()
  1338. fig = create_mpl_fig(fig, figsize)
  1339. # Eliminate residuals associated with burned or diffuse likelihoods
  1340. resid = self.resid
  1341. # Top-left: residuals vs time
  1342. ax = fig.add_subplot(221)
  1343. if hasattr(self.model.data, "dates") and self.data.dates is not None:
  1344. x = self.model.data.dates._mpl_repr()
  1345. x = x[self.model.hold_back :]
  1346. else:
  1347. hold_back = self.model.hold_back
  1348. x = hold_back + np.arange(self.resid.shape[0])
  1349. std_resid = resid / np.sqrt(self.sigma2)
  1350. ax.plot(x, std_resid)
  1351. ax.hlines(0, x[0], x[-1], alpha=0.5)
  1352. ax.set_xlim(x[0], x[-1])
  1353. ax.set_title("Standardized residual")
  1354. # Top-right: histogram, Gaussian kernel density, Normal density
  1355. # Can only do histogram and Gaussian kernel density on the non-null
  1356. # elements
  1357. std_resid_nonmissing = std_resid[~(np.isnan(resid))]
  1358. ax = fig.add_subplot(222)
  1359. ax.hist(std_resid_nonmissing, density=True, label="Hist")
  1360. kde = gaussian_kde(std_resid)
  1361. xlim = (-1.96 * 2, 1.96 * 2)
  1362. x = np.linspace(xlim[0], xlim[1])
  1363. ax.plot(x, kde(x), label="KDE")
  1364. ax.plot(x, norm.pdf(x), label="N(0,1)")
  1365. ax.set_xlim(xlim)
  1366. ax.legend()
  1367. ax.set_title("Histogram plus estimated density")
  1368. # Bottom-left: QQ plot
  1369. ax = fig.add_subplot(223)
  1370. from statsmodels.graphics.gofplots import qqplot
  1371. qqplot(std_resid, line="s", ax=ax)
  1372. ax.set_title("Normal Q-Q")
  1373. # Bottom-right: Correlogram
  1374. ax = fig.add_subplot(224)
  1375. from statsmodels.graphics.tsaplots import plot_acf
  1376. plot_acf(resid, ax=ax, lags=lags)
  1377. ax.set_title("Correlogram")
  1378. ax.set_ylim(-1, 1)
  1379. return fig
  1380. def summary(self, alpha=0.05):
  1381. """
  1382. Summarize the Model
  1383. Parameters
  1384. ----------
  1385. alpha : float, optional
  1386. Significance level for the confidence intervals.
  1387. Returns
  1388. -------
  1389. smry : Summary instance
  1390. This holds the summary table and text, which can be printed or
  1391. converted to various output formats.
  1392. See Also
  1393. --------
  1394. statsmodels.iolib.summary.Summary
  1395. """
  1396. model = self.model
  1397. title = model.__class__.__name__ + " Model Results"
  1398. method = "Conditional MLE"
  1399. # get sample
  1400. start = self._hold_back
  1401. if self.data.dates is not None:
  1402. dates = self.data.dates
  1403. sample = [dates[start].strftime("%m-%d-%Y")]
  1404. sample += ["- " + dates[-1].strftime("%m-%d-%Y")]
  1405. else:
  1406. sample = [str(start), str(len(self.data.orig_endog))]
  1407. model = model.__class__.__name__
  1408. if self.model.seasonal:
  1409. model = "Seas. " + model
  1410. if self.ar_lags is not None and len(self.ar_lags) < self._max_lag:
  1411. model = "Restr. " + model
  1412. if self.model.exog is not None:
  1413. model += "-X"
  1414. order = "({0})".format(self._max_lag)
  1415. dep_name = str(self.model.endog_names)
  1416. top_left = [
  1417. ("Dep. Variable:", [dep_name]),
  1418. ("Model:", [model + order]),
  1419. ("Method:", [method]),
  1420. ("Date:", None),
  1421. ("Time:", None),
  1422. ("Sample:", [sample[0]]),
  1423. ("", [sample[1]]),
  1424. ]
  1425. top_right = [
  1426. ("No. Observations:", [str(len(self.model.endog))]),
  1427. ("Log Likelihood", ["%#5.3f" % self.llf]),
  1428. ("S.D. of innovations", ["%#5.3f" % self.sigma2 ** 0.5]),
  1429. ("AIC", ["%#5.3f" % self.aic]),
  1430. ("BIC", ["%#5.3f" % self.bic]),
  1431. ("HQIC", ["%#5.3f" % self.hqic]),
  1432. ]
  1433. smry = Summary()
  1434. smry.add_table_2cols(
  1435. self, gleft=top_left, gright=top_right, title=title
  1436. )
  1437. smry.add_table_params(self, alpha=alpha, use_t=False)
  1438. # Make the roots table
  1439. from statsmodels.iolib.table import SimpleTable
  1440. if self._max_lag:
  1441. arstubs = ["AR.%d" % i for i in range(1, self._max_lag + 1)]
  1442. stubs = arstubs
  1443. roots = self.roots
  1444. freq = self.arfreq
  1445. modulus = np.abs(roots)
  1446. data = np.column_stack((roots.real, roots.imag, modulus, freq))
  1447. roots_table = SimpleTable(
  1448. [
  1449. (
  1450. "%17.4f" % row[0],
  1451. "%+17.4fj" % row[1],
  1452. "%17.4f" % row[2],
  1453. "%17.4f" % row[3],
  1454. )
  1455. for row in data
  1456. ],
  1457. headers=[
  1458. " Real",
  1459. " Imaginary",
  1460. " Modulus",
  1461. " Frequency",
  1462. ],
  1463. title="Roots",
  1464. stubs=stubs,
  1465. )
  1466. smry.tables.append(roots_table)
  1467. return smry
  1468. class AutoRegResultsWrapper(wrap.ResultsWrapper):
  1469. _attrs = {}
  1470. _wrap_attrs = wrap.union_dicts(
  1471. tsa_model.TimeSeriesResultsWrapper._wrap_attrs, _attrs
  1472. )
  1473. _methods = {}
  1474. _wrap_methods = wrap.union_dicts(
  1475. tsa_model.TimeSeriesResultsWrapper._wrap_methods, _methods
  1476. )
  1477. wrap.populate_wrapper(AutoRegResultsWrapper, AutoRegResults)
  1478. doc = Docstring(AutoReg.__doc__)
  1479. _auto_reg_params = doc.extract_parameters(
  1480. [
  1481. "trend",
  1482. "seasonal",
  1483. "exog",
  1484. "hold_back",
  1485. "period",
  1486. "missing",
  1487. "old_names",
  1488. ],
  1489. 4,
  1490. )
  1491. @Substitution(auto_reg_params=_auto_reg_params)
  1492. def ar_select_order(
  1493. endog,
  1494. maxlag,
  1495. ic="bic",
  1496. glob=False,
  1497. trend="c",
  1498. seasonal=False,
  1499. exog=None,
  1500. hold_back=None,
  1501. period=None,
  1502. missing="none",
  1503. old_names=False,
  1504. ):
  1505. """
  1506. Autoregressive AR-X(p) model order selection.
  1507. Parameters
  1508. ----------
  1509. endog : array_like
  1510. A 1-d endogenous response variable. The independent variable.
  1511. maxlag : int
  1512. The maximum lag to consider.
  1513. ic : {'aic', 'hqic', 'bic'}
  1514. The information criterion to use in the selection.
  1515. glob : bool
  1516. Flag indicating where to use a global search across all combinations
  1517. of lags. In practice, this option is not computational feasible when
  1518. maxlag is larger than 15 (or perhaps 20) since the global search
  1519. requires fitting 2**maxlag models.
  1520. %(auto_reg_params)s
  1521. Returns
  1522. -------
  1523. AROrderSelectionResults
  1524. A results holder containing the model and the complete set of
  1525. information criteria for all models fit.
  1526. Examples
  1527. --------
  1528. >>> from statsmodels.tsa.ar_model import ar_select_order
  1529. >>> data = sm.datasets.sunspots.load_pandas().data['SUNACTIVITY']
  1530. Determine the optimal lag structure
  1531. >>> mod = ar_select_order(data, maxlag=13)
  1532. >>> mod.ar_lags
  1533. array([1, 2, 3, 4, 5, 6, 7, 8, 9])
  1534. Determine the optimal lag structure with seasonal terms
  1535. >>> mod = ar_select_order(data, maxlag=13, seasonal=True, period=12)
  1536. >>> mod.ar_lags
  1537. array([1, 2, 3, 4, 5, 6, 7, 8, 9])
  1538. Globally determine the optimal lag structure
  1539. >>> mod = ar_select_order(data, maxlag=13, glob=True)
  1540. >>> mod.ar_lags
  1541. array([1, 2, 9])
  1542. """
  1543. full_mod = AutoReg(
  1544. endog,
  1545. maxlag,
  1546. trend=trend,
  1547. seasonal=seasonal,
  1548. exog=exog,
  1549. hold_back=hold_back,
  1550. period=period,
  1551. missing=missing,
  1552. old_names=old_names,
  1553. )
  1554. nexog = full_mod.exog.shape[1] if full_mod.exog is not None else 0
  1555. y, x = full_mod._y, full_mod._x
  1556. base_col = x.shape[1] - nexog - maxlag
  1557. sel = np.ones(x.shape[1], dtype=bool)
  1558. ics = []
  1559. def compute_ics(res):
  1560. nobs = res.nobs
  1561. df_model = res.df_model
  1562. sigma2 = 1.0 / nobs * sumofsq(res.resid)
  1563. llf = -nobs * (np.log(2 * np.pi * sigma2) + 1) / 2
  1564. res = SimpleNamespace(
  1565. nobs=nobs, df_model=df_model, sigma2=sigma2, llf=llf
  1566. )
  1567. aic = call_cached_func(AutoRegResults.aic, res)
  1568. bic = call_cached_func(AutoRegResults.bic, res)
  1569. hqic = call_cached_func(AutoRegResults.hqic, res)
  1570. return aic, bic, hqic
  1571. def ic_no_data():
  1572. """Fake mod and results to handle no regressor case"""
  1573. mod = SimpleNamespace(
  1574. nobs=y.shape[0], endog=y, exog=np.empty((y.shape[0], 0))
  1575. )
  1576. llf = OLS.loglike(mod, np.empty(0))
  1577. res = SimpleNamespace(
  1578. resid=y, nobs=y.shape[0], llf=llf, df_model=0, k_constant=0
  1579. )
  1580. return compute_ics(res)
  1581. if not glob:
  1582. sel[base_col : base_col + maxlag] = False
  1583. for i in range(maxlag + 1):
  1584. sel[base_col : base_col + i] = True
  1585. if not np.any(sel):
  1586. ics.append((0, ic_no_data()))
  1587. continue
  1588. res = OLS(y, x[:, sel]).fit()
  1589. lags = tuple(j for j in range(1, i + 1))
  1590. lags = 0 if not lags else lags
  1591. ics.append((lags, compute_ics(res)))
  1592. else:
  1593. bits = np.arange(2 ** maxlag, dtype=np.int32)[:, None]
  1594. bits = bits.view(np.uint8)
  1595. bits = np.unpackbits(bits).reshape(-1, 32)
  1596. for i in range(4):
  1597. bits[:, 8 * i : 8 * (i + 1)] = bits[:, 8 * i : 8 * (i + 1)][
  1598. :, ::-1
  1599. ]
  1600. masks = bits[:, :maxlag]
  1601. for mask in masks:
  1602. sel[base_col : base_col + maxlag] = mask
  1603. if not np.any(sel):
  1604. ics.append((0, ic_no_data()))
  1605. continue
  1606. res = OLS(y, x[:, sel]).fit()
  1607. lags = tuple(np.where(mask)[0] + 1)
  1608. lags = 0 if not lags else lags
  1609. ics.append((lags, compute_ics(res)))
  1610. key_loc = {"aic": 0, "bic": 1, "hqic": 2}[ic]
  1611. ics = sorted(ics, key=lambda x: x[1][key_loc])
  1612. selected_model = ics[0][0]
  1613. mod = AutoReg(
  1614. endog,
  1615. selected_model,
  1616. trend=trend,
  1617. seasonal=seasonal,
  1618. exog=exog,
  1619. hold_back=hold_back,
  1620. period=period,
  1621. missing=missing,
  1622. old_names=old_names,
  1623. )
  1624. return AROrderSelectionResults(mod, ics, trend, seasonal, period)
  1625. class AROrderSelectionResults(object):
  1626. """
  1627. Results from an AR order selection
  1628. Contains the information criteria for all fitted model orders.
  1629. """
  1630. def __init__(self, model, ics, trend, seasonal, period):
  1631. self._model = model
  1632. self._ics = ics
  1633. self._trend = trend
  1634. self._seasonal = seasonal
  1635. self._period = period
  1636. aic = sorted(ics, key=lambda r: r[1][0])
  1637. self._aic = dict([(key, val[0]) for key, val in aic])
  1638. bic = sorted(ics, key=lambda r: r[1][1])
  1639. self._bic = dict([(key, val[1]) for key, val in bic])
  1640. hqic = sorted(ics, key=lambda r: r[1][2])
  1641. self._hqic = dict([(key, val[2]) for key, val in hqic])
  1642. @property
  1643. def model(self):
  1644. """The model selected using the chosen information criterion."""
  1645. return self._model
  1646. @property
  1647. def seasonal(self):
  1648. """Flag indicating if a seasonal component is included."""
  1649. return self._seasonal
  1650. @property
  1651. def trend(self):
  1652. """The trend included in the model selection."""
  1653. return self._trend
  1654. @property
  1655. def period(self):
  1656. """The period of the seasonal component."""
  1657. return self._period
  1658. @property
  1659. def aic(self):
  1660. """
  1661. The Akaike information criterion for the models fit.
  1662. Returns
  1663. -------
  1664. dict[tuple, float]
  1665. """
  1666. return self._aic
  1667. @property
  1668. def bic(self):
  1669. """
  1670. The Bayesian (Schwarz) information criteria for the models fit.
  1671. Returns
  1672. -------
  1673. dict[tuple, float]
  1674. """
  1675. return self._bic
  1676. @property
  1677. def hqic(self):
  1678. """
  1679. The Hannan-Quinn information criteria for the models fit.
  1680. Returns
  1681. -------
  1682. dict[tuple, float]
  1683. """
  1684. return self._hqic
  1685. @property
  1686. def ar_lags(self):
  1687. """The lags included in the selected model."""
  1688. return self._model.ar_lags