PageRenderTime 88ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/statsmodels/stats/tests/test_diagnostic.py

http://github.com/statsmodels/statsmodels
Python | 1921 lines | 1499 code | 199 blank | 223 comment | 29 complexity | 4e656a602527e8420fed08b0de221a2b MD5 | raw file
Possible License(s): BSD-3-Clause

Large files files are truncated, but you can click here to view the full file

  1. # -*- coding: utf-8 -*-
  2. """Tests for Regression Diagnostics and Specification Tests
  3. Created on Thu Feb 09 13:19:47 2012
  4. Author: Josef Perktold
  5. License: BSD-3
  6. currently all tests are against R
  7. """
  8. import json
  9. import os
  10. import numpy as np
  11. from numpy.testing import (
  12. assert_,
  13. assert_allclose,
  14. assert_almost_equal,
  15. assert_array_equal,
  16. assert_equal,
  17. )
  18. import pandas as pd
  19. from pandas.testing import assert_frame_equal
  20. import pytest
  21. from statsmodels.datasets import macrodata, sunspots
  22. from statsmodels.regression.linear_model import OLS
  23. import statsmodels.stats.diagnostic as smsdia
  24. import statsmodels.stats.outliers_influence as oi
  25. import statsmodels.stats.sandwich_covariance as sw
  26. from statsmodels.tools.tools import Bunch, add_constant
  27. from statsmodels.tsa.ar_model import AutoReg
  28. from statsmodels.tsa.arima.model import ARIMA
  29. cur_dir = os.path.abspath(os.path.dirname(__file__))
  30. @pytest.fixture(scope="module")
  31. def diagnostic_data():
  32. rs = np.random.RandomState(93674328)
  33. e = rs.standard_normal(500)
  34. x = rs.standard_normal((500, 3))
  35. y = x.sum(1) + e
  36. c = np.ones_like(y)
  37. data = pd.DataFrame(np.c_[y, c, x], columns=["y", "c", "x1", "x2", "x3"])
  38. return data
  39. def compare_to_reference(sp, sp_dict, decimal=(12, 12)):
  40. assert_allclose(
  41. sp[0],
  42. sp_dict["statistic"],
  43. atol=10 ** -decimal[0],
  44. rtol=10 ** -decimal[0],
  45. )
  46. assert_allclose(
  47. sp[1],
  48. sp_dict["pvalue"],
  49. atol=10 ** -decimal[1],
  50. rtol=10 ** -decimal[0],
  51. )
  52. def test_gq():
  53. d = macrodata.load().data
  54. realinv = d["realinv"]
  55. realgdp = d["realgdp"]
  56. realint = d["realint"]
  57. endog = realinv
  58. exog = add_constant(np.c_[realgdp, realint])
  59. # goldfeld-quandt
  60. het_gq_greater = dict(
  61. statistic=13.20512768685082,
  62. df1=99,
  63. df2=98,
  64. pvalue=1.246141976112324e-30,
  65. distr="f",
  66. )
  67. gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
  68. compare_to_reference(gq, het_gq_greater, decimal=(12, 12))
  69. assert_equal(gq[-1], "increasing")
  70. class TestDiagnosticG(object):
  71. @classmethod
  72. def setup_class(cls):
  73. d = macrodata.load_pandas().data
  74. # growth rates
  75. gs_l_realinv = 400 * np.diff(np.log(d["realinv"].values))
  76. gs_l_realgdp = 400 * np.diff(np.log(d["realgdp"].values))
  77. lint = d["realint"][:-1].values
  78. tbilrate = d["tbilrate"][:-1].values
  79. endogg = gs_l_realinv
  80. exogg = add_constant(np.c_[gs_l_realgdp, lint])
  81. exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate])
  82. exogg3 = add_constant(np.c_[gs_l_realgdp])
  83. res_ols = OLS(endogg, exogg).fit()
  84. res_ols2 = OLS(endogg, exogg2).fit()
  85. res_ols3 = OLS(endogg, exogg3).fit()
  86. cls.res = res_ols
  87. cls.res2 = res_ols2
  88. cls.res3 = res_ols3
  89. cls.endog = cls.res.model.endog
  90. cls.exog = cls.res.model.exog
  91. def test_basic(self):
  92. # mainly to check I got the right regression
  93. # > mkarray(fm$coefficients, "params")
  94. params = np.array(
  95. [-9.48167277465485, 4.3742216647032, -0.613996969478989]
  96. )
  97. assert_almost_equal(self.res.params, params, decimal=12)
  98. def test_hac(self):
  99. res = self.res
  100. # > nw = NeweyWest(fm, lag = 4, prewhite = FALSE, verbose=TRUE)
  101. # > nw2 = NeweyWest(fm, lag=10, prewhite = FALSE, verbose=TRUE)
  102. # > mkarray(nw, "cov_hac_4")
  103. cov_hac_4 = np.array(
  104. [
  105. 1.385551290884014,
  106. -0.3133096102522685,
  107. -0.0597207976835705,
  108. -0.3133096102522685,
  109. 0.1081011690351306,
  110. 0.000389440793564336,
  111. -0.0597207976835705,
  112. 0.000389440793564339,
  113. 0.0862118527405036,
  114. ]
  115. ).reshape((3, 3), order="F")
  116. # > mkarray(nw2, "cov_hac_10")
  117. cov_hac_10 = np.array(
  118. [
  119. 1.257386180080192,
  120. -0.2871560199899846,
  121. -0.03958300024627573,
  122. -0.2871560199899845,
  123. 0.1049107028987101,
  124. 0.0003896205316866944,
  125. -0.03958300024627578,
  126. 0.0003896205316866961,
  127. 0.0985539340694839,
  128. ]
  129. ).reshape((3, 3), order="F")
  130. cov = sw.cov_hac_simple(res, nlags=4, use_correction=False)
  131. bse_hac = sw.se_cov(cov)
  132. assert_almost_equal(cov, cov_hac_4, decimal=12)
  133. assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)
  134. cov = sw.cov_hac_simple(res, nlags=10, use_correction=False)
  135. bse_hac = sw.se_cov(cov)
  136. assert_almost_equal(cov, cov_hac_10, decimal=12)
  137. assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)
  138. def test_het_goldfeldquandt(self):
  139. # TODO: test options missing
  140. # > gq = gqtest(fm, alternative='greater')
  141. # > mkhtest_f(gq, 'het_gq_greater', 'f')
  142. het_gq_greater = dict(
  143. statistic=0.5313259064778423,
  144. pvalue=0.9990217851193723,
  145. parameters=(98, 98),
  146. distr="f",
  147. )
  148. # > gq = gqtest(fm, alternative='less')
  149. # > mkhtest_f(gq, 'het_gq_less', 'f')
  150. het_gq_less = dict(
  151. statistic=0.5313259064778423,
  152. pvalue=0.000978214880627621,
  153. parameters=(98, 98),
  154. distr="f",
  155. )
  156. # > gq = gqtest(fm, alternative='two.sided')
  157. # > mkhtest_f(gq, 'het_gq_two_sided', 'f')
  158. het_gq_two_sided = dict(
  159. statistic=0.5313259064778423,
  160. pvalue=0.001956429761255241,
  161. parameters=(98, 98),
  162. distr="f",
  163. )
  164. # > gq = gqtest(fm, fraction=0.1, alternative='two.sided')
  165. # > mkhtest_f(gq, 'het_gq_two_sided_01', 'f')
  166. het_gq_two_sided_01 = dict(
  167. statistic=0.5006976835928314,
  168. pvalue=0.001387126702579789,
  169. parameters=(88, 87),
  170. distr="f",
  171. )
  172. endogg, exogg = self.endog, self.exog
  173. # tests
  174. gq = smsdia.het_goldfeldquandt(endogg, exogg, split=0.5)
  175. compare_to_reference(gq, het_gq_greater, decimal=(12, 12))
  176. assert_equal(gq[-1], "increasing")
  177. gq = smsdia.het_goldfeldquandt(
  178. endogg, exogg, split=0.5, alternative="decreasing"
  179. )
  180. compare_to_reference(gq, het_gq_less, decimal=(12, 12))
  181. assert_equal(gq[-1], "decreasing")
  182. gq = smsdia.het_goldfeldquandt(
  183. endogg, exogg, split=0.5, alternative="two-sided"
  184. )
  185. compare_to_reference(gq, het_gq_two_sided, decimal=(12, 12))
  186. assert_equal(gq[-1], "two-sided")
  187. # TODO: forcing the same split as R 202-90-90-1=21
  188. gq = smsdia.het_goldfeldquandt(
  189. endogg, exogg, split=90, drop=21, alternative="two-sided"
  190. )
  191. compare_to_reference(gq, het_gq_two_sided_01, decimal=(12, 12))
  192. assert_equal(gq[-1], "two-sided")
  193. # TODO other options ???
  194. def test_het_breusch_pagan(self):
  195. res = self.res
  196. bptest = dict(
  197. statistic=0.709924388395087,
  198. pvalue=0.701199952134347,
  199. parameters=(2,),
  200. distr="f",
  201. )
  202. bp = smsdia.het_breuschpagan(res.resid, res.model.exog)
  203. compare_to_reference(bp, bptest, decimal=(12, 12))
  204. def test_het_breusch_pagan_nonrobust(self):
  205. res = self.res
  206. bptest = dict(
  207. statistic=1.302014063483341,
  208. pvalue=0.5215203247110649,
  209. parameters=(2,),
  210. distr="f",
  211. )
  212. bp = smsdia.het_breuschpagan(res.resid, res.model.exog, robust=False)
  213. compare_to_reference(bp, bptest, decimal=(12, 12))
  214. def test_het_white(self):
  215. res = self.res
  216. # TODO: regressiontest, compare with Greene or Gretl or Stata
  217. hw = smsdia.het_white(res.resid, res.model.exog)
  218. hw_values = (
  219. 33.503722896538441,
  220. 2.9887960597830259e-06,
  221. 7.7945101228430946,
  222. 1.0354575277704231e-06,
  223. )
  224. assert_almost_equal(hw, hw_values)
  225. def test_het_white_error(self):
  226. res = self.res
  227. with pytest.raises(ValueError, match="White's heteroskedasticity"):
  228. smsdia.het_white(res.resid, res.model.exog[:, :1])
  229. def test_het_arch(self):
  230. # test het_arch and indirectly het_lm against R
  231. # > library(FinTS)
  232. # > at = ArchTest(residuals(fm), lags=4)
  233. # > mkhtest(at, 'archtest_4', 'chi2')
  234. archtest_4 = dict(
  235. statistic=3.43473400836259,
  236. pvalue=0.487871315392619,
  237. parameters=(4,),
  238. distr="chi2",
  239. )
  240. # > at = ArchTest(residuals(fm), lags=12)
  241. # > mkhtest(at, 'archtest_12', 'chi2')
  242. archtest_12 = dict(
  243. statistic=8.648320999014171,
  244. pvalue=0.732638635007718,
  245. parameters=(12,),
  246. distr="chi2",
  247. )
  248. at4 = smsdia.het_arch(self.res.resid, nlags=4)
  249. at12 = smsdia.het_arch(self.res.resid, nlags=12)
  250. compare_to_reference(at4[:2], archtest_4, decimal=(12, 13))
  251. compare_to_reference(at12[:2], archtest_12, decimal=(12, 13))
  252. def test_het_arch2(self):
  253. # test autolag options, this also test het_lm
  254. # unfortunately optimal lag=1 for this data
  255. resid = self.res.resid
  256. res1 = smsdia.het_arch(resid, nlags=5, store=True)
  257. rs1 = res1[-1]
  258. res2 = smsdia.het_arch(resid, nlags=5, store=True)
  259. rs2 = res2[-1]
  260. assert_almost_equal(rs2.resols.params, rs1.resols.params, decimal=12)
  261. assert_almost_equal(res2[:4], res1[:4], decimal=12)
  262. # test that smallest lag, nlags=1 works
  263. res3 = smsdia.het_arch(resid, nlags=5)
  264. assert_almost_equal(res3[:4], res1[:4], decimal=12)
  265. def test_acorr_breusch_godfrey(self):
  266. res = self.res
  267. # bgf = bgtest(fm, order = 4, type="F")
  268. breuschgodfrey_f = dict(
  269. statistic=1.179280833676792,
  270. pvalue=0.321197487261203,
  271. parameters=(
  272. 4,
  273. 195,
  274. ),
  275. distr="f",
  276. )
  277. # > bgc = bgtest(fm, order = 4, type="Chisq")
  278. # > mkhtest(bgc, "breuschpagan_c", "chi2")
  279. breuschgodfrey_c = dict(
  280. statistic=4.771042651230007,
  281. pvalue=0.3116067133066697,
  282. parameters=(4,),
  283. distr="chi2",
  284. )
  285. bg = smsdia.acorr_breusch_godfrey(res, nlags=4)
  286. bg_r = [
  287. breuschgodfrey_c["statistic"],
  288. breuschgodfrey_c["pvalue"],
  289. breuschgodfrey_f["statistic"],
  290. breuschgodfrey_f["pvalue"],
  291. ]
  292. assert_almost_equal(bg, bg_r, decimal=11)
  293. # check that lag choice works
  294. bg2 = smsdia.acorr_breusch_godfrey(res, nlags=None)
  295. bg3 = smsdia.acorr_breusch_godfrey(res, nlags=10)
  296. assert_almost_equal(bg2, bg3, decimal=12)
  297. def test_acorr_breusch_godfrey_multidim(self):
  298. res = Bunch(resid=np.empty((100, 2)))
  299. with pytest.raises(ValueError, match="Model resid must be a 1d array"):
  300. smsdia.acorr_breusch_godfrey(res)
  301. def test_acorr_breusch_godfrey_exogs(self):
  302. data = sunspots.load_pandas().data["SUNACTIVITY"]
  303. res = ARIMA(data, order=(1, 0, 0), trend="n").fit()
  304. smsdia.acorr_breusch_godfrey(res, nlags=1)
  305. def test_acorr_ljung_box(self):
  306. # unit-test which may be useful later
  307. # ddof correction for fitted parameters in ARMA(p,q) fitdf=p+q
  308. # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box", fitdf=2)
  309. # mkhtest(bt, "ljung_box_4df2", "chi2")
  310. # ljung_box_4df2 = dict(statistic=5.23587172795227,
  311. # pvalue=0.0729532930400377,
  312. # parameters=(2,), distr='chi2')
  313. # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce", fitdf=2)
  314. # mkhtest(bt, "ljung_box_bp_4df2", "chi2")
  315. # ljung_box_bp_4df2 = dict(statistic=5.12462932741681,
  316. # pvalue=0.0771260128929921,
  317. # parameters=(2,), distr='chi2')
  318. res = self.res
  319. # general test
  320. # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box")
  321. # mkhtest(bt, "ljung_box_4", "chi2")
  322. ljung_box_4 = dict(
  323. statistic=5.23587172795227,
  324. pvalue=0.263940335284713,
  325. parameters=(4,),
  326. distr="chi2",
  327. )
  328. # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce")
  329. # mkhtest(bt, "ljung_box_bp_4", "chi2")
  330. ljung_box_bp_4 = dict(
  331. statistic=5.12462932741681,
  332. pvalue=0.2747471266820692,
  333. parameters=(4,),
  334. distr="chi2",
  335. )
  336. lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(
  337. res.resid, 4, boxpierce=True, return_df=False
  338. )
  339. compare_to_reference(
  340. [lb[-1], lbpval[-1]], ljung_box_4, decimal=(12, 12)
  341. )
  342. compare_to_reference(
  343. [bp[-1], bppval[-1]], ljung_box_bp_4, decimal=(12, 12)
  344. )
  345. def test_acorr_ljung_box_big_default(self):
  346. res = self.res
  347. # R test with big dataset and default lag
  348. # bt = Box.test(residuals(fm), type = "Ljung-Box")
  349. # mkhtest(bt, "ljung_box_none", "chi2")
  350. ljung_box_none = dict(
  351. statistic=51.03724531797195, pvalue=0.11334744923390, distr="chi2"
  352. )
  353. # bt = Box.test(residuals(fm), type = "Box-Pierce")
  354. # mkhtest(bt, "ljung_box_bp_none", "chi2")
  355. ljung_box_bp_none = dict(
  356. statistic=45.12238537034000, pvalue=0.26638168491464, distr="chi2"
  357. )
  358. lags = min(40, res.resid.shape[0] // 2 - 2)
  359. lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(
  360. res.resid, boxpierce=True, lags=lags, return_df=False
  361. )
  362. compare_to_reference(
  363. [lb[-1], lbpval[-1]], ljung_box_none, decimal=(12, 12)
  364. )
  365. compare_to_reference(
  366. [bp[-1], bppval[-1]], ljung_box_bp_none, decimal=(12, 12)
  367. )
  368. def test_acorr_ljung_box_small_default(self):
  369. res = self.res
  370. # R test with small dataset and default lag
  371. # bt = Box.test(residuals(fm), type = "Ljung-Box")
  372. # mkhtest(bt, "ljung_box_small", "chi2")
  373. ljung_box_small = dict(
  374. statistic=9.61503968281915,
  375. pvalue=0.72507000996945,
  376. parameters=(0,),
  377. distr="chi2",
  378. )
  379. # bt = Box.test(residuals(fm), type = "Box-Pierce")
  380. # mkhtest(bt, "ljung_box_bp_small", "chi2")
  381. ljung_box_bp_small = dict(
  382. statistic=7.41692150864936,
  383. pvalue=0.87940785887006,
  384. parameters=(0,),
  385. distr="chi2",
  386. )
  387. lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(
  388. res.resid[:30], boxpierce=True, lags=13, return_df=False
  389. )
  390. compare_to_reference(
  391. [lb[-1], lbpval[-1]], ljung_box_small, decimal=(12, 12)
  392. )
  393. compare_to_reference(
  394. [bp[-1], bppval[-1]], ljung_box_bp_small, decimal=(12, 12)
  395. )
  396. def test_acorr_ljung_box_against_r(self, reset_randomstate):
  397. rs = np.random.RandomState(9876543)
  398. y1 = rs.standard_normal(100)
  399. e = rs.standard_normal(201)
  400. y2 = np.zeros_like(e)
  401. y2[0] = e[0]
  402. for i in range(1, 201):
  403. y2[i] = 0.5 * y2[i - 1] - 0.4 * e[i - 1] + e[i]
  404. y2 = y2[-100:]
  405. r_results_y1_lb = [
  406. [0.15685, 1, 0.6921],
  407. [5.4737, 5, 0.3608],
  408. [10.508, 10, 0.3971],
  409. ]
  410. r_results_y2_lb = [
  411. [2.8764, 1, 0.08989],
  412. [3.8104, 5, 0.577],
  413. [8.4779, 10, 0.5823],
  414. ]
  415. res_y1 = smsdia.acorr_ljungbox(y1, 10, return_df=True)
  416. res_y2 = smsdia.acorr_ljungbox(y2, 10, return_df=True)
  417. for i, loc in enumerate((1, 5, 10)):
  418. row = res_y1.loc[loc]
  419. assert_allclose(
  420. r_results_y1_lb[i][0], row.loc["lb_stat"], rtol=1e-3
  421. )
  422. assert_allclose(
  423. r_results_y1_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
  424. )
  425. row = res_y2.loc[loc]
  426. assert_allclose(
  427. r_results_y2_lb[i][0], row.loc["lb_stat"], rtol=1e-3
  428. )
  429. assert_allclose(
  430. r_results_y2_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
  431. )
  432. res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True, return_df=True)
  433. assert_allclose(res.loc[10, "bp_stat"], 7.8935, rtol=1e-3)
  434. assert_allclose(res.loc[10, "bp_pvalue"], 0.639, rtol=1e-3)
  435. res = smsdia.acorr_ljungbox(
  436. y2, 10, boxpierce=True, return_df=True, model_df=1
  437. )
  438. assert_allclose(res.loc[10, "bp_pvalue"], 0.5449, rtol=1e-3)
  439. def test_harvey_collier(self):
  440. # > hc = harvtest(fm, order.by = NULL, data = list())
  441. # > mkhtest_f(hc, 'harvey_collier', 't')
  442. harvey_collier = dict(
  443. statistic=0.494432160939874,
  444. pvalue=0.6215491310408242,
  445. parameters=198,
  446. distr="t",
  447. )
  448. hc = smsdia.linear_harvey_collier(self.res)
  449. compare_to_reference(hc, harvey_collier, decimal=(12, 12))
  450. hc_skip = smsdia.linear_harvey_collier(self.res, skip=20)
  451. assert not np.allclose(hc[0], hc_skip[0])
  452. def test_rainbow(self):
  453. # rainbow test
  454. # > rt = raintest(fm)
  455. # > mkhtest_f(rt, 'raintest', 'f')
  456. raintest = dict(
  457. statistic=0.6809600116739604,
  458. pvalue=0.971832843583418,
  459. parameters=(101, 98),
  460. distr="f",
  461. )
  462. # > rt = raintest(fm, fraction=0.4)
  463. # > mkhtest_f(rt, 'raintest_fraction_04', 'f')
  464. raintest_fraction_04 = dict(
  465. statistic=0.565551237772662,
  466. pvalue=0.997592305968473,
  467. parameters=(122, 77),
  468. distr="f",
  469. )
  470. rb = smsdia.linear_rainbow(self.res)
  471. compare_to_reference(rb, raintest, decimal=(12, 12))
  472. rb = smsdia.linear_rainbow(self.res, frac=0.4)
  473. compare_to_reference(rb, raintest_fraction_04, decimal=(12, 12))
  474. def test_compare_lr(self):
  475. res = self.res
  476. res3 = self.res3 # nested within res
  477. # lrtest
  478. # lrt = lrtest(fm, fm2)
  479. # Model 1: ginv ~ ggdp + lint
  480. # Model 2: ginv ~ ggdp
  481. lrtest = dict(
  482. loglike1=-763.9752181602237,
  483. loglike2=-766.3091902020184,
  484. chi2value=4.66794408358942,
  485. pvalue=0.03073069384028677,
  486. df=(4, 3, 1),
  487. )
  488. lrt = res.compare_lr_test(res3)
  489. assert_almost_equal(lrt[0], lrtest["chi2value"], decimal=11)
  490. assert_almost_equal(lrt[1], lrtest["pvalue"], decimal=11)
  491. waldtest = dict(
  492. fvalue=4.65216373312492,
  493. pvalue=0.03221346195239025,
  494. df=(199, 200, 1),
  495. )
  496. wt = res.compare_f_test(res3)
  497. assert_almost_equal(wt[0], waldtest["fvalue"], decimal=11)
  498. assert_almost_equal(wt[1], waldtest["pvalue"], decimal=11)
  499. def test_compare_j(self):
  500. res = self.res
  501. res2 = self.res2
  502. # jt = jtest(fm, lm(ginv ~ ggdp + tbilrate))
  503. # Estimate Std. Error t value Pr(>|t|)
  504. jtest = [
  505. (
  506. "M1 + fitted(M2)",
  507. 1.591505670785873,
  508. 0.7384552861695823,
  509. 2.155182176352370,
  510. 0.032354572525314450,
  511. "*",
  512. ),
  513. (
  514. "M2 + fitted(M1)",
  515. 1.305687653016899,
  516. 0.4808385176653064,
  517. 2.715438978051544,
  518. 0.007203854534057954,
  519. "**",
  520. ),
  521. ]
  522. jt1 = smsdia.compare_j(res2, res)
  523. assert_almost_equal(jt1, jtest[0][3:5], decimal=12)
  524. jt2 = smsdia.compare_j(res, res2)
  525. assert_almost_equal(jt2, jtest[1][3:5], decimal=12)
  526. @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
  527. def test_compare_error(self, comp, diagnostic_data):
  528. data = diagnostic_data
  529. res1 = OLS(data.y, data[["c", "x1"]]).fit()
  530. res2 = OLS(data.x3, data[["c", "x2"]]).fit()
  531. with pytest.raises(ValueError, match="endogenous variables"):
  532. comp(res1, res2)
  533. @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
  534. def test_compare_nested(self, comp, diagnostic_data):
  535. data = diagnostic_data
  536. res1 = OLS(data.y, data[["c", "x1"]]).fit()
  537. res2 = OLS(data.y, data[["c", "x1", "x2"]]).fit()
  538. with pytest.raises(ValueError, match="The exog in results_x"):
  539. comp(res1, res2)
  540. with pytest.raises(ValueError, match="The exog in results_x"):
  541. comp(res2, res1)
  542. def test_compare_cox(self):
  543. res = self.res
  544. res2 = self.res2
  545. # Estimate Std. Error z value Pr(>|z|)
  546. coxtest = [
  547. (
  548. "fitted(M1) ~ M2",
  549. -0.782030488930356,
  550. 0.599696502782265,
  551. -1.304043770977755,
  552. 1.922186587840554e-01,
  553. " ",
  554. ),
  555. (
  556. "fitted(M2) ~ M1",
  557. -2.248817107408537,
  558. 0.392656854330139,
  559. -5.727181590258883,
  560. 1.021128495098556e-08,
  561. "***",
  562. ),
  563. ]
  564. ct1 = smsdia.compare_cox(res, res2)
  565. assert_almost_equal(ct1, coxtest[0][3:5], decimal=12)
  566. ct2 = smsdia.compare_cox(res2, res)
  567. assert_almost_equal(ct2, coxtest[1][3:5], decimal=12)
  568. _, _, store = smsdia.compare_cox(res, res2, store=True)
  569. assert isinstance(store, smsdia.ResultsStore)
  570. def test_cusum_ols(self):
  571. # R library(strucchange)
  572. # > sc = sctest(ginv ~ ggdp + lint, type="OLS-CUSUM")
  573. # > mkhtest(sc, 'cusum_ols', 'BB')
  574. cusum_ols = dict(
  575. statistic=1.055750610401214,
  576. pvalue=0.2149567397376543,
  577. parameters=(),
  578. distr="BB",
  579. ) # Brownian Bridge
  580. k_vars = 3
  581. cs_ols = smsdia.breaks_cusumolsresid(self.res.resid, ddof=k_vars) #
  582. compare_to_reference(cs_ols, cusum_ols, decimal=(12, 12))
  583. def test_breaks_hansen(self):
  584. # > sc = sctest(ginv ~ ggdp + lint, type="Nyblom-Hansen")
  585. # > mkhtest(sc, 'breaks_nyblom_hansen', 'BB')
  586. breaks_nyblom_hansen = dict(
  587. statistic=1.0300792740544484,
  588. pvalue=0.1136087530212015,
  589. parameters=(),
  590. distr="BB",
  591. )
  592. bh = smsdia.breaks_hansen(self.res)
  593. assert_almost_equal(
  594. bh[0], breaks_nyblom_hansen["statistic"], decimal=12
  595. )
  596. # TODO: breaks_hansen does not return pvalues
  597. def test_recursive_residuals(self):
  598. reccumres_standardize = np.array(
  599. [
  600. -2.151,
  601. -3.748,
  602. -3.114,
  603. -3.096,
  604. -1.865,
  605. -2.230,
  606. -1.194,
  607. -3.500,
  608. -3.638,
  609. -4.447,
  610. -4.602,
  611. -4.631,
  612. -3.999,
  613. -4.830,
  614. -5.429,
  615. -5.435,
  616. -6.554,
  617. -8.093,
  618. -8.567,
  619. -7.532,
  620. -7.079,
  621. -8.468,
  622. -9.320,
  623. -12.256,
  624. -11.932,
  625. -11.454,
  626. -11.690,
  627. -11.318,
  628. -12.665,
  629. -12.842,
  630. -11.693,
  631. -10.803,
  632. -12.113,
  633. -12.109,
  634. -13.002,
  635. -11.897,
  636. -10.787,
  637. -10.159,
  638. -9.038,
  639. -9.007,
  640. -8.634,
  641. -7.552,
  642. -7.153,
  643. -6.447,
  644. -5.183,
  645. -3.794,
  646. -3.511,
  647. -3.979,
  648. -3.236,
  649. -3.793,
  650. -3.699,
  651. -5.056,
  652. -5.724,
  653. -4.888,
  654. -4.309,
  655. -3.688,
  656. -3.918,
  657. -3.735,
  658. -3.452,
  659. -2.086,
  660. -6.520,
  661. -7.959,
  662. -6.760,
  663. -6.855,
  664. -6.032,
  665. -4.405,
  666. -4.123,
  667. -4.075,
  668. -3.235,
  669. -3.115,
  670. -3.131,
  671. -2.986,
  672. -1.813,
  673. -4.824,
  674. -4.424,
  675. -4.796,
  676. -4.000,
  677. -3.390,
  678. -4.485,
  679. -4.669,
  680. -4.560,
  681. -3.834,
  682. -5.507,
  683. -3.792,
  684. -2.427,
  685. -1.756,
  686. -0.354,
  687. 1.150,
  688. 0.586,
  689. 0.643,
  690. 1.773,
  691. -0.830,
  692. -0.388,
  693. 0.517,
  694. 0.819,
  695. 2.240,
  696. 3.791,
  697. 3.187,
  698. 3.409,
  699. 2.431,
  700. 0.668,
  701. 0.957,
  702. -0.928,
  703. 0.327,
  704. -0.285,
  705. -0.625,
  706. -2.316,
  707. -1.986,
  708. -0.744,
  709. -1.396,
  710. -1.728,
  711. -0.646,
  712. -2.602,
  713. -2.741,
  714. -2.289,
  715. -2.897,
  716. -1.934,
  717. -2.532,
  718. -3.175,
  719. -2.806,
  720. -3.099,
  721. -2.658,
  722. -2.487,
  723. -2.515,
  724. -2.224,
  725. -2.416,
  726. -1.141,
  727. 0.650,
  728. -0.947,
  729. 0.725,
  730. 0.439,
  731. 0.885,
  732. 2.419,
  733. 2.642,
  734. 2.745,
  735. 3.506,
  736. 4.491,
  737. 5.377,
  738. 4.624,
  739. 5.523,
  740. 6.488,
  741. 6.097,
  742. 5.390,
  743. 6.299,
  744. 6.656,
  745. 6.735,
  746. 8.151,
  747. 7.260,
  748. 7.846,
  749. 8.771,
  750. 8.400,
  751. 8.717,
  752. 9.916,
  753. 9.008,
  754. 8.910,
  755. 8.294,
  756. 8.982,
  757. 8.540,
  758. 8.395,
  759. 7.782,
  760. 7.794,
  761. 8.142,
  762. 8.362,
  763. 8.400,
  764. 7.850,
  765. 7.643,
  766. 8.228,
  767. 6.408,
  768. 7.218,
  769. 7.699,
  770. 7.895,
  771. 8.725,
  772. 8.938,
  773. 8.781,
  774. 8.350,
  775. 9.136,
  776. 9.056,
  777. 10.365,
  778. 10.495,
  779. 10.704,
  780. 10.784,
  781. 10.275,
  782. 10.389,
  783. 11.586,
  784. 11.033,
  785. 11.335,
  786. 11.661,
  787. 10.522,
  788. 10.392,
  789. 10.521,
  790. 10.126,
  791. 9.428,
  792. 9.734,
  793. 8.954,
  794. 9.949,
  795. 10.595,
  796. 8.016,
  797. 6.636,
  798. 6.975,
  799. ]
  800. )
  801. rr = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95)
  802. assert_equal(
  803. np.round(rr[5][1:], 3), reccumres_standardize
  804. ) # extra zero in front
  805. # assert_equal(np.round(rr[3][4:], 3), np.diff(reccumres_standardize))
  806. assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize), 3)
  807. assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4)
  808. order = np.arange(self.res.model.endog.shape[0])
  809. rr_order = smsdia.recursive_olsresiduals(
  810. self.res, skip=3, alpha=0.95, order_by=order
  811. )
  812. assert_allclose(rr[0], rr_order[0], atol=1e-6)
  813. # regression number, visually checked with graph from gretl
  814. ub0 = np.array(
  815. [13.37318571, 13.50758959, 13.64199346, 13.77639734, 13.91080121]
  816. )
  817. ub1 = np.array(
  818. [39.44753774, 39.58194162, 39.7163455, 39.85074937, 39.98515325]
  819. )
  820. lb, ub = rr[6]
  821. assert_almost_equal(ub[:5], ub0, decimal=7)
  822. assert_almost_equal(lb[:5], -ub0, decimal=7)
  823. assert_almost_equal(ub[-5:], ub1, decimal=7)
  824. assert_almost_equal(lb[-5:], -ub1, decimal=7)
  825. # test a few values with explicit OLS
  826. endog = self.res.model.endog
  827. exog = self.res.model.exog
  828. params = []
  829. ypred = []
  830. for i in range(3, 10):
  831. resi = OLS(endog[:i], exog[:i]).fit()
  832. ypred.append(resi.model.predict(resi.params, exog[i]))
  833. params.append(resi.params)
  834. assert_almost_equal(rr[2][3:10], ypred, decimal=12)
  835. assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12)
  836. assert_almost_equal(rr[1][2:9], params, decimal=12)
  837. def test_normality(self):
  838. res = self.res
  839. # > library(nortest) #Lilliefors (Kolmogorov-Smirnov) normality test
  840. # > lt = lillie.test(residuals(fm))
  841. # > mkhtest(lt, "lilliefors", "-")
  842. lilliefors1 = dict(
  843. statistic=0.0723390908786589,
  844. pvalue=0.01204113540102896,
  845. parameters=(),
  846. distr="-",
  847. )
  848. # > lt = lillie.test(residuals(fm)**2)
  849. # > mkhtest(lt, "lilliefors", "-")
  850. lilliefors2 = dict(
  851. statistic=0.301311621898024,
  852. pvalue=1.004305736618051e-51,
  853. parameters=(),
  854. distr="-",
  855. )
  856. # > lt = lillie.test(residuals(fm)[1:20])
  857. # > mkhtest(lt, "lilliefors", "-")
  858. lilliefors3 = dict(
  859. statistic=0.1333956004203103,
  860. pvalue=0.455683,
  861. parameters=(),
  862. distr="-",
  863. )
  864. lf1 = smsdia.lilliefors(res.resid, pvalmethod="approx")
  865. lf2 = smsdia.lilliefors(res.resid ** 2, pvalmethod="approx")
  866. lf3 = smsdia.lilliefors(res.resid[:20], pvalmethod="approx")
  867. compare_to_reference(lf1, lilliefors1, decimal=(12, 12))
  868. compare_to_reference(
  869. lf2, lilliefors2, decimal=(12, 12)
  870. ) # pvalue very small
  871. assert_allclose(lf2[1], lilliefors2["pvalue"], rtol=1e-10)
  872. compare_to_reference(lf3, lilliefors3, decimal=(12, 1))
  873. # R uses different approximation for pvalue in last case
  874. # > ad = ad.test(residuals(fm))
  875. # > mkhtest(ad, "ad3", "-")
  876. adr1 = dict(
  877. statistic=1.602209621518313,
  878. pvalue=0.0003937979149362316,
  879. parameters=(),
  880. distr="-",
  881. )
  882. # > ad = ad.test(residuals(fm)**2)
  883. # > mkhtest(ad, "ad3", "-")
  884. # > ad = ad.test(residuals(fm)[1:20])
  885. # > mkhtest(ad, "ad3", "-")
  886. adr3 = dict(
  887. statistic=0.3017073732210775,
  888. pvalue=0.5443499281265933,
  889. parameters=(),
  890. distr="-",
  891. )
  892. ad1 = smsdia.normal_ad(res.resid)
  893. compare_to_reference(ad1, adr1, decimal=(11, 13))
  894. ad2 = smsdia.normal_ad(res.resid ** 2)
  895. assert_(np.isinf(ad2[0]))
  896. ad3 = smsdia.normal_ad(res.resid[:20])
  897. compare_to_reference(ad3, adr3, decimal=(11, 12))
  898. def test_influence(self):
  899. res = self.res
  900. # this test is slow
  901. infl = oi.OLSInfluence(res)
  902. path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
  903. with open(path, "r") as fp:
  904. lsdiag = json.load(fp)
  905. # basic
  906. assert_almost_equal(
  907. np.array(lsdiag["cov.scaled"]).reshape(3, 3),
  908. res.cov_params(),
  909. decimal=12,
  910. )
  911. assert_almost_equal(
  912. np.array(lsdiag["cov.unscaled"]).reshape(3, 3),
  913. res.normalized_cov_params,
  914. decimal=12,
  915. )
  916. c0, c1 = infl.cooks_distance # TODO: what's c1
  917. assert_almost_equal(c0, lsdiag["cooks"], decimal=12)
  918. assert_almost_equal(infl.hat_matrix_diag, lsdiag["hat"], decimal=12)
  919. assert_almost_equal(
  920. infl.resid_studentized_internal, lsdiag["std.res"], decimal=12
  921. )
  922. # slow:
  923. # infl._get_all_obs() #slow, nobs estimation loop, called implicitly
  924. dffits, dffth = infl.dffits
  925. assert_almost_equal(dffits, lsdiag["dfits"], decimal=12)
  926. assert_almost_equal(
  927. infl.resid_studentized_external, lsdiag["stud.res"], decimal=12
  928. )
  929. fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
  930. infl_r = pd.read_csv(fn, index_col=0)
  931. # not used yet:
  932. # infl_bool_r = pandas.read_csv(fn, index_col=0,
  933. # converters=dict(zip(lrange(7),[conv]*7)))
  934. infl_r2 = np.asarray(infl_r)
  935. assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
  936. assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)
  937. # duplicates
  938. assert_almost_equal(dffits, infl_r2[:, 3], decimal=12)
  939. assert_almost_equal(c0, infl_r2[:, 5], decimal=12)
  940. assert_almost_equal(infl.hat_matrix_diag, infl_r2[:, 6], decimal=12)
  941. # TODO: finish and check thresholds and pvalues
  942. # Note: for dffits, R uses a threshold around 0.36,
  943. # mine: dffits[1]=0.24373
  944. # R has
  945. # >>> np.nonzero(np.asarray(infl_bool_r["dffit"]))[0]
  946. # array([ 6, 26, 63, 76, 90, 199])
  947. # >>> np.nonzero(np.asarray(infl_bool_r["cov.r"]))[0]
  948. # array([ 4, 26, 59, 61, 63, 72, 76, 84, 91, 92, 94, 95,
  949. # 108, 197, 198])
  950. # >>> np.nonzero(np.asarray(infl_bool_r["hat"]))[0]
  951. # array([ 62, 76, 84, 90, 91, 92, 95, 108, 197, 199])
  952. class TestDiagnosticGPandas(TestDiagnosticG):
  953. @classmethod
  954. def setup_class(cls):
  955. d = macrodata.load_pandas().data
  956. # growth rates
  957. d["gs_l_realinv"] = 400 * np.log(d["realinv"]).diff()
  958. d["gs_l_realgdp"] = 400 * np.log(d["realgdp"]).diff()
  959. d["lint"] = d["realint"].shift(1)
  960. d["tbilrate"] = d["tbilrate"].shift(1)
  961. d = d.dropna()
  962. cls.d = d
  963. endogg = d["gs_l_realinv"]
  964. exogg = add_constant(d[["gs_l_realgdp", "lint"]])
  965. exogg2 = add_constant(d[["gs_l_realgdp", "tbilrate"]])
  966. exogg3 = add_constant(d[["gs_l_realgdp"]])
  967. res_ols = OLS(endogg, exogg).fit()
  968. res_ols2 = OLS(endogg, exogg2).fit()
  969. res_ols3 = OLS(endogg, exogg3).fit()
  970. cls.res = res_ols
  971. cls.res2 = res_ols2
  972. cls.res3 = res_ols3
  973. cls.endog = cls.res.model.endog
  974. cls.exog = cls.res.model.exog
  975. def test_spec_white():
  976. resdir = os.path.join(cur_dir, "results")
  977. wsfiles = ["wspec1.csv", "wspec2.csv", "wspec3.csv", "wspec4.csv"]
  978. for file in wsfiles:
  979. mdlfile = os.path.join(resdir, file)
  980. mdl = np.asarray(pd.read_csv(mdlfile))
  981. # DV is in last column
  982. lastcol = mdl.shape[1] - 1
  983. dv = mdl[:, lastcol]
  984. # create design matrix
  985. design = np.concatenate(
  986. (np.ones((mdl.shape[0], 1)), np.delete(mdl, lastcol, 1)), axis=1
  987. )
  988. # perform OLS and generate residuals
  989. resids = dv - np.dot(design, np.linalg.lstsq(design, dv, rcond=-1)[0])
  990. # perform White spec test. wspec3/wspec4 contain dummies.
  991. wsres = smsdia.spec_white(resids, design)
  992. # compare results to SAS 9.3 output
  993. if file == "wspec1.csv":
  994. assert_almost_equal(wsres, [3.251, 0.661, 5], decimal=3)
  995. elif file == "wspec2.csv":
  996. assert_almost_equal(wsres, [6.070, 0.733, 9], decimal=3)
  997. elif file == "wspec3.csv":
  998. assert_almost_equal(wsres, [6.767, 0.454, 7], decimal=3)
  999. else:
  1000. assert_almost_equal(wsres, [8.462, 0.671, 11], decimal=3)
  1001. def test_spec_white_error(reset_randomstate):
  1002. with pytest.raises(ValueError, match="White's specification test "):
  1003. smsdia.spec_white(
  1004. np.random.standard_normal(100), np.random.standard_normal((100, 1))
  1005. )
  1006. with pytest.raises(ValueError, match="White's specification test "):
  1007. smsdia.spec_white(
  1008. np.random.standard_normal(100), np.random.standard_normal((100, 2))
  1009. )
  1010. def test_linear_lm_direct(reset_randomstate):
  1011. endog = np.random.standard_normal(500)
  1012. exog = add_constant(np.random.standard_normal((500, 3)))
  1013. res = OLS(endog, exog).fit()
  1014. lm_res = smsdia.linear_lm(res.resid, exog)
  1015. aug = np.hstack([exog, exog[:, 1:] ** 2])
  1016. res_aug = OLS(res.resid, aug).fit()
  1017. stat = res_aug.rsquared * aug.shape[0]
  1018. assert_allclose(lm_res[0], stat)
  1019. # TODO: make this a test or move/remove
  1020. def grangertest():
  1021. # > gt = grangertest(ginv, ggdp, order=4)
  1022. # > gt
  1023. # Granger causality test
  1024. #
  1025. # Model 1: ggdp ~ Lags(ggdp, 1:4) + Lags(ginv, 1:4)
  1026. # Model 2: ggdp ~ Lags(ggdp, 1:4)
  1027. dict(fvalue=1.589672703015157, pvalue=0.178717196987075, df=(198, 193))
  1028. @pytest.mark.smoke
  1029. def test_outlier_influence_funcs(reset_randomstate):
  1030. x = add_constant(np.random.randn(10, 2))
  1031. y = x.sum(1) + np.random.randn(10)
  1032. res = OLS(y, x).fit()
  1033. out_05 = oi.summary_table(res)
  1034. # GH3344 : Check alpha has an effect
  1035. out_01 = oi.summary_table(res, alpha=0.01)
  1036. assert_(np.all(out_01[1][:, 6] <= out_05[1][:, 6]))
  1037. assert_(np.all(out_01[1][:, 7] >= out_05[1][:, 7]))
  1038. res2 = OLS(y, x[:, 0]).fit()
  1039. oi.summary_table(res2, alpha=0.05)
  1040. infl = res2.get_influence()
  1041. infl.summary_table()
  1042. def test_influence_wrapped():
  1043. from pandas import DataFrame
  1044. d = macrodata.load_pandas().data
  1045. # growth rates
  1046. gs_l_realinv = 400 * np.log(d["realinv"]).diff().dropna()
  1047. gs_l_realgdp = 400 * np.log(d["realgdp"]).diff().dropna()
  1048. lint = d["realint"][:-1]
  1049. # re-index these because they will not conform to lint
  1050. gs_l_realgdp.index = lint.index
  1051. gs_l_realinv.index = lint.index
  1052. data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
  1053. # order is important
  1054. exog = DataFrame(data, columns=["const", "lrealgdp", "lint"])
  1055. res = OLS(gs_l_realinv, exog).fit()
  1056. # basic
  1057. # already tested
  1058. # assert_almost_equal(lsdiag['cov.scaled'],
  1059. # res.cov_params().values.ravel(), decimal=12)
  1060. # assert_almost_equal(lsdiag['cov.unscaled'],
  1061. # res.normalized_cov_params.values.ravel(), decimal=12)
  1062. infl = oi.OLSInfluence(res)
  1063. # smoke test just to make sure it works, results separately tested
  1064. df = infl.summary_frame()
  1065. assert_(isinstance(df, DataFrame))
  1066. # this test is slow
  1067. path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
  1068. with open(path, "r") as fp:
  1069. lsdiag = json.load(fp)
  1070. c0, c1 = infl.cooks_distance # TODO: what's c1, it's pvalues? -ss
  1071. # NOTE: we get a hard-cored 5 decimals with pandas testing
  1072. assert_almost_equal(c0, lsdiag["cooks"], 12)
  1073. assert_almost_equal(infl.hat_matrix_diag, (lsdiag["hat"]), 12)
  1074. assert_almost_equal(infl.resid_studentized_internal, lsdiag["std.res"], 12)
  1075. # slow:
  1076. dffits, dffth = infl.dffits
  1077. assert_almost_equal(dffits, lsdiag["dfits"], 12)
  1078. assert_almost_equal(
  1079. infl.resid_studentized_external, lsdiag["stud.res"], 12
  1080. )
  1081. fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
  1082. infl_r = pd.read_csv(fn, index_col=0)
  1083. # not used yet:
  1084. # infl_bool_r = pandas.read_csv(fn, index_col=0,
  1085. # converters=dict(zip(lrange(7),[conv]*7)))
  1086. infl_r2 = np.asarray(infl_r)
  1087. # TODO: finish wrapping this stuff
  1088. assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
  1089. assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)
  1090. def test_influence_dtype():
  1091. # see #2148 bug when endog is integer
  1092. y = np.ones(20)
  1093. np.random.seed(123)
  1094. x = np.random.randn(20, 3)
  1095. res1 = OLS(y, x).fit()
  1096. res2 = OLS(y * 1.0, x).fit()
  1097. cr1 = res1.get_influence().cov_ratio
  1098. cr2 = res2.get_influence().cov_ratio
  1099. assert_allclose(cr1, cr2, rtol=1e-14)
  1100. # regression test for values
  1101. cr3 = np.array(
  1102. [
  1103. 1.22239215,
  1104. 1.31551021,
  1105. 1.52671069,
  1106. 1.05003921,
  1107. 0.89099323,
  1108. 1.57405066,
  1109. 1.03230092,
  1110. 0.95844196,
  1111. 1.15531836,
  1112. 1.21963623,
  1113. 0.87699564,
  1114. 1.16707748,
  1115. 1.10481391,
  1116. 0.98839447,
  1117. 1.08999334,
  1118. 1.35680102,
  1119. 1.46227715,
  1120. 1.45966708,
  1121. 1.13659521,
  1122. 1.22799038,
  1123. ]
  1124. )
  1125. assert_almost_equal(cr1, cr3, decimal=8)
  1126. def get_duncan_data():
  1127. # results from R with NA -> 1. Just testing interface here because
  1128. # outlier_test is just a wrapper
  1129. labels = [
  1130. "accountant",
  1131. "pilot",
  1132. "architect",
  1133. "author",
  1134. "chemist",
  1135. "minister",
  1136. "professor",
  1137. "dentist",
  1138. "reporter",
  1139. "engineer",
  1140. "undertaker",
  1141. "lawyer",
  1142. "physician",
  1143. "welfare.worker",
  1144. "teacher",
  1145. "conductor",
  1146. "contractor",
  1147. "factory.owner",
  1148. "store.manager",
  1149. "banker",
  1150. "bookkeeper",
  1151. "mail.carrier",
  1152. "insurance.agent",
  1153. "store.clerk",
  1154. "carpenter",
  1155. "electrician",
  1156. "RR.engineer",
  1157. "machinist",
  1158. "auto.repairman",
  1159. "plumber",
  1160. "gas.stn.attendant",
  1161. "coal.miner",
  1162. "streetcar.motorman",
  1163. "taxi.driver",
  1164. "truck.driver",
  1165. "machine.operator",
  1166. "barber",
  1167. "bartender",
  1168. "shoe.shiner",
  1169. "cook",
  1170. "soda.clerk",
  1171. "watchman",
  1172. "janitor",
  1173. "policeman",
  1174. "waiter",
  1175. ]
  1176. # Duncan's prestige data from car
  1177. exog = [
  1178. [1.0, 62.0, 86.0],
  1179. [1.0, 72.0, 76.0],
  1180. [1.0, 75.0, 92.0],
  1181. [1.0, 55.0, 90.0],
  1182. [1.0, 64.0, 86.0],
  1183. [1.0, 21.0, 84.0],
  1184. [1.0, 64.0, 93.0],
  1185. [1.0, 80.0, 100.0],
  1186. [1.0, 67.0, 87.0],
  1187. [1.0, 72.0, 86.0],
  1188. [1.0, 42.0, 74.0],
  1189. [1.0, 76.0, 98.0],
  1190. [1.0, 76.0, 97.0],
  1191. [1.0, 41.0, 84.0],
  1192. [1.0, 48.0, 91.0],
  1193. [1.0, 76.0, 34.0],
  1194. [1.0, 53.0, 45.0],
  1195. [1.0, 60.0, 56.0],
  1196. [1.0, 42.0, 44.0],
  1197. [1.0, 78.0, 82.0],
  1198. [1.0, 29.0, 72.0],
  1199. [1.0, 48.0, 55.0],
  1200. [1.0, 55.0, 71.0],
  1201. [1.0, 29.0, 50.0],
  1202. [1.0, 21.0, 23.0],
  1203. [1.0, 47.0, 39.0],
  1204. [1.0, 81.0, 28.0],
  1205. [1.0, 36.0, 32.0],
  1206. [1.0, 22.0, 22.0],
  1207. [1.0, 44.0, 25.0],
  1208. [1.0, 15.0, 29.0],
  1209. [1.0, 7.0, 7.0],
  1210. [1.0, 42.0, 26.0],
  1211. [1.0, 9.0, 19.0],
  1212. [1.0, 21.0, 15.0],
  1213. [1.0, 21.0, 20.0],
  1214. [1.0, 16.0, 26.0],
  1215. [1.0, 16.0, 28.0],
  1216. [1.0, 9.0, 17.0],
  1217. [1.0, 14.0, 22.0],
  1218. [1.0, 12.0, 30.0],
  1219. [1.0, 17.0, 25.0],
  1220. [1.0, 7.0, 20.0],
  1221. [1.0, 34.0, 47.0],
  1222. [1.0, 8.0, 32.0],
  1223. ]
  1224. endog = [
  1225. 82.0,
  1226. 83.0,
  1227. 90.0,
  1228. 76.0,
  1229. 90.0,
  1230. 87.0,
  1231. 93.0,
  1232. 90.0,
  1233. 52.0,
  1234. 88.0,
  1235. 57.0,
  1236. 89.0,
  1237. 97.0,
  1238. 59.0,
  1239. 73.0,
  1240. 38.0,
  1241. 76.0,
  1242. 81.0,
  1243. 45.0,
  1244. 92.0,
  1245. 39.0,
  1246. 34.0,
  1247. 41.0,
  1248. 16.0,
  1249. 33.0,
  1250. 53.0,
  1251. 67.0,
  1252. 57.0,
  1253. 26.0,
  1254. 29.0,
  1255. 10.0,
  1256. 15.0,
  1257. 19.0,
  1258. 10.0,
  1259. 13.0,
  1260. 24.0,
  1261. 20.0,
  1262. 7.0,
  1263. 3.0,
  1264. 16.0,
  1265. 6.0,
  1266. 11.0,
  1267. 8.0,
  1268. 41.0,
  1269. 10.0,
  1270. ]
  1271. return endog, exog, labels
  1272. def test_outlier_test():
  1273. endog, exog, labels = get_duncan_data()
  1274. ndarray_mod = OLS(endog, exog).fit()
  1275. rstudent = [
  1276. 3.1345185839,
  1277. -2.3970223990,
  1278. 2.0438046359,
  1279. -1.9309187757,
  1280. 1.8870465798,
  1281. -1.7604905300,
  1282. -1.7040324156,
  1283. 1.6024285876,
  1284. -1.4332485037,
  1285. -1.1044851583,
  1286. 1.0688582315,
  1287. 1.0185271840,
  1288. -0.9024219332,
  1289. -0.9023876471,
  1290. -0.8830953936,
  1291. 0.8265782334,
  1292. 0.8089220547,
  1293. 0.7682770197,
  1294. 0.7319491074,
  1295. -0.6665962829,
  1296. 0.5227352794,
  1297. -0.5135016547,
  1298. 0.5083881518,
  1299. 0.4999224372,
  1300. -0.4980818221,
  1301. -0.4759717075,
  1302. -0.4293565820,
  1303. -0.4114056499,
  1304. -0.3779540862,
  1305. 0.3556874030,
  1306. 0.3409200462,
  1307. 0.3062248646,
  1308. 0.3038999429,
  1309. -0.3030815773,
  1310. -0.1873387893,
  1311. 0.1738050251,
  1312. 0.1424246593,
  1313. -0.1292266025,
  1314. 0.1272066463,
  1315. -0.0798902878,
  1316. 0.0788467222,
  1317. 0.0722556991,
  1318. 0.0505098280,
  1319. 0.0233215136,
  1320. 0.0007112055,
  1321. ]
  1322. unadj_p = [
  1323. 0.003177202,
  1324. 0.021170298,
  1325. 0.047432955,
  1326. 0.060427645,
  1327. 0.066248120,
  1328. 0.085783008,
  1329. 0.095943909,
  1330. 0.116738318,
  1331. 0.159368890,
  1332. 0.275822623,
  1333. 0.291386358,
  1334. 0.314400295,
  1335. 0.372104049,
  1336. 0.372122040,
  1337. 0.382333561,
  1338. 0.413260793,
  1339. 0.423229432,
  1340. 0.446725370,
  1341. 0.468363101,
  1342. 0.508764039,
  1343. 0.603971990,
  1344. 0.610356737,
  1345. 0.613905871,
  1346. 0.619802317,
  1347. 0.621087703,
  1348. 0.636621083,
  1349. 0.669911674,
  1350. 0.682917818,
  1351. 0.707414459,
  1352. 0.723898263,
  1353. 0.734904667,
  1354. 0.760983108,
  1355. 0.762741124,
  1356. 0.763360242,
  1357. 0.852319039,
  1358. 0.862874018,
  1359. 0.887442197,
  1360. 0.897810225,
  1361. 0.899398691,
  1362. 0.936713197,
  1363. 0.937538115,
  1364. 0.942749758,
  1365. 0.959961394,
  1366. 0.981506948,
  1367. 0.999435989,
  1368. ]
  1369. bonf_p = [
  1370. 0.1429741,
  1371. 0.9526634,
  1372. 2.1344830,
  1373. 2.7192440,
  1374. 2.9811654,
  1375. 3.8602354,
  1376. 4.3174759,
  1377. 5.2532243,
  1378. 7.1716001,
  1379. 12.4120180,
  1380. 13.1123861,
  1381. 14.1480133,
  1382. 16.7446822,
  1383. 16.7454918,
  1384. 17.2050103,
  1385. 18.5967357,
  1386. 19.0453245,
  1387. 20.1026416,
  1388. 21.0763395,
  1389. 22.8943818,
  1390. 27.1787396,
  1391. 27.4660532,
  1392. 27.6257642,
  1393. 27.8911043,
  1394. 27.9489466,
  1395. 28.6479487,
  1396. 30.1460253,
  1397. 30.7313018,
  1398. 31.8336506,
  1399. 32.5754218,
  1400. 33.0707100,
  1401. 34.2442399,
  1402. 34.3233506,
  1403. 34.3512109,
  1404. 38.3543568,
  1405. 38.8293308,
  1406. 39.9348989,
  1407. 40.4014601,
  1408. 40.4729411,
  1409. 42.1520939,
  1410. 42.1892152,
  1411. 42.4237391,
  1412. 43.1982627,
  1413. 44.1678127,
  1414. 44.9746195,
  1415. ]
  1416. bonf_p = np.array(bonf_p)
  1417. bonf_p[bonf_p > 1] = 1
  1418. sorted_labels = [
  1419. "minister",
  1420. "reporter",
  1421. "contractor",
  1422. "insurance.agent",
  1423. "machinist",
  1424. "store.clerk",
  1425. "conductor",
  1426. "factory.owner",
  1427. "mail.carrier",
  1428. "streetcar.motorman",
  1429. "carpenter",
  1430. "coal.miner",
  1431. "bartender",
  1432. "bookkeeper",
  1433. "soda.clerk",
  1434. "chemist",
  1435. "RR.engineer",
  1436. "professor",
  1437. "electrician",
  1438. "gas.stn.attendant",
  1439. "auto.repairman",
  1440. "watchman",
  1441. "banker",
  1442. "machine.operator",
  1443. "dentist",
  1444. "waiter",
  1445. "shoe.shiner",
  1446. "welfare.worker",
  1447. "plumber",
  1448. "physician",
  1449. "pilot",
  1450. "engineer",
  1451. "accountant",
  1452. "lawyer",
  1453. "undertaker",
  1454. "barber",
  1455. "store.manager",
  1456. "truck.driver",
  1457. "cook",
  1458. "janitor",
  1459. "policeman",
  1460. "architect",
  1461. "teacher",
  1462. "taxi.driver",
  1463. "author",
  1464. ]
  1465. res2 = np.c_[rstudent, unadj_p, bonf_p]
  1466. res = oi.outlier_test(ndarray_mod, method="b", labels=labels, order=True)
  1467. np.testing.assert_almost_equal(res.values, res2, 7)
  1468. np.testing.assert_equal(
  1469. res.index.tolist(), sorted_labels
  1470. ) # pylint: disable-msg=E1103
  1471. data = pd.DataFrame(
  1472. np.column_stack((endog, exog)),
  1473. columns="y const var1 var2".split(),
  1474. index=labels,
  1475. )
  1476. # check `order` with pandas bug in #3971
  1477. res_pd = OLS.from_formula("y ~ const + var1 + var2 - 0", data).fit()
  1478. res_outl2 = oi.outlier_test(res_pd, method="b", order=True)
  1479. assert_almost_equal(res_outl2.values, res2, 7)
  1480. assert_equal(res_outl2.index.tolist(), sorted_labels)
  1481. res_outl1 = res_pd.outlier_test(method="b")
  1482. res_outl1 = res_outl1.sort_values(["unadj_p"], ascending=True)
  1483. assert_almost_equal(res_outl1.values, res2, 7)
  1484. assert_equal(res_outl1.index.tolist(), sorted_labels)
  1485. assert_array_equal(res_outl2.index, res_outl1.index)
  1486. # additional keywords in method
  1487. res_outl3 = res_pd.outlier_test(method="b", order=True)
  1488. assert_equal(res_outl3.index.tolist(), sorted_labels)
  1489. res_outl4 = res_pd.outlier_test(method="b", order=True, cutoff=0.15)
  1490. assert_equal(res_outl4.index.tolist(), sorted_labels[:1])
  1491. def test_ljungbox_dof_adj():
  1492. data = sunspots.load_pandas().data["SUNACTIVITY"]
  1493. res = AutoReg(data, 4, old_names=False).fit()
  1494. resid = res.resid
  1495. res1 = smsdia.acorr_ljungbox(resid, lags=10, return_df=False

Large files files are truncated, but you can click here to view the full file