PageRenderTime 175ms CodeModel.GetById 68ms app.highlight 96ms RepoModel.GetById 1ms app.codeStats 1ms

/statsmodels/stats/tests/test_diagnostic.py

http://github.com/statsmodels/statsmodels
Python | 1294 lines | 872 code | 197 blank | 225 comment | 17 complexity | 9d42b49ae0b7a16b6b4dee9923772695 MD5 | raw file

Large files files are truncated, but you can click here to view the full file

   1# -*- coding: utf-8 -*-
   2"""Tests for Regression Diagnostics and Specification Tests
   3
   4Created on Thu Feb 09 13:19:47 2012
   5
   6Author: Josef Perktold
   7License: BSD-3
   8
   9currently all tests are against R
  10
  11"""
  12import json
  13import os
  14
  15import numpy as np
  16import pandas as pd
  17import pytest
  18from numpy.testing import (assert_, assert_almost_equal, assert_equal,
  19                           assert_allclose, assert_array_equal)
  20from pandas.testing import assert_frame_equal
  21
  22import statsmodels.stats.diagnostic as smsdia
  23import statsmodels.stats.outliers_influence as oi
  24import statsmodels.stats.sandwich_covariance as sw
  25from statsmodels.datasets import macrodata
  26from statsmodels.datasets import sunspots
  27from statsmodels.regression.linear_model import OLS
  28from statsmodels.tools.tools import Bunch
  29from statsmodels.tools.tools import add_constant
  30from statsmodels.tsa.ar_model import AutoReg
  31
  32cur_dir = os.path.abspath(os.path.dirname(__file__))
  33
  34
  35@pytest.fixture(scope="module")
  36def diagnostic_data():
  37    rs = np.random.RandomState(93674328)
  38    e = rs.standard_normal(500)
  39    x = rs.standard_normal((500, 3))
  40    y = x.sum(1) + e
  41    c = np.ones_like(y)
  42    data = pd.DataFrame(np.c_[y, c, x], columns=["y", "c", "x1", "x2", "x3"])
  43
  44    return data
  45
  46
  47def compare_to_reference(sp, sp_dict, decimal=(14, 14)):
  48    assert_allclose(sp[0], sp_dict['statistic'], atol=10 ** -decimal[0],
  49                    rtol=10 ** -decimal[0])
  50    assert_allclose(sp[1], sp_dict['pvalue'], atol=10 ** -decimal[1],
  51                    rtol=10 ** -decimal[0])
  52
  53
  54def test_gq():
  55    d = macrodata.load(as_pandas=False).data
  56
  57    realinv = d['realinv']
  58    realgdp = d['realgdp']
  59    realint = d['realint']
  60    endog = realinv
  61    exog = add_constant(np.c_[realgdp, realint])
  62
  63    # goldfeld-quandt
  64    het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98,
  65                          pvalue=1.246141976112324e-30, distr='f')
  66
  67    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
  68    compare_to_reference(gq, het_gq_greater, decimal=(13, 14))
  69    assert_equal(gq[-1], 'increasing')
  70
  71
  72class TestDiagnosticG(object):
  73
  74    @classmethod
  75    def setup_class(cls):
  76        d = macrodata.load_pandas().data
  77        # growth rates
  78        gs_l_realinv = 400 * np.diff(np.log(d['realinv'].values))
  79        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp'].values))
  80        lint = d['realint'][:-1].values
  81        tbilrate = d['tbilrate'][:-1].values
  82
  83        endogg = gs_l_realinv
  84        exogg = add_constant(np.c_[gs_l_realgdp, lint])
  85        exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate])
  86        exogg3 = add_constant(np.c_[gs_l_realgdp])
  87
  88        res_ols = OLS(endogg, exogg).fit()
  89        res_ols2 = OLS(endogg, exogg2).fit()
  90
  91        res_ols3 = OLS(endogg, exogg3).fit()
  92
  93        cls.res = res_ols
  94        cls.res2 = res_ols2
  95        cls.res3 = res_ols3
  96        cls.endog = cls.res.model.endog
  97        cls.exog = cls.res.model.exog
  98
  99    def test_basic(self):
 100        # mainly to check I got the right regression
 101        # > mkarray(fm$coefficients, "params")
 102        params = np.array([-9.48167277465485, 4.3742216647032,
 103                           -0.613996969478989])
 104
 105        assert_almost_equal(self.res.params, params, decimal=12)
 106
 107    def test_hac(self):
 108        res = self.res
 109        # > nw = NeweyWest(fm, lag = 4, prewhite = FALSE, verbose=TRUE)
 110        # > nw2 = NeweyWest(fm, lag=10, prewhite = FALSE, verbose=TRUE)
 111
 112        # > mkarray(nw, "cov_hac_4")
 113        cov_hac_4 = np.array([1.385551290884014, -0.3133096102522685,
 114                              -0.0597207976835705, -0.3133096102522685,
 115                              0.1081011690351306,
 116                              0.000389440793564336, -0.0597207976835705,
 117                              0.000389440793564339,
 118                              0.0862118527405036]).reshape((3, 3), order='F')
 119
 120        # > mkarray(nw2, "cov_hac_10")
 121        cov_hac_10 = np.array([1.257386180080192, -0.2871560199899846,
 122                               -0.03958300024627573, -0.2871560199899845,
 123                               0.1049107028987101,
 124                               0.0003896205316866944, -0.03958300024627578,
 125                               0.0003896205316866961,
 126                               0.0985539340694839]).reshape((3, 3), order='F')
 127
 128        cov = sw.cov_hac_simple(res, nlags=4, use_correction=False)
 129        bse_hac = sw.se_cov(cov)
 130        assert_almost_equal(cov, cov_hac_4, decimal=14)
 131        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=14)
 132
 133        cov = sw.cov_hac_simple(res, nlags=10, use_correction=False)
 134        bse_hac = sw.se_cov(cov)
 135        assert_almost_equal(cov, cov_hac_10, decimal=14)
 136        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=14)
 137
 138    def test_het_goldfeldquandt(self):
 139        # TODO: test options missing
 140
 141        # > gq = gqtest(fm, alternative='greater')
 142        # > mkhtest_f(gq, 'het_gq_greater', 'f')
 143        het_gq_greater = dict(statistic=0.5313259064778423,
 144                              pvalue=0.9990217851193723,
 145                              parameters=(98, 98), distr='f')
 146
 147        # > gq = gqtest(fm, alternative='less')
 148        # > mkhtest_f(gq, 'het_gq_less', 'f')
 149        het_gq_less = dict(statistic=0.5313259064778423,
 150                           pvalue=0.000978214880627621,
 151                           parameters=(98, 98), distr='f')
 152
 153        # > gq = gqtest(fm, alternative='two.sided')
 154        # > mkhtest_f(gq, 'het_gq_two_sided', 'f')
 155        het_gq_two_sided = dict(statistic=0.5313259064778423,
 156                                pvalue=0.001956429761255241,
 157                                parameters=(98, 98), distr='f')
 158
 159        # > gq = gqtest(fm, fraction=0.1, alternative='two.sided')
 160        # > mkhtest_f(gq, 'het_gq_two_sided_01', 'f')
 161        het_gq_two_sided_01 = dict(statistic=0.5006976835928314,
 162                                   pvalue=0.001387126702579789,
 163                                   parameters=(88, 87), distr='f')
 164
 165        endogg, exogg = self.endog, self.exog
 166        # tests
 167        gq = smsdia.het_goldfeldquandt(endogg, exogg, split=0.5)
 168        compare_to_reference(gq, het_gq_greater, decimal=(14, 14))
 169        assert_equal(gq[-1], 'increasing')
 170
 171        gq = smsdia.het_goldfeldquandt(endogg, exogg, split=0.5,
 172                                       alternative='decreasing')
 173        compare_to_reference(gq, het_gq_less, decimal=(14, 14))
 174        assert_equal(gq[-1], 'decreasing')
 175
 176        gq = smsdia.het_goldfeldquandt(endogg, exogg, split=0.5,
 177                                       alternative='two-sided')
 178        compare_to_reference(gq, het_gq_two_sided, decimal=(14, 14))
 179        assert_equal(gq[-1], 'two-sided')
 180
 181        # TODO: forcing the same split as R 202-90-90-1=21
 182        gq = smsdia.het_goldfeldquandt(endogg, exogg, split=90, drop=21,
 183                                       alternative='two-sided')
 184        compare_to_reference(gq, het_gq_two_sided_01, decimal=(14, 14))
 185        assert_equal(gq[-1], 'two-sided')
 186        # TODO other options ???
 187
 188    def test_het_breusch_pagan(self):
 189        res = self.res
 190
 191        bptest = dict(statistic=0.709924388395087, pvalue=0.701199952134347,
 192                      parameters=(2,), distr='f')
 193
 194        bp = smsdia.het_breuschpagan(res.resid, res.model.exog)
 195        compare_to_reference(bp, bptest, decimal=(12, 12))
 196
 197    def test_het_breusch_pagan_nonrobust(self):
 198        res = self.res
 199
 200        bptest = dict(statistic=1.302014063483341, pvalue=0.5215203247110649,
 201                      parameters=(2,), distr='f')
 202
 203        bp = smsdia.het_breuschpagan(res.resid, res.model.exog, robust=False)
 204        compare_to_reference(bp, bptest, decimal=(12, 12))
 205
 206    def test_het_white(self):
 207        res = self.res
 208
 209        # TODO: regressiontest, compare with Greene or Gretl or Stata
 210        hw = smsdia.het_white(res.resid, res.model.exog)
 211        hw_values = (33.503722896538441, 2.9887960597830259e-06,
 212                     7.7945101228430946, 1.0354575277704231e-06)
 213        assert_almost_equal(hw, hw_values)
 214
 215    def test_het_white_error(self):
 216        res = self.res
 217
 218        with pytest.raises(ValueError, match="White's heteroskedasticity"):
 219            smsdia.het_white(res.resid, res.model.exog[:, :1])
 220
 221    def test_het_arch(self):
 222        # test het_arch and indirectly het_lm against R
 223        # > library(FinTS)
 224        # > at = ArchTest(residuals(fm), lags=4)
 225        # > mkhtest(at, 'archtest_4', 'chi2')
 226        archtest_4 = dict(statistic=3.43473400836259,
 227                          pvalue=0.487871315392619, parameters=(4,),
 228                          distr='chi2')
 229
 230        # > at = ArchTest(residuals(fm), lags=12)
 231        # > mkhtest(at, 'archtest_12', 'chi2')
 232        archtest_12 = dict(statistic=8.648320999014171,
 233                           pvalue=0.732638635007718, parameters=(12,),
 234                           distr='chi2')
 235
 236        at4 = smsdia.het_arch(self.res.resid, nlags=4)
 237        at12 = smsdia.het_arch(self.res.resid, nlags=12)
 238        compare_to_reference(at4[:2], archtest_4, decimal=(12, 13))
 239        compare_to_reference(at12[:2], archtest_12, decimal=(12, 13))
 240
 241    def test_het_arch2(self):
 242        # test autolag options, this also test het_lm
 243        # unfortunately optimal lag=1 for this data
 244        resid = self.res.resid
 245
 246        res1 = smsdia.het_arch(resid, nlags=1, autolag=None, store=True)
 247        rs1 = res1[-1]
 248        with pytest.warns(FutureWarning, match="autolag is deprecated and"):
 249            res2 = smsdia.het_arch(resid, nlags=5, autolag='aic', store=True)
 250        rs2 = res2[-1]
 251
 252        assert_almost_equal(rs2.resols.params, rs1.resols.params, decimal=13)
 253        assert_almost_equal(res2[:4], res1[:4], decimal=13)
 254
 255        # test that smallest lag, nlags=1 works
 256        with pytest.warns(FutureWarning, match="autolag is deprecated and"):
 257            res3 = smsdia.het_arch(resid, nlags=1, autolag='aic')
 258        assert_almost_equal(res3[:4], res1[:4], decimal=13)
 259
 260    def test_acorr_breusch_godfrey(self):
 261        res = self.res
 262
 263        # bgf = bgtest(fm, order = 4, type="F")
 264        breuschgodfrey_f = dict(statistic=1.179280833676792,
 265                                pvalue=0.321197487261203,
 266                                parameters=(4, 195,), distr='f')
 267
 268        # > bgc = bgtest(fm, order = 4, type="Chisq")
 269        # > mkhtest(bgc, "breuschpagan_c", "chi2")
 270        breuschgodfrey_c = dict(statistic=4.771042651230007,
 271                                pvalue=0.3116067133066697,
 272                                parameters=(4,), distr='chi2')
 273
 274        bg = smsdia.acorr_breusch_godfrey(res, nlags=4)
 275        bg_r = [breuschgodfrey_c['statistic'], breuschgodfrey_c['pvalue'],
 276                breuschgodfrey_f['statistic'], breuschgodfrey_f['pvalue']]
 277        assert_almost_equal(bg, bg_r, decimal=13)
 278
 279        # check that lag choice works
 280        with pytest.warns(FutureWarning, match="The default value of nlags"):
 281            bg2 = smsdia.acorr_breusch_godfrey(res, nlags=None)
 282        bg3 = smsdia.acorr_breusch_godfrey(res, nlags=14)
 283        assert_almost_equal(bg2, bg3, decimal=13)
 284
 285    def test_acorr_breusch_godfrey_multidim(self):
 286        res = Bunch(resid=np.empty((100, 2)))
 287        with pytest.raises(ValueError, match='Model resid must be a 1d array'):
 288            smsdia.acorr_breusch_godfrey(res)
 289
 290    def test_acorr_ljung_box(self):
 291
 292        # unit-test which may be useful later
 293        # ddof correction for fitted parameters in ARMA(p,q) fitdf=p+q
 294        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box", fitdf=2)
 295        # mkhtest(bt, "ljung_box_4df2", "chi2")
 296        # ljung_box_4df2 = dict(statistic=5.23587172795227,
 297        #                       pvalue=0.0729532930400377,
 298        #                       parameters=(2,), distr='chi2')
 299
 300        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce", fitdf=2)
 301        # mkhtest(bt, "ljung_box_bp_4df2", "chi2")
 302        # ljung_box_bp_4df2 = dict(statistic=5.12462932741681,
 303        #                          pvalue=0.0771260128929921,
 304        #                          parameters=(2,), distr='chi2')
 305
 306        res = self.res
 307        # general test
 308        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box")
 309        # mkhtest(bt, "ljung_box_4", "chi2")
 310        ljung_box_4 = dict(statistic=5.23587172795227,
 311                           pvalue=0.263940335284713,
 312                           parameters=(4,), distr='chi2')
 313
 314        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce")
 315        # mkhtest(bt, "ljung_box_bp_4", "chi2")
 316        ljung_box_bp_4 = dict(statistic=5.12462932741681,
 317                              pvalue=0.2747471266820692,
 318                              parameters=(4,), distr='chi2')
 319
 320        lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(res.resid, 4,
 321                                                       boxpierce=True,
 322                                                       return_df=False)
 323        compare_to_reference([lb[-1], lbpval[-1]], ljung_box_4,
 324                             decimal=(13, 13))
 325        compare_to_reference([bp[-1], bppval[-1]], ljung_box_bp_4,
 326                             decimal=(13, 13))
 327
 328    def test_acorr_ljung_box_big_default(self):
 329        res = self.res
 330        # R test with big dataset and default lag
 331        # bt = Box.test(residuals(fm), type = "Ljung-Box")
 332        # mkhtest(bt, "ljung_box_none", "chi2")
 333        ljung_box_none = dict(statistic=51.03724531797195,
 334                              pvalue=0.11334744923390,
 335                              distr='chi2')
 336
 337        # bt = Box.test(residuals(fm), type = "Box-Pierce")
 338        # mkhtest(bt, "ljung_box_bp_none", "chi2")
 339        ljung_box_bp_none = dict(statistic=45.12238537034000,
 340                                 pvalue=0.26638168491464,
 341                                 distr='chi2')
 342        lags = min(40, res.resid.shape[0] // 2 - 2)
 343        lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(res.resid,
 344                                                       boxpierce=True,
 345                                                       lags=lags,
 346                                                       return_df=False)
 347        compare_to_reference([lb[-1], lbpval[-1]], ljung_box_none,
 348                             decimal=(13, 13))
 349        compare_to_reference([bp[-1], bppval[-1]], ljung_box_bp_none,
 350                             decimal=(13, 13))
 351
 352    def test_acorr_ljung_box_small_default(self):
 353        res = self.res
 354        # R test with small dataset and default lag
 355        # bt = Box.test(residuals(fm), type = "Ljung-Box")
 356        # mkhtest(bt, "ljung_box_small", "chi2")
 357        ljung_box_small = dict(statistic=9.61503968281915,
 358                               pvalue=0.72507000996945,
 359                               parameters=(0,), distr='chi2')
 360
 361        # bt = Box.test(residuals(fm), type = "Box-Pierce")
 362        # mkhtest(bt, "ljung_box_bp_small", "chi2")
 363        ljung_box_bp_small = dict(statistic=7.41692150864936,
 364                                  pvalue=0.87940785887006,
 365                                  parameters=(0,),
 366                                  distr='chi2')
 367
 368        lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(res.resid[:30],
 369                                                       boxpierce=True,
 370                                                       lags=13,
 371                                                       return_df=False)
 372        compare_to_reference([lb[-1], lbpval[-1]], ljung_box_small,
 373                             decimal=(13, 13))
 374        compare_to_reference([bp[-1], bppval[-1]], ljung_box_bp_small,
 375                             decimal=(13, 13))
 376
 377    def test_acorr_ljung_box_against_r(self, reset_randomstate):
 378        rs = np.random.RandomState(9876543)
 379        y1 = rs.standard_normal(100)
 380        e = rs.standard_normal(201)
 381        y2 = np.zeros_like(e)
 382        y2[0] = e[0]
 383        for i in range(1, 201):
 384            y2[i] = 0.5 * y2[i - 1] - 0.4 * e[i - 1] + e[i]
 385        y2 = y2[-100:]
 386
 387        r_results_y1_lb = [[0.15685, 1, 0.6921],
 388                           [5.4737, 5, 0.3608],
 389                           [10.508, 10, 0.3971]]
 390
 391        r_results_y2_lb = [[2.8764, 1, 0.08989],
 392                           [3.8104, 5, 0.577],
 393                           [8.4779, 10, 0.5823]]
 394        res_y1 = smsdia.acorr_ljungbox(y1, 10, return_df=True)
 395        res_y2 = smsdia.acorr_ljungbox(y2, 10, return_df=True)
 396        for i, loc in enumerate((1, 5, 10)):
 397            row = res_y1.loc[loc]
 398            assert_allclose(r_results_y1_lb[i][0], row.loc["lb_stat"],
 399                            rtol=1e-3)
 400            assert_allclose(r_results_y1_lb[i][2], row.loc["lb_pvalue"],
 401                            rtol=1e-3)
 402
 403            row = res_y2.loc[loc]
 404            assert_allclose(r_results_y2_lb[i][0], row.loc["lb_stat"],
 405                            rtol=1e-3)
 406            assert_allclose(r_results_y2_lb[i][2], row.loc["lb_pvalue"],
 407                            rtol=1e-3)
 408
 409        res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True, return_df=True)
 410        assert_allclose(res.loc[10, "bp_stat"], 7.8935, rtol=1e-3)
 411        assert_allclose(res.loc[10, "bp_pvalue"], 0.639, rtol=1e-3)
 412
 413        res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True, return_df=True,
 414                                    model_df=1)
 415        assert_allclose(res.loc[10, "bp_pvalue"], 0.5449, rtol=1e-3)
 416
 417    def test_harvey_collier(self):
 418        # > hc = harvtest(fm, order.by = NULL, data = list())
 419        # > mkhtest_f(hc, 'harvey_collier', 't')
 420        harvey_collier = dict(statistic=0.494432160939874,
 421                              pvalue=0.6215491310408242,
 422                              parameters=198, distr='t')
 423
 424        hc = smsdia.linear_harvey_collier(self.res)
 425        compare_to_reference(hc, harvey_collier, decimal=(12, 12))
 426
 427    def test_rainbow(self):
 428        # rainbow test
 429        # > rt = raintest(fm)
 430        # > mkhtest_f(rt, 'raintest', 'f')
 431        raintest = dict(statistic=0.6809600116739604, pvalue=0.971832843583418,
 432                        parameters=(101, 98), distr='f')
 433
 434        # > rt = raintest(fm, fraction=0.4)
 435        # > mkhtest_f(rt, 'raintest_fraction_04', 'f')
 436        raintest_fraction_04 = dict(statistic=0.565551237772662,
 437                                    pvalue=0.997592305968473,
 438                                    parameters=(122, 77), distr='f')
 439
 440        rb = smsdia.linear_rainbow(self.res)
 441        compare_to_reference(rb, raintest, decimal=(13, 14))
 442        rb = smsdia.linear_rainbow(self.res, frac=0.4)
 443        compare_to_reference(rb, raintest_fraction_04, decimal=(13, 14))
 444
 445    def test_compare_lr(self):
 446        res = self.res
 447        res3 = self.res3  # nested within res
 448        # lrtest
 449        # lrt = lrtest(fm, fm2)
 450        # Model 1: ginv ~ ggdp + lint
 451        # Model 2: ginv ~ ggdp
 452
 453        lrtest = dict(loglike1=-763.9752181602237, loglike2=-766.3091902020184,
 454                      chi2value=4.66794408358942, pvalue=0.03073069384028677,
 455                      df=(4, 3, 1))
 456        lrt = res.compare_lr_test(res3)
 457        assert_almost_equal(lrt[0], lrtest['chi2value'], decimal=11)
 458        assert_almost_equal(lrt[1], lrtest['pvalue'], decimal=11)
 459
 460        waldtest = dict(fvalue=4.65216373312492, pvalue=0.03221346195239025,
 461                        df=(199, 200, 1))
 462
 463        wt = res.compare_f_test(res3)
 464        assert_almost_equal(wt[0], waldtest['fvalue'], decimal=11)
 465        assert_almost_equal(wt[1], waldtest['pvalue'], decimal=11)
 466
 467    def test_compare_j(self):
 468        res = self.res
 469        res2 = self.res2
 470        # jt = jtest(fm, lm(ginv ~ ggdp + tbilrate))
 471        # Estimate         Std. Error  t value Pr(>|t|)
 472        jtest = [('M1 + fitted(M2)', 1.591505670785873, 0.7384552861695823,
 473                  2.155182176352370, 0.032354572525314450, '*'),
 474                 ('M2 + fitted(M1)', 1.305687653016899, 0.4808385176653064,
 475                  2.715438978051544, 0.007203854534057954, '**')]
 476
 477        jt1 = smsdia.compare_j(res2, res)
 478        assert_almost_equal(jt1, jtest[0][3:5], decimal=13)
 479
 480        jt2 = smsdia.compare_j(res, res2)
 481        assert_almost_equal(jt2, jtest[1][3:5], decimal=13)
 482
 483    def test_compare_j_class(self, diagnostic_data):
 484        # Only to verify not broken, deprecated
 485        with pytest.warns(FutureWarning, match="CompareJ is deprecated"):
 486            cj = smsdia.CompareJ()
 487        data = diagnostic_data
 488        res1 = OLS(data.y, data[["c", "x1"]]).fit()
 489        res2 = OLS(data.y, data[["c", "x2"]]).fit()
 490        cj(res1, res2)
 491        cj.run(res1, res2, attach=True)
 492        assert cj.res_zx is not None
 493
 494    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
 495    def test_compare_error(self, comp, diagnostic_data):
 496        data = diagnostic_data
 497        res1 = OLS(data.y, data[["c", "x1"]]).fit()
 498        res2 = OLS(data.x3, data[["c", "x2"]]).fit()
 499        with pytest.raises(ValueError, match="endogenous variables"):
 500            comp(res1, res2)
 501
 502    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
 503    def test_compare_nested(self, comp, diagnostic_data):
 504        data = diagnostic_data
 505        res1 = OLS(data.y, data[["c", "x1"]]).fit()
 506        res2 = OLS(data.y, data[["c", "x1", "x2"]]).fit()
 507        with pytest.raises(ValueError, match="The exog in results_x"):
 508            comp(res1, res2)
 509        with pytest.raises(ValueError, match="The exog in results_x"):
 510            comp(res2, res1)
 511
 512    def test_compare_cox(self):
 513        res = self.res
 514        res2 = self.res2
 515        # Estimate        Std. Error  z value   Pr(>|z|)
 516        coxtest = [('fitted(M1) ~ M2', -0.782030488930356, 0.599696502782265,
 517                    -1.304043770977755, 1.922186587840554e-01, ' '),
 518                   ('fitted(M2) ~ M1', -2.248817107408537, 0.392656854330139,
 519                    -5.727181590258883, 1.021128495098556e-08, '***')]
 520
 521        ct1 = smsdia.compare_cox(res, res2)
 522        assert_almost_equal(ct1, coxtest[0][3:5], decimal=12)
 523
 524        ct2 = smsdia.compare_cox(res2, res)
 525        assert_almost_equal(ct2, coxtest[1][3:5], decimal=12)
 526
 527        _, _, store = smsdia.compare_cox(res, res2, store=True)
 528        assert isinstance(store, smsdia.ResultsStore)
 529
 530    def test_compare_cox_class(self, diagnostic_data):
 531        # Only to verify not broken, deprecated
 532        with pytest.warns(FutureWarning, match="CompareCox is deprecated"):
 533            cc = smsdia.CompareCox()
 534        data = diagnostic_data
 535        res1 = OLS(data.y, data[["c", "x1"]]).fit()
 536        res2 = OLS(data.y, data[["c", "x2"]]).fit()
 537        cc(res1, res2)
 538        cc.run(res1, res2, attach=True)
 539        assert cc.res_dx is not None
 540
 541    def test_cusum_ols(self):
 542        # R library(strucchange)
 543        # > sc = sctest(ginv ~ ggdp + lint, type="OLS-CUSUM")
 544        # > mkhtest(sc, 'cusum_ols', 'BB')
 545        cusum_ols = dict(statistic=1.055750610401214,
 546                         pvalue=0.2149567397376543,
 547                         parameters=(), distr='BB')  # Brownian Bridge
 548
 549        k_vars = 3
 550        cs_ols = smsdia.breaks_cusumolsresid(self.res.resid, ddof=k_vars)  #
 551        compare_to_reference(cs_ols, cusum_ols, decimal=(12, 12))
 552
 553    def test_breaks_hansen(self):
 554        # > sc = sctest(ginv ~ ggdp + lint, type="Nyblom-Hansen")
 555        # > mkhtest(sc, 'breaks_nyblom_hansen', 'BB')
 556        breaks_nyblom_hansen = dict(statistic=1.0300792740544484,
 557                                    pvalue=0.1136087530212015,
 558                                    parameters=(), distr='BB')
 559
 560        bh = smsdia.breaks_hansen(self.res)
 561        assert_almost_equal(bh[0], breaks_nyblom_hansen['statistic'],
 562                            decimal=13)
 563        # TODO: breaks_hansen does not return pvalues
 564
 565    def test_recursive_residuals(self):
 566
 567        reccumres_standardize = np.array([-2.151, -3.748, -3.114, -3.096,
 568                                          -1.865, -2.230, -1.194, -3.500,
 569                                          -3.638, -4.447, -4.602, -4.631,
 570                                          -3.999,
 571                                          -4.830, -5.429, -5.435, -6.554,
 572                                          -8.093, -8.567, -7.532, -7.079,
 573                                          -8.468,
 574                                          -9.320, -12.256, -11.932, -11.454,
 575                                          -11.690, -11.318, -12.665, -12.842,
 576                                          -11.693, -10.803, -12.113, -12.109,
 577                                          -13.002, -11.897, -10.787, -10.159,
 578                                          -9.038, -9.007, -8.634, -7.552,
 579                                          -7.153, -6.447, -5.183, -3.794,
 580                                          -3.511,
 581                                          -3.979, -3.236, -3.793, -3.699,
 582                                          -5.056, -5.724, -4.888, -4.309,
 583                                          -3.688,
 584                                          -3.918, -3.735, -3.452, -2.086,
 585                                          -6.520, -7.959, -6.760, -6.855,
 586                                          -6.032,
 587                                          -4.405, -4.123, -4.075, -3.235,
 588                                          -3.115, -3.131, -2.986, -1.813,
 589                                          -4.824,
 590                                          -4.424, -4.796, -4.000, -3.390,
 591                                          -4.485, -4.669, -4.560, -3.834,
 592                                          -5.507,
 593                                          -3.792, -2.427, -1.756, -0.354,
 594                                          1.150, 0.586, 0.643, 1.773, -0.830,
 595                                          -0.388, 0.517, 0.819, 2.240, 3.791,
 596                                          3.187, 3.409, 2.431, 0.668, 0.957,
 597                                          -0.928, 0.327, -0.285, -0.625,
 598                                          -2.316, -1.986, -0.744, -1.396,
 599                                          -1.728,
 600                                          -0.646, -2.602, -2.741, -2.289,
 601                                          -2.897, -1.934, -2.532, -3.175,
 602                                          -2.806,
 603                                          -3.099, -2.658, -2.487, -2.515,
 604                                          -2.224, -2.416, -1.141, 0.650,
 605                                          -0.947,
 606                                          0.725, 0.439, 0.885, 2.419, 2.642,
 607                                          2.745, 3.506, 4.491, 5.377, 4.624,
 608                                          5.523, 6.488, 6.097, 5.390, 6.299,
 609                                          6.656, 6.735, 8.151, 7.260, 7.846,
 610                                          8.771, 8.400, 8.717, 9.916, 9.008,
 611                                          8.910, 8.294, 8.982, 8.540, 8.395,
 612                                          7.782, 7.794, 8.142, 8.362, 8.400,
 613                                          7.850, 7.643, 8.228, 6.408, 7.218,
 614                                          7.699, 7.895, 8.725, 8.938, 8.781,
 615                                          8.350, 9.136, 9.056, 10.365, 10.495,
 616                                          10.704, 10.784, 10.275, 10.389,
 617                                          11.586, 11.033, 11.335, 11.661,
 618                                          10.522,
 619                                          10.392, 10.521, 10.126, 9.428, 9.734,
 620                                          8.954, 9.949, 10.595, 8.016,
 621                                          6.636, 6.975])
 622
 623        rr = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95)
 624        assert_equal(np.round(rr[5][1:], 3),
 625                     reccumres_standardize)  # extra zero in front
 626        # assert_equal(np.round(rr[3][4:], 3), np.diff(reccumres_standardize))
 627        assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize), 3)
 628        assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4)
 629        order = np.arange(self.res.model.endog.shape[0])
 630        rr_order = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95,
 631                                                 order_by=order)
 632        assert_allclose(rr[0], rr_order[0], atol=1e-6)
 633
 634        # regression number, visually checked with graph from gretl
 635        ub0 = np.array([13.37318571, 13.50758959, 13.64199346, 13.77639734,
 636                        13.91080121])
 637        ub1 = np.array([39.44753774, 39.58194162, 39.7163455, 39.85074937,
 638                        39.98515325])
 639        lb, ub = rr[6]
 640        assert_almost_equal(ub[:5], ub0, decimal=7)
 641        assert_almost_equal(lb[:5], -ub0, decimal=7)
 642        assert_almost_equal(ub[-5:], ub1, decimal=7)
 643        assert_almost_equal(lb[-5:], -ub1, decimal=7)
 644
 645        # test a few values with explicit OLS
 646        endog = self.res.model.endog
 647        exog = self.res.model.exog
 648        params = []
 649        ypred = []
 650        for i in range(3, 10):
 651            resi = OLS(endog[:i], exog[:i]).fit()
 652            ypred.append(resi.model.predict(resi.params, exog[i]))
 653            params.append(resi.params)
 654        assert_almost_equal(rr[2][3:10], ypred, decimal=12)
 655        assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12)
 656        assert_almost_equal(rr[1][2:9], params, decimal=12)
 657
 658    def test_normality(self):
 659        res = self.res
 660
 661        # > library(nortest) #Lilliefors (Kolmogorov-Smirnov) normality test
 662        # > lt = lillie.test(residuals(fm))
 663        # > mkhtest(lt, "lilliefors", "-")
 664        lilliefors1 = dict(statistic=0.0723390908786589,
 665                           pvalue=0.01204113540102896, parameters=(),
 666                           distr='-')
 667
 668        # > lt = lillie.test(residuals(fm)**2)
 669        # > mkhtest(lt, "lilliefors", "-")
 670        lilliefors2 = dict(statistic=0.301311621898024,
 671                           pvalue=1.004305736618051e-51,
 672                           parameters=(), distr='-')
 673
 674        # > lt = lillie.test(residuals(fm)[1:20])
 675        # > mkhtest(lt, "lilliefors", "-")
 676        lilliefors3 = dict(statistic=0.1333956004203103,
 677                           pvalue=0.455683, parameters=(), distr='-')
 678
 679        lf1 = smsdia.lilliefors(res.resid, pvalmethod='approx')
 680        lf2 = smsdia.lilliefors(res.resid ** 2, pvalmethod='approx')
 681        lf3 = smsdia.lilliefors(res.resid[:20], pvalmethod='approx')
 682
 683        compare_to_reference(lf1, lilliefors1, decimal=(14, 14))
 684        compare_to_reference(lf2, lilliefors2,
 685                             decimal=(14, 14))  # pvalue very small
 686        assert_allclose(lf2[1], lilliefors2['pvalue'], rtol=1e-10)
 687        compare_to_reference(lf3, lilliefors3, decimal=(14, 1))
 688        # R uses different approximation for pvalue in last case
 689
 690        # > ad = ad.test(residuals(fm))
 691        # > mkhtest(ad, "ad3", "-")
 692        adr1 = dict(statistic=1.602209621518313, pvalue=0.0003937979149362316,
 693                    parameters=(), distr='-')
 694
 695        # > ad = ad.test(residuals(fm)**2)
 696        # > mkhtest(ad, "ad3", "-")
 697
 698        # > ad = ad.test(residuals(fm)[1:20])
 699        # > mkhtest(ad, "ad3", "-")
 700        adr3 = dict(statistic=0.3017073732210775, pvalue=0.5443499281265933,
 701                    parameters=(), distr='-')
 702
 703        ad1 = smsdia.normal_ad(res.resid)
 704        compare_to_reference(ad1, adr1, decimal=(11, 13))
 705        ad2 = smsdia.normal_ad(res.resid ** 2)
 706        assert_(np.isinf(ad2[0]))
 707        ad3 = smsdia.normal_ad(res.resid[:20])
 708        compare_to_reference(ad3, adr3, decimal=(11, 12))
 709
 710    def test_influence(self):
 711        res = self.res
 712
 713        # this test is slow
 714        infl = oi.OLSInfluence(res)
 715
 716        path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
 717        with open(path, 'r') as fp:
 718            lsdiag = json.load(fp)
 719
 720        # basic
 721        assert_almost_equal(np.array(lsdiag['cov.scaled']).reshape(3, 3),
 722                            res.cov_params(), decimal=14)
 723        assert_almost_equal(np.array(lsdiag['cov.unscaled']).reshape(3, 3),
 724                            res.normalized_cov_params, decimal=14)
 725
 726        c0, c1 = infl.cooks_distance  # TODO: what's c1
 727
 728        assert_almost_equal(c0, lsdiag['cooks'], decimal=14)
 729        assert_almost_equal(infl.hat_matrix_diag, lsdiag['hat'], decimal=14)
 730        assert_almost_equal(infl.resid_studentized_internal,
 731                            lsdiag['std.res'], decimal=14)
 732
 733        # slow:
 734        # infl._get_all_obs()  #slow, nobs estimation loop, called implicitly
 735        dffits, dffth = infl.dffits
 736        assert_almost_equal(dffits, lsdiag['dfits'], decimal=14)
 737        assert_almost_equal(infl.resid_studentized_external,
 738                            lsdiag['stud.res'], decimal=14)
 739
 740        fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
 741        infl_r = pd.read_csv(fn, index_col=0)
 742        # not used yet:
 743        # infl_bool_r  = pandas.read_csv(fn, index_col=0,
 744        #                                converters=dict(zip(lrange(7),[conv]*7)))
 745        infl_r2 = np.asarray(infl_r)
 746        assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
 747        assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
 748        # duplicates
 749        assert_almost_equal(dffits, infl_r2[:, 3], decimal=14)
 750        assert_almost_equal(c0, infl_r2[:, 5], decimal=14)
 751        assert_almost_equal(infl.hat_matrix_diag, infl_r2[:, 6], decimal=14)
 752
 753        # TODO: finish and check thresholds and pvalues
 754        # Note: for dffits, R uses a threshold around 0.36,
 755        #        mine: dffits[1]=0.24373
 756        # R has
 757        # >>> np.nonzero(np.asarray(infl_bool_r["dffit"]))[0]
 758        # array([  6,  26,  63,  76,  90, 199])
 759        # >>> np.nonzero(np.asarray(infl_bool_r["cov.r"]))[0]
 760        # array([  4,  26,  59,  61,  63,  72,  76,  84,  91,  92,  94,  95,
 761        #        108, 197, 198])
 762        # >>> np.nonzero(np.asarray(infl_bool_r["hat"]))[0]
 763        # array([ 62,  76,  84,  90,  91,  92,  95, 108, 197, 199])
 764
 765
 766class TestDiagnosticGPandas(TestDiagnosticG):
 767
 768    @classmethod
 769    def setup_class(cls):
 770        d = macrodata.load_pandas().data
 771        # growth rates
 772        d['gs_l_realinv'] = 400 * np.log(d['realinv']).diff()
 773        d['gs_l_realgdp'] = 400 * np.log(d['realgdp']).diff()
 774        d['lint'] = d['realint'].shift(1)
 775        d['tbilrate'] = d['tbilrate'].shift(1)
 776
 777        d = d.dropna()
 778        cls.d = d
 779        endogg = d['gs_l_realinv']
 780        exogg = add_constant(d[['gs_l_realgdp', 'lint']])
 781        exogg2 = add_constant(d[['gs_l_realgdp', 'tbilrate']])
 782        exogg3 = add_constant(d[['gs_l_realgdp']])
 783
 784        res_ols = OLS(endogg, exogg).fit()
 785        res_ols2 = OLS(endogg, exogg2).fit()
 786
 787        res_ols3 = OLS(endogg, exogg3).fit()
 788
 789        cls.res = res_ols
 790        cls.res2 = res_ols2
 791        cls.res3 = res_ols3
 792        cls.endog = cls.res.model.endog
 793        cls.exog = cls.res.model.exog
 794
 795
 796def test_spec_white():
 797    resdir = os.path.join(cur_dir, "results")
 798    wsfiles = ['wspec1.csv', 'wspec2.csv', 'wspec3.csv', 'wspec4.csv']
 799    for file in wsfiles:
 800        mdlfile = os.path.join(resdir, file)
 801        mdl = np.asarray(pd.read_csv(mdlfile))
 802        # DV is in last column
 803        lastcol = mdl.shape[1] - 1
 804        dv = mdl[:, lastcol]
 805        # create design matrix
 806        design = np.concatenate((np.ones((mdl.shape[0], 1)),
 807                                 np.delete(mdl, lastcol, 1)), axis=1)
 808        # perform OLS and generate residuals
 809        resids = dv - np.dot(design, np.linalg.lstsq(design, dv, rcond=-1)[0])
 810        # perform White spec test. wspec3/wspec4 contain dummies.
 811        wsres = smsdia.spec_white(resids, design)
 812        # compare results to SAS 9.3 output
 813        if file == 'wspec1.csv':
 814            assert_almost_equal(wsres, [3.251, 0.661, 5], decimal=3)
 815        elif file == 'wspec2.csv':
 816            assert_almost_equal(wsres, [6.070, 0.733, 9], decimal=3)
 817        elif file == 'wspec3.csv':
 818            assert_almost_equal(wsres, [6.767, 0.454, 7], decimal=3)
 819        else:
 820            assert_almost_equal(wsres, [8.462, 0.671, 11], decimal=3)
 821
 822
 823def test_spec_white_error(reset_randomstate):
 824    with pytest.raises(ValueError, match="White's specification test "):
 825        smsdia.spec_white(np.random.standard_normal(100),
 826                          np.random.standard_normal((100, 1)))
 827    with pytest.raises(ValueError, match="White's specification test "):
 828        smsdia.spec_white(np.random.standard_normal(100),
 829                          np.random.standard_normal((100, 2)))
 830
 831
 832def test_linear_lm_direct(reset_randomstate):
 833    endog = np.random.standard_normal(500)
 834    exog = add_constant(np.random.standard_normal((500, 3)))
 835    res = OLS(endog, exog).fit()
 836    lm_res = smsdia.linear_lm(res.resid, exog)
 837    aug = np.hstack([exog, exog[:, 1:]**2])
 838    res_aug = OLS(res.resid, aug).fit()
 839    stat = res_aug.rsquared * aug.shape[0]
 840    assert_allclose(lm_res[0], stat)
 841
 842
 843# TODO: make this a test or move/remove
 844def grangertest():
 845    # > gt = grangertest(ginv, ggdp, order=4)
 846    # > gt
 847    # Granger causality test
 848    #
 849    # Model 1: ggdp ~ Lags(ggdp, 1:4) + Lags(ginv, 1:4)
 850    # Model 2: ggdp ~ Lags(ggdp, 1:4)
 851
 852    dict(fvalue=1.589672703015157, pvalue=0.178717196987075,
 853         df=(198, 193))
 854
 855
 856@pytest.mark.smoke
 857def test_outlier_influence_funcs(reset_randomstate):
 858    x = add_constant(np.random.randn(10, 2))
 859    y = x.sum(1) + np.random.randn(10)
 860    res = OLS(y, x).fit()
 861    out_05 = oi.summary_table(res)
 862    # GH3344 : Check alpha has an effect
 863    out_01 = oi.summary_table(res, alpha=0.01)
 864    assert_(np.all(out_01[1][:, 6] <= out_05[1][:, 6]))
 865    assert_(np.all(out_01[1][:, 7] >= out_05[1][:, 7]))
 866
 867    res2 = OLS(y, x[:, 0]).fit()
 868    oi.summary_table(res2, alpha=0.05)
 869    infl = res2.get_influence()
 870    infl.summary_table()
 871
 872
 873def test_influence_wrapped():
 874    from pandas import DataFrame
 875
 876    d = macrodata.load_pandas().data
 877    # growth rates
 878    gs_l_realinv = 400 * np.log(d['realinv']).diff().dropna()
 879    gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna()
 880    lint = d['realint'][:-1]
 881
 882    # re-index these because they will not conform to lint
 883    gs_l_realgdp.index = lint.index
 884    gs_l_realinv.index = lint.index
 885
 886    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
 887    # order is important
 888    exog = DataFrame(data, columns=['const', 'lrealgdp', 'lint'])
 889
 890    res = OLS(gs_l_realinv, exog).fit()
 891
 892    # basic
 893    # already tested
 894    # assert_almost_equal(lsdiag['cov.scaled'],
 895    #                    res.cov_params().values.ravel(), decimal=14)
 896    # assert_almost_equal(lsdiag['cov.unscaled'],
 897    #                    res.normalized_cov_params.values.ravel(), decimal=14)
 898
 899    infl = oi.OLSInfluence(res)
 900
 901    # smoke test just to make sure it works, results separately tested
 902    df = infl.summary_frame()
 903    assert_(isinstance(df, DataFrame))
 904
 905    # this test is slow
 906    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
 907    with open(path, "r") as fp:
 908        lsdiag = json.load(fp)
 909
 910    c0, c1 = infl.cooks_distance  # TODO: what's c1, it's pvalues? -ss
 911
 912    # NOTE: we get a hard-cored 5 decimals with pandas testing
 913    assert_almost_equal(c0, lsdiag['cooks'], 14)
 914    assert_almost_equal(infl.hat_matrix_diag, (lsdiag['hat']), 14)
 915    assert_almost_equal(infl.resid_studentized_internal,
 916                        lsdiag['std.res'], 14)
 917
 918    # slow:
 919    dffits, dffth = infl.dffits
 920    assert_almost_equal(dffits, lsdiag['dfits'], 14)
 921    assert_almost_equal(infl.resid_studentized_external,
 922                        lsdiag['stud.res'], 14)
 923
 924    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
 925    infl_r = pd.read_csv(fn, index_col=0)
 926    # not used yet:
 927    # infl_bool_r  = pandas.read_csv(fn, index_col=0,
 928    #                                converters=dict(zip(lrange(7),[conv]*7)))
 929    infl_r2 = np.asarray(infl_r)
 930    # TODO: finish wrapping this stuff
 931    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
 932    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
 933
 934
 935def test_influence_dtype():
 936    # see #2148  bug when endog is integer
 937    y = np.ones(20)
 938    np.random.seed(123)
 939    x = np.random.randn(20, 3)
 940    res1 = OLS(y, x).fit()
 941
 942    res2 = OLS(y * 1., x).fit()
 943    cr1 = res1.get_influence().cov_ratio
 944    cr2 = res2.get_influence().cov_ratio
 945    assert_allclose(cr1, cr2, rtol=1e-14)
 946    # regression test for values
 947    cr3 = np.array(
 948        [1.22239215, 1.31551021, 1.52671069, 1.05003921, 0.89099323,
 949         1.57405066, 1.03230092, 0.95844196, 1.15531836, 1.21963623,
 950         0.87699564, 1.16707748, 1.10481391, 0.98839447, 1.08999334,
 951         1.35680102, 1.46227715, 1.45966708, 1.13659521, 1.22799038])
 952    assert_almost_equal(cr1, cr3, decimal=8)
 953
 954
 955def get_duncan_data():
 956    # results from R with NA -> 1. Just testing interface here because
 957    # outlier_test is just a wrapper
 958    labels = ['accountant', 'pilot', 'architect', 'author', 'chemist',
 959              'minister', 'professor', 'dentist', 'reporter', 'engineer',
 960              'undertaker', 'lawyer', 'physician', 'welfare.worker', 'teacher',
 961              'conductor', 'contractor', 'factory.owner', 'store.manager',
 962              'banker', 'bookkeeper', 'mail.carrier', 'insurance.agent',
 963              'store.clerk', 'carpenter', 'electrician', 'RR.engineer',
 964              'machinist', 'auto.repairman', 'plumber', 'gas.stn.attendant',
 965              'coal.miner', 'streetcar.motorman', 'taxi.driver',
 966              'truck.driver', 'machine.operator', 'barber', 'bartender',
 967              'shoe.shiner', 'cook', 'soda.clerk', 'watchman', 'janitor',
 968              'policeman', 'waiter']
 969    # Duncan's prestige data from car
 970    exog = [[1.0, 62.0, 86.0], [1.0, 72.0, 76.0], [1.0, 75.0, 92.0],
 971            [1.0, 55.0, 90.0], [1.0, 64.0, 86.0], [1.0, 21.0, 84.0],
 972            [1.0, 64.0, 93.0], [1.0, 80.0, 100.0], [1.0, 67.0, 87.0],
 973            [1.0, 72.0, 86.0], [1.0, 42.0, 74.0], [1.0, 76.0, 98.0],
 974            [1.0, 76.0, 97.0], [1.0, 41.0, 84.0], [1.0, 48.0, 91.0],
 975            [1.0, 76.0, 34.0], [1.0, 53.0, 45.0], [1.0, 60.0, 56.0],
 976            [1.0, 42.0, 44.0], [1.0, 78.0, 82.0], [1.0, 29.0, 72.0],
 977            [1.0, 48.0, 55.0], [1.0, 55.0, 71.0], [1.0, 29.0, 50.0],
 978            [1.0, 21.0, 23.0], [1.0, 47.0, 39.0], [1.0, 81.0, 28.0],
 979            [1.0, 36.0, 32.0], [1.0, 22.0, 22.0], [1.0, 44.0, 25.0],
 980            [1.0, 15.0, 29.0], [1.0, 7.0, 7.0], [1.0, 42.0, 26.0],
 981            [1.0, 9.0, 19.0], [1.0, 21.0, 15.0], [1.0, 21.0, 20.0],
 982            [1.0, 16.0, 26.0], [1.0, 16.0, 28.0], [1.0, 9.0, 17.0],
 983            [1.0, 14.0, 22.0], [1.0, 12.0, 30.0], [1.0, 17.0, 25.0],
 984            [1.0, 7.0, 20.0], [1.0, 34.0, 47.0], [1.0, 8.0, 32.0]]
 985    endog = [82., 83., 90., 76., 90., 87., 93., 90., 52., 88., 57.,
 986             89., 97., 59., 73., 38., 76., 81., 45., 92., 39., 34.,
 987             41., 16., 33., 53., 67., 57., 26., 29., 10., 15., 19.,
 988             10., 13., 24., 20., 7., 3., 16., 6., 11., 8., 41.,
 989             10.]
 990
 991    return endog, exog, labels
 992
 993
 994def test_outlier_test():
 995    endog, exog, labels = get_duncan_data()
 996    ndarray_mod = OLS(endog, exog).fit()
 997    rstudent = [3.1345185839, -2.3970223990, 2.0438046359, -1.9309187757,
 998                1.8870465798, -1.7604905300, -1.7040324156, 1.6024285876,
 999                -1.4332485037, -1.1044851583, 1.0688582315, 1.0185271840,
1000                -0.9024219332, -0.9023876471, -0.8830953936, 0.8265782334,
1001                0.8089220547, 0.7682770197, 0.7319491074, -0.6665962829,
1002                0.5227352794, -0.5135016547, 0.5083881518, 0.4999224372,
1003                -0.4980818221, -0.4759717075, -0.4293565820, -0.4114056499,
1004                -0.3779540862, 0.3556874030, 0.3409200462, 0.3062248646,
1005                0.3038999429, -0.3030815773, -0.1873387893, 0.1738050251,
1006                0.1424246593, -0.1292266025, 0.1272066463, -0.0798902878,
1007                0.0788467222, 0.0722556991, 0.0505098280, 0.0233215136,
1008                0.0007112055]
1009    unadj_p = [0.003177202, 0.021170298, 0.047432955, 0.060427645, 0.066248120,
1010               0.085783008, 0.095943909, 0.116738318, 0.159368890, 0.275822623,
1011               0.291386358, 0.314400295, 0.372104049, 0.372122040, 0.382333561,
1012               0.413260793, 0.423229432, 0.446725370, 0.468363101, 0.508764039,
1013               0.603971990, 0.610356737, 0.613905871, 0.619802317, 0.621087703,
1014               0.636621083, 0.669911674, 0.682917818, 0.707414459, 0.723898263,
1015               0.734904667, 0.760983108, 0.762741124, 0.763360242, 0.852319039,
1016               0.862874018, 0.887442197, 0.897810225, 0.899398691, 0.936713197,
1017               0.937538115, 0.942749758, 0.959961394, 0.981506948, 0.999435989]
1018    bonf_p = [0.1429741, 0.9526634, 2.1344830, 2.7192440, 2.9811654, 3.8602354,
1019              4.3174759, 5.2532243, 7.1716001, 12.4120180, 13.1123861,
1020              14.1480133,
1021              16.7446822, 16.7454918, 17.2050103, 18.5967357, 19.0453245,
1022              20.1026416, 21.0763395, 22.8943818, 27.1787396, 27.4660532,
1023              27.6257642, 27.8911043, 27.9489466, 28.6479487, 30.1460253,
1024              30.7313018, 31.8336506, 32.5754218, 33.0707100, 34.2442399,
1025              34.3233506, 34.3512109, 38.3543568, 38.8293308, 39.9348989,
1026              40.4014601, 40.4729411, 42.1520939, 42.1892152, 42.4237391,
1027              43.1982627, 44.1678127, 44.9746195]
1028    bonf_p = np.array(bonf_p)
1029    bonf_p[bonf_p > 1] = 1
1030    sorted_labels = ["minister", "reporter", "contractor", "insurance.agent",
1031                     "machinist", "store.clerk", "conductor", "factory.owner",
1032                     "mail.carrier", "streetcar.motorman", "carpenter",
1033                     "coal.miner",
1034                     "bartender", "bookkeeper", "soda.clerk", "chemist",
1035                     "RR.engineer",
1036                     "professor", "electrician", "gas.stn.attendant",
1037                     "auto.repairman",
1038                     "watchman", "banker", "machine.operator", "dentist",
1039                     "waiter",
1040                     "shoe.shiner", "welfare.worker", "plumber", "physician",
1041                     "pilot",
1042                     "engineer", "accountant", "lawyer", "undertaker",
1043                     "barber",
1044                     "store.manager", "truck.driver", "cook", "janitor",
1045                     "policeman",
1046                     "architect", "teacher", "taxi.driver", "author"]
1047
1048    res2 = np.c_[rstudent, unadj_p, bonf_p]
1049    res = oi.outlier_test(ndarray_mod, method='b', labels=labels, order=True)
1050    np.testing.assert_almost_equal(res.values, res2, 7)
1051    np.testing.assert_equal(res.index.tolist(),
1052                            sorted_labels)  # pylint: disable-msg=E1103
1053
1054    data = pd.DataFrame(np.column_stack((endog, exog)),
1055                        columns='y const var1 var2'.split(),
1056                        index=labels)
1057
1058    # check `order` with pandas bug in #3971
1059    res_pd = OLS.from_formula('y ~ const + var1 + var2 - 0', data).fit()
1060
1061    res_outl2 = oi.outlier_test(res_pd, method='b', order=True)
1062    assert_almost_equal(res_outl2.values, res2, 7)
1063    assert_equal(res_outl2.index.tolist(), sorted_labels)
1064
1065    res_outl1 = res_pd.outlier_test(method='b')
1066    res_outl1 = res_outl1.sort_values(['unadj_p'], ascending=True)
1067    assert_almost_equal(res_outl1.values, res2, 7)
1068    assert_equal(res_outl1.index.tolist(), sorted_labels)
1069    assert_array_equal(res_outl2.index, res_outl1.index)
1070
1071    # additional keywords in method
1072    res_outl3 = res_pd.outlier_test(method='b', order=True)
1073    assert_equal(res_outl3.index.tolist(), sorted_labels)
1074    res_outl4 = res_pd.outlier_test(method='b', order=True, cutoff=0.15)
1075    assert_equal(res_outl4.index.tolist(), sorted_labels[:1])
1076
1077
1078def test_ljungbox_dof_adj():
1079    data = sunspots.load_pandas().data['SUNACTIVITY']
1080    res = AutoReg(data, 4).fit()
1081    resid = res.resid
1082    res1 = smsdia.acorr_ljungbox(resid, lags=10, return_df=False)
1083    res2 = smsdia.acorr_ljungbox(resid, lags=10, model_df=4, return_df=False)
1084    assert_allclose(res1[0], res2[0])
1085    assert np.all(np.isnan(res2[1][:4]))
1086    assert np.all(res2[1][4:] <= res1[1][4:])
1087
1088
1089def test_ljungbox_errors_warnings():
1090    data = sunspots.load_pandas().data['SUNACTIVITY']
1091    with pytest.raises(ValueError, match="model_df must"):
1092        smsdia.acorr_ljungbox(data, model_df=-1)
1093    with pytest.raises(ValueError, match="period must"):
1094        smsdia.acorr_ljungbox(data, model_df=-1, period=1)
1095    with pytest.raises(ValueError, match="period must"):
1096        smsdia.acorr_ljungbox(data, model_df=-1, period=-2)
1097    with pytest.warns(FutureWarning, match="The default value of lags"):
1098        smsdia.acorr_ljungbox(data, return_df=False)
1099    with pytest.warns(FutureWarning, match="The value returned"):
1100        smsdia.acorr_ljungbox(data, lags=10)
1101
1102
1103def test_ljungbox_period():
1104    data = sunspots.load_pandas().data['SUNACTIVITY']
1105    ar_res = AutoReg(data, 4).fit()
1106    res = smsdia.acorr_ljungbox(ar_res.resid, period=13, return_df=True)
1107    res2 = smsdia.acorr_ljungbox(ar_res.resid, lags=26, return_df=True)
1108    assert_frame_equal(res, res2)
1109
1110
1111@pytest.mark.parametrize('cov_type', ['nonrobust', 'HC0'])
1112def test_encompasing_direct(cov_type, reset_randomstate):
1113    x = np.random.standard_normal((500, 2))
1114    e = np.random.standard_normal((500, 1))
1115    x_extra = np.random.standard_normal((500, 2))
1116    z_extra = np.random.standard_normal((500, 3))
1117    y = x @ np.ones((2, 1)) + e
1118    x1 = np.hstack([x[:, :1], x_extra])
1119    z1 = np.hstack([x, z_extra])
1120    res1 = OLS(y, x1).fit()
1121    res2 = OLS(y, z1).fit()
1122    df = smsdia.compare_encompassing(res1, res2, cov_type=cov_type)
1123
1124    direct1 = OLS(y, np.hstack([x1, x[:, 1:], z_extra])).fit(cov_type=cov_type)
1125    r1 = np.zeros((4, 3 + 1 + 3))
1126    r1[:, -4:] = np.eye(4)
1127    direct_test_1 = direct1.wald_test(r1, use_f=True)
1128    expected = (float(np.squeeze(direct_test_1.statistic)),
1129       

Large files files are truncated, but you can click here to view the full file