test_diagnostic.py | searchcode

/statsmodels/stats/tests/test_diagnostic.py

http://github.com/statsmodels/statsmodels · Python · 1921 lines · 1499 code · 199 blank · 223 comment · 29 complexity · 4e656a602527e8420fed08b0de221a2b MD5 · raw file

# -*- coding: utf-8 -*-
"""Tests for Regression Diagnostics and Specification Tests

Created on Thu Feb 09 13:19:47 2012

Author: Josef Perktold
License: BSD-3

currently all tests are against R

"""
import json
import os

import numpy as np
from numpy.testing import (
    assert_,
    assert_allclose,
    assert_almost_equal,
    assert_array_equal,
    assert_equal,
)
import pandas as pd
from pandas.testing import assert_frame_equal
import pytest

from statsmodels.datasets import macrodata, sunspots
from statsmodels.regression.linear_model import OLS
import statsmodels.stats.diagnostic as smsdia
import statsmodels.stats.outliers_influence as oi
import statsmodels.stats.sandwich_covariance as sw
from statsmodels.tools.tools import Bunch, add_constant
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA

cur_dir = os.path.abspath(os.path.dirname(__file__))


@pytest.fixture(scope="module")
def diagnostic_data():
    rs = np.random.RandomState(93674328)
    e = rs.standard_normal(500)
    x = rs.standard_normal((500, 3))
    y = x.sum(1) + e
    c = np.ones_like(y)
    data = pd.DataFrame(np.c_[y, c, x], columns=["y", "c", "x1", "x2", "x3"])

    return data


def compare_to_reference(sp, sp_dict, decimal=(12, 12)):
    assert_allclose(
        sp[0],
        sp_dict["statistic"],
        atol=10 ** -decimal[0],
        rtol=10 ** -decimal[0],
    )
    assert_allclose(
        sp[1],
        sp_dict["pvalue"],
        atol=10 ** -decimal[1],
        rtol=10 ** -decimal[0],
    )


def test_gq():
    d = macrodata.load().data

    realinv = d["realinv"]
    realgdp = d["realgdp"]
    realint = d["realint"]
    endog = realinv
    exog = add_constant(np.c_[realgdp, realint])

    # goldfeld-quandt
    het_gq_greater = dict(
        statistic=13.20512768685082,
        df1=99,
        df2=98,
        pvalue=1.246141976112324e-30,
        distr="f",
    )

    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_to_reference(gq, het_gq_greater, decimal=(12, 12))
    assert_equal(gq[-1], "increasing")


class TestDiagnosticG(object):
    @classmethod
    def setup_class(cls):
        d = macrodata.load_pandas().data
        # growth rates
        gs_l_realinv = 400 * np.diff(np.log(d["realinv"].values))
        gs_l_realgdp = 400 * np.diff(np.log(d["realgdp"].values))
        lint = d["realint"][:-1].values
        tbilrate = d["tbilrate"][:-1].values

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, lint])
        exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate])
        exogg3 = add_constant(np.c_[gs_l_realgdp])

        res_ols = OLS(endogg, exogg).fit()
        res_ols2 = OLS(endogg, exogg2).fit()

        res_ols3 = OLS(endogg, exogg3).fit()

        cls.res = res_ols
        cls.res2 = res_ols2
        cls.res3 = res_ols3
        cls.endog = cls.res.model.endog
        cls.exog = cls.res.model.exog

    def test_basic(self):
        # mainly to check I got the right regression
        # > mkarray(fm$coefficients, "params")
        params = np.array(
            [-9.48167277465485, 4.3742216647032, -0.613996969478989]
        )

        assert_almost_equal(self.res.params, params, decimal=12)

    def test_hac(self):
        res = self.res
        # > nw = NeweyWest(fm, lag = 4, prewhite = FALSE, verbose=TRUE)
        # > nw2 = NeweyWest(fm, lag=10, prewhite = FALSE, verbose=TRUE)

        # > mkarray(nw, "cov_hac_4")
        cov_hac_4 = np.array(
            [
                1.385551290884014,
                -0.3133096102522685,
                -0.0597207976835705,
                -0.3133096102522685,
                0.1081011690351306,
                0.000389440793564336,
                -0.0597207976835705,
                0.000389440793564339,
                0.0862118527405036,
            ]
        ).reshape((3, 3), order="F")

        # > mkarray(nw2, "cov_hac_10")
        cov_hac_10 = np.array(
            [
                1.257386180080192,
                -0.2871560199899846,
                -0.03958300024627573,
                -0.2871560199899845,
                0.1049107028987101,
                0.0003896205316866944,
                -0.03958300024627578,
                0.0003896205316866961,
                0.0985539340694839,
            ]
        ).reshape((3, 3), order="F")

        cov = sw.cov_hac_simple(res, nlags=4, use_correction=False)
        bse_hac = sw.se_cov(cov)
        assert_almost_equal(cov, cov_hac_4, decimal=12)
        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)

        cov = sw.cov_hac_simple(res, nlags=10, use_correction=False)
        bse_hac = sw.se_cov(cov)
        assert_almost_equal(cov, cov_hac_10, decimal=12)
        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)

    def test_het_goldfeldquandt(self):
        # TODO: test options missing

        # > gq = gqtest(fm, alternative='greater')
        # > mkhtest_f(gq, 'het_gq_greater', 'f')
        het_gq_greater = dict(
            statistic=0.5313259064778423,
            pvalue=0.9990217851193723,
            parameters=(98, 98),
            distr="f",
        )

        # > gq = gqtest(fm, alternative='less')
        # > mkhtest_f(gq, 'het_gq_less', 'f')
        het_gq_less = dict(
            statistic=0.5313259064778423,
            pvalue=0.000978214880627621,
            parameters=(98, 98),
            distr="f",
        )

        # > gq = gqtest(fm, alternative='two.sided')
        # > mkhtest_f(gq, 'het_gq_two_sided', 'f')
        het_gq_two_sided = dict(
            statistic=0.5313259064778423,
            pvalue=0.001956429761255241,
            parameters=(98, 98),
            distr="f",
        )

        # > gq = gqtest(fm, fraction=0.1, alternative='two.sided')
        # > mkhtest_f(gq, 'het_gq_two_sided_01', 'f')
        het_gq_two_sided_01 = dict(
            statistic=0.5006976835928314,
            pvalue=0.001387126702579789,
            parameters=(88, 87),
            distr="f",
        )

        endogg, exogg = self.endog, self.exog
        # tests
        gq = smsdia.het_goldfeldquandt(endogg, exogg, split=0.5)
        compare_to_reference(gq, het_gq_greater, decimal=(12, 12))
        assert_equal(gq[-1], "increasing")

        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=0.5, alternative="decreasing"
        )
        compare_to_reference(gq, het_gq_less, decimal=(12, 12))
        assert_equal(gq[-1], "decreasing")

        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=0.5, alternative="two-sided"
        )
        compare_to_reference(gq, het_gq_two_sided, decimal=(12, 12))
        assert_equal(gq[-1], "two-sided")

        # TODO: forcing the same split as R 202-90-90-1=21
        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=90, drop=21, alternative="two-sided"
        )
        compare_to_reference(gq, het_gq_two_sided_01, decimal=(12, 12))
        assert_equal(gq[-1], "two-sided")
        # TODO other options ???

    def test_het_breusch_pagan(self):
        res = self.res

        bptest = dict(
            statistic=0.709924388395087,
            pvalue=0.701199952134347,
            parameters=(2,),
            distr="f",
        )

        bp = smsdia.het_breuschpagan(res.resid, res.model.exog)
        compare_to_reference(bp, bptest, decimal=(12, 12))

    def test_het_breusch_pagan_nonrobust(self):
        res = self.res

        bptest = dict(
            statistic=1.302014063483341,
            pvalue=0.5215203247110649,
            parameters=(2,),
            distr="f",
        )

        bp = smsdia.het_breuschpagan(res.resid, res.model.exog, robust=False)
        compare_to_reference(bp, bptest, decimal=(12, 12))

    def test_het_white(self):
        res = self.res

        # TODO: regressiontest, compare with Greene or Gretl or Stata
        hw = smsdia.het_white(res.resid, res.model.exog)
        hw_values = (
            33.503722896538441,
            2.9887960597830259e-06,
            7.7945101228430946,
            1.0354575277704231e-06,
        )
        assert_almost_equal(hw, hw_values)

    def test_het_white_error(self):
        res = self.res

        with pytest.raises(ValueError, match="White's heteroskedasticity"):
            smsdia.het_white(res.resid, res.model.exog[:, :1])

    def test_het_arch(self):
        # test het_arch and indirectly het_lm against R
        # > library(FinTS)
        # > at = ArchTest(residuals(fm), lags=4)
        # > mkhtest(at, 'archtest_4', 'chi2')
        archtest_4 = dict(
            statistic=3.43473400836259,
            pvalue=0.487871315392619,
            parameters=(4,),
            distr="chi2",
        )

        # > at = ArchTest(residuals(fm), lags=12)
        # > mkhtest(at, 'archtest_12', 'chi2')
        archtest_12 = dict(
            statistic=8.648320999014171,
            pvalue=0.732638635007718,
            parameters=(12,),
            distr="chi2",
        )

        at4 = smsdia.het_arch(self.res.resid, nlags=4)
        at12 = smsdia.het_arch(self.res.resid, nlags=12)
        compare_to_reference(at4[:2], archtest_4, decimal=(12, 13))
        compare_to_reference(at12[:2], archtest_12, decimal=(12, 13))

    def test_het_arch2(self):
        # test autolag options, this also test het_lm
        # unfortunately optimal lag=1 for this data
        resid = self.res.resid

        res1 = smsdia.het_arch(resid, nlags=5, store=True)
        rs1 = res1[-1]
        res2 = smsdia.het_arch(resid, nlags=5, store=True)
        rs2 = res2[-1]

        assert_almost_equal(rs2.resols.params, rs1.resols.params, decimal=12)
        assert_almost_equal(res2[:4], res1[:4], decimal=12)

        # test that smallest lag, nlags=1 works
        res3 = smsdia.het_arch(resid, nlags=5)
        assert_almost_equal(res3[:4], res1[:4], decimal=12)

    def test_acorr_breusch_godfrey(self):
        res = self.res

        # bgf = bgtest(fm, order = 4, type="F")
        breuschgodfrey_f = dict(
            statistic=1.179280833676792,
            pvalue=0.321197487261203,
            parameters=(
                4,
                195,
            ),
            distr="f",
        )

        # > bgc = bgtest(fm, order = 4, type="Chisq")
        # > mkhtest(bgc, "breuschpagan_c", "chi2")
        breuschgodfrey_c = dict(
            statistic=4.771042651230007,
            pvalue=0.3116067133066697,
            parameters=(4,),
            distr="chi2",
        )

        bg = smsdia.acorr_breusch_godfrey(res, nlags=4)
        bg_r = [
            breuschgodfrey_c["statistic"],
            breuschgodfrey_c["pvalue"],
            breuschgodfrey_f["statistic"],
            breuschgodfrey_f["pvalue"],
        ]
        assert_almost_equal(bg, bg_r, decimal=11)

        # check that lag choice works
        bg2 = smsdia.acorr_breusch_godfrey(res, nlags=None)
        bg3 = smsdia.acorr_breusch_godfrey(res, nlags=10)
        assert_almost_equal(bg2, bg3, decimal=12)

    def test_acorr_breusch_godfrey_multidim(self):
        res = Bunch(resid=np.empty((100, 2)))
        with pytest.raises(ValueError, match="Model resid must be a 1d array"):
            smsdia.acorr_breusch_godfrey(res)

    def test_acorr_breusch_godfrey_exogs(self):
        data = sunspots.load_pandas().data["SUNACTIVITY"]
        res = ARIMA(data, order=(1, 0, 0), trend="n").fit()
        smsdia.acorr_breusch_godfrey(res, nlags=1)

    def test_acorr_ljung_box(self):

        # unit-test which may be useful later
        # ddof correction for fitted parameters in ARMA(p,q) fitdf=p+q
        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box", fitdf=2)
        # mkhtest(bt, "ljung_box_4df2", "chi2")
        # ljung_box_4df2 = dict(statistic=5.23587172795227,
        #                       pvalue=0.0729532930400377,
        #                       parameters=(2,), distr='chi2')

        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce", fitdf=2)
        # mkhtest(bt, "ljung_box_bp_4df2", "chi2")
        # ljung_box_bp_4df2 = dict(statistic=5.12462932741681,
        #                          pvalue=0.0771260128929921,
        #                          parameters=(2,), distr='chi2')

        res = self.res
        # general test
        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_4", "chi2")
        ljung_box_4 = dict(
            statistic=5.23587172795227,
            pvalue=0.263940335284713,
            parameters=(4,),
            distr="chi2",
        )

        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_4", "chi2")
        ljung_box_bp_4 = dict(
            statistic=5.12462932741681,
            pvalue=0.2747471266820692,
            parameters=(4,),
            distr="chi2",
        )

        lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(
            res.resid, 4, boxpierce=True, return_df=False
        )
        compare_to_reference(
            [lb[-1], lbpval[-1]], ljung_box_4, decimal=(12, 12)
        )
        compare_to_reference(
            [bp[-1], bppval[-1]], ljung_box_bp_4, decimal=(12, 12)
        )

    def test_acorr_ljung_box_big_default(self):
        res = self.res
        # R test with big dataset and default lag
        # bt = Box.test(residuals(fm), type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_none", "chi2")
        ljung_box_none = dict(
            statistic=51.03724531797195, pvalue=0.11334744923390, distr="chi2"
        )

        # bt = Box.test(residuals(fm), type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_none", "chi2")
        ljung_box_bp_none = dict(
            statistic=45.12238537034000, pvalue=0.26638168491464, distr="chi2"
        )
        lags = min(40, res.resid.shape[0] // 2 - 2)
        lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(
            res.resid, boxpierce=True, lags=lags, return_df=False
        )
        compare_to_reference(
            [lb[-1], lbpval[-1]], ljung_box_none, decimal=(12, 12)
        )
        compare_to_reference(
            [bp[-1], bppval[-1]], ljung_box_bp_none, decimal=(12, 12)
        )

    def test_acorr_ljung_box_small_default(self):
        res = self.res
        # R test with small dataset and default lag
        # bt = Box.test(residuals(fm), type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_small", "chi2")
        ljung_box_small = dict(
            statistic=9.61503968281915,
            pvalue=0.72507000996945,
            parameters=(0,),
            distr="chi2",
        )

        # bt = Box.test(residuals(fm), type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_small", "chi2")
        ljung_box_bp_small = dict(
            statistic=7.41692150864936,
            pvalue=0.87940785887006,
            parameters=(0,),
            distr="chi2",
        )

        lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(
            res.resid[:30], boxpierce=True, lags=13, return_df=False
        )
        compare_to_reference(
            [lb[-1], lbpval[-1]], ljung_box_small, decimal=(12, 12)
        )
        compare_to_reference(
            [bp[-1], bppval[-1]], ljung_box_bp_small, decimal=(12, 12)
        )

    def test_acorr_ljung_box_against_r(self, reset_randomstate):
        rs = np.random.RandomState(9876543)
        y1 = rs.standard_normal(100)
        e = rs.standard_normal(201)
        y2 = np.zeros_like(e)
        y2[0] = e[0]
        for i in range(1, 201):
            y2[i] = 0.5 * y2[i - 1] - 0.4 * e[i - 1] + e[i]
        y2 = y2[-100:]

        r_results_y1_lb = [
            [0.15685, 1, 0.6921],
            [5.4737, 5, 0.3608],
            [10.508, 10, 0.3971],
        ]

        r_results_y2_lb = [
            [2.8764, 1, 0.08989],
            [3.8104, 5, 0.577],
            [8.4779, 10, 0.5823],
        ]
        res_y1 = smsdia.acorr_ljungbox(y1, 10, return_df=True)
        res_y2 = smsdia.acorr_ljungbox(y2, 10, return_df=True)
        for i, loc in enumerate((1, 5, 10)):
            row = res_y1.loc[loc]
            assert_allclose(
                r_results_y1_lb[i][0], row.loc["lb_stat"], rtol=1e-3
            )
            assert_allclose(
                r_results_y1_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
            )

            row = res_y2.loc[loc]
            assert_allclose(
                r_results_y2_lb[i][0], row.loc["lb_stat"], rtol=1e-3
            )
            assert_allclose(
                r_results_y2_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
            )

        res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True, return_df=True)
        assert_allclose(res.loc[10, "bp_stat"], 7.8935, rtol=1e-3)
        assert_allclose(res.loc[10, "bp_pvalue"], 0.639, rtol=1e-3)

        res = smsdia.acorr_ljungbox(
            y2, 10, boxpierce=True, return_df=True, model_df=1
        )
        assert_allclose(res.loc[10, "bp_pvalue"], 0.5449, rtol=1e-3)

    def test_harvey_collier(self):
        # > hc = harvtest(fm, order.by = NULL, data = list())
        # > mkhtest_f(hc, 'harvey_collier', 't')
        harvey_collier = dict(
            statistic=0.494432160939874,
            pvalue=0.6215491310408242,
            parameters=198,
            distr="t",
        )

        hc = smsdia.linear_harvey_collier(self.res)
        compare_to_reference(hc, harvey_collier, decimal=(12, 12))
        hc_skip = smsdia.linear_harvey_collier(self.res, skip=20)
        assert not np.allclose(hc[0], hc_skip[0])

    def test_rainbow(self):
        # rainbow test
        # > rt = raintest(fm)
        # > mkhtest_f(rt, 'raintest', 'f')
        raintest = dict(
            statistic=0.6809600116739604,
            pvalue=0.971832843583418,
            parameters=(101, 98),
            distr="f",
        )

        # > rt = raintest(fm, fraction=0.4)
        # > mkhtest_f(rt, 'raintest_fraction_04', 'f')
        raintest_fraction_04 = dict(
            statistic=0.565551237772662,
            pvalue=0.997592305968473,
            parameters=(122, 77),
            distr="f",
        )

        rb = smsdia.linear_rainbow(self.res)
        compare_to_reference(rb, raintest, decimal=(12, 12))
        rb = smsdia.linear_rainbow(self.res, frac=0.4)
        compare_to_reference(rb, raintest_fraction_04, decimal=(12, 12))

    def test_compare_lr(self):
        res = self.res
        res3 = self.res3  # nested within res
        # lrtest
        # lrt = lrtest(fm, fm2)
        # Model 1: ginv ~ ggdp + lint
        # Model 2: ginv ~ ggdp

        lrtest = dict(
            loglike1=-763.9752181602237,
            loglike2=-766.3091902020184,
            chi2value=4.66794408358942,
            pvalue=0.03073069384028677,
            df=(4, 3, 1),
        )
        lrt = res.compare_lr_test(res3)
        assert_almost_equal(lrt[0], lrtest["chi2value"], decimal=11)
        assert_almost_equal(lrt[1], lrtest["pvalue"], decimal=11)

        waldtest = dict(
            fvalue=4.65216373312492,
            pvalue=0.03221346195239025,
            df=(199, 200, 1),
        )

        wt = res.compare_f_test(res3)
        assert_almost_equal(wt[0], waldtest["fvalue"], decimal=11)
        assert_almost_equal(wt[1], waldtest["pvalue"], decimal=11)

    def test_compare_j(self):
        res = self.res
        res2 = self.res2
        # jt = jtest(fm, lm(ginv ~ ggdp + tbilrate))
        # Estimate         Std. Error  t value Pr(>|t|)
        jtest = [
            (
                "M1 + fitted(M2)",
                1.591505670785873,
                0.7384552861695823,
                2.155182176352370,
                0.032354572525314450,
                "*",
            ),
            (
                "M2 + fitted(M1)",
                1.305687653016899,
                0.4808385176653064,
                2.715438978051544,
                0.007203854534057954,
                "**",
            ),
        ]

        jt1 = smsdia.compare_j(res2, res)
        assert_almost_equal(jt1, jtest[0][3:5], decimal=12)

        jt2 = smsdia.compare_j(res, res2)
        assert_almost_equal(jt2, jtest[1][3:5], decimal=12)

    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
    def test_compare_error(self, comp, diagnostic_data):
        data = diagnostic_data
        res1 = OLS(data.y, data[["c", "x1"]]).fit()
        res2 = OLS(data.x3, data[["c", "x2"]]).fit()
        with pytest.raises(ValueError, match="endogenous variables"):
            comp(res1, res2)

    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
    def test_compare_nested(self, comp, diagnostic_data):
        data = diagnostic_data
        res1 = OLS(data.y, data[["c", "x1"]]).fit()
        res2 = OLS(data.y, data[["c", "x1", "x2"]]).fit()
        with pytest.raises(ValueError, match="The exog in results_x"):
            comp(res1, res2)
        with pytest.raises(ValueError, match="The exog in results_x"):
            comp(res2, res1)

    def test_compare_cox(self):
        res = self.res
        res2 = self.res2
        # Estimate        Std. Error  z value   Pr(>|z|)
        coxtest = [
            (
                "fitted(M1) ~ M2",
                -0.782030488930356,
                0.599696502782265,
                -1.304043770977755,
                1.922186587840554e-01,
                " ",
            ),
            (
                "fitted(M2) ~ M1",
                -2.248817107408537,
                0.392656854330139,
                -5.727181590258883,
                1.021128495098556e-08,
                "***",
            ),
        ]

        ct1 = smsdia.compare_cox(res, res2)
        assert_almost_equal(ct1, coxtest[0][3:5], decimal=12)

        ct2 = smsdia.compare_cox(res2, res)
        assert_almost_equal(ct2, coxtest[1][3:5], decimal=12)

        _, _, store = smsdia.compare_cox(res, res2, store=True)
        assert isinstance(store, smsdia.ResultsStore)

    def test_cusum_ols(self):
        # R library(strucchange)
        # > sc = sctest(ginv ~ ggdp + lint, type="OLS-CUSUM")
        # > mkhtest(sc, 'cusum_ols', 'BB')
        cusum_ols = dict(
            statistic=1.055750610401214,
            pvalue=0.2149567397376543,
            parameters=(),
            distr="BB",
        )  # Brownian Bridge

        k_vars = 3
        cs_ols = smsdia.breaks_cusumolsresid(self.res.resid, ddof=k_vars)  #
        compare_to_reference(cs_ols, cusum_ols, decimal=(12, 12))

    def test_breaks_hansen(self):
        # > sc = sctest(ginv ~ ggdp + lint, type="Nyblom-Hansen")
        # > mkhtest(sc, 'breaks_nyblom_hansen', 'BB')
        breaks_nyblom_hansen = dict(
            statistic=1.0300792740544484,
            pvalue=0.1136087530212015,
            parameters=(),
            distr="BB",
        )

        bh = smsdia.breaks_hansen(self.res)
        assert_almost_equal(
            bh[0], breaks_nyblom_hansen["statistic"], decimal=12
        )
        # TODO: breaks_hansen does not return pvalues

    def test_recursive_residuals(self):

        reccumres_standardize = np.array(
            [
                -2.151,
                -3.748,
                -3.114,
                -3.096,
                -1.865,
                -2.230,
                -1.194,
                -3.500,
                -3.638,
                -4.447,
                -4.602,
                -4.631,
                -3.999,
                -4.830,
                -5.429,
                -5.435,
                -6.554,
                -8.093,
                -8.567,
                -7.532,
                -7.079,
                -8.468,
                -9.320,
                -12.256,
                -11.932,
                -11.454,
                -11.690,
                -11.318,
                -12.665,
                -12.842,
                -11.693,
                -10.803,
                -12.113,
                -12.109,
                -13.002,
                -11.897,
                -10.787,
                -10.159,
                -9.038,
                -9.007,
                -8.634,
                -7.552,
                -7.153,
                -6.447,
                -5.183,
                -3.794,
                -3.511,
                -3.979,
                -3.236,
                -3.793,
                -3.699,
                -5.056,
                -5.724,
                -4.888,
                -4.309,
                -3.688,
                -3.918,
                -3.735,
                -3.452,
                -2.086,
                -6.520,
                -7.959,
                -6.760,
                -6.855,
                -6.032,
                -4.405,
                -4.123,
                -4.075,
                -3.235,
                -3.115,
                -3.131,
                -2.986,
                -1.813,
                -4.824,
                -4.424,
                -4.796,
                -4.000,
                -3.390,
                -4.485,
                -4.669,
                -4.560,
                -3.834,
                -5.507,
                -3.792,
                -2.427,
                -1.756,
                -0.354,
                1.150,
                0.586,
                0.643,
                1.773,
                -0.830,
                -0.388,
                0.517,
                0.819,
                2.240,
                3.791,
                3.187,
                3.409,
                2.431,
                0.668,
                0.957,
                -0.928,
                0.327,
                -0.285,
                -0.625,
                -2.316,
                -1.986,
                -0.744,
                -1.396,
                -1.728,
                -0.646,
                -2.602,
                -2.741,
                -2.289,
                -2.897,
                -1.934,
                -2.532,
                -3.175,
                -2.806,
                -3.099,
                -2.658,
                -2.487,
                -2.515,
                -2.224,
                -2.416,
                -1.141,
                0.650,
                -0.947,
                0.725,
                0.439,
                0.885,
                2.419,
                2.642,
                2.745,
                3.506,
                4.491,
                5.377,
                4.624,
                5.523,
                6.488,
                6.097,
                5.390,
                6.299,
                6.656,
                6.735,
                8.151,
                7.260,
                7.846,
                8.771,
                8.400,
                8.717,
                9.916,
                9.008,
                8.910,
                8.294,
                8.982,
                8.540,
                8.395,
                7.782,
                7.794,
                8.142,
                8.362,
                8.400,
                7.850,
                7.643,
                8.228,
                6.408,
                7.218,
                7.699,
                7.895,
                8.725,
                8.938,
                8.781,
                8.350,
                9.136,
                9.056,
                10.365,
                10.495,
                10.704,
                10.784,
                10.275,
                10.389,
                11.586,
                11.033,
                11.335,
                11.661,
                10.522,
                10.392,
                10.521,
                10.126,
                9.428,
                9.734,
                8.954,
                9.949,
                10.595,
                8.016,
                6.636,
                6.975,
            ]
        )

        rr = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95)
        assert_equal(
            np.round(rr[5][1:], 3), reccumres_standardize
        )  # extra zero in front
        # assert_equal(np.round(rr[3][4:], 3), np.diff(reccumres_standardize))
        assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize), 3)
        assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4)
        order = np.arange(self.res.model.endog.shape[0])
        rr_order = smsdia.recursive_olsresiduals(
            self.res, skip=3, alpha=0.95, order_by=order
        )
        assert_allclose(rr[0], rr_order[0], atol=1e-6)

        # regression number, visually checked with graph from gretl
        ub0 = np.array(
            [13.37318571, 13.50758959, 13.64199346, 13.77639734, 13.91080121]
        )
        ub1 = np.array(
            [39.44753774, 39.58194162, 39.7163455, 39.85074937, 39.98515325]
        )
        lb, ub = rr[6]
        assert_almost_equal(ub[:5], ub0, decimal=7)
        assert_almost_equal(lb[:5], -ub0, decimal=7)
        assert_almost_equal(ub[-5:], ub1, decimal=7)
        assert_almost_equal(lb[-5:], -ub1, decimal=7)

        # test a few values with explicit OLS
        endog = self.res.model.endog
        exog = self.res.model.exog
        params = []
        ypred = []
        for i in range(3, 10):
            resi = OLS(endog[:i], exog[:i]).fit()
            ypred.append(resi.model.predict(resi.params, exog[i]))
            params.append(resi.params)
        assert_almost_equal(rr[2][3:10], ypred, decimal=12)
        assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12)
        assert_almost_equal(rr[1][2:9], params, decimal=12)

    def test_normality(self):
        res = self.res

        # > library(nortest) #Lilliefors (Kolmogorov-Smirnov) normality test
        # > lt = lillie.test(residuals(fm))
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors1 = dict(
            statistic=0.0723390908786589,
            pvalue=0.01204113540102896,
            parameters=(),
            distr="-",
        )

        # > lt = lillie.test(residuals(fm)**2)
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors2 = dict(
            statistic=0.301311621898024,
            pvalue=1.004305736618051e-51,
            parameters=(),
            distr="-",
        )

        # > lt = lillie.test(residuals(fm)[1:20])
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors3 = dict(
            statistic=0.1333956004203103,
            pvalue=0.455683,
            parameters=(),
            distr="-",
        )

        lf1 = smsdia.lilliefors(res.resid, pvalmethod="approx")
        lf2 = smsdia.lilliefors(res.resid ** 2, pvalmethod="approx")
        lf3 = smsdia.lilliefors(res.resid[:20], pvalmethod="approx")

        compare_to_reference(lf1, lilliefors1, decimal=(12, 12))
        compare_to_reference(
            lf2, lilliefors2, decimal=(12, 12)
        )  # pvalue very small
        assert_allclose(lf2[1], lilliefors2["pvalue"], rtol=1e-10)
        compare_to_reference(lf3, lilliefors3, decimal=(12, 1))
        # R uses different approximation for pvalue in last case

        # > ad = ad.test(residuals(fm))
        # > mkhtest(ad, "ad3", "-")
        adr1 = dict(
            statistic=1.602209621518313,
            pvalue=0.0003937979149362316,
            parameters=(),
            distr="-",
        )

        # > ad = ad.test(residuals(fm)**2)
        # > mkhtest(ad, "ad3", "-")

        # > ad = ad.test(residuals(fm)[1:20])
        # > mkhtest(ad, "ad3", "-")
        adr3 = dict(
            statistic=0.3017073732210775,
            pvalue=0.5443499281265933,
            parameters=(),
            distr="-",
        )

        ad1 = smsdia.normal_ad(res.resid)
        compare_to_reference(ad1, adr1, decimal=(11, 13))
        ad2 = smsdia.normal_ad(res.resid ** 2)
        assert_(np.isinf(ad2[0]))
        ad3 = smsdia.normal_ad(res.resid[:20])
        compare_to_reference(ad3, adr3, decimal=(11, 12))

    def test_influence(self):
        res = self.res

        # this test is slow
        infl = oi.OLSInfluence(res)

        path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
        with open(path, "r") as fp:
            lsdiag = json.load(fp)

        # basic
        assert_almost_equal(
            np.array(lsdiag["cov.scaled"]).reshape(3, 3),
            res.cov_params(),
            decimal=12,
        )
        assert_almost_equal(
            np.array(lsdiag["cov.unscaled"]).reshape(3, 3),
            res.normalized_cov_params,
            decimal=12,
        )

        c0, c1 = infl.cooks_distance  # TODO: what's c1

        assert_almost_equal(c0, lsdiag["cooks"], decimal=12)
        assert_almost_equal(infl.hat_matrix_diag, lsdiag["hat"], decimal=12)
        assert_almost_equal(
            infl.resid_studentized_internal, lsdiag["std.res"], decimal=12
        )

        # slow:
        # infl._get_all_obs()  #slow, nobs estimation loop, called implicitly
        dffits, dffth = infl.dffits
        assert_almost_equal(dffits, lsdiag["dfits"], decimal=12)
        assert_almost_equal(
            infl.resid_studentized_external, lsdiag["stud.res"], decimal=12
        )

        fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
        infl_r = pd.read_csv(fn, index_col=0)
        # not used yet:
        # infl_bool_r  = pandas.read_csv(fn, index_col=0,
        #                                converters=dict(zip(lrange(7),[conv]*7)))
        infl_r2 = np.asarray(infl_r)
        assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
        assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)
        # duplicates
        assert_almost_equal(dffits, infl_r2[:, 3], decimal=12)
        assert_almost_equal(c0, infl_r2[:, 5], decimal=12)
        assert_almost_equal(infl.hat_matrix_diag, infl_r2[:, 6], decimal=12)

        # TODO: finish and check thresholds and pvalues
        # Note: for dffits, R uses a threshold around 0.36,
        #        mine: dffits[1]=0.24373
        # R has
        # >>> np.nonzero(np.asarray(infl_bool_r["dffit"]))[0]
        # array([  6,  26,  63,  76,  90, 199])
        # >>> np.nonzero(np.asarray(infl_bool_r["cov.r"]))[0]
        # array([  4,  26,  59,  61,  63,  72,  76,  84,  91,  92,  94,  95,
        #        108, 197, 198])
        # >>> np.nonzero(np.asarray(infl_bool_r["hat"]))[0]
        # array([ 62,  76,  84,  90,  91,  92,  95, 108, 197, 199])


class TestDiagnosticGPandas(TestDiagnosticG):
    @classmethod
    def setup_class(cls):
        d = macrodata.load_pandas().data
        # growth rates
        d["gs_l_realinv"] = 400 * np.log(d["realinv"]).diff()
        d["gs_l_realgdp"] = 400 * np.log(d["realgdp"]).diff()
        d["lint"] = d["realint"].shift(1)
        d["tbilrate"] = d["tbilrate"].shift(1)

        d = d.dropna()
        cls.d = d
        endogg = d["gs_l_realinv"]
        exogg = add_constant(d[["gs_l_realgdp", "lint"]])
        exogg2 = add_constant(d[["gs_l_realgdp", "tbilrate"]])
        exogg3 = add_constant(d[["gs_l_realgdp"]])

        res_ols = OLS(endogg, exogg).fit()
        res_ols2 = OLS(endogg, exogg2).fit()

        res_ols3 = OLS(endogg, exogg3).fit()

        cls.res = res_ols
        cls.res2 = res_ols2
        cls.res3 = res_ols3
        cls.endog = cls.res.model.endog
        cls.exog = cls.res.model.exog


def test_spec_white():
    resdir = os.path.join(cur_dir, "results")
    wsfiles = ["wspec1.csv", "wspec2.csv", "wspec3.csv", "wspec4.csv"]
    for file in wsfiles:
        mdlfile = os.path.join(resdir, file)
        mdl = np.asarray(pd.read_csv(mdlfile))
        # DV is in last column
        lastcol = mdl.shape[1] - 1
        dv = mdl[:, lastcol]
        # create design matrix
        design = np.concatenate(
            (np.ones((mdl.shape[0], 1)), np.delete(mdl, lastcol, 1)), axis=1
        )
        # perform OLS and generate residuals
        resids = dv - np.dot(design, np.linalg.lstsq(design, dv, rcond=-1)[0])
        # perform White spec test. wspec3/wspec4 contain dummies.
        wsres = smsdia.spec_white(resids, design)
        # compare results to SAS 9.3 output
        if file == "wspec1.csv":
            assert_almost_equal(wsres, [3.251, 0.661, 5], decimal=3)
        elif file == "wspec2.csv":
            assert_almost_equal(wsres, [6.070, 0.733, 9], decimal=3)
        elif file == "wspec3.csv":
            assert_almost_equal(wsres, [6.767, 0.454, 7], decimal=3)
        else:
            assert_almost_equal(wsres, [8.462, 0.671, 11], decimal=3)


def test_spec_white_error(reset_randomstate):
    with pytest.raises(ValueError, match="White's specification test "):
        smsdia.spec_white(
            np.random.standard_normal(100), np.random.standard_normal((100, 1))
        )
    with pytest.raises(ValueError, match="White's specification test "):
        smsdia.spec_white(
            np.random.standard_normal(100), np.random.standard_normal((100, 2))
        )


def test_linear_lm_direct(reset_randomstate):
    endog = np.random.standard_normal(500)
    exog = add_constant(np.random.standard_normal((500, 3)))
    res = OLS(endog, exog).fit()
    lm_res = smsdia.linear_lm(res.resid, exog)
    aug = np.hstack([exog, exog[:, 1:] ** 2])
    res_aug = OLS(res.resid, aug).fit()
    stat = res_aug.rsquared * aug.shape[0]
    assert_allclose(lm_res[0], stat)


# TODO: make this a test or move/remove
def grangertest():
    # > gt = grangertest(ginv, ggdp, order=4)
    # > gt
    # Granger causality test
    #
    # Model 1: ggdp ~ Lags(ggdp, 1:4) + Lags(ginv, 1:4)
    # Model 2: ggdp ~ Lags(ggdp, 1:4)

    dict(fvalue=1.589672703015157, pvalue=0.178717196987075, df=(198, 193))


@pytest.mark.smoke
def test_outlier_influence_funcs(reset_randomstate):
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()
    out_05 = oi.summary_table(res)
    # GH3344 : Check alpha has an effect
    out_01 = oi.summary_table(res, alpha=0.01)
    assert_(np.all(out_01[1][:, 6] <= out_05[1][:, 6]))
    assert_(np.all(out_01[1][:, 7] >= out_05[1][:, 7]))

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()


def test_influence_wrapped():
    from pandas import DataFrame

    d = macrodata.load_pandas().data
    # growth rates
    gs_l_realinv = 400 * np.log(d["realinv"]).diff().dropna()
    gs_l_realgdp = 400 * np.log(d["realgdp"]).diff().dropna()
    lint = d["realint"][:-1]

    # re-index these because they will not conform to lint
    gs_l_realgdp.index = lint.index
    gs_l_realinv.index = lint.index

    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
    # order is important
    exog = DataFrame(data, columns=["const", "lrealgdp", "lint"])

    res = OLS(gs_l_realinv, exog).fit()

    # basic
    # already tested
    # assert_almost_equal(lsdiag['cov.scaled'],
    #                    res.cov_params().values.ravel(), decimal=12)
    # assert_almost_equal(lsdiag['cov.unscaled'],
    #                    res.normalized_cov_params.values.ravel(), decimal=12)

    infl = oi.OLSInfluence(res)

    # smoke test just to make sure it works, results separately tested
    df = infl.summary_frame()
    assert_(isinstance(df, DataFrame))

    # this test is slow
    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
    with open(path, "r") as fp:
        lsdiag = json.load(fp)

    c0, c1 = infl.cooks_distance  # TODO: what's c1, it's pvalues? -ss

    # NOTE: we get a hard-cored 5 decimals with pandas testing
    assert_almost_equal(c0, lsdiag["cooks"], 12)
    assert_almost_equal(infl.hat_matrix_diag, (lsdiag["hat"]), 12)
    assert_almost_equal(infl.resid_studentized_internal, lsdiag["std.res"], 12)

    # slow:
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag["dfits"], 12)
    assert_almost_equal(
        infl.resid_studentized_external, lsdiag["stud.res"], 12
    )

    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
    infl_r = pd.read_csv(fn, index_col=0)
    # not used yet:
    # infl_bool_r  = pandas.read_csv(fn, index_col=0,
    #                                converters=dict(zip(lrange(7),[conv]*7)))
    infl_r2 = np.asarray(infl_r)
    # TODO: finish wrapping this stuff
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)


def test_influence_dtype():
    # see #2148  bug when endog is integer
    y = np.ones(20)
    np.random.seed(123)
    x = np.random.randn(20, 3)
    res1 = OLS(y, x).fit()

    res2 = OLS(y * 1.0, x).fit()
    cr1 = res1.get_influence().cov_ratio
    cr2 = res2.get_influence().cov_ratio
    assert_allclose(cr1, cr2, rtol=1e-14)
    # regression test for values
    cr3 = np.array(
        [
            1.22239215,
            1.31551021,
            1.52671069,
            1.05003921,
            0.89099323,
            1.57405066,
            1.03230092,
            0.95844196,
            1.15531836,
            1.21963623,
            0.87699564,
            1.16707748,
            1.10481391,
            0.98839447,
            1.08999334,
            1.35680102,
            1.46227715,
            1.45966708,
            1.13659521,
            1.22799038,
        ]
    )
    assert_almost_equal(cr1, cr3, decimal=8)


def get_duncan_data():
    # results from R with NA -> 1. Just testing interface here because
    # outlier_test is just a wrapper
    labels = [
        "accountant",
        "pilot",
        "architect",
        "author",
        "chemist",
        "minister",
        "professor",
        "dentist",
        "reporter",
        "engineer",
        "undertaker",
        "lawyer",
        "physician",
        "welfare.worker",
        "teacher",
        "conductor",
        "contractor",
        "factory.owner",
        "store.manager",
        "banker",
        "bookkeeper",
        "mail.carrier",
        "insurance.agent",
        "store.clerk",
        "carpenter",
        "electrician",
        "RR.engineer",
        "machinist",
        "auto.repairman",
        "plumber",
        "gas.stn.attendant",
        "coal.miner",
        "streetcar.motorman",
        "taxi.driver",
        "truck.driver",
        "machine.operator",
        "barber",
        "bartender",
        "shoe.shiner",
        "cook",
        "soda.clerk",
        "watchman",
        "janitor",
        "policeman",
        "waiter",
    ]
    # Duncan's prestige data from car
    exog = [
        [1.0, 62.0, 86.0],
        [1.0, 72.0, 76.0],
        [1.0, 75.0, 92.0],
        [1.0, 55.0, 90.0],
        [1.0, 64.0, 86.0],
        [1.0, 21.0, 84.0],
        [1.0, 64.0, 93.0],
        [1.0, 80.0, 100.0],
        [1.0, 67.0, 87.0],
        [1.0, 72.0, 86.0],
        [1.0, 42.0, 74.0],
        [1.0, 76.0, 98.0],
        [1.0, 76.0, 97.0],
        [1.0, 41.0, 84.0],
        [1.0, 48.0, 91.0],
        [1.0, 76.0, 34.0],
        [1.0, 53.0, 45.0],
        [1.0, 60.0, 56.0],
        [1.0, 42.0, 44.0],
        [1.0, 78.0, 82.0],
        [1.0, 29.0, 72.0],
        [1.0, 48.0, 55.0],
        [1.0, 55.0, 71.0],
        [1.0, 29.0, 50.0],
        [1.0, 21.0, 23.0],
        [1.0, 47.0, 39.0],
        [1.0, 81.0, 28.0],
        [1.0, 36.0, 32.0],
        [1.0, 22.0, 22.0],
        [1.0, 44.0, 25.0],
        [1.0, 15.0, 29.0],
        [1.0, 7.0, 7.0],
        [1.0, 42.0, 26.0],
        [1.0, 9.0, 19.0],
        [1.0, 21.0, 15.0],
        [1.0, 21.0, 20.0],
        [1.0, 16.0, 26.0],
        [1.0, 16.0, 28.0],
        [1.0, 9.0, 17.0],
        [1.0, 14.0, 22.0],
        [1.0, 12.0, 30.0],
        [1.0, 17.0, 25.0],
        [1.0, 7.0, 20.0],
        [1.0, 34.0, 47.0],
        [1.0, 8.0, 32.0],
    ]
    endog = [
        82.0,
        83.0,
        90.0,
        76.0,
        90.0,
        87.0,
        93.0,
        90.0,
        52.0,
        88.0,
        57.0,
        89.0,
        97.0,
        59.0,
        73.0,
        38.0,
        76.0,
        81.0,
        45.0,
        92.0,
        39.0,
        34.0,
        41.0,
        16.0,
        33.0,
        53.0,
        67.0,
        57.0,
        26.0,
        29.0,
        10.0,
        15.0,
        19.0,
        10.0,
        13.0,
        24.0,
        20.0,
        7.0,
        3.0,
        16.0,
        6.0,
        11.0,
        8.0,
        41.0,
        10.0,
    ]

    return endog, exog, labels


def test_outlier_test():
    endog, exog, labels = get_duncan_data()
    ndarray_mod = OLS(endog, exog).fit()
    rstudent = [
        3.1345185839,
        -2.3970223990,
        2.0438046359,
        -1.9309187757,
        1.8870465798,
        -1.7604905300,
        -1.7040324156,
        1.6024285876,
        -1.4332485037,
        -1.1044851583,
        1.0688582315,
        1.0185271840,
        -0.9024219332,
        -0.9023876471,
        -0.8830953936,
        0.8265782334,
        0.8089220547,
        0.7682770197,
        0.7319491074,
        -0.6665962829,
        0.5227352794,
        -0.5135016547,
        0.5083881518,
        0.4999224372,
        -0.4980818221,
        -0.4759717075,
        -0.4293565820,
        -0.4114056499,
        -0.3779540862,
        0.3556874030,
        0.3409200462,
        0.3062248646,
        0.3038999429,
        -0.3030815773,
        -0.1873387893,
        0.1738050251,
        0.1424246593,
        -0.1292266025,
        0.1272066463,
        -0.0798902878,
        0.0788467222,
        0.0722556991,
        0.0505098280,
        0.0233215136,
        0.0007112055,
    ]
    unadj_p = [
        0.003177202,
        0.021170298,
        0.047432955,
        0.060427645,
        0.066248120,
        0.085783008,
        0.095943909,
        0.116738318,
        0.159368890,
        0.275822623,
        0.291386358,
        0.314400295,
        0.372104049,
        0.372122040,
        0.382333561,
        0.413260793,
        0.423229432,
        0.446725370,
        0.468363101,
        0.508764039,
        0.603971990,
        0.610356737,
        0.613905871,
        0.619802317,
        0.621087703,
        0.636621083,
        0.669911674,
        0.682917818,
        0.707414459,
        0.723898263,
        0.734904667,
        0.760983108,
        0.762741124,
        0.763360242,
        0.852319039,
        0.862874018,
        0.887442197,
        0.897810225,
        0.899398691,
        0.936713197,
        0.937538115,
        0.942749758,
        0.959961394,
        0.981506948,
        0.999435989,
    ]
    bonf_p = [
        0.1429741,
        0.9526634,
        2.1344830,
        2.7192440,
        2.9811654,
        3.8602354,
        4.3174759,
        5.2532243,
        7.1716001,
        12.4120180,
        13.1123861,
        14.1480133,
        16.7446822,
        16.7454918,
        17.2050103,
        18.5967357,
        19.0453245,
        20.1026416,
        21.0763395,
        22.8943818,
        27.1787396,
        27.4660532,
        27.6257642,
        27.8911043,
        27.9489466,
        28.6479487,
        30.1460253,
        30.7313018,
        31.8336506,
        32.5754218,
        33.0707100,
        34.2442399,
        34.3233506,
        34.3512109,
        38.3543568,
        38.8293308,
        39.9348989,
        40.4014601,
        40.4729411,
        42.1520939,
        42.1892152,
        42.4237391,
        43.1982627,
        44.1678127,
        44.9746195,
    ]
    bonf_p = np.array(bonf_p)
    bonf_p[bonf_p > 1] = 1
    sorted_labels = [
        "minister",
        "reporter",
        "contractor",
        "insurance.agent",
        "machinist",
        "store.clerk",
        "conductor",
        "factory.owner",
        "mail.carrier",
        "streetcar.motorman",
        "carpenter",
        "coal.miner",
        "bartender",
        "bookkeeper",
        "soda.clerk",
        "chemist",
        "RR.engineer",
        "professor",
        "electrician",
        "gas.stn.attendant",
        "auto.repairman",
        "watchman",
        "banker",
        "machine.operator",
        "dentist",
        "waiter",
        "shoe.shiner",
        "welfare.worker",
        "plumber",
        "physician",
        "pilot",
        "engineer",
        "accountant",
        "lawyer",
        "undertaker",
        "barber",
        "store.manager",
        "truck.driver",
        "cook",
        "janitor",
        "policeman",
        "architect",
        "teacher",
        "taxi.driver",
        "author",
    ]

    res2 = np.c_[rstudent, unadj_p, bonf_p]
    res = oi.outlier_test(ndarray_mod, method="b", labels=labels, order=True)
    np.testing.assert_almost_equal(res.values, res2, 7)
    np.testing.assert_equal(
        res.index.tolist(), sorted_labels
    )  # pylint: disable-msg=E1103

    data = pd.DataFrame(
        np.column_stack((endog, exog)),
        columns="y const var1 var2".split(),
        index=labels,
    )

    # check `order` with pandas bug in #3971
    res_pd = OLS.from_formula("y ~ const + var1 + var2 - 0", data).fit()

    res_outl2 = oi.outlier_test(res_pd, method="b", order=True)
    assert_almost_equal(res_outl2.values, res2, 7)
    assert_equal(res_outl2.index.tolist(), sorted_labels)

    res_outl1 = res_pd.outlier_test(method="b")
    res_outl1 = res_outl1.sort_values(["unadj_p"], ascending=True)
    assert_almost_equal(res_outl1.values, res2, 7)
    assert_equal(res_outl1.index.tolist(), sorted_labels)
    assert_array_equal(res_outl2.index, res_outl1.index)

    # additional keywords in method
    res_outl3 = res_pd.outlier_test(method="b", order=True)
    assert_equal(res_outl3.index.tolist(), sorted_labels)
    res_outl4 = res_pd.outlier_test(method="b", order=True, cutoff=0.15)
    assert_equal(res_outl4.index.tolist(), sorted_labels[:1])


def test_ljungbox_dof_adj():
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    res = AutoReg(data, 4, old_names=False).fit()
    resid = res.resid
    res1 = smsdia.acorr_ljungbox(resid, lags=10, return_df=False)
    res2 = smsdia.acorr_ljungbox(resid, lags=10, model_df=4, return_df=False)
    assert_allclose(res1[0], res2[0])
    assert np.all(np.isnan(res2[1][:4]))
    assert np.all(res2[1][4:] <= res1[1][4:])


def test_ljungbox_auto_lag_selection():
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    res = AutoReg(data, 4, old_names=False).fit()
    resid = res.resid
    res1 = smsdia.acorr_ljungbox(resid, return_df=False, auto_lag=True)
    res2 = smsdia.acorr_ljungbox(
        resid, model_df=4, return_df=False, auto_lag=True
    )
    assert_allclose(res1[0], res2[0])
    assert np.all(np.isnan(res2[1][:4]))
    assert np.all(res2[1][4:] <= res1[1][4:])


def test_ljungbox_errors_warnings():
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    with pytest.raises(ValueError, match="model_df must"):
        smsdia.acorr_ljungbox(data, model_df=-1)
    with pytest.raises(ValueError, match="period must"):
        smsdia.acorr_ljungbox(data, model_df=-1, period=1)
    with pytest.raises(ValueError, match="period must"):
        smsdia.acorr_ljungbox(data, model_df=-1, period=-2)
    smsdia.acorr_ljungbox(data, return_df=False)
    ret = smsdia.acorr_ljungbox(data, lags=10)
    assert isinstance(ret, pd.DataFrame)


def test_ljungbox_period():
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    ar_res = AutoReg(data, 4, old_names=False).fit()
    res = smsdia.acorr_ljungbox(ar_res.resid, period=13, return_df=True)
    res2 = smsdia.acorr_ljungbox(ar_res.resid, lags=26, return_df=True)
    assert_frame_equal(res, res2)


@pytest.mark.parametrize("cov_type", ["nonrobust", "HC0"])
def test_encompasing_direct(cov_type, reset_randomstate):
    x = np.random.standard_normal((500, 2))
    e = np.random.standard_normal((500, 1))
    x_extra = np.random.standard_normal((500, 2))
    z_extra = np.random.standard_normal((500, 3))
    y = x @ np.ones((2, 1)) + e
    x1 = np.hstack([x[:, :1], x_extra])
    z1 = np.hstack([x, z_extra])
    res1 = OLS(y, x1).fit()
    res2 = OLS(y, z1).fit()
    df = smsdia.compare_encompassing(res1, res2, cov_type=cov_type)

    direct1 = OLS(y, np.hstack([x1, x[:, 1:], z_extra])).fit(cov_type=cov_type)
    r1 = np.zeros((4, 3 + 1 + 3))
    r1[:, -4:] = np.eye(4)
    direct_test_1 = direct1.wald_test(r1, use_f=True)
    expected = (
        float(np.squeeze(direct_test_1.statistic)),
        float(np.squeeze(direct_test_1.pvalue)),
        int(direct_test_1.df_num),
        int(direct_test_1.df_denom),
    )
    assert_allclose(np.asarray(df.loc["x"]), expected, atol=1e-8)

    direct2 = OLS(y, np.hstack([z1, x_extra])).fit(cov_type=cov_type)
    r2 = np.zeros((2, 2 + 3 + 2))
    r2[:, -2:] = np.eye(2)
    direct_test_2 = direct2.wald_test(r2, use_f=True)
    expected = (
        float(np.squeeze(direct_test_2.statistic)),
        float(np.squeeze(direct_test_2.pvalue)),
        int(direct_test_2.df_num),
        int(direct_test_2.df_denom),
    )
    assert_allclose(np.asarray(df.loc["z"]), expected, atol=1e-8)


def test_encompasing_error(reset_randomstate):
    x = np.random.standard_normal((500, 2))
    e = np.random.standard_normal((500, 1))
    z_extra = np.random.standard_normal((500, 3))
    y = x @ np.ones((2, 1)) + e
    z = np.hstack([x, z_extra])
    res1 = OLS(y, x).fit()
    res2 = OLS(y, z).fit()
    with pytest.raises(ValueError, match="The exog in results_x"):
        smsdia.compare_encompassing(res1, res2)
    with pytest.raises(TypeError, match="results_z must come from a linear"):
        smsdia.compare_encompassing(res1, 2)
    with pytest.raises(TypeError, match="results_x must come from a linear"):
        smsdia.compare_encompassing(4, 2)


@pytest.mark.smoke
@pytest.mark.parametrize("power", [2, 3])
@pytest.mark.parametrize("test_type", ["fitted", "exog", "princomp"])
@pytest.mark.parametrize("use_f", [True, False])
@pytest.mark.parametrize(
    "cov",
    [
        dict(cov_type="nonrobust", cov_kwargs={}),
        dict(cov_type="HC0", cov_kwargs={}),
    ],
)
def test_reset_smoke(power, test_type, use_f, cov, reset_randomstate):
    x = add_constant(np.random.standard_normal((1000, 3)))
    e = np.random.standard_normal((1000, 1))
    x = np.hstack([x, x[:, 1:] ** 2])
    y = x @ np.ones((7, 1)) + e
    res = OLS(y, x[:, :4]).fit()
    smsdia.linear_reset(
        res, power=power, test_type=test_type, use_f=use_f, **cov
    )


@pytest.mark.smoke
@pytest.mark.parametrize("store", [True, False])
@pytest.mark.parametrize("ddof", [0, 2])
@pytest.mark.parametrize(
    "cov",
    [
        dict(cov_type="nonrobust", cov_kwargs={}),
        dict(cov_type="HC0", cov_kwargs={}),
    ],
)
def test_acorr_lm_smoke(store, ddof, cov, reset_randomstate):
    e = np.random.standard_normal(250)
    smsdia.acorr_lm(e, nlags=6, store=store, ddof=ddof, **cov)

    smsdia.acorr_lm(e, nlags=None, store=store, period=12, ddof=ddof, **cov)


def test_acorr_lm_smoke_no_autolag(reset_randomstate):
    e = np.random.standard_normal(250)
    smsdia.acorr_lm(e, nlags=6, autolag=None, store=False, ddof=0)


@pytest.mark.parametrize("frac", [0.25, 0.5, 0.75])
@pytest.mark.parametrize(
    "order_by",
    [
        None,
        np.arange(500),
        np.random.choice(500, size=500, replace=False),
        "x0",
        ["x0", "x2"],
    ],
)
def test_rainbow_smoke_order_by(frac, order_by, reset_randomstate):
    e = pd.DataFrame(np.random.standard_normal((500, 1)))
    x = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=["x{0}".format(i) for i in range(3)],
    )
    y = x @ np.ones((3, 1)) + e
    res = OLS(y, x).fit()
    smsdia.linear_rainbow(res, frac=frac, order_by=order_by)


@pytest.mark.parametrize("center", [None, 0.33, 300])
def test_rainbow_smoke_centered(center, reset_randomstate):
    e = pd.DataFrame(np.random.standard_normal((500, 1)))
    x = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=["x{0}".format(i) for i in range(3)],
    )
    y = x @ np.ones((3, 1)) + e
    res = OLS(y, x).fit()
    smsdia.linear_rainbow(res, use_distance=True, center=center)


def test_rainbow_exception(reset_randomstate):
    e = pd.DataFrame(np.random.standard_normal((500, 1)))
    x = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=["x{0}".format(i) for i in range(3)],
    )
    y = x @ np.ones((3, 1)) + e
    res = OLS(y, x).fit()
    with pytest.raises(TypeError, match="order_by must contain"):
        smsdia.linear_rainbow(res, order_by="x5")
    res = OLS(np.asarray(y), np.asarray(x)).fit()
    with pytest.raises(TypeError, match="order_by must contain"):
        smsdia.linear_rainbow(res, order_by=("x0",))


def test_small_skip(reset_randomstate):
    y = np.random.standard_normal(10)
    x = np.random.standard_normal((10, 3))
    x[:3] = x[:1]
    with pytest.raises(ValueError, match="The initial regressor matrix,"):
        smsdia.recursive_olsresiduals(OLS(y, x).fit())


# R code used in testing
# J test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
#                          Estimate         Std. Error t value  Pr(>|t|)
# M1 + fitted(M2) 1.591505670785873 0.7384552861695823 2.15518 0.0323546 *
# M2 + fitted(M1) 1.305687653016899 0.4808385176653064 2.71544 0.0072039 **
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#
#  = lm(ginv ~ ggdp + tbilrate)
# > ct = coxtest(fm, fm3)
# > ct
# Cox test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
#                           Estimate        Std. Error  z value   Pr(>|z|)
# fitted(M1) ~ M2 -0.782030488930356 0.599696502782265 -1.30404    0.19222
# fitted(M2) ~ M1 -2.248817107408537 0.392656854330139 -5.72718 1.0211e-08 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#
#
# > et = encomptest(fm, fm3)
# > et
# Encompassing test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
# Model E: ginv ~ ggdp + lint + tbilrate
#           Res.Df Df       F    Pr(>F)
# M1 vs. ME    198 -1 4.64481 0.0323546 *
# M2 vs. ME    198 -1 7.37361 0.0072039 **
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#
# > fm4 = lm(realinv ~ realgdp + realint, data=d)
# > fm5 = lm(log(realinv) ~ realgdp + realint, data=d)
# > pet = petest(fm4, fm5)
# > pet
# PE test
#
# Model 1: realinv ~ realgdp + realint
# Model 2: log(realinv) ~ realgdp + realint
#                                       Estimate          Std. Error  t value
# M1 + log(fit(M1))-fit(M2) -229.281878354594596 44.5087822087058598 -5.15139
# M2 + fit(M1)-exp(fit(M2))    0.000634664704814  0.0000462387010349 13.72583
#                             Pr(>|t|)
# M1 + log(fit(M1))-fit(M2) 6.2013e-07 ***
# M2 + fit(M1)-exp(fit(M2)) < 2.22e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Tech Fingerprint

Alerts (103)

'def' Ensure functions have docstrings for documentation
40 51 66 91 115 124 169 234 247 260 273 279 305 322 359 364 369 415 440 471 520 535 560 589 620 628 637 669 684 700 945 1016 1082 1109 1137 1148 1160 1172 1188 1250 1289 1438 1665 1676 1689 1702 1711 1748 1775 1796 1803 1819 1831 1842 1857
All Visa card numbers start with a 4. New cards have 16 digits. Old cards have 13.
80 119 151 268 269 285 286 294 400 455 524 598 600 606 646 652 675 953 962 1004 1871 1872 1886
Diners Club card numbers begin with 300 through 305, 36 or 38. All have 14 digits. There are Diners Club cards that begin with 5 and have 16 digits. These are a joint venture between Diners Club and MasterCard, and should be processed like a MasterCard.
132 137 139 152 154 203 251 598 605 646 688 689 961 1003 1871 1872
MasterCard numbers either start with the numbers 51 through 55 or with the numbers 2221 through 2720. All have 16 digits.
175 184 193 252 600 1004
'list(' Avoid unnecessary list conversions; use generators where possible
521
'isinstance(' Overuse may indicate design issues; consider polymorphism
667 1699