ols.py | searchcode

/pandas/stats/ols.py

http://github.com/wesm/pandas
Python | 1376 lines | 1218 code | 64 blank | 94 comment | 33 complexity | 7fc01ad8d8698f49e0779b65c697a0fc MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0

"""
Ordinary least squares regression
"""

# pylint: disable-msg=W0201

# flake8: noqa

from pandas.compat import zip, range, StringIO
from itertools import starmap
from pandas import compat
import numpy as np

from pandas.core.api import DataFrame, Series, isnull
from pandas.core.base import StringMixin
from pandas.types.common import _ensure_float64
from pandas.core.index import MultiIndex
from pandas.core.panel import Panel
from pandas.util.decorators import cache_readonly

import pandas.stats.common as scom
import pandas.stats.math as math
import pandas.stats.moments as moments

_FP_ERR = 1e-8

class OLS(StringMixin):
    """
    Runs a full sample ordinary least squares regression.

    Parameters
    ----------
    y : Series
    x : Series, DataFrame, dict of Series
    intercept : bool
        True if you want an intercept.
    weights : array-like, optional
        1d array of weights.  If you supply 1/W then the variables are pre-
        multiplied by 1/sqrt(W).  If no weights are supplied the default value
        is 1 and WLS reults are the same as OLS.
    nw_lags : None or int
        Number of Newey-West lags.
    nw_overlap : boolean, default False
        Assume data is overlapping when computing Newey-West estimator

    """
    _panel_model = False

    def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
                 nw_overlap=False):
        import warnings
        warnings.warn("The pandas.stats.ols module is deprecated and will be "
                      "removed in a future version. We refer to external packages "
                      "like statsmodels, see some examples here: "
                      "http://www.statsmodels.org/stable/regression.html",
                      FutureWarning, stacklevel=4)

        try:
            import statsmodels.api as sm
        except ImportError:
            import scikits.statsmodels.api as sm

        self._x_orig = x
        self._y_orig = y
        self._weights_orig = weights
        self._intercept = intercept
        self._nw_lags = nw_lags
        self._nw_overlap = nw_overlap

        (self._y, self._x, self._weights, self._x_filtered,
         self._index, self._time_has_obs) = self._prepare_data()

        if self._weights is not None:
            self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
            self._y_trans = self._y * np.sqrt(self._weights)
            self.sm_ols = sm.WLS(self._y.get_values(),
                                 self._x.get_values(),
                                 weights=self._weights.values).fit()
        else:
            self._x_trans = self._x
            self._y_trans = self._y
            self.sm_ols = sm.OLS(self._y.get_values(),
                                 self._x.get_values()).fit()

    def _prepare_data(self):
        """
        Cleans the input for single OLS.

        Parameters
        ----------
        lhs: Series
            Dependent variable in the regression.
        rhs: dict, whose values are Series, DataFrame, or dict
            Explanatory variables of the regression.

        Returns
        -------
        Series, DataFrame
            Cleaned lhs and rhs
        """
        (filt_lhs, filt_rhs, filt_weights,
         pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig,
                                                    self._weights_orig)
        if self._intercept:
            filt_rhs['intercept'] = 1.
            pre_filt_rhs['intercept'] = 1.

        if hasattr(filt_weights, 'to_dense'):
            filt_weights = filt_weights.to_dense()

        return (filt_lhs, filt_rhs, filt_weights,
                pre_filt_rhs, index, valid)

    @property
    def nobs(self):
        return self._nobs

    @property
    def _nobs(self):
        return len(self._y)

    @property
    def nw_lags(self):
        return self._nw_lags

    @property
    def x(self):
        """Returns the filtered x used in the regression."""
        return self._x

    @property
    def y(self):
        """Returns the filtered y used in the regression."""
        return self._y

    @cache_readonly
    def _beta_raw(self):
        """Runs the regression and returns the beta."""
        return self.sm_ols.params

    @cache_readonly
    def beta(self):
        """Returns the betas in Series form."""
        return Series(self._beta_raw, index=self._x.columns)

    @cache_readonly
    def _df_raw(self):
        """Returns the degrees of freedom."""
        return math.rank(self._x.values)

    @cache_readonly
    def df(self):
        """Returns the degrees of freedom.

        This equals the rank of the X matrix.
        """
        return self._df_raw

    @cache_readonly
    def _df_model_raw(self):
        """Returns the raw model degrees of freedom."""
        return self.sm_ols.df_model

    @cache_readonly
    def df_model(self):
        """Returns the degrees of freedom of the model."""
        return self._df_model_raw

    @cache_readonly
    def _df_resid_raw(self):
        """Returns the raw residual degrees of freedom."""
        return self.sm_ols.df_resid

    @cache_readonly
    def df_resid(self):
        """Returns the degrees of freedom of the residuals."""
        return self._df_resid_raw

    @cache_readonly
    def _f_stat_raw(self):
        """Returns the raw f-stat value."""
        from scipy.stats import f

        cols = self._x.columns

        if self._nw_lags is None:
            F = self._r2_raw / (self._r2_raw - self._r2_adj_raw)

            q = len(cols)
            if 'intercept' in cols:
                q -= 1

            shape = q, self.df_resid
            p_value = 1 - f.cdf(F, shape[0], shape[1])
            return F, shape, p_value

        k = len(cols)
        R = np.eye(k)
        r = np.zeros((k, 1))

        try:
            intercept = cols.get_loc('intercept')
            R = np.concatenate((R[0: intercept], R[intercept + 1:]))
            r = np.concatenate((r[0: intercept], r[intercept + 1:]))
        except KeyError:
            # no intercept
            pass

        return math.calc_F(R, r, self._beta_raw, self._var_beta_raw,
                           self._nobs, self.df)

    @cache_readonly
    def f_stat(self):
        """Returns the f-stat value."""
        return f_stat_to_dict(self._f_stat_raw)

    def f_test(self, hypothesis):
        """Runs the F test, given a joint hypothesis.  The hypothesis is
        represented by a collection of equations, in the form

        A*x_1+B*x_2=C

        You must provide the coefficients even if they're 1.  No spaces.

        The equations can be passed as either a single string or a
        list of strings.

        Examples
        --------
        o = ols(...)
        o.f_test('1*x1+2*x2=0,1*x3=0')
        o.f_test(['1*x1+2*x2=0','1*x3=0'])
        """

        x_names = self._x.columns

        R = []
        r = []

        if isinstance(hypothesis, str):
            eqs = hypothesis.split(',')
        elif isinstance(hypothesis, list):
            eqs = hypothesis
        else:  # pragma: no cover
            raise Exception('hypothesis must be either string or list')
        for equation in eqs:
            row = np.zeros(len(x_names))
            lhs, rhs = equation.split('=')
            for s in lhs.split('+'):
                ss = s.split('*')
                coeff = float(ss[0])
                x_name = ss[1]

                if x_name not in x_names:
                    raise Exception('no coefficient named %s' % x_name)
                idx = x_names.get_loc(x_name)
                row[idx] = coeff
            rhs = float(rhs)

            R.append(row)
            r.append(rhs)

        R = np.array(R)
        q = len(r)
        r = np.array(r).reshape(q, 1)

        result = math.calc_F(R, r, self._beta_raw, self._var_beta_raw,
                             self._nobs, self.df)

        return f_stat_to_dict(result)

    @cache_readonly
    def _p_value_raw(self):
        """Returns the raw p values."""
        from scipy.stats import t

        return 2 * t.sf(np.fabs(self._t_stat_raw),
                        self._df_resid_raw)

    @cache_readonly
    def p_value(self):
        """Returns the p values."""
        return Series(self._p_value_raw, index=self.beta.index)

    @cache_readonly
    def _r2_raw(self):
        """Returns the raw r-squared values."""
        if self._use_centered_tss:
            return 1 - self.sm_ols.ssr / self.sm_ols.centered_tss
        else:
            return 1 - self.sm_ols.ssr / self.sm_ols.uncentered_tss

    @property
    def _use_centered_tss(self):
        # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR
        return self._intercept

    @cache_readonly
    def r2(self):
        """Returns the r-squared values."""
        return self._r2_raw

    @cache_readonly
    def _r2_adj_raw(self):
        """Returns the raw r-squared adjusted values."""
        return self.sm_ols.rsquared_adj

    @cache_readonly
    def r2_adj(self):
        """Returns the r-squared adjusted values."""
        return self._r2_adj_raw

    @cache_readonly
    def _resid_raw(self):
        """Returns the raw residuals."""
        return self.sm_ols.resid

    @cache_readonly
    def resid(self):
        """Returns the residuals."""
        return Series(self._resid_raw, index=self._x.index)

    @cache_readonly
    def _rmse_raw(self):
        """Returns the raw rmse values."""
        return np.sqrt(self.sm_ols.mse_resid)

    @cache_readonly
    def rmse(self):
        """Returns the rmse value."""
        return self._rmse_raw

    @cache_readonly
    def _std_err_raw(self):
        """Returns the raw standard err values."""
        return np.sqrt(np.diag(self._var_beta_raw))

    @cache_readonly
    def std_err(self):
        """Returns the standard err values of the betas."""
        return Series(self._std_err_raw, index=self.beta.index)

    @cache_readonly
    def _t_stat_raw(self):
        """Returns the raw t-stat value."""
        return self._beta_raw / self._std_err_raw

    @cache_readonly
    def t_stat(self):
        """Returns the t-stat values of the betas."""
        return Series(self._t_stat_raw, index=self.beta.index)

    @cache_readonly
    def _var_beta_raw(self):
        """
        Returns the raw covariance of beta.
        """
        x = self._x.values
        y = self._y.values

        xx = np.dot(x.T, x)

        if self._nw_lags is None:
            return math.inv(xx) * (self._rmse_raw ** 2)
        else:
            resid = y - np.dot(x, self._beta_raw)
            m = (x.T * resid).T

            xeps = math.newey_west(m, self._nw_lags, self._nobs, self._df_raw,
                                   self._nw_overlap)

            xx_inv = math.inv(xx)
            return np.dot(xx_inv, np.dot(xeps, xx_inv))

    @cache_readonly
    def var_beta(self):
        """Returns the variance-covariance matrix of beta."""
        return DataFrame(self._var_beta_raw, index=self.beta.index,
                         columns=self.beta.index)

    @cache_readonly
    def _y_fitted_raw(self):
        """Returns the raw fitted y values."""
        if self._weights is None:
            X = self._x_filtered.values
        else:
            # XXX
            return self.sm_ols.fittedvalues

        b = self._beta_raw
        return np.dot(X, b)

    @cache_readonly
    def y_fitted(self):
        """Returns the fitted y values.  This equals BX."""
        if self._weights is None:
            index = self._x_filtered.index
            orig_index = index
        else:
            index = self._y.index
            orig_index = self._y_orig.index

        result = Series(self._y_fitted_raw, index=index)
        return result.reindex(orig_index)

    @cache_readonly
    def _y_predict_raw(self):
        """Returns the raw predicted y values."""
        return self._y_fitted_raw

    @cache_readonly
    def y_predict(self):
        """Returns the predicted y values.

        For in-sample, this is same as y_fitted."""
        return self.y_fitted

    def predict(self, beta=None, x=None, fill_value=None,
                fill_method=None, axis=0):
        """
        Parameters
        ----------
        beta : Series
        x : Series or DataFrame
        fill_value : scalar or dict, default None
        fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        axis : {0, 1}, default 0
            See DataFrame.fillna for more details

        Notes
        -----
        1. If both fill_value and fill_method are None then NaNs are dropped
        (this is the default behavior)
        2. An intercept will be automatically added to the new_y_values if
           the model was fitted using an intercept

        Returns
        -------
        Series of predicted values
        """
        if beta is None and x is None:
            return self.y_predict

        if beta is None:
            beta = self.beta
        else:
            beta = beta.reindex(self.beta.index)
            if isnull(beta).any():
                raise ValueError('Must supply betas for same variables')

        if x is None:
            x = self._x
            orig_x = x
        else:
            orig_x = x
            if fill_value is None and fill_method is None:
                x = x.dropna(how='any')
            else:
                x = x.fillna(value=fill_value, method=fill_method, axis=axis)
            if isinstance(x, Series):
                x = DataFrame({'x': x})
            if self._intercept:
                x['intercept'] = 1.

            x = x.reindex(columns=self._x.columns)

        rs = np.dot(x.values, beta.values)
        return Series(rs, x.index).reindex(orig_x.index)

    RESULT_FIELDS = ['r2', 'r2_adj', 'df', 'df_model', 'df_resid', 'rmse',
                     'f_stat', 'beta', 'std_err', 't_stat', 'p_value', 'nobs']

    @cache_readonly
    def _results(self):
        results = {}
        for result in self.RESULT_FIELDS:
            results[result] = getattr(self, result)

        return results

    @cache_readonly
    def _coef_table(self):
        buf = StringIO()

        buf.write('%14s %10s %10s %10s %10s %10s %10s\n' %
                  ('Variable', 'Coef', 'Std Err', 't-stat',
                   'p-value', 'CI 2.5%', 'CI 97.5%'))
        buf.write(scom.banner(''))
        coef_template = '\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f'

        results = self._results

        beta = results['beta']

        for i, name in enumerate(beta.index):
            if i and not (i % 5):
                buf.write('\n' + scom.banner(''))

            std_err = results['std_err'][name]
            CI1 = beta[name] - 1.96 * std_err
            CI2 = beta[name] + 1.96 * std_err

            t_stat = results['t_stat'][name]
            p_value = results['p_value'][name]

            line = coef_template % (name,
                                    beta[name], std_err, t_stat, p_value, CI1, CI2)

            buf.write(line)

        if self.nw_lags is not None:
            buf.write('\n')
            buf.write('*** The calculations are Newey-West '
                      'adjusted with lags %5d\n' % self.nw_lags)

        return buf.getvalue()

    @cache_readonly
    def summary_as_matrix(self):
        """Returns the formatted results of the OLS as a DataFrame."""
        results = self._results
        beta = results['beta']
        data = {'beta': results['beta'],
                't-stat': results['t_stat'],
                'p-value': results['p_value'],
                'std err': results['std_err']}
        return DataFrame(data, beta.index).T

    @cache_readonly
    def summary(self):
        """
        This returns the formatted result of the OLS computation
        """
        template = """
%(bannerTop)s

Formula: Y ~ %(formula)s

Number of Observations:         %(nobs)d
Number of Degrees of Freedom:   %(df)d

R-squared:     %(r2)10.4f
Adj R-squared: %(r2_adj)10.4f

Rmse:          %(rmse)10.4f

F-stat %(f_stat_shape)s: %(f_stat)10.4f, p-value: %(f_stat_p_value)10.4f

Degrees of Freedom: model %(df_model)d, resid %(df_resid)d

%(bannerCoef)s
%(coef_table)s
%(bannerEnd)s
"""
        coef_table = self._coef_table

        results = self._results

        f_stat = results['f_stat']

        bracketed = ['<%s>' % str(c) for c in results['beta'].index]

        formula = StringIO()
        formula.write(bracketed[0])
        tot = len(bracketed[0])
        line = 1
        for coef in bracketed[1:]:
            tot = tot + len(coef) + 3

            if tot // (68 * line):
                formula.write('\n' + ' ' * 12)
                line += 1

            formula.write(' + ' + coef)

        params = {
            'bannerTop': scom.banner('Summary of Regression Analysis'),
            'bannerCoef': scom.banner('Summary of Estimated Coefficients'),
            'bannerEnd': scom.banner('End of Summary'),
            'formula': formula.getvalue(),
            'r2': results['r2'],
            'r2_adj': results['r2_adj'],
            'nobs': results['nobs'],
            'df': results['df'],
            'df_model': results['df_model'],
            'df_resid': results['df_resid'],
            'coef_table': coef_table,
            'rmse': results['rmse'],
            'f_stat': f_stat['f-stat'],
            'f_stat_shape': '(%d, %d)' % (f_stat['DF X'], f_stat['DF Resid']),
            'f_stat_p_value': f_stat['p-value'],
        }

        return template % params

    def __unicode__(self):
        return self.summary

    @cache_readonly
    def _time_obs_count(self):
        # XXX
        return self._time_has_obs.astype(int)

    @property
    def _total_times(self):
        return self._time_has_obs.sum()


class MovingOLS(OLS):
    """
    Runs a rolling/expanding simple OLS.

    Parameters
    ----------
    y : Series
    x : Series, DataFrame, or dict of Series
    weights : array-like, optional
        1d array of weights.  If None, equivalent to an unweighted OLS.
    window_type : {'full sample', 'rolling', 'expanding'}
        Default expanding
    window : int
        size of window (for rolling/expanding OLS)
    min_periods : int
        Threshold of non-null data points to require.
        If None, defaults to size of window for window_type='rolling' and 1
        otherwise
    intercept : bool
        True if you want an intercept.
    nw_lags : None or int
        Number of Newey-West lags.
    nw_overlap : boolean, default False
        Assume data is overlapping when computing Newey-West estimator

    """

    def __init__(self, y, x, weights=None, window_type='expanding',
                 window=None, min_periods=None, intercept=True,
                 nw_lags=None, nw_overlap=False):

        self._args = dict(intercept=intercept, nw_lags=nw_lags,
                          nw_overlap=nw_overlap)

        OLS.__init__(self, y=y, x=x, weights=weights, **self._args)

        self._set_window(window_type, window, min_periods)

    def _set_window(self, window_type, window, min_periods):
        self._window_type = scom._get_window_type(window_type)

        if self._is_rolling:
            if window is None:
                raise AssertionError("Must specify window.")
            if min_periods is None:
                min_periods = window
        else:
            window = len(self._x)
            if min_periods is None:
                min_periods = 1

        self._window = int(window)
        self._min_periods = min_periods

#------------------------------------------------------------------------------
# "Public" results

    @cache_readonly
    def beta(self):
        """Returns the betas in Series/DataFrame form."""
        return DataFrame(self._beta_raw,
                         index=self._result_index,
                         columns=self._x.columns)

    @cache_readonly
    def rank(self):
        return Series(self._rank_raw, index=self._result_index)

    @cache_readonly
    def df(self):
        """Returns the degrees of freedom."""
        return Series(self._df_raw, index=self._result_index)

    @cache_readonly
    def df_model(self):
        """Returns the model degrees of freedom."""
        return Series(self._df_model_raw, index=self._result_index)

    @cache_readonly
    def df_resid(self):
        """Returns the residual degrees of freedom."""
        return Series(self._df_resid_raw, index=self._result_index)

    @cache_readonly
    def f_stat(self):
        """Returns the f-stat value."""
        f_stat_dicts = dict((date, f_stat_to_dict(f_stat))
                            for date, f_stat in zip(self.beta.index,
                                                    self._f_stat_raw))

        return DataFrame(f_stat_dicts).T

    def f_test(self, hypothesis):
        raise NotImplementedError('must use full sample')

    @cache_readonly
    def forecast_mean(self):
        return Series(self._forecast_mean_raw, index=self._result_index)

    @cache_readonly
    def forecast_vol(self):
        return Series(self._forecast_vol_raw, index=self._result_index)

    @cache_readonly
    def p_value(self):
        """Returns the p values."""
        cols = self.beta.columns
        return DataFrame(self._p_value_raw, columns=cols,
                         index=self._result_index)

    @cache_readonly
    def r2(self):
        """Returns the r-squared values."""
        return Series(self._r2_raw, index=self._result_index)

    @cache_readonly
    def resid(self):
        """Returns the residuals."""
        return Series(self._resid_raw[self._valid_obs_labels],
                      index=self._result_index)

    @cache_readonly
    def r2_adj(self):
        """Returns the r-squared adjusted values."""
        index = self.r2.index

        return Series(self._r2_adj_raw, index=index)

    @cache_readonly
    def rmse(self):
        """Returns the rmse values."""
        return Series(self._rmse_raw, index=self._result_index)

    @cache_readonly
    def std_err(self):
        """Returns the standard err values."""
        return DataFrame(self._std_err_raw, columns=self.beta.columns,
                         index=self._result_index)

    @cache_readonly
    def t_stat(self):
        """Returns the t-stat value."""
        return DataFrame(self._t_stat_raw, columns=self.beta.columns,
                         index=self._result_index)

    @cache_readonly
    def var_beta(self):
        """Returns the covariance of beta."""
        result = {}
        result_index = self._result_index
        for i in range(len(self._var_beta_raw)):
            dm = DataFrame(self._var_beta_raw[i], columns=self.beta.columns,
                           index=self.beta.columns)
            result[result_index[i]] = dm

        return Panel.from_dict(result, intersect=False)

    @cache_readonly
    def y_fitted(self):
        """Returns the fitted y values."""
        return Series(self._y_fitted_raw[self._valid_obs_labels],
                      index=self._result_index)

    @cache_readonly
    def y_predict(self):
        """Returns the predicted y values."""
        return Series(self._y_predict_raw[self._valid_obs_labels],
                      index=self._result_index)

#------------------------------------------------------------------------------
# "raw" attributes, calculations

    @property
    def _is_rolling(self):
        return self._window_type == 'rolling'

    @cache_readonly
    def _beta_raw(self):
        """Runs the regression and returns the beta."""
        beta, indices, mask = self._rolling_ols_call

        return beta[indices]

    @cache_readonly
    def _result_index(self):
        return self._index[self._valid_indices]

    @property
    def _valid_indices(self):
        return self._rolling_ols_call[1]

    @cache_readonly
    def _rolling_ols_call(self):
        return self._calc_betas(self._x_trans, self._y_trans)

    def _calc_betas(self, x, y):
        N = len(self._index)
        K = len(self._x.columns)

        betas = np.empty((N, K), dtype=float)
        betas[:] = np.NaN

        valid = self._time_has_obs
        enough = self._enough_obs
        window = self._window

        # Use transformed (demeaned) Y, X variables
        cum_xx = self._cum_xx(x)
        cum_xy = self._cum_xy(x, y)

        for i in range(N):
            if not valid[i] or not enough[i]:
                continue

            xx = cum_xx[i]
            xy = cum_xy[i]
            if self._is_rolling and i >= window:
                xx = xx - cum_xx[i - window]
                xy = xy - cum_xy[i - window]

            betas[i] = math.solve(xx, xy)

        mask = ~np.isnan(betas).any(axis=1)
        have_betas = np.arange(N)[mask]

        return betas, have_betas, mask

    def _rolling_rank(self):
        dates = self._index
        window = self._window

        ranks = np.empty(len(dates), dtype=float)
        ranks[:] = np.NaN
        for i, date in enumerate(dates):
            if self._is_rolling and i >= window:
                prior_date = dates[i - window + 1]
            else:
                prior_date = dates[0]

            x_slice = self._x.truncate(before=prior_date, after=date).values

            if len(x_slice) == 0:
                continue

            ranks[i] = math.rank(x_slice)

        return ranks

    def _cum_xx(self, x):
        dates = self._index
        K = len(x.columns)
        valid = self._time_has_obs
        cum_xx = []

        slicer = lambda df, dt: df.truncate(dt, dt).values
        if not self._panel_model:
            _get_index = x.index.get_loc

            def slicer(df, dt):
                i = _get_index(dt)
                return df.values[i:i + 1, :]

        last = np.zeros((K, K))

        for i, date in enumerate(dates):
            if not valid[i]:
                cum_xx.append(last)
                continue

            x_slice = slicer(x, date)
            xx = last = last + np.dot(x_slice.T, x_slice)
            cum_xx.append(xx)

        return cum_xx

    def _cum_xy(self, x, y):
        dates = self._index
        valid = self._time_has_obs
        cum_xy = []

        x_slicer = lambda df, dt: df.truncate(dt, dt).values
        if not self._panel_model:
            _get_index = x.index.get_loc

            def x_slicer(df, dt):
                i = _get_index(dt)
                return df.values[i:i + 1]

        _y_get_index = y.index.get_loc
        _values = y.values
        if isinstance(y.index, MultiIndex):
            def y_slicer(df, dt):
                loc = _y_get_index(dt)
                return _values[loc]
        else:
            def y_slicer(df, dt):
                i = _y_get_index(dt)
                return _values[i:i + 1]

        last = np.zeros(len(x.columns))
        for i, date in enumerate(dates):
            if not valid[i]:
                cum_xy.append(last)
                continue

            x_slice = x_slicer(x, date)
            y_slice = y_slicer(y, date)

            xy = last = last + np.dot(x_slice.T, y_slice)
            cum_xy.append(xy)

        return cum_xy

    @cache_readonly
    def _rank_raw(self):
        rank = self._rolling_rank()
        return rank[self._valid_indices]

    @cache_readonly
    def _df_raw(self):
        """Returns the degrees of freedom."""
        return self._rank_raw

    @cache_readonly
    def _df_model_raw(self):
        """Returns the raw model degrees of freedom."""
        return self._df_raw - 1

    @cache_readonly
    def _df_resid_raw(self):
        """Returns the raw residual degrees of freedom."""
        return self._nobs - self._df_raw

    @cache_readonly
    def _f_stat_raw(self):
        """Returns the raw f-stat value."""
        from scipy.stats import f

        items = self.beta.columns
        nobs = self._nobs
        df = self._df_raw
        df_resid = nobs - df

        # var_beta has not been newey-west adjusted
        if self._nw_lags is None:
            F = self._r2_raw / (self._r2_raw - self._r2_adj_raw)

            q = len(items)
            if 'intercept' in items:
                q -= 1

            def get_result_simple(Fst, d):
                return Fst, (q, d), 1 - f.cdf(Fst, q, d)

            # Compute the P-value for each pair
            result = starmap(get_result_simple, zip(F, df_resid))

            return list(result)

        K = len(items)
        R = np.eye(K)
        r = np.zeros((K, 1))

        try:
            intercept = items.get_loc('intercept')
            R = np.concatenate((R[0: intercept], R[intercept + 1:]))
            r = np.concatenate((r[0: intercept], r[intercept + 1:]))
        except KeyError:
            # no intercept
            pass

        def get_result(beta, vcov, n, d):
            return math.calc_F(R, r, beta, vcov, n, d)

        results = starmap(get_result,
                          zip(self._beta_raw, self._var_beta_raw, nobs, df))

        return list(results)

    @cache_readonly
    def _p_value_raw(self):
        """Returns the raw p values."""
        from scipy.stats import t

        result = [2 * t.sf(a, b)
                  for a, b in zip(np.fabs(self._t_stat_raw),
                                  self._df_resid_raw)]

        return np.array(result)

    @cache_readonly
    def _resid_stats(self):
        uncentered_sst = []
        sst = []
        sse = []

        Yreg = self._y
        Y = self._y_trans
        X = self._x_trans
        weights = self._weights

        dates = self._index
        window = self._window
        for n, index in enumerate(self._valid_indices):
            if self._is_rolling and index >= window:
                prior_date = dates[index - window + 1]
            else:
                prior_date = dates[0]

            date = dates[index]
            beta = self._beta_raw[n]

            X_slice = X.truncate(before=prior_date, after=date).values
            Y_slice = _y_converter(Y.truncate(before=prior_date, after=date))

            resid = Y_slice - np.dot(X_slice, beta)

            if weights is not None:
                Y_slice = _y_converter(Yreg.truncate(before=prior_date,
                                                     after=date))
                weights_slice = weights.truncate(prior_date, date)
                demeaned = Y_slice - np.average(Y_slice, weights=weights_slice)
                SS_total = (weights_slice * demeaned ** 2).sum()
            else:
                SS_total = ((Y_slice - Y_slice.mean()) ** 2).sum()

            SS_err = (resid ** 2).sum()
            SST_uncentered = (Y_slice ** 2).sum()

            sse.append(SS_err)
            sst.append(SS_total)
            uncentered_sst.append(SST_uncentered)

        return {
            'sse': np.array(sse),
            'centered_tss': np.array(sst),
            'uncentered_tss': np.array(uncentered_sst),
        }

    @cache_readonly
    def _rmse_raw(self):
        """Returns the raw rmse values."""
        return np.sqrt(self._resid_stats['sse'] / self._df_resid_raw)

    @cache_readonly
    def _r2_raw(self):
        rs = self._resid_stats

        if self._use_centered_tss:
            return 1 - rs['sse'] / rs['centered_tss']
        else:
            return 1 - rs['sse'] / rs['uncentered_tss']

    @cache_readonly
    def _r2_adj_raw(self):
        """Returns the raw r-squared adjusted values."""
        nobs = self._nobs
        factors = (nobs - 1) / (nobs - self._df_raw)
        return 1 - (1 - self._r2_raw) * factors

    @cache_readonly
    def _resid_raw(self):
        """Returns the raw residuals."""
        return (self._y.values - self._y_fitted_raw)

    @cache_readonly
    def _std_err_raw(self):
        """Returns the raw standard err values."""
        results = []
        for i in range(len(self._var_beta_raw)):
            results.append(np.sqrt(np.diag(self._var_beta_raw[i])))

        return np.array(results)

    @cache_readonly
    def _t_stat_raw(self):
        """Returns the raw t-stat value."""
        return self._beta_raw / self._std_err_raw

    @cache_readonly
    def _var_beta_raw(self):
        """Returns the raw covariance of beta."""
        x = self._x_trans
        y = self._y_trans
        dates = self._index
        nobs = self._nobs
        rmse = self._rmse_raw
        beta = self._beta_raw
        df = self._df_raw
        window = self._window
        cum_xx = self._cum_xx(self._x)

        results = []
        for n, i in enumerate(self._valid_indices):
            xx = cum_xx[i]
            date = dates[i]

            if self._is_rolling and i >= window:
                xx = xx - cum_xx[i - window]
                prior_date = dates[i - window + 1]
            else:
                prior_date = dates[0]

            x_slice = x.truncate(before=prior_date, after=date)
            y_slice = y.truncate(before=prior_date, after=date)
            xv = x_slice.values
            yv = np.asarray(y_slice)

            if self._nw_lags is None:
                result = math.inv(xx) * (rmse[n] ** 2)
            else:
                resid = yv - np.dot(xv, beta[n])
                m = (xv.T * resid).T

                xeps = math.newey_west(m, self._nw_lags, nobs[n], df[n],
                                       self._nw_overlap)

                xx_inv = math.inv(xx)
                result = np.dot(xx_inv, np.dot(xeps, xx_inv))

            results.append(result)

        return np.array(results)

    @cache_readonly
    def _forecast_mean_raw(self):
        """Returns the raw covariance of beta."""
        nobs = self._nobs
        window = self._window

        # x should be ones
        dummy = DataFrame(index=self._y.index)
        dummy['y'] = 1

        cum_xy = self._cum_xy(dummy, self._y)

        results = []
        for n, i in enumerate(self._valid_indices):
            sumy = cum_xy[i]

            if self._is_rolling and i >= window:
                sumy = sumy - cum_xy[i - window]

            results.append(sumy[0] / nobs[n])

        return np.array(results)

    @cache_readonly
    def _forecast_vol_raw(self):
        """Returns the raw covariance of beta."""
        beta = self._beta_raw
        window = self._window
        dates = self._index
        x = self._x

        results = []
        for n, i in enumerate(self._valid_indices):
            date = dates[i]
            if self._is_rolling and i >= window:
                prior_date = dates[i - window + 1]
            else:
                prior_date = dates[0]

            x_slice = x.truncate(prior_date, date).values
            x_demeaned = x_slice - x_slice.mean(0)
            x_cov = np.dot(x_demeaned.T, x_demeaned) / (len(x_slice) - 1)

            B = beta[n]
            result = np.dot(B, np.dot(x_cov, B))
            results.append(np.sqrt(result))

        return np.array(results)

    @cache_readonly
    def _y_fitted_raw(self):
        """Returns the raw fitted y values."""
        return (self._x.values * self._beta_matrix(lag=0)).sum(1)

    @cache_readonly
    def _y_predict_raw(self):
        """Returns the raw predicted y values."""
        return (self._x.values * self._beta_matrix(lag=1)).sum(1)

    @cache_readonly
    def _results(self):
        results = {}
        for result in self.RESULT_FIELDS:
            value = getattr(self, result)
            if isinstance(value, Series):
                value = value[self.beta.index[-1]]
            elif isinstance(value, DataFrame):
                value = value.xs(self.beta.index[-1])
            else:  # pragma: no cover
                raise Exception('Problem retrieving %s' % result)
            results[result] = value

        return results

    @cache_readonly
    def _window_time_obs(self):
        window_obs = (Series(self._time_obs_count > 0)
                      .rolling(self._window, min_periods=1)
                      .sum()
                      .values
                      )

        window_obs[np.isnan(window_obs)] = 0
        return window_obs.astype(int)

    @cache_readonly
    def _nobs_raw(self):
        if self._is_rolling:
            window = self._window
        else:
            # expanding case
            window = len(self._index)

        result = Series(self._time_obs_count).rolling(
            window, min_periods=1).sum().values

        return result.astype(int)

    def _beta_matrix(self, lag=0):
        if lag < 0:
            raise AssertionError("'lag' must be greater than or equal to 0, "
                                 "input was {0}".format(lag))

        betas = self._beta_raw

        labels = np.arange(len(self._y)) - lag
        indexer = self._valid_obs_labels.searchsorted(labels, side='left')
        indexer[indexer == len(betas)] = len(betas) - 1

        beta_matrix = betas[indexer]
        beta_matrix[labels < self._valid_obs_labels[0]] = np.NaN

        return beta_matrix

    @cache_readonly
    def _valid_obs_labels(self):
        dates = self._index[self._valid_indices]
        return self._y.index.searchsorted(dates)

    @cache_readonly
    def _nobs(self):
        return self._nobs_raw[self._valid_indices]

    @property
    def nobs(self):
        return Series(self._nobs, index=self._result_index)

    @cache_readonly
    def _enough_obs(self):
        # XXX: what's the best way to determine where to start?
        return self._nobs_raw >= max(self._min_periods,
                                     len(self._x.columns) + 1)


def _safe_update(d, other):
    """
    Combine dictionaries with non-overlapping keys
    """
    for k, v in compat.iteritems(other):
        if k in d:
            raise Exception('Duplicate regressor: %s' % k)

        d[k] = v


def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights.  If None, equivalent to an unweighted OLS.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    if not isinstance(lhs, Series):
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    combined = rhs.join(lhs, how='outer')
    if weights is not None:
        combined['__weights__'] = weights

    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    if weights is not None:
        filt_weights = combined.pop('__weights__')
    else:
        filt_weights = None

    filt_lhs = combined.pop('__y__')
    filt_rhs = combined

    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
            pre_filt_rhs.to_dense(), index, valid)


def _combine_rhs(rhs):
    """
    Glue input X variables together while checking for potential
    duplicates
    """
    series = {}

    if isinstance(rhs, Series):
        series['x'] = rhs
    elif isinstance(rhs, DataFrame):
        series = rhs.copy()
    elif isinstance(rhs, dict):
        for name, value in compat.iteritems(rhs):
            if isinstance(value, Series):
                _safe_update(series, {name: value})
            elif isinstance(value, (dict, DataFrame)):
                _safe_update(series, value)
            else:  # pragma: no cover
                raise Exception('Invalid RHS data type: %s' % type(value))
    else:  # pragma: no cover
        raise Exception('Invalid RHS type: %s' % type(rhs))

    if not isinstance(series, DataFrame):
        series = DataFrame(series, dtype=float)

    return series

# A little kludge so we can use this method for both
# MovingOLS and MovingPanelOLS


def _y_converter(y):
    y = y.values.squeeze()
    if y.ndim == 0:  # pragma: no cover
        return np.array([y])
    else:
        return y


def f_stat_to_dict(result):
    f_stat, shape, p_value = result

    result = {}
    result['f-stat'] = f_stat
    result['DF X'] = shape[0]
    result['DF Resid'] = shape[1]
    result['p-value'] = p_value

    return result