plm.py - This Python code implements a panel data analysis …

/pandas/stats/plm.py

http://github.com/wesm/pandas · Python · 863 lines · 654 code · 119 blank · 90 comment · 60 complexity · 495cb1035c51ce4c912e5f301ec74ced MD5 · raw file

"""
Linear regression objects for panel data
"""

# pylint: disable-msg=W0231
# pylint: disable-msg=E1101,E1103

# flake8: noqa

from __future__ import division
from pandas.compat import range
from pandas import compat
import warnings

import numpy as np

from pandas.core.panel import Panel
from pandas.core.frame import DataFrame
from pandas.core.reshape import get_dummies
from pandas.core.series import Series
from pandas.stats.ols import OLS, MovingOLS
import pandas.stats.common as com
import pandas.stats.math as math
from pandas.util.decorators import cache_readonly


class PanelOLS(OLS):
    """Implements panel OLS.

    See ols function docs
    """
    _panel_model = True

    def __init__(self, y, x, weights=None, intercept=True, nw_lags=None,
                 entity_effects=False, time_effects=False, x_effects=None,
                 cluster=None, dropped_dummies=None, verbose=False,
                 nw_overlap=False):
        import warnings
        warnings.warn("The pandas.stats.plm module is deprecated and will be "
                      "removed in a future version. We refer to external packages "
                      "like statsmodels, see some examples here: "
                      "http://www.statsmodels.org/stable/mixed_linear.html",
                      FutureWarning, stacklevel=4)
        self._x_orig = x
        self._y_orig = y
        self._weights = weights

        self._intercept = intercept
        self._nw_lags = nw_lags
        self._nw_overlap = nw_overlap
        self._entity_effects = entity_effects
        self._time_effects = time_effects
        self._x_effects = x_effects
        self._dropped_dummies = dropped_dummies or {}
        self._cluster = com._get_cluster_type(cluster)
        self._verbose = verbose

        (self._x, self._x_trans,
         self._x_filtered, self._y,
         self._y_trans) = self._prepare_data()

        self._index = self._x.index.levels[0]

        self._T = len(self._index)

    def log(self, msg):
        if self._verbose:  # pragma: no cover
            print(msg)

    def _prepare_data(self):
        """Cleans and stacks input data into DataFrame objects

        If time effects is True, then we turn off intercepts and omit an item
        from every (entity and x) fixed effect.

        Otherwise:
           - If we have an intercept, we omit an item from every fixed effect.
           - Else, we omit an item from every fixed effect except one of them.

        The categorical variables will get dropped from x.
        """
        (x, x_filtered, y, weights, cat_mapping) = self._filter_data()

        self.log('Adding dummies to X variables')
        x = self._add_dummies(x, cat_mapping)

        self.log('Adding dummies to filtered X variables')
        x_filtered = self._add_dummies(x_filtered, cat_mapping)

        if self._x_effects:
            x = x.drop(self._x_effects, axis=1)
            x_filtered = x_filtered.drop(self._x_effects, axis=1)

        if self._time_effects:
            x_regressor = x.sub(x.mean(level=0), level=0)

            unstacked_y = y.unstack()
            y_regressor = unstacked_y.sub(unstacked_y.mean(1), axis=0).stack()
            y_regressor.index = y.index

        elif self._intercept:
            # only add intercept when no time effects
            self.log('Adding intercept')
            x = x_regressor = add_intercept(x)
            x_filtered = add_intercept(x_filtered)
            y_regressor = y
        else:
            self.log('No intercept added')
            x_regressor = x
            y_regressor = y

        if weights is not None:
            if not y_regressor.index.equals(weights.index):
                raise AssertionError("y_regressor and weights must have the "
                                     "same index")
            if not x_regressor.index.equals(weights.index):
                raise AssertionError("x_regressor and weights must have the "
                                     "same index")

            rt_weights = np.sqrt(weights)
            y_regressor = y_regressor * rt_weights
            x_regressor = x_regressor.mul(rt_weights, axis=0)

        return x, x_regressor, x_filtered, y, y_regressor

    def _filter_data(self):
        """

        """
        data = self._x_orig
        cat_mapping = {}

        if isinstance(data, DataFrame):
            data = data.to_panel()
        else:
            if isinstance(data, Panel):
                data = data.copy()

            data, cat_mapping = self._convert_x(data)

            if not isinstance(data, Panel):
                data = Panel.from_dict(data, intersect=True)

        x_names = data.items

        if self._weights is not None:
            data['__weights__'] = self._weights

        # Filter x's without y (so we can make a prediction)
        filtered = data.to_frame()

        # Filter all data together using to_frame

        # convert to DataFrame
        y = self._y_orig
        if isinstance(y, Series):
            y = y.unstack()

        data['__y__'] = y
        data_long = data.to_frame()

        x_filt = filtered.filter(x_names)
        x = data_long.filter(x_names)
        y = data_long['__y__']

        if self._weights is not None and not self._weights.empty:
            weights = data_long['__weights__']
        else:
            weights = None

        return x, x_filt, y, weights, cat_mapping

    def _convert_x(self, x):
        # Converts non-numeric data in x to floats. x_converted is the
        # DataFrame with converted values, and x_conversion is a dict that
        # provides the reverse mapping.  For example, if 'A' was converted to 0
        # for x named 'variety', then x_conversion['variety'][0] is 'A'.
        x_converted = {}
        cat_mapping = {}
        # x can be either a dict or a Panel, but in Python 3, dicts don't have
        # .iteritems
        iteritems = getattr(x, 'iteritems', x.items)
        for key, df in iteritems():
            if not isinstance(df, DataFrame):
                raise AssertionError("all input items must be DataFrames, "
                                     "at least one is of "
                                     "type {0}".format(type(df)))

            if _is_numeric(df):
                x_converted[key] = df
            else:
                try:
                    df = df.astype(float)
                except (TypeError, ValueError):
                    values = df.values
                    distinct_values = sorted(set(values.flat))
                    cat_mapping[key] = dict(enumerate(distinct_values))
                    new_values = np.searchsorted(distinct_values, values)
                    x_converted[key] = DataFrame(new_values, index=df.index,
                                                 columns=df.columns)

        if len(cat_mapping) == 0:
            x_converted = x

        return x_converted, cat_mapping

    def _add_dummies(self, panel, mapping):
        """
        Add entity and / or categorical dummies to input X DataFrame

        Returns
        -------
        DataFrame
        """
        panel = self._add_entity_effects(panel)
        panel = self._add_categorical_dummies(panel, mapping)

        return panel

    def _add_entity_effects(self, panel):
        """
        Add entity dummies to panel

        Returns
        -------
        DataFrame
        """
        from pandas.core.reshape import make_axis_dummies

        if not self._entity_effects:
            return panel

        self.log('-- Adding entity fixed effect dummies')

        dummies = make_axis_dummies(panel, 'minor')

        if not self._use_all_dummies:
            if 'entity' in self._dropped_dummies:
                to_exclude = str(self._dropped_dummies.get('entity'))
            else:
                to_exclude = dummies.columns[0]

            if to_exclude not in dummies.columns:
                raise Exception('%s not in %s' % (to_exclude,
                                                  dummies.columns))

            self.log('-- Excluding dummy for entity: %s' % to_exclude)

            dummies = dummies.filter(dummies.columns.difference([to_exclude]))

        dummies = dummies.add_prefix('FE_')
        panel = panel.join(dummies)

        return panel

    def _add_categorical_dummies(self, panel, cat_mappings):
        """
        Add categorical dummies to panel

        Returns
        -------
        DataFrame
        """
        if not self._x_effects:
            return panel

        dropped_dummy = (self._entity_effects and not self._use_all_dummies)

        for effect in self._x_effects:
            self.log('-- Adding fixed effect dummies for %s' % effect)

            dummies = get_dummies(panel[effect])

            val_map = cat_mappings.get(effect)
            if val_map:
                val_map = dict((v, k) for k, v in compat.iteritems(val_map))

            if dropped_dummy or not self._use_all_dummies:
                if effect in self._dropped_dummies:
                    to_exclude = mapped_name = self._dropped_dummies.get(
                        effect)

                    if val_map:
                        mapped_name = val_map[to_exclude]
                else:
                    to_exclude = mapped_name = dummies.columns[0]

                if mapped_name not in dummies.columns:  # pragma: no cover
                    raise Exception('%s not in %s' % (to_exclude,
                                                      dummies.columns))

                self.log(
                    '-- Excluding dummy for %s: %s' % (effect, to_exclude))

                dummies = dummies.filter(
                    dummies.columns.difference([mapped_name]))
                dropped_dummy = True

            dummies = _convertDummies(dummies, cat_mappings.get(effect))
            dummies = dummies.add_prefix('%s_' % effect)
            panel = panel.join(dummies)

        return panel

    @property
    def _use_all_dummies(self):
        """
        In the case of using an intercept or including time fixed
        effects, completely partitioning the sample would make the X
        not full rank.
        """
        return (not self._intercept and not self._time_effects)

    @cache_readonly
    def _beta_raw(self):
        """Runs the regression and returns the beta."""
        X = self._x_trans.values
        Y = self._y_trans.values.squeeze()

        beta, _, _, _ = np.linalg.lstsq(X, Y)

        return beta

    @cache_readonly
    def beta(self):
        return Series(self._beta_raw, index=self._x.columns)

    @cache_readonly
    def _df_model_raw(self):
        """Returns the raw model degrees of freedom."""
        return self._df_raw - 1

    @cache_readonly
    def _df_resid_raw(self):
        """Returns the raw residual degrees of freedom."""
        return self._nobs - self._df_raw

    @cache_readonly
    def _df_raw(self):
        """Returns the degrees of freedom."""
        df = math.rank(self._x_trans.values)
        if self._time_effects:
            df += self._total_times

        return df

    @cache_readonly
    def _r2_raw(self):
        Y = self._y_trans.values.squeeze()
        X = self._x_trans.values

        resid = Y - np.dot(X, self._beta_raw)

        SSE = (resid ** 2).sum()

        if self._use_centered_tss:
            SST = ((Y - np.mean(Y)) ** 2).sum()
        else:
            SST = (Y ** 2).sum()

        return 1 - SSE / SST

    @property
    def _use_centered_tss(self):
        # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR
        return self._intercept or self._entity_effects or self._time_effects

    @cache_readonly
    def _r2_adj_raw(self):
        """Returns the raw r-squared adjusted values."""
        nobs = self._nobs
        factors = (nobs - 1) / (nobs - self._df_raw)
        return 1 - (1 - self._r2_raw) * factors

    @cache_readonly
    def _resid_raw(self):
        Y = self._y.values.squeeze()
        X = self._x.values
        return Y - np.dot(X, self._beta_raw)

    @cache_readonly
    def resid(self):
        return self._unstack_vector(self._resid_raw)

    @cache_readonly
    def _rmse_raw(self):
        """Returns the raw rmse values."""
        # X = self._x.values
        # Y = self._y.values.squeeze()

        X = self._x_trans.values
        Y = self._y_trans.values.squeeze()

        resid = Y - np.dot(X, self._beta_raw)
        ss = (resid ** 2).sum()
        return np.sqrt(ss / (self._nobs - self._df_raw))

    @cache_readonly
    def _var_beta_raw(self):
        cluster_axis = None
        if self._cluster == 'time':
            cluster_axis = 0
        elif self._cluster == 'entity':
            cluster_axis = 1

        x = self._x
        y = self._y

        if self._time_effects:
            xx = _xx_time_effects(x, y)
        else:
            xx = np.dot(x.values.T, x.values)

        return _var_beta_panel(y, x, self._beta_raw, xx,
                               self._rmse_raw, cluster_axis, self._nw_lags,
                               self._nobs, self._df_raw, self._nw_overlap)

    @cache_readonly
    def _y_fitted_raw(self):
        """Returns the raw fitted y values."""
        return np.dot(self._x.values, self._beta_raw)

    @cache_readonly
    def y_fitted(self):
        return self._unstack_vector(self._y_fitted_raw, index=self._x.index)

    def _unstack_vector(self, vec, index=None):
        if index is None:
            index = self._y_trans.index
        panel = DataFrame(vec, index=index, columns=['dummy'])
        return panel.to_panel()['dummy']

    def _unstack_y(self, vec):
        unstacked = self._unstack_vector(vec)
        return unstacked.reindex(self.beta.index)

    @cache_readonly
    def _time_obs_count(self):
        return self._y_trans.count(level=0).values

    @cache_readonly
    def _time_has_obs(self):
        return self._time_obs_count > 0

    @property
    def _nobs(self):
        return len(self._y)


def _convertDummies(dummies, mapping):
    # cleans up the names of the generated dummies
    new_items = []
    for item in dummies.columns:
        if not mapping:
            var = str(item)
            if isinstance(item, float):
                var = '%g' % item

            new_items.append(var)
        else:
            # renames the dummies if a conversion dict is provided
            new_items.append(mapping[int(item)])

    dummies = DataFrame(dummies.values, index=dummies.index,
                        columns=new_items)

    return dummies


def _is_numeric(df):
    for col in df:
        if df[col].dtype.name == 'object':
            return False

    return True


def add_intercept(panel, name='intercept'):
    """
    Add column of ones to input panel

    Parameters
    ----------
    panel: Panel / DataFrame
    name: string, default 'intercept']

    Returns
    -------
    New object (same type as input)
    """
    panel = panel.copy()
    panel[name] = 1.

    return panel.consolidate()


class MovingPanelOLS(MovingOLS, PanelOLS):
    """Implements rolling/expanding panel OLS.

    See ols function docs
    """
    _panel_model = True

    def __init__(self, y, x, weights=None,
                 window_type='expanding', window=None,
                 min_periods=None,
                 min_obs=None,
                 intercept=True,
                 nw_lags=None, nw_overlap=False,
                 entity_effects=False,
                 time_effects=False,
                 x_effects=None,
                 cluster=None,
                 dropped_dummies=None,
                 verbose=False):

        self._args = dict(intercept=intercept,
                          nw_lags=nw_lags,
                          nw_overlap=nw_overlap,
                          entity_effects=entity_effects,
                          time_effects=time_effects,
                          x_effects=x_effects,
                          cluster=cluster,
                          dropped_dummies=dropped_dummies,
                          verbose=verbose)

        PanelOLS.__init__(self, y=y, x=x, weights=weights,
                          **self._args)

        self._set_window(window_type, window, min_periods)

        if min_obs is None:
            min_obs = len(self._x.columns) + 1

        self._min_obs = min_obs

    @cache_readonly
    def resid(self):
        return self._unstack_y(self._resid_raw)

    @cache_readonly
    def y_fitted(self):
        return self._unstack_y(self._y_fitted_raw)

    @cache_readonly
    def y_predict(self):
        """Returns the predicted y values."""
        return self._unstack_y(self._y_predict_raw)

    def lagged_y_predict(self, lag=1):
        """
        Compute forecast Y value lagging coefficient by input number
        of time periods

        Parameters
        ----------
        lag : int

        Returns
        -------
        DataFrame
        """
        x = self._x.values
        betas = self._beta_matrix(lag=lag)
        return self._unstack_y((betas * x).sum(1))

    @cache_readonly
    def _rolling_ols_call(self):
        return self._calc_betas(self._x_trans, self._y_trans)

    @cache_readonly
    def _df_raw(self):
        """Returns the degrees of freedom."""
        df = self._rolling_rank()

        if self._time_effects:
            df += self._window_time_obs

        return df[self._valid_indices]

    @cache_readonly
    def _var_beta_raw(self):
        """Returns the raw covariance of beta."""
        x = self._x
        y = self._y

        dates = x.index.levels[0]

        cluster_axis = None
        if self._cluster == 'time':
            cluster_axis = 0
        elif self._cluster == 'entity':
            cluster_axis = 1

        nobs = self._nobs
        rmse = self._rmse_raw
        beta = self._beta_raw
        df = self._df_raw
        window = self._window

        if not self._time_effects:
            # Non-transformed X
            cum_xx = self._cum_xx(x)

        results = []
        for n, i in enumerate(self._valid_indices):
            if self._is_rolling and i >= window:
                prior_date = dates[i - window + 1]
            else:
                prior_date = dates[0]

            date = dates[i]

            x_slice = x.truncate(prior_date, date)
            y_slice = y.truncate(prior_date, date)

            if self._time_effects:
                xx = _xx_time_effects(x_slice, y_slice)
            else:
                xx = cum_xx[i]
                if self._is_rolling and i >= window:
                    xx = xx - cum_xx[i - window]

            result = _var_beta_panel(y_slice, x_slice, beta[n], xx, rmse[n],
                                     cluster_axis, self._nw_lags,
                                     nobs[n], df[n], self._nw_overlap)

            results.append(result)

        return np.array(results)

    @cache_readonly
    def _resid_raw(self):
        beta_matrix = self._beta_matrix(lag=0)

        Y = self._y.values.squeeze()
        X = self._x.values
        resid = Y - (X * beta_matrix).sum(1)

        return resid

    @cache_readonly
    def _y_fitted_raw(self):
        x = self._x.values
        betas = self._beta_matrix(lag=0)
        return (betas * x).sum(1)

    @cache_readonly
    def _y_predict_raw(self):
        """Returns the raw predicted y values."""
        x = self._x.values
        betas = self._beta_matrix(lag=1)
        return (betas * x).sum(1)

    def _beta_matrix(self, lag=0):
        if lag < 0:
            raise AssertionError("'lag' must be greater than or equal to 0, "
                                 "input was {0}".format(lag))

        index = self._y_trans.index
        major_labels = index.labels[0]
        labels = major_labels - lag
        indexer = self._valid_indices.searchsorted(labels, side='left')

        beta_matrix = self._beta_raw[indexer]
        beta_matrix[labels < self._valid_indices[0]] = np.NaN

        return beta_matrix

    @cache_readonly
    def _enough_obs(self):
        # XXX: what's the best way to determine where to start?
        # TODO: write unit tests for this

        rank_threshold = len(self._x.columns) + 1
        if self._min_obs < rank_threshold:  # pragma: no cover
            warnings.warn('min_obs is smaller than rank of X matrix')

        enough_observations = self._nobs_raw >= self._min_obs
        enough_time_periods = self._window_time_obs >= self._min_periods
        return enough_time_periods & enough_observations


def create_ols_dict(attr):
    def attr_getter(self):
        d = {}
        for k, v in compat.iteritems(self.results):
            result = getattr(v, attr)
            d[k] = result

        return d

    return attr_getter


def create_ols_attr(attr):
    return property(create_ols_dict(attr))


class NonPooledPanelOLS(object):
    """Implements non-pooled panel OLS.

    Parameters
    ----------
    y : DataFrame
    x : Series, DataFrame, or dict of Series
    intercept : bool
        True if you want an intercept.
    nw_lags : None or int
        Number of Newey-West lags.
    window_type : {'full_sample', 'rolling', 'expanding'}
        'full_sample' by default
    window : int
        size of window (for rolling/expanding OLS)
    """

    ATTRIBUTES = [
        'beta',
        'df',
        'df_model',
        'df_resid',
        'f_stat',
        'p_value',
        'r2',
        'r2_adj',
        'resid',
        'rmse',
        'std_err',
        'summary_as_matrix',
        't_stat',
        'var_beta',
        'x',
        'y',
        'y_fitted',
        'y_predict'
    ]

    def __init__(self, y, x, window_type='full_sample', window=None,
                 min_periods=None, intercept=True, nw_lags=None,
                 nw_overlap=False):

        import warnings
        warnings.warn("The pandas.stats.plm module is deprecated and will be "
                      "removed in a future version. We refer to external packages "
                      "like statsmodels, see some examples here: "
                      "http://www.statsmodels.org/stable/mixed_linear.html",
                      FutureWarning, stacklevel=4)

        for attr in self.ATTRIBUTES:
            setattr(self.__class__, attr, create_ols_attr(attr))

        results = {}

        for entity in y:
            entity_y = y[entity]

            entity_x = {}
            for x_var in x:
                entity_x[x_var] = x[x_var][entity]

            from pandas.stats.interface import ols
            results[entity] = ols(y=entity_y,
                                  x=entity_x,
                                  window_type=window_type,
                                  window=window,
                                  min_periods=min_periods,
                                  intercept=intercept,
                                  nw_lags=nw_lags,
                                  nw_overlap=nw_overlap)

        self.results = results


def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis,
                    nw_lags, nobs, df, nw_overlap):
    xx_inv = math.inv(xx)

    yv = y.values

    if cluster_axis is None:
        if nw_lags is None:
            return xx_inv * (rmse ** 2)
        else:
            resid = yv - np.dot(x.values, beta)
            m = (x.values.T * resid).T

            xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap)

            return np.dot(xx_inv, np.dot(xeps, xx_inv))
    else:
        Xb = np.dot(x.values, beta).reshape((len(x.values), 1))
        resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid'])

        if cluster_axis == 1:
            x = x.swaplevel(0, 1).sortlevel(0)
            resid = resid.swaplevel(0, 1).sortlevel(0)

        m = _group_agg(x.values * resid.values, x.index._bounds,
                       lambda x: np.sum(x, axis=0))

        if nw_lags is None:
            nw_lags = 0

        xox = 0
        for i in range(len(x.index.levels[0])):
            xox += math.newey_west(m[i: i + 1], nw_lags,
                                   nobs, df, nw_overlap)

        return np.dot(xx_inv, np.dot(xox, xx_inv))


def _group_agg(values, bounds, f):
    """
    R-style aggregator

    Parameters
    ----------
    values : N-length or N x K ndarray
    bounds : B-length ndarray
    f : ndarray aggregation function

    Returns
    -------
    ndarray with same length as bounds array
    """
    if values.ndim == 1:
        N = len(values)
        result = np.empty(len(bounds), dtype=float)
    elif values.ndim == 2:
        N, K = values.shape
        result = np.empty((len(bounds), K), dtype=float)

    testagg = f(values[:min(1, len(values))])
    if isinstance(testagg, np.ndarray) and testagg.ndim == 2:
        raise AssertionError('Function must reduce')

    for i, left_bound in enumerate(bounds):
        if i == len(bounds) - 1:
            right_bound = N
        else:
            right_bound = bounds[i + 1]

        result[i] = f(values[left_bound:right_bound])

    return result


def _xx_time_effects(x, y):
    """
    Returns X'X - (X'T) (T'T)^-1 (T'X)
    """
    # X'X
    xx = np.dot(x.values.T, x.values)
    xt = x.sum(level=0).values

    count = y.unstack().count(1).values
    selector = count > 0

    # X'X - (T'T)^-1 (T'X)
    xt = xt[selector]
    count = count[selector]

    return xx - np.dot(xt.T / count, xt)
Summary ✨

This Python code implements a panel data analysis framework for mixed linear models. It defines classes and functions to estimate various model parameters, such as residuals, variance components, and standard errors. The code is designed to work with panel data, which consists of multiple observations across time for each unit (e.g., entities).
Tech Fingerprint

Alerts (3)

Complexity hotspot; lines 278 to 279 (total complexity: 3)
278 279
'isinstance(' Overuse may indicate design issues; consider polymorphism
834