/pandas/stats/plm.py
Python | 814 lines | 607 code | 118 blank | 89 comment | 61 complexity | 332d1cef25200815fcde39cd342dfd8a MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
- """
- Linear regression objects for panel data
- """
- # pylint: disable-msg=W0231
- # pylint: disable-msg=E1101,E1103
- from __future__ import division
- from pandas.compat import range
- from pandas import compat
- import warnings
- import numpy as np
- from pandas.core.panel import Panel
- from pandas.core.frame import DataFrame
- from pandas.core.reshape import get_dummies
- from pandas.core.series import Series
- from pandas.core.sparse import SparsePanel
- from pandas.stats.ols import OLS, MovingOLS
- import pandas.stats.common as com
- import pandas.stats.math as math
- from pandas.util.decorators import cache_readonly
- class PanelOLS(OLS):
- """Implements panel OLS.
- See ols function docs
- """
- _panel_model = True
- def __init__(self, y, x, weights=None, intercept=True, nw_lags=None,
- entity_effects=False, time_effects=False, x_effects=None,
- cluster=None, dropped_dummies=None, verbose=False,
- nw_overlap=False):
- self._x_orig = x
- self._y_orig = y
- self._weights = weights
- self._intercept = intercept
- self._nw_lags = nw_lags
- self._nw_overlap = nw_overlap
- self._entity_effects = entity_effects
- self._time_effects = time_effects
- self._x_effects = x_effects
- self._dropped_dummies = dropped_dummies or {}
- self._cluster = com._get_cluster_type(cluster)
- self._verbose = verbose
- (self._x, self._x_trans,
- self._x_filtered, self._y,
- self._y_trans) = self._prepare_data()
- self._index = self._x.index.levels[0]
- self._T = len(self._index)
- def log(self, msg):
- if self._verbose: # pragma: no cover
- print(msg)
- def _prepare_data(self):
- """Cleans and stacks input data into DataFrame objects
- If time effects is True, then we turn off intercepts and omit an item
- from every (entity and x) fixed effect.
- Otherwise:
- - If we have an intercept, we omit an item from every fixed effect.
- - Else, we omit an item from every fixed effect except one of them.
- The categorical variables will get dropped from x.
- """
- (x, x_filtered, y, weights, cat_mapping) = self._filter_data()
- self.log('Adding dummies to X variables')
- x = self._add_dummies(x, cat_mapping)
- self.log('Adding dummies to filtered X variables')
- x_filtered = self._add_dummies(x_filtered, cat_mapping)
- if self._x_effects:
- x = x.drop(self._x_effects, axis=1)
- x_filtered = x_filtered.drop(self._x_effects, axis=1)
- if self._time_effects:
- x_regressor = x.sub(x.mean(level=0), level=0)
- unstacked_y = y.unstack()
- y_regressor = unstacked_y.sub(unstacked_y.mean(1), axis=0).stack()
- y_regressor.index = y.index
- elif self._intercept:
- # only add intercept when no time effects
- self.log('Adding intercept')
- x = x_regressor = add_intercept(x)
- x_filtered = add_intercept(x_filtered)
- y_regressor = y
- else:
- self.log('No intercept added')
- x_regressor = x
- y_regressor = y
- if weights is not None:
- if not y_regressor.index.equals(weights.index):
- raise AssertionError("y_regressor and weights must have the "
- "same index")
- if not x_regressor.index.equals(weights.index):
- raise AssertionError("x_regressor and weights must have the "
- "same index")
- rt_weights = np.sqrt(weights)
- y_regressor = y_regressor * rt_weights
- x_regressor = x_regressor.mul(rt_weights, axis=0)
- return x, x_regressor, x_filtered, y, y_regressor
- def _filter_data(self):
- """
- """
- data = self._x_orig
- cat_mapping = {}
- if isinstance(data, DataFrame):
- data = data.to_panel()
- else:
- if isinstance(data, Panel):
- data = data.copy()
- if not isinstance(data, SparsePanel):
- data, cat_mapping = self._convert_x(data)
- if not isinstance(data, Panel):
- data = Panel.from_dict(data, intersect=True)
- x_names = data.items
- if self._weights is not None:
- data['__weights__'] = self._weights
- # Filter x's without y (so we can make a prediction)
- filtered = data.to_frame()
- # Filter all data together using to_frame
- # convert to DataFrame
- y = self._y_orig
- if isinstance(y, Series):
- y = y.unstack()
- data['__y__'] = y
- data_long = data.to_frame()
- x_filt = filtered.filter(x_names)
- x = data_long.filter(x_names)
- y = data_long['__y__']
- if self._weights is not None and not self._weights.empty:
- weights = data_long['__weights__']
- else:
- weights = None
- return x, x_filt, y, weights, cat_mapping
- def _convert_x(self, x):
- # Converts non-numeric data in x to floats. x_converted is the
- # DataFrame with converted values, and x_conversion is a dict that
- # provides the reverse mapping. For example, if 'A' was converted to 0
- # for x named 'variety', then x_conversion['variety'][0] is 'A'.
- x_converted = {}
- cat_mapping = {}
- # x can be either a dict or a Panel, but in Python 3, dicts don't have
- # .iteritems
- iteritems = getattr(x, 'iteritems', x.items)
- for key, df in iteritems():
- if not isinstance(df, DataFrame):
- raise AssertionError("all input items must be DataFrames, "
- "at least one is of "
- "type {0}".format(type(df)))
- if _is_numeric(df):
- x_converted[key] = df
- else:
- try:
- df = df.astype(float)
- except (TypeError, ValueError):
- values = df.values
- distinct_values = sorted(set(values.flat))
- cat_mapping[key] = dict(enumerate(distinct_values))
- new_values = np.searchsorted(distinct_values, values)
- x_converted[key] = DataFrame(new_values, index=df.index,
- columns=df.columns)
- if len(cat_mapping) == 0:
- x_converted = x
- return x_converted, cat_mapping
- def _add_dummies(self, panel, mapping):
- """
- Add entity and / or categorical dummies to input X DataFrame
- Returns
- -------
- DataFrame
- """
- panel = self._add_entity_effects(panel)
- panel = self._add_categorical_dummies(panel, mapping)
- return panel
- def _add_entity_effects(self, panel):
- """
- Add entity dummies to panel
- Returns
- -------
- DataFrame
- """
- from pandas.core.reshape import make_axis_dummies
- if not self._entity_effects:
- return panel
- self.log('-- Adding entity fixed effect dummies')
- dummies = make_axis_dummies(panel, 'minor')
- if not self._use_all_dummies:
- if 'entity' in self._dropped_dummies:
- to_exclude = str(self._dropped_dummies.get('entity'))
- else:
- to_exclude = dummies.columns[0]
- if to_exclude not in dummies.columns:
- raise Exception('%s not in %s' % (to_exclude,
- dummies.columns))
- self.log('-- Excluding dummy for entity: %s' % to_exclude)
- dummies = dummies.filter(dummies.columns - [to_exclude])
- dummies = dummies.add_prefix('FE_')
- panel = panel.join(dummies)
- return panel
- def _add_categorical_dummies(self, panel, cat_mappings):
- """
- Add categorical dummies to panel
- Returns
- -------
- DataFrame
- """
- if not self._x_effects:
- return panel
- dropped_dummy = (self._entity_effects and not self._use_all_dummies)
- for effect in self._x_effects:
- self.log('-- Adding fixed effect dummies for %s' % effect)
- dummies = get_dummies(panel[effect])
- val_map = cat_mappings.get(effect)
- if val_map:
- val_map = dict((v, k) for k, v in compat.iteritems(val_map))
- if dropped_dummy or not self._use_all_dummies:
- if effect in self._dropped_dummies:
- to_exclude = mapped_name = self._dropped_dummies.get(
- effect)
- if val_map:
- mapped_name = val_map[to_exclude]
- else:
- to_exclude = mapped_name = dummies.columns[0]
- if mapped_name not in dummies.columns: # pragma: no cover
- raise Exception('%s not in %s' % (to_exclude,
- dummies.columns))
- self.log(
- '-- Excluding dummy for %s: %s' % (effect, to_exclude))
- dummies = dummies.filter(dummies.columns - [mapped_name])
- dropped_dummy = True
- dummies = _convertDummies(dummies, cat_mappings.get(effect))
- dummies = dummies.add_prefix('%s_' % effect)
- panel = panel.join(dummies)
- return panel
- @property
- def _use_all_dummies(self):
- """
- In the case of using an intercept or including time fixed
- effects, completely partitioning the sample would make the X
- not full rank.
- """
- return (not self._intercept and not self._time_effects)
- @cache_readonly
- def _beta_raw(self):
- """Runs the regression and returns the beta."""
- X = self._x_trans.values
- Y = self._y_trans.values.squeeze()
- beta, _, _, _ = np.linalg.lstsq(X, Y)
- return beta
- @cache_readonly
- def beta(self):
- return Series(self._beta_raw, index=self._x.columns)
- @cache_readonly
- def _df_model_raw(self):
- """Returns the raw model degrees of freedom."""
- return self._df_raw - 1
- @cache_readonly
- def _df_resid_raw(self):
- """Returns the raw residual degrees of freedom."""
- return self._nobs - self._df_raw
- @cache_readonly
- def _df_raw(self):
- """Returns the degrees of freedom."""
- df = math.rank(self._x_trans.values)
- if self._time_effects:
- df += self._total_times
- return df
- @cache_readonly
- def _r2_raw(self):
- Y = self._y_trans.values.squeeze()
- X = self._x_trans.values
- resid = Y - np.dot(X, self._beta_raw)
- SSE = (resid ** 2).sum()
- if self._use_centered_tss:
- SST = ((Y - np.mean(Y)) ** 2).sum()
- else:
- SST = (Y ** 2).sum()
- return 1 - SSE / SST
- @property
- def _use_centered_tss(self):
- # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR
- return self._intercept or self._entity_effects or self._time_effects
- @cache_readonly
- def _r2_adj_raw(self):
- """Returns the raw r-squared adjusted values."""
- nobs = self._nobs
- factors = (nobs - 1) / (nobs - self._df_raw)
- return 1 - (1 - self._r2_raw) * factors
- @cache_readonly
- def _resid_raw(self):
- Y = self._y.values.squeeze()
- X = self._x.values
- return Y - np.dot(X, self._beta_raw)
- @cache_readonly
- def resid(self):
- return self._unstack_vector(self._resid_raw)
- @cache_readonly
- def _rmse_raw(self):
- """Returns the raw rmse values."""
- # X = self._x.values
- # Y = self._y.values.squeeze()
- X = self._x_trans.values
- Y = self._y_trans.values.squeeze()
- resid = Y - np.dot(X, self._beta_raw)
- ss = (resid ** 2).sum()
- return np.sqrt(ss / (self._nobs - self._df_raw))
- @cache_readonly
- def _var_beta_raw(self):
- cluster_axis = None
- if self._cluster == 'time':
- cluster_axis = 0
- elif self._cluster == 'entity':
- cluster_axis = 1
- x = self._x
- y = self._y
- if self._time_effects:
- xx = _xx_time_effects(x, y)
- else:
- xx = np.dot(x.values.T, x.values)
- return _var_beta_panel(y, x, self._beta_raw, xx,
- self._rmse_raw, cluster_axis, self._nw_lags,
- self._nobs, self._df_raw, self._nw_overlap)
- @cache_readonly
- def _y_fitted_raw(self):
- """Returns the raw fitted y values."""
- return np.dot(self._x.values, self._beta_raw)
- @cache_readonly
- def y_fitted(self):
- return self._unstack_vector(self._y_fitted_raw, index=self._x.index)
- def _unstack_vector(self, vec, index=None):
- if index is None:
- index = self._y_trans.index
- panel = DataFrame(vec, index=index, columns=['dummy'])
- return panel.to_panel()['dummy']
- def _unstack_y(self, vec):
- unstacked = self._unstack_vector(vec)
- return unstacked.reindex(self.beta.index)
- @cache_readonly
- def _time_obs_count(self):
- return self._y_trans.count(level=0).values
- @cache_readonly
- def _time_has_obs(self):
- return self._time_obs_count > 0
- @property
- def _nobs(self):
- return len(self._y)
- def _convertDummies(dummies, mapping):
- # cleans up the names of the generated dummies
- new_items = []
- for item in dummies.columns:
- if not mapping:
- var = str(item)
- if isinstance(item, float):
- var = '%g' % item
- new_items.append(var)
- else:
- # renames the dummies if a conversion dict is provided
- new_items.append(mapping[int(item)])
- dummies = DataFrame(dummies.values, index=dummies.index,
- columns=new_items)
- return dummies
- def _is_numeric(df):
- for col in df:
- if df[col].dtype.name == 'object':
- return False
- return True
- def add_intercept(panel, name='intercept'):
- """
- Add column of ones to input panel
- Parameters
- ----------
- panel: Panel / DataFrame
- name: string, default 'intercept']
- Returns
- -------
- New object (same type as input)
- """
- panel = panel.copy()
- panel[name] = 1.
- return panel.consolidate()
- class MovingPanelOLS(MovingOLS, PanelOLS):
- """Implements rolling/expanding panel OLS.
- See ols function docs
- """
- _panel_model = True
- def __init__(self, y, x, weights=None,
- window_type='expanding', window=None,
- min_periods=None,
- min_obs=None,
- intercept=True,
- nw_lags=None, nw_overlap=False,
- entity_effects=False,
- time_effects=False,
- x_effects=None,
- cluster=None,
- dropped_dummies=None,
- verbose=False):
- self._args = dict(intercept=intercept,
- nw_lags=nw_lags,
- nw_overlap=nw_overlap,
- entity_effects=entity_effects,
- time_effects=time_effects,
- x_effects=x_effects,
- cluster=cluster,
- dropped_dummies=dropped_dummies,
- verbose=verbose)
- PanelOLS.__init__(self, y=y, x=x, weights=weights,
- **self._args)
- self._set_window(window_type, window, min_periods)
- if min_obs is None:
- min_obs = len(self._x.columns) + 1
- self._min_obs = min_obs
- @cache_readonly
- def resid(self):
- return self._unstack_y(self._resid_raw)
- @cache_readonly
- def y_fitted(self):
- return self._unstack_y(self._y_fitted_raw)
- @cache_readonly
- def y_predict(self):
- """Returns the predicted y values."""
- return self._unstack_y(self._y_predict_raw)
- def lagged_y_predict(self, lag=1):
- """
- Compute forecast Y value lagging coefficient by input number
- of time periods
- Parameters
- ----------
- lag : int
- Returns
- -------
- DataFrame
- """
- x = self._x.values
- betas = self._beta_matrix(lag=lag)
- return self._unstack_y((betas * x).sum(1))
- @cache_readonly
- def _rolling_ols_call(self):
- return self._calc_betas(self._x_trans, self._y_trans)
- @cache_readonly
- def _df_raw(self):
- """Returns the degrees of freedom."""
- df = self._rolling_rank()
- if self._time_effects:
- df += self._window_time_obs
- return df[self._valid_indices]
- @cache_readonly
- def _var_beta_raw(self):
- """Returns the raw covariance of beta."""
- x = self._x
- y = self._y
- dates = x.index.levels[0]
- cluster_axis = None
- if self._cluster == 'time':
- cluster_axis = 0
- elif self._cluster == 'entity':
- cluster_axis = 1
- nobs = self._nobs
- rmse = self._rmse_raw
- beta = self._beta_raw
- df = self._df_raw
- window = self._window
- if not self._time_effects:
- # Non-transformed X
- cum_xx = self._cum_xx(x)
- results = []
- for n, i in enumerate(self._valid_indices):
- if self._is_rolling and i >= window:
- prior_date = dates[i - window + 1]
- else:
- prior_date = dates[0]
- date = dates[i]
- x_slice = x.truncate(prior_date, date)
- y_slice = y.truncate(prior_date, date)
- if self._time_effects:
- xx = _xx_time_effects(x_slice, y_slice)
- else:
- xx = cum_xx[i]
- if self._is_rolling and i >= window:
- xx = xx - cum_xx[i - window]
- result = _var_beta_panel(y_slice, x_slice, beta[n], xx, rmse[n],
- cluster_axis, self._nw_lags,
- nobs[n], df[n], self._nw_overlap)
- results.append(result)
- return np.array(results)
- @cache_readonly
- def _resid_raw(self):
- beta_matrix = self._beta_matrix(lag=0)
- Y = self._y.values.squeeze()
- X = self._x.values
- resid = Y - (X * beta_matrix).sum(1)
- return resid
- @cache_readonly
- def _y_fitted_raw(self):
- x = self._x.values
- betas = self._beta_matrix(lag=0)
- return (betas * x).sum(1)
- @cache_readonly
- def _y_predict_raw(self):
- """Returns the raw predicted y values."""
- x = self._x.values
- betas = self._beta_matrix(lag=1)
- return (betas * x).sum(1)
- def _beta_matrix(self, lag=0):
- if lag < 0:
- raise AssertionError("'lag' must be greater than or equal to 0, "
- "input was {0}".format(lag))
- index = self._y_trans.index
- major_labels = index.labels[0]
- labels = major_labels - lag
- indexer = self._valid_indices.searchsorted(labels, side='left')
- beta_matrix = self._beta_raw[indexer]
- beta_matrix[labels < self._valid_indices[0]] = np.NaN
- return beta_matrix
- @cache_readonly
- def _enough_obs(self):
- # XXX: what's the best way to determine where to start?
- # TODO: write unit tests for this
- rank_threshold = len(self._x.columns) + 1
- if self._min_obs < rank_threshold: # pragma: no cover
- warnings.warn('min_obs is smaller than rank of X matrix')
- enough_observations = self._nobs_raw >= self._min_obs
- enough_time_periods = self._window_time_obs >= self._min_periods
- return enough_time_periods & enough_observations
- def create_ols_dict(attr):
- def attr_getter(self):
- d = {}
- for k, v in compat.iteritems(self.results):
- result = getattr(v, attr)
- d[k] = result
- return d
- return attr_getter
- def create_ols_attr(attr):
- return property(create_ols_dict(attr))
- class NonPooledPanelOLS(object):
- """Implements non-pooled panel OLS.
- Parameters
- ----------
- y : DataFrame
- x : Series, DataFrame, or dict of Series
- intercept : bool
- True if you want an intercept.
- nw_lags : None or int
- Number of Newey-West lags.
- window_type : {'full_sample', 'rolling', 'expanding'}
- 'full_sample' by default
- window : int
- size of window (for rolling/expanding OLS)
- """
- ATTRIBUTES = [
- 'beta',
- 'df',
- 'df_model',
- 'df_resid',
- 'f_stat',
- 'p_value',
- 'r2',
- 'r2_adj',
- 'resid',
- 'rmse',
- 'std_err',
- 'summary_as_matrix',
- 't_stat',
- 'var_beta',
- 'x',
- 'y',
- 'y_fitted',
- 'y_predict'
- ]
- def __init__(self, y, x, window_type='full_sample', window=None,
- min_periods=None, intercept=True, nw_lags=None,
- nw_overlap=False):
- for attr in self.ATTRIBUTES:
- setattr(self.__class__, attr, create_ols_attr(attr))
- results = {}
- for entity in y:
- entity_y = y[entity]
- entity_x = {}
- for x_var in x:
- entity_x[x_var] = x[x_var][entity]
- from pandas.stats.interface import ols
- results[entity] = ols(y=entity_y,
- x=entity_x,
- window_type=window_type,
- window=window,
- min_periods=min_periods,
- intercept=intercept,
- nw_lags=nw_lags,
- nw_overlap=nw_overlap)
- self.results = results
- def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis,
- nw_lags, nobs, df, nw_overlap):
- from pandas.core.frame import group_agg
- xx_inv = math.inv(xx)
- yv = y.values
- if cluster_axis is None:
- if nw_lags is None:
- return xx_inv * (rmse ** 2)
- else:
- resid = yv - np.dot(x.values, beta)
- m = (x.values.T * resid).T
- xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap)
- return np.dot(xx_inv, np.dot(xeps, xx_inv))
- else:
- Xb = np.dot(x.values, beta).reshape((len(x.values), 1))
- resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid'])
- if cluster_axis == 1:
- x = x.swaplevel(0, 1).sortlevel(0)
- resid = resid.swaplevel(0, 1).sortlevel(0)
- m = group_agg(x.values * resid.values, x.index._bounds,
- lambda x: np.sum(x, axis=0))
- if nw_lags is None:
- nw_lags = 0
- xox = 0
- for i in range(len(x.index.levels[0])):
- xox += math.newey_west(m[i: i + 1], nw_lags,
- nobs, df, nw_overlap)
- return np.dot(xx_inv, np.dot(xox, xx_inv))
- def _xx_time_effects(x, y):
- """
- Returns X'X - (X'T) (T'T)^-1 (T'X)
- """
- # X'X
- xx = np.dot(x.values.T, x.values)
- xt = x.sum(level=0).values
- count = y.unstack().count(1).values
- selector = count > 0
- # X'X - (T'T)^-1 (T'X)
- xt = xt[selector]
- count = count[selector]
- return xx - np.dot(xt.T / count, xt)