/statsmodels/sandbox/panel/panelmod.py
Python | 441 lines | 422 code | 6 blank | 13 comment | 0 complexity | 5e7243f3be1074756f88f57f7486f7bf MD5 | raw file
Possible License(s): BSD-3-Clause
- """
- Sandbox Panel Estimators
- References
- -----------
- Baltagi, Badi H. `Econometric Analysis of Panel Data.` 4th ed. Wiley, 2008.
- """
- from functools import reduce
- import numpy as np
- from statsmodels.regression.linear_model import GLS
- __all__ = ["PanelModel"]
- from pandas import Panel
- def group(X):
- """
- Returns unique numeric values for groups without sorting.
- Examples
- --------
- >>> X = np.array(['a','a','b','c','b','c'])
- >>> group(X)
- >>> g
- array([ 0., 0., 1., 2., 1., 2.])
- """
- uniq_dict = {}
- group = np.zeros(len(X))
- for i in range(len(X)):
- if not X[i] in uniq_dict:
- uniq_dict.update({X[i] : len(uniq_dict)})
- group[i] = uniq_dict[X[i]]
- return group
- def repanel_cov(groups, sigmas):
- '''calculate error covariance matrix for random effects model
- Parameters
- ----------
- groups : ndarray, (nobs, nre) or (nobs,)
- array of group/category observations
- sigma : ndarray, (nre+1,)
- array of standard deviations of random effects,
- last element is the standard deviation of the
- idiosyncratic error
- Returns
- -------
- omega : ndarray, (nobs, nobs)
- covariance matrix of error
- omegainv : ndarray, (nobs, nobs)
- inverse covariance matrix of error
- omegainvsqrt : ndarray, (nobs, nobs)
- squareroot inverse covariance matrix of error
- such that omega = omegainvsqrt * omegainvsqrt.T
- Notes
- -----
- This does not use sparse matrices and constructs nobs by nobs
- matrices. Also, omegainvsqrt is not sparse, i.e. elements are non-zero
- '''
- if groups.ndim == 1:
- groups = groups[:,None]
- nobs, nre = groups.shape
- omega = sigmas[-1]*np.eye(nobs)
- for igr in range(nre):
- group = groups[:,igr:igr+1]
- groupuniq = np.unique(group)
- dummygr = sigmas[igr] * (group == groupuniq).astype(float)
- omega += np.dot(dummygr, dummygr.T)
- ev, evec = np.linalg.eigh(omega) #eig does not work
- omegainv = np.dot(evec, (1/ev * evec).T)
- omegainvhalf = evec/np.sqrt(ev)
- return omega, omegainv, omegainvhalf
- class PanelData(Panel):
- pass
- class PanelModel(object):
- """
- An abstract statistical model class for panel (longitudinal) datasets.
- Parameters
- ----------
- endog : array_like or str
- If a pandas object is used then endog should be the name of the
- endogenous variable as a string.
- # exog
- # panel_arr
- # time_arr
- panel_data : pandas.Panel object
- Notes
- -----
- If a pandas object is supplied it is assumed that the major_axis is time
- and that the minor_axis has the panel variable.
- """
- def __init__(self, endog=None, exog=None, panel=None, time=None,
- xtnames=None, equation=None, panel_data=None):
- if panel_data is None:
- # if endog == None and exog == None and panel == None and \
- # time == None:
- # raise ValueError("If pandel_data is False then endog, exog, \
- #panel_arr, and time_arr cannot be None.")
- self.initialize(endog, exog, panel, time, xtnames, equation)
- # elif aspandas != False:
- # if not isinstance(endog, str):
- # raise ValueError("If a pandas object is supplied then endog \
- #must be a string containing the name of the endogenous variable")
- # if not isinstance(aspandas, Panel):
- # raise ValueError("Only pandas.Panel objects are supported")
- # self.initialize_pandas(endog, aspandas, panel_name)
- def initialize(self, endog, exog, panel, time, xtnames, equation):
- """
- Initialize plain array model.
- See PanelModel
- """
- #TODO: for now, we are going assume a constant, and then make the first
- #panel the base, add a flag for this....
- # get names
- names = equation.split(" ")
- self.endog_name = names[0]
- exog_names = names[1:] # this makes the order matter in the array
- self.panel_name = xtnames[0]
- self.time_name = xtnames[1]
- novar = exog.var(0) == 0
- if True in novar:
- cons_index = np.where(novar == 1)[0][0] # constant col. num
- exog_names.insert(cons_index, 'cons')
- self._cons_index = novar # used again in fit_fixed
- self.exog_names = exog_names
- self.endog = np.squeeze(np.asarray(endog))
- exog = np.asarray(exog)
- self.exog = exog
- self.panel = np.asarray(panel)
- self.time = np.asarray(time)
- self.paneluniq = np.unique(panel)
- self.timeuniq = np.unique(time)
- #TODO: this structure can possibly be extracted somewhat to deal with
- #names in general
- #TODO: add some dimension checks, etc.
- # def initialize_pandas(self, endog, aspandas):
- # """
- # Initialize pandas objects.
- #
- # See PanelModel.
- # """
- # self.aspandas = aspandas
- # endog = aspandas[endog].values
- # self.endog = np.squeeze(endog)
- # exog_name = aspandas.columns.tolist()
- # exog_name.remove(endog)
- # self.exog = aspandas.filterItems(exog_name).values
- #TODO: can the above be simplified to slice notation?
- # if panel_name != None:
- # self.panel_name = panel_name
- # self.exog_name = exog_name
- # self.endog_name = endog
- # self.time_arr = aspandas.major_axis
- #TODO: is time always handled correctly in fromRecords?
- # self.panel_arr = aspandas.minor_axis
- #TODO: all of this might need to be refactored to explicitly rely (internally)
- # on the pandas LongPanel structure for speed and convenience.
- # not sure this part is finished...
- #TODO: does not conform to new initialize
- def initialize_pandas(self, panel_data, endog_name, exog_name):
- self.panel_data = panel_data
- endog = panel_data[endog_name].values # does this create a copy?
- self.endog = np.squeeze(endog)
- if exog_name is None:
- exog_name = panel_data.columns.tolist()
- exog_name.remove(endog_name)
- self.exog = panel_data.filterItems(exog_name).values # copy?
- self._exog_name = exog_name
- self._endog_name = endog_name
- self._timeseries = panel_data.major_axis # might not need these
- self._panelseries = panel_data.minor_axis
- #TODO: this could be pulled out and just have a by kwd that takes
- # the panel or time array
- #TODO: this also needs to be expanded for 'twoway'
- def _group_mean(self, X, index='oneway', counts=False, dummies=False):
- """
- Get group means of X by time or by panel.
- index default is panel
- """
- if index == 'oneway':
- Y = self.panel
- uniq = self.paneluniq
- elif index == 'time':
- Y = self.time
- uniq = self.timeuniq
- else:
- raise ValueError("index %s not understood" % index)
- print(Y, uniq, uniq[:,None], len(Y), len(uniq), len(uniq[:,None]),
- index)
- #TODO: use sparse matrices
- dummy = (Y == uniq[:,None]).astype(float)
- if X.ndim > 1:
- mean = np.dot(dummy,X)/dummy.sum(1)[:,None]
- else:
- mean = np.dot(dummy,X)/dummy.sum(1)
- if counts is False and dummies is False:
- return mean
- elif counts is True and dummies is False:
- return mean, dummy.sum(1)
- elif counts is True and dummies is True:
- return mean, dummy.sum(1), dummy
- elif counts is False and dummies is True:
- return mean, dummy
- #TODO: Use kwd arguments or have fit_method methods?
- def fit(self, model=None, method=None, effects='oneway'):
- """
- method : LSDV, demeaned, MLE, GLS, BE, FE, optional
- model :
- between
- fixed
- random
- pooled
- [gmm]
- effects :
- oneway
- time
- twoway
- femethod : demeaned (only one implemented)
- WLS
- remethod :
- swar -
- amemiya
- nerlove
- walhus
- Notes
- -----
- This is unfinished. None of the method arguments work yet.
- Only oneway effects should work.
- """
- if method: # get rid of this with default
- method = method.lower()
- model = model.lower()
- if method and method not in ["lsdv", "demeaned", "mle",
- "gls", "be", "fe"]:
- # get rid of if method with default
- raise ValueError("%s not a valid method" % method)
- # if method == "lsdv":
- # self.fit_lsdv(model)
- if model == 'pooled':
- return GLS(self.endog, self.exog).fit()
- if model == 'between':
- return self._fit_btwn(method, effects)
- if model == 'fixed':
- return self._fit_fixed(method, effects)
- # def fit_lsdv(self, effects):
- # """
- # Fit using least squares dummy variables.
- #
- # Notes
- # -----
- # Should only be used for small `nobs`.
- # """
- # pdummies = None
- # tdummies = None
- def _fit_btwn(self, method, effects):
- # group mean regression or WLS
- if effects != "twoway":
- endog = self._group_mean(self.endog, index=effects)
- exog = self._group_mean(self.exog, index=effects)
- else:
- raise ValueError("%s effects is not valid for the between "
- "estimator" % effects)
- befit = GLS(endog, exog).fit()
- return befit
- def _fit_fixed(self, method, effects):
- endog = self.endog
- exog = self.exog
- demeantwice = False
- if effects in ["oneway","twoways"]:
- if effects == "twoways":
- demeantwice = True
- effects = "oneway"
- endog_mean, counts = self._group_mean(endog, index=effects,
- counts=True)
- exog_mean = self._group_mean(exog, index=effects)
- counts = counts.astype(int)
- endog = endog - np.repeat(endog_mean, counts)
- exog = exog - np.repeat(exog_mean, counts, axis=0)
- if demeantwice or effects == "time":
- endog_mean, dummies = self._group_mean(endog, index="time",
- dummies=True)
- exog_mean = self._group_mean(exog, index="time")
- # This allows unbalanced panels
- endog = endog - np.dot(endog_mean, dummies)
- exog = exog - np.dot(dummies.T, exog_mean)
- fefit = GLS(endog, exog[:,-self._cons_index]).fit()
- #TODO: might fail with one regressor
- return fefit
- class SURPanel(PanelModel):
- pass
- class SEMPanel(PanelModel):
- pass
- class DynamicPanel(PanelModel):
- pass
- if __name__ == "__main__":
- import numpy.lib.recfunctions as nprf
- import pandas
- from pandas import Panel
- import statsmodels.api as sm
- data = sm.datasets.grunfeld.load()
- # Baltagi does not include American Steel
- endog = data.endog[:-20]
- fullexog = data.exog[:-20]
- # fullexog.sort(order=['firm','year'])
- panel_arr = nprf.append_fields(fullexog, 'investment', endog, float,
- usemask=False)
- panel_df = pandas.DataFrame(panel_arr)
- panel_panda = panel_df.set_index(['year', 'firm']).to_panel()
- # the most cumbersome way of doing it as far as preprocessing by hand
- exog = fullexog[['value','capital']].view(float).reshape(-1,2)
- exog = sm.add_constant(exog, prepend=False)
- panel = group(fullexog['firm'])
- year = fullexog['year']
- panel_mod = PanelModel(endog, exog, panel, year, xtnames=['firm','year'],
- equation='invest value capital')
- # note that equation does not actually do anything but name the variables
- panel_ols = panel_mod.fit(model='pooled')
- panel_be = panel_mod.fit(model='between', effects='oneway')
- panel_fe = panel_mod.fit(model='fixed', effects='oneway')
- panel_bet = panel_mod.fit(model='between', effects='time')
- panel_fet = panel_mod.fit(model='fixed', effects='time')
- panel_fe2 = panel_mod.fit(model='fixed', effects='twoways')
- #see also Baltagi (3rd edt) 3.3 THE RANDOM EFFECTS MODEL p.35
- #for explicit formulas for spectral decomposition
- #but this works also for unbalanced panel
- #
- #I also just saw: 9.4.2 The Random Effects Model p.176 which is
- #partially almost the same as I did
- #
- #this needs to use sparse matrices for larger datasets
- #
- #"""
- #
- #import numpy as np
- #
- groups = np.array([0,0,0,1,1,2,2,2])
- nobs = groups.shape[0]
- groupuniq = np.unique(groups)
- periods = np.array([0,1,2,1,2,0,1,2])
- perioduniq = np.unique(periods)
- dummygr = (groups[:,None] == groupuniq).astype(float)
- dummype = (periods[:,None] == perioduniq).astype(float)
- sigma = 1.
- sigmagr = np.sqrt(2.)
- sigmape = np.sqrt(3.)
- #dummyall = np.c_[sigma*np.ones((nobs,1)), sigmagr*dummygr,
- # sigmape*dummype]
- #exclude constant ?
- dummyall = np.c_[sigmagr*dummygr, sigmape*dummype]
- # omega is the error variance-covariance matrix for the stacked
- # observations
- omega = np.dot(dummyall, dummyall.T) + sigma* np.eye(nobs)
- print(omega)
- print(np.linalg.cholesky(omega))
- ev, evec = np.linalg.eigh(omega) #eig does not work
- omegainv = np.dot(evec, (1/ev * evec).T)
- omegainv2 = np.linalg.inv(omega)
- omegacomp = np.dot(evec, (ev * evec).T)
- print(np.max(np.abs(omegacomp - omega)))
- #check
- #print(np.dot(omegainv,omega)
- print(np.max(np.abs(np.dot(omegainv,omega) - np.eye(nobs))))
- omegainvhalf = evec/np.sqrt(ev) #not sure whether ev should not be column
- print(np.max(np.abs(np.dot(omegainvhalf,omegainvhalf.T) - omegainv)))
- # now we can use omegainvhalf in GLS (instead of the cholesky)
- sigmas2 = np.array([sigmagr, sigmape, sigma])
- groups2 = np.column_stack((groups, periods))
- omega_, omegainv_, omegainvhalf_ = repanel_cov(groups2, sigmas2)
- print(np.max(np.abs(omega_ - omega)))
- print(np.max(np.abs(omegainv_ - omegainv)))
- print(np.max(np.abs(omegainvhalf_ - omegainvhalf)))
- # notation Baltagi (3rd) section 9.4.1 (Fixed Effects Model)
- Pgr = reduce(np.dot,[dummygr,
- np.linalg.inv(np.dot(dummygr.T, dummygr)),dummygr.T])
- Qgr = np.eye(nobs) - Pgr
- # within group effect: np.dot(Qgr, groups)
- # but this is not memory efficient, compared to groupstats
- print(np.max(np.abs(np.dot(Qgr, groups))))