PageRenderTime 151ms CodeModel.GetById 130ms app.highlight 16ms RepoModel.GetById 1ms app.codeStats 0ms

/statsmodels/sandbox/panel/panelmod.py

http://github.com/statsmodels/statsmodels
Python | 440 lines | 421 code | 6 blank | 13 comment | 0 complexity | b0c077a8e47210cf433b0ad51e86a73d MD5 | raw file
  1"""
  2Sandbox Panel Estimators
  3
  4References
  5-----------
  6
  7Baltagi, Badi H. `Econometric Analysis of Panel Data.` 4th ed. Wiley, 2008.
  8"""
  9from functools import reduce
 10
 11import numpy as np
 12
 13from statsmodels.regression.linear_model import GLS
 14
 15__all__ = ["PanelModel"]
 16
 17from pandas import Panel
 18
 19
 20def group(X):
 21    """
 22    Returns unique numeric values for groups without sorting.
 23
 24    Examples
 25    --------
 26    >>> X = np.array(['a','a','b','c','b','c'])
 27    >>> group(X)
 28    >>> g
 29    array([ 0.,  0.,  1.,  2.,  1.,  2.])
 30    """
 31    uniq_dict = {}
 32    group = np.zeros(len(X))
 33    for i in range(len(X)):
 34        if not X[i] in uniq_dict:
 35            uniq_dict.update({X[i] : len(uniq_dict)})
 36        group[i] = uniq_dict[X[i]]
 37    return group
 38
 39def repanel_cov(groups, sigmas):
 40    '''calculate error covariance matrix for random effects model
 41
 42    Parameters
 43    ----------
 44    groups : ndarray, (nobs, nre) or (nobs,)
 45        array of group/category observations
 46    sigma : ndarray, (nre+1,)
 47        array of standard deviations of random effects,
 48        last element is the standard deviation of the
 49        idiosyncratic error
 50
 51    Returns
 52    -------
 53    omega : ndarray, (nobs, nobs)
 54        covariance matrix of error
 55    omegainv : ndarray, (nobs, nobs)
 56        inverse covariance matrix of error
 57    omegainvsqrt : ndarray, (nobs, nobs)
 58        squareroot inverse covariance matrix of error
 59        such that omega = omegainvsqrt * omegainvsqrt.T
 60
 61    Notes
 62    -----
 63    This does not use sparse matrices and constructs nobs by nobs
 64    matrices. Also, omegainvsqrt is not sparse, i.e. elements are non-zero
 65    '''
 66
 67    if groups.ndim == 1:
 68        groups = groups[:,None]
 69    nobs, nre = groups.shape
 70    omega = sigmas[-1]*np.eye(nobs)
 71    for igr in range(nre):
 72        group = groups[:,igr:igr+1]
 73        groupuniq = np.unique(group)
 74        dummygr = sigmas[igr] * (group == groupuniq).astype(float)
 75        omega +=  np.dot(dummygr, dummygr.T)
 76    ev, evec = np.linalg.eigh(omega)  #eig does not work
 77    omegainv = np.dot(evec, (1/ev * evec).T)
 78    omegainvhalf = evec/np.sqrt(ev)
 79    return omega, omegainv, omegainvhalf
 80
 81
 82
 83class PanelData(Panel):
 84    pass
 85
 86class PanelModel(object):
 87    """
 88    An abstract statistical model class for panel (longitudinal) datasets.
 89
 90    Parameters
 91    ----------
 92    endog : array_like or str
 93        If a pandas object is used then endog should be the name of the
 94        endogenous variable as a string.
 95#    exog
 96#    panel_arr
 97#    time_arr
 98    panel_data : pandas.Panel object
 99
100    Notes
101    -----
102    If a pandas object is supplied it is assumed that the major_axis is time
103    and that the minor_axis has the panel variable.
104    """
105    def __init__(self, endog=None, exog=None, panel=None, time=None,
106            xtnames=None, equation=None, panel_data=None):
107        if panel_data is None:
108#            if endog == None and exog == None and panel == None and \
109#                    time == None:
110#                raise ValueError("If pandel_data is False then endog, exog, \
111#panel_arr, and time_arr cannot be None.")
112            self.initialize(endog, exog, panel, time, xtnames, equation)
113#        elif aspandas != False:
114#            if not isinstance(endog, str):
115#                raise ValueError("If a pandas object is supplied then endog \
116#must be a string containing the name of the endogenous variable")
117#            if not isinstance(aspandas, Panel):
118#                raise ValueError("Only pandas.Panel objects are supported")
119#            self.initialize_pandas(endog, aspandas, panel_name)
120
121
122    def initialize(self, endog, exog, panel, time, xtnames, equation):
123        """
124        Initialize plain array model.
125
126        See PanelModel
127        """
128#TODO: for now, we are going assume a constant, and then make the first
129#panel the base, add a flag for this....
130
131        # get names
132        names = equation.split(" ")
133        self.endog_name = names[0]
134        exog_names = names[1:]  # this makes the order matter in the array
135        self.panel_name = xtnames[0]
136        self.time_name = xtnames[1]
137
138
139        novar = exog.var(0) == 0
140        if True in novar:
141            cons_index = np.where(novar == 1)[0][0] # constant col. num
142            exog_names.insert(cons_index, 'cons')
143
144        self._cons_index = novar # used again in fit_fixed
145        self.exog_names = exog_names
146        self.endog = np.squeeze(np.asarray(endog))
147        exog = np.asarray(exog)
148        self.exog = exog
149        self.panel = np.asarray(panel)
150        self.time = np.asarray(time)
151
152        self.paneluniq = np.unique(panel)
153        self.timeuniq = np.unique(time)
154#TODO: this  structure can possibly be extracted somewhat to deal with
155#names in general
156
157#TODO: add some dimension checks, etc.
158
159#    def initialize_pandas(self, endog, aspandas):
160#        """
161#        Initialize pandas objects.
162#
163#        See PanelModel.
164#        """
165#        self.aspandas = aspandas
166#        endog = aspandas[endog].values
167#        self.endog = np.squeeze(endog)
168#        exog_name = aspandas.columns.tolist()
169#        exog_name.remove(endog)
170#        self.exog = aspandas.filterItems(exog_name).values
171#TODO: can the above be simplified to slice notation?
172#        if panel_name != None:
173#            self.panel_name = panel_name
174#        self.exog_name = exog_name
175#        self.endog_name = endog
176#        self.time_arr = aspandas.major_axis
177        #TODO: is time always handled correctly in fromRecords?
178#        self.panel_arr = aspandas.minor_axis
179#TODO: all of this might need to be refactored to explicitly rely (internally)
180# on the pandas LongPanel structure for speed and convenience.
181# not sure this part is finished...
182
183#TODO: does not conform to new initialize
184    def initialize_pandas(self, panel_data, endog_name, exog_name):
185        self.panel_data = panel_data
186        endog = panel_data[endog_name].values # does this create a copy?
187        self.endog = np.squeeze(endog)
188        if exog_name is None:
189            exog_name = panel_data.columns.tolist()
190            exog_name.remove(endog_name)
191        self.exog = panel_data.filterItems(exog_name).values # copy?
192        self._exog_name = exog_name
193        self._endog_name = endog_name
194        self._timeseries = panel_data.major_axis # might not need these
195        self._panelseries = panel_data.minor_axis
196
197#TODO: this could be pulled out and just have a by kwd that takes
198# the panel or time array
199#TODO: this also needs to be expanded for 'twoway'
200    def _group_mean(self, X, index='oneway', counts=False, dummies=False):
201        """
202        Get group means of X by time or by panel.
203
204        index default is panel
205        """
206        if index == 'oneway':
207            Y = self.panel
208            uniq = self.paneluniq
209        elif index == 'time':
210            Y = self.time
211            uniq = self.timeuniq
212        else:
213            raise ValueError("index %s not understood" % index)
214        print(Y, uniq, uniq[:,None], len(Y), len(uniq), len(uniq[:,None]),
215              index)
216        #TODO: use sparse matrices
217        dummy = (Y == uniq[:,None]).astype(float)
218        if X.ndim > 1:
219            mean = np.dot(dummy,X)/dummy.sum(1)[:,None]
220        else:
221            mean = np.dot(dummy,X)/dummy.sum(1)
222        if counts is False and dummies is False:
223            return mean
224        elif counts is True and dummies is False:
225            return mean, dummy.sum(1)
226        elif counts is True and dummies is True:
227            return mean, dummy.sum(1), dummy
228        elif counts is False and dummies is True:
229            return mean, dummy
230
231#TODO: Use kwd arguments or have fit_method methods?
232    def fit(self, model=None, method=None, effects='oneway'):
233        """
234        method : LSDV, demeaned, MLE, GLS, BE, FE, optional
235        model :
236                between
237                fixed
238                random
239                pooled
240                [gmm]
241        effects :
242                oneway
243                time
244                twoway
245        femethod : demeaned (only one implemented)
246                   WLS
247        remethod :
248                swar -
249                amemiya
250                nerlove
251                walhus
252
253
254        Notes
255        -----
256        This is unfinished.  None of the method arguments work yet.
257        Only oneway effects should work.
258        """
259        if method: # get rid of this with default
260            method = method.lower()
261        model = model.lower()
262        if method and method not in ["lsdv", "demeaned", "mle",
263                                     "gls", "be", "fe"]:
264            # get rid of if method with default
265            raise ValueError("%s not a valid method" % method)
266#        if method == "lsdv":
267#            self.fit_lsdv(model)
268        if model == 'pooled':
269            return GLS(self.endog, self.exog).fit()
270        if model == 'between':
271            return self._fit_btwn(method, effects)
272        if model == 'fixed':
273            return self._fit_fixed(method, effects)
274
275#    def fit_lsdv(self, effects):
276#        """
277#        Fit using least squares dummy variables.
278#
279#        Notes
280#        -----
281#        Should only be used for small `nobs`.
282#        """
283#        pdummies = None
284#        tdummies = None
285
286    def _fit_btwn(self, method, effects):
287        # group mean regression or WLS
288        if effects != "twoway":
289            endog = self._group_mean(self.endog, index=effects)
290            exog = self._group_mean(self.exog, index=effects)
291        else:
292            raise ValueError("%s effects is not valid for the between "
293                             "estimator" % effects)
294        befit = GLS(endog, exog).fit()
295        return befit
296
297    def _fit_fixed(self, method, effects):
298        endog = self.endog
299        exog = self.exog
300        demeantwice = False
301        if effects in ["oneway","twoways"]:
302            if effects == "twoways":
303                demeantwice = True
304                effects = "oneway"
305            endog_mean, counts = self._group_mean(endog, index=effects,
306                counts=True)
307            exog_mean = self._group_mean(exog, index=effects)
308            counts = counts.astype(int)
309            endog = endog - np.repeat(endog_mean, counts)
310            exog = exog - np.repeat(exog_mean, counts, axis=0)
311        if demeantwice or effects == "time":
312            endog_mean, dummies = self._group_mean(endog, index="time",
313                dummies=True)
314            exog_mean = self._group_mean(exog, index="time")
315            # This allows unbalanced panels
316            endog = endog - np.dot(endog_mean, dummies)
317            exog = exog - np.dot(dummies.T, exog_mean)
318        fefit = GLS(endog, exog[:,-self._cons_index]).fit()
319#TODO: might fail with one regressor
320        return fefit
321
322
323
324
325class SURPanel(PanelModel):
326    pass
327
328class SEMPanel(PanelModel):
329    pass
330
331class DynamicPanel(PanelModel):
332    pass
333
334if __name__ == "__main__":
335    import pandas
336    from pandas import Panel
337    import statsmodels.api as sm
338    import numpy.lib.recfunctions as nprf
339
340    data = sm.datasets.grunfeld.load(as_pandas=False)
341    # Baltagi does not include American Steel
342    endog = data.endog[:-20]
343    fullexog = data.exog[:-20]
344#    fullexog.sort(order=['firm','year'])
345    panel_arr = nprf.append_fields(fullexog, 'investment', endog, float,
346            usemask=False)
347
348    panel_df = pandas.DataFrame(panel_arr)
349    panel_panda = panel_df.set_index(['year', 'firm']).to_panel()
350
351
352    # the most cumbersome way of doing it as far as preprocessing by hand
353    exog = fullexog[['value','capital']].view(float).reshape(-1,2)
354    exog = sm.add_constant(exog, prepend=False)
355    panel = group(fullexog['firm'])
356    year = fullexog['year']
357    panel_mod = PanelModel(endog, exog, panel, year, xtnames=['firm','year'],
358            equation='invest value capital')
359# note that equation does not actually do anything but name the variables
360    panel_ols = panel_mod.fit(model='pooled')
361
362    panel_be = panel_mod.fit(model='between', effects='oneway')
363    panel_fe = panel_mod.fit(model='fixed', effects='oneway')
364
365    panel_bet = panel_mod.fit(model='between', effects='time')
366    panel_fet = panel_mod.fit(model='fixed', effects='time')
367
368    panel_fe2 = panel_mod.fit(model='fixed', effects='twoways')
369
370
371#see also Baltagi (3rd edt) 3.3 THE RANDOM EFFECTS MODEL p.35
372#for explicit formulas for spectral decomposition
373#but this works also for unbalanced panel
374#
375#I also just saw: 9.4.2 The Random Effects Model p.176 which is
376#partially almost the same as I did
377#
378#this needs to use sparse matrices for larger datasets
379#
380#"""
381#
382#import numpy as np
383#
384
385    groups = np.array([0,0,0,1,1,2,2,2])
386    nobs = groups.shape[0]
387    groupuniq = np.unique(groups)
388    periods = np.array([0,1,2,1,2,0,1,2])
389    perioduniq = np.unique(periods)
390
391    dummygr = (groups[:,None] == groupuniq).astype(float)
392    dummype = (periods[:,None] == perioduniq).astype(float)
393
394    sigma = 1.
395    sigmagr = np.sqrt(2.)
396    sigmape = np.sqrt(3.)
397
398    #dummyall = np.c_[sigma*np.ones((nobs,1)), sigmagr*dummygr,
399    #                                           sigmape*dummype]
400    #exclude constant ?
401    dummyall = np.c_[sigmagr*dummygr, sigmape*dummype]
402    # omega is the error variance-covariance matrix for the stacked
403    # observations
404    omega = np.dot(dummyall, dummyall.T) + sigma* np.eye(nobs)
405    print(omega)
406    print(np.linalg.cholesky(omega))
407    ev, evec = np.linalg.eigh(omega)  #eig does not work
408    omegainv = np.dot(evec, (1/ev * evec).T)
409    omegainv2 = np.linalg.inv(omega)
410    omegacomp = np.dot(evec, (ev * evec).T)
411    print(np.max(np.abs(omegacomp - omega)))
412    #check
413    #print(np.dot(omegainv,omega)
414    print(np.max(np.abs(np.dot(omegainv,omega) - np.eye(nobs))))
415    omegainvhalf = evec/np.sqrt(ev)  #not sure whether ev should not be column
416    print(np.max(np.abs(np.dot(omegainvhalf,omegainvhalf.T) - omegainv)))
417
418    # now we can use omegainvhalf in GLS (instead of the cholesky)
419
420
421
422
423
424
425
426
427    sigmas2 = np.array([sigmagr, sigmape, sigma])
428    groups2 = np.column_stack((groups, periods))
429    omega_, omegainv_, omegainvhalf_ = repanel_cov(groups2, sigmas2)
430    print(np.max(np.abs(omega_ - omega)))
431    print(np.max(np.abs(omegainv_ - omegainv)))
432    print(np.max(np.abs(omegainvhalf_ - omegainvhalf)))
433
434    # notation Baltagi (3rd) section 9.4.1 (Fixed Effects Model)
435    Pgr = reduce(np.dot,[dummygr,
436            np.linalg.inv(np.dot(dummygr.T, dummygr)),dummygr.T])
437    Qgr = np.eye(nobs) - Pgr
438    # within group effect: np.dot(Qgr, groups)
439    # but this is not memory efficient, compared to groupstats
440    print(np.max(np.abs(np.dot(Qgr, groups))))