PageRenderTime 53ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/statsmodels/sandbox/panel/panelmod.py

http://github.com/statsmodels/statsmodels
Python | 441 lines | 422 code | 6 blank | 13 comment | 0 complexity | 5e7243f3be1074756f88f57f7486f7bf MD5 | raw file
Possible License(s): BSD-3-Clause
  1. """
  2. Sandbox Panel Estimators
  3. References
  4. -----------
  5. Baltagi, Badi H. `Econometric Analysis of Panel Data.` 4th ed. Wiley, 2008.
  6. """
  7. from functools import reduce
  8. import numpy as np
  9. from statsmodels.regression.linear_model import GLS
  10. __all__ = ["PanelModel"]
  11. from pandas import Panel
  12. def group(X):
  13. """
  14. Returns unique numeric values for groups without sorting.
  15. Examples
  16. --------
  17. >>> X = np.array(['a','a','b','c','b','c'])
  18. >>> group(X)
  19. >>> g
  20. array([ 0., 0., 1., 2., 1., 2.])
  21. """
  22. uniq_dict = {}
  23. group = np.zeros(len(X))
  24. for i in range(len(X)):
  25. if not X[i] in uniq_dict:
  26. uniq_dict.update({X[i] : len(uniq_dict)})
  27. group[i] = uniq_dict[X[i]]
  28. return group
  29. def repanel_cov(groups, sigmas):
  30. '''calculate error covariance matrix for random effects model
  31. Parameters
  32. ----------
  33. groups : ndarray, (nobs, nre) or (nobs,)
  34. array of group/category observations
  35. sigma : ndarray, (nre+1,)
  36. array of standard deviations of random effects,
  37. last element is the standard deviation of the
  38. idiosyncratic error
  39. Returns
  40. -------
  41. omega : ndarray, (nobs, nobs)
  42. covariance matrix of error
  43. omegainv : ndarray, (nobs, nobs)
  44. inverse covariance matrix of error
  45. omegainvsqrt : ndarray, (nobs, nobs)
  46. squareroot inverse covariance matrix of error
  47. such that omega = omegainvsqrt * omegainvsqrt.T
  48. Notes
  49. -----
  50. This does not use sparse matrices and constructs nobs by nobs
  51. matrices. Also, omegainvsqrt is not sparse, i.e. elements are non-zero
  52. '''
  53. if groups.ndim == 1:
  54. groups = groups[:,None]
  55. nobs, nre = groups.shape
  56. omega = sigmas[-1]*np.eye(nobs)
  57. for igr in range(nre):
  58. group = groups[:,igr:igr+1]
  59. groupuniq = np.unique(group)
  60. dummygr = sigmas[igr] * (group == groupuniq).astype(float)
  61. omega += np.dot(dummygr, dummygr.T)
  62. ev, evec = np.linalg.eigh(omega) #eig does not work
  63. omegainv = np.dot(evec, (1/ev * evec).T)
  64. omegainvhalf = evec/np.sqrt(ev)
  65. return omega, omegainv, omegainvhalf
  66. class PanelData(Panel):
  67. pass
  68. class PanelModel(object):
  69. """
  70. An abstract statistical model class for panel (longitudinal) datasets.
  71. Parameters
  72. ----------
  73. endog : array_like or str
  74. If a pandas object is used then endog should be the name of the
  75. endogenous variable as a string.
  76. # exog
  77. # panel_arr
  78. # time_arr
  79. panel_data : pandas.Panel object
  80. Notes
  81. -----
  82. If a pandas object is supplied it is assumed that the major_axis is time
  83. and that the minor_axis has the panel variable.
  84. """
  85. def __init__(self, endog=None, exog=None, panel=None, time=None,
  86. xtnames=None, equation=None, panel_data=None):
  87. if panel_data is None:
  88. # if endog == None and exog == None and panel == None and \
  89. # time == None:
  90. # raise ValueError("If pandel_data is False then endog, exog, \
  91. #panel_arr, and time_arr cannot be None.")
  92. self.initialize(endog, exog, panel, time, xtnames, equation)
  93. # elif aspandas != False:
  94. # if not isinstance(endog, str):
  95. # raise ValueError("If a pandas object is supplied then endog \
  96. #must be a string containing the name of the endogenous variable")
  97. # if not isinstance(aspandas, Panel):
  98. # raise ValueError("Only pandas.Panel objects are supported")
  99. # self.initialize_pandas(endog, aspandas, panel_name)
  100. def initialize(self, endog, exog, panel, time, xtnames, equation):
  101. """
  102. Initialize plain array model.
  103. See PanelModel
  104. """
  105. #TODO: for now, we are going assume a constant, and then make the first
  106. #panel the base, add a flag for this....
  107. # get names
  108. names = equation.split(" ")
  109. self.endog_name = names[0]
  110. exog_names = names[1:] # this makes the order matter in the array
  111. self.panel_name = xtnames[0]
  112. self.time_name = xtnames[1]
  113. novar = exog.var(0) == 0
  114. if True in novar:
  115. cons_index = np.where(novar == 1)[0][0] # constant col. num
  116. exog_names.insert(cons_index, 'cons')
  117. self._cons_index = novar # used again in fit_fixed
  118. self.exog_names = exog_names
  119. self.endog = np.squeeze(np.asarray(endog))
  120. exog = np.asarray(exog)
  121. self.exog = exog
  122. self.panel = np.asarray(panel)
  123. self.time = np.asarray(time)
  124. self.paneluniq = np.unique(panel)
  125. self.timeuniq = np.unique(time)
  126. #TODO: this structure can possibly be extracted somewhat to deal with
  127. #names in general
  128. #TODO: add some dimension checks, etc.
  129. # def initialize_pandas(self, endog, aspandas):
  130. # """
  131. # Initialize pandas objects.
  132. #
  133. # See PanelModel.
  134. # """
  135. # self.aspandas = aspandas
  136. # endog = aspandas[endog].values
  137. # self.endog = np.squeeze(endog)
  138. # exog_name = aspandas.columns.tolist()
  139. # exog_name.remove(endog)
  140. # self.exog = aspandas.filterItems(exog_name).values
  141. #TODO: can the above be simplified to slice notation?
  142. # if panel_name != None:
  143. # self.panel_name = panel_name
  144. # self.exog_name = exog_name
  145. # self.endog_name = endog
  146. # self.time_arr = aspandas.major_axis
  147. #TODO: is time always handled correctly in fromRecords?
  148. # self.panel_arr = aspandas.minor_axis
  149. #TODO: all of this might need to be refactored to explicitly rely (internally)
  150. # on the pandas LongPanel structure for speed and convenience.
  151. # not sure this part is finished...
  152. #TODO: does not conform to new initialize
  153. def initialize_pandas(self, panel_data, endog_name, exog_name):
  154. self.panel_data = panel_data
  155. endog = panel_data[endog_name].values # does this create a copy?
  156. self.endog = np.squeeze(endog)
  157. if exog_name is None:
  158. exog_name = panel_data.columns.tolist()
  159. exog_name.remove(endog_name)
  160. self.exog = panel_data.filterItems(exog_name).values # copy?
  161. self._exog_name = exog_name
  162. self._endog_name = endog_name
  163. self._timeseries = panel_data.major_axis # might not need these
  164. self._panelseries = panel_data.minor_axis
  165. #TODO: this could be pulled out and just have a by kwd that takes
  166. # the panel or time array
  167. #TODO: this also needs to be expanded for 'twoway'
  168. def _group_mean(self, X, index='oneway', counts=False, dummies=False):
  169. """
  170. Get group means of X by time or by panel.
  171. index default is panel
  172. """
  173. if index == 'oneway':
  174. Y = self.panel
  175. uniq = self.paneluniq
  176. elif index == 'time':
  177. Y = self.time
  178. uniq = self.timeuniq
  179. else:
  180. raise ValueError("index %s not understood" % index)
  181. print(Y, uniq, uniq[:,None], len(Y), len(uniq), len(uniq[:,None]),
  182. index)
  183. #TODO: use sparse matrices
  184. dummy = (Y == uniq[:,None]).astype(float)
  185. if X.ndim > 1:
  186. mean = np.dot(dummy,X)/dummy.sum(1)[:,None]
  187. else:
  188. mean = np.dot(dummy,X)/dummy.sum(1)
  189. if counts is False and dummies is False:
  190. return mean
  191. elif counts is True and dummies is False:
  192. return mean, dummy.sum(1)
  193. elif counts is True and dummies is True:
  194. return mean, dummy.sum(1), dummy
  195. elif counts is False and dummies is True:
  196. return mean, dummy
  197. #TODO: Use kwd arguments or have fit_method methods?
  198. def fit(self, model=None, method=None, effects='oneway'):
  199. """
  200. method : LSDV, demeaned, MLE, GLS, BE, FE, optional
  201. model :
  202. between
  203. fixed
  204. random
  205. pooled
  206. [gmm]
  207. effects :
  208. oneway
  209. time
  210. twoway
  211. femethod : demeaned (only one implemented)
  212. WLS
  213. remethod :
  214. swar -
  215. amemiya
  216. nerlove
  217. walhus
  218. Notes
  219. -----
  220. This is unfinished. None of the method arguments work yet.
  221. Only oneway effects should work.
  222. """
  223. if method: # get rid of this with default
  224. method = method.lower()
  225. model = model.lower()
  226. if method and method not in ["lsdv", "demeaned", "mle",
  227. "gls", "be", "fe"]:
  228. # get rid of if method with default
  229. raise ValueError("%s not a valid method" % method)
  230. # if method == "lsdv":
  231. # self.fit_lsdv(model)
  232. if model == 'pooled':
  233. return GLS(self.endog, self.exog).fit()
  234. if model == 'between':
  235. return self._fit_btwn(method, effects)
  236. if model == 'fixed':
  237. return self._fit_fixed(method, effects)
  238. # def fit_lsdv(self, effects):
  239. # """
  240. # Fit using least squares dummy variables.
  241. #
  242. # Notes
  243. # -----
  244. # Should only be used for small `nobs`.
  245. # """
  246. # pdummies = None
  247. # tdummies = None
  248. def _fit_btwn(self, method, effects):
  249. # group mean regression or WLS
  250. if effects != "twoway":
  251. endog = self._group_mean(self.endog, index=effects)
  252. exog = self._group_mean(self.exog, index=effects)
  253. else:
  254. raise ValueError("%s effects is not valid for the between "
  255. "estimator" % effects)
  256. befit = GLS(endog, exog).fit()
  257. return befit
  258. def _fit_fixed(self, method, effects):
  259. endog = self.endog
  260. exog = self.exog
  261. demeantwice = False
  262. if effects in ["oneway","twoways"]:
  263. if effects == "twoways":
  264. demeantwice = True
  265. effects = "oneway"
  266. endog_mean, counts = self._group_mean(endog, index=effects,
  267. counts=True)
  268. exog_mean = self._group_mean(exog, index=effects)
  269. counts = counts.astype(int)
  270. endog = endog - np.repeat(endog_mean, counts)
  271. exog = exog - np.repeat(exog_mean, counts, axis=0)
  272. if demeantwice or effects == "time":
  273. endog_mean, dummies = self._group_mean(endog, index="time",
  274. dummies=True)
  275. exog_mean = self._group_mean(exog, index="time")
  276. # This allows unbalanced panels
  277. endog = endog - np.dot(endog_mean, dummies)
  278. exog = exog - np.dot(dummies.T, exog_mean)
  279. fefit = GLS(endog, exog[:,-self._cons_index]).fit()
  280. #TODO: might fail with one regressor
  281. return fefit
  282. class SURPanel(PanelModel):
  283. pass
  284. class SEMPanel(PanelModel):
  285. pass
  286. class DynamicPanel(PanelModel):
  287. pass
  288. if __name__ == "__main__":
  289. import numpy.lib.recfunctions as nprf
  290. import pandas
  291. from pandas import Panel
  292. import statsmodels.api as sm
  293. data = sm.datasets.grunfeld.load()
  294. # Baltagi does not include American Steel
  295. endog = data.endog[:-20]
  296. fullexog = data.exog[:-20]
  297. # fullexog.sort(order=['firm','year'])
  298. panel_arr = nprf.append_fields(fullexog, 'investment', endog, float,
  299. usemask=False)
  300. panel_df = pandas.DataFrame(panel_arr)
  301. panel_panda = panel_df.set_index(['year', 'firm']).to_panel()
  302. # the most cumbersome way of doing it as far as preprocessing by hand
  303. exog = fullexog[['value','capital']].view(float).reshape(-1,2)
  304. exog = sm.add_constant(exog, prepend=False)
  305. panel = group(fullexog['firm'])
  306. year = fullexog['year']
  307. panel_mod = PanelModel(endog, exog, panel, year, xtnames=['firm','year'],
  308. equation='invest value capital')
  309. # note that equation does not actually do anything but name the variables
  310. panel_ols = panel_mod.fit(model='pooled')
  311. panel_be = panel_mod.fit(model='between', effects='oneway')
  312. panel_fe = panel_mod.fit(model='fixed', effects='oneway')
  313. panel_bet = panel_mod.fit(model='between', effects='time')
  314. panel_fet = panel_mod.fit(model='fixed', effects='time')
  315. panel_fe2 = panel_mod.fit(model='fixed', effects='twoways')
  316. #see also Baltagi (3rd edt) 3.3 THE RANDOM EFFECTS MODEL p.35
  317. #for explicit formulas for spectral decomposition
  318. #but this works also for unbalanced panel
  319. #
  320. #I also just saw: 9.4.2 The Random Effects Model p.176 which is
  321. #partially almost the same as I did
  322. #
  323. #this needs to use sparse matrices for larger datasets
  324. #
  325. #"""
  326. #
  327. #import numpy as np
  328. #
  329. groups = np.array([0,0,0,1,1,2,2,2])
  330. nobs = groups.shape[0]
  331. groupuniq = np.unique(groups)
  332. periods = np.array([0,1,2,1,2,0,1,2])
  333. perioduniq = np.unique(periods)
  334. dummygr = (groups[:,None] == groupuniq).astype(float)
  335. dummype = (periods[:,None] == perioduniq).astype(float)
  336. sigma = 1.
  337. sigmagr = np.sqrt(2.)
  338. sigmape = np.sqrt(3.)
  339. #dummyall = np.c_[sigma*np.ones((nobs,1)), sigmagr*dummygr,
  340. # sigmape*dummype]
  341. #exclude constant ?
  342. dummyall = np.c_[sigmagr*dummygr, sigmape*dummype]
  343. # omega is the error variance-covariance matrix for the stacked
  344. # observations
  345. omega = np.dot(dummyall, dummyall.T) + sigma* np.eye(nobs)
  346. print(omega)
  347. print(np.linalg.cholesky(omega))
  348. ev, evec = np.linalg.eigh(omega) #eig does not work
  349. omegainv = np.dot(evec, (1/ev * evec).T)
  350. omegainv2 = np.linalg.inv(omega)
  351. omegacomp = np.dot(evec, (ev * evec).T)
  352. print(np.max(np.abs(omegacomp - omega)))
  353. #check
  354. #print(np.dot(omegainv,omega)
  355. print(np.max(np.abs(np.dot(omegainv,omega) - np.eye(nobs))))
  356. omegainvhalf = evec/np.sqrt(ev) #not sure whether ev should not be column
  357. print(np.max(np.abs(np.dot(omegainvhalf,omegainvhalf.T) - omegainv)))
  358. # now we can use omegainvhalf in GLS (instead of the cholesky)
  359. sigmas2 = np.array([sigmagr, sigmape, sigma])
  360. groups2 = np.column_stack((groups, periods))
  361. omega_, omegainv_, omegainvhalf_ = repanel_cov(groups2, sigmas2)
  362. print(np.max(np.abs(omega_ - omega)))
  363. print(np.max(np.abs(omegainv_ - omegainv)))
  364. print(np.max(np.abs(omegainvhalf_ - omegainvhalf)))
  365. # notation Baltagi (3rd) section 9.4.1 (Fixed Effects Model)
  366. Pgr = reduce(np.dot,[dummygr,
  367. np.linalg.inv(np.dot(dummygr.T, dummygr)),dummygr.T])
  368. Qgr = np.eye(nobs) - Pgr
  369. # within group effect: np.dot(Qgr, groups)
  370. # but this is not memory efficient, compared to groupstats
  371. print(np.max(np.abs(np.dot(Qgr, groups))))