PageRenderTime 30ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/stats/plm.py

http://github.com/wesm/pandas
Python | 863 lines | 654 code | 119 blank | 90 comment | 60 complexity | 495cb1035c51ce4c912e5f301ec74ced MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. Linear regression objects for panel data
  3. """
  4. # pylint: disable-msg=W0231
  5. # pylint: disable-msg=E1101,E1103
  6. # flake8: noqa
  7. from __future__ import division
  8. from pandas.compat import range
  9. from pandas import compat
  10. import warnings
  11. import numpy as np
  12. from pandas.core.panel import Panel
  13. from pandas.core.frame import DataFrame
  14. from pandas.core.reshape import get_dummies
  15. from pandas.core.series import Series
  16. from pandas.stats.ols import OLS, MovingOLS
  17. import pandas.stats.common as com
  18. import pandas.stats.math as math
  19. from pandas.util.decorators import cache_readonly
  20. class PanelOLS(OLS):
  21. """Implements panel OLS.
  22. See ols function docs
  23. """
  24. _panel_model = True
  25. def __init__(self, y, x, weights=None, intercept=True, nw_lags=None,
  26. entity_effects=False, time_effects=False, x_effects=None,
  27. cluster=None, dropped_dummies=None, verbose=False,
  28. nw_overlap=False):
  29. import warnings
  30. warnings.warn("The pandas.stats.plm module is deprecated and will be "
  31. "removed in a future version. We refer to external packages "
  32. "like statsmodels, see some examples here: "
  33. "http://www.statsmodels.org/stable/mixed_linear.html",
  34. FutureWarning, stacklevel=4)
  35. self._x_orig = x
  36. self._y_orig = y
  37. self._weights = weights
  38. self._intercept = intercept
  39. self._nw_lags = nw_lags
  40. self._nw_overlap = nw_overlap
  41. self._entity_effects = entity_effects
  42. self._time_effects = time_effects
  43. self._x_effects = x_effects
  44. self._dropped_dummies = dropped_dummies or {}
  45. self._cluster = com._get_cluster_type(cluster)
  46. self._verbose = verbose
  47. (self._x, self._x_trans,
  48. self._x_filtered, self._y,
  49. self._y_trans) = self._prepare_data()
  50. self._index = self._x.index.levels[0]
  51. self._T = len(self._index)
  52. def log(self, msg):
  53. if self._verbose: # pragma: no cover
  54. print(msg)
  55. def _prepare_data(self):
  56. """Cleans and stacks input data into DataFrame objects
  57. If time effects is True, then we turn off intercepts and omit an item
  58. from every (entity and x) fixed effect.
  59. Otherwise:
  60. - If we have an intercept, we omit an item from every fixed effect.
  61. - Else, we omit an item from every fixed effect except one of them.
  62. The categorical variables will get dropped from x.
  63. """
  64. (x, x_filtered, y, weights, cat_mapping) = self._filter_data()
  65. self.log('Adding dummies to X variables')
  66. x = self._add_dummies(x, cat_mapping)
  67. self.log('Adding dummies to filtered X variables')
  68. x_filtered = self._add_dummies(x_filtered, cat_mapping)
  69. if self._x_effects:
  70. x = x.drop(self._x_effects, axis=1)
  71. x_filtered = x_filtered.drop(self._x_effects, axis=1)
  72. if self._time_effects:
  73. x_regressor = x.sub(x.mean(level=0), level=0)
  74. unstacked_y = y.unstack()
  75. y_regressor = unstacked_y.sub(unstacked_y.mean(1), axis=0).stack()
  76. y_regressor.index = y.index
  77. elif self._intercept:
  78. # only add intercept when no time effects
  79. self.log('Adding intercept')
  80. x = x_regressor = add_intercept(x)
  81. x_filtered = add_intercept(x_filtered)
  82. y_regressor = y
  83. else:
  84. self.log('No intercept added')
  85. x_regressor = x
  86. y_regressor = y
  87. if weights is not None:
  88. if not y_regressor.index.equals(weights.index):
  89. raise AssertionError("y_regressor and weights must have the "
  90. "same index")
  91. if not x_regressor.index.equals(weights.index):
  92. raise AssertionError("x_regressor and weights must have the "
  93. "same index")
  94. rt_weights = np.sqrt(weights)
  95. y_regressor = y_regressor * rt_weights
  96. x_regressor = x_regressor.mul(rt_weights, axis=0)
  97. return x, x_regressor, x_filtered, y, y_regressor
  98. def _filter_data(self):
  99. """
  100. """
  101. data = self._x_orig
  102. cat_mapping = {}
  103. if isinstance(data, DataFrame):
  104. data = data.to_panel()
  105. else:
  106. if isinstance(data, Panel):
  107. data = data.copy()
  108. data, cat_mapping = self._convert_x(data)
  109. if not isinstance(data, Panel):
  110. data = Panel.from_dict(data, intersect=True)
  111. x_names = data.items
  112. if self._weights is not None:
  113. data['__weights__'] = self._weights
  114. # Filter x's without y (so we can make a prediction)
  115. filtered = data.to_frame()
  116. # Filter all data together using to_frame
  117. # convert to DataFrame
  118. y = self._y_orig
  119. if isinstance(y, Series):
  120. y = y.unstack()
  121. data['__y__'] = y
  122. data_long = data.to_frame()
  123. x_filt = filtered.filter(x_names)
  124. x = data_long.filter(x_names)
  125. y = data_long['__y__']
  126. if self._weights is not None and not self._weights.empty:
  127. weights = data_long['__weights__']
  128. else:
  129. weights = None
  130. return x, x_filt, y, weights, cat_mapping
  131. def _convert_x(self, x):
  132. # Converts non-numeric data in x to floats. x_converted is the
  133. # DataFrame with converted values, and x_conversion is a dict that
  134. # provides the reverse mapping. For example, if 'A' was converted to 0
  135. # for x named 'variety', then x_conversion['variety'][0] is 'A'.
  136. x_converted = {}
  137. cat_mapping = {}
  138. # x can be either a dict or a Panel, but in Python 3, dicts don't have
  139. # .iteritems
  140. iteritems = getattr(x, 'iteritems', x.items)
  141. for key, df in iteritems():
  142. if not isinstance(df, DataFrame):
  143. raise AssertionError("all input items must be DataFrames, "
  144. "at least one is of "
  145. "type {0}".format(type(df)))
  146. if _is_numeric(df):
  147. x_converted[key] = df
  148. else:
  149. try:
  150. df = df.astype(float)
  151. except (TypeError, ValueError):
  152. values = df.values
  153. distinct_values = sorted(set(values.flat))
  154. cat_mapping[key] = dict(enumerate(distinct_values))
  155. new_values = np.searchsorted(distinct_values, values)
  156. x_converted[key] = DataFrame(new_values, index=df.index,
  157. columns=df.columns)
  158. if len(cat_mapping) == 0:
  159. x_converted = x
  160. return x_converted, cat_mapping
  161. def _add_dummies(self, panel, mapping):
  162. """
  163. Add entity and / or categorical dummies to input X DataFrame
  164. Returns
  165. -------
  166. DataFrame
  167. """
  168. panel = self._add_entity_effects(panel)
  169. panel = self._add_categorical_dummies(panel, mapping)
  170. return panel
  171. def _add_entity_effects(self, panel):
  172. """
  173. Add entity dummies to panel
  174. Returns
  175. -------
  176. DataFrame
  177. """
  178. from pandas.core.reshape import make_axis_dummies
  179. if not self._entity_effects:
  180. return panel
  181. self.log('-- Adding entity fixed effect dummies')
  182. dummies = make_axis_dummies(panel, 'minor')
  183. if not self._use_all_dummies:
  184. if 'entity' in self._dropped_dummies:
  185. to_exclude = str(self._dropped_dummies.get('entity'))
  186. else:
  187. to_exclude = dummies.columns[0]
  188. if to_exclude not in dummies.columns:
  189. raise Exception('%s not in %s' % (to_exclude,
  190. dummies.columns))
  191. self.log('-- Excluding dummy for entity: %s' % to_exclude)
  192. dummies = dummies.filter(dummies.columns.difference([to_exclude]))
  193. dummies = dummies.add_prefix('FE_')
  194. panel = panel.join(dummies)
  195. return panel
  196. def _add_categorical_dummies(self, panel, cat_mappings):
  197. """
  198. Add categorical dummies to panel
  199. Returns
  200. -------
  201. DataFrame
  202. """
  203. if not self._x_effects:
  204. return panel
  205. dropped_dummy = (self._entity_effects and not self._use_all_dummies)
  206. for effect in self._x_effects:
  207. self.log('-- Adding fixed effect dummies for %s' % effect)
  208. dummies = get_dummies(panel[effect])
  209. val_map = cat_mappings.get(effect)
  210. if val_map:
  211. val_map = dict((v, k) for k, v in compat.iteritems(val_map))
  212. if dropped_dummy or not self._use_all_dummies:
  213. if effect in self._dropped_dummies:
  214. to_exclude = mapped_name = self._dropped_dummies.get(
  215. effect)
  216. if val_map:
  217. mapped_name = val_map[to_exclude]
  218. else:
  219. to_exclude = mapped_name = dummies.columns[0]
  220. if mapped_name not in dummies.columns: # pragma: no cover
  221. raise Exception('%s not in %s' % (to_exclude,
  222. dummies.columns))
  223. self.log(
  224. '-- Excluding dummy for %s: %s' % (effect, to_exclude))
  225. dummies = dummies.filter(
  226. dummies.columns.difference([mapped_name]))
  227. dropped_dummy = True
  228. dummies = _convertDummies(dummies, cat_mappings.get(effect))
  229. dummies = dummies.add_prefix('%s_' % effect)
  230. panel = panel.join(dummies)
  231. return panel
  232. @property
  233. def _use_all_dummies(self):
  234. """
  235. In the case of using an intercept or including time fixed
  236. effects, completely partitioning the sample would make the X
  237. not full rank.
  238. """
  239. return (not self._intercept and not self._time_effects)
  240. @cache_readonly
  241. def _beta_raw(self):
  242. """Runs the regression and returns the beta."""
  243. X = self._x_trans.values
  244. Y = self._y_trans.values.squeeze()
  245. beta, _, _, _ = np.linalg.lstsq(X, Y)
  246. return beta
  247. @cache_readonly
  248. def beta(self):
  249. return Series(self._beta_raw, index=self._x.columns)
  250. @cache_readonly
  251. def _df_model_raw(self):
  252. """Returns the raw model degrees of freedom."""
  253. return self._df_raw - 1
  254. @cache_readonly
  255. def _df_resid_raw(self):
  256. """Returns the raw residual degrees of freedom."""
  257. return self._nobs - self._df_raw
  258. @cache_readonly
  259. def _df_raw(self):
  260. """Returns the degrees of freedom."""
  261. df = math.rank(self._x_trans.values)
  262. if self._time_effects:
  263. df += self._total_times
  264. return df
  265. @cache_readonly
  266. def _r2_raw(self):
  267. Y = self._y_trans.values.squeeze()
  268. X = self._x_trans.values
  269. resid = Y - np.dot(X, self._beta_raw)
  270. SSE = (resid ** 2).sum()
  271. if self._use_centered_tss:
  272. SST = ((Y - np.mean(Y)) ** 2).sum()
  273. else:
  274. SST = (Y ** 2).sum()
  275. return 1 - SSE / SST
  276. @property
  277. def _use_centered_tss(self):
  278. # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR
  279. return self._intercept or self._entity_effects or self._time_effects
  280. @cache_readonly
  281. def _r2_adj_raw(self):
  282. """Returns the raw r-squared adjusted values."""
  283. nobs = self._nobs
  284. factors = (nobs - 1) / (nobs - self._df_raw)
  285. return 1 - (1 - self._r2_raw) * factors
  286. @cache_readonly
  287. def _resid_raw(self):
  288. Y = self._y.values.squeeze()
  289. X = self._x.values
  290. return Y - np.dot(X, self._beta_raw)
  291. @cache_readonly
  292. def resid(self):
  293. return self._unstack_vector(self._resid_raw)
  294. @cache_readonly
  295. def _rmse_raw(self):
  296. """Returns the raw rmse values."""
  297. # X = self._x.values
  298. # Y = self._y.values.squeeze()
  299. X = self._x_trans.values
  300. Y = self._y_trans.values.squeeze()
  301. resid = Y - np.dot(X, self._beta_raw)
  302. ss = (resid ** 2).sum()
  303. return np.sqrt(ss / (self._nobs - self._df_raw))
  304. @cache_readonly
  305. def _var_beta_raw(self):
  306. cluster_axis = None
  307. if self._cluster == 'time':
  308. cluster_axis = 0
  309. elif self._cluster == 'entity':
  310. cluster_axis = 1
  311. x = self._x
  312. y = self._y
  313. if self._time_effects:
  314. xx = _xx_time_effects(x, y)
  315. else:
  316. xx = np.dot(x.values.T, x.values)
  317. return _var_beta_panel(y, x, self._beta_raw, xx,
  318. self._rmse_raw, cluster_axis, self._nw_lags,
  319. self._nobs, self._df_raw, self._nw_overlap)
  320. @cache_readonly
  321. def _y_fitted_raw(self):
  322. """Returns the raw fitted y values."""
  323. return np.dot(self._x.values, self._beta_raw)
  324. @cache_readonly
  325. def y_fitted(self):
  326. return self._unstack_vector(self._y_fitted_raw, index=self._x.index)
  327. def _unstack_vector(self, vec, index=None):
  328. if index is None:
  329. index = self._y_trans.index
  330. panel = DataFrame(vec, index=index, columns=['dummy'])
  331. return panel.to_panel()['dummy']
  332. def _unstack_y(self, vec):
  333. unstacked = self._unstack_vector(vec)
  334. return unstacked.reindex(self.beta.index)
  335. @cache_readonly
  336. def _time_obs_count(self):
  337. return self._y_trans.count(level=0).values
  338. @cache_readonly
  339. def _time_has_obs(self):
  340. return self._time_obs_count > 0
  341. @property
  342. def _nobs(self):
  343. return len(self._y)
  344. def _convertDummies(dummies, mapping):
  345. # cleans up the names of the generated dummies
  346. new_items = []
  347. for item in dummies.columns:
  348. if not mapping:
  349. var = str(item)
  350. if isinstance(item, float):
  351. var = '%g' % item
  352. new_items.append(var)
  353. else:
  354. # renames the dummies if a conversion dict is provided
  355. new_items.append(mapping[int(item)])
  356. dummies = DataFrame(dummies.values, index=dummies.index,
  357. columns=new_items)
  358. return dummies
  359. def _is_numeric(df):
  360. for col in df:
  361. if df[col].dtype.name == 'object':
  362. return False
  363. return True
  364. def add_intercept(panel, name='intercept'):
  365. """
  366. Add column of ones to input panel
  367. Parameters
  368. ----------
  369. panel: Panel / DataFrame
  370. name: string, default 'intercept']
  371. Returns
  372. -------
  373. New object (same type as input)
  374. """
  375. panel = panel.copy()
  376. panel[name] = 1.
  377. return panel.consolidate()
  378. class MovingPanelOLS(MovingOLS, PanelOLS):
  379. """Implements rolling/expanding panel OLS.
  380. See ols function docs
  381. """
  382. _panel_model = True
  383. def __init__(self, y, x, weights=None,
  384. window_type='expanding', window=None,
  385. min_periods=None,
  386. min_obs=None,
  387. intercept=True,
  388. nw_lags=None, nw_overlap=False,
  389. entity_effects=False,
  390. time_effects=False,
  391. x_effects=None,
  392. cluster=None,
  393. dropped_dummies=None,
  394. verbose=False):
  395. self._args = dict(intercept=intercept,
  396. nw_lags=nw_lags,
  397. nw_overlap=nw_overlap,
  398. entity_effects=entity_effects,
  399. time_effects=time_effects,
  400. x_effects=x_effects,
  401. cluster=cluster,
  402. dropped_dummies=dropped_dummies,
  403. verbose=verbose)
  404. PanelOLS.__init__(self, y=y, x=x, weights=weights,
  405. **self._args)
  406. self._set_window(window_type, window, min_periods)
  407. if min_obs is None:
  408. min_obs = len(self._x.columns) + 1
  409. self._min_obs = min_obs
  410. @cache_readonly
  411. def resid(self):
  412. return self._unstack_y(self._resid_raw)
  413. @cache_readonly
  414. def y_fitted(self):
  415. return self._unstack_y(self._y_fitted_raw)
  416. @cache_readonly
  417. def y_predict(self):
  418. """Returns the predicted y values."""
  419. return self._unstack_y(self._y_predict_raw)
  420. def lagged_y_predict(self, lag=1):
  421. """
  422. Compute forecast Y value lagging coefficient by input number
  423. of time periods
  424. Parameters
  425. ----------
  426. lag : int
  427. Returns
  428. -------
  429. DataFrame
  430. """
  431. x = self._x.values
  432. betas = self._beta_matrix(lag=lag)
  433. return self._unstack_y((betas * x).sum(1))
  434. @cache_readonly
  435. def _rolling_ols_call(self):
  436. return self._calc_betas(self._x_trans, self._y_trans)
  437. @cache_readonly
  438. def _df_raw(self):
  439. """Returns the degrees of freedom."""
  440. df = self._rolling_rank()
  441. if self._time_effects:
  442. df += self._window_time_obs
  443. return df[self._valid_indices]
  444. @cache_readonly
  445. def _var_beta_raw(self):
  446. """Returns the raw covariance of beta."""
  447. x = self._x
  448. y = self._y
  449. dates = x.index.levels[0]
  450. cluster_axis = None
  451. if self._cluster == 'time':
  452. cluster_axis = 0
  453. elif self._cluster == 'entity':
  454. cluster_axis = 1
  455. nobs = self._nobs
  456. rmse = self._rmse_raw
  457. beta = self._beta_raw
  458. df = self._df_raw
  459. window = self._window
  460. if not self._time_effects:
  461. # Non-transformed X
  462. cum_xx = self._cum_xx(x)
  463. results = []
  464. for n, i in enumerate(self._valid_indices):
  465. if self._is_rolling and i >= window:
  466. prior_date = dates[i - window + 1]
  467. else:
  468. prior_date = dates[0]
  469. date = dates[i]
  470. x_slice = x.truncate(prior_date, date)
  471. y_slice = y.truncate(prior_date, date)
  472. if self._time_effects:
  473. xx = _xx_time_effects(x_slice, y_slice)
  474. else:
  475. xx = cum_xx[i]
  476. if self._is_rolling and i >= window:
  477. xx = xx - cum_xx[i - window]
  478. result = _var_beta_panel(y_slice, x_slice, beta[n], xx, rmse[n],
  479. cluster_axis, self._nw_lags,
  480. nobs[n], df[n], self._nw_overlap)
  481. results.append(result)
  482. return np.array(results)
  483. @cache_readonly
  484. def _resid_raw(self):
  485. beta_matrix = self._beta_matrix(lag=0)
  486. Y = self._y.values.squeeze()
  487. X = self._x.values
  488. resid = Y - (X * beta_matrix).sum(1)
  489. return resid
  490. @cache_readonly
  491. def _y_fitted_raw(self):
  492. x = self._x.values
  493. betas = self._beta_matrix(lag=0)
  494. return (betas * x).sum(1)
  495. @cache_readonly
  496. def _y_predict_raw(self):
  497. """Returns the raw predicted y values."""
  498. x = self._x.values
  499. betas = self._beta_matrix(lag=1)
  500. return (betas * x).sum(1)
  501. def _beta_matrix(self, lag=0):
  502. if lag < 0:
  503. raise AssertionError("'lag' must be greater than or equal to 0, "
  504. "input was {0}".format(lag))
  505. index = self._y_trans.index
  506. major_labels = index.labels[0]
  507. labels = major_labels - lag
  508. indexer = self._valid_indices.searchsorted(labels, side='left')
  509. beta_matrix = self._beta_raw[indexer]
  510. beta_matrix[labels < self._valid_indices[0]] = np.NaN
  511. return beta_matrix
  512. @cache_readonly
  513. def _enough_obs(self):
  514. # XXX: what's the best way to determine where to start?
  515. # TODO: write unit tests for this
  516. rank_threshold = len(self._x.columns) + 1
  517. if self._min_obs < rank_threshold: # pragma: no cover
  518. warnings.warn('min_obs is smaller than rank of X matrix')
  519. enough_observations = self._nobs_raw >= self._min_obs
  520. enough_time_periods = self._window_time_obs >= self._min_periods
  521. return enough_time_periods & enough_observations
  522. def create_ols_dict(attr):
  523. def attr_getter(self):
  524. d = {}
  525. for k, v in compat.iteritems(self.results):
  526. result = getattr(v, attr)
  527. d[k] = result
  528. return d
  529. return attr_getter
  530. def create_ols_attr(attr):
  531. return property(create_ols_dict(attr))
  532. class NonPooledPanelOLS(object):
  533. """Implements non-pooled panel OLS.
  534. Parameters
  535. ----------
  536. y : DataFrame
  537. x : Series, DataFrame, or dict of Series
  538. intercept : bool
  539. True if you want an intercept.
  540. nw_lags : None or int
  541. Number of Newey-West lags.
  542. window_type : {'full_sample', 'rolling', 'expanding'}
  543. 'full_sample' by default
  544. window : int
  545. size of window (for rolling/expanding OLS)
  546. """
  547. ATTRIBUTES = [
  548. 'beta',
  549. 'df',
  550. 'df_model',
  551. 'df_resid',
  552. 'f_stat',
  553. 'p_value',
  554. 'r2',
  555. 'r2_adj',
  556. 'resid',
  557. 'rmse',
  558. 'std_err',
  559. 'summary_as_matrix',
  560. 't_stat',
  561. 'var_beta',
  562. 'x',
  563. 'y',
  564. 'y_fitted',
  565. 'y_predict'
  566. ]
  567. def __init__(self, y, x, window_type='full_sample', window=None,
  568. min_periods=None, intercept=True, nw_lags=None,
  569. nw_overlap=False):
  570. import warnings
  571. warnings.warn("The pandas.stats.plm module is deprecated and will be "
  572. "removed in a future version. We refer to external packages "
  573. "like statsmodels, see some examples here: "
  574. "http://www.statsmodels.org/stable/mixed_linear.html",
  575. FutureWarning, stacklevel=4)
  576. for attr in self.ATTRIBUTES:
  577. setattr(self.__class__, attr, create_ols_attr(attr))
  578. results = {}
  579. for entity in y:
  580. entity_y = y[entity]
  581. entity_x = {}
  582. for x_var in x:
  583. entity_x[x_var] = x[x_var][entity]
  584. from pandas.stats.interface import ols
  585. results[entity] = ols(y=entity_y,
  586. x=entity_x,
  587. window_type=window_type,
  588. window=window,
  589. min_periods=min_periods,
  590. intercept=intercept,
  591. nw_lags=nw_lags,
  592. nw_overlap=nw_overlap)
  593. self.results = results
  594. def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis,
  595. nw_lags, nobs, df, nw_overlap):
  596. xx_inv = math.inv(xx)
  597. yv = y.values
  598. if cluster_axis is None:
  599. if nw_lags is None:
  600. return xx_inv * (rmse ** 2)
  601. else:
  602. resid = yv - np.dot(x.values, beta)
  603. m = (x.values.T * resid).T
  604. xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap)
  605. return np.dot(xx_inv, np.dot(xeps, xx_inv))
  606. else:
  607. Xb = np.dot(x.values, beta).reshape((len(x.values), 1))
  608. resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid'])
  609. if cluster_axis == 1:
  610. x = x.swaplevel(0, 1).sortlevel(0)
  611. resid = resid.swaplevel(0, 1).sortlevel(0)
  612. m = _group_agg(x.values * resid.values, x.index._bounds,
  613. lambda x: np.sum(x, axis=0))
  614. if nw_lags is None:
  615. nw_lags = 0
  616. xox = 0
  617. for i in range(len(x.index.levels[0])):
  618. xox += math.newey_west(m[i: i + 1], nw_lags,
  619. nobs, df, nw_overlap)
  620. return np.dot(xx_inv, np.dot(xox, xx_inv))
  621. def _group_agg(values, bounds, f):
  622. """
  623. R-style aggregator
  624. Parameters
  625. ----------
  626. values : N-length or N x K ndarray
  627. bounds : B-length ndarray
  628. f : ndarray aggregation function
  629. Returns
  630. -------
  631. ndarray with same length as bounds array
  632. """
  633. if values.ndim == 1:
  634. N = len(values)
  635. result = np.empty(len(bounds), dtype=float)
  636. elif values.ndim == 2:
  637. N, K = values.shape
  638. result = np.empty((len(bounds), K), dtype=float)
  639. testagg = f(values[:min(1, len(values))])
  640. if isinstance(testagg, np.ndarray) and testagg.ndim == 2:
  641. raise AssertionError('Function must reduce')
  642. for i, left_bound in enumerate(bounds):
  643. if i == len(bounds) - 1:
  644. right_bound = N
  645. else:
  646. right_bound = bounds[i + 1]
  647. result[i] = f(values[left_bound:right_bound])
  648. return result
  649. def _xx_time_effects(x, y):
  650. """
  651. Returns X'X - (X'T) (T'T)^-1 (T'X)
  652. """
  653. # X'X
  654. xx = np.dot(x.values.T, x.values)
  655. xt = x.sum(level=0).values
  656. count = y.unstack().count(1).values
  657. selector = count > 0
  658. # X'X - (T'T)^-1 (T'X)
  659. xt = xt[selector]
  660. count = count[selector]
  661. return xx - np.dot(xt.T / count, xt)