PageRenderTime 67ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/statsmodels/base/tests/test_data.py

http://github.com/statsmodels/statsmodels
Python | 905 lines | 714 code | 147 blank | 44 comment | 10 complexity | f68c0268038562235f78d9b1111c26cd MD5 | raw file
Possible License(s): BSD-3-Clause
  1. from statsmodels.compat.pandas import assert_series_equal, assert_frame_equal,\
  2. make_dataframe
  3. import numpy as np
  4. from numpy.testing import assert_equal, assert_, assert_raises
  5. import pandas as pd
  6. import pytest
  7. from statsmodels.base import data as sm_data
  8. from statsmodels.formula import handle_formula_data
  9. from statsmodels.regression.linear_model import OLS
  10. from statsmodels.genmod.generalized_linear_model import GLM
  11. from statsmodels.genmod import families
  12. from statsmodels.discrete.discrete_model import Logit
  13. # FIXME: do not leave commented-out, enable or move/remove
  14. # class TestDates(object):
  15. # @classmethod
  16. # def setup_class(cls):
  17. # nrows = 10
  18. # cls.dates_result = cls.dates_results = np.random.random(nrows)
  19. #
  20. # def test_dates(self):
  21. # np.testing.assert_equal(data.wrap_output(self.dates_input, 'dates'),
  22. # self.dates_result)
  23. class TestArrays(object):
  24. @classmethod
  25. def setup_class(cls):
  26. cls.endog = np.random.random(10)
  27. cls.exog = np.c_[np.ones(10), np.random.random((10, 2))]
  28. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  29. nrows = 10
  30. nvars = 3
  31. cls.col_result = cls.col_input = np.random.random(nvars)
  32. cls.row_result = cls.row_input = np.random.random(nrows)
  33. cls.cov_result = cls.cov_input = np.random.random((nvars, nvars))
  34. cls.xnames = ['const', 'x1', 'x2']
  35. cls.ynames = 'y'
  36. cls.row_labels = None
  37. def test_orig(self):
  38. np.testing.assert_equal(self.data.orig_endog, self.endog)
  39. np.testing.assert_equal(self.data.orig_exog, self.exog)
  40. def test_endogexog(self):
  41. np.testing.assert_equal(self.data.endog, self.endog)
  42. np.testing.assert_equal(self.data.exog, self.exog)
  43. def test_attach(self):
  44. data = self.data
  45. # this makes sure what the wrappers need work but not the wrapped
  46. # results themselves
  47. np.testing.assert_equal(data.wrap_output(self.col_input, 'columns'),
  48. self.col_result)
  49. np.testing.assert_equal(data.wrap_output(self.row_input, 'rows'),
  50. self.row_result)
  51. np.testing.assert_equal(data.wrap_output(self.cov_input, 'cov'),
  52. self.cov_result)
  53. def test_names(self):
  54. data = self.data
  55. np.testing.assert_equal(data.xnames, self.xnames)
  56. np.testing.assert_equal(data.ynames, self.ynames)
  57. def test_labels(self):
  58. # HACK: because numpy main after NA stuff assert_equal fails on
  59. # pandas indices
  60. # FIXME: see if this can be de-hacked
  61. np.testing.assert_(np.all(self.data.row_labels == self.row_labels))
  62. class TestArrays2dEndog(TestArrays):
  63. @classmethod
  64. def setup_class(cls):
  65. super(TestArrays2dEndog, cls).setup_class()
  66. cls.endog = np.random.random((10, 1))
  67. cls.exog = np.c_[np.ones(10), np.random.random((10, 2))]
  68. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  69. def test_endogexog(self):
  70. np.testing.assert_equal(self.data.endog, self.endog.squeeze())
  71. np.testing.assert_equal(self.data.exog, self.exog)
  72. class TestArrays1dExog(TestArrays):
  73. @classmethod
  74. def setup_class(cls):
  75. super(TestArrays1dExog, cls).setup_class()
  76. cls.endog = np.random.random(10)
  77. exog = np.random.random(10)
  78. cls.data = sm_data.handle_data(cls.endog, exog)
  79. cls.exog = exog[:, None]
  80. cls.xnames = ['x1']
  81. cls.ynames = 'y'
  82. def test_orig(self):
  83. np.testing.assert_equal(self.data.orig_endog, self.endog)
  84. np.testing.assert_equal(self.data.orig_exog, self.exog.squeeze())
  85. class TestDataFrames(TestArrays):
  86. @classmethod
  87. def setup_class(cls):
  88. cls.endog = pd.DataFrame(np.random.random(10), columns=['y_1'])
  89. exog = pd.DataFrame(np.random.random((10, 2)),
  90. columns=['x_1', 'x_2'])
  91. exog.insert(0, 'const', 1)
  92. cls.exog = exog
  93. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  94. nrows = 10
  95. nvars = 3
  96. cls.col_input = np.random.random(nvars)
  97. cls.col_result = pd.Series(cls.col_input,
  98. index=exog.columns)
  99. cls.row_input = np.random.random(nrows)
  100. cls.row_result = pd.Series(cls.row_input,
  101. index=exog.index)
  102. cls.cov_input = np.random.random((nvars, nvars))
  103. cls.cov_result = pd.DataFrame(cls.cov_input,
  104. index=exog.columns,
  105. columns=exog.columns)
  106. cls.xnames = ['const', 'x_1', 'x_2']
  107. cls.ynames = 'y_1'
  108. cls.row_labels = cls.exog.index
  109. def test_orig(self):
  110. assert_frame_equal(self.data.orig_endog, self.endog)
  111. assert_frame_equal(self.data.orig_exog, self.exog)
  112. def test_endogexog(self):
  113. np.testing.assert_equal(self.data.endog, self.endog.values.squeeze())
  114. np.testing.assert_equal(self.data.exog, self.exog.values)
  115. def test_attach(self):
  116. data = self.data
  117. # this makes sure what the wrappers need work but not the wrapped
  118. # results themselves
  119. assert_series_equal(data.wrap_output(self.col_input, 'columns'),
  120. self.col_result)
  121. assert_series_equal(data.wrap_output(self.row_input, 'rows'),
  122. self.row_result)
  123. assert_frame_equal(data.wrap_output(self.cov_input, 'cov'),
  124. self.cov_result)
  125. class TestDataFramesWithMultiIndex(TestDataFrames):
  126. @classmethod
  127. def setup_class(cls):
  128. cls.endog = pd.DataFrame(np.random.random(10), columns=['y_1'])
  129. mi = pd.MultiIndex.from_product([['x'], ['1', '2']])
  130. exog = pd.DataFrame(np.random.random((10, 2)), columns=mi)
  131. exog_flattened_idx = pd.Index(['const', 'x_1', 'x_2'])
  132. exog.insert(0, 'const', 1)
  133. cls.exog = exog
  134. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  135. nrows = 10
  136. nvars = 3
  137. cls.col_input = np.random.random(nvars)
  138. cls.col_result = pd.Series(cls.col_input, index=exog_flattened_idx)
  139. cls.row_input = np.random.random(nrows)
  140. cls.row_result = pd.Series(cls.row_input, index=exog.index)
  141. cls.cov_input = np.random.random((nvars, nvars))
  142. cls.cov_result = pd.DataFrame(cls.cov_input,
  143. index=exog_flattened_idx,
  144. columns=exog_flattened_idx)
  145. cls.xnames = ['const', 'x_1', 'x_2']
  146. cls.ynames = 'y_1'
  147. cls.row_labels = cls.exog.index
  148. class TestLists(TestArrays):
  149. @classmethod
  150. def setup_class(cls):
  151. super(TestLists, cls).setup_class()
  152. cls.endog = np.random.random(10).tolist()
  153. cls.exog = np.c_[np.ones(10), np.random.random((10, 2))].tolist()
  154. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  155. class TestListDataFrame(TestDataFrames):
  156. @classmethod
  157. def setup_class(cls):
  158. cls.endog = np.random.random(10).tolist()
  159. exog = pd.DataFrame(np.random.random((10, 2)),
  160. columns=['x_1', 'x_2'])
  161. exog.insert(0, 'const', 1)
  162. cls.exog = exog
  163. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  164. nrows = 10
  165. nvars = 3
  166. cls.col_input = np.random.random(nvars)
  167. cls.col_result = pd.Series(cls.col_input,
  168. index=exog.columns)
  169. cls.row_input = np.random.random(nrows)
  170. cls.row_result = pd.Series(cls.row_input,
  171. index=exog.index)
  172. cls.cov_input = np.random.random((nvars, nvars))
  173. cls.cov_result = pd.DataFrame(cls.cov_input,
  174. index=exog.columns,
  175. columns=exog.columns)
  176. cls.xnames = ['const', 'x_1', 'x_2']
  177. cls.ynames = 'y'
  178. cls.row_labels = cls.exog.index
  179. def test_endogexog(self):
  180. np.testing.assert_equal(self.data.endog, self.endog)
  181. np.testing.assert_equal(self.data.exog, self.exog.values)
  182. def test_orig(self):
  183. np.testing.assert_equal(self.data.orig_endog, self.endog)
  184. assert_frame_equal(self.data.orig_exog, self.exog)
  185. class TestDataFrameList(TestDataFrames):
  186. @classmethod
  187. def setup_class(cls):
  188. cls.endog = pd.DataFrame(np.random.random(10), columns=['y_1'])
  189. exog = pd.DataFrame(np.random.random((10, 2)),
  190. columns=['x1', 'x2'])
  191. exog.insert(0, 'const', 1)
  192. cls.exog = exog.values.tolist()
  193. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  194. nrows = 10
  195. nvars = 3
  196. cls.col_input = np.random.random(nvars)
  197. cls.col_result = pd.Series(cls.col_input,
  198. index=exog.columns)
  199. cls.row_input = np.random.random(nrows)
  200. cls.row_result = pd.Series(cls.row_input,
  201. index=exog.index)
  202. cls.cov_input = np.random.random((nvars, nvars))
  203. cls.cov_result = pd.DataFrame(cls.cov_input,
  204. index=exog.columns,
  205. columns=exog.columns)
  206. cls.xnames = ['const', 'x1', 'x2']
  207. cls.ynames = 'y_1'
  208. cls.row_labels = cls.endog.index
  209. def test_endogexog(self):
  210. np.testing.assert_equal(self.data.endog, self.endog.values.squeeze())
  211. np.testing.assert_equal(self.data.exog, self.exog)
  212. def test_orig(self):
  213. assert_frame_equal(self.data.orig_endog, self.endog)
  214. np.testing.assert_equal(self.data.orig_exog, self.exog)
  215. class TestArrayDataFrame(TestDataFrames):
  216. @classmethod
  217. def setup_class(cls):
  218. cls.endog = np.random.random(10)
  219. exog = pd.DataFrame(np.random.random((10, 2)),
  220. columns=['x_1', 'x_2'])
  221. exog.insert(0, 'const', 1)
  222. cls.exog = exog
  223. cls.data = sm_data.handle_data(cls.endog, exog)
  224. nrows = 10
  225. nvars = 3
  226. cls.col_input = np.random.random(nvars)
  227. cls.col_result = pd.Series(cls.col_input,
  228. index=exog.columns)
  229. cls.row_input = np.random.random(nrows)
  230. cls.row_result = pd.Series(cls.row_input,
  231. index=exog.index)
  232. cls.cov_input = np.random.random((nvars, nvars))
  233. cls.cov_result = pd.DataFrame(cls.cov_input,
  234. index=exog.columns,
  235. columns=exog.columns)
  236. cls.xnames = ['const', 'x_1', 'x_2']
  237. cls.ynames = 'y'
  238. cls.row_labels = cls.exog.index
  239. def test_endogexog(self):
  240. np.testing.assert_equal(self.data.endog, self.endog)
  241. np.testing.assert_equal(self.data.exog, self.exog.values)
  242. def test_orig(self):
  243. np.testing.assert_equal(self.data.orig_endog, self.endog)
  244. assert_frame_equal(self.data.orig_exog, self.exog)
  245. class TestDataFrameArray(TestDataFrames):
  246. @classmethod
  247. def setup_class(cls):
  248. cls.endog = pd.DataFrame(np.random.random(10), columns=['y_1'])
  249. exog = pd.DataFrame(np.random.random((10, 2)),
  250. columns=['x1', 'x2']) # names mimic defaults
  251. exog.insert(0, 'const', 1)
  252. cls.exog = exog.values
  253. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  254. nrows = 10
  255. nvars = 3
  256. cls.col_input = np.random.random(nvars)
  257. cls.col_result = pd.Series(cls.col_input,
  258. index=exog.columns)
  259. cls.row_input = np.random.random(nrows)
  260. cls.row_result = pd.Series(cls.row_input,
  261. index=exog.index)
  262. cls.cov_input = np.random.random((nvars, nvars))
  263. cls.cov_result = pd.DataFrame(cls.cov_input,
  264. index=exog.columns,
  265. columns=exog.columns)
  266. cls.xnames = ['const', 'x1', 'x2']
  267. cls.ynames = 'y_1'
  268. cls.row_labels = cls.endog.index
  269. def test_endogexog(self):
  270. np.testing.assert_equal(self.data.endog, self.endog.values.squeeze())
  271. np.testing.assert_equal(self.data.exog, self.exog)
  272. def test_orig(self):
  273. assert_frame_equal(self.data.orig_endog, self.endog)
  274. np.testing.assert_equal(self.data.orig_exog, self.exog)
  275. class TestSeriesDataFrame(TestDataFrames):
  276. @classmethod
  277. def setup_class(cls):
  278. cls.endog = pd.Series(np.random.random(10), name='y_1')
  279. exog = pd.DataFrame(np.random.random((10, 2)),
  280. columns=['x_1', 'x_2'])
  281. exog.insert(0, 'const', 1)
  282. cls.exog = exog
  283. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  284. nrows = 10
  285. nvars = 3
  286. cls.col_input = np.random.random(nvars)
  287. cls.col_result = pd.Series(cls.col_input,
  288. index=exog.columns)
  289. cls.row_input = np.random.random(nrows)
  290. cls.row_result = pd.Series(cls.row_input,
  291. index=exog.index)
  292. cls.cov_input = np.random.random((nvars, nvars))
  293. cls.cov_result = pd.DataFrame(cls.cov_input,
  294. index=exog.columns,
  295. columns=exog.columns)
  296. cls.xnames = ['const', 'x_1', 'x_2']
  297. cls.ynames = 'y_1'
  298. cls.row_labels = cls.exog.index
  299. def test_orig(self):
  300. assert_series_equal(self.data.orig_endog, self.endog)
  301. assert_frame_equal(self.data.orig_exog, self.exog)
  302. class TestSeriesSeries(TestDataFrames):
  303. @classmethod
  304. def setup_class(cls):
  305. cls.endog = pd.Series(np.random.random(10), name='y_1')
  306. exog = pd.Series(np.random.random(10), name='x_1')
  307. cls.exog = exog
  308. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  309. nrows = 10
  310. nvars = 1
  311. cls.col_input = np.random.random(nvars)
  312. cls.col_result = pd.Series(cls.col_input,
  313. index=[exog.name])
  314. cls.row_input = np.random.random(nrows)
  315. cls.row_result = pd.Series(cls.row_input,
  316. index=exog.index)
  317. cls.cov_input = np.random.random((nvars, nvars))
  318. cls.cov_result = pd.DataFrame(cls.cov_input,
  319. index=[exog.name],
  320. columns=[exog.name])
  321. cls.xnames = ['x_1']
  322. cls.ynames = 'y_1'
  323. cls.row_labels = cls.exog.index
  324. def test_orig(self):
  325. assert_series_equal(self.data.orig_endog, self.endog)
  326. assert_series_equal(self.data.orig_exog, self.exog)
  327. def test_endogexog(self):
  328. np.testing.assert_equal(self.data.endog, self.endog.values.squeeze())
  329. np.testing.assert_equal(self.data.exog, self.exog.values[:, None])
  330. def test_alignment():
  331. # Fix Issue GH#206
  332. from statsmodels.datasets.macrodata import load_pandas
  333. d = load_pandas().data
  334. # growth rates
  335. gs_l_realinv = 400 * np.log(d['realinv']).diff().dropna()
  336. gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna()
  337. lint = d['realint'][:-1] # incorrect indexing for test purposes
  338. endog = gs_l_realinv
  339. # re-index because they will not conform to lint
  340. realgdp = gs_l_realgdp.reindex(lint.index, method='bfill')
  341. data = dict(const=np.ones_like(lint), lrealgdp=realgdp, lint=lint)
  342. exog = pd.DataFrame(data)
  343. # TODO: which index do we get??
  344. np.testing.assert_raises(ValueError, OLS, *(endog, exog))
  345. class TestMultipleEqsArrays(TestArrays):
  346. @classmethod
  347. def setup_class(cls):
  348. cls.endog = np.random.random((10, 4))
  349. cls.exog = np.c_[np.ones(10), np.random.random((10, 2))]
  350. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  351. nrows = 10
  352. nvars = 3
  353. neqs = 4
  354. cls.col_result = cls.col_input = np.random.random(nvars)
  355. cls.row_result = cls.row_input = np.random.random(nrows)
  356. cls.cov_result = cls.cov_input = np.random.random((nvars, nvars))
  357. cls.cov_eq_result = cls.cov_eq_input = np.random.random((neqs, neqs))
  358. cls.col_eq_result = cls.col_eq_input = np.array((neqs, nvars))
  359. cls.xnames = ['const', 'x1', 'x2']
  360. cls.ynames = ['y1', 'y2', 'y3', 'y4']
  361. cls.row_labels = None
  362. def test_attach(self):
  363. data = self.data
  364. # this makes sure what the wrappers need work but not the wrapped
  365. # results themselves
  366. np.testing.assert_equal(data.wrap_output(self.col_input, 'columns'),
  367. self.col_result)
  368. np.testing.assert_equal(data.wrap_output(self.row_input, 'rows'),
  369. self.row_result)
  370. np.testing.assert_equal(data.wrap_output(self.cov_input, 'cov'),
  371. self.cov_result)
  372. np.testing.assert_equal(data.wrap_output(self.cov_eq_input, 'cov_eq'),
  373. self.cov_eq_result)
  374. np.testing.assert_equal(data.wrap_output(self.col_eq_input,
  375. 'columns_eq'),
  376. self.col_eq_result)
  377. class TestMultipleEqsDataFrames(TestDataFrames):
  378. @classmethod
  379. def setup_class(cls):
  380. cls.endog = endog = pd.DataFrame(np.random.random((10, 4)),
  381. columns=['y_1', 'y_2', 'y_3', 'y_4'])
  382. exog = pd.DataFrame(np.random.random((10, 2)),
  383. columns=['x_1', 'x_2'])
  384. exog.insert(0, 'const', 1)
  385. cls.exog = exog
  386. cls.data = sm_data.handle_data(cls.endog, cls.exog)
  387. nrows = 10
  388. nvars = 3
  389. neqs = 4
  390. cls.col_input = np.random.random(nvars)
  391. cls.col_result = pd.Series(cls.col_input,
  392. index=exog.columns)
  393. cls.row_input = np.random.random(nrows)
  394. cls.row_result = pd.Series(cls.row_input,
  395. index=exog.index)
  396. cls.cov_input = np.random.random((nvars, nvars))
  397. cls.cov_result = pd.DataFrame(cls.cov_input,
  398. index=exog.columns,
  399. columns=exog.columns)
  400. cls.cov_eq_input = np.random.random((neqs, neqs))
  401. cls.cov_eq_result = pd.DataFrame(cls.cov_eq_input,
  402. index=endog.columns,
  403. columns=endog.columns)
  404. cls.col_eq_input = np.random.random((nvars, neqs))
  405. cls.col_eq_result = pd.DataFrame(cls.col_eq_input,
  406. index=exog.columns,
  407. columns=endog.columns)
  408. cls.xnames = ['const', 'x_1', 'x_2']
  409. cls.ynames = ['y_1', 'y_2', 'y_3', 'y_4']
  410. cls.row_labels = cls.exog.index
  411. def test_attach(self):
  412. data = self.data
  413. assert_series_equal(data.wrap_output(self.col_input, 'columns'),
  414. self.col_result)
  415. assert_series_equal(data.wrap_output(self.row_input, 'rows'),
  416. self.row_result)
  417. assert_frame_equal(data.wrap_output(self.cov_input, 'cov'),
  418. self.cov_result)
  419. assert_frame_equal(data.wrap_output(self.cov_eq_input, 'cov_eq'),
  420. self.cov_eq_result)
  421. assert_frame_equal(data.wrap_output(self.col_eq_input, 'columns_eq'),
  422. self.col_eq_result)
  423. class TestMissingArray(object):
  424. @classmethod
  425. def setup_class(cls):
  426. X = np.random.random((25, 4))
  427. y = np.random.random(25)
  428. y[10] = np.nan
  429. X[2, 3] = np.nan
  430. X[14, 2] = np.nan
  431. cls.y, cls.X = y, X
  432. @pytest.mark.smoke
  433. def test_raise_no_missing(self):
  434. # GH#1700
  435. sm_data.handle_data(np.random.random(20), np.random.random((20, 2)),
  436. 'raise')
  437. def test_raise(self):
  438. with pytest.raises(Exception):
  439. # TODO: be more specific about exception
  440. sm_data.handle_data(self.y, self.X, 'raise')
  441. def test_drop(self):
  442. y = self.y
  443. X = self.X
  444. combined = np.c_[y, X]
  445. idx = ~np.isnan(combined).any(axis=1)
  446. y = y[idx]
  447. X = X[idx]
  448. data = sm_data.handle_data(self.y, self.X, 'drop')
  449. np.testing.assert_array_equal(data.endog, y)
  450. np.testing.assert_array_equal(data.exog, X)
  451. def test_none(self):
  452. data = sm_data.handle_data(self.y, self.X, 'none', hasconst=False)
  453. np.testing.assert_array_equal(data.endog, self.y)
  454. np.testing.assert_array_equal(data.exog, self.X)
  455. assert data.k_constant == 0
  456. def test_endog_only_raise(self):
  457. with pytest.raises(Exception):
  458. # TODO: be more specific about exception
  459. sm_data.handle_data(self.y, None, 'raise')
  460. def test_endog_only_drop(self):
  461. y = self.y
  462. y = y[~np.isnan(y)]
  463. data = sm_data.handle_data(self.y, None, 'drop')
  464. np.testing.assert_array_equal(data.endog, y)
  465. def test_mv_endog(self):
  466. y = self.X
  467. y = y[~np.isnan(y).any(axis=1)]
  468. data = sm_data.handle_data(self.X, None, 'drop')
  469. np.testing.assert_array_equal(data.endog, y)
  470. def test_extra_kwargs_2d(self):
  471. sigma = np.random.random((25, 25))
  472. sigma = sigma + sigma.T - np.diag(np.diag(sigma))
  473. data = sm_data.handle_data(self.y, self.X, 'drop', sigma=sigma)
  474. idx = ~np.isnan(np.c_[self.y, self.X]).any(axis=1)
  475. sigma = sigma[idx][:, idx]
  476. np.testing.assert_array_equal(data.sigma, sigma)
  477. def test_extra_kwargs_1d(self):
  478. weights = np.random.random(25)
  479. data = sm_data.handle_data(self.y, self.X, 'drop', weights=weights)
  480. idx = ~np.isnan(np.c_[self.y, self.X]).any(axis=1)
  481. weights = weights[idx]
  482. np.testing.assert_array_equal(data.weights, weights)
  483. class TestMissingPandas(object):
  484. @classmethod
  485. def setup_class(cls):
  486. X = np.random.random((25, 4))
  487. y = np.random.random(25)
  488. y[10] = np.nan
  489. X[2, 3] = np.nan
  490. X[14, 2] = np.nan
  491. cls.y = pd.Series(y)
  492. cls.X = pd.DataFrame(X)
  493. @pytest.mark.smoke
  494. def test_raise_no_missing(self):
  495. # GH#1700
  496. sm_data.handle_data(pd.Series(np.random.random(20)),
  497. pd.DataFrame(np.random.random((20, 2))),
  498. 'raise')
  499. def test_raise(self):
  500. with pytest.raises(Exception):
  501. # TODO: be more specific about exception
  502. sm_data.handle_data(self.y, self.X, 'raise')
  503. def test_drop(self):
  504. y = self.y
  505. X = self.X
  506. combined = np.c_[y, X]
  507. idx = ~np.isnan(combined).any(axis=1)
  508. y = y.loc[idx]
  509. X = X.loc[idx]
  510. data = sm_data.handle_data(self.y, self.X, 'drop')
  511. np.testing.assert_array_equal(data.endog, y.values)
  512. assert_series_equal(data.orig_endog, self.y.loc[idx])
  513. np.testing.assert_array_equal(data.exog, X.values)
  514. assert_frame_equal(data.orig_exog, self.X.loc[idx])
  515. def test_none(self):
  516. data = sm_data.handle_data(self.y, self.X, 'none', hasconst=False)
  517. np.testing.assert_array_equal(data.endog, self.y.values)
  518. np.testing.assert_array_equal(data.exog, self.X.values)
  519. assert data.k_constant == 0
  520. def test_endog_only_raise(self):
  521. with pytest.raises(Exception):
  522. # TODO: be more specific about exception
  523. sm_data.handle_data(self.y, None, 'raise')
  524. def test_endog_only_drop(self):
  525. y = self.y
  526. y = y.dropna()
  527. data = sm_data.handle_data(self.y, None, 'drop')
  528. np.testing.assert_array_equal(data.endog, y.values)
  529. def test_mv_endog(self):
  530. y = self.X
  531. y = y.loc[~np.isnan(y.values).any(axis=1)]
  532. data = sm_data.handle_data(self.X, None, 'drop')
  533. np.testing.assert_array_equal(data.endog, y.values)
  534. def test_labels(self):
  535. labels = pd.Index([0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15,
  536. 16, 17, 18, 19, 20, 21, 22, 23, 24])
  537. data = sm_data.handle_data(self.y, self.X, 'drop')
  538. np.testing.assert_(data.row_labels.equals(labels))
  539. class TestConstant(object):
  540. @classmethod
  541. def setup_class(cls):
  542. from statsmodels.datasets.longley import load_pandas
  543. cls.data = load_pandas()
  544. def test_array_constant(self):
  545. exog = self.data.exog.copy()
  546. exog['const'] = 1
  547. data = sm_data.handle_data(self.data.endog.values, exog.values)
  548. np.testing.assert_equal(data.k_constant, 1)
  549. np.testing.assert_equal(data.const_idx, 6)
  550. def test_pandas_constant(self):
  551. exog = self.data.exog.copy()
  552. exog['const'] = 1
  553. data = sm_data.handle_data(self.data.endog, exog)
  554. np.testing.assert_equal(data.k_constant, 1)
  555. np.testing.assert_equal(data.const_idx, 6)
  556. def test_pandas_noconstant(self):
  557. exog = self.data.exog.copy()
  558. data = sm_data.handle_data(self.data.endog, exog)
  559. np.testing.assert_equal(data.k_constant, 0)
  560. np.testing.assert_equal(data.const_idx, None)
  561. def test_array_noconstant(self):
  562. exog = self.data.exog.copy()
  563. data = sm_data.handle_data(self.data.endog.values, exog.values)
  564. np.testing.assert_equal(data.k_constant, 0)
  565. np.testing.assert_equal(data.const_idx, None)
  566. class TestHandleMissing(object):
  567. def test_pandas(self):
  568. df = make_dataframe()
  569. df.values[[2, 5, 10], [2, 3, 1]] = np.nan
  570. y, X = df[df.columns[0]], df[df.columns[1:]]
  571. data, _ = sm_data.handle_missing(y, X, missing='drop')
  572. df = df.dropna()
  573. y_exp, X_exp = df[df.columns[0]], df[df.columns[1:]]
  574. assert_frame_equal(data['exog'], X_exp)
  575. assert_series_equal(data['endog'], y_exp)
  576. def test_arrays(self):
  577. arr = np.random.randn(20, 4)
  578. arr[[2, 5, 10], [2, 3, 1]] = np.nan
  579. y, X = arr[:, 0], arr[:, 1:]
  580. data, _ = sm_data.handle_missing(y, X, missing='drop')
  581. bools_mask = np.ones(20, dtype=bool)
  582. bools_mask[[2, 5, 10]] = False
  583. y_exp = arr[bools_mask, 0]
  584. X_exp = arr[bools_mask, 1:]
  585. np.testing.assert_array_equal(data['endog'], y_exp)
  586. np.testing.assert_array_equal(data['exog'], X_exp)
  587. def test_pandas_array(self):
  588. df = make_dataframe()
  589. df.values[[2, 5, 10], [2, 3, 1]] = np.nan
  590. y, X = df[df.columns[0]], df[df.columns[1:]].values
  591. data, _ = sm_data.handle_missing(y, X, missing='drop')
  592. df = df.dropna()
  593. y_exp, X_exp = df[df.columns[0]], df[df.columns[1:]].values
  594. np.testing.assert_array_equal(data['exog'], X_exp)
  595. assert_series_equal(data['endog'], y_exp)
  596. def test_array_pandas(self):
  597. df = make_dataframe()
  598. df.values[[2, 5, 10], [2, 3, 1]] = np.nan
  599. y, X = df[df.columns[0]].values, df[df.columns[1:]]
  600. data, _ = sm_data.handle_missing(y, X, missing='drop')
  601. df = df.dropna()
  602. y_exp, X_exp = df[df.columns[0]].values, df[df.columns[1:]]
  603. assert_frame_equal(data['exog'], X_exp)
  604. np.testing.assert_array_equal(data['endog'], y_exp)
  605. def test_noop(self):
  606. df = make_dataframe()
  607. df.values[[2, 5, 10], [2, 3, 1]] = np.nan
  608. y, X = df[df.columns[0]], df[df.columns[1:]]
  609. data, _ = sm_data.handle_missing(y, X, missing='none')
  610. y_exp, X_exp = df[df.columns[0]], df[df.columns[1:]]
  611. assert_frame_equal(data['exog'], X_exp)
  612. assert_series_equal(data['endog'], y_exp)
  613. class CheckHasConstant(object):
  614. def test_hasconst(self):
  615. for x, result in zip(self.exogs, self.results):
  616. mod = self.mod(self.y, x)
  617. assert_equal(mod.k_constant, result[0])
  618. assert_equal(mod.data.k_constant, result[0])
  619. if result[1] is None:
  620. assert_(mod.data.const_idx is None)
  621. else:
  622. assert_equal(mod.data.const_idx, result[1])
  623. # extra check after fit, some models raise on singular
  624. fit_kwds = getattr(self, 'fit_kwds', {})
  625. try:
  626. res = mod.fit(**fit_kwds)
  627. except np.linalg.LinAlgError:
  628. pass
  629. else:
  630. assert_equal(res.model.k_constant, result[0])
  631. assert_equal(res.model.data.k_constant, result[0])
  632. @classmethod
  633. def setup_class(cls):
  634. # create data
  635. np.random.seed(0)
  636. cls.y_c = np.random.randn(20)
  637. cls.y_bin = (cls.y_c > 0).astype(int)
  638. x1 = np.column_stack((np.ones(20), np.zeros(20)))
  639. result1 = (1, 0)
  640. x2 = np.column_stack((np.arange(20) < 10.5,
  641. np.arange(20) > 10.5)).astype(float)
  642. result2 = (1, None)
  643. x3 = np.column_stack((np.arange(20), np.zeros(20)))
  644. result3 = (0, None)
  645. x4 = np.column_stack((np.arange(20), np.zeros((20, 2))))
  646. result4 = (0, None)
  647. x5 = np.column_stack((np.zeros(20), 0.5 * np.ones(20)))
  648. result5 = (1, 1)
  649. x5b = np.column_stack((np.arange(20), np.ones((20, 3))))
  650. result5b = (1, 1)
  651. x5c = np.column_stack((np.arange(20), np.ones((20, 3)) * [0.5, 1, 1]))
  652. result5c = (1, 2)
  653. # implicit and zero column
  654. x6 = np.column_stack((np.arange(20) < 10.5,
  655. np.arange(20) > 10.5,
  656. np.zeros(20))).astype(float)
  657. result6 = (1, None)
  658. x7 = np.column_stack((np.arange(20) < 10.5,
  659. np.arange(20) > 10.5,
  660. np.zeros((20, 2)))).astype(float)
  661. result7 = (1, None)
  662. cls.exogs = (x1, x2, x3, x4, x5, x5b, x5c, x6, x7)
  663. cls.results = (result1, result2, result3, result4, result5, result5b,
  664. result5c, result6, result7)
  665. cls._initialize()
  666. class TestHasConstantOLS(CheckHasConstant):
  667. @classmethod
  668. def _initialize(cls):
  669. cls.mod = OLS
  670. cls.y = cls.y_c
  671. class TestHasConstantGLM(CheckHasConstant):
  672. @staticmethod
  673. def mod(y, x):
  674. return GLM(y, x, family=families.Binomial())
  675. @classmethod
  676. def _initialize(cls):
  677. cls.y = cls.y_bin
  678. class TestHasConstantLogit(CheckHasConstant):
  679. @classmethod
  680. def _initialize(cls):
  681. cls.mod = Logit
  682. cls.y = cls.y_bin
  683. cls.fit_kwds = {'disp': False}
  684. def test_dtype_object():
  685. # see GH#880
  686. X = np.random.random((40, 2))
  687. df = pd.DataFrame(X)
  688. df[2] = np.random.randint(2, size=40).astype('object')
  689. df['constant'] = 1
  690. y = pd.Series(np.random.randint(2, size=40))
  691. np.testing.assert_raises(ValueError, sm_data.handle_data, y, df)
  692. def test_formula_missing_extra_arrays():
  693. np.random.seed(1)
  694. # because patsy cannot turn off missing data-handling as of 0.3.0, we need
  695. # separate tests to make sure that missing values are handled correctly
  696. # when going through formulas
  697. # there is a handle_formula_data step
  698. # then there is the regular handle_data step
  699. # see GH#2083
  700. # the untested cases are endog/exog have missing. extra has missing.
  701. # endog/exog are fine. extra has missing.
  702. # endog/exog do or do not have missing and extra has wrong dimension
  703. y = np.random.randn(10)
  704. y_missing = y.copy()
  705. y_missing[[2, 5]] = np.nan
  706. X = np.random.randn(10)
  707. X_missing = X.copy()
  708. X_missing[[1, 3]] = np.nan
  709. weights = np.random.uniform(size=10)
  710. weights_missing = weights.copy()
  711. weights_missing[[6]] = np.nan
  712. weights_wrong_size = np.random.randn(12)
  713. data = {'y': y,
  714. 'X': X,
  715. 'y_missing': y_missing,
  716. 'X_missing': X_missing,
  717. 'weights': weights,
  718. 'weights_missing': weights_missing}
  719. data = pd.DataFrame.from_dict(data)
  720. data['constant'] = 1
  721. formula = 'y_missing ~ X_missing'
  722. ((endog, exog),
  723. missing_idx, design_info) = handle_formula_data(data, None, formula,
  724. depth=2,
  725. missing='drop')
  726. kwargs = {'missing_idx': missing_idx, 'missing': 'drop',
  727. 'weights': data['weights_missing']}
  728. model_data = sm_data.handle_data(endog, exog, **kwargs)
  729. data_nona = data.dropna()
  730. assert_equal(data_nona['y'].values, model_data.endog)
  731. assert_equal(data_nona[['constant', 'X']].values, model_data.exog)
  732. assert_equal(data_nona['weights'].values, model_data.weights)
  733. tmp = handle_formula_data(data, None, formula, depth=2, missing='drop')
  734. (endog, exog), missing_idx, design_info = tmp
  735. weights_2d = np.random.randn(10, 10)
  736. weights_2d[[8, 7], [7, 8]] = np.nan # symmetric missing values
  737. kwargs.update({'weights': weights_2d,
  738. 'missing_idx': missing_idx})
  739. model_data2 = sm_data.handle_data(endog, exog, **kwargs)
  740. good_idx = [0, 4, 6, 9]
  741. assert_equal(data.loc[good_idx, 'y'], model_data2.endog)
  742. assert_equal(data.loc[good_idx, ['constant', 'X']], model_data2.exog)
  743. assert_equal(weights_2d[good_idx][:, good_idx], model_data2.weights)
  744. tmp = handle_formula_data(data, None, formula, depth=2, missing='drop')
  745. (endog, exog), missing_idx, design_info = tmp
  746. kwargs.update({'weights': weights_wrong_size,
  747. 'missing_idx': missing_idx})
  748. assert_raises(ValueError, sm_data.handle_data, endog, exog, **kwargs)
  749. def test_raise_nonfinite_exog():
  750. # we raise now in the has constant check before hitting the linear algebra
  751. from statsmodels.tools.sm_exceptions import MissingDataError
  752. x = np.arange(10)[:, None]**([0., 1.])
  753. # random numbers for y
  754. y = np.array([-0.6, -0.1, 0., -0.7, -0.5, 0.5, 0.1, -0.8, -2., 1.1])
  755. x[1, 1] = np.inf
  756. assert_raises(MissingDataError, OLS, y, x)
  757. x[1, 1] = np.nan
  758. assert_raises(MissingDataError, OLS, y, x)