PageRenderTime 61ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/tests/test_groupby.py

https://github.com/benracine/pandas
Python | 1061 lines | 799 code | 219 blank | 43 comment | 45 complexity | 527a575d0b473b78dc26fbab532a0abd MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import nose
  2. import unittest
  3. from datetime import datetime
  4. from numpy import nan
  5. from pandas.core.daterange import DateRange
  6. from pandas.core.index import Index, MultiIndex
  7. from pandas.core.common import rands, groupby
  8. from pandas.core.frame import DataFrame
  9. from pandas.core.groupby import GroupByError
  10. from pandas.core.series import Series
  11. from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
  12. assert_series_equal, assert_almost_equal)
  13. from pandas.core.panel import Panel
  14. from collections import defaultdict
  15. import pandas._tseries as lib
  16. import pandas.core.datetools as dt
  17. import numpy as np
  18. import pandas.util.testing as tm
  19. def commonSetUp(self):
  20. self.dateRange = DateRange('1/1/2005', periods=250, offset=dt.bday)
  21. self.stringIndex = Index([rands(8).upper() for x in xrange(250)])
  22. self.groupId = Series([x[0] for x in self.stringIndex],
  23. index=self.stringIndex)
  24. self.groupDict = dict((k, v) for k, v in self.groupId.iteritems())
  25. self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])
  26. randMat = np.random.randn(250, 5)
  27. self.stringMatrix = DataFrame(randMat, columns=self.columnIndex,
  28. index=self.stringIndex)
  29. self.timeMatrix = DataFrame(randMat, columns=self.columnIndex,
  30. index=self.dateRange)
  31. class TestGroupBy(unittest.TestCase):
  32. def setUp(self):
  33. self.ts = tm.makeTimeSeries()
  34. self.seriesd = tm.getSeriesData()
  35. self.tsd = tm.getTimeSeriesData()
  36. self.frame = DataFrame(self.seriesd)
  37. self.tsframe = DataFrame(self.tsd)
  38. self.df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
  39. 'foo', 'bar', 'foo', 'foo'],
  40. 'B' : ['one', 'one', 'two', 'three',
  41. 'two', 'two', 'one', 'three'],
  42. 'C' : np.random.randn(8),
  43. 'D' : np.random.randn(8)})
  44. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
  45. ['one', 'two', 'three']],
  46. labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  47. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  48. names=['first', 'second'])
  49. self.mframe = DataFrame(np.random.randn(10, 3), index=index,
  50. columns=['A', 'B', 'C'])
  51. def test_basic(self):
  52. data = Series(np.arange(9) // 3, index=np.arange(9))
  53. index = np.arange(9)
  54. np.random.shuffle(index)
  55. data = data.reindex(index)
  56. grouped = data.groupby(lambda x: x // 3)
  57. for k, v in grouped:
  58. self.assertEqual(len(v), 3)
  59. agged = grouped.aggregate(np.mean)
  60. self.assertEqual(agged[1], 1)
  61. assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
  62. assert_series_equal(agged, grouped.mean())
  63. # Cython only returning floating point for now...
  64. assert_series_equal(grouped.agg(np.sum).astype(float),
  65. grouped.sum())
  66. transformed = grouped.transform(lambda x: x * x.sum())
  67. self.assertEqual(transformed[7], 12)
  68. value_grouped = data.groupby(data)
  69. assert_series_equal(value_grouped.aggregate(np.mean), agged)
  70. # complex agg
  71. agged = grouped.aggregate([np.mean, np.std])
  72. agged = grouped.aggregate({'one' : np.mean,
  73. 'two' : np.std})
  74. group_constants = {
  75. 0 : 10,
  76. 1 : 20,
  77. 2 : 30
  78. }
  79. agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
  80. self.assertEqual(agged[1], 21)
  81. # corner cases
  82. self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
  83. def test_agg_regression1(self):
  84. grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
  85. result = grouped.agg(np.mean)
  86. expected = grouped.mean()
  87. assert_frame_equal(result, expected)
  88. def test_agg_must_agg(self):
  89. grouped = self.df.groupby('A')['C']
  90. self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
  91. self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])
  92. def test_get_group(self):
  93. wp = tm.makePanel()
  94. grouped = wp.groupby(lambda x: x.month, axis='major')
  95. gp = grouped.get_group(1)
  96. expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
  97. assert_panel_equal(gp, expected)
  98. def test_agg_apply_corner(self):
  99. # nothing to group, all NA
  100. grouped = self.ts.groupby(self.ts * np.nan)
  101. assert_series_equal(grouped.sum(), Series([]))
  102. assert_series_equal(grouped.agg(np.sum), Series([]))
  103. assert_series_equal(grouped.apply(np.sum), Series([]))
  104. # DataFrame
  105. grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
  106. assert_frame_equal(grouped.sum(), DataFrame({}))
  107. assert_frame_equal(grouped.agg(np.sum), DataFrame({}))
  108. assert_frame_equal(grouped.apply(np.sum), DataFrame({}))
  109. def test_len(self):
  110. df = tm.makeTimeDataFrame()
  111. grouped = df.groupby([lambda x: x.year,
  112. lambda x: x.month,
  113. lambda x: x.day])
  114. self.assertEquals(len(grouped), len(df))
  115. grouped = df.groupby([lambda x: x.year,
  116. lambda x: x.month])
  117. expected = len(set([(x.year, x.month) for x in df.index]))
  118. self.assertEquals(len(grouped), expected)
  119. def test_groups(self):
  120. grouped = self.df.groupby(['A'])
  121. groups = grouped.groups
  122. self.assert_(groups is grouped.groups) # caching works
  123. for k, v in grouped.groups.iteritems():
  124. self.assert_((self.df.ix[v]['A'] == k).all())
  125. grouped = self.df.groupby(['A', 'B'])
  126. groups = grouped.groups
  127. self.assert_(groups is grouped.groups) # caching works
  128. for k, v in grouped.groups.iteritems():
  129. self.assert_((self.df.ix[v]['A'] == k[0]).all())
  130. self.assert_((self.df.ix[v]['B'] == k[1]).all())
  131. def test_aggregate_str_func(self):
  132. def _check_results(grouped):
  133. # single series
  134. result = grouped['A'].agg('std')
  135. expected = grouped['A'].std()
  136. assert_series_equal(result, expected)
  137. # group frame by function name
  138. result = grouped.aggregate('var')
  139. expected = grouped.var()
  140. assert_frame_equal(result, expected)
  141. # group frame by function dict
  142. result = grouped.agg({'A' : 'var', 'B' : 'std', 'C' : 'mean'})
  143. expected = DataFrame({'A' : grouped['A'].var(),
  144. 'B' : grouped['B'].std(),
  145. 'C' : grouped['C'].mean()})
  146. assert_frame_equal(result, expected)
  147. by_weekday = self.tsframe.groupby(lambda x: x.weekday())
  148. _check_results(by_weekday)
  149. by_mwkday = self.tsframe.groupby([lambda x: x.month,
  150. lambda x: x.weekday()])
  151. _check_results(by_mwkday)
  152. def test_basic_regression(self):
  153. # regression
  154. T = [1.0*x for x in range(1,10) *10][:1095]
  155. result = Series(T, range(0, len(T)))
  156. groupings = np.random.random((1100,))
  157. groupings = Series(groupings, range(0, len(groupings))) * 10.
  158. grouped = result.groupby(groupings)
  159. grouped.mean()
  160. def test_transform(self):
  161. data = Series(np.arange(9) // 3, index=np.arange(9))
  162. index = np.arange(9)
  163. np.random.shuffle(index)
  164. data = data.reindex(index)
  165. grouped = data.groupby(lambda x: x // 3)
  166. transformed = grouped.transform(lambda x: x * x.sum())
  167. self.assertEqual(transformed[7], 12)
  168. def test_transform_broadcast(self):
  169. grouped = self.ts.groupby(lambda x: x.month)
  170. result = grouped.transform(np.mean)
  171. self.assert_(result.index.equals(self.ts.index))
  172. for _, gp in grouped:
  173. self.assert_((result.reindex(gp.index) == gp.mean()).all())
  174. grouped = self.tsframe.groupby(lambda x: x.month)
  175. result = grouped.transform(np.mean)
  176. self.assert_(result.index.equals(self.tsframe.index))
  177. for _, gp in grouped:
  178. agged = gp.mean()
  179. res = result.reindex(gp.index)
  180. for col in self.tsframe:
  181. self.assert_((res[col] == agged[col]).all())
  182. # group columns
  183. grouped = self.tsframe.groupby({'A' : 0, 'B' : 0, 'C' : 1, 'D' : 1},
  184. axis=1)
  185. result = grouped.transform(np.mean)
  186. self.assert_(result.index.equals(self.tsframe.index))
  187. self.assert_(result.columns.equals(self.tsframe.columns))
  188. for _, gp in grouped:
  189. agged = gp.mean(1)
  190. res = result.reindex(columns=gp.columns)
  191. for idx in gp.index:
  192. self.assert_((res.xs(idx) == agged[idx]).all())
  193. def test_transform_multiple(self):
  194. grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month])
  195. transformed = grouped.transform(lambda x: x * 2)
  196. broadcasted = grouped.transform(np.mean)
  197. def test_dispatch_transform(self):
  198. df = self.tsframe[::5].reindex(self.tsframe.index)
  199. grouped = df.groupby(lambda x: x.month)
  200. filled = grouped.fillna(method='pad')
  201. fillit = lambda x: x.fillna(method='pad')
  202. expected = df.groupby(lambda x: x.month).transform(fillit)
  203. assert_frame_equal(filled, expected)
  204. def test_with_na(self):
  205. index = Index(np.arange(10))
  206. values = Series(np.ones(10), index)
  207. labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
  208. 'bar', nan, 'foo'], index=index)
  209. grouped = values.groupby(labels)
  210. agged = grouped.agg(len)
  211. expected = Series([4, 2], index=['bar', 'foo'])
  212. assert_series_equal(agged, expected)
  213. def test_attr_wrapper(self):
  214. grouped = self.ts.groupby(lambda x: x.weekday())
  215. result = grouped.std()
  216. expected = grouped.agg(lambda x: np.std(x, ddof=1))
  217. assert_series_equal(result, expected)
  218. # this is pretty cool
  219. result = grouped.describe()
  220. expected = {}
  221. for name, gp in grouped:
  222. expected[name] = gp.describe()
  223. expected = DataFrame(expected).T
  224. assert_frame_equal(result, expected)
  225. # get attribute
  226. result = grouped.dtype
  227. expected = grouped.agg(lambda x: x.dtype)
  228. # make sure raises error
  229. self.assertRaises(AttributeError, getattr, grouped, 'foo')
  230. def test_series_describe_multikey(self):
  231. ts = tm.makeTimeSeries()
  232. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  233. result = grouped.describe()
  234. assert_series_equal(result['mean'], grouped.mean())
  235. assert_series_equal(result['std'], grouped.std())
  236. assert_series_equal(result['min'], grouped.min())
  237. def test_series_describe_single(self):
  238. ts = tm.makeTimeSeries()
  239. grouped = ts.groupby(lambda x: x.month)
  240. result = grouped.apply(lambda x: x.describe())
  241. expected = grouped.describe()
  242. assert_frame_equal(result, expected)
  243. def test_series_agg_multikey(self):
  244. ts = tm.makeTimeSeries()
  245. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  246. result = grouped.agg(np.sum)
  247. expected = grouped.sum()
  248. assert_series_equal(result, expected)
  249. def test_frame_describe_multikey(self):
  250. grouped = self.tsframe.groupby([lambda x: x.year,
  251. lambda x: x.month])
  252. result = grouped.describe()
  253. for col in self.tsframe:
  254. expected = grouped[col].describe()
  255. assert_frame_equal(result[col].unstack(), expected)
  256. groupedT = self.tsframe.groupby({'A' : 0, 'B' : 0,
  257. 'C' : 1, 'D' : 1}, axis=1)
  258. result = groupedT.describe()
  259. for name, group in groupedT:
  260. assert_frame_equal(result[name], group.describe())
  261. def test_frame_groupby(self):
  262. grouped = self.tsframe.groupby(lambda x: x.weekday())
  263. # aggregate
  264. aggregated = grouped.aggregate(np.mean)
  265. self.assertEqual(len(aggregated), 5)
  266. self.assertEqual(len(aggregated.columns), 4)
  267. # by string
  268. tscopy = self.tsframe.copy()
  269. tscopy['weekday'] = [x.weekday() for x in tscopy.index]
  270. stragged = tscopy.groupby('weekday').aggregate(np.mean)
  271. assert_frame_equal(stragged, aggregated)
  272. # transform
  273. transformed = grouped.transform(lambda x: x - x.mean())
  274. self.assertEqual(len(transformed), 30)
  275. self.assertEqual(len(transformed.columns), 4)
  276. # transform propagate
  277. transformed = grouped.transform(lambda x: x.mean())
  278. for name, group in grouped:
  279. mean = group.mean()
  280. for idx in group.index:
  281. assert_almost_equal(transformed.xs(idx), mean)
  282. # iterate
  283. for weekday, group in grouped:
  284. self.assert_(group.index[0].weekday() == weekday)
  285. # groups / group_indices
  286. groups = grouped.primary.groups
  287. indices = grouped.primary.indices
  288. for k, v in groups.iteritems():
  289. samething = self.tsframe.index.take(indices[k])
  290. self.assert_(np.array_equal(v, samething))
  291. def test_frame_groupby_columns(self):
  292. mapping = {
  293. 'A' : 0, 'B' : 0, 'C' : 1, 'D' : 1
  294. }
  295. grouped = self.tsframe.groupby(mapping, axis=1)
  296. # aggregate
  297. aggregated = grouped.aggregate(np.mean)
  298. self.assertEqual(len(aggregated), len(self.tsframe))
  299. self.assertEqual(len(aggregated.columns), 2)
  300. # transform
  301. tf = lambda x: x - x.mean()
  302. groupedT = self.tsframe.T.groupby(mapping, axis=0)
  303. assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))
  304. # iterate
  305. for k, v in grouped:
  306. self.assertEqual(len(v.columns), 2)
  307. def test_frame_set_name_single(self):
  308. grouped = self.df.groupby('A')
  309. result = grouped.mean()
  310. self.assert_(result.index.name == 'A')
  311. result = self.df.groupby('A', as_index=False).mean()
  312. self.assert_(result.index.name != 'A')
  313. result = grouped.agg(np.mean)
  314. self.assert_(result.index.name == 'A')
  315. result = grouped.agg({'C' : np.mean, 'D' : np.std})
  316. self.assert_(result.index.name == 'A')
  317. result = grouped['C'].mean()
  318. self.assert_(result.index.name == 'A')
  319. result = grouped['C'].agg(np.mean)
  320. self.assert_(result.index.name == 'A')
  321. result = grouped['C'].agg([np.mean, np.std])
  322. self.assert_(result.index.name == 'A')
  323. result = grouped['C'].agg({'foo' : np.mean, 'bar' : np.std})
  324. self.assert_(result.index.name == 'A')
  325. def test_multi_iter(self):
  326. s = Series(np.arange(6))
  327. k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
  328. k2 = np.array(['1', '2', '1', '2', '1', '2'])
  329. grouped = s.groupby([k1, k2])
  330. iterated = list(grouped)
  331. expected = [('a', '1', s[[0, 2]]),
  332. ('a', '2', s[[1]]),
  333. ('b', '1', s[[4]]),
  334. ('b', '2', s[[3, 5]])]
  335. for i, ((one, two), three) in enumerate(iterated):
  336. e1, e2, e3 = expected[i]
  337. self.assert_(e1 == one)
  338. self.assert_(e2 == two)
  339. assert_series_equal(three, e3)
  340. def test_multi_iter_frame(self):
  341. k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
  342. k2 = np.array(['1', '2', '1', '2', '1', '2'])
  343. df = DataFrame({'v1' : np.random.randn(6),
  344. 'v2' : np.random.randn(6),
  345. 'k1' : k1, 'k2' : k2},
  346. index=['one', 'two', 'three', 'four', 'five', 'six'])
  347. grouped = df.groupby(['k1', 'k2'])
  348. # things get sorted!
  349. iterated = list(grouped)
  350. idx = df.index
  351. expected = [('a', '1', df.ix[idx[[4]]]),
  352. ('a', '2', df.ix[idx[[3, 5]]]),
  353. ('b', '1', df.ix[idx[[0, 2]]]),
  354. ('b', '2', df.ix[idx[[1]]])]
  355. for i, ((one, two), three) in enumerate(iterated):
  356. e1, e2, e3 = expected[i]
  357. self.assert_(e1 == one)
  358. self.assert_(e2 == two)
  359. assert_frame_equal(three, e3)
  360. # don't iterate through groups with no data
  361. df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
  362. df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
  363. grouped = df.groupby(['k1', 'k2'])
  364. groups = {}
  365. for key, gp in grouped:
  366. groups[key] = gp
  367. self.assertEquals(len(groups), 2)
  368. def test_multi_iter_panel(self):
  369. wp = tm.makePanel()
  370. grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()],
  371. axis=1)
  372. for (month, wd), group in grouped:
  373. exp_axis = [x for x in wp.major_axis
  374. if x.month == month and x.weekday() == wd]
  375. expected = wp.reindex(major=exp_axis)
  376. assert_panel_equal(group, expected)
  377. def test_multi_func(self):
  378. col1 = self.df['A']
  379. col2 = self.df['B']
  380. grouped = self.df.groupby([col1.get, col2.get])
  381. agged = grouped.mean()
  382. expected = self.df.groupby(['A', 'B']).mean()
  383. assert_frame_equal(agged.ix[:, ['C', 'D']],
  384. expected.ix[:, ['C', 'D']])
  385. # some "groups" with no data
  386. df = DataFrame({'v1' : np.random.randn(6),
  387. 'v2' : np.random.randn(6),
  388. 'k1' : np.array(['b', 'b', 'b', 'a', 'a', 'a']),
  389. 'k2' : np.array(['1', '1', '1', '2', '2', '2'])},
  390. index=['one', 'two', 'three', 'four', 'five', 'six'])
  391. # only verify that it works for now
  392. grouped = df.groupby(['k1', 'k2'])
  393. grouped.agg(np.sum)
  394. def test_multi_key_multiple_functions(self):
  395. grouped = self.df.groupby(['A', 'B'])['C']
  396. agged = grouped.agg([np.mean, np.std])
  397. expected = DataFrame({'mean' : grouped.agg(np.mean),
  398. 'std' : grouped.agg(np.std)})
  399. assert_frame_equal(agged, expected)
  400. def test_groupby_multiple_columns(self):
  401. data = self.df
  402. grouped = data.groupby(['A', 'B'])
  403. def _check_op(op):
  404. result1 = op(grouped)
  405. expected = defaultdict(dict)
  406. for n1, gp1 in data.groupby('A'):
  407. for n2, gp2 in gp1.groupby('B'):
  408. expected[n1][n2] = op(gp2.ix[:, ['C', 'D']])
  409. expected = dict((k, DataFrame(v)) for k, v in expected.iteritems())
  410. expected = Panel.fromDict(expected).swapaxes(0, 1)
  411. # a little bit crude
  412. for col in ['C', 'D']:
  413. result_col = op(grouped[col])
  414. exp = expected[col]
  415. pivoted = result1[col].unstack()
  416. pivoted2 = result_col.unstack()
  417. assert_frame_equal(pivoted.reindex_like(exp), exp)
  418. assert_frame_equal(pivoted2.reindex_like(exp), exp)
  419. _check_op(lambda x: x.sum())
  420. _check_op(lambda x: x.mean())
  421. # test single series works the same
  422. result = data['C'].groupby([data['A'], data['B']]).mean()
  423. expected = data.groupby(['A', 'B']).mean()['C']
  424. assert_series_equal(result, expected)
  425. def test_groupby_as_index_agg(self):
  426. grouped = self.df.groupby('A', as_index=False)
  427. # single-key
  428. result = grouped.agg(np.mean)
  429. expected = grouped.mean()
  430. assert_frame_equal(result, expected)
  431. result2 = grouped.agg({'C' : np.mean, 'D' : np.sum})
  432. expected2 = grouped.mean()
  433. expected2['D'] = grouped.sum()['D']
  434. assert_frame_equal(result2, expected2)
  435. # multi-key
  436. grouped = self.df.groupby(['A', 'B'], as_index=False)
  437. result = grouped.agg(np.mean)
  438. expected = grouped.mean()
  439. assert_frame_equal(result, expected)
  440. result2 = grouped.agg({'C' : np.mean, 'D' : np.sum})
  441. expected2 = grouped.mean()
  442. expected2['D'] = grouped.sum()['D']
  443. assert_frame_equal(result2, expected2)
  444. def test_as_index_series_return_frame(self):
  445. grouped = self.df.groupby('A', as_index=False)
  446. grouped2 = self.df.groupby(['A', 'B'], as_index=False)
  447. result = grouped['C'].agg(np.sum)
  448. expected = grouped.agg(np.sum).ix[:, ['A', 'C']]
  449. self.assert_(isinstance(result, DataFrame))
  450. assert_frame_equal(result, expected)
  451. result2 = grouped2['C'].agg(np.sum)
  452. expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']]
  453. self.assert_(isinstance(result2, DataFrame))
  454. assert_frame_equal(result2, expected2)
  455. result = grouped['C'].sum()
  456. expected = grouped.sum().ix[:, ['A', 'C']]
  457. self.assert_(isinstance(result, DataFrame))
  458. assert_frame_equal(result, expected)
  459. result2 = grouped2['C'].sum()
  460. expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']]
  461. self.assert_(isinstance(result2, DataFrame))
  462. assert_frame_equal(result2, expected2)
  463. # corner case
  464. self.assertRaises(Exception, grouped['C'].__getitem__,
  465. 'D')
  466. def test_groupby_as_index_cython(self):
  467. data = self.df
  468. # single-key
  469. grouped = data.groupby('A', as_index=False)
  470. result = grouped.mean()
  471. expected = data.groupby(['A']).mean()
  472. expected.insert(0, 'A', expected.index)
  473. expected.index = np.arange(len(expected))
  474. assert_frame_equal(result, expected)
  475. # multi-key
  476. grouped = data.groupby(['A', 'B'], as_index=False)
  477. result = grouped.mean()
  478. expected = data.groupby(['A', 'B']).mean()
  479. arrays = zip(*expected.index.get_tuple_index())
  480. expected.insert(0, 'A', arrays[0])
  481. expected.insert(1, 'B', arrays[1])
  482. expected.index = np.arange(len(expected))
  483. assert_frame_equal(result, expected)
  484. def test_groupby_as_index_corner(self):
  485. self.assertRaises(TypeError, self.ts.groupby,
  486. lambda x: x.weekday(), as_index=False)
  487. self.assertRaises(ValueError, self.df.groupby,
  488. lambda x: x.lower(), as_index=False, axis=1)
  489. def test_groupby_multiple_key(self):
  490. df = tm.makeTimeDataFrame()
  491. grouped = df.groupby([lambda x: x.year,
  492. lambda x: x.month,
  493. lambda x: x.day])
  494. agged = grouped.sum()
  495. assert_almost_equal(df.values, agged.values)
  496. grouped = df.T.groupby([lambda x: x.year,
  497. lambda x: x.month,
  498. lambda x: x.day], axis=1)
  499. agged = grouped.agg(lambda x: x.sum(1))
  500. self.assert_(agged.index.equals(df.columns))
  501. assert_almost_equal(df.T.values, agged.values)
  502. agged = grouped.agg(lambda x: x.sum(1))
  503. assert_almost_equal(df.T.values, agged.values)
  504. def test_groupby_multi_corner(self):
  505. # test that having an all-NA column doesn't mess you up
  506. df = self.df.copy()
  507. df['bad'] = np.nan
  508. agged = df.groupby(['A', 'B']).mean()
  509. expected = self.df.groupby(['A', 'B']).mean()
  510. expected['bad'] = np.nan
  511. assert_frame_equal(agged, expected)
  512. def test_omit_nuisance(self):
  513. grouped = self.df.groupby('A')
  514. result = grouped.mean()
  515. expected = self.df.ix[:, ['A', 'C', 'D']].groupby('A').mean()
  516. assert_frame_equal(result, expected)
  517. df = self.df.ix[:, ['A', 'C', 'D']]
  518. df['E'] = datetime.now()
  519. grouped = df.groupby('A')
  520. result = grouped.agg(np.sum)
  521. expected = grouped.sum()
  522. assert_frame_equal(result, expected)
  523. # won't work with axis = 1
  524. grouped = df.groupby({'A' : 0, 'C' : 0, 'D' : 1, 'E' : 1}, axis=1)
  525. result = self.assertRaises(TypeError, grouped.agg, np.sum)
  526. def test_nonsense_func(self):
  527. df = DataFrame([0])
  528. self.assertRaises(Exception, df.groupby, lambda x: x + 'foo')
  529. def test_cythonized_aggers(self):
  530. data = {'A' : [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan],
  531. 'B' : ['A', 'B'] * 6,
  532. 'C' : np.random.randn(12)}
  533. df = DataFrame(data)
  534. df['C'][2:10:2] = nan
  535. def _testit(op):
  536. # single column
  537. grouped = df.drop(['B'], axis=1).groupby('A')
  538. exp = {}
  539. for cat, group in grouped:
  540. exp[cat] = op(group['C'])
  541. exp = DataFrame({'C' : exp})
  542. result = op(grouped)
  543. assert_frame_equal(result, exp)
  544. # multiple columns
  545. grouped = df.groupby(['A', 'B'])
  546. expd = {}
  547. for (cat1, cat2), group in grouped:
  548. expd.setdefault(cat1, {})[cat2] = op(group['C'])
  549. exp = DataFrame(expd).T.stack(dropna=False)
  550. result = op(grouped)['C']
  551. assert_series_equal(result, exp)
  552. _testit(lambda x: x.sum())
  553. _testit(lambda x: x.mean())
  554. def test_cython_agg_boolean(self):
  555. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  556. 'b': np.random.randint(0, 2, 50).astype('bool')})
  557. result = frame.groupby('a')['b'].mean()
  558. expected = frame.groupby('a')['b'].agg(np.mean)
  559. assert_series_equal(result, expected)
  560. def test_cython_agg_nothing_to_agg(self):
  561. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  562. 'b': ['foo', 'bar'] * 25})
  563. self.assertRaises(GroupByError, frame.groupby('a')['b'].mean)
  564. def test_grouping_attrs(self):
  565. deleveled = self.mframe.delevel()
  566. grouped = deleveled.groupby(['first', 'second'])
  567. for i, ping in enumerate(grouped.groupings):
  568. the_counts = self.mframe.groupby(level=i).count()['A']
  569. assert_almost_equal(ping.counts, the_counts)
  570. def test_groupby_level(self):
  571. frame = self.mframe
  572. deleveled = frame.delevel()
  573. result0 = frame.groupby(level=0).sum()
  574. result1 = frame.groupby(level=1).sum()
  575. expected0 = frame.groupby(deleveled['first']).sum()
  576. expected1 = frame.groupby(deleveled['second']).sum()
  577. assert_frame_equal(result0, expected0)
  578. assert_frame_equal(result1, expected1)
  579. self.assertEquals(result0.index.name, frame.index.names[0])
  580. self.assertEquals(result1.index.name, frame.index.names[1])
  581. # groupby level name
  582. result0 = frame.groupby(level='first').sum()
  583. result1 = frame.groupby(level='second').sum()
  584. assert_frame_equal(result0, expected0)
  585. assert_frame_equal(result1, expected1)
  586. # axis=1
  587. result0 = frame.T.groupby(level=0, axis=1).sum()
  588. result1 = frame.T.groupby(level=1, axis=1).sum()
  589. assert_frame_equal(result0, expected0.T)
  590. assert_frame_equal(result1, expected1.T)
  591. # raise exception for non-MultiIndex
  592. self.assertRaises(ValueError, self.df.groupby, level=0)
  593. def test_groupby_level_mapper(self):
  594. frame = self.mframe
  595. deleveled = frame.delevel()
  596. mapper0 = {'foo' : 0, 'bar' : 0,
  597. 'baz' : 1, 'qux' : 1}
  598. mapper1 = {'one' : 0, 'two' : 0, 'three' : 1}
  599. result0 = frame.groupby(mapper0, level=0).sum()
  600. result1 = frame.groupby(mapper1, level=1).sum()
  601. mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
  602. mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
  603. expected0 = frame.groupby(mapped_level0).sum()
  604. expected1 = frame.groupby(mapped_level1).sum()
  605. assert_frame_equal(result0, expected0)
  606. assert_frame_equal(result1, expected1)
  607. def test_cython_fail_agg(self):
  608. dr = DateRange('1/1/2000', periods=50)
  609. ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)
  610. grouped = ts.groupby(lambda x: x.month)
  611. summed = grouped.sum()
  612. expected = grouped.agg(np.sum)
  613. assert_series_equal(summed, expected)
  614. def test_apply_series_to_frame(self):
  615. def f(piece):
  616. return DataFrame({'value' : piece,
  617. 'demeaned' : piece - piece.mean(),
  618. 'logged' : np.log(piece)})
  619. dr = DateRange('1/1/2000', periods=100)
  620. ts = Series(np.random.randn(100), index=dr)
  621. grouped = ts.groupby(lambda x: x.month)
  622. result = grouped.apply(f)
  623. self.assert_(isinstance(result, DataFrame))
  624. self.assert_(result.index.equals(ts.index))
  625. def test_apply_frame_to_series(self):
  626. grouped = self.df.groupby(['A', 'B'])
  627. result = grouped.apply(len)
  628. expected = grouped.count()['C']
  629. self.assert_(result.index.equals(expected.index))
  630. self.assert_(np.array_equal(result.values, expected.values))
  631. def test_apply_transform(self):
  632. grouped = self.ts.groupby(lambda x: x.month)
  633. result = grouped.apply(lambda x: x * 2)
  634. expected = grouped.transform(lambda x: x * 2)
  635. assert_series_equal(result, expected)
  636. def test_apply_multikey_corner(self):
  637. grouped = self.tsframe.groupby([lambda x: x.year,
  638. lambda x: x.month])
  639. def f(group):
  640. return group.sort('A')[-5:]
  641. result = grouped.apply(f)
  642. for key, group in grouped:
  643. assert_frame_equal(result.ix[key], f(group))
  644. def test_groupby_series_indexed_differently(self):
  645. s1 = Series([5.0,-9.0,4.0,100.,-5.,55.,6.7],
  646. index=Index(['a','b','c','d','e','f','g']))
  647. s2 = Series([1.0,1.0,4.0,5.0,5.0,7.0],
  648. index=Index(['a','b','d','f','g','h']))
  649. grouped = s1.groupby(s2)
  650. agged = grouped.mean()
  651. exp = s1.groupby(s2.reindex(s1.index).get).mean()
  652. assert_series_equal(agged, exp)
  653. def test_groupby_with_hier_columns(self):
  654. tuples = zip(*[['bar', 'bar', 'baz', 'baz',
  655. 'foo', 'foo', 'qux', 'qux'],
  656. ['one', 'two', 'one', 'two',
  657. 'one', 'two', 'one', 'two']])
  658. index = MultiIndex.from_tuples(tuples)
  659. columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
  660. ('B', 'cat'), ('A', 'dog')])
  661. df = DataFrame(np.random.randn(8, 4), index=index,
  662. columns=columns)
  663. result = df.groupby(level=0).mean()
  664. self.assert_(result.columns.equals(columns))
  665. result = df.groupby(level=0, axis=1).mean()
  666. self.assert_(result.index.equals(df.index))
  667. result = df.groupby(level=0).agg(np.mean)
  668. self.assert_(result.columns.equals(columns))
  669. result = df.groupby(level=0).apply(lambda x: x.mean())
  670. self.assert_(result.columns.equals(columns))
  671. result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
  672. self.assert_(result.columns.equals(Index(['A', 'B'])))
  673. self.assert_(result.index.equals(df.index))
  674. # add a nuisance column
  675. sorted_columns, _ = columns.sortlevel(0)
  676. df['A', 'foo'] = 'bar'
  677. result = df.groupby(level=0).mean()
  678. self.assert_(result.columns.equals(sorted_columns))
  679. def test_pass_args_kwargs(self):
  680. from scipy.stats import scoreatpercentile
  681. def f(x, q=None):
  682. return scoreatpercentile(x, q)
  683. g = lambda x: scoreatpercentile(x, 80)
  684. # Series
  685. ts_grouped = self.ts.groupby(lambda x: x.month)
  686. agg_result = ts_grouped.agg(scoreatpercentile, 80)
  687. apply_result = ts_grouped.apply(scoreatpercentile, 80)
  688. trans_result = ts_grouped.transform(scoreatpercentile, 80)
  689. agg_expected = ts_grouped.quantile(.8)
  690. trans_expected = ts_grouped.transform(g)
  691. assert_series_equal(apply_result, agg_expected)
  692. assert_series_equal(agg_result, agg_expected)
  693. assert_series_equal(trans_result, trans_expected)
  694. agg_result = ts_grouped.agg(f, q=80)
  695. apply_result = ts_grouped.apply(f, q=80)
  696. trans_result = ts_grouped.transform(f, q=80)
  697. assert_series_equal(agg_result, agg_expected)
  698. assert_series_equal(apply_result, agg_expected)
  699. assert_series_equal(trans_result, trans_expected)
  700. # DataFrame
  701. df_grouped = self.tsframe.groupby(lambda x: x.month)
  702. agg_result = df_grouped.agg(scoreatpercentile, 80)
  703. apply_result = df_grouped.apply(DataFrame.quantile, .8)
  704. expected = df_grouped.quantile(.8)
  705. assert_frame_equal(apply_result, expected)
  706. assert_frame_equal(agg_result, expected)
  707. agg_result = df_grouped.agg(f, q=80)
  708. apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
  709. assert_frame_equal(agg_result, expected)
  710. assert_frame_equal(apply_result, expected)
  711. def test_cython_na_bug(self):
  712. values = np.random.randn(10)
  713. shape = (5, 5)
  714. label_list = [np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype=np.int32),
  715. np.array([1, 2, 3, 4, 0, 1, 2, 3, 3, 4], dtype=np.int32)]
  716. lib.group_aggregate(values, label_list, shape)
  717. def test_size(self):
  718. grouped = self.df.groupby(['A', 'B'])
  719. result = grouped.size()
  720. for key, group in grouped:
  721. self.assertEquals(result[key], len(group))
  722. grouped = self.df.groupby('A')
  723. result = grouped.size()
  724. for key, group in grouped:
  725. self.assertEquals(result[key], len(group))
  726. grouped = self.df.groupby('B')
  727. result = grouped.size()
  728. for key, group in grouped:
  729. self.assertEquals(result[key], len(group))
  730. def test_grouping_ndarray(self):
  731. grouped = self.df.groupby(self.df['A'].values)
  732. result = grouped.sum()
  733. expected = self.df.groupby('A').sum()
  734. assert_frame_equal(result, expected)
  735. def test_apply_typecast_fail(self):
  736. df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.],
  737. 'c' : np.tile(['a','b','c'], 2),
  738. 'v' : np.arange(1., 7.)})
  739. def f(group):
  740. v = group['v']
  741. group['v2'] = (v - v.min()) / (v.max() - v.min())
  742. return group
  743. result = df.groupby('d').apply(f)
  744. expected = df.copy()
  745. expected['v2'] = np.tile([0., 0.5, 1], 2)
  746. assert_frame_equal(result, expected)
  747. def test_apply_multiindex_fail(self):
  748. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  749. [1, 2, 3, 1, 2, 3]])
  750. df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.],
  751. 'c' : np.tile(['a','b','c'], 2),
  752. 'v' : np.arange(1., 7.)}, index=index)
  753. def f(group):
  754. v = group['v']
  755. group['v2'] = (v - v.min()) / (v.max() - v.min())
  756. return group
  757. result = df.groupby('d').apply(f)
  758. expected = df.copy()
  759. expected['v2'] = np.tile([0., 0.5, 1], 2)
  760. assert_frame_equal(result, expected)
  761. def test_apply_corner(self):
  762. result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
  763. expected = self.tsframe * 2
  764. assert_frame_equal(result, expected)
  765. def test_transform_mixed_type(self):
  766. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  767. [1, 2, 3, 1, 2, 3]])
  768. df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.],
  769. 'c' : np.tile(['a','b','c'], 2),
  770. 'v' : np.arange(1., 7.)}, index=index)
  771. def f(group):
  772. group['g'] = group['d'] * 2
  773. return group[:1]
  774. grouped = df.groupby('c')
  775. result = grouped.apply(f)
  776. self.assert_(result['d'].dtype == np.float64)
  777. for key, group in grouped:
  778. res = f(group)
  779. assert_frame_equal(res, result.ix[key])
  780. def test_groupby_wrong_multi_labels(self):
  781. from pandas import read_csv
  782. from cStringIO import StringIO
  783. data = """index,foo,bar,baz,spam,data
  784. 0,foo1,bar1,baz1,spam2,20
  785. 1,foo1,bar2,baz1,spam3,30
  786. 2,foo2,bar2,baz1,spam2,40
  787. 3,foo1,bar1,baz2,spam1,50
  788. 4,foo3,bar1,baz2,spam1,60"""
  789. data = read_csv(StringIO(data), index_col=0)
  790. grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])
  791. result = grouped.agg(np.mean)
  792. expected = grouped.mean()
  793. assert_frame_equal(result, expected)
  794. def test_groupby_series_with_name(self):
  795. result = self.df.groupby(self.df['A']).mean()
  796. result2 = self.df.groupby(self.df['A'], as_index=False).mean()
  797. self.assertEquals(result.index.name, 'A')
  798. self.assert_('A' in result2)
  799. result = self.df.groupby([self.df['A'], self.df['B']]).mean()
  800. result2 = self.df.groupby([self.df['A'], self.df['B']],
  801. as_index=False).mean()
  802. self.assertEquals(result.index.names, ['A', 'B'])
  803. self.assert_('A' in result2)
  804. self.assert_('B' in result2)
  805. class TestPanelGroupBy(unittest.TestCase):
  806. def setUp(self):
  807. self.panel = tm.makePanel()
  808. tm.add_nans(self.panel)
  809. def test_groupby(self):
  810. grouped = self.panel.groupby({'ItemA' : 0, 'ItemB' : 0, 'ItemC' : 1},
  811. axis='items')
  812. agged = grouped.agg(np.mean)
  813. self.assert_(np.array_equal(agged.items, [0, 1]))
  814. grouped = self.panel.groupby(lambda x: x.month, axis='major')
  815. agged = grouped.agg(np.mean)
  816. self.assert_(np.array_equal(agged.major_axis, [1, 2]))
  817. grouped = self.panel.groupby({'A' : 0, 'B' : 0, 'C' : 1, 'D' : 1},
  818. axis='minor')
  819. agged = grouped.agg(np.mean)
  820. self.assert_(np.array_equal(agged.minor_axis, [0, 1]))
  821. class TestAggregate(unittest.TestCase):
  822. setUp = commonSetUp
  823. class TestTransform(unittest.TestCase):
  824. setUp = commonSetUp
  825. if __name__ == '__main__':
  826. import nose
  827. nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
  828. exit=False)