PageRenderTime 60ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/tests/test_groupby.py

https://github.com/kljensen/pandas
Python | 2112 lines | 1728 code | 319 blank | 65 comment | 58 complexity | 7d87f6aaeae5de932e7137b21c38a596 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import nose
  2. import unittest
  3. from datetime import datetime
  4. from numpy import nan
  5. from pandas import bdate_range
  6. from pandas.core.index import Index, MultiIndex
  7. from pandas.core.common import rands
  8. from pandas.core.api import Categorical, DataFrame
  9. from pandas.core.groupby import GroupByError, SpecificationError, DataError
  10. from pandas.core.series import Series
  11. from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
  12. assert_series_equal, assert_almost_equal)
  13. from pandas.core.panel import Panel
  14. from pandas.tools.merge import concat
  15. from collections import defaultdict
  16. import pandas.core.common as com
  17. import pandas.core.datetools as dt
  18. import numpy as np
  19. from numpy.testing import assert_equal
  20. import pandas.core.nanops as nanops
  21. import pandas.util.testing as tm
  def commonSetUp(self):
      """Attach shared random fixtures to ``self``.

      Sets ``dateRange`` (from ``bdate_range``), ``stringIndex`` (250
      upper-cased ``rands(8)`` labels), ``groupId`` / ``groupDict`` (the
      first character of every label, as a Series and as a plain dict),
      ``columnIndex`` and two 250x5 frames, ``stringMatrix`` and
      ``timeMatrix``, built from the same random matrix.
      """
      self.dateRange = bdate_range('1/1/2005', periods=250)

      # ``range`` works on Python 2 and 3 (``xrange`` is Python 2 only);
      # the loop variable is unused, only the number of draws matters.
      self.stringIndex = Index([rands(8).upper() for _ in range(250)])

      self.groupId = Series([label[0] for label in self.stringIndex],
                            index=self.stringIndex)
      # ``dict`` consumes the (label, value) pairs directly.
      self.groupDict = dict(self.groupId.iteritems())

      self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])

      # One matrix backs both frames, so only their indexes differ.
      randMat = np.random.randn(250, 5)
      self.stringMatrix = DataFrame(randMat, columns=self.columnIndex,
                                    index=self.stringIndex)
      self.timeMatrix = DataFrame(randMat, columns=self.columnIndex,
                                  index=self.dateRange)
  34. class TestGroupBy(unittest.TestCase):
  def setUp(self):
      """Build the Series / DataFrame fixtures shared by the tests."""
      self.ts = tm.makeTimeSeries()
      self.seriesd = tm.getSeriesData()
      self.tsd = tm.getTimeSeriesData()
      self.frame = DataFrame(self.seriesd)
      self.tsframe = DataFrame(self.tsd)

      # Small frame: two string key columns and two random value columns.
      key_a = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
      key_b = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']
      self.df = DataFrame({'A': key_a,
                           'B': key_b,
                           'C': np.random.randn(8),
                           'D': np.random.randn(8)})

      # Ten-row frame indexed by a two-level ('first', 'second') MultiIndex.
      outer = ['foo', 'bar', 'baz', 'qux']
      inner = ['one', 'two', 'three']
      index = MultiIndex(levels=[outer, inner],
                         labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                 [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                         names=['first', 'second'])
      self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                              columns=['A', 'B', 'C'])

      # Eleven-row frame with three string key columns (A, B, C) and
      # three random value columns (D, E, F).
      three_a = ['foo'] * 4 + ['bar'] * 4 + ['foo'] * 3
      three_b = ['one', 'one', 'one', 'two',
                 'one', 'one', 'one', 'two',
                 'two', 'two', 'one']
      three_c = ['dull', 'dull', 'shiny', 'dull',
                 'dull', 'shiny', 'shiny', 'dull',
                 'shiny', 'shiny', 'shiny']
      self.three_group = DataFrame({'A': three_a,
                                    'B': three_b,
                                    'C': three_c,
                                    'D': np.random.randn(11),
                                    'E': np.random.randn(11),
                                    'F': np.random.randn(11)})
  def test_basic(self):
      """Check iteration, aggregation and transform on a grouped Series."""
      data = Series(np.arange(9) // 3, index=np.arange(9))

      # Shuffle the row order before grouping.
      shuffled = np.arange(9)
      np.random.shuffle(shuffled)
      data = data.reindex(shuffled)

      grouped = data.groupby(lambda label: label // 3)

      for _, chunk in grouped:
          self.assertEqual(len(chunk), 3)

      means = grouped.aggregate(np.mean)
      self.assertEqual(means[1], 1)

      assert_series_equal(means, grouped.agg(np.mean))  # shorthand
      assert_series_equal(means, grouped.mean())

      # Cython only returning floating point for now...
      python_sum = grouped.agg(np.sum).astype(float)
      assert_series_equal(python_sum, grouped.sum())

      scaled = grouped.transform(lambda x: x * x.sum())
      self.assertEqual(scaled[7], 12)

      by_value = data.groupby(data)
      assert_series_equal(by_value.aggregate(np.mean), means)

      # complex agg -- list and dict specs are only checked for not raising
      grouped.aggregate([np.mean, np.std])
      grouped.aggregate({'one': np.mean, 'two': np.std})

      offsets = {0: 10, 1: 20, 2: 30}
      shifted = grouped.agg(lambda x: offsets[x.name] + x.mean())
      self.assertEqual(shifted[1], 21)

      # corner cases: a function that does not reduce must be rejected
      self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
  def test_first_last_nth(self):
      """Check ``first`` / ``last`` / ``nth`` against hand-picked rows."""
      # tests for first / last / nth
      grouped = self.df.groupby('A')

      def _expected(rows):
          # Pick the given rows of the value columns, relabelled by group.
          frame = self.df.ix[rows, ['B', 'C', 'D']]
          frame.index = ['bar', 'foo']
          return frame

      first = grouped.first()
      assert_frame_equal(first, _expected([1, 0]))

      last = grouped.last()
      assert_frame_equal(last, _expected([5, 7]))

      nth = grouped.nth(1)
      assert_frame_equal(nth, _expected([3, 2]))

      # it works!
      grouped['B'].first()
      grouped['B'].last()
      grouped['B'].nth(0)

      # With every 'foo' entry of B missing, each selector yields NA.
      self.df['B'][self.df['A'] == 'foo'] = np.nan
      # assertTrue replaces the deprecated ``assert_`` alias.
      self.assertTrue(com.isnull(grouped['B'].first()['foo']))
      self.assertTrue(com.isnull(grouped['B'].last()['foo']))
      self.assertTrue(com.isnull(grouped['B'].nth(0)['foo']))
  def test_grouper_iter(self):
      """Iterating the grouper yields the group keys."""
      keys = sorted(self.df.groupby('A').grouper)
      self.assertEqual(keys, ['bar', 'foo'])
  def test_empty_groups(self):
      """Grouping by an empty list of keys raises ValueError (GH #1048)."""
      # GH # 1048
      no_keys = []
      self.assertRaises(ValueError, self.df.groupby, no_keys)
  def test_groupby_grouper(self):
      """A grouper taken from one groupby can be reused as the key."""
      by_a = self.df.groupby('A')
      regrouped = self.df.groupby(by_a.grouper)
      assert_frame_equal(regrouped.mean(), by_a.mean())
  def test_groupby_dict_mapping(self):
      """Grouping by a dict mapping matches grouping by explicit labels."""
      # GH #679
      # (``Series`` is already imported at module level, so the former
      # function-local import was dropped.)
      s = Series({'T1': 5})
      result = s.groupby({'T1': 'T2'}).agg(sum)
      expected = s.groupby(['T2']).agg(sum)
      assert_series_equal(result, expected)

      s = Series([1., 2., 3., 4.], index=list('abcd'))
      mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}

      result = s.groupby(mapping).mean()
      result2 = s.groupby(mapping).agg(np.mean)
      expected = s.groupby([0, 0, 1, 1]).mean()
      # Previously an exact copy of ``expected``, which made the last
      # assertion vacuous; use the ``agg`` path so it checks something.
      expected2 = s.groupby([0, 0, 1, 1]).agg(np.mean)

      assert_series_equal(result, expected)
      assert_series_equal(result, result2)
      assert_series_equal(result, expected2)
  def test_groupby_nonobject_dtype(self):
      """A key array gives the same sums as its object-dtype cast."""
      raw_key = self.mframe.index.labels[0]
      object_key = raw_key.astype('O')

      result = self.mframe.groupby(raw_key).sum()
      expected = self.mframe.groupby(object_key).sum()
      assert_frame_equal(result, expected)
  def test_agg_regression1(self):
      """``agg(np.mean)`` equals ``mean()`` for a (year, month) grouping."""
      by_year_month = self.tsframe.groupby([lambda ts: ts.year,
                                            lambda ts: ts.month])
      via_agg = by_year_month.agg(np.mean)
      via_method = by_year_month.mean()
      assert_frame_equal(via_agg, via_method)
  def test_agg_datetimes_mixed(self):
      """String dates and ``date`` objects produce equally many groups.

      Both frames hold the same three rows; the last row has no date.
      """
      raw = [[1, '2012-01-01', 1.0],
             [2, '2012-01-02', 2.0],
             [3, None, 3.0]]

      def _frame(rows):
          # Column-wise frame from [key, date, value] rows.
          return DataFrame({'key': [row[0] for row in rows],
                            'date': [row[1] for row in rows],
                            'value': [row[2] for row in rows]})

      df1 = _frame(raw)

      # Same rows with the date strings parsed into ``date`` objects.
      parsed = [[row[0],
                 datetime.strptime(row[1], '%Y-%m-%d').date()
                 if row[1] else None,
                 row[2]]
                for row in raw]
      df2 = _frame(parsed)

      df1['weights'] = df1['value'] / df1['value'].sum()
      gb1 = df1.groupby('date').aggregate(np.sum)

      # Weights now come from df2 itself (the original reused df1 here).
      df2['weights'] = df2['value'] / df2['value'].sum()
      gb2 = df2.groupby('date').aggregate(np.sum)

      # A unittest assertion is not stripped under ``python -O``.
      self.assertEqual(len(gb1), len(gb2))
  def test_agg_must_agg(self):
      """``agg`` rejects functions that do not reduce to a scalar."""
      grouped = self.df.groupby('A')['C']
      non_reducers = (lambda x: x.describe(),
                      lambda x: x.index[:2])
      for func in non_reducers:
          self.assertRaises(Exception, grouped.agg, func)
  def test_agg_ser_multi_key(self):
      """Series aggregation over two keys matches the frame-level sum."""
      ser = self.df.C
      f = lambda x: x.sum()
      # ``ser`` was previously assigned but never used.
      results = ser.groupby([self.df.A, self.df.B]).aggregate(f)
      expected = self.df.groupby(['A', 'B']).sum()['C']
      assert_series_equal(results, expected)
  def test_get_group(self):
      """``get_group(1)`` on a month-grouped panel returns the month-1 rows."""
      wp = tm.makePanel()
      by_month = wp.groupby(lambda ts: ts.month, axis='major')

      january = [ts for ts in wp.major_axis if ts.month == 1]
      expected = wp.reindex(major=january)

      assert_panel_equal(by_month.get_group(1), expected)
  def test_agg_apply_corner(self):
      """Grouping by an all-NA key gives empty results."""
      # nothing to group, all NA
      na_key = self.ts * np.nan
      grouped = self.ts.groupby(na_key)
      assert_series_equal(grouped.sum(), Series([]))
      assert_series_equal(grouped.agg(np.sum), Series([]))
      assert_series_equal(grouped.apply(np.sum), Series([]))

      # DataFrame
      frame_na_key = self.tsframe['A'] * np.nan
      grouped = self.tsframe.groupby(frame_na_key)
      exp_df = DataFrame(columns=self.tsframe.columns, dtype=float)
      assert_frame_equal(grouped.sum(), exp_df)
      assert_frame_equal(grouped.agg(np.sum), exp_df)
      # ``apply`` is compared against a frame with no columns at all.
      assert_frame_equal(grouped.apply(np.sum),
                         DataFrame({}, dtype=float))
  def test_agg_grouping_is_list_tuple(self):
      """``agg`` still works when the Grouping holds a list or a tuple."""
      from pandas.core.groupby import Grouping

      df = tm.makeTimeDataFrame()
      grouped = df.groupby(lambda ts: ts.year)
      grouper = grouped.grouper.groupings[0].grouper

      # Swap in a Grouping backed first by a list, then by a tuple.
      for container in (list, tuple):
          replacement = Grouping(self.ts.index, container(grouper))
          grouped.grouper.groupings[0] = replacement

          result = grouped.agg(np.mean)
          expected = grouped.mean()
          tm.assert_frame_equal(result, expected)
  def test_agg_python_multiindex(self):
      """``agg(np.mean)`` equals ``mean()`` when grouping mframe by A and B."""
      by_ab = self.mframe.groupby(['A', 'B'])
      via_agg = by_ab.agg(np.mean)
      via_method = by_ab.mean()
      tm.assert_frame_equal(via_agg, via_method)
  def test_apply_describe_bug(self):
      """``describe`` on a level-grouped frame must not raise."""
      by_first = self.mframe.groupby(level='first')
      by_first.describe()  # it works!
  def test_len(self):
      """``len(groupby)`` equals the number of distinct keys."""
      df = tm.makeTimeDataFrame()

      # Grouping by (year, month, day) is expected to give one group
      # per row of the frame.
      by_day = df.groupby([lambda x: x.year,
                           lambda x: x.month,
                           lambda x: x.day])
      # assertEqual replaces the deprecated ``assertEquals`` alias.
      self.assertEqual(len(by_day), len(df))

      by_month = df.groupby([lambda x: x.year,
                             lambda x: x.month])
      expected = len(set((x.year, x.month) for x in df.index))
      self.assertEqual(len(by_month), expected)
  def test_groups(self):
      """``groups`` is cached and maps each key to rows carrying that key."""
      # assertTrue replaces the deprecated ``assert_`` alias throughout.
      grouped = self.df.groupby(['A'])
      groups = grouped.groups
      self.assertTrue(groups is grouped.groups)  # caching works

      for k, v in grouped.groups.iteritems():
          self.assertTrue((self.df.ix[v]['A'] == k).all())

      grouped = self.df.groupby(['A', 'B'])
      groups = grouped.groups
      self.assertTrue(groups is grouped.groups)  # caching works

      # With two keys every group key is an (A, B) tuple.
      for k, v in grouped.groups.iteritems():
          rows = self.df.ix[v]
          self.assertTrue((rows['A'] == k[0]).all())
          self.assertTrue((rows['B'] == k[1]).all())
  def test_aggregate_str_func(self):
      """String function names in ``agg`` resolve to the named methods."""

      def _check_results(grouped):
          # single series
          assert_series_equal(grouped['A'].agg('std'),
                              grouped['A'].std())

          # group frame by function name
          assert_frame_equal(grouped.aggregate('var'), grouped.var())

          # group frame by function dict
          spec = {'A': 'var', 'B': 'std', 'C': 'mean'}
          result = grouped.agg(spec)
          expected = DataFrame({'A': grouped['A'].var(),
                                'B': grouped['B'].std(),
                                'C': grouped['C'].mean()})
          assert_frame_equal(result, expected)

      by_weekday = self.tsframe.groupby(lambda ts: ts.weekday())
      _check_results(by_weekday)

      by_mwkday = self.tsframe.groupby([lambda ts: ts.month,
                                        lambda ts: ts.weekday()])
      _check_results(by_mwkday)
  def test_aggregate_item_by_item(self):
      """Column-by-column aggregation keeps only columns the function accepts."""
      # NOTE(review): ``df`` (with the extra 'E' column) is built but the
      # grouping below uses ``self.df`` -- kept as in the original; confirm
      # which frame was intended before changing it.
      df = self.df.copy()
      df['E'] = ['a'] * len(self.df)
      grouped = self.df.groupby('A')

      def aggfun(ser):
          return len(ser + 'a')
      result = grouped.agg(aggfun)
      self.assertEqual(len(result.columns), 1)

      aggfun = lambda ser: ser.size
      result = grouped.agg(aggfun)
      foo = (self.df.A == 'foo').sum()
      bar = (self.df.A == 'bar').sum()
      # assertTrue replaces the deprecated ``assert_`` alias.
      self.assertTrue((result.xs('foo') == foo).all())
      self.assertTrue((result.xs('bar') == bar).all())

      def aggfun(ser):
          return ser.size
      # Aggregating an empty frame still returns an (empty) DataFrame.
      result = DataFrame().groupby(self.df.A).agg(aggfun)
      self.assertTrue(isinstance(result, DataFrame))
      self.assertEqual(len(result), 0)
  def test_basic_regression(self):
      """Grouping by float keys from a longer Series must not raise."""
      # regression
      # ``list(range(...)) * 10`` repeats the sequence on Python 2 and 3;
      # a bare ``range(...) * 10`` only works on Python 2.
      T = [1.0 * x for x in list(range(1, 10)) * 10][:1095]
      result = Series(T, range(0, len(T)))

      # The key Series has 1100 entries, more than ``result`` has rows.
      groupings = np.random.random((1100,))
      groupings = Series(groupings, range(0, len(groupings))) * 10.

      grouped = result.groupby(groupings)
      grouped.mean()
  def test_transform(self):
      """``transform`` returns values aligned with the original labels."""
      data = Series(np.arange(9) // 3, index=np.arange(9))

      order = np.arange(9)
      np.random.shuffle(order)
      data = data.reindex(order)

      by_block = data.groupby(lambda label: label // 3)
      scaled = by_block.transform(lambda x: x * x.sum())
      self.assertEqual(scaled[7], 12)
  def test_transform_broadcast(self):
      """A reducing function passed to ``transform`` is broadcast per group."""
      # assertTrue replaces the deprecated ``assert_`` alias throughout.
      grouped = self.ts.groupby(lambda x: x.month)
      result = grouped.transform(np.mean)

      self.assertTrue(result.index.equals(self.ts.index))
      for _, gp in grouped:
          self.assertTrue((result.reindex(gp.index) == gp.mean()).all())

      grouped = self.tsframe.groupby(lambda x: x.month)
      result = grouped.transform(np.mean)
      self.assertTrue(result.index.equals(self.tsframe.index))
      for _, gp in grouped:
          agged = gp.mean()
          res = result.reindex(gp.index)
          for col in self.tsframe:
              self.assertTrue((res[col] == agged[col]).all())

      # group columns
      grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                     axis=1)
      result = grouped.transform(np.mean)
      self.assertTrue(result.index.equals(self.tsframe.index))
      self.assertTrue(result.columns.equals(self.tsframe.columns))
      for _, gp in grouped:
          agged = gp.mean(1)
          res = result.reindex(columns=gp.columns)
          for idx in gp.index:
              self.assertTrue((res.xs(idx) == agged[idx]).all())
  def test_transform_multiple(self):
      """``transform`` with two grouping functions must not raise."""
      by_year_month = self.ts.groupby([lambda ts: ts.year,
                                       lambda ts: ts.month])
      # Results are only computed, not compared.
      by_year_month.transform(lambda x: x * 2)
      by_year_month.transform(np.mean)
  def test_dispatch_transform(self):
      """``grouped.fillna`` matches an explicit per-group ``transform``."""
      # Keep every fifth row, then reindex back to the full index.
      sparse = self.tsframe[::5].reindex(self.tsframe.index)

      filled = sparse.groupby(lambda ts: ts.month).fillna(method='pad')

      def pad(group):
          return group.fillna(method='pad')
      expected = sparse.groupby(lambda ts: ts.month).transform(pad)

      assert_frame_equal(filled, expected)
  def test_transform_select_columns(self):
      """Selecting columns on the groupby matches grouping the sub-frame."""
      def group_mean(x):
          return x.mean()

      result = self.df.groupby('A')['C', 'D'].transform(group_mean)

      subframe = self.df[['C', 'D']]
      expected = subframe.groupby(self.df['A']).transform(group_mean)

      assert_frame_equal(result, expected)
  def test_transform_exclude_nuisance(self):
      """Frame-level ``transform`` returns only the C and D columns."""
      by_a = self.df.groupby('A')
      # Expected result assembled from per-column transforms of C and D.
      expected = DataFrame({'C': by_a['C'].transform(np.mean),
                            'D': by_a['D'].transform(np.mean)})

      result = self.df.groupby('A').transform(np.mean)
      assert_frame_equal(result, expected)
  def test_transform_function_aliases(self):
      """The string 'mean' behaves like ``np.mean`` inside ``transform``."""
      # whole frame
      by_name = self.df.groupby('A').transform('mean')
      by_func = self.df.groupby('A').transform(np.mean)
      assert_frame_equal(by_name, by_func)

      # single column
      by_name = self.df.groupby('A')['C'].transform('mean')
      by_func = self.df.groupby('A')['C'].transform(np.mean)
      assert_series_equal(by_name, by_func)
  def test_with_na(self):
      """Rows whose group label is NaN are left out of the groups."""
      index = Index(np.arange(10))
      values = Series(np.ones(10), index)
      labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
                       'bar', nan, 'foo'], index=index)

      grouped = values.groupby(labels)
      agged = grouped.agg(len)
      # Four 'bar' labels and two 'foo' labels; the four NaNs form no group.
      expected = Series([4, 2], index=['bar', 'foo'])

      assert_series_equal(agged, expected, check_dtype=False)
      # assertTrue replaces the deprecated ``assert_`` alias.
      self.assertTrue(issubclass(agged.dtype.type, np.integer))
  def test_attr_wrapper(self):
      """Series methods and attributes are reachable through the groupby."""
      grouped = self.ts.groupby(lambda ts: ts.weekday())

      # ``std`` on the groupby matches a per-group ddof=1 standard deviation.
      assert_series_equal(grouped.std(),
                          grouped.agg(lambda x: np.std(x, ddof=1)))

      # this is pretty cool
      result = grouped.describe()
      per_group = dict((name, gp.describe()) for name, gp in grouped)
      expected = DataFrame(per_group).T
      assert_frame_equal(result.unstack(), expected)

      # get attribute -- both values are only computed, never compared
      grouped.dtype
      grouped.agg(lambda x: x.dtype)

      # make sure raises error
      self.assertRaises(AttributeError, getattr, grouped, 'foo')
  def test_series_describe_multikey(self):
      """Unstacked ``describe`` columns match the individual reductions."""
      ts = tm.makeTimeSeries()
      grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
      described = grouped.describe().unstack()

      for stat in ('mean', 'std', 'min'):
          assert_series_equal(described[stat], getattr(grouped, stat)())
  def test_series_describe_single(self):
      """``grouped.describe()`` equals applying ``describe`` per group."""
      ts = tm.makeTimeSeries()
      by_month = ts.groupby(lambda x: x.month)

      applied = by_month.apply(lambda x: x.describe())
      direct = by_month.describe()
      assert_series_equal(applied, direct)
  def test_series_agg_multikey(self):
      """``agg(np.sum)`` equals ``sum()`` for a two-function grouping."""
      ts = tm.makeTimeSeries()
      by_year_month = ts.groupby([lambda x: x.year, lambda x: x.month])

      via_agg = by_year_month.agg(np.sum)
      via_method = by_year_month.sum()
      assert_series_equal(via_agg, via_method)
  def test_series_agg_multi_pure_python(self):
      """A Python aggregator that inspects ``x.base`` gives the same frame."""
      key_a = ['foo', 'foo', 'foo', 'foo',
               'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo']
      key_b = ['one', 'one', 'one', 'two',
               'one', 'one', 'one', 'two',
               'two', 'two', 'one']
      key_c = ['dull', 'dull', 'shiny', 'dull',
               'dull', 'shiny', 'shiny', 'dull',
               'shiny', 'shiny', 'shiny']
      data = DataFrame({'A': key_a,
                        'B': key_b,
                        'C': key_c,
                        'D': np.random.randn(11),
                        'E': np.random.randn(11),
                        'F': np.random.randn(11)})

      def bad(x):
          # Requires the chunk passed in to expose a non-empty ``base``.
          assert(len(x.base) > 0)
          return 'foo'

      by_ab = ['A', 'B']
      result = data.groupby(by_ab).agg(bad)
      expected = data.groupby(by_ab).agg(lambda x: 'foo')
      assert_frame_equal(result, expected)
  def test_series_index_name(self):
      """The result index takes its name from the grouping Series."""
      only_c = self.df.ix[:, ['C']]
      result = only_c.groupby(self.df['A']).agg(lambda x: x.mean())
      self.assertEqual(result.index.name, 'A')
  def test_frame_describe_multikey(self):
      """Frame-level ``describe`` agrees with per-column / per-group describe."""
      by_year_month = self.tsframe.groupby([lambda x: x.year,
                                            lambda x: x.month])
      described = by_year_month.describe()

      for col in self.tsframe:
          assert_series_equal(described[col],
                              by_year_month[col].describe())

      # Same check when grouping the columns instead of the rows.
      column_map = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
      groupedT = self.tsframe.groupby(column_map, axis=1)
      described = groupedT.describe()

      for name, group in groupedT:
          assert_frame_equal(described[name], group.describe())
  def test_frame_groupby(self):
      """Check aggregate, transform, iteration and ``groups`` on a frame."""
      grouped = self.tsframe.groupby(lambda x: x.weekday())

      # aggregate
      aggregated = grouped.aggregate(np.mean)
      self.assertEqual(len(aggregated), 5)
      self.assertEqual(len(aggregated.columns), 4)

      # by string
      tscopy = self.tsframe.copy()
      tscopy['weekday'] = [x.weekday() for x in tscopy.index]
      stragged = tscopy.groupby('weekday').aggregate(np.mean)
      assert_frame_equal(stragged, aggregated)

      # transform
      transformed = grouped.transform(lambda x: x - x.mean())
      self.assertEqual(len(transformed), 30)
      self.assertEqual(len(transformed.columns), 4)

      # transform propagate
      transformed = grouped.transform(lambda x: x.mean())
      for name, group in grouped:
          mean = group.mean()
          for idx in group.index:
              assert_almost_equal(transformed.xs(idx), mean)

      # iterate
      for weekday, group in grouped:
          # assertEqual reports both values on failure, unlike the
          # deprecated ``assert_(a == b)`` form.
          self.assertEqual(group.index[0].weekday(), weekday)

      # groups / group_indices
      groups = grouped.groups
      indices = grouped.indices

      for k, v in groups.iteritems():
          samething = self.tsframe.index.take(indices[k])
          self.assertTrue((samething == v).all())
  def test_grouping_is_iterable(self):
      """Iterating over a Grouping object must not raise."""
      # this code path isn't used anywhere else
      # not sure it's useful
      by_weekday_year = self.tsframe.groupby([lambda x: x.weekday(),
                                              lambda x: x.year])
      first_grouping = by_weekday_year.grouper.groupings[0]

      # test it works
      for _ in first_grouping:
          pass
  def test_frame_groupby_columns(self):
      """Check aggregate, transform and iteration when grouping columns."""
      mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
      grouped = self.tsframe.groupby(mapping, axis=1)

      # aggregate
      aggregated = grouped.aggregate(np.mean)
      self.assertEqual(len(aggregated), len(self.tsframe))
      self.assertEqual(len(aggregated.columns), 2)

      # transform: grouping the transposed frame along axis 0 must agree
      def demean(x):
          return x - x.mean()
      groupedT = self.tsframe.T.groupby(mapping, axis=0)
      assert_frame_equal(groupedT.transform(demean).T,
                         grouped.transform(demean))

      # iterate
      for _, block in grouped:
          self.assertEqual(len(block.columns), 2)
  def test_frame_set_name_single(self):
      """The result index is named after the key unless ``as_index=False``."""
      # assertEqual / assertNotEqual replace ``assert_(a == b)`` so that a
      # failure reports the actual index name.
      grouped = self.df.groupby('A')

      result = grouped.mean()
      self.assertEqual(result.index.name, 'A')

      result = self.df.groupby('A', as_index=False).mean()
      self.assertNotEqual(result.index.name, 'A')

      result = grouped.agg(np.mean)
      self.assertEqual(result.index.name, 'A')

      result = grouped.agg({'C': np.mean, 'D': np.std})
      self.assertEqual(result.index.name, 'A')

      result = grouped['C'].mean()
      self.assertEqual(result.index.name, 'A')

      result = grouped['C'].agg(np.mean)
      self.assertEqual(result.index.name, 'A')

      result = grouped['C'].agg([np.mean, np.std])
      self.assertEqual(result.index.name, 'A')

      result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
      self.assertEqual(result.index.name, 'A')
  def test_multi_iter(self):
      """Iterating a two-key Series groupby yields ((k1, k2), group) pairs."""
      s = Series(np.arange(6))
      k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
      k2 = np.array(['1', '2', '1', '2', '1', '2'])

      grouped = s.groupby([k1, k2])
      iterated = list(grouped)

      expected = [('a', '1', s[[0, 2]]),
                  ('a', '2', s[[1]]),
                  ('b', '1', s[[4]]),
                  ('b', '2', s[[3, 5]])]

      for i, ((one, two), three) in enumerate(iterated):
          e1, e2, e3 = expected[i]
          # assertEqual replaces the deprecated ``assert_(a == b)`` form.
          self.assertEqual(e1, one)
          self.assertEqual(e2, two)
          assert_series_equal(three, e3)
  def test_multi_iter_frame(self):
      """Iterating a two-key frame groupby yields sorted, non-empty groups."""
      k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
      k2 = np.array(['1', '2', '1', '2', '1', '2'])
      df = DataFrame({'v1': np.random.randn(6),
                      'v2': np.random.randn(6),
                      'k1': k1, 'k2': k2},
                     index=['one', 'two', 'three', 'four', 'five', 'six'])

      grouped = df.groupby(['k1', 'k2'])

      # things get sorted!
      iterated = list(grouped)
      idx = df.index
      expected = [('a', '1', df.ix[idx[[4]]]),
                  ('a', '2', df.ix[idx[[3, 5]]]),
                  ('b', '1', df.ix[idx[[0, 2]]]),
                  ('b', '2', df.ix[idx[[1]]])]
      for i, ((one, two), three) in enumerate(iterated):
          e1, e2, e3 = expected[i]
          # assertEqual replaces the deprecated ``assert_(a == b)`` form.
          self.assertEqual(e1, one)
          self.assertEqual(e2, two)
          assert_frame_equal(three, e3)

      # don't iterate through groups with no data
      df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
      df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
      grouped = df.groupby(['k1', 'k2'])
      groups = {}
      for key, gp in grouped:
          groups[key] = gp
      # assertEqual replaces the deprecated ``assertEquals`` alias.
      self.assertEqual(len(groups), 2)

      # axis = 1 -- only checked for not raising
      three_levels = self.three_group.groupby(['A', 'B', 'C']).mean()
      grouped = three_levels.T.groupby(axis=1, level=(1, 2))
      for key, group in grouped:
          pass
  def test_multi_iter_panel(self):
      """Each (month, weekday) panel group holds exactly the matching rows."""
      wp = tm.makePanel()
      by_month_weekday = wp.groupby([lambda x: x.month,
                                     lambda x: x.weekday()],
                                    axis=1)

      for (month, wd), group in by_month_weekday:
          matching = [stamp for stamp in wp.major_axis
                      if stamp.month == month and stamp.weekday() == wd]
          assert_panel_equal(group, wp.reindex(major=matching))
  def test_multi_func(self):
      """Grouping by two lookup callables matches grouping by column names."""
      # ``Series.get`` maps a row label to that row's key value.
      key_funcs = [self.df['A'].get, self.df['B'].get]

      agged = self.df.groupby(key_funcs).mean()
      expected = self.df.groupby(['A', 'B']).mean()
      value_cols = ['C', 'D']
      assert_frame_equal(agged.ix[:, value_cols],
                         expected.ix[:, value_cols])

      # some "groups" with no data
      df = DataFrame({'v1': np.random.randn(6),
                      'v2': np.random.randn(6),
                      'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                      'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                     index=['one', 'two', 'three', 'four', 'five', 'six'])

      # only verify that it works for now
      df.groupby(['k1', 'k2']).agg(np.sum)
  def test_multi_key_multiple_functions(self):
      """A list of functions gives one column per function."""
      col_c = self.df.groupby(['A', 'B'])['C']

      agged = col_c.agg([np.mean, np.std])
      expected = DataFrame({'mean': col_c.agg(np.mean),
                            'std': col_c.agg(np.std)})
      assert_frame_equal(agged, expected)
  def test_frame_multi_key_function_list(self):
      """A function list on a frame equals per-column results concatenated."""
      data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                              'bar', 'bar', 'bar', 'bar',
                              'foo', 'foo', 'foo'],
                        'B': ['one', 'one', 'one', 'two',
                              'one', 'one', 'one', 'two',
                              'two', 'two', 'one'],
                        'C': ['dull', 'dull', 'shiny', 'dull',
                              'dull', 'shiny', 'shiny', 'dull',
                              'shiny', 'shiny', 'shiny'],
                        'D': np.random.randn(11),
                        'E': np.random.randn(11),
                        'F': np.random.randn(11)})

      grouped = data.groupby(['A', 'B'])
      funcs = [np.mean, np.std]
      agged = grouped.agg(funcs)
      expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
                         grouped['F'].agg(funcs)],
                        keys=['D', 'E', 'F'], axis=1)
      # unittest assertions instead of bare ``assert``, which is stripped
      # when Python runs with -O.
      self.assertTrue(isinstance(agged.index, MultiIndex))
      self.assertTrue(isinstance(expected.index, MultiIndex))
      assert_frame_equal(agged, expected)
  def test_groupby_multiple_columns(self):
      """Two-key grouping agrees with nesting two single-key groupings."""
      data = self.df
      grouped = data.groupby(['A', 'B'])

      def _check_op(op):
          combined = op(grouped)

          # Build the expectation by grouping on 'A', then on 'B' inside.
          nested = defaultdict(dict)
          for outer_key, outer_group in data.groupby('A'):
              for inner_key, inner_group in outer_group.groupby('B'):
                  values = inner_group.ix[:, ['C', 'D']]
                  nested[outer_key][inner_key] = op(values)
          frames = dict((k, DataFrame(v)) for k, v in nested.iteritems())
          expected = Panel.fromDict(frames).swapaxes(0, 1)

          # a little bit crude
          for col in ['C', 'D']:
              exp = expected[col]
              from_column = op(grouped[col])
              pivoted = combined[col].unstack()
              pivoted2 = from_column.unstack()
              assert_frame_equal(pivoted.reindex_like(exp), exp)
              assert_frame_equal(pivoted2.reindex_like(exp), exp)

      for op in (lambda x: x.sum(), lambda x: x.mean()):
          _check_op(op)

      # test single series works the same
      result = data['C'].groupby([data['A'], data['B']]).mean()
      expected = data.groupby(['A', 'B']).mean()['C']

      assert_series_equal(result, expected)
  def test_groupby_as_index_agg(self):
      """``agg`` agrees with the named reductions for single and multi keys."""
      # single-key
      grouped = self.df.groupby('A', as_index=False)

      assert_frame_equal(grouped.agg(np.mean), grouped.mean())

      per_column = grouped.agg({'C': np.mean, 'D': np.sum})
      mixed = grouped.mean()
      mixed['D'] = grouped.sum()['D']
      assert_frame_equal(per_column, mixed)

      # The renaming dict spec is checked with as_index=True here.
      grouped = self.df.groupby('A', as_index=True)
      renamed = DataFrame(grouped['C'].sum()).rename(columns={'C': 'Q'})
      via_dict = grouped['C'].agg({'Q': np.sum})
      assert_frame_equal(via_dict, renamed)

      # multi-key
      grouped = self.df.groupby(['A', 'B'], as_index=False)

      assert_frame_equal(grouped.agg(np.mean), grouped.mean())

      per_column = grouped.agg({'C': np.mean, 'D': np.sum})
      mixed = grouped.mean()
      mixed['D'] = grouped.sum()['D']
      assert_frame_equal(per_column, mixed)

      renamed = DataFrame(grouped['C'].sum()).rename(columns={'C': 'Q'})
      via_dict = grouped['C'].agg({'Q': np.sum})
      assert_frame_equal(via_dict, renamed)
  def test_multifunc_select_col_integer_cols(self):
      """A dict ``agg`` on an integer-labelled column must not raise."""
      df = self.df
      # Relabel the columns 0..n-1 (this relabels ``self.df`` itself).
      df.columns = np.arange(len(df.columns))

      # it works!
      df.groupby(1, as_index=False)[2].agg({'Q': np.mean})
  def test_as_index_series_return_frame(self):
      """With ``as_index=False`` a column selection aggregates to a DataFrame."""
      # assertTrue replaces the deprecated ``assert_`` alias throughout.
      grouped = self.df.groupby('A', as_index=False)
      grouped2 = self.df.groupby(['A', 'B'], as_index=False)

      result = grouped['C'].agg(np.sum)
      expected = grouped.agg(np.sum).ix[:, ['A', 'C']]
      self.assertTrue(isinstance(result, DataFrame))
      assert_frame_equal(result, expected)

      result2 = grouped2['C'].agg(np.sum)
      expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']]
      self.assertTrue(isinstance(result2, DataFrame))
      assert_frame_equal(result2, expected2)

      result = grouped['C'].sum()
      expected = grouped.sum().ix[:, ['A', 'C']]
      self.assertTrue(isinstance(result, DataFrame))
      assert_frame_equal(result, expected)

      result2 = grouped2['C'].sum()
      expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']]
      self.assertTrue(isinstance(result2, DataFrame))
      assert_frame_equal(result2, expected2)

      # corner case: selecting again on an already-selected column
      self.assertRaises(Exception, grouped['C'].__getitem__,
                        'D')
  def test_groupby_as_index_cython(self):
      """``as_index=False`` returns the keys as leading columns."""
      data = self.df

      # single-key
      grouped = data.groupby('A', as_index=False)
      result = grouped.mean()
      expected = data.groupby(['A']).mean()
      expected.insert(0, 'A', expected.index)
      expected.index = np.arange(len(expected))
      assert_frame_equal(result, expected)

      # multi-key
      grouped = data.groupby(['A', 'B'], as_index=False)
      result = grouped.mean()
      expected = data.groupby(['A', 'B']).mean()

      # Split the index tuples into one sequence per key. ``list`` keeps
      # the result subscriptable where ``zip`` returns an iterator
      # (Python 3).
      arrays = list(zip(*expected.index._tuple_index))
      expected.insert(0, 'A', arrays[0])
      expected.insert(1, 'B', arrays[1])
      expected.index = np.arange(len(expected))
      assert_frame_equal(result, expected)
  def test_groupby_as_index_series_scalar(self):
      """``agg(len)`` on a selected column matches the frame result (GH #421)."""
      by_ab = self.df.groupby(['A', 'B'], as_index=False)

      # GH #421
      from_column = by_ab['C'].agg(len)
      from_frame = by_ab.agg(len).ix[:, ['A', 'B', 'C']]
      assert_frame_equal(from_column, from_frame)
  def test_groupby_as_index_corner(self):
      """``as_index=False`` is rejected for a Series and for ``axis=1``."""
      # Series groupby
      self.assertRaises(TypeError, self.ts.groupby,
                        lambda ts: ts.weekday(), as_index=False)

      # DataFrame grouped along the columns
      self.assertRaises(ValueError, self.df.groupby,
                        lambda label: label.lower(),
                        as_index=False, axis=1)
  def test_groupby_multiple_key(self):
      """Grouping by (year, month, day) keeps every row in its own group."""
      df = tm.makeTimeDataFrame()
      grouped = df.groupby([lambda x: x.year,
                            lambda x: x.month,
                            lambda x: x.day])
      agged = grouped.sum()
      assert_almost_equal(df.values, agged.values)

      # Same grouping applied to the columns of the transposed frame.
      grouped = df.T.groupby([lambda x: x.year,
                              lambda x: x.month,
                              lambda x: x.day], axis=1)

      agged = grouped.agg(lambda x: x.sum(1))
      # assertTrue replaces the deprecated ``assert_`` alias.
      self.assertTrue(agged.index.equals(df.columns))
      assert_almost_equal(df.T.values, agged.values)

      # The aggregation is deliberately run a second time, as in the
      # original test.
      agged = grouped.agg(lambda x: x.sum(1))
      assert_almost_equal(df.T.values, agged.values)
  738. def test_groupby_multi_corner(self):
  739. # test that having an all-NA column doesn't mess you up
  740. df = self.df.copy()
  741. df['bad'] = np.nan
  742. agged = df.groupby(['A', 'B']).mean()
  743. expected = self.df.groupby(['A', 'B']).mean()
  744. expected['bad'] = np.nan
  745. assert_frame_equal(agged, expected)
  746. def test_omit_nuisance(self):
  747. grouped = self.df.groupby('A')
  748. result = grouped.mean()
  749. expected = self.df.ix[:, ['A', 'C', 'D']].groupby('A').mean()
  750. assert_frame_equal(result, expected)
  751. agged = grouped.agg(np.mean)
  752. exp = grouped.mean()
  753. assert_frame_equal(agged, exp)
  754. df = self.df.ix[:, ['A', 'C', 'D']]
  755. df['E'] = datetime.now()
  756. grouped = df.groupby('A')
  757. result = grouped.agg(np.sum)
  758. expected = grouped.sum()
  759. assert_frame_equal(result, expected)
  760. # won't work with axis = 1
  761. grouped = df.groupby({'A' : 0, 'C' : 0, 'D' : 1, 'E' : 1}, axis=1)
  762. result = self.assertRaises(TypeError, grouped.agg,
  763. lambda x: x.sum(1, numeric_only=False))
  764. def test_omit_nuisance_python_multiple(self):
  765. grouped = self.three_group.groupby(['A', 'B'])
  766. agged = grouped.agg(np.mean)
  767. exp = grouped.mean()
  768. assert_frame_equal(agged, exp)
  769. def test_empty_groups_corner(self):
  770. # handle empty groups
  771. df = DataFrame({'k1' : np.array(['b', 'b', 'b', 'a', 'a', 'a']),
  772. 'k2' : np.array(['1', '1', '1', '2', '2', '2']),
  773. 'k3' : ['foo', 'bar'] * 3,
  774. 'v1' : np.random.randn(6),
  775. 'v2' : np.random.randn(6)})
  776. grouped = df.groupby(['k1', 'k2'])
  777. result = grouped.agg(np.mean)
  778. expected = grouped.mean()
  779. assert_frame_equal(result, expected)
  780. grouped = self.mframe[3:5].groupby(level=0)
  781. agged = grouped.apply(lambda x: x.mean())
  782. agged_A = grouped['A'].apply(np.mean)
  783. assert_series_equal(agged['A'], agged_A)
  784. self.assertEquals(agged.index.name, 'first')
  785. def test_apply_concat_preserve_names(self):
  786. grouped = self.three_group.groupby(['A', 'B'])
  787. def desc(group):
  788. result = group.describe()
  789. result.index.name = 'stat'
  790. return result
  791. def desc2(group):
  792. result = group.describe()
  793. result.index.name = 'stat'
  794. result = result[:len(group)]
  795. # weirdo
  796. return result
  797. def desc3(group):
  798. result = group.describe()
  799. # names are different
  800. result.index.name = 'stat_%d' % len(group)
  801. result = result[:len(group)]
  802. # weirdo
  803. return result
  804. result = grouped.apply(desc)
  805. self.assertEquals(result.index.names, ['A', 'B', 'stat'])
  806. result2 = grouped.apply(desc2)
  807. self.assertEquals(result2.index.names, ['A', 'B', 'stat'])
  808. result3 = grouped.apply(desc3)
  809. self.assertEquals(result3.index.names, ['A', 'B', None])
  810. def test_nonsense_func(self):
  811. df = DataFrame([0])
  812. self.assertRaises(Exception, df.groupby, lambda x: x + 'foo')
  813. def test_cythonized_aggers(self):
  814. data = {'A' : [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan],
  815. 'B' : ['A', 'B'] * 6,
  816. 'C' : np.random.randn(12)}
  817. df = DataFrame(data)
  818. df['C'][2:10:2] = nan
  819. def _testit(op):
  820. # single column
  821. grouped = df.drop(['B'], axis=1).groupby('A')
  822. exp = {}
  823. for cat, group in grouped:
  824. exp[cat] = op(group['C'])
  825. exp = DataFrame({'C' : exp})
  826. result = op(grouped)
  827. assert_frame_equal(result, exp)
  828. # multiple columns
  829. grouped = df.groupby(['A', 'B'])
  830. expd = {}
  831. for (cat1, cat2), group in grouped:
  832. expd.setdefault(cat1, {})[cat2] = op(group['C'])
  833. exp = DataFrame(expd).T.stack(dropna=False)
  834. result = op(grouped)['C']
  835. assert_series_equal(result, exp)
  836. _testit(lambda x: x.sum())
  837. _testit(lambda x: x.mean())
  838. _testit(lambda x: x.prod())
  839. _testit(lambda x: x.min())
  840. _testit(lambda x: x.max())
  841. def test_cython_agg_boolean(self):
  842. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  843. 'b': np.random.randint(0, 2, 50).astype('bool')})
  844. result = frame.groupby('a')['b'].mean()
  845. expected = frame.groupby('a')['b'].agg(np.mean)
  846. assert_series_equal(result, expected)
  847. def test_cython_agg_nothing_to_agg(self):
  848. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  849. 'b': ['foo', 'bar'] * 25})
  850. self.assertRaises(DataError, frame.groupby('a')['b'].mean)
  851. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  852. 'b': ['foo', 'bar'] * 25})
  853. self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
  854. def test_wrap_aggregated_output_multindex(self):
  855. df = self.mframe.T
  856. df['baz', 'two'] = 'peekaboo'
  857. keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
  858. agged = df.groupby(keys).agg(np.mean)
  859. self.assert_(isinstance(agged.columns, MultiIndex))
  860. def aggfun(ser):
  861. if ser.name == ('foo', 'one'):
  862. raise TypeError
  863. else:
  864. return ser.sum()
  865. agged2 = df.groupby(keys).aggregate(aggfun)
  866. self.assertEqual(len(agged2.columns) + 1, len(df.columns))
  867. def test_grouping_attrs(self):
  868. deleveled = self.mframe.reset_index()
  869. grouped = deleveled.groupby(['first', 'second'])
  870. for i, ping in enumerate(grouped.grouper.groupings):
  871. the_counts = self.mframe.groupby(level=i).count()['A']
  872. other_counts = Series(ping.counts, ping.group_index)
  873. assert_almost_equal(the_counts,
  874. other_counts.reindex(the_counts.index))
  875. # compute counts when group by level
  876. grouped = self.mframe.groupby(level=0)
  877. ping = grouped.grouper.groupings[0]
  878. the_counts = grouped.size()
  879. other_counts = Series(ping.counts, ping.group_index)
  880. assert_almost_equal(the_counts,
  881. other_counts.reindex(the_counts.index))
  882. def test_groupby_level(self):
  883. frame = self.mframe
  884. deleveled = frame.reset_index()
  885. result0 = frame.groupby(level=0).sum()
  886. result1 = frame.groupby(level=1).sum()
  887. expected0 = frame.groupby(deleveled['first'].values).sum()
  888. expected1 = frame.groupby(deleveled['second'].values).sum()
  889. expected0 = expected0.reindex(frame.index.levels[0])
  890. expected1 = expected1.reindex(frame.index.levels[1])
  891. self.assert_(result0.index.name == 'first')
  892. self.assert_(result1.index.name == 'second')
  893. assert_frame_equal(result0, expected0)
  894. assert_frame_equal(result1, expected1)
  895. self.assertEquals(result0.index.name, frame.index.names[0])
  896. self.assertEquals(result1.index.name, frame.index.names[1])
  897. # groupby level name
  898. result0 = frame.groupby(level='first').sum()
  899. result1 = frame.groupby(level='second').sum()
  900. assert_frame_equal(result0, expected0)
  901. assert_frame_equal(result1, expected1)
  902. # axis=1
  903. result0 = frame.T.groupby(level=0, axis=1).sum()
  904. result1 = frame.T.groupby(level=1, axis=1).sum()
  905. assert_frame_equal(result0, expected0.T)
  906. assert_frame_equal(result1, expected1.T)
  907. # raise exception for non-MultiIndex
  908. self.assertRaises(ValueError, self.df.groupby, level=1)
  909. def test_groupby_level_apply(self):
  910. frame = self.mframe
  911. result = frame.groupby(level=0).count()
  912. self.assert_(result.index.name == 'first')
  913. result = frame.groupby(level=1).count()
  914. self.assert_(result.index.name == 'second')
  915. result = frame['A'].groupby(level=0).count()
  916. self.assert_(result.index.name == 'first')
  917. def test_groupby_level_mapper(self):
  918. frame = self.mframe
  919. deleveled = frame.reset_index()
  920. mapper0 = {'foo' : 0, 'bar' : 0,
  921. 'baz' : 1, 'qux' : 1}
  922. mapper1 = {'one' : 0, 'two' : 0, 'three' : 1}
  923. result0 = frame.groupby(mapper0, level=0).sum()
  924. result1 = frame.groupby(mapper1, level=1).sum()
  925. mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
  926. mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
  927. expected0 = frame.groupby(mapped_level0).sum()
  928. expected1 = frame.groupby(mapped_level1).sum()
  929. assert_frame_equal(result0, expected0)
  930. assert_frame_equal(result1, expected1)
  931. def test_groupby_level_0_nonmulti(self):
  932. # #1313
  933. a = Series([1,2,3,10,4,5,20,6], Index([1,2,3,1,4,5,2,6], name='foo'))
  934. result = a.groupby(level=0).sum()
  935. self.assertEquals(result.index.name, a.index.name)
  936. def test_level_preserve_order(self):
  937. grouped = self.mframe.groupby(level=0)
  938. exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3])
  939. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  940. def test_grouping_labels(self):
  941. grouped = self.mframe.groupby(self.mframe.index.get_level_values(0))
  942. exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3])
  943. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  944. def test_cython_fail_agg(self):
  945. dr = bdate_range('1/1/2000', periods=50)
  946. ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)
  947. grouped = ts.groupby(lambda x: x.month)
  948. summed = grouped.sum()
  949. expected = grouped.agg(np.sum)
  950. assert_series_equal(summed, expected)
  951. def test_apply_series_to_frame(self):
  952. def f(piece):
  953. return DataFrame({'value' : piece,
  954. 'demeaned' : piece - piece.mean(),
  955. 'logged' : np.log(piece)})
  956. dr = bdate_range('1/1/2000', periods=100)
  957. ts = Series(np.random.randn(100), index=dr)
  958. grouped = ts.groupby(lambda x: x.month)
  959. result = grouped.apply(f)
  960. self.assert_(isinstance(result, DataFrame))
  961. self.assert_(result.index.equals(ts.index))
  962. def test_apply_series_yield_constant(self):
  963. result = self.df.groupby(['A', 'B'])['C'].apply(len)
  964. self.assertEquals(result.index.names[:2], ['A', 'B'])
  965. def test_apply_frame_to_series(self):
  966. grouped = self.df.groupby(['A', 'B'])
  967. result = grouped.apply(len)
  968. expected = grouped.count()['C']
  969. self.assert_(result.index.equals(expected.index))
  970. self.assert_(np.array_equal(result.values, expected.values))
  971. def test_apply_frame_concat_series(self):
  972. def trans(group):
  973. return group.groupby('B')['C'].sum().order()[:2]
  974. def trans2(group):
  975. grouped = group.groupby(df.reindex(group.index)['B'])
  976. return grouped.sum().order()[:2]
  977. df = DataFrame({'A': np.random.randint(0, 5, 1000),
  978. 'B': np.random.randint(0, 5, 1000),
  979. 'C': np.random.randn(1000)})
  980. result = df.groupby('A').apply(trans)
  981. exp = df.groupby('A')['C'].apply(trans2)
  982. assert_series_equal(result, exp)
  983. def test_apply_transform(self):
  984. grouped = self.ts.groupby(lambda x: x.month)
  985. result = grouped.apply(lambda x: x * 2)
  986. expected = grouped.transform(lambda x: x * 2)
  987. assert_series_equal(result, expected)
  988. def test_apply_multikey_corner(self):
  989. grouped = self.tsframe.groupby([lambda x: x.year,
  990. lambda x: x.month])
  991. def f(group):
  992. return group.sort('A')[-5:]
  993. result = grouped.apply(f)
  994. for key, group in grouped:
  995. assert_frame_equal(result.ix[key], f(group))
  996. def test_groupby_series_indexed_differently(self):
  997. s1 = Series([5.0,-9.0,4.0,100.,-5.,55.,6.7],
  998. index=Index(['a','b','c','d','e','f','g']))
  999. s2 = Series([1.0,1.0,4.0,5.0,5.0,7.0],
  1000. index=Index(['a','b','d','f','g','h']))
  1001. grouped = s1.groupby(s2)
  1002. agged = grouped.mean()
  1003. exp = s1.groupby(s2.reindex(s1.index).get).mean()
  1004. assert_series_equal(agged, exp)
  1005. def test_groupby_with_hier_columns(self):
  1006. tuples = zip(*[['bar', 'bar', 'baz', 'baz',
  1007. 'foo', 'foo', 'qux', 'qux'],
  1008. ['one', 'two', 'one', 'two',
  1009. 'one', 'two', 'one', 'two']])
  1010. index = MultiIndex.from_tuples(tuples)
  1011. columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
  1012. ('B', 'cat'), ('A', 'dog')])
  1013. df = DataFrame(np.random.randn(8, 4), index=index,
  1014. columns=columns)
  1015. result = df.groupby(level=0).mean()
  1016. self.assert_(result.columns.equals(columns))
  1017. result = df.groupby(level=0, axis=1).mean()
  1018. self.assert_(result.index.equals(df.index))
  1019. result = df.groupby(level=0).agg(np.mean)
  1020. self.assert_(result.columns.equals(columns))
  1021. result = df.groupby(level=0).apply(lambda x: x.mean())
  1022. self.assert_(result.columns.equals(columns))
  1023. result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
  1024. self.assert_(result.columns.equals(Index(['A', 'B'])))
  1025. self.assert_(result.index.equals(df.index))
  1026. # add a nuisance column
  1027. sorted_columns, _ = columns.sortlevel(0)
  1028. df['A', 'foo'] = 'bar'
  1029. result = df.groupby(level=0).mean()
  1030. self.assert_(result.columns.equals(df.columns[:-1]))
  1031. def test_pass_args_kwargs(self):
  1032. from pandas.compat.scipy import scoreatpercentile
  1033. def f(x, q=None):
  1034. return scoreatpercentile(x, q)
  1035. g = lambda x: scoreatpercentile(x, 80)
  1036. # Series
  1037. ts_grouped = self.ts.groupby(lambda x: x.month)
  1038. agg_result = ts_grouped.agg(scoreatpercentile, 80)
  1039. apply_result = ts_grouped.apply(scoreatpercentile, 80)
  1040. trans_result = ts_grouped.transform(scoreatpercentile, 80)
  1041. agg_expected = ts_grouped.quantile(.8)
  1042. trans_expected = ts_grouped.transform(g)
  1043. assert_series_equal(apply_result, agg_expected)
  1044. assert_series_equal(agg_result, agg_expected)
  1045. assert_series_equal(trans_result, trans_expected)
  1046. agg_result = ts_grouped.agg(f, q=80)
  1047. apply_result = ts_grouped.apply(f, q=80)
  1048. trans_result = ts_grouped.transform(f, q=80)
  1049. assert_series_equal(agg_result, agg_expected)
  1050. assert_series_equal(apply_result, agg_expected)
  1051. assert_series_equal(trans_result, trans_expected)
  1052. # DataFrame
  1053. df_grouped = self.tsframe.groupby(lambda x: x.month)
  1054. agg_result = df_grouped.agg(scoreatpercentile, 80)
  1055. apply_result = df_grouped.apply(DataFrame.quantile, .8)
  1056. expected = df_grouped.quantile(.8)
  1057. assert_frame_equal(apply_result, expected)
  1058. assert_frame_equal(agg_result, expected)
  1059. agg_result = df_grouped.agg(f, q=80)
  1060. apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
  1061. assert_frame_equal(agg_result, expected)
  1062. assert_frame_equal(apply_result, expected)
  1063. # def test_cython_na_bug(self):
  1064. # values = np.random.randn(10)
  1065. # shape = (5, 5)
  1066. # label_list = [np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype=np.int32),
  1067. # np.array([1, 2, 3, 4, 0, 1, 2, 3, 3, 4], dtype=np.int32)]
  1068. # lib.group_aggregate(values, label_list, shape)
  1069. def test_size(self):
  1070. grouped = self.df.groupby(['A', 'B'])
  1071. result = grouped.size()
  1072. for key, group in grouped:
  1073. self.assertEquals(result[key], len(group))
  1074. grouped = self.df.groupby('A')
  1075. result = grouped.size()
  1076. for key, group in grouped:
  1077. self.assertEquals(result[key], len(group))
  1078. grouped = self.df.groupby('B')
  1079. result = grouped.size()
  1080. for key, group in grouped:
  1081. self.assertEquals(result[key], len(group))
  1082. def test_grouping_ndarray(self):
  1083. grouped = self.df.groupby(self.df['A'].values)
  1084. result = grouped.sum()
  1085. expected = self.df.groupby('A').sum()
  1086. assert_frame_equal(result, expected)
  1087. def test_apply_typecast_fail(self):
  1088. df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.],
  1089. 'c' : np.tile(['a','b','c'], 2),
  1090. 'v' : np.arange(1., 7.)})
  1091. def f(group):
  1092. v = group['v']
  1093. group['v2'] = (v - v.min()) / (v.max() - v.min())
  1094. return group
  1095. result = df.groupby('d').apply(f)
  1096. expected = df.copy()
  1097. expected['v2'] = np.tile([0., 0.5, 1], 2)
  1098. assert_frame_equal(result, expected)
  1099. def test_apply_multiindex_fail(self):
  1100. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  1101. [1, 2, 3, 1, 2, 3]])
  1102. df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.],
  1103. 'c' : np.tile(['a','b','c'], 2),
  1104. 'v' : np.arange(1., 7.)}, index=index)
  1105. def f(group):
  1106. v = group['v']
  1107. group['v2'] = (v - v.min()) / (v.max() - v.min())
  1108. return group
  1109. result = df.groupby('d').apply(f)
  1110. expected = df.copy()
  1111. expected['v2'] = np.tile([0., 0.5, 1], 2)
  1112. assert_frame_equal(result, expected)
  1113. def test_apply_corner(self):
  1114. result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
  1115. expected = self.tsframe * 2
  1116. assert_frame_equal(result, expected)
  1117. def test_apply_use_categorical_name(self):
  1118. from pandas import qcut
  1119. cats = qcut(self.df.C, 4)
  1120. def get_stats(group):
  1121. return {'min': group.min(), 'max': group.max(),
  1122. 'count': group.count(), 'mean': group.mean()}
  1123. result = self.df.groupby(cats).D.apply(get_stats)
  1124. self.assertEquals(result.index.names[0], 'C')
  1125. def test_transform_mixed_type(self):
  1126. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  1127. [1, 2, 3, 1, 2, 3]])
  1128. df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.],
  1129. 'c' : np.tile(['a','b','c'], 2),
  1130. 'v' : np.arange(1., 7.)}, index=index)
  1131. def f(group):
  1132. group['g'] = group['d'] * 2
  1133. return group[:1]
  1134. grouped = df.groupby('c')
  1135. result = grouped.apply(f)
  1136. self.assert_(result['d'].dtype == np.float64)
  1137. for key, group in grouped:
  1138. res = f(group)
  1139. assert_frame_equal(res, result.ix[key])
  1140. def test_groupby_wrong_multi_labels(self):
  1141. from pandas import read_csv
  1142. from pandas.util.py3compat import StringIO
  1143. data = """index,foo,bar,baz,spam,data
  1144. 0,foo1,bar1,baz1,spam2,20
  1145. 1,foo1,bar2,baz1,spam3,30
  1146. 2,foo2,bar2,baz1,spam2,40
  1147. 3,foo1,bar1,baz2,spam1,50
  1148. 4,foo3,bar1,baz2,spam1,60"""
  1149. data = read_csv(StringIO(data), index_col=0)
  1150. grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])
  1151. result = grouped.agg(np.mean)
  1152. expected = grouped.mean()
  1153. assert_frame_equal(result, expected)
  1154. def test_groupby_series_with_name(self):
  1155. result = self.df.groupby(self.df['A']).mean()
  1156. result2 = self.df.groupby(self.df['A'], as_index=False).mean()
  1157. self.assertEquals(result.index.name, 'A')
  1158. self.assert_('A' in result2)
  1159. result = self.df.groupby([self.df['A'], self.df['B']]).mean()
  1160. result2 = self.df.groupby([self.df['A'], self.df['B']],
  1161. as_index=False).mean()
  1162. self.assertEquals(result.index.names, ['A', 'B'])
  1163. self.assert_('A' in result2)
  1164. self.assert_('B' in result2)
  1165. def test_groupby_nonstring_columns(self):
  1166. df = DataFrame([np.arange(10) for x in range(10)])
  1167. grouped = df.groupby(0)
  1168. result = grouped.mean()
  1169. expected = df.groupby(df[0]).mean()
  1170. del expected[0]
  1171. assert_frame_equal(result, expected)
  1172. def test_cython_grouper_series_bug_noncontig(self):
  1173. arr = np.empty((100, 100))
  1174. arr.fill(np.nan)
  1175. obj = Series(arr[:, 0], index=range(100))
  1176. inds = np.tile(range(10), 10)
  1177. result = obj.groupby(inds).agg(Series.median)
  1178. self.assert_(result.isnull().all())
  1179. def test_series_grouper_noncontig_index(self):
  1180. index = Index([tm.rands(10) for _ in xrange(100)])
  1181. values = Series(np.random.randn(50), index=index[::2])
  1182. labels = np.random.randint(0, 5, 50)
  1183. # it works!
  1184. grouped = values.groupby(labels)
  1185. # accessing the index elements causes segfault
  1186. f = lambda x: len(set(map(id, x.index)))
  1187. grouped.agg(f)
  1188. def test_convert_objects_leave_decimal_alone(self):
  1189. from decimal import Decimal
  1190. s = Series(range(5))
  1191. labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')
  1192. def convert_fast(x):
  1193. return Decimal(str(x.mean()))
  1194. def convert_force_pure(x):
  1195. # base will be length 0
  1196. assert(len(x.base) == len(x))
  1197. return Decimal(str(x.mean()))
  1198. grouped = s.groupby(labels)
  1199. result = grouped.agg(convert_fast)
  1200. self.assert_(result.dtype == np.object_)
  1201. self.assert_(isinstance(result[0], Decimal))
  1202. result = grouped.agg(convert_force_pure)
  1203. self.assert_(result.dtype == np.object_)
  1204. self.assert_(isinstance(result[0], Decimal))
  1205. def test_groupby_list_infer_array_like(self):
  1206. result = self.df.groupby(list(self.df['A'])).mean()
  1207. expected = self.df.groupby(self.df['A']).mean()
  1208. assert_frame_equal(result, expected)
  1209. self.assertRaises(Exception, self.df.groupby, list(self.df['A'][:-1]))
  1210. # pathological case of ambiguity
  1211. df = DataFrame({'foo' : [0, 1], 'bar' : [3, 4],
  1212. 'val' : np.random.randn(2)})
  1213. result = df.groupby(['foo', 'bar']).mean()
  1214. expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
  1215. def test_dictify(self):
  1216. dict(iter(self.df.groupby('A')))
  1217. dict(iter(self.df.groupby(['A', 'B'])))
  1218. dict(iter(self.df['C'].groupby(self.df['A'])))
  1219. dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']])))
  1220. dict(iter(self.df.groupby('A')['C']))
  1221. dict(iter(self.df.groupby(['A', 'B'])['C']))
  1222. def test_sparse_friendly(self):
  1223. sdf = self.df[['C', 'D']].to_sparse()
  1224. panel = tm.makePanel()
  1225. tm.add_nans(panel)
  1226. def _check_work(gp):
  1227. gp.mean()
  1228. gp.agg(np.mean)
  1229. dict(iter(gp))
  1230. # it works!
  1231. _check_work(sdf.groupby(lambda x: x // 2))
  1232. _check_work(sdf['C'].groupby(lambda x: x // 2))
  1233. _check_work(sdf.groupby(self.df['A']))
  1234. # do this someday
  1235. # _check_work(panel.groupby(lambda x: x.month, axis=1))
  1236. def test_panel_groupby(self):
  1237. self.panel = tm.makePanel()
  1238. tm.add_nans(self.panel)
  1239. grouped = self.panel.groupby({'ItemA' : 0, 'ItemB' : 0, 'ItemC' : 1},
  1240. axis='items')
  1241. agged = grouped.mean()
  1242. agged2 = grouped.agg(lambda x: x.mean('items'))
  1243. tm.assert_panel_equal(agged, agged2)
  1244. self.assert_(np.array_equal(agged.items, [0, 1]))
  1245. grouped = self.panel.groupby(lambda x: x.month, axis='major')
  1246. agged = grouped.mean()
  1247. self.assert_(np.array_equal(agged.major_axis, [1, 2]))
  1248. grouped = self.panel.groupby({'A' : 0, 'B' : 0, 'C' : 1, 'D' : 1},
  1249. axis='minor')
  1250. agged = grouped.mean()
  1251. self.assert_(np.array_equal(agged.minor_axis, [0, 1]))
  1252. def test_numpy_groupby(self):
  1253. from pandas.core.groupby import numpy_groupby
  1254. data = np.random.randn(100, 100)
  1255. labels = np.random.randint(0, 10, size=100)
  1256. df = DataFrame(data)
  1257. result = df.groupby(labels).sum().values
  1258. expected = numpy_groupby(data, labels)
  1259. assert_almost_equal(result, expected)
  1260. result = df.groupby(labels, axis=1).sum().values
  1261. expected = numpy_groupby(data, labels, axis=1)
  1262. assert_almost_equal(result, expected)
  1263. def test_groupby_2d_malformed(self):
  1264. d = DataFrame(index=range(2))
  1265. d['group'] = ['g1', 'g2']
  1266. d['zeros'] = [0, 0]
  1267. d['ones'] = [1, 1]
  1268. d['label'] = ['l1', 'l2']
  1269. tmp = d.groupby(['group']).mean()
  1270. res_values = np.array([[0., 1.], [0., 1.]])
  1271. self.assert_(np.array_equal(tmp.columns, ['zeros', 'ones']))
  1272. self.assert_(np.array_equal(tmp.values, res_values))
  1273. def test_int32_overflow(self):
  1274. B = np.concatenate((np.arange(10000), np.arange(10000),
  1275. np.arange(5000)))
  1276. A = np.arange(25000)
  1277. df = DataFrame({'A' : A, 'B' : B,
  1278. 'C' : A, 'D' : B,
  1279. 'E' : np.random.randn(25000)})
  1280. left = df.groupby(['A', 'B', 'C', 'D']).sum()
  1281. right = df.groupby(['D', 'C', 'B', 'A']).sum()
  1282. self.assert_(len(left) == len(right))
  1283. def test_int64_overflow(self):
  1284. B = np.concatenate((np.arange(1000), np.arange(1000),
  1285. np.arange(500)))
  1286. A = np.arange(2500)
  1287. df = DataFrame({'A' : A, 'B' : B,
  1288. 'C' : A, 'D' : B,
  1289. 'E' : A, 'F' : B,
  1290. 'G' : A, 'H' : B,
  1291. 'values' : np.random.randn(2500)})
  1292. lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
  1293. rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])
  1294. left = lg.sum()['values']
  1295. right = rg.sum()['values']
  1296. exp_index, _ = left.index.sortlevel(0)
  1297. self.assert_(left.index.equals(exp_index))
  1298. exp_index, _ = right.index.sortlevel(0)
  1299. self.assert_(right.index.equals(exp_index))
  1300. tups = map(tuple, df[['A', 'B', 'C', 'D',
  1301. 'E', 'F', 'G', 'H']].values)
  1302. tups = com._asarray_tuplesafe(tups)
  1303. expected = df.groupby(tups).sum()['values']
  1304. for k, v in expected.iteritems():
  1305. self.assert_(left[k] == right[k[::-1]] == v)
  1306. self.assert_(len(left) == len(right))
  1307. def test_groupby_sort_multi(self):
  1308. df = DataFrame({'a' : ['foo', 'bar', 'baz'],
  1309. 'b' : [3, 2, 1],
  1310. 'c' : [0, 1, 2],
  1311. 'd' : np.random.randn(3)})
  1312. tups = map(tuple, df[['a', 'b', 'c']].values)
  1313. tups = com._asarray_tuplesafe(tups)
  1314. result = df.groupby(['a', 'b', 'c'], sort=True).sum()
  1315. self.assert_(np.array_equal(result.index.values,
  1316. tups[[1, 2, 0]]))
  1317. tups = map(tuple, df[['c', 'a', 'b']].values)
  1318. tups = com._asarray_tuplesafe(tups)
  1319. result = df.groupby(['c', 'a', 'b'], sort=True).sum()
  1320. self.assert_(np.array_equal(result.index.values, tups))
  1321. tups = map(tuple, df[['b', 'c', 'a']].values)
  1322. tups = com._asarray_tuplesafe(tups)
  1323. result = df.groupby(['b', 'c', 'a'], sort=True).sum()
  1324. self.assert_(np.array_equal(result.index.values,
  1325. tups[[2, 1, 0]]))
  1326. df = DataFrame({'a' : [0, 1, 2, 0, 1, 2],
  1327. 'b' : [0, 0, 0, 1, 1, 1],
  1328. 'd' : np.random.randn(6)})
  1329. grouped = df.groupby(['a', 'b'])['d']
  1330. result = grouped.sum()
  1331. _check_groupby(df, result, ['a', 'b'], 'd')
  1332. def test_intercept_builtin_sum(self):
  1333. import __builtin__
  1334. s = Series([1., 2., np.nan, 3.])
  1335. grouped = s.groupby([0, 1, 2, 2])
  1336. result = grouped.agg(__builtin__.sum)
  1337. result2 = grouped.apply(__builtin__.sum)
  1338. expected = grouped.sum()
  1339. assert_series_equal(result, expected)
  1340. assert_series_equal(result2, expected)
  1341. def test_column_select_via_attr(self):
  1342. result = self.df.groupby('A').C.sum()
  1343. expected = self.df.groupby('A')['C'].sum()
  1344. assert_series_equal(result, expected)
  1345. self.df['mean'] = 1.5
  1346. result = self.df.groupby('A').mean()
  1347. expected = self.df.groupby('A').agg(np.mean)
  1348. assert_frame_equal(result, expected)
  1349. def test_rank_apply(self):
  1350. lev1 = np.array([rands(10) for _ in xrange(1000)], dtype=object)
  1351. lev2 = np.array([rands(10) for _ in xrange(130)], dtype=object)
  1352. lab1 = np.random.randint(0, 1000, size=5000)
  1353. lab2 = np.random.randint(0, 130, size=5000)
  1354. df = DataFrame({'value' : np.random.randn(5000),
  1355. 'key1' : lev1.take(lab1),
  1356. 'key2' : lev2.take(lab2)})
  1357. result = df.groupby(['key1', 'key2']).value.rank()
  1358. expected = []
  1359. for key, piece in df.groupby(['key1', 'key2']):
  1360. expected.append(piece.value.rank())
  1361. expected = concat(expected, axis=0)
  1362. expected = expected.reindex(result.index)
  1363. assert_series_equal(result, expected)
  1364. def test_dont_clobber_name_column(self):
  1365. df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
  1366. 'name' : ['foo', 'bar', 'baz'] * 2})
  1367. result = df.groupby('key').apply(lambda x: x)
  1368. assert_frame_equal(result, df)
  1369. def test_skip_group_keys(self):
  1370. from pandas import concat
  1371. tsf = tm.makeTimeDataFrame()
  1372. grouped = tsf.groupby(lambda x: x.month, group_keys=False)
  1373. result = grouped.apply(lambda x: x.sort_index(by='A')[:3])
  1374. pieces = []
  1375. for key, group in grouped:
  1376. pieces.append(group.sort_index(by='A')[:3])
  1377. expected = concat(pieces)
  1378. assert_frame_equal(result, expected)
  1379. grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
  1380. result = grouped.apply(lambda x: x.order()[:3])
  1381. pieces = []
  1382. for key, group in grouped:
  1383. pieces.append(group.order()[:3])
  1384. expected = concat(pieces)
  1385. assert_series_equal(result, expected)
  1386. def test_no_nonsense_name(self):
  1387. # GH #995
  1388. s = self.frame['C'].copy()
  1389. s.name = None
  1390. result = s.groupby(self.frame['A']).agg(np.sum)
  1391. self.assert_(result.name is None)
  1392. def test_wrap_agg_out(self):
  1393. grouped = self.three_group.groupby(['A', 'B'])
  1394. def func(ser):
  1395. if ser.dtype == np.object:
  1396. raise TypeError
  1397. else:
  1398. return ser.sum()
  1399. result = grouped.aggregate(func)
  1400. exp_grouped = self.three_group.ix[:, self.three_group.columns != 'C']
  1401. expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
  1402. assert_frame_equal(result, expected)
  1403. def test_multifunc_sum_bug(self):
  1404. # GH #1065
  1405. x = DataFrame(np.arange(9).reshape(3,3))
  1406. x['test']=0
  1407. x['fl']= [1.3,1.5,1.6]
  1408. grouped = x.groupby('test')
  1409. result = grouped.agg({'fl':'sum',2:'size'})
  1410. self.assert_(result['fl'].dtype == np.float64)
  1411. def test_handle_dict_return_value(self):
  1412. def f(group):
  1413. return {'min': group.min(), 'max': group.max()}
  1414. def g(group):
  1415. return Series({'min': group.min(), 'max': group.max()})
  1416. result = self.df.groupby('A')['C'].apply(f)
  1417. expected = self.df.groupby('A')['C'].apply(g)
  1418. self.assert_(isinstance(result, Series))
  1419. assert_series_equal(result, expected)
  1420. def test_getitem_list_of_columns(self):
  1421. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  1422. 'foo', 'bar', 'foo', 'foo'],
  1423. 'B': ['one', 'one', 'two', 'three',
  1424. 'two', 'two', 'one', 'three'],
  1425. 'C': np.random.randn(8),
  1426. 'D': np.random.randn(8),
  1427. 'E': np.random.randn(8)})
  1428. result = df.groupby('A')[['C', 'D']].mean()
  1429. result2 = df.groupby('A')['C', 'D'].mean()
  1430. result3 = df.groupby('A')[df.columns[2:4]].mean()
  1431. expected = df.ix[:, ['A', 'C', 'D']].groupby('A').mean()
  1432. assert_frame_equal(result, expected)
  1433. assert_frame_equal(result2, expected)
  1434. assert_frame_equal(result3, expected)
  1435. def test_agg_multiple_functions_maintain_order(self):
  1436. # GH #610
  1437. funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
  1438. result = self.df.groupby('A')['C'].agg(funcs)
  1439. exp_cols = ['mean', 'max', 'min']
  1440. self.assert_(np.array_equal(result.columns, exp_cols))
  1441. def test_multiple_functions_tuples_and_non_tuples(self):
  1442. # #1359
  1443. funcs = [('foo', 'mean'), 'std']
  1444. ex_funcs = [('foo', 'mean'), ('std', 'std')]
  1445. result = self.df.groupby('A')['C'].agg(funcs)
  1446. expected = self.df.groupby('A')['C'].agg(ex_funcs)
  1447. assert_frame_equal(result, expected)
  1448. result = self.df.groupby('A').agg(funcs)
  1449. expected = self.df.groupby('A').agg(ex_funcs)
  1450. assert_frame_equal(result, expected)
  1451. def test_agg_multiple_functions_too_many_lambdas(self):
  1452. grouped = self.df.groupby('A')
  1453. funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
  1454. self.assertRaises(SpecificationError, grouped.agg, funcs)
  1455. def test_more_flexible_frame_multi_function(self):
  1456. from pandas import concat
  1457. grouped = self.df.groupby('A')
  1458. exmean = grouped.agg({'C' : np.mean, 'D' : np.mean})
  1459. exstd = grouped.agg({'C' : np.std, 'D' : np.std})
  1460. expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
  1461. expected = expected.swaplevel(0, 1, axis=1).sortlevel(0, axis=1)
  1462. result = grouped.aggregate({'C' : [np.mean, np.std],
  1463. 'D' : [np.mean, np.std]})
  1464. assert_frame_equal(result, expected)
  1465. # be careful
  1466. result = grouped.aggregate({'C' : np.mean,
  1467. 'D' : [np.mean, np.std]})
  1468. expected = grouped.aggregate({'C' : [np.mean],
  1469. 'D' : [np.mean, np.std]})
  1470. assert_frame_equal(result, expected)
  1471. def foo(x): return np.mean(x)
  1472. def bar(x): return np.std(x, ddof=1)
  1473. result = grouped.aggregate({'C' : np.mean,
  1474. 'D' : {'foo': np.mean,
  1475. 'bar': np.std}})
  1476. expected = grouped.aggregate({'C' : [np.mean],
  1477. 'D' : [foo, bar]})
  1478. assert_frame_equal(result, expected)
  1479. def test_multi_function_flexible_mix(self):
  1480. # GH #1268
  1481. grouped = self.df.groupby('A')
  1482. result = grouped.aggregate({'C' : {'foo' : 'mean',
  1483. 'bar' : 'std'},
  1484. 'D' : 'sum'})
  1485. result2 = grouped.aggregate({'C' : {'foo' : 'mean',
  1486. 'bar' : 'std'},
  1487. 'D' : ['sum']})
  1488. expected = grouped.aggregate({'C' : {'foo' : 'mean',
  1489. 'bar' : 'std'},
  1490. 'D' : {'sum' : 'sum'}})
  1491. assert_frame_equal(result, expected)
  1492. assert_frame_equal(result2, expected)
  1493. def test_set_group_name(self):
  1494. def f(group):
  1495. assert group.name is not None
  1496. return group
  1497. def freduce(group):
  1498. assert group.name is not None
  1499. return group.sum()
  1500. def foo(x):
  1501. return freduce(x)
  1502. def _check_all(grouped):
  1503. # make sure all these work
  1504. grouped.apply(f)
  1505. grouped.aggregate(freduce)
  1506. grouped.aggregate({'C': freduce, 'D': freduce})
  1507. grouped.transform(f)
  1508. grouped['C'].apply(f)
  1509. grouped['C'].aggregate(freduce)
  1510. grouped['C'].aggregate([freduce, foo])
  1511. grouped['C'].transform(f)
  1512. _check_all(self.df.groupby('A'))
  1513. _check_all(self.df.groupby(['A', 'B']))
  1514. def test_no_dummy_key_names(self):
  1515. # GH #1291
  1516. result = self.df.groupby(self.df['A'].values).sum()
  1517. self.assert_(result.index.name is None)
  1518. result = self.df.groupby([self.df['A'].values,
  1519. self.df['B'].values]).sum()
  1520. self.assert_(result.index.names == [None, None])
  1521. def test_groupby_categorical(self):
  1522. levels = ['foo', 'bar', 'baz', 'qux']
  1523. labels = np.random.randint(0, 4, size=100)
  1524. cats = Categorical(labels, levels, name='myfactor')
  1525. data = DataFrame(np.random.randn(100, 4))
  1526. result = data.groupby(cats).mean()
  1527. expected = data.groupby(np.asarray(cats)).mean()
  1528. expected = expected.reindex(levels)
  1529. assert_frame_equal(result, expected)
  1530. self.assert_(result.index.name == cats.name)
  1531. grouped = data.groupby(cats)
  1532. desc_result = grouped.describe()
  1533. idx = cats.labels.argsort()
  1534. ord_labels = np.asarray(cats).take(idx)
  1535. ord_data = data.take(idx)
  1536. expected = ord_data.groupby(ord_labels, sort=False).describe()
  1537. assert_frame_equal(desc_result, expected)
  1538. def test_groupby_groups_datetimeindex(self):
  1539. # #1430
  1540. from pandas.tseries.api import DatetimeIndex
  1541. periods = 1000
  1542. ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
  1543. df = DataFrame({'high': np.arange(periods),
  1544. 'low': np.arange(periods)}, index=ind)
  1545. grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
  1546. # it works!
  1547. groups = grouped.groups
  1548. self.assert_(isinstance(groups.keys()[0], datetime))
  1549. def test_groupby_reindex_inside_function(self):
  1550. from pandas.tseries.api import DatetimeIndex
  1551. periods = 1000
  1552. ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
  1553. df = DataFrame({'high': np.arange(periods), 'low': np.arange(periods)}, index=ind)
  1554. def agg_before(hour, func, fix=False):
  1555. """
  1556. Run an aggregate func on the subset of data.
  1557. """
  1558. def _func(data):
  1559. d = data.select(lambda x: x.hour < 11).dropna()
  1560. if fix:
  1561. data[data.index[0]]
  1562. if len(d) == 0:
  1563. return None
  1564. return func(d)
  1565. return _func
  1566. def afunc(data):
  1567. d = data.select(lambda x: x.hour < 11).dropna()
  1568. return np.max(d)
  1569. grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
  1570. closure_bad = grouped.agg({'high': agg_before(11, np.max)})
  1571. closure_good = grouped.agg({'high': agg_before(11, np.max, True)})
  1572. assert_frame_equal(closure_bad, closure_good)
  1573. def test_multiindex_columns_empty_level(self):
  1574. l = [['count', 'values'], ['to filter', '']]
  1575. midx = MultiIndex.from_tuples(l)
  1576. df = DataFrame([[1L, 'A']], columns=midx)
  1577. grouped = df.groupby('to filter').groups
  1578. self.assert_(np.array_equal(grouped['A'], [0]))
  1579. grouped = df.groupby([('to filter', '')]).groups
  1580. self.assert_(np.array_equal(grouped['A'], [0]))
  1581. df = DataFrame([[1L, 'A'], [2L, 'B']], columns=midx)
  1582. expected = df.groupby('to filter').groups
  1583. result = df.groupby([('to filter', '')]).groups
  1584. self.assertEquals(result, expected)
  1585. df = DataFrame([[1L, 'A'], [2L, 'A']], columns=midx)
  1586. expected = df.groupby('to filter').groups
  1587. result = df.groupby([('to filter', '')]).groups
  1588. self.assertEquals(result, expected)
  1589. def test_cython_median(self):
  1590. df = DataFrame(np.random.randn(1000))
  1591. df.values[::2] = np.nan
  1592. labels = np.random.randint(0, 50, size=1000).astype(float)
  1593. labels[::17] = np.nan
  1594. result = df.groupby(labels).median()
  1595. exp = df.groupby(labels).agg(nanops.nanmedian)
  1596. assert_frame_equal(result, exp)
  1597. def test_groupby_categorical_no_compress(self):
  1598. data = Series(np.random.randn(9))
  1599. labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
  1600. cats = Categorical(labels, [0, 1, 2])
  1601. result = data.groupby(cats).mean()
  1602. exp = data.groupby(labels).mean()
  1603. assert_series_equal(result, exp)
  1604. labels = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
  1605. cats = Categorical(labels, [0, 1, 2, 3])
  1606. result = data.groupby(cats).mean()
  1607. exp = data.groupby(labels).mean().reindex(cats.levels)
  1608. assert_series_equal(result, exp)
  1609. def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
  1610. tups = map(tuple, df[keys].values)
  1611. tups = com._asarray_tuplesafe(tups)
  1612. expected = f(df.groupby(tups)[field])
  1613. for k, v in expected.iteritems():
  1614. assert(result[k] == v)
  1615. def test_decons():
  1616. from pandas.core.groupby import decons_group_index, get_group_index
  1617. def testit(label_list, shape):
  1618. group_index = get_group_index(label_list, shape)
  1619. label_list2 = decons_group_index(group_index, shape)
  1620. for a, b in zip(label_list, label_list2):
  1621. assert(np.array_equal(a, b))
  1622. shape = (4, 5, 6)
  1623. label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100),
  1624. np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100),
  1625. np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100)]
  1626. testit(label_list, shape)
  1627. shape = (10000, 10000)
  1628. label_list = [np.tile(np.arange(10000), 5),
  1629. np.tile(np.arange(10000), 5)]
  1630. testit(label_list, shape)
  1631. if __name__ == '__main__':
  1632. import nose
  1633. nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
  1634. exit=False)