PageRenderTime 65ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/tests/test_groupby.py

https://github.com/thouis/pandas
Python | 2166 lines | 1771 code | 327 blank | 68 comment | 57 complexity | d373a246d854fe631c2b38c0ad2cd300 MD5 | raw file
Possible License(s): BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. import nose
  2. import unittest
  3. from datetime import datetime
  4. from numpy import nan
  5. from pandas import bdate_range
  6. from pandas.core.index import Index, MultiIndex
  7. from pandas.core.common import rands
  8. from pandas.core.api import Categorical, DataFrame
  9. from pandas.core.groupby import GroupByError, SpecificationError, DataError
  10. from pandas.core.series import Series
  11. from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
  12. assert_series_equal, assert_almost_equal)
  13. from pandas.core.panel import Panel
  14. from pandas.tools.merge import concat
  15. from collections import defaultdict
  16. import pandas.core.common as com
  17. import pandas.core.datetools as dt
  18. import numpy as np
  19. from numpy.testing import assert_equal
  20. import pandas.core.nanops as nanops
  21. import pandas.util.testing as tm
  def commonSetUp(self):
      """Attach shared random fixtures to ``self``.

      Sets ``dateRange``, ``stringIndex``, ``groupId``, ``groupDict``,
      ``columnIndex``, ``stringMatrix`` and ``timeMatrix``.  Both matrices
      are built from the same 250 x 5 random array and differ only in
      their row index.
      """
      n_rows = 250
      self.dateRange = bdate_range('1/1/2005', periods=n_rows)

      random_labels = [rands(8).upper() for _ in xrange(n_rows)]
      self.stringIndex = Index(random_labels)

      # Each label is grouped by its first character.
      first_chars = [label[0] for label in self.stringIndex]
      self.groupId = Series(first_chars, index=self.stringIndex)
      self.groupDict = dict(self.groupId.iteritems())

      self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])
      values = np.random.randn(n_rows, 5)
      self.stringMatrix = DataFrame(values, columns=self.columnIndex,
                                    index=self.stringIndex)
      self.timeMatrix = DataFrame(values, columns=self.columnIndex,
                                  index=self.dateRange)
  34. class TestGroupBy(unittest.TestCase):
  35. _multiprocess_can_split_ = True
  def setUp(self):
      """Create the fixtures shared by the tests in this class.

      Builds time-series data (``ts``, ``seriesd``, ``tsd``, ``frame``,
      ``tsframe``), a flat two-key frame (``df``), a MultiIndex-ed frame
      (``mframe``) and a three-key frame (``three_group``).
      """
      self.ts = tm.makeTimeSeries()
      self.seriesd = tm.getSeriesData()
      self.tsd = tm.getTimeSeriesData()
      self.frame = DataFrame(self.seriesd)
      self.tsframe = DataFrame(self.tsd)

      # Two string key columns plus two random value columns.
      df_data = {'A': ['foo', 'bar', 'foo', 'bar',
                       'foo', 'bar', 'foo', 'foo'],
                 'B': ['one', 'one', 'two', 'three',
                       'two', 'two', 'one', 'three'],
                 'C': np.random.randn(8),
                 'D': np.random.randn(8)}
      self.df = DataFrame(df_data)

      # Ten rows addressed by a two-level ('first', 'second') index.
      level_values = [['foo', 'bar', 'baz', 'qux'],
                      ['one', 'two', 'three']]
      level_labels = [[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                      [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]
      mindex = MultiIndex(levels=level_values, labels=level_labels,
                          names=['first', 'second'])
      self.mframe = DataFrame(np.random.randn(10, 3), index=mindex,
                              columns=['A', 'B', 'C'])

      # Three string key columns plus three random value columns.
      three_group_data = {'A': ['foo', 'foo', 'foo', 'foo',
                                'bar', 'bar', 'bar', 'bar',
                                'foo', 'foo', 'foo'],
                          'B': ['one', 'one', 'one', 'two',
                                'one', 'one', 'one', 'two',
                                'two', 'two', 'one'],
                          'C': ['dull', 'dull', 'shiny', 'dull',
                                'dull', 'shiny', 'shiny', 'dull',
                                'shiny', 'shiny', 'shiny'],
                          'D': np.random.randn(11),
                          'E': np.random.randn(11),
                          'F': np.random.randn(11)}
      self.three_group = DataFrame(three_group_data)
  def test_basic(self):
      """Exercise aggregate/agg/transform on a shuffled integer Series."""
      series = Series(np.arange(9) // 3, index=np.arange(9))
      order = np.arange(9)
      np.random.shuffle(order)
      series = series.reindex(order)

      by_third = series.groupby(lambda x: x // 3)
      for _, chunk in by_third:
          self.assertEqual(len(chunk), 3)

      means = by_third.aggregate(np.mean)
      self.assertEqual(means[1], 1)
      assert_series_equal(means, by_third.agg(np.mean))  # shorthand
      assert_series_equal(means, by_third.mean())

      # Cython only returning floating point for now...
      python_sum = by_third.agg(np.sum).astype(float)
      assert_series_equal(python_sum, by_third.sum())

      scaled = by_third.transform(lambda x: x * x.sum())
      self.assertEqual(scaled[7], 12)

      by_value = series.groupby(series)
      assert_series_equal(by_value.aggregate(np.mean), means)

      # complex agg: results are not inspected, the calls must not raise
      by_third.aggregate([np.mean, np.std])
      by_third.aggregate({'one': np.mean,
                          'two': np.std})

      offsets = {0: 10, 1: 20, 2: 30}
      shifted = by_third.agg(lambda x: offsets[x.name] + x.mean())
      self.assertEqual(shifted[1], 21)

      # corner cases: a function returning a non-scalar must be rejected
      self.assertRaises(Exception, by_third.aggregate, lambda x: x * 2)
  def test_first_last_nth(self):
      """Check first(), last() and nth() on ``self.df`` grouped by 'A'."""
      # tests for first / last / nth
      grouped = self.df.groupby('A')

      first = grouped.first()
      expected = self.df.ix[[1, 0], ['B', 'C', 'D']]
      expected.index = ['bar', 'foo']
      assert_frame_equal(first, expected)

      last = grouped.last()
      expected = self.df.ix[[5, 7], ['B', 'C', 'D']]
      expected.index = ['bar', 'foo']
      assert_frame_equal(last, expected)

      nth = grouped.nth(1)
      expected = self.df.ix[[3, 2], ['B', 'C', 'D']]
      expected.index = ['bar', 'foo']
      assert_frame_equal(nth, expected)

      # it works!
      grouped['B'].first()
      grouped['B'].last()
      grouped['B'].nth(0)

      # Blank out 'B' for every 'foo' row; the 'foo' group must then
      # report NaN.  ``grouped`` was created before this mutation.
      self.df['B'][self.df['A'] == 'foo'] = np.nan
      self.assertTrue(com.isnull(grouped['B'].first()['foo']))
      self.assertTrue(com.isnull(grouped['B'].last()['foo']))
      self.assertTrue(com.isnull(grouped['B'].nth(0)['foo']))
  def test_grouper_iter(self):
      """Iterating the grouper yields the group keys."""
      keys = sorted(self.df.groupby('A').grouper)
      self.assertEqual(keys, ['bar', 'foo'])
  def test_empty_groups(self):
      """Grouping by an empty list of keys raises ValueError (GH #1048)."""
      # GH # 1048
      no_keys = []
      self.assertRaises(ValueError, self.df.groupby, no_keys)
  def test_groupby_grouper(self):
      """A grouper object can itself be passed as the groupby key."""
      by_a = self.df.groupby('A')
      via_grouper = self.df.groupby(by_a.grouper).mean()
      assert_frame_equal(via_grouper, by_a.mean())
  def test_groupby_dict_mapping(self):
      """Grouping a Series by a dict matches grouping by the mapped labels."""
      # GH #679
      from pandas import Series

      single = Series({'T1': 5})
      via_dict = single.groupby({'T1': 'T2'}).agg(sum)
      via_list = single.groupby(['T2']).agg(sum)
      assert_series_equal(via_dict, via_list)

      s = Series([1., 2., 3., 4.], index=list('abcd'))
      mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
      labels = [0, 0, 1, 1]

      result = s.groupby(mapping).mean()
      result2 = s.groupby(mapping).agg(np.mean)
      expected = s.groupby(labels).mean()
      # NOTE(review): expected2 is built exactly like expected; possibly
      # .agg(np.mean) was intended -- confirm.
      expected2 = s.groupby(labels).mean()

      assert_series_equal(result, expected)
      assert_series_equal(result, result2)
      assert_series_equal(result, expected2)
  def test_groupby_nonobject_dtype(self):
      """Grouping by the raw level labels matches their object-dtype cast."""
      key = self.mframe.index.labels[0]
      native_sum = self.mframe.groupby(key).sum()
      object_sum = self.mframe.groupby(key.astype('O')).sum()
      assert_frame_equal(native_sum, object_sum)
  def test_agg_regression1(self):
      """agg(np.mean) equals mean() when grouping by (year, month)."""
      key_funcs = [lambda x: x.year, lambda x: x.month]
      by_year_month = self.tsframe.groupby(key_funcs)
      via_agg = by_year_month.agg(np.mean)
      assert_frame_equal(via_agg, by_year_month.mean())
  def test_agg_datetimes_mixed(self):
      """Grouping on string dates and on ``date`` objects (each with a
      missing entry) must produce the same number of groups.
      """
      data = [[1, '2012-01-01', 1.0],
              [2, '2012-01-02', 2.0],
              [3, None, 3.0]]

      df1 = DataFrame({'key': [x[0] for x in data],
                       'date': [x[1] for x in data],
                       'value': [x[2] for x in data]})

      # Same rows, with the date strings parsed into datetime.date objects
      # (None is kept as None).
      data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date()
               if row[1] else None, row[2]] for row in data]

      df2 = DataFrame({'key': [x[0] for x in data],
                       'date': [x[1] for x in data],
                       'value': [x[2] for x in data]})

      df1['weights'] = df1['value'] / df1['value'].sum()
      gb1 = df1.groupby('date').aggregate(np.sum)

      # Derive df2's weights from df2 itself; the original used df1 here,
      # which holds the same values, so the outcome is unchanged.
      df2['weights'] = df2['value'] / df2['value'].sum()
      gb2 = df2.groupby('date').aggregate(np.sum)

      # assertEqual is not stripped under ``python -O`` and reports both
      # lengths on failure.
      self.assertEqual(len(gb1), len(gb2))
  def test_agg_must_agg(self):
      """agg() rejects functions that do not reduce each group."""
      by_a = self.df.groupby('A')['C']
      non_reducers = [lambda x: x.describe(),
                      lambda x: x.index[:2]]
      for func in non_reducers:
          self.assertRaises(Exception, by_a.agg, func)
  def test_agg_ser_multi_key(self):
      """Aggregating a Series by two key Series matches the frame-level
      two-column groupby sum for that column.
      """
      ser = self.df.C
      f = lambda x: x.sum()
      # ``ser`` was previously assigned but never used; group it directly.
      results = ser.groupby([self.df.A, self.df.B]).aggregate(f)
      expected = self.df.groupby(['A', 'B']).sum()['C']
      assert_series_equal(results, expected)
  def test_get_group(self):
      """get_group(1) on a month-grouped Panel equals its month-1 rows."""
      panel = tm.makePanel()
      by_month = panel.groupby(lambda x: x.month, axis='major')
      january = by_month.get_group(1)

      january_dates = [stamp for stamp in panel.major_axis
                       if stamp.month == 1]
      expected = panel.reindex(major=january_dates)
      assert_panel_equal(january, expected)
  def test_agg_apply_corner(self):
      """With an all-NaN key, sum/agg/apply return empty results."""
      # nothing to group, all NA
      empty_series = Series([])
      ts_grouped = self.ts.groupby(self.ts * np.nan)
      assert_series_equal(ts_grouped.sum(), empty_series)
      assert_series_equal(ts_grouped.agg(np.sum), Series([]))
      assert_series_equal(ts_grouped.apply(np.sum), Series([]))

      # DataFrame
      frame_grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
      empty_frame = DataFrame(columns=self.tsframe.columns, dtype=float)
      assert_frame_equal(frame_grouped.sum(), empty_frame)
      assert_frame_equal(frame_grouped.agg(np.sum), empty_frame)
      assert_frame_equal(frame_grouped.apply(np.sum),
                         DataFrame({}, dtype=float))
  def test_agg_grouping_is_list_tuple(self):
      """agg(np.mean) matches mean() when the Grouping wraps a list or tuple."""
      from pandas.core.groupby import Grouping

      frame = tm.makeTimeDataFrame()
      grouped = frame.groupby(lambda x: x.year)
      grouper = grouped.grouper.groupings[0].grouper

      # Swap the first grouping for one backed by a plain list, then by a
      # plain tuple, and compare the Python and built-in mean each time.
      for container in (list, tuple):
          grouped.grouper.groupings[0] = Grouping(self.ts.index,
                                                  container(grouper))
          via_agg = grouped.agg(np.mean)
          via_mean = grouped.mean()
          tm.assert_frame_equal(via_agg, via_mean)
  def test_agg_python_multiindex(self):
      """agg(np.mean) equals mean() when grouping mframe by columns A and B."""
      by_ab = self.mframe.groupby(['A', 'B'])
      via_agg = by_ab.agg(np.mean)
      tm.assert_frame_equal(via_agg, by_ab.mean())
  def test_apply_describe_bug(self):
      """Smoke test: describe() on a level-grouped frame must not raise."""
      by_first_level = self.mframe.groupby(level='first')
      by_first_level.describe()  # it works!
  def test_len(self):
      """len() of a groupby equals the number of distinct group keys."""
      df = tm.makeTimeDataFrame()

      # The test expects one group per row for (year, month, day) keys.
      grouped = df.groupby([lambda x: x.year,
                            lambda x: x.month,
                            lambda x: x.day])
      self.assertEqual(len(grouped), len(df))

      grouped = df.groupby([lambda x: x.year,
                            lambda x: x.month])
      expected = len(set([(x.year, x.month) for x in df.index]))
      self.assertEqual(len(grouped), expected)
  def test_groups(self):
      """``groups`` is cached and maps each key to the rows holding it."""
      grouped = self.df.groupby(['A'])
      groups = grouped.groups
      self.assertTrue(groups is grouped.groups)  # caching works

      for k, v in grouped.groups.iteritems():
          self.assertTrue((self.df.ix[v]['A'] == k).all())

      grouped = self.df.groupby(['A', 'B'])
      groups = grouped.groups
      self.assertTrue(groups is grouped.groups)  # caching works

      # With two keys, k is an (A, B) tuple.
      for k, v in grouped.groups.iteritems():
          self.assertTrue((self.df.ix[v]['A'] == k[0]).all())
          self.assertTrue((self.df.ix[v]['B'] == k[1]).all())
  def test_aggregate_str_func(self):
      """Aggregations named by string match the corresponding methods."""
      from pandas.util.compat import OrderedDict

      def _check_results(grouped):
          # single series
          assert_series_equal(grouped['A'].agg('std'), grouped['A'].std())

          # group frame by function name
          assert_frame_equal(grouped.aggregate('var'), grouped.var())

          # group frame by function dict
          spec = OrderedDict([['A', 'var'], ['B', 'std'], ['C', 'mean']])
          result = grouped.agg(spec)
          expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
                                            ['B', grouped['B'].std()],
                                            ['C', grouped['C'].mean()]]))
          assert_frame_equal(result, expected)

      # Run the checks for a single-key and a two-key grouping.
      weekday_key = lambda x: x.weekday()
      _check_results(self.tsframe.groupby(weekday_key))

      month_weekday_keys = [lambda x: x.month,
                            lambda x: x.weekday()]
      _check_results(self.tsframe.groupby(month_weekday_keys))
  def test_aggregate_item_by_item(self):
      """Column-by-column aggregation drops failing columns and handles
      an empty frame.
      """
      # The original also built a copy of self.df with an extra 'E' column
      # and never used it; that dead code is removed.
      grouped = self.df.groupby('A')

      def aggfun(ser):
          # String concatenation; the assertion below expects exactly one
          # column to survive this function.
          return len(ser + 'a')
      result = grouped.agg(aggfun)
      self.assertEqual(len(result.columns), 1)

      aggfun = lambda ser: ser.size
      result = grouped.agg(aggfun)
      foo = (self.df.A == 'foo').sum()
      bar = (self.df.A == 'bar').sum()
      K = len(result.columns)

      # Every column of a group's row holds that group's row count.
      assert_almost_equal(result.xs('foo'), [foo] * K)
      assert_almost_equal(result.xs('bar'), [bar] * K)

      def aggfun(ser):
          return ser.size
      result = DataFrame().groupby(self.df.A).agg(aggfun)
      self.assertTrue(isinstance(result, DataFrame))
      self.assertEqual(len(result), 0)
  def test_basic_regression(self):
      """Regression smoke test: mean() over float group keys must not raise."""
      # regression
      values = [1.0 * x for x in range(1, 10) * 10][:1095]
      series = Series(values, range(0, len(values)))

      raw_keys = np.random.random((1100,))
      keys = Series(raw_keys, range(0, len(raw_keys))) * 10.

      series.groupby(keys).mean()
  def test_transform(self):
      """transform() multiplies each value by its group's sum."""
      series = Series(np.arange(9) // 3, index=np.arange(9))
      order = np.arange(9)
      np.random.shuffle(order)
      series = series.reindex(order)

      by_third = series.groupby(lambda x: x // 3)
      scaled = by_third.transform(lambda x: x * x.sum())
      self.assertEqual(scaled[7], 12)
  def test_transform_broadcast(self):
      """transform(np.mean) broadcasts each group's mean over its members
      for a Series, a frame by rows, and a frame by columns.
      """
      grouped = self.ts.groupby(lambda x: x.month)
      result = grouped.transform(np.mean)

      self.assertTrue(result.index.equals(self.ts.index))
      for _, gp in grouped:
          assert_fp_equal(result.reindex(gp.index), gp.mean())

      grouped = self.tsframe.groupby(lambda x: x.month)
      result = grouped.transform(np.mean)
      self.assertTrue(result.index.equals(self.tsframe.index))
      for _, gp in grouped:
          agged = gp.mean()
          res = result.reindex(gp.index)
          for col in self.tsframe:
              assert_fp_equal(res[col], agged[col])

      # group columns
      grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                     axis=1)
      result = grouped.transform(np.mean)
      self.assertTrue(result.index.equals(self.tsframe.index))
      self.assertTrue(result.columns.equals(self.tsframe.columns))
      for _, gp in grouped:
          agged = gp.mean(1)
          res = result.reindex(columns=gp.columns)
          for idx in gp.index:
              assert_fp_equal(res.xs(idx), agged[idx])
  def test_transform_multiple(self):
      """Smoke test: transform() with two grouping keys must not raise."""
      key_funcs = [lambda x: x.year, lambda x: x.month]
      by_year_month = self.ts.groupby(key_funcs)

      # Results are not inspected.
      by_year_month.transform(lambda x: x * 2)
      by_year_month.transform(np.mean)
  def test_dispatch_transform(self):
      """fillna() on a groupby equals transform() with the same fillna."""
      # Keep every fifth row; reindexing back leaves the rest missing.
      sparse = self.tsframe[::5].reindex(self.tsframe.index)
      month_of = lambda x: x.month

      filled = sparse.groupby(month_of).fillna(method='pad')

      pad = lambda x: x.fillna(method='pad')
      expected = sparse.groupby(lambda x: x.month).transform(pad)
      assert_frame_equal(filled, expected)
  def test_transform_select_columns(self):
      """Selecting columns on the groupby equals selecting them first."""
      group_mean = lambda x: x.mean()

      result = self.df.groupby('A')['C', 'D'].transform(group_mean)

      subset = self.df[['C', 'D']]
      expected = subset.groupby(self.df['A']).transform(group_mean)
      assert_frame_equal(result, expected)
  def test_transform_exclude_nuisance(self):
      """Frame-level transform equals per-column transforms of C and D."""
      by_a = self.df.groupby('A')
      per_column = {}
      for col in ['C', 'D']:
          per_column[col] = by_a[col].transform(np.mean)
      expected = DataFrame(per_column)

      result = self.df.groupby('A').transform(np.mean)
      assert_frame_equal(result, expected)
  def test_transform_function_aliases(self):
      """transform('mean') equals transform(np.mean) for frame and Series."""
      frame_by_alias = self.df.groupby('A').transform('mean')
      frame_by_func = self.df.groupby('A').transform(np.mean)
      assert_frame_equal(frame_by_alias, frame_by_func)

      series_by_alias = self.df.groupby('A')['C'].transform('mean')
      series_by_func = self.df.groupby('A')['C'].transform(np.mean)
      assert_series_equal(series_by_alias, series_by_func)
  def test_with_na(self):
      """Rows whose group label is NaN are left out of the aggregation."""
      index = Index(np.arange(10))
      values = Series(np.ones(10), index)
      labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
                       'bar', nan, 'foo'], index=index)

      grouped = values.groupby(labels)
      agged = grouped.agg(len)
      # Four 'bar' and two 'foo' labels; the four NaN labels form no group.
      expected = Series([4, 2], index=['bar', 'foo'])
      assert_series_equal(agged, expected, check_dtype=False)
      self.assertTrue(issubclass(agged.dtype.type, np.integer))
  def test_attr_wrapper(self):
      """Check std(), describe() and attribute access on the groupby object."""
      by_weekday = self.ts.groupby(lambda x: x.weekday())

      std_result = by_weekday.std()
      std_expected = by_weekday.agg(lambda x: np.std(x, ddof=1))
      assert_series_equal(std_result, std_expected)

      # this is pretty cool
      described = by_weekday.describe()
      per_group = {}
      for name, gp in by_weekday:
          per_group[name] = gp.describe()
      assert_frame_equal(described.unstack(), DataFrame(per_group).T)

      # get attribute
      # NOTE(review): both values are computed but never compared.
      result = by_weekday.dtype
      expected = by_weekday.agg(lambda x: x.dtype)

      # make sure raises error
      self.assertRaises(AttributeError, getattr, by_weekday, 'foo')
  def test_series_describe_multikey(self):
      """Unstacked describe() columns match mean(), std() and min()."""
      series = tm.makeTimeSeries()
      by_year_month = series.groupby([lambda x: x.year,
                                      lambda x: x.month])
      described = by_year_month.describe().unstack()

      assert_series_equal(described['mean'], by_year_month.mean())
      assert_series_equal(described['std'], by_year_month.std())
      assert_series_equal(described['min'], by_year_month.min())
  def test_series_describe_single(self):
      """apply(describe) on a month-grouped Series equals describe()."""
      series = tm.makeTimeSeries()
      by_month = series.groupby(lambda x: x.month)

      via_apply = by_month.apply(lambda x: x.describe())
      via_method = by_month.describe()
      assert_series_equal(via_apply, via_method)
  def test_series_agg_multikey(self):
      """agg(np.sum) equals sum() for a Series grouped by (year, month)."""
      series = tm.makeTimeSeries()
      by_year_month = series.groupby([lambda x: x.year,
                                      lambda x: x.month])

      via_agg = by_year_month.agg(np.sum)
      assert_series_equal(via_agg, by_year_month.sum())
  def test_series_agg_multi_pure_python(self):
      """A Python aggregator that inspects ``x.base`` matches a constant one."""
      key_a = ['foo', 'foo', 'foo', 'foo',
               'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo']
      key_b = ['one', 'one', 'one', 'two',
               'one', 'one', 'one', 'two',
               'two', 'two', 'one']
      key_c = ['dull', 'dull', 'shiny', 'dull',
               'dull', 'shiny', 'shiny', 'dull',
               'shiny', 'shiny', 'shiny']
      data = DataFrame({'A': key_a,
                        'B': key_b,
                        'C': key_c,
                        'D': np.random.randn(11),
                        'E': np.random.randn(11),
                        'F': np.random.randn(11)})

      def bad(x):
          # Fails unless the group passed in has a non-empty ``base``.
          assert(len(x.base) > 0)
          return 'foo'

      result = data.groupby(['A', 'B']).agg(bad)
      expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
      assert_frame_equal(result, expected)
  def test_series_index_name(self):
      """The result index takes its name from the grouping Series ('A')."""
      only_c = self.df.ix[:, ['C']]
      by_a = only_c.groupby(self.df['A'])
      means = by_a.agg(lambda x: x.mean())
      self.assertEqual(means.index.name, 'A')
  def test_frame_describe_multikey(self):
      """Frame describe() agrees with describing each column or group."""
      by_year_month = self.tsframe.groupby([lambda x: x.year,
                                            lambda x: x.month])
      described = by_year_month.describe()
      for col in self.tsframe:
          column_described = by_year_month[col].describe()
          assert_series_equal(described[col], column_described)

      # Same check when grouping the columns instead of the rows.
      column_map = {'A': 0, 'B': 0,
                    'C': 1, 'D': 1}
      by_columns = self.tsframe.groupby(column_map, axis=1)
      described = by_columns.describe()
      for name, group in by_columns:
          assert_frame_equal(described[name], group.describe())
  def test_frame_groupby(self):
      """Aggregate, transform, iterate and inspect groups on a frame
      grouped by weekday.
      """
      grouped = self.tsframe.groupby(lambda x: x.weekday())

      # aggregate
      aggregated = grouped.aggregate(np.mean)
      self.assertEqual(len(aggregated), 5)
      self.assertEqual(len(aggregated.columns), 4)

      # by string
      tscopy = self.tsframe.copy()
      tscopy['weekday'] = [x.weekday() for x in tscopy.index]
      stragged = tscopy.groupby('weekday').aggregate(np.mean)
      assert_frame_equal(stragged, aggregated)

      # transform
      transformed = grouped.transform(lambda x: x - x.mean())
      self.assertEqual(len(transformed), 30)
      self.assertEqual(len(transformed.columns), 4)

      # transform propagate
      transformed = grouped.transform(lambda x: x.mean())
      for name, group in grouped:
          mean = group.mean()
          for idx in group.index:
              assert_almost_equal(transformed.xs(idx), mean)

      # iterate
      for weekday, group in grouped:
          # assertEqual reports both values on failure
          self.assertEqual(group.index[0].weekday(), weekday)

      # groups / group_indices
      groups = grouped.groups
      indices = grouped.indices

      # Taking the index at a group's positions must give its labels.
      for k, v in groups.iteritems():
          samething = self.tsframe.index.take(indices[k])
          self.assertTrue((samething == v).all())
  def test_grouping_is_iterable(self):
      """Smoke test: a Grouping object can be iterated."""
      # this code path isn't used anywhere else
      # not sure it's useful
      key_funcs = [lambda x: x.weekday(),
                   lambda x: x.year]
      grouped = self.tsframe.groupby(key_funcs)
      first_grouping = grouped.grouper.groupings[0]

      # test it works
      for _ in first_grouping:
          pass
  def test_frame_groupby_columns(self):
      """Group a frame's columns via a dict mapping with axis=1."""
      mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
      by_columns = self.tsframe.groupby(mapping, axis=1)

      # aggregate
      means = by_columns.aggregate(np.mean)
      self.assertEqual(len(means), len(self.tsframe))
      self.assertEqual(len(means.columns), 2)

      # transform: grouping the transposed frame by rows must agree
      demean = lambda x: x - x.mean()
      by_rows_of_transpose = self.tsframe.T.groupby(mapping, axis=0)
      assert_frame_equal(by_rows_of_transpose.transform(demean).T,
                         by_columns.transform(demean))

      # iterate
      for _, group in by_columns:
          self.assertEqual(len(group.columns), 2)
  def test_frame_set_name_single(self):
      """The result index is named 'A' for every aggregation form, except
      when grouping with as_index=False.
      """
      grouped = self.df.groupby('A')

      result = grouped.mean()
      self.assertEqual(result.index.name, 'A')

      result = self.df.groupby('A', as_index=False).mean()
      self.assertNotEqual(result.index.name, 'A')

      result = grouped.agg(np.mean)
      self.assertEqual(result.index.name, 'A')

      result = grouped.agg({'C': np.mean, 'D': np.std})
      self.assertEqual(result.index.name, 'A')

      result = grouped['C'].mean()
      self.assertEqual(result.index.name, 'A')
      result = grouped['C'].agg(np.mean)
      self.assertEqual(result.index.name, 'A')
      result = grouped['C'].agg([np.mean, np.std])
      self.assertEqual(result.index.name, 'A')

      result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
      self.assertEqual(result.index.name, 'A')
  def test_multi_iter(self):
      """Iterating a two-key Series groupby yields ((k1, k2), group) pairs
      in the expected order.
      """
      s = Series(np.arange(6))
      k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
      k2 = np.array(['1', '2', '1', '2', '1', '2'])

      grouped = s.groupby([k1, k2])

      iterated = list(grouped)
      expected = [('a', '1', s[[0, 2]]),
                  ('a', '2', s[[1]]),
                  ('b', '1', s[[4]]),
                  ('b', '2', s[[3, 5]])]
      for i, ((one, two), three) in enumerate(iterated):
          e1, e2, e3 = expected[i]
          self.assertEqual(e1, one)
          self.assertEqual(e2, two)
          assert_series_equal(three, e3)
  def test_multi_iter_frame(self):
      """Iterating a two-key frame groupby yields sorted keys, skips key
      combinations with no rows, and works along axis=1.
      """
      k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
      k2 = np.array(['1', '2', '1', '2', '1', '2'])
      df = DataFrame({'v1': np.random.randn(6),
                      'v2': np.random.randn(6),
                      'k1': k1, 'k2': k2},
                     index=['one', 'two', 'three', 'four', 'five', 'six'])

      grouped = df.groupby(['k1', 'k2'])

      # things get sorted!
      iterated = list(grouped)
      idx = df.index
      expected = [('a', '1', df.ix[idx[[4]]]),
                  ('a', '2', df.ix[idx[[3, 5]]]),
                  ('b', '1', df.ix[idx[[0, 2]]]),
                  ('b', '2', df.ix[idx[[1]]])]
      for i, ((one, two), three) in enumerate(iterated):
          e1, e2, e3 = expected[i]
          self.assertEqual(e1, one)
          self.assertEqual(e2, two)
          assert_frame_equal(three, e3)

      # don't iterate through groups with no data
      df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
      df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
      grouped = df.groupby(['k1', 'k2'])
      groups = {}
      for key, gp in grouped:
          groups[key] = gp
      # Only ('a', '2') and ('b', '1') occur in the data.
      self.assertEqual(len(groups), 2)

      # axis = 1 (smoke test: iteration must not raise)
      three_levels = self.three_group.groupby(['A', 'B', 'C']).mean()
      grouped = three_levels.T.groupby(axis=1, level=(1, 2))
      for key, group in grouped:
          pass
  def test_multi_iter_panel(self):
      """Each (month, weekday) Panel group equals the matching reindex."""
      panel = tm.makePanel()
      key_funcs = [lambda x: x.month, lambda x: x.weekday()]
      grouped = panel.groupby(key_funcs, axis=1)

      for (month, weekday), group in grouped:
          matching_dates = [stamp for stamp in panel.major_axis
                            if stamp.month == month
                            and stamp.weekday() == weekday]
          expected = panel.reindex(major=matching_dates)
          assert_panel_equal(group, expected)
  def test_multi_func(self):
      """Grouping by two lookup functions equals grouping by column names."""
      key_a = self.df['A']
      key_b = self.df['B']

      by_funcs = self.df.groupby([key_a.get, key_b.get]).mean()
      by_names = self.df.groupby(['A', 'B']).mean()
      value_cols = ['C', 'D']
      assert_frame_equal(by_funcs.ix[:, value_cols],
                         by_names.ix[:, ['C', 'D']])

      # some "groups" with no data
      row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
      df = DataFrame({'v1': np.random.randn(6),
                      'v2': np.random.randn(6),
                      'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                      'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                     index=row_labels)
      # only verify that it works for now
      df.groupby(['k1', 'k2']).agg(np.sum)
  def test_multi_key_multiple_functions(self):
      """agg([mean, std]) equals a frame of the two separate aggregations."""
      by_ab = self.df.groupby(['A', 'B'])['C']

      combined = by_ab.agg([np.mean, np.std])
      separate = DataFrame({'mean': by_ab.agg(np.mean),
                            'std': by_ab.agg(np.std)})
      assert_frame_equal(combined, separate)
  def test_frame_multi_key_function_list(self):
      """A list of functions on a two-key frame groupby equals concatenating
      the per-column aggregations under the column names as keys.
      """
      data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                              'bar', 'bar', 'bar', 'bar',
                              'foo', 'foo', 'foo'],
                        'B': ['one', 'one', 'one', 'two',
                              'one', 'one', 'one', 'two',
                              'two', 'two', 'one'],
                        'C': ['dull', 'dull', 'shiny', 'dull',
                              'dull', 'shiny', 'shiny', 'dull',
                              'shiny', 'shiny', 'shiny'],
                        'D': np.random.randn(11),
                        'E': np.random.randn(11),
                        'F': np.random.randn(11)})

      grouped = data.groupby(['A', 'B'])
      funcs = [np.mean, np.std]
      agged = grouped.agg(funcs)
      expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
                         grouped['F'].agg(funcs)],
                        keys=['D', 'E', 'F'], axis=1)

      # assertTrue keeps these checks active under ``python -O``, which
      # strips bare assert statements.
      self.assertTrue(isinstance(agged.index, MultiIndex))
      self.assertTrue(isinstance(expected.index, MultiIndex))
      assert_frame_equal(agged, expected)
  def test_groupby_multiple_columns(self):
      """Two-column groupby results match nested single-column groupbys."""
      data = self.df
      grouped = data.groupby(['A', 'B'])

      def _check_op(op):
          frame_result = op(grouped)

          # Build the reference by grouping on 'A', then on 'B' in each group.
          nested = defaultdict(dict)
          for a_key, a_group in data.groupby('A'):
              for b_key, b_group in a_group.groupby('B'):
                  nested[a_key][b_key] = op(b_group.ix[:, ['C', 'D']])
          frames = dict((k, DataFrame(v)) for k, v in nested.iteritems())
          expected = Panel.fromDict(frames).swapaxes(0, 1)

          # a little bit crude
          for col in ['C', 'D']:
              series_result = op(grouped[col])
              exp = expected[col]
              from_frame = frame_result[col].unstack()
              from_series = series_result.unstack()
              assert_frame_equal(from_frame.reindex_like(exp), exp)
              assert_frame_equal(from_series.reindex_like(exp), exp)

      for op in (lambda x: x.sum(), lambda x: x.mean()):
          _check_op(op)

      # test single series works the same
      result = data['C'].groupby([data['A'], data['B']]).mean()
      expected = data.groupby(['A', 'B']).mean()['C']
      assert_series_equal(result, expected)
  def test_groupby_as_index_agg(self):
      """agg() with as_index=False matches the built-in reductions for
      single-key and multi-key groupings.
      """
      from pandas.util.compat import OrderedDict

      # single-key
      flat = self.df.groupby('A', as_index=False)
      assert_frame_equal(flat.agg(np.mean), flat.mean())

      by_spec = flat.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
      mixed = flat.mean()
      mixed['D'] = flat.sum()['D']
      assert_frame_equal(by_spec, mixed)

      indexed = self.df.groupby('A', as_index=True)
      renamed = indexed['C'].sum()
      renamed = DataFrame(renamed).rename(columns={'C': 'Q'})
      by_name = indexed['C'].agg({'Q': np.sum})
      assert_frame_equal(by_name, renamed)

      # multi-key
      flat = self.df.groupby(['A', 'B'], as_index=False)
      assert_frame_equal(flat.agg(np.mean), flat.mean())

      by_spec = flat.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
      mixed = flat.mean()
      mixed['D'] = flat.sum()['D']
      assert_frame_equal(by_spec, mixed)

      renamed = flat['C'].sum()
      renamed = DataFrame(renamed).rename(columns={'C': 'Q'})
      by_name = flat['C'].agg({'Q': np.sum})
      assert_frame_equal(by_name, renamed)
  def test_multifunc_select_col_integer_cols(self):
      """Smoke test: dict agg on a selected column with integer labels."""
      # Relabels the columns of self.df itself (no copy is made).
      frame = self.df
      frame.columns = np.arange(len(frame.columns))

      # it works!
      by_col_one = frame.groupby(1, as_index=False)
      by_col_one[2].agg({'Q': np.mean})
  def test_as_index_series_return_frame(self):
      """With as_index=False, aggregating one selected column returns a
      DataFrame that includes the key columns.
      """
      grouped = self.df.groupby('A', as_index=False)
      grouped2 = self.df.groupby(['A', 'B'], as_index=False)

      result = grouped['C'].agg(np.sum)
      expected = grouped.agg(np.sum).ix[:, ['A', 'C']]
      self.assertTrue(isinstance(result, DataFrame))
      assert_frame_equal(result, expected)

      result2 = grouped2['C'].agg(np.sum)
      expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']]
      self.assertTrue(isinstance(result2, DataFrame))
      assert_frame_equal(result2, expected2)

      result = grouped['C'].sum()
      expected = grouped.sum().ix[:, ['A', 'C']]
      self.assertTrue(isinstance(result, DataFrame))
      assert_frame_equal(result, expected)

      result2 = grouped2['C'].sum()
      expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']]
      self.assertTrue(isinstance(result2, DataFrame))
      assert_frame_equal(result2, expected2)

      # corner case: selecting again from a single-column selection
      self.assertRaises(Exception, grouped['C'].__getitem__,
                        'D')
  def test_groupby_as_index_cython(self):
      """mean() with as_index=False equals the indexed result with the keys
      moved into leading columns and a positional index.
      """
      data = self.df

      # single-key
      flat_mean = data.groupby('A', as_index=False).mean()
      expected = data.groupby(['A']).mean()
      expected.insert(0, 'A', expected.index)
      expected.index = np.arange(len(expected))
      assert_frame_equal(flat_mean, expected)

      # multi-key
      flat_mean = data.groupby(['A', 'B'], as_index=False).mean()
      expected = data.groupby(['A', 'B']).mean()
      # Split the (A, B) index tuples into one sequence per key.
      a_values, b_values = zip(*expected.index._tuple_index)
      expected.insert(0, 'A', a_values)
      expected.insert(1, 'B', b_values)
      expected.index = np.arange(len(expected))
      assert_frame_equal(flat_mean, expected)
  def test_groupby_as_index_series_scalar(self):
      """agg(len) on a selected column matches the frame-level result."""
      flat = self.df.groupby(['A', 'B'], as_index=False)

      # GH #421
      column_result = flat['C'].agg(len)
      frame_result = flat.agg(len).ix[:, ['A', 'B', 'C']]
      assert_frame_equal(column_result, frame_result)
  def test_groupby_as_index_corner(self):
      """as_index=False is rejected for a Series and for axis=1."""
      by_weekday = lambda x: x.weekday()
      self.assertRaises(TypeError, self.ts.groupby,
                        by_weekday, as_index=False)

      by_lowercase = lambda x: x.lower()
      self.assertRaises(ValueError, self.df.groupby,
                        by_lowercase, as_index=False, axis=1)
  def test_groupby_multiple_key(self):
      """Grouping by (year, month, day) and summing reproduces the data,
      along rows and along columns.
      """
      df = tm.makeTimeDataFrame()
      grouped = df.groupby([lambda x: x.year,
                            lambda x: x.month,
                            lambda x: x.day])
      agged = grouped.sum()
      assert_almost_equal(df.values, agged.values)

      # Same grouping applied to the columns of the transposed frame.
      grouped = df.T.groupby([lambda x: x.year,
                              lambda x: x.month,
                              lambda x: x.day], axis=1)

      agged = grouped.agg(lambda x: x.sum(1))
      self.assertTrue(agged.index.equals(df.columns))
      assert_almost_equal(df.T.values, agged.values)

      # NOTE(review): this repeats the aggregation above unchanged; it may
      # be a deliberate second call or a leftover -- confirm.
      agged = grouped.agg(lambda x: x.sum(1))
      assert_almost_equal(df.T.values, agged.values)
  def test_groupby_multi_corner(self):
      # test that having an all-NA column doesn't mess you up
      keys = ['A', 'B']

      with_bad = self.df.copy()
      with_bad['bad'] = np.nan
      observed = with_bad.groupby(keys).mean()

      # expected: means of the untouched frame plus an all-NaN 'bad' column
      wanted = self.df.groupby(keys).mean()
      wanted['bad'] = np.nan

      assert_frame_equal(observed, wanted)
  def test_omit_nuisance(self):
      # Aggregations over self.df grouped by 'A' are expected to match the
      # same aggregations restricted to columns A, C and D.
      kept = ['A', 'C', 'D']
      by_a = self.df.groupby('A')

      assert_frame_equal(by_a.mean(),
                         self.df.ix[:, kept].groupby('A').mean())

      # generic agg(np.mean) must agree with the dedicated mean()
      via_agg = by_a.agg(np.mean)
      via_mean = by_a.mean()
      assert_frame_equal(via_agg, via_mean)

      # add a datetime-valued column and compare agg(np.sum) with sum()
      subset = self.df.ix[:, kept]
      subset['E'] = datetime.now()
      by_a = subset.groupby('A')
      via_agg = by_a.agg(np.sum)
      via_sum = by_a.sum()
      assert_frame_equal(via_agg, via_sum)

      # won't work with axis = 1
      column_groups = {'A' : 0, 'C' : 0, 'D' : 1, 'E' : 1}
      by_columns = subset.groupby(column_groups, axis=1)
      self.assertRaises(TypeError, by_columns.agg,
                        lambda block: block.sum(1, numeric_only=False))
  def test_omit_nuisance_python_multiple(self):
      # With two grouping keys, agg(np.mean) must equal mean().
      by_ab = self.three_group.groupby(['A', 'B'])
      via_agg = by_ab.agg(np.mean)
      via_mean = by_ab.mean()
      assert_frame_equal(via_agg, via_mean)
  def test_empty_groups_corner(self):
      # handle empty groups
      columns = {'k1' : np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                 'k2' : np.array(['1', '1', '1', '2', '2', '2']),
                 'k3' : ['foo', 'bar'] * 3,
                 'v1' : np.random.randn(6),
                 'v2' : np.random.randn(6)}
      by_keys = DataFrame(columns).groupby(['k1', 'k2'])

      via_agg = by_keys.agg(np.mean)
      via_mean = by_keys.mean()
      assert_frame_equal(via_agg, via_mean)

      # two-row slice of the hierarchical frame, grouped on its first level
      by_level = self.mframe[3:5].groupby(level=0)
      frame_means = by_level.apply(lambda piece: piece.mean())
      column_means = by_level['A'].apply(np.mean)
      assert_series_equal(frame_means['A'], column_means)
      self.assertEqual(frame_means.index.name, 'first')
  def test_apply_concat_preserve_names(self):
      # Index names of apply() output when every group returns a frame
      # whose index is itself named.
      by_ab = self.three_group.groupby(['A', 'B'])

      def named(group):
          summary = group.describe()
          summary.index.name = 'stat'
          return summary

      def named_clipped(group):
          summary = group.describe()
          summary.index.name = 'stat'
          # weirdo: keep at most len(group) rows
          return summary[:len(group)]

      def differently_named(group):
          summary = group.describe()
          # names are different
          summary.index.name = 'stat_%d' % len(group)
          # weirdo: keep at most len(group) rows
          return summary[:len(group)]

      # a shared inner name is kept; differing inner names become None
      cases = [(named, ['A', 'B', 'stat']),
               (named_clipped, ['A', 'B', 'stat']),
               (differently_named, ['A', 'B', None])]
      for func, wanted_names in cases:
          combined = by_ab.apply(func)
          self.assertEqual(combined.index.names, wanted_names)
  def test_nonsense_func(self):
      # The key function adds a string to each index label; groupby is
      # expected to raise.
      def append_foo(label):
          return label + 'foo'

      single_cell = DataFrame([0])
      self.assertRaises(Exception, single_cell.groupby, append_foo)
  def test_cythonized_aggers(self):
      # Compare each built-in groupby reduction with the same reduction
      # applied group by group, on data that contains NaN keys and values.
      raw = {'A' : [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan],
             'B' : ['A', 'B'] * 6,
             'C' : np.random.randn(12)}
      df = DataFrame(raw)
      df['C'][2:10:2] = nan

      def _testit(op):
          # single column
          by_a = df.drop(['B'], axis=1).groupby('A')
          per_group = dict((key, op(piece['C'])) for key, piece in by_a)
          wanted = DataFrame({'C' : per_group})
          assert_frame_equal(op(by_a), wanted)

          # multiple columns
          by_ab = df.groupby(['A', 'B'])
          nested = {}
          for (outer, inner), piece in by_ab:
              row = nested.setdefault(outer, {})
              row[inner] = op(piece['C'])
          wanted = DataFrame(nested).T.stack(dropna=False)
          assert_series_equal(op(by_ab)['C'], wanted)

      for method in ('sum', 'mean', 'prod', 'min', 'max'):
          # bind the name now so each callable invokes its own reduction
          _testit(lambda obj, method=method: getattr(obj, method)())
  def test_cython_agg_boolean(self):
      # mean() of a boolean column must agree with agg(np.mean).
      keys = np.random.randint(0, 5, 50)
      flags = np.random.randint(0, 2, 50).astype('bool')
      frame = DataFrame({'a': keys, 'b': flags})

      via_mean = frame.groupby('a')['b'].mean()
      via_agg = frame.groupby('a')['b'].agg(np.mean)
      assert_series_equal(via_mean, via_agg)
  def test_cython_agg_nothing_to_agg(self):
      # mean() over a string-only selection must raise DataError, both for
      # a selected column and for a one-column frame grouped by a Series.
      def make_frame():
          return DataFrame({'a': np.random.randint(0, 5, 50),
                            'b': ['foo', 'bar'] * 25})

      frame = make_frame()
      self.assertRaises(DataError, frame.groupby('a')['b'].mean)

      frame = make_frame()
      self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
  def test_cython_agg_frame_columns(self):
      # #2113
      # The same column-wise mean is computed four times in a row and the
      # results are discarded; the test only requires that no call raises.
      # NOTE(review): the repetition looks deliberate (regression for the
      # issue above) -- confirm before reducing it.
      df = DataFrame({'x': [1,2,3], 'y': [3,4,5]})

      for _ in range(4):
          df.groupby(level=0, axis='columns').mean()
  def test_wrap_aggregated_output_multindex(self):
      # Aggregated output keeps MultiIndex columns, and a column whose
      # aggregation raises TypeError is left out of the result.
      wide = self.mframe.T
      wide['baz', 'two'] = 'peekaboo'

      keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
      averaged = wide.groupby(keys).agg(np.mean)
      self.assertTrue(isinstance(averaged.columns, MultiIndex))

      def aggfun(ser):
          # reject exactly one column, sum all the others
          if ser.name == ('foo', 'one'):
              raise TypeError
          return ser.sum()

      summed = wide.groupby(keys).aggregate(aggfun)
      self.assertEqual(len(summed.columns) + 1, len(wide.columns))
  def test_groupby_level(self):
      # Grouping by an index level (by position or by name) must match
      # grouping by that level's values taken from the reset frame.
      mframe = self.mframe
      flat = mframe.reset_index()

      by_first = mframe.groupby(level=0).sum()
      by_second = mframe.groupby(level=1).sum()

      want_first = mframe.groupby(flat['first'].values).sum()
      want_second = mframe.groupby(flat['second'].values).sum()

      # put the expected rows in the order of the index levels
      want_first = want_first.reindex(mframe.index.levels[0])
      want_second = want_second.reindex(mframe.index.levels[1])

      self.assertTrue(by_first.index.name == 'first')
      self.assertTrue(by_second.index.name == 'second')

      assert_frame_equal(by_first, want_first)
      assert_frame_equal(by_second, want_second)
      self.assertEqual(by_first.index.name, mframe.index.names[0])
      self.assertEqual(by_second.index.name, mframe.index.names[1])

      # groupby level name
      by_first = mframe.groupby(level='first').sum()
      by_second = mframe.groupby(level='second').sum()
      assert_frame_equal(by_first, want_first)
      assert_frame_equal(by_second, want_second)

      # axis=1
      by_first = mframe.T.groupby(level=0, axis=1).sum()
      by_second = mframe.T.groupby(level=1, axis=1).sum()
      assert_frame_equal(by_first, want_first.T)
      assert_frame_equal(by_second, want_second.T)

      # raise exception for non-MultiIndex
      self.assertRaises(ValueError, self.df.groupby, level=1)
  def test_groupby_level_apply(self):
      # count() on a level groupby names the result index after that level,
      # for the whole frame and for the single column 'A'.
      mframe = self.mframe
      cases = [(mframe, 0, 'first'),
               (mframe, 1, 'second'),
               (mframe['A'], 0, 'first')]
      for obj, level, wanted_name in cases:
          counted = obj.groupby(level=level).count()
          self.assertTrue(counted.index.name == wanted_name)
  def test_groupby_level_mapper(self):
      # A dict mapper combined with level= must give the same sums as
      # grouping by the mapped level values computed by hand.
      mframe = self.mframe
      flat = mframe.reset_index()

      first_map = {'foo' : 0, 'bar' : 0,
                   'baz' : 1, 'qux' : 1}
      second_map = {'one' : 0, 'two' : 0, 'three' : 1}

      got_first = mframe.groupby(first_map, level=0).sum()
      got_second = mframe.groupby(second_map, level=1).sum()

      first_keys = np.array([first_map.get(label)
                             for label in flat['first']])
      second_keys = np.array([second_map.get(label)
                              for label in flat['second']])
      want_first = mframe.groupby(first_keys).sum()
      want_second = mframe.groupby(second_keys).sum()

      assert_frame_equal(got_first, want_first)
      assert_frame_equal(got_second, want_second)
  def test_groupby_level_0_nonmulti(self):
      # #1313
      # level=0 on a plain named Index keeps the index name in the result.
      labels = Index([1,2,3,1,4,5,2,6], name='foo')
      series = Series([1,2,3,10,4,5,20,6], labels)

      summed = series.groupby(level=0).sum()
      self.assertEqual(summed.index.name, series.index.name)
  def test_level_preserve_order(self):
      # Group labels produced when grouping self.mframe by level=0.
      wanted = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3])
      by_level = self.mframe.groupby(level=0)
      assert_almost_equal(by_level.grouper.labels[0], wanted)
  def test_grouping_labels(self):
      # Group labels produced when the level-0 values are passed as an
      # ordinary grouping key instead of via level=.
      wanted = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3])
      level_values = self.mframe.index.get_level_values(0)
      by_values = self.mframe.groupby(level_values)
      assert_almost_equal(by_values.grouper.labels[0], wanted)
  def test_cython_fail_agg(self):
      # sum() over string values grouped by month must equal agg(np.sum).
      dates = bdate_range('1/1/2000', periods=50)
      letters = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dates)
      by_month = letters.groupby(lambda stamp: stamp.month)

      via_sum = by_month.sum()
      via_agg = by_month.agg(np.sum)
      assert_series_equal(via_sum, via_agg)
  def test_apply_series_to_frame(self):
      # apply() with a function that turns each Series piece into a frame
      # must return a DataFrame indexed like the original series.
      def expand(piece):
          derived = {'value' : piece,
                     'demeaned' : piece - piece.mean(),
                     'logged' : np.log(piece)}
          return DataFrame(derived)

      dates = bdate_range('1/1/2000', periods=100)
      series = Series(np.random.randn(100), index=dates)

      by_month = series.groupby(lambda stamp: stamp.month)
      expanded = by_month.apply(expand)

      self.assertTrue(isinstance(expanded, DataFrame))
      self.assertTrue(expanded.index.equals(series.index))
  def test_apply_series_yield_constant(self):
      # apply(len) on a selected column keeps both key names in the index.
      column_groups = self.df.groupby(['A', 'B'])['C']
      lengths = column_groups.apply(len)
      self.assertEqual(lengths.index.names[:2], ['A', 'B'])
  def test_apply_frame_to_series(self):
      # apply(len) over frame groups must match count() of column 'C' in
      # both index and values.
      by_ab = self.df.groupby(['A', 'B'])
      lengths = by_ab.apply(len)
      counts = by_ab.count()['C']

      self.assertTrue(lengths.index.equals(counts.index))
      self.assertTrue(np.array_equal(lengths.values, counts.values))
  def test_apply_frame_concat_series(self):
      # The same "two smallest per-B sums of C" is computed from frame
      # groups and from Series groups; both routes must agree.
      def from_frame(group):
          sums = group.groupby('B')['C'].sum()
          return sums.order()[:2]

      def from_series(group):
          # look up the 'B' values for this piece in the enclosing frame
          b_values = df.reindex(group.index)['B']
          sums = group.groupby(b_values).sum()
          return sums.order()[:2]

      df = DataFrame({'A': np.random.randint(0, 5, 1000),
                      'B': np.random.randint(0, 5, 1000),
                      'C': np.random.randn(1000)})

      via_frame = df.groupby('A').apply(from_frame)
      via_series = df.groupby('A')['C'].apply(from_series)
      assert_series_equal(via_frame, via_series)
  def test_apply_transform(self):
      # For a same-shape function, apply() and transform() must agree.
      def double(values):
          return values * 2

      by_month = self.ts.groupby(lambda stamp: stamp.month)
      applied = by_month.apply(double)
      transformed = by_month.transform(double)
      assert_series_equal(applied, transformed)
  def test_apply_multikey_corner(self):
      # With two key functions, each (year, month) slice of the apply()
      # output must equal the function applied to that group directly.
      def last_five_by_a(group):
          return group.sort('A')[-5:]

      key_funcs = [lambda stamp: stamp.year,
                   lambda stamp: stamp.month]
      by_year_month = self.tsframe.groupby(key_funcs)

      combined = by_year_month.apply(last_five_by_a)
      for key, group in by_year_month:
          assert_frame_equal(combined.ix[key], last_five_by_a(group))
  def test_apply_chunk_view(self):
      # Low level tinkering could be unsafe, make sure not
      frame = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                         'value': range(9)})

      # return view
      def first_two(chunk):
          return chunk[:2]

      taken = frame.groupby('key', group_keys=False).apply(first_two)

      # the first two rows of each of the three key groups
      wanted = frame.take([0, 1, 3, 4, 6, 7])
      assert_frame_equal(taken, wanted)
  def test_groupby_series_indexed_differently(self):
      # Grouping by a Series whose index only partly overlaps the data must
      # match grouping by a lookup into that Series reindexed to the data.
      values = Series([5.0,-9.0,4.0,100.,-5.,55.,6.7],
                      index=Index(['a','b','c','d','e','f','g']))
      keys = Series([1.0,1.0,4.0,5.0,5.0,7.0],
                    index=Index(['a','b','d','f','g','h']))

      observed = values.groupby(keys).mean()

      aligned_lookup = keys.reindex(values.index).get
      wanted = values.groupby(aligned_lookup).mean()
      assert_series_equal(observed, wanted)
  def test_groupby_with_hier_columns(self):
      # Axis labels of groupby results when both the index and the columns
      # of the frame are MultiIndexes.
      outer = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
      inner = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
      index = MultiIndex.from_tuples(zip(outer, inner))
      columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                        ('B', 'cat'), ('A', 'dog')])
      df = DataFrame(np.random.randn(8, 4), index=index,
                     columns=columns)

      # row-wise grouping keeps the hierarchical columns
      means = df.groupby(level=0).mean()
      self.assertTrue(means.columns.equals(columns))

      # column-wise grouping keeps the row index
      means = df.groupby(level=0, axis=1).mean()
      self.assertTrue(means.index.equals(df.index))

      means = df.groupby(level=0).agg(np.mean)
      self.assertTrue(means.columns.equals(columns))

      means = df.groupby(level=0).apply(lambda piece: piece.mean())
      self.assertTrue(means.columns.equals(columns))

      means = df.groupby(level=0, axis=1).agg(lambda piece: piece.mean(1))
      self.assertTrue(means.columns.equals(Index(['A', 'B'])))
      self.assertTrue(means.index.equals(df.index))

      # add a nuisance column
      # (the sorted columns are computed but not used below)
      sorted_columns, _ = columns.sortlevel(0)
      df['A', 'foo'] = 'bar'
      means = df.groupby(level=0).mean()
      self.assertTrue(means.columns.equals(df.columns[:-1]))
  1036. def test_pass_args_kwargs(self):
  1037. from pandas.compat.scipy import scoreatpercentile
  1038. def f(x, q=None):
  1039. return scoreatpercentile(x, q)
  1040. g = lambda x: scoreatpercentile(x, 80)
  1041. # Series
  1042. ts_grouped = self.ts.groupby(lambda x: x.month)
  1043. agg_result = ts_grouped.agg(scoreatpercentile, 80)
  1044. apply_result = ts_grouped.apply(scoreatpercentile, 80)
  1045. trans_result = ts_grouped.transform(scoreatpercentile, 80)
  1046. agg_expected = ts_grouped.quantile(.8)
  1047. trans_expected = ts_grouped.transform(g)
  1048. assert_series_equal(apply_result, agg_expected)
  1049. assert_series_equal(agg_result, agg_expected)
  1050. assert_series_equal(trans_result, trans_expected)
  1051. agg_result = ts_grouped.agg(f, q=80)
  1052. apply_result = ts_grouped.apply(f, q=80)
  1053. trans_result = ts_grouped.transform(f, q=80)
  1054. assert_series_equal(agg_result, agg_expected)
  1055. assert_series_equal(apply_result, agg_expected)
  1056. assert_series_equal(trans_result, trans_expected)
  1057. # DataFrame
  1058. df_groupe

Large files are truncated, but you can click here to view the full file