# Scraped-page header (not part of the original module):
# pandas/tests/test_groupby.py -- http://github.com/wesm/pandas
# Python | 6654 lines | 6415 code | 164 blank | 75 comment
# Possible license(s): BSD-3-Clause, Apache-2.0
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import nose
  4. from datetime import datetime
  5. from numpy import nan
  6. from pandas.types.common import _ensure_platform_int
  7. from pandas import date_range, bdate_range, Timestamp, isnull
  8. from pandas.core.index import Index, MultiIndex, CategoricalIndex
  9. from pandas.core.api import Categorical, DataFrame
  10. from pandas.core.common import UnsupportedFunctionCall
  11. from pandas.core.groupby import (SpecificationError, DataError, _nargsort,
  12. _lexsort_indexer)
  13. from pandas.core.series import Series
  14. from pandas.core.config import option_context
  15. from pandas.formats.printing import pprint_thing
  16. from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
  17. assert_series_equal, assert_almost_equal,
  18. assert_index_equal, assertRaisesRegexp)
  19. from pandas.compat import (range, long, lrange, StringIO, lmap, lzip, map, zip,
  20. builtins, OrderedDict, product as cart_product)
  21. from pandas import compat
  22. from pandas.core.panel import Panel
  23. from pandas.tools.merge import concat
  24. from collections import defaultdict
  25. from functools import partial
  26. import pandas.core.common as com
  27. import numpy as np
  28. import pandas.core.nanops as nanops
  29. import pandas.util.testing as tm
  30. import pandas as pd
  31. class TestGroupBy(tm.TestCase):
  32. _multiprocess_can_split_ = True
    def setUp(self):
        """Build the shared fixtures used throughout the groupby tests."""
        # simple float time series
        self.ts = tm.makeTimeSeries()
        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)
        # canonical two-key frame: object key columns A/B, float values C/D
        self.df = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.random.randn(8)})
        # same layout but with a float32 column to exercise mixed dtypes
        self.df_mixed_floats = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.array(
                 np.random.randn(8), dtype='float32')})
        # 10-row frame indexed by a two-level MultiIndex (first/second)
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
                                                                  'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                                columns=['A', 'B', 'C'])
        # three object key columns (A/B/C) plus three float value columns
        self.three_group = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny'],
             'D': np.random.randn(11),
             'E': np.random.randn(11),
             'F': np.random.randn(11)})
    def test_basic(self):
        """Core Series.groupby contract: iteration, agg, transform, apply.

        Repeated for several integer and float dtypes.
        """
        def checkit(dtype):
            # 9 elements, 3 groups of 3 (values 0, 1, 2), shuffled order
            data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
            index = np.arange(9)
            np.random.shuffle(index)
            data = data.reindex(index)
            grouped = data.groupby(lambda x: x // 3)
            for k, v in grouped:
                self.assertEqual(len(v), 3)
            agged = grouped.aggregate(np.mean)
            self.assertEqual(agged[1], 1)
            assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
            assert_series_equal(agged, grouped.mean())
            assert_series_equal(grouped.agg(np.sum), grouped.sum())
            # transform is elementwise apply of a same-length function
            expected = grouped.apply(lambda x: x * x.sum())
            transformed = grouped.transform(lambda x: x * x.sum())
            self.assertEqual(transformed[7], 12)
            assert_series_equal(transformed, expected)
            # grouping by the values themselves
            value_grouped = data.groupby(data)
            assert_series_equal(value_grouped.aggregate(np.mean), agged,
                                check_index_type=False)
            # complex agg
            agged = grouped.aggregate([np.mean, np.std])
            agged = grouped.aggregate({'one': np.mean, 'two': np.std})
            # an agg lambda may look up per-group state via x.name
            group_constants = {0: 10, 1: 20, 2: 30}
            agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
            self.assertEqual(agged[1], 21)
            # corner cases: aggregate must reduce each group to a scalar
            self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
        for dtype in ['int64', 'int32', 'float64', 'float32']:
            checkit(dtype)
    def test_select_bad_cols(self):
        """Selecting nonexistent columns from a GroupBy raises KeyError."""
        df = DataFrame([[1, 2]], columns=['A', 'B'])
        g = df.groupby('A')
        self.assertRaises(KeyError, g.__getitem__, ['C'])  # g[['C']]
        self.assertRaises(KeyError, g.__getitem__, ['A', 'C'])  # g[['A', 'C']]
        with assertRaisesRegexp(KeyError, '^[^A]+$'):
            # A should not be referenced as a bad column...
            # will have to rethink regex if you change message!
            g[['A', 'C']]
    def test_first_last_nth(self):
        """first()/last() agree with nth(0)/nth(-1) on the fixture frame."""
        # tests for first / last / nth
        grouped = self.df.groupby('A')
        first = grouped.first()
        expected = self.df.ix[[1, 0], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)
        nth = grouped.nth(0)
        assert_frame_equal(nth, expected)
        last = grouped.last()
        expected = self.df.ix[[5, 7], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        assert_frame_equal(last, expected)
        nth = grouped.nth(-1)
        assert_frame_equal(nth, expected)
        nth = grouped.nth(1)
        expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy()
        expected.index = Index(['foo', 'bar'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)
        # it works!
        grouped['B'].first()
        grouped['B'].last()
        grouped['B'].nth(0)
        # after injecting NaNs into 'foo' rows, first/last/nth surface them
        # (NOTE: mutates self.df for the remainder of this test)
        self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
        self.assertTrue(isnull(grouped['B'].first()['foo']))
        self.assertTrue(isnull(grouped['B'].last()['foo']))
        self.assertTrue(isnull(grouped['B'].nth(0)['foo']))
        # v0.14.0 whatsnew
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        g = df.groupby('A')
        result = g.first()
        expected = df.iloc[[1, 2]].set_index('A')
        assert_frame_equal(result, expected)
        # nth with dropna='any' skips the NaN row like first() does
        expected = df.iloc[[1, 2]].set_index('A')
        result = g.nth(0, dropna='any')
        assert_frame_equal(result, expected)
    def test_first_last_nth_dtypes(self):
        """first/last/nth preserve bool/int/float32 dtypes (GH 2763)."""
        df = self.df_mixed_floats.copy()
        df['E'] = True
        df['F'] = 1
        # tests for first / last / nth
        grouped = df.groupby('A')
        first = grouped.first()
        expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)
        last = grouped.last()
        expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(last, expected)
        nth = grouped.nth(1)
        expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)
        # GH 2763, first/last shifting dtypes
        # duplicate index label to force the groupby to actually reduce
        idx = lrange(10)
        idx.append(9)
        s = Series(data=lrange(11), index=idx, name='IntCol')
        self.assertEqual(s.dtype, 'int64')
        f = s.groupby(level=0).first()
        self.assertEqual(f.dtype, 'int64')
  173. def test_nth(self):
  174. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  175. g = df.groupby('A')
  176. assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
  177. assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
  178. assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
  179. assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
  180. assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
  181. assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
  182. assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
  183. assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
  184. assert_frame_equal(g[['B']].nth(0),
  185. df.ix[[0, 2], ['A', 'B']].set_index('A'))
  186. exp = df.set_index('A')
  187. assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
  188. assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
  189. exp['B'] = np.nan
  190. assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
  191. assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
  192. # out of bounds, regression from 0.13.1
  193. # GH 6621
  194. df = DataFrame({'color': {0: 'green',
  195. 1: 'green',
  196. 2: 'red',
  197. 3: 'red',
  198. 4: 'red'},
  199. 'food': {0: 'ham',
  200. 1: 'eggs',
  201. 2: 'eggs',
  202. 3: 'ham',
  203. 4: 'pork'},
  204. 'two': {0: 1.5456590000000001,
  205. 1: -0.070345000000000005,
  206. 2: -2.4004539999999999,
  207. 3: 0.46206000000000003,
  208. 4: 0.52350799999999997},
  209. 'one': {0: 0.56573799999999996,
  210. 1: -0.9742360000000001,
  211. 2: 1.033801,
  212. 3: -0.78543499999999999,
  213. 4: 0.70422799999999997}}).set_index(['color',
  214. 'food'])
  215. result = df.groupby(level=0, as_index=False).nth(2)
  216. expected = df.iloc[[-1]]
  217. assert_frame_equal(result, expected)
  218. result = df.groupby(level=0, as_index=False).nth(3)
  219. expected = df.loc[[]]
  220. assert_frame_equal(result, expected)
  221. # GH 7559
  222. # from the vbench
  223. df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
  224. s = df[1]
  225. g = df[0]
  226. expected = s.groupby(g).first()
  227. expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
  228. assert_series_equal(expected2, expected, check_names=False)
  229. self.assertTrue(expected.name, 0)
  230. self.assertEqual(expected.name, 1)
  231. # validate first
  232. v = s[g == 1].iloc[0]
  233. self.assertEqual(expected.iloc[0], v)
  234. self.assertEqual(expected2.iloc[0], v)
  235. # this is NOT the same as .first (as sorted is default!)
  236. # as it keeps the order in the series (and not the group order)
  237. # related GH 7287
  238. expected = s.groupby(g, sort=False).first()
  239. result = s.groupby(g, sort=False).nth(0, dropna='all')
  240. assert_series_equal(result, expected)
  241. # doc example
  242. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  243. g = df.groupby('A')
  244. result = g.B.nth(0, dropna=True)
  245. expected = g.B.first()
  246. assert_series_equal(result, expected)
  247. # test multiple nth values
  248. df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
  249. columns=['A', 'B'])
  250. g = df.groupby('A')
  251. assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
  252. assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
  253. assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
  254. assert_frame_equal(
  255. g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
  256. assert_frame_equal(
  257. g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
  258. assert_frame_equal(
  259. g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
  260. assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
  261. assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
  262. business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
  263. freq='B')
  264. df = DataFrame(1, index=business_dates, columns=['a', 'b'])
  265. # get the first, fourth and last two business days for each month
  266. key = (df.index.year, df.index.month)
  267. result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
  268. expected_dates = pd.to_datetime(
  269. ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
  270. '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
  271. '2014/6/27', '2014/6/30'])
  272. expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
  273. assert_frame_equal(result, expected)
  274. def test_nth_multi_index(self):
  275. # PR 9090, related to issue 8979
  276. # test nth on MultiIndex, should match .first()
  277. grouped = self.three_group.groupby(['A', 'B'])
  278. result = grouped.nth(0)
  279. expected = grouped.first()
  280. assert_frame_equal(result, expected)
    def test_nth_multi_index_as_expected(self):
        """nth(0) on a MultiIndex groupby of all-object columns (GH 8979)."""
        # PR 9090, related to issue 8979
        # test nth on MultiIndex
        three_group = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny']})
        grouped = three_group.groupby(['A', 'B'])
        result = grouped.nth(0)
        # first 'C' value of each (A, B) combination, sorted by key
        expected = DataFrame(
            {'C': ['dull', 'dull', 'dull', 'dull']},
            index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
                                          ['one', 'two', 'one', 'two']],
                                         names=['A', 'B']))
        assert_frame_equal(result, expected)
    def test_group_selection_cache(self):
        """nth/head/tail results must not depend on call order (GH 12839).

        The group-selection cache previously leaked state between calls, so
        every ordering of the two calls is checked on a fresh GroupBy.
        """
        # GH 12839 nth, head, and tail should return same result consistently
        df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
        expected = df.iloc[[0, 2]].set_index('A')
        g = df.groupby('A')
        result1 = g.head(n=2)
        result2 = g.nth(0)
        assert_frame_equal(result1, df)
        assert_frame_equal(result2, expected)
        g = df.groupby('A')
        result1 = g.tail(n=2)
        result2 = g.nth(0)
        assert_frame_equal(result1, df)
        assert_frame_equal(result2, expected)
        g = df.groupby('A')
        result1 = g.nth(0)
        result2 = g.head(n=2)
        assert_frame_equal(result1, expected)
        assert_frame_equal(result2, df)
        g = df.groupby('A')
        result1 = g.nth(0)
        result2 = g.tail(n=2)
        assert_frame_equal(result1, expected)
        assert_frame_equal(result2, df)
    def test_grouper_index_types(self):
        """groupby with a list key works over assorted index types (GH 5375).

        Smoke test: apply must not raise, forward or reversed.
        """
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
        for index in [tm.makeFloatIndex, tm.makeStringIndex,
                      tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
                      tm.makePeriodIndex]:
            df.index = index(len(df))
            df.groupby(list('abcde')).apply(lambda x: x)
            df.index = list(reversed(df.index.tolist()))
            df.groupby(list('abcde')).apply(lambda x: x)
    def test_grouper_multilevel_freq(self):
        """pd.Grouper(level=..., freq=...) on a MultiIndex (GH 7885).

        Level may be given by name or by integer position; both must match
        the equivalent reset_index + key-based grouping.
        """
        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta
        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product(
            [dates, dates], names=['foo', 'bar'])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)
        # Check string level
        expected = df.reset_index().groupby([pd.Grouper(
            key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype='int64')
        result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
            level='bar', freq='W')]).sum()
        assert_frame_equal(result, expected)
        # Check integer level
        result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
            level=1, freq='W')]).sum()
        assert_frame_equal(result, expected)
    def test_grouper_creation_bug(self):
        """pd.Grouper(key=...) behaves like a plain column groupby (GH 8795)."""
        # GH 8795
        df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('A')
        expected = g.sum()
        g = df.groupby(pd.Grouper(key='A'))
        result = g.sum()
        assert_frame_equal(result, expected)
        result = g.apply(lambda x: x.sum())
        assert_frame_equal(result, expected)
        # explicit axis=0 must be equivalent
        g = df.groupby(pd.Grouper(key='A', axis=0))
        result = g.sum()
        assert_frame_equal(result, expected)
        # GH8866: Grouper with a level name plus a resample frequency
        s = Series(np.arange(8, dtype='int64'),
                   index=pd.MultiIndex.from_product(
                       [list('ab'), range(2),
                        date_range('20130101', periods=2)],
                       names=['one', 'two', 'three']))
        result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
        expected = Series([28], index=Index(
            [Timestamp('2013-01-31')], freq='M', name='three'))
        assert_series_equal(result, expected)
        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level='one')).sum()
        expected = s.groupby(level='one').sum()
        assert_series_equal(result, expected)
    def test_grouper_getting_correct_binner(self):
        """Mix a plain level Grouper with a freq-based one (GH 10063)."""
        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product(
            [list('ab'), date_range('20130101', periods=80)], names=['one',
                                                                     'two']))
        result = df.groupby([pd.Grouper(level='one'), pd.Grouper(
            level='two', freq='M')]).sum()
        # 80 daily 1s per outer key -> monthly counts 31 (Jan), 28 (Feb), 21
        expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]},
                             index=MultiIndex.from_product(
                                 [list('ab'),
                                  date_range('20130101', freq='M', periods=3)],
                                 names=['one', 'two']))
        assert_frame_equal(result, expected)
  397. def test_grouper_iter(self):
  398. self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo'])
  399. def test_empty_groups(self):
  400. # GH # 1048
  401. self.assertRaises(ValueError, self.df.groupby, [])
  402. def test_groupby_grouper(self):
  403. grouped = self.df.groupby('A')
  404. result = self.df.groupby(grouped.grouper).mean()
  405. expected = grouped.mean()
  406. assert_frame_equal(result, expected)
    def test_groupby_duplicated_column_errormsg(self):
        """Grouping by a duplicated column name raises ValueError (GH 7511)."""
        # GH7511
        df = DataFrame(columns=['A', 'B', 'A', 'C'],
                       data=[range(4), range(2, 6), range(0, 8, 2)])
        self.assertRaises(ValueError, df.groupby, 'A')
        self.assertRaises(ValueError, df.groupby, ['A', 'B'])
        # grouping by the unique column still works; the duplicated 'A'
        # columns survive as separate result columns
        grouped = df.groupby('B')
        c = grouped.count()
        self.assertTrue(c.columns.nlevels == 1)
        self.assertTrue(c.columns.size == 3)
    def test_groupby_dict_mapping(self):
        """A dict key maps index labels to group labels (GH 679)."""
        # GH #679
        from pandas import Series
        s = Series({'T1': 5})
        result = s.groupby({'T1': 'T2'}).agg(sum)
        expected = s.groupby(['T2']).agg(sum)
        assert_series_equal(result, expected)
        # dict mapping is equivalent to passing the mapped labels directly
        s = Series([1., 2., 3., 4.], index=list('abcd'))
        mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        assert_series_equal(result, expected)
        assert_series_equal(result, result2)
        assert_series_equal(result, expected2)
    def test_groupby_bounds_check(self):
        """Mismatched label/value lengths trip the cython bounds assertion."""
        # groupby_X is code-generated, so if one variant
        # does, the rest probably do to
        a = np.array([1, 2], dtype='object')
        b = np.array([1, 2, 3], dtype='object')
        self.assertRaises(AssertionError, pd.algos.groupby_object, a, b)
    def test_groupby_grouper_f_sanity_checked(self):
        """A grouper function that only works on the whole index must raise."""
        dates = date_range('01-Jan-2013', periods=12, freq='MS')
        ts = Series(np.random.randn(12), index=dates)
        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather then str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.
        self.assertRaises(AssertionError, ts.groupby, lambda key: key[0:6])
    def test_groupby_nonobject_dtype(self):
        """Grouping by an integer key array matches its object-cast version."""
        key = self.mframe.index.labels[0]
        grouped = self.mframe.groupby(key)
        result = grouped.sum()
        expected = self.mframe.groupby(key.astype('O')).sum()
        assert_frame_equal(result, expected)
        # GH 3911, mixed frame non-conversion
        df = self.df_mixed_floats.copy()
        df['value'] = lrange(len(df))
        def max_value(group):
            # row of the group with the largest 'value'
            return group.ix[group['value'].idxmax()]
        applied = df.groupby('A').apply(max_value)
        # dtypes must survive the apply: no silent upcast to object
        result = applied.get_dtype_counts().sort_values()
        expected = Series({'object': 2,
                           'float64': 2,
                           'int64': 1}).sort_values()
        assert_series_equal(result, expected)
    def test_groupby_return_type(self):
        """apply() must return a consistent, reduced type.

        Covers GH 2893 (squeeze), GH 3596 (0.11 regression) and GH 5592
        (groups whose function returns None become NaN/NaT rows).
        """
        # GH2893, return a reduced type
        df1 = DataFrame([{"val1": 1,
                          "val2": 20}, {"val1": 1,
                                        "val2": 19}, {"val1": 2,
                                                      "val2": 27}, {"val1": 2,
                                                                    "val2": 12}
                         ])
        def func(dataf):
            return dataf["val2"] - dataf["val2"].mean()
        result = df1.groupby("val1", squeeze=True).apply(func)
        tm.assertIsInstance(result, Series)
        df2 = DataFrame([{"val1": 1,
                          "val2": 20}, {"val1": 1,
                                        "val2": 19}, {"val1": 1,
                                                      "val2": 27}, {"val1": 1,
                                                                    "val2": 12}
                         ])
        def func(dataf):
            return dataf["val2"] - dataf["val2"].mean()
        result = df2.groupby("val1", squeeze=True).apply(func)
        tm.assertIsInstance(result, Series)
        # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
        df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
        result = df.groupby('X', squeeze=False).count()
        tm.assertIsInstance(result, DataFrame)
        # GH5592
        # inconcistent return type
        df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
                               'Pony', 'Pony'], B=Series(
                                   np.arange(7), dtype='int64'), C=date_range(
                                       '20130101', periods=7)))
        def f(grp):
            return grp.iloc[0]
        expected = df.groupby('A').first()[['B']]
        result = df.groupby('A').apply(f)[['B']]
        assert_frame_equal(result, expected)
        # a group returning None becomes a NaN row
        def f(grp):
            if grp.name == 'Tiger':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['B']]
        e = expected.copy()
        e.loc['Tiger'] = np.nan
        assert_frame_equal(result, e)
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['B']]
        e = expected.copy()
        e.loc['Pony'] = np.nan
        assert_frame_equal(result, e)
        # 5592 revisited, with datetimes
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['C']]
        e = df.groupby('A').first()[['C']]
        e.loc['Pony'] = pd.NaT
        assert_frame_equal(result, e)
        # scalar outputs
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0].loc['C']
        result = df.groupby('A').apply(f)
        e = df.groupby('A').first()['C'].copy()
        e.loc['Pony'] = np.nan
        e.name = None
        assert_series_equal(result, e)
    def test_agg_api(self):
        """agg with a bare custom function flattens the column index (GH 6337)."""
        # GH 6337
        # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
        # different api for agg when passed custom function with mixed frame
        df = DataFrame({'data1': np.random.randn(5),
                        'data2': np.random.randn(5),
                        'key1': ['a', 'a', 'b', 'b', 'a'],
                        'key2': ['one', 'two', 'one', 'two', 'one']})
        grouped = df.groupby('key1')
        def peak_to_peak(arr):
            return arr.max() - arr.min()
        # list form produces a MultiIndex column; bare function must not
        expected = grouped.agg([peak_to_peak])
        expected.columns = ['data1', 'data2']
        result = grouped.agg(peak_to_peak)
        assert_frame_equal(result, expected)
  555. def test_agg_regression1(self):
  556. grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
  557. result = grouped.agg(np.mean)
  558. expected = grouped.mean()
  559. assert_frame_equal(result, expected)
  560. def test_agg_datetimes_mixed(self):
  561. data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]]
  562. df1 = DataFrame({'key': [x[0] for x in data],
  563. 'date': [x[1] for x in data],
  564. 'value': [x[2] for x in data]})
  565. data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1]
  566. else None, row[2]] for row in data]
  567. df2 = DataFrame({'key': [x[0] for x in data],
  568. 'date': [x[1] for x in data],
  569. 'value': [x[2] for x in data]})
  570. df1['weights'] = df1['value'] / df1['value'].sum()
  571. gb1 = df1.groupby('date').aggregate(np.sum)
  572. df2['weights'] = df1['value'] / df1['value'].sum()
  573. gb2 = df2.groupby('date').aggregate(np.sum)
  574. assert (len(gb1) == len(gb2))
    def test_agg_period_index(self):
        """Level-groupby on a PeriodIndex keeps a PeriodIndex result."""
        from pandas import period_range, PeriodIndex
        prng = period_range('2012-1-1', freq='M', periods=3)
        df = DataFrame(np.random.randn(3, 2), index=prng)
        rs = df.groupby(level=0).sum()
        tm.assertIsInstance(rs.index, PeriodIndex)
        # GH 3579: iterating the groups must not raise
        index = period_range(start='1999-01', periods=5, freq='M')
        s1 = Series(np.random.rand(len(index)), index=index)
        s2 = Series(np.random.rand(len(index)), index=index)
        series = [('s1', s1), ('s2', s2)]
        df = DataFrame.from_items(series)
        grouped = df.groupby(df.index.month)
        list(grouped)
    def test_agg_dict_parameter_cast_result_dtypes(self):
        """agg via name, dict, or method keeps datetime dtype (GH 12821)."""
        # GH 12821
        df = DataFrame(
            {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
             'time': date_range('1/1/2011', periods=8, freq='H')})
        # knock out some times so first/last have NaT rows to skip
        df.loc[[0, 1, 2, 5], 'time'] = None
        # test for `first` function
        exp = df.loc[[0, 3, 4, 6]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.first(), exp)
        assert_frame_equal(grouped.agg('first'), exp)
        assert_frame_equal(grouped.agg({'time': 'first'}), exp)
        assert_series_equal(grouped.time.first(), exp['time'])
        assert_series_equal(grouped.time.agg('first'), exp['time'])
        # test for `last` function
        exp = df.loc[[0, 3, 4, 7]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.last(), exp)
        assert_frame_equal(grouped.agg('last'), exp)
        assert_frame_equal(grouped.agg({'time': 'last'}), exp)
        assert_series_equal(grouped.time.last(), exp['time'])
        assert_series_equal(grouped.time.agg('last'), exp['time'])
  611. def test_agg_must_agg(self):
  612. grouped = self.df.groupby('A')['C']
  613. self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
  614. self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])
  615. def test_agg_ser_multi_key(self):
  616. # TODO(wesm): unused
  617. ser = self.df.C # noqa
  618. f = lambda x: x.sum()
  619. results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
  620. expected = self.df.groupby(['A', 'B']).sum()['C']
  621. assert_series_equal(results, expected)
    def test_get_group(self):
        """get_group works on Panel axes and accepts datelike key spellings."""
        wp = tm.makePanel()
        grouped = wp.groupby(lambda x: x.month, axis='major')
        gp = grouped.get_group(1)
        expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
        assert_panel_equal(gp, expected)
        # GH 5267
        # be datelike friendly
        df = DataFrame({'DATE': pd.to_datetime(
            ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013',
             '11-Oct-2013', '11-Oct-2013']),
            'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
            'VAL': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('DATE')
        key = list(g.groups)[0]
        # Timestamp, datetime, and string spellings of the same key all work
        result1 = g.get_group(key)
        result2 = g.get_group(Timestamp(key).to_datetime())
        result3 = g.get_group(str(Timestamp(key)))
        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        g = df.groupby(['DATE', 'label'])
        key = list(g.groups)[0]
        result1 = g.get_group(key)
        result2 = g.get_group((Timestamp(key[0]).to_datetime(), key[1]))
        result3 = g.get_group((str(Timestamp(key[0])), key[1]))
        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        # must pass a same-length tuple with multiple keys
        # NOTE: ('foo') is just the string 'foo', not a 1-tuple
        self.assertRaises(ValueError, lambda: g.get_group('foo'))
        self.assertRaises(ValueError, lambda: g.get_group(('foo')))
        self.assertRaises(ValueError,
                          lambda: g.get_group(('foo', 'bar', 'baz')))
    def test_get_group_grouped_by_tuple(self):
        """get_group with tuple-valued group keys (GH 8121)."""
        # GH 8121
        df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
        gr = df.groupby('ids')
        expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2])
        result = gr.get_group((1, ))
        assert_frame_equal(result, expected)
        # tuples of Timestamps: a date-string tuple must resolve the group
        dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01',
                             '2010-01-02'])
        df = DataFrame({'ids': [(x, ) for x in dt]})
        gr = df.groupby('ids')
        result = gr.get_group(('2010-01-01', ))
        expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2])
        assert_frame_equal(result, expected)
    def test_agg_apply_corner(self):
        """All-NaN group keys produce empty results, not errors."""
        # nothing to group, all NA
        grouped = self.ts.groupby(self.ts * np.nan)
        self.assertEqual(self.ts.dtype, np.float64)
        # groupby float64 values results in Float64Index
        exp = Series([], dtype=np.float64, index=pd.Index(
            [], dtype=np.float64))
        assert_series_equal(grouped.sum(), exp)
        assert_series_equal(grouped.agg(np.sum), exp)
        assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)
        # DataFrame
        grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
        exp_df = DataFrame(columns=self.tsframe.columns, dtype=float,
                           index=pd.Index([], dtype=np.float64))
        assert_frame_equal(grouped.sum(), exp_df, check_names=False)
        assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
        # apply drops the (empty) columns entirely
        assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
                           check_names=False)
    def test_agg_grouping_is_list_tuple(self):
        """A Grouping whose grouper is a plain list or tuple still aggregates."""
        from pandas.core.groupby import Grouping
        df = tm.makeTimeDataFrame()
        grouped = df.groupby(lambda x: x.year)
        grouper = grouped.grouper.groupings[0].grouper
        # replace the internal grouper with a list-backed Grouping
        grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
        # ... and with a tuple-backed Grouping
        grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
  699. def test_grouping_error_on_multidim_input(self):
  700. from pandas.core.groupby import Grouping
  701. self.assertRaises(ValueError,
  702. Grouping, self.df.index, self.df[['A', 'A']])
  703. def test_agg_python_multiindex(self):
  704. grouped = self.mframe.groupby(['A', 'B'])
  705. result = grouped.agg(np.mean)
  706. expected = grouped.mean()
  707. tm.assert_frame_equal(result, expected)
  708. def test_apply_describe_bug(self):
  709. grouped = self.mframe.groupby(level='first')
  710. grouped.describe() # it works!
    def test_apply_issues(self):
        """apply over date groups: idxmax passthrough and no date coercion."""
        # GH 5788
        s = """2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""
        df = pd.read_csv(
            StringIO(s), header=None, names=['date', 'time', 'value'],
            parse_dates=[['date', 'time']])
        df = df.set_index('date_time')
        # apply(idxmax) must match the direct groupby idxmax
        expected = df.groupby(df.index.date).idxmax()
        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
        assert_frame_equal(result, expected)
        # GH 5789
        # don't auto coerce dates
        df = pd.read_csv(
            StringIO(s), header=None, names=['date', 'time', 'value'])
        exp_idx = pd.Index(
            ['2011.05.16', '2011.05.17', '2011.05.18'
             ], dtype=object, name='date')
        expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
        result = df.groupby('date').apply(
            lambda x: x['time'][x['value'].idxmax()])
        assert_series_equal(result, expected)
    def test_time_field_bug(self):
        # Test a fix for the following error related to GH issue 11324 When
        # non-key fields in a group-by dataframe contained time-based fields
        # that were not returned by the apply function, an exception would be
        # raised.
        df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})

        def func_with_no_date(batch):
            # drops the datetime column entirely from the result
            return pd.Series({'c': 2})

        def func_with_date(batch):
            # returns a (different) datetime value in the result
            return pd.Series({'c': 2, 'b': datetime(2015, 1, 1)})

        dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
        dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
        dfg_no_conversion_expected.index.name = 'a'

        dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
        dfg_conversion_expected = pd.DataFrame(
            {'b': datetime(2015, 1, 1),
             'c': 2}, index=[1])
        dfg_conversion_expected.index.name = 'a'

        self.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
        self.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
    def test_len(self):
        # len(groupby) equals the number of observed groups
        df = tm.makeTimeDataFrame()
        grouped = df.groupby([lambda x: x.year, lambda x: x.month,
                              lambda x: x.day])
        self.assertEqual(len(grouped), len(df))

        grouped = df.groupby([lambda x: x.year, lambda x: x.month])
        expected = len(set([(x.year, x.month) for x in df.index]))
        self.assertEqual(len(grouped), expected)

        # issue 11016: rows whose key is NaN belong to no group
        df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
        self.assertEqual(len(df.groupby(('a'))), 0)
        self.assertEqual(len(df.groupby(('b'))), 3)
        self.assertEqual(len(df.groupby(('a', 'b'))), 3)
    def test_groups(self):
        # .groups maps each group key to the row labels belonging to it
        grouped = self.df.groupby(['A'])
        groups = grouped.groups
        self.assertIs(groups, grouped.groups)  # caching works

        for k, v in compat.iteritems(grouped.groups):
            # every row labelled under key k really has A == k
            self.assertTrue((self.df.ix[v]['A'] == k).all())

        grouped = self.df.groupby(['A', 'B'])
        groups = grouped.groups
        self.assertIs(groups, grouped.groups)  # caching works
        for k, v in compat.iteritems(grouped.groups):
            # multi-key grouping: keys are (A, B) tuples
            self.assertTrue((self.df.ix[v]['A'] == k[0]).all())
            self.assertTrue((self.df.ix[v]['B'] == k[1]).all())
    def test_aggregate_str_func(self):
        # string function names ('std', 'var', ...) behave like the
        # corresponding groupby methods

        def _check_results(grouped):
            # single series
            result = grouped['A'].agg('std')
            expected = grouped['A'].std()
            assert_series_equal(result, expected)

            # group frame by function name
            result = grouped.aggregate('var')
            expected = grouped.var()
            assert_frame_equal(result, expected)

            # group frame by function dict
            result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'],
                                              ['C', 'mean'], ['D', 'sem']]))
            expected = DataFrame(OrderedDict([['A', grouped['A'].var(
            )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()],
                ['D', grouped['D'].sem()]]))
            assert_frame_equal(result, expected)

        by_weekday = self.tsframe.groupby(lambda x: x.weekday())
        _check_results(by_weekday)

        by_mwkday = self.tsframe.groupby([lambda x: x.month,
                                          lambda x: x.weekday()])
        _check_results(by_mwkday)
    def test_aggregate_item_by_item(self):
        # per-column (item-by-item) aggregation fallback path
        df = self.df.copy()
        df['E'] = ['a'] * len(self.df)
        # NOTE(review): `df` (with the added string column 'E') is built but
        # never grouped — the test groups `self.df` instead. Looks like a
        # leftover from the pre-0.11 API shown in the commented code below;
        # confirm intent before "fixing" (it would change the expected index).
        grouped = self.df.groupby('A')

        # API change in 0.11
        # def aggfun(ser):
        #     return len(ser + 'a')
        # result = grouped.agg(aggfun)
        # self.assertEqual(len(result.columns), 1)

        aggfun = lambda ser: ser.size
        result = grouped.agg(aggfun)
        foo = (self.df.A == 'foo').sum()
        bar = (self.df.A == 'bar').sum()
        K = len(result.columns)

        # GH5782
        # odd comparisons can result here, so cast to make easy
        exp = pd.Series(np.array([foo] * K), index=list('BCD'),
                        dtype=np.float64, name='foo')
        tm.assert_series_equal(result.xs('foo'), exp)

        exp = pd.Series(np.array([bar] * K), index=list('BCD'),
                        dtype=np.float64, name='bar')
        tm.assert_almost_equal(result.xs('bar'), exp)

        def aggfun(ser):
            return ser.size

        # aggregating an empty frame yields an empty DataFrame result
        result = DataFrame().groupby(self.df.A).agg(aggfun)
        tm.assertIsInstance(result, DataFrame)
        self.assertEqual(len(result), 0)
  837. def test_agg_item_by_item_raise_typeerror(self):
  838. from numpy.random import randint
  839. df = DataFrame(randint(10, size=(20, 10)))
  840. def raiseException(df):
  841. pprint_thing('----------------------------------------')
  842. pprint_thing(df.to_string())
  843. raise TypeError
  844. self.assertRaises(TypeError, df.groupby(0).agg, raiseException)
  845. def test_basic_regression(self):
  846. # regression
  847. T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
  848. result = Series(T, lrange(0, len(T)))
  849. groupings = np.random.random((1100, ))
  850. groupings = Series(groupings, lrange(0, len(groupings))) * 10.
  851. grouped = result.groupby(groupings)
  852. grouped.mean()
    def test_transform(self):
        # transform broadcasts a per-group result back onto the original
        # (possibly shuffled) index
        data = Series(np.arange(9) // 3, index=np.arange(9))

        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        transformed = grouped.transform(lambda x: x * x.sum())
        self.assertEqual(transformed[7], 12)

        # GH 8046
        # make sure that we preserve the input order
        df = DataFrame(
            np.arange(6, dtype='int64').reshape(
                3, 2), columns=["a", "b"], index=[0, 2, 1])
        key = [0, 0, 1]
        expected = df.sort_index().groupby(key).transform(
            lambda x: x - x.mean()).groupby(key).mean()
        result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
            key).mean()
        assert_frame_equal(result, expected)

        def demean(arr):
            return arr - arr.mean()

        people = DataFrame(np.random.randn(5, 5),
                           columns=['a', 'b', 'c', 'd', 'e'],
                           index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
        key = ['one', 'two', 'one', 'two', 'one']
        # transform and apply of the same demeaning func must agree
        result = people.groupby(key).transform(demean).groupby(key).mean()
        expected = people.groupby(key).apply(demean).groupby(key).mean()
        assert_frame_equal(result, expected)

        # GH 8430
        # transform through a TimeGrouper works (smoke test)
        df = tm.makeTimeDataFrame()
        g = df.groupby(pd.TimeGrouper('M'))
        g.transform(lambda x: x - 1)

        # GH 9700
        df = DataFrame({'a': range(5, 10), 'b': range(5)})
        result = df.groupby('a').transform(max)
        expected = DataFrame({'b': range(5)})
        tm.assert_frame_equal(result, expected)
    def test_transform_fast(self):
        # the fast transform path must match the result built by hand
        df = DataFrame({'id': np.arange(100000) / 3,
                        'val': np.random.randn(100000)})

        grp = df.groupby('id')['val']

        # build the expected broadcast by repeating each group mean
        values = np.repeat(grp.mean().values,
                           _ensure_platform_int(grp.count().values))
        expected = pd.Series(values, index=df.index, name='val')

        result = grp.transform(np.mean)
        assert_series_equal(result, expected)

        result = grp.transform('mean')
        assert_series_equal(result, expected)

        # GH 12737: mixed dtypes (float / int / datetime) through 'first'
        df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
                           'd': pd.date_range('2014-1-1', '2014-1-4'),
                           'i': [1, 2, 3, 4]},
                          columns=['grouping', 'f', 'i', 'd'])
        result = df.groupby('grouping').transform('first')

        dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
                 pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
        expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
                                 'd': dates,
                                 'i': [1, 2, 2, 4]},
                                columns=['f', 'i', 'd'])
        assert_frame_equal(result, expected)

        # selection
        result = df.groupby('grouping')[['f', 'i']].transform('first')
        expected = expected[['f', 'i']]
        assert_frame_equal(result, expected)

        # dup columns
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
        result = df.groupby('g').transform('first')
        expected = df.drop('g', axis=1)
        assert_frame_equal(result, expected)
    def test_transform_broadcast(self):
        # transform of a scalar reduction broadcasts it across each group
        grouped = self.ts.groupby(lambda x: x.month)
        result = grouped.transform(np.mean)

        self.assert_index_equal(result.index, self.ts.index)
        for _, gp in grouped:
            # every row of a group carries that group's mean
            assert_fp_equal(result.reindex(gp.index), gp.mean())

        grouped = self.tsframe.groupby(lambda x: x.month)
        result = grouped.transform(np.mean)
        self.assert_index_equal(result.index, self.tsframe.index)
        for _, gp in grouped:
            agged = gp.mean()
            res = result.reindex(gp.index)
            for col in self.tsframe:
                assert_fp_equal(res[col], agged[col])

        # group columns
        grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                       axis=1)
        result = grouped.transform(np.mean)
        self.assert_index_equal(result.index, self.tsframe.index)
        self.assert_index_equal(result.columns, self.tsframe.columns)
        for _, gp in grouped:
            agged = gp.mean(1)
            res = result.reindex(columns=gp.columns)
            for idx in gp.index:
                assert_fp_equal(res.xs(idx), agged[idx])
    def test_transform_axis(self):
        # make sure that we are setting the axes
        # correctly when on axis=0 or 1
        # in the presence of a non-monotonic indexer
        # GH12713

        base = self.tsframe.iloc[0:5]
        r = len(base.index)
        c = len(base.columns)
        tso = DataFrame(np.random.randn(r, c),
                        index=base.index,
                        columns=base.columns,
                        dtype='float64')
        # monotonic
        ts = tso
        grouped = ts.groupby(lambda x: x.weekday())
        result = ts - grouped.transform('mean')
        expected = grouped.apply(lambda x: x - x.mean())
        assert_frame_equal(result, expected)

        # same demeaning along axis=1 on the transposed frame
        ts = ts.T
        grouped = ts.groupby(lambda x: x.weekday(), axis=1)
        result = ts - grouped.transform('mean')
        expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
        assert_frame_equal(result, expected)

        # non-monotonic (first two rows swapped)
        ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
        grouped = ts.groupby(lambda x: x.weekday())
        result = ts - grouped.transform('mean')
        expected = grouped.apply(lambda x: x - x.mean())
        assert_frame_equal(result, expected)

        ts = ts.T
        grouped = ts.groupby(lambda x: x.weekday(), axis=1)
        result = ts - grouped.transform('mean')
        expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
        assert_frame_equal(result, expected)
  982. def test_transform_dtype(self):
  983. # GH 9807
  984. # Check transform dtype output is preserved
  985. df = DataFrame([[1, 3], [2, 3]])
  986. result = df.groupby(1).transform('mean')
  987. expected = DataFrame([[1.5], [1.5]])
  988. assert_frame_equal(result, expected)
  989. def test_transform_bug(self):
  990. # GH 5712
  991. # transforming on a datetime column
  992. df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
  993. result = df.groupby('A')['B'].transform(
  994. lambda x: x.rank(ascending=False))
  995. expected = Series(np.arange(5, 0, step=-1), name='B')
  996. assert_series_equal(result, expected)
  997. def test_transform_multiple(self):
  998. grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month])
  999. grouped.transform(lambda x: x * 2)
  1000. grouped.transform(np.mean)
  1001. def test_dispatch_transform(self):
  1002. df = self.tsframe[::5].reindex(self.tsframe.index)
  1003. grouped = df.groupby(lambda x: x.month)
  1004. filled = grouped.fillna(method='pad')
  1005. fillit = lambda x: x.fillna(method='pad')
  1006. expected = df.groupby(lambda x: x.month).transform(fillit)
  1007. assert_frame_equal(filled, expected)
  1008. def test_transform_select_columns(self):
  1009. f = lambda x: x.mean()
  1010. result = self.df.groupby('A')['C', 'D'].transform(f)
  1011. selection = self.df[['C', 'D']]
  1012. expected = selection.groupby(self.df['A']).transform(f)
  1013. assert_frame_equal(result, expected)
  1014. def test_transform_exclude_nuisance(self):
  1015. # this also tests orderings in transform between
  1016. # series/frame to make sure it's consistent
  1017. expected = {}
  1018. grouped = self.df.groupby('A')
  1019. expected['C'] = grouped['C'].transform(np.mean)
  1020. expected['D'] = grouped['D'].transform(np.mean)
  1021. expected = DataFrame(expected)
  1022. result = self.df.groupby('A').transform(np.mean)
  1023. assert_frame_equal(result, expected)
  1024. def test_transform_function_aliases(self):
  1025. result = self.df.groupby('A').transform('mean')
  1026. expected = self.df.groupby('A').transform(np.mean)
  1027. assert_frame_equal(result, expected)
  1028. result = self.df.groupby('A')['C'].transform('mean')
  1029. expected = self.df.groupby('A')['C'].transform(np.mean)
  1030. assert_series_equal(result, expected)
  1031. def test_series_fast_transform_date(self):
  1032. # GH 13191
  1033. df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
  1034. 'd': pd.date_range('2014-1-1', '2014-1-4')})
  1035. result = df.groupby('grouping')['d'].transform('first')
  1036. dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
  1037. pd.Timestamp('2014-1-4')]
  1038. expected = pd.Series(dates, name='d')
  1039. assert_series_equal(result, expected)
  1040. def test_transform_length(self):
  1041. # GH 9697
  1042. df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
  1043. expected = pd.Series([3.0] * 4)
  1044. def nsum(x):
  1045. return np.nansum(x)
  1046. results = [df.groupby('col1').transform(sum)['col2'],
  1047. df.groupby('col1')['col2'].transform(sum),
  1048. df.groupby('col1').transform(nsum)['col2'],
  1049. df.groupby('col1')['col2'].transform(nsum)]
  1050. for result in results:
  1051. assert_series_equal(result, expected, check_names=False)
    def test_with_na(self):
        # rows whose group label is NaN are excluded from aggregation
        index = Index(np.arange(10))

        for dtype in ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']:
            values = Series(np.ones(10), index, dtype=dtype)
            labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
                             'bar', nan, 'foo'], index=index)

            # this SHOULD be an int
            grouped = values.groupby(labels)
            agged = grouped.agg(len)
            expected = Series([4, 2], index=['bar', 'foo'])

            assert_series_equal(agged, expected, check_dtype=False)

            # self.assertTrue(issubclass(agged.dtype.type, np.integer))

            # explicity return a float from my function
            def f(x):
                return float(len(x))

            agged = grouped.agg(f)
            expected = Series([4, 2], index=['bar', 'foo'])

            assert_series_equal(agged, expected, check_dtype=False)
            self.assertTrue(issubclass(agged.dtype.type, np.dtype(dtype).type))
    def test_groupby_transform_with_int(self):
        # GH 3740, make sure that we might upcast on item-by-item transform

        # floats
        df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
                            C=Series(
                                [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
        result = df.groupby('A').transform(lambda x: (x - x.mean()) / x.std())
        expected = DataFrame(dict(B=np.nan, C=Series(
            [-1, 0, 1, -1, 0, 1], dtype='float64')))
        assert_frame_equal(result, expected)

        # int case
        df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
                            C=[1, 2, 3, 1, 2, 3], D='foo'))
        result = df.groupby('A').transform(lambda x: (x - x.mean()) / x.std())
        expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
        assert_frame_equal(result, expected)

        # int that needs float conversion
        s = Series([2, 3, 4, 10, 5, -1])
        df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
        result = df.groupby('A').transform(lambda x: (x - x.mean()) / x.std())

        # build expected by standardizing each group's slice by hand
        s1 = s.iloc[0:3]
        s1 = (s1 - s1.mean()) / s1.std()
        s2 = s.iloc[3:6]
        s2 = (s2 - s2.mean()) / s2.std()
        expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
        assert_frame_equal(result, expected)

        # int downcasting
        result = df.groupby('A').transform(lambda x: x * 2 / 2)
        expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
        assert_frame_equal(result, expected)
    def test_indices_concatenation_order(self):
        # GH 2808
        # apply results with mismatched index structure must raise when
        # concatenated; consistent structure must round-trip cleanly

        def f1(x):
            y = x[(x.b % 2) == 1] ** 2
            if y.empty:
                # empty result with the SAME MultiIndex structure as the
                # non-empty branch -> concatenation is consistent
                multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
                                        names=['b', 'c'])
                res = DataFrame(None, columns=['a'], index=multiindex)
                return res
            else:
                y = y.set_index(['b', 'c'])
                return y

        def f2(x):
            y = x[(x.b % 2) == 1] ** 2
            if y.empty:
                # empty result with a flat index -> level count mismatch
                return DataFrame()
            else:
                y = y.set_index(['b', 'c'])
                return y

        def f3(x):
            y = x[(x.b % 2) == 1] ** 2
            if y.empty:
                # empty result with different names and column shape
                multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
                                        names=['foo', 'bar'])
                res = DataFrame(None, columns=['a', 'b'], index=multiindex)
                return res
            else:
                return y

        df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

        df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

        # correct result
        result1 = df.groupby('a').apply(f1)
        result2 = df2.groupby('a').apply(f1)
        assert_frame_equal(result1, result2)

        # should fail (not the same number of levels)
        self.assertRaises(AssertionError, df.groupby('a').apply, f2)
        self.assertRaises(AssertionError, df2.groupby('a').apply, f2)

        # should fail (incorrect shape)
        self.assertRaises(AssertionError, df.groupby('a').apply, f3)
        self.assertRaises(AssertionError, df2.groupby('a').apply, f3)
    def test_attr_wrapper(self):
        # unknown attributes on a groupby dispatch to the grouped object
        grouped = self.ts.groupby(lambda x: x.weekday())

        result = grouped.std()
        expected = grouped.agg(lambda x: np.std(x, ddof=1))
        assert_series_equal(result, expected)

        # this is pretty cool
        result = grouped.describe()
        expected = {}
        for name, gp in grouped:
            expected[name] = gp.describe()
        expected = DataFrame(expected).T
        assert_frame_equal(result.unstack(), expected)

        # get attribute
        result = grouped.dtype
        expected = grouped.agg(lambda x: x.dtype)
        # NOTE(review): result/expected above are never compared — the
        # dtype attribute dispatch is only smoke-tested here.

        # make sure raises error
        self.assertRaises(AttributeError, getattr, grouped, 'foo')
  1158. def test_series_describe_multikey(self):
  1159. ts = tm.makeTimeSeries()
  1160. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  1161. result = grouped.describe().unstack()
  1162. assert_series_equal(result['mean'], grouped.mean(), check_names=False)
  1163. assert_series_equal(result['std'], grouped.std(), check_names=False)
  1164. assert_series_equal(result['min'], grouped.min(), check_names=False)
  1165. def test_series_describe_single(self):
  1166. ts = tm.makeTimeSeries()
  1167. grouped = ts.groupby(lambda x: x.month)
  1168. result = grouped.apply(lambda x: x.describe())
  1169. expected = grouped.describe()
  1170. assert_series_equal(result, expected)
  1171. def test_series_agg_multikey(self):
  1172. ts = tm.makeTimeSeries()
  1173. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  1174. result = grouped.agg(np.sum)
  1175. expected = grouped.sum()
  1176. assert_series_equal(result, expected)
    def test_series_agg_multi_pure_python(self):
        # pure-python multi-key aggregation path
        data = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny'],
             'D': np.random.randn(11),
             'E': np.random.randn(11),
             'F': np.random.randn(11)})

        def bad(x):
            # each group chunk is expected to be a view (non-empty .base)
            assert (len(x.base) > 0)
            return 'foo'

        result = data.groupby(['A', 'B']).agg(bad)
        expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
        assert_frame_equal(result, expected)
  1194. def test_series_index_name(self):
  1195. grouped = self.df.ix[:, ['C']].groupby(self.df['A'])
  1196. result = grouped.agg(lambda x: x.mean())
  1197. self.assertEqual(result.index.name, 'A')
    def test_frame_describe_multikey(self):
        # frame describe on multi-key (axis=0) and mapping (axis=1) groupbys
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
        result = grouped.describe()

        for col in self.tsframe:
            # frame-level describe matches the per-column describe
            expected = grouped[col].describe()
            assert_series_equal(result[col], expected, check_names=False)

        groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
                                         'C': 1, 'D': 1}, axis=1)
        result = groupedT.describe()

        for name, group in groupedT:
            assert_frame_equal(result[name], group.describe())
    def test_frame_groupby(self):
        # end-to-end frame groupby: aggregate, transform, iterate, groups
        grouped = self.tsframe.groupby(lambda x: x.weekday())

        # aggregate
        aggregated = grouped.aggregate(np.mean)
        self.assertEqual(len(aggregated), 5)
        self.assertEqual(len(aggregated.columns), 4)

        # by string
        tscopy = self.tsframe.copy()
        tscopy['weekday'] = [x.weekday() for x in tscopy.index]
        stragged = tscopy.groupby('weekday').aggregate(np.mean)
        assert_frame_equal(stragged, aggregated, check_names=False)

        # transform
        grouped = self.tsframe.head(30).groupby(lambda x: x.weekday())
        transformed = grouped.transform(lambda x: x - x.mean())
        self.assertEqual(len(transformed), 30)
        self.assertEqual(len(transformed.columns), 4)

        # transform propagate
        transformed = grouped.transform(lambda x: x.mean())
        for name, group in grouped:
            mean = group.mean()
            for idx in group.index:
                # each row carries its group's mean
                tm.assert_series_equal(transformed.xs(idx), mean,
                                       check_names=False)

        # iterate
        for weekday, group in grouped:
            self.assertEqual(group.index[0].weekday(), weekday)

        # groups / group_indices
        groups = grouped.groups
        indices = grouped.indices

        for k, v in compat.iteritems(groups):
            # label-based .groups and positional .indices agree
            samething = self.tsframe.index.take(indices[k])
            self.assertTrue((samething == v).all())
  1241. def test_grouping_is_iterable(self):
  1242. # this code path isn't used anywhere else
  1243. # not sure it's useful
  1244. grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda x: x.year
  1245. ])
  1246. # test it works
  1247. for g in grouped.grouper.groupings[0]:
  1248. pass
    def test_frame_groupby_columns(self):
        # grouping along axis=1 with a column -> group-id mapping
        mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
        grouped = self.tsframe.groupby(mapping, axis=1)

        # aggregate
        aggregated = grouped.aggregate(np.mean)
        self.assertEqual(len(aggregated), len(self.tsframe))
        self.assertEqual(len(aggregated.columns), 2)

        # transform: axis=1 on the frame equals axis=0 on its transpose
        tf = lambda x: x - x.mean()
        groupedT = self.tsframe.T.groupby(mapping, axis=0)
        assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))

        # iterate
        for k, v in grouped:
            self.assertEqual(len(v.columns), 2)
    def test_frame_set_name_single(self):
        # the key name 'A' propagates to the result index for every
        # aggregation flavor, except when as_index=False
        grouped = self.df.groupby('A')

        result = grouped.mean()
        self.assertEqual(result.index.name, 'A')

        result = self.df.groupby('A', as_index=False).mean()
        self.assertNotEqual(result.index.name, 'A')

        result = grouped.agg(np.mean)
        self.assertEqual(result.index.name, 'A')

        result = grouped.agg({'C': np.mean, 'D': np.std})
        self.assertEqual(result.index.name, 'A')

        result = grouped['C'].mean()
        self.assertEqual(result.index.name, 'A')
        result = grouped['C'].agg(np.mean)
        self.assertEqual(result.index.name, 'A')
        result = grouped['C'].agg([np.mean, np.std])
        self.assertEqual(result.index.name, 'A')

        result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
        self.assertEqual(result.index.name, 'A')
  1281. def test_aggregate_api_consistency(self):
  1282. # GH 9052
  1283. # make sure that the aggregates via dict
  1284. # are consistent
  1285. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  1286. 'foo', 'bar', 'foo', 'foo'],
  1287. 'B': ['one', 'one', 'two', 'two',
  1288. 'two', 'two', 'one', 'two'],
  1289. 'C': np.random.randn(8) + 1.0,
  1290. 'D': np.arange(8)})
  1291. grouped = df.groupby(['A', 'B'])
  1292. c_mean = grouped['C'].mean()
  1293. c_sum = grouped['C'].sum()
  1294. d_mean = grouped['D'].mean()
  1295. d_sum = grouped['D'].sum()
  1296. result = grouped['D'].agg(['sum', 'mean'])
  1297. expected = pd.concat([d_sum, d_mean],
  1298. axis=1)
  1299. expected.columns = ['sum', 'mean']
  1300. assert_frame_equal(result, expected, check_like=True)
  1301. result = grouped.agg([np.sum, np.mean])
  1302. expected = pd.concat([c_sum,
  1303. c_mean,
  1304. d_sum,
  1305. d_mean],
  1306. axis=1)
  1307. expected.columns = MultiIndex.from_product([['C', 'D'],
  1308. ['sum', 'mean']])
  1309. assert_frame_equal(result, expected, check_like=True)
  1310. result = grouped[['D', 'C']].agg([np.sum, np.mean])
  1311. expected = pd.concat([d_sum,
  1312. d_mean,
  1313. c_sum,
  1314. c_mean],
  1315. axis=1)
  1316. expected.columns = MultiIndex.from_product([['D', 'C'],
  1317. ['sum', 'mean']])
  1318. assert_frame_equal(result, expected, check_like=True)
  1319. result = grouped.agg({'C': 'mean', 'D': 'sum'})
  1320. expected = pd.concat([d_sum,
  1321. c_mean],
  1322. axis=1)
  1323. assert_frame_equal(result, expected, check_like=True)
  1324. result = grouped.agg({'C': ['mean', 'sum'],
  1325. 'D': ['mean', 'sum']})
  1326. expected = pd.concat([c_mean,
  1327. c_sum,
  1328. d_mean,
  1329. d_sum],
  1330. axis=1)
  1331. expected.columns = MultiIndex.from_product([['C', 'D'],
  1332. ['mean', 'sum']])
  1333. result = grouped[['D', 'C']].agg({'r': np.sum,
  1334. 'r2': np.mean})
  1335. expected = pd.concat([d_sum,
  1336. c_sum,
  1337. d_mean,
  1338. c_mean],
  1339. axis=1)
  1340. expected.columns = MultiIndex.from_product([['r', 'r2'],
  1341. ['D', 'C']])
  1342. assert_frame_equal(result, expected, check_like=True)
    def test_agg_compat(self):
        # GH 12334
        # renaming via dict on a SeriesGroupBy

        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        g = df.groupby(['A', 'B'])

        # dict value is a list -> MultiIndex columns under the new name
        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = MultiIndex.from_tuples([('C', 'sum'),
                                                   ('C', 'std')])
        result = g['D'].agg({'C': ['sum', 'std']})
        assert_frame_equal(result, expected, check_like=True)

        # dict values are scalars -> flat renamed columns
        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = ['C', 'D']

        result = g['D'].agg({'C': 'sum', 'D': 'std'})
        assert_frame_equal(result, expected, check_like=True)
    def test_agg_nested_dicts(self):
        # API change for disallowing these types of nested dicts
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        g = df.groupby(['A', 'B'])

        def f():
            # rename-dict of column-dicts is no longer allowed
            g.aggregate({'r1': {'C': ['mean', 'sum']},
                         'r2': {'D': ['mean', 'sum']}})

        self.assertRaises(SpecificationError, f)

        # column -> {new_name: funcs} is still supported
        result = g.agg({'C': {'ra': ['mean', 'std']},
                        'D': {'rb': ['mean', 'std']}})
        expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(),
                              g['D'].std()], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
            'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
        assert_frame_equal(result, expected, check_like=True)

        # same name as the original column
        # GH9052
        expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
        expected = expected.rename(columns={'result1': 'D'})
        result = g['D'].agg({'D': np.sum, 'result2': np.mean})
        assert_frame_equal(result, expected, check_like=True)
  1391. def test_multi_iter(self):
  1392. s = Series(np.arange(6))
  1393. k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
  1394. k2 = np.array(['1', '2', '1', '2', '1', '2'])
  1395. grouped = s.groupby([k1, k2])
  1396. iterated = list(grouped)
  1397. expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]),
  1398. ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])]
  1399. for i, ((one, two), three) in enumerate(iterated):
  1400. e1, e2, e3 = expected[i]
  1401. self.assertEqual(e1, one)
  1402. self.assertEqual(e2, two)
  1403. assert_series_equal(three, e3)
    def test_multi_iter_frame(self):
        # iterating a two-key frame groupby
        k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        k2 = np.array(['1', '2', '1', '2', '1', '2'])
        df = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': k1, 'k2': k2},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])

        grouped = df.groupby(['k1', 'k2'])

        # things get sorted!
        iterated = list(grouped)
        idx = df.index
        expected = [('a', '1', df.ix[idx[[4]]]),
                    ('a', '2', df.ix[idx[[3, 5]]]),
                    ('b', '1', df.ix[idx[[0, 2]]]),
                    ('b', '2', df.ix[idx[[1]]])]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            self.assertEqual(e1, one)
            self.assertEqual(e2, two)
            assert_frame_equal(three, e3)

        # don't iterate through groups with no data
        df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
        grouped = df.groupby(['k1', 'k2'])
        groups = {}
        for key, gp in grouped:
            groups[key] = gp
        # only (b, 1) and (a, 2) are populated
        self.assertEqual(len(groups), 2)

        # axis = 1 iteration is a smoke test only
        three_levels = self.three_group.groupby(['A', 'B', 'C']).mean()
        grouped = three_levels.T.groupby(axis=1, level=(1, 2))
        for key, group in grouped:
            pass
    def test_multi_iter_panel(self):
        # iterate a Panel grouped along the major axis by two key funcs
        wp = tm.makePanel()
        grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()],
                             axis=1)

        for (month, wd), group in grouped:
            # each group holds exactly the major-axis labels matching
            # this (month, weekday) pair
            exp_axis = [x
                        for x in wp.major_axis
                        if x.month == month and x.weekday() == wd]
            expected = wp.reindex(major=exp_axis)
            assert_panel_equal(group, expected)
    def test_multi_func(self):
        # grouping by a list of callables behaves like column grouping
        col1 = self.df['A']
        col2 = self.df['B']

        grouped = self.df.groupby([col1.get, col2.get])
        agged = grouped.mean()
        expected = self.df.groupby(['A', 'B']).mean()
        assert_frame_equal(agged.ix[:, ['C', 'D']], expected.ix[:, ['C', 'D']],
                           check_names=False)  # TODO groupby get drops names

        # some "groups" with no data
        df = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                        'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])
        # only verify that it works for now
        grouped = df.groupby(['k1', 'k2'])
        grouped.agg(np.sum)
  1464. def test_multi_key_multiple_functions(self):
  1465. grouped = self.df.groupby(['A', 'B'])['C']
  1466. agged = grouped.agg([np.mean, np.std])
  1467. expected = DataFrame({'mean': grouped.agg(np.mean),
  1468. 'std': grouped.agg(np.std)})
  1469. assert_frame_equal(agged, expected)
    def test_frame_multi_key_function_list(self):
        """``agg`` with a list of functions on a multi-key frame groupby
        builds a column MultiIndex equal to gluing per-column aggregations."""
        data = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny'],
             'D': np.random.randn(11),
             'E': np.random.randn(11),
             'F': np.random.randn(11)})

        grouped = data.groupby(['A', 'B'])
        funcs = [np.mean, np.std]
        agged = grouped.agg(funcs)
        # expected: aggregate each numeric column separately, then concat
        # side by side keyed by column name
        expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
                           grouped['F'].agg(funcs)],
                          keys=['D', 'E', 'F'], axis=1)
        assert (isinstance(agged.index, MultiIndex))
        assert (isinstance(expected.index, MultiIndex))
        assert_frame_equal(agged, expected)
    def test_groupby_multiple_columns(self):
        """Multi-key aggregation matches nested single-key aggregation,
        verified by unstacking the result against a hand-built Panel."""
        data = self.df
        grouped = data.groupby(['A', 'B'])

        def _check_op(op):
            result1 = op(grouped)

            # build the expectation by grouping on 'A' first, then on 'B'
            # within each 'A' group
            expected = defaultdict(dict)
            for n1, gp1 in data.groupby('A'):
                for n2, gp2 in gp1.groupby('B'):
                    expected[n1][n2] = op(gp2.ix[:, ['C', 'D']])
            expected = dict((k, DataFrame(v))
                            for k, v in compat.iteritems(expected))
            expected = Panel.fromDict(expected).swapaxes(0, 1)
            expected.major_axis.name, expected.minor_axis.name = 'A', 'B'

            # a little bit crude
            for col in ['C', 'D']:
                result_col = op(grouped[col])
                exp = expected[col]
                pivoted = result1[col].unstack()
                pivoted2 = result_col.unstack()
                assert_frame_equal(pivoted.reindex_like(exp), exp)
                assert_frame_equal(pivoted2.reindex_like(exp), exp)

        _check_op(lambda x: x.sum())
        _check_op(lambda x: x.mean())

        # test single series works the same
        result = data['C'].groupby([data['A'], data['B']]).mean()
        expected = data.groupby(['A', 'B']).mean()['C']
        assert_series_equal(result, expected)
    def test_groupby_as_index_agg(self):
        """``as_index=False`` aggregation matches the plain reductions, for
        single and multiple keys, plus dict-renaming on a selected column."""
        grouped = self.df.groupby('A', as_index=False)

        # single-key
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

        result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
        expected2 = grouped.mean()
        expected2['D'] = grouped.sum()['D']
        assert_frame_equal(result2, expected2)

        # dict agg renames the selected column
        grouped = self.df.groupby('A', as_index=True)
        expected3 = grouped['C'].sum()
        expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
        result3 = grouped['C'].agg({'Q': np.sum})
        assert_frame_equal(result3, expected3)

        # multi-key
        grouped = self.df.groupby(['A', 'B'], as_index=False)
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

        result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
        expected2 = grouped.mean()
        expected2['D'] = grouped.sum()['D']
        assert_frame_equal(result2, expected2)

        expected3 = grouped['C'].sum()
        expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
        result3 = grouped['C'].agg({'Q': np.sum})
        assert_frame_equal(result3, expected3)

        # GH7115 & GH8112 & GH8582
        df = DataFrame(np.random.randint(0, 100, (50, 3)),
                       columns=['jim', 'joe', 'jolie'])
        ts = Series(np.random.randint(5, 10, 50), name='jim')

        gr = df.groupby(ts)
        gr.nth(0)  # invokes set_selection_from_grouper internally
        assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

        # as_index=False result equals as_index=True reset to a range index
        for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
            gr = df.groupby(ts, as_index=False)
            left = getattr(gr, attr)()

            gr = df.groupby(ts.values, as_index=True)
            right = getattr(gr, attr)().reset_index(drop=True)

            assert_frame_equal(left, right)
    def test_series_groupby_nunique(self):
        """Grouped ``nunique`` agrees with ``apply(Series.nunique)`` across
        sort/dropna combinations, sizes, cardinalities, and with NaNs."""
        from itertools import product
        from string import ascii_lowercase

        def check_nunique(df, keys):
            for sort, dropna in product((False, True), repeat=2):
                gr = df.groupby(keys, sort=sort)
                left = gr['julie'].nunique(dropna=dropna)

                gr = df.groupby(keys, sort=sort)
                right = gr['julie'].apply(Series.nunique, dropna=dropna)

                assert_series_equal(left, right)

        days = date_range('2015-08-23', periods=10)

        for n, m in product(10 ** np.arange(2, 6), (10, 100, 1000)):
            frame = DataFrame({
                'jim': np.random.choice(
                    list(ascii_lowercase), n),
                'joe': np.random.choice(days, n),
                'julie': np.random.randint(0, m, n)
            })

            check_nunique(frame, ['jim'])
            check_nunique(frame, ['jim', 'joe'])

            # inject missing values into keys and values, then re-check
            frame.loc[1::17, 'jim'] = None
            frame.loc[3::37, 'joe'] = None
            frame.loc[7::19, 'julie'] = None
            frame.loc[8::19, 'julie'] = None
            frame.loc[9::19, 'julie'] = None

            check_nunique(frame, ['jim'])
            check_nunique(frame, ['jim', 'joe'])
    def test_series_groupby_value_counts(self):
        """Grouped ``value_counts`` agrees with ``apply(Series.value_counts)``
        across all flag combinations, key choices, sizes and bins."""
        from itertools import product

        def rebuild_index(df):
            # rebuild as a plain MultiIndex so comparisons are stable
            arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
            df.index = MultiIndex.from_arrays(arr, names=df.index.names)
            return df

        def check_value_counts(df, keys, bins):
            for isort, normalize, sort, ascending, dropna \
                    in product((False, True), repeat=5):

                kwargs = dict(normalize=normalize, sort=sort,
                              ascending=ascending, dropna=dropna, bins=bins)

                gr = df.groupby(keys, sort=isort)
                left = gr['3rd'].value_counts(**kwargs)

                gr = df.groupby(keys, sort=isort)
                right = gr['3rd'].apply(Series.value_counts, **kwargs)
                right.index.names = right.index.names[:-1] + ['3rd']

                # have to sort on index because of unstable sort on values
                left, right = map(rebuild_index, (left, right))  # xref GH9212
                assert_series_equal(left.sort_index(), right.sort_index())

        def loop(df):
            bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2)
            keys = '1st', '2nd', ('1st', '2nd')
            for k, b in product(keys, bins):
                check_value_counts(df, k, b)

        days = date_range('2015-08-24', periods=10)

        for n, m in product((100, 1000), (5, 20)):
            frame = DataFrame({
                '1st': np.random.choice(
                    list('abcd'), n),
                '2nd': np.random.choice(days, n),
                '3rd': np.random.randint(1, m + 1, n)
            })

            loop(frame)

            # re-check with missing values injected
            frame.loc[1::11, '1st'] = nan
            frame.loc[3::17, '2nd'] = nan
            frame.loc[7::19, '3rd'] = nan
            frame.loc[8::19, '3rd'] = nan
            frame.loc[9::19, '3rd'] = nan

            loop(frame)
  1624. def test_mulitindex_passthru(self):
  1625. # GH 7997
  1626. # regression from 0.14.1
  1627. df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
  1628. df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])
  1629. result = df.groupby(axis=1, level=[0, 1]).first()
  1630. assert_frame_equal(result, df)
  1631. def test_multifunc_select_col_integer_cols(self):
  1632. df = self.df
  1633. df.columns = np.arange(len(df.columns))
  1634. # it works!
  1635. df.groupby(1, as_index=False)[2].agg({'Q': np.mean})
    def test_as_index_series_return_frame(self):
        """Aggregating a selected column with ``as_index=False`` returns a
        DataFrame that still includes the grouping column(s)."""
        grouped = self.df.groupby('A', as_index=False)
        grouped2 = self.df.groupby(['A', 'B'], as_index=False)

        result = grouped['C'].agg(np.sum)
        expected = grouped.agg(np.sum).ix[:, ['A', 'C']]
        tm.assertIsInstance(result, DataFrame)
        assert_frame_equal(result, expected)

        result2 = grouped2['C'].agg(np.sum)
        expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']]
        tm.assertIsInstance(result2, DataFrame)
        assert_frame_equal(result2, expected2)

        # the named reductions behave like agg
        result = grouped['C'].sum()
        expected = grouped.sum().ix[:, ['A', 'C']]
        tm.assertIsInstance(result, DataFrame)
        assert_frame_equal(result, expected)

        result2 = grouped2['C'].sum()
        expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']]
        tm.assertIsInstance(result2, DataFrame)
        assert_frame_equal(result2, expected2)

        # corner case: cannot sub-select from an already-selected column
        self.assertRaises(Exception, grouped['C'].__getitem__, 'D')
    def test_groupby_as_index_cython(self):
        """Cython-path ``mean`` with ``as_index=False`` inserts the key
        column(s) up front and uses a default integer index."""
        data = self.df

        # single-key
        grouped = data.groupby('A', as_index=False)
        result = grouped.mean()
        expected = data.groupby(['A']).mean()
        expected.insert(0, 'A', expected.index)
        expected.index = np.arange(len(expected))
        assert_frame_equal(result, expected)

        # multi-key
        grouped = data.groupby(['A', 'B'], as_index=False)
        result = grouped.mean()
        expected = data.groupby(['A', 'B']).mean()
        # unzip the MultiIndex tuples back into per-key columns
        arrays = lzip(*expected.index._tuple_index)
        expected.insert(0, 'A', arrays[0])
        expected.insert(1, 'B', arrays[1])
        expected.index = np.arange(len(expected))
        assert_frame_equal(result, expected)
  1675. def test_groupby_as_index_series_scalar(self):
  1676. grouped = self.df.groupby(['A', 'B'], as_index=False)
  1677. # GH #421
  1678. result = grouped['C'].agg(len)
  1679. expected = grouped.agg(len).ix[:, ['A', 'B', 'C']]
  1680. assert_frame_equal(result, expected)
  1681. def test_groupby_as_index_corner(self):
  1682. self.assertRaises(TypeError, self.ts.groupby, lambda x: x.weekday(),
  1683. as_index=False)
  1684. self.assertRaises(ValueError, self.df.groupby, lambda x: x.lower(),
  1685. as_index=False, axis=1)
    def test_groupby_as_index_apply(self):
        """head/apply index behavior with as_index True vs False.

        GH #4648 and #3417.
        """
        df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                        'user_id': [1, 2, 1, 1, 3, 1],
                        'time': range(6)})

        g_as = df.groupby('user_id', as_index=True)
        g_not_as = df.groupby('user_id', as_index=False)

        # head keeps the original (flat) index either way
        res_as = g_as.head(2).index
        res_not_as = g_not_as.head(2).index
        exp = Index([0, 1, 2, 4])
        assert_index_equal(res_as, exp)
        assert_index_equal(res_not_as, exp)

        res_as_apply = g_as.apply(lambda x: x.head(2)).index
        res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

        # apply doesn't maintain the original ordering
        # changed in GH5610 as the as_index=False returns a MI here
        exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (
            2, 4)])
        tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
        exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])

        assert_index_equal(res_as_apply, exp_as_apply)
        assert_index_equal(res_not_as_apply, exp_not_as_apply)

        # identity apply with as_index=False keeps the flat index
        ind = Index(list('abcde'))
        df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
        res = df.groupby(0, as_index=False).apply(lambda x: x).index
        assert_index_equal(res, ind)
    def test_groupby_head_tail(self):
        """head/tail on a groupby: positive, zero, negative and oversized n,
        with as_index True/False and with column selection."""
        df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
        g_as = df.groupby('A', as_index=True)
        g_not_as = df.groupby('A', as_index=False)

        # as_index= False, much easier
        assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
        assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))

        # head(0)/tail(0)/negative n give an empty frame with original dtypes
        empty_not_as = DataFrame(columns=df.columns,
                                 index=pd.Index([], dtype=df.index.dtype))
        empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype)
        empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype)
        assert_frame_equal(empty_not_as, g_not_as.head(0))
        assert_frame_equal(empty_not_as, g_not_as.tail(0))
        assert_frame_equal(empty_not_as, g_not_as.head(-1))
        assert_frame_equal(empty_not_as, g_not_as.tail(-1))

        assert_frame_equal(df, g_not_as.head(7))  # contains all
        assert_frame_equal(df, g_not_as.tail(7))

        # as_index=True, (used to be different)
        df_as = df

        assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
        assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))

        empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
        empty_as['A'] = empty_not_as['A'].astype(df.A.dtype)
        empty_as['B'] = empty_not_as['B'].astype(df.B.dtype)
        assert_frame_equal(empty_as, g_as.head(0))
        assert_frame_equal(empty_as, g_as.tail(0))
        assert_frame_equal(empty_as, g_as.head(-1))
        assert_frame_equal(empty_as, g_as.tail(-1))

        assert_frame_equal(df_as, g_as.head(7))  # contains all
        assert_frame_equal(df_as, g_as.tail(7))

        # test with selection
        assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []])
        assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
        assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
        assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]])

        assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []])
        assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
        assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
        assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
    def test_groupby_multiple_key(self):
        """Grouping a daily frame by year/month/day functions is the identity
        for sum (each group holds exactly one row), on both axes."""
        df = tm.makeTimeDataFrame()
        grouped = df.groupby([lambda x: x.year, lambda x: x.month,
                              lambda x: x.day])
        agged = grouped.sum()
        assert_almost_equal(df.values, agged.values)

        # same thing on the column axis of the transpose
        grouped = df.T.groupby([lambda x: x.year,
                                lambda x: x.month,
                                lambda x: x.day], axis=1)

        agged = grouped.agg(lambda x: x.sum())
        self.assert_index_equal(agged.index, df.columns)
        assert_almost_equal(df.T.values, agged.values)

        # NOTE(review): this repeats the aggregation above verbatim --
        # presumably exercising a second pass over the same groupby object;
        # confirm whether the duplication is intentional.
        agged = grouped.agg(lambda x: x.sum())
        assert_almost_equal(df.T.values, agged.values)
  1765. def test_groupby_multi_corner(self):
  1766. # test that having an all-NA column doesn't mess you up
  1767. df = self.df.copy()
  1768. df['bad'] = np.nan
  1769. agged = df.groupby(['A', 'B']).mean()
  1770. expected = self.df.groupby(['A', 'B']).mean()
  1771. expected['bad'] = np.nan
  1772. assert_frame_equal(agged, expected)
  1773. def test_omit_nuisance(self):
  1774. grouped = self.df.groupby('A')
  1775. result = grouped.mean()
  1776. expected = self.df.ix[:, ['A', 'C', 'D']].groupby('A').mean()
  1777. assert_frame_equal(result, expected)
  1778. agged = grouped.agg(np.mean)
  1779. exp = grouped.mean()
  1780. assert_frame_equal(agged, exp)
  1781. df = self.df.ix[:, ['A', 'C', 'D']]
  1782. df['E'] = datetime.now()
  1783. grouped = df.groupby('A')
  1784. result = grouped.agg(np.sum)
  1785. expected = grouped.sum()
  1786. assert_frame_equal(result, expected)
  1787. # won't work with axis = 1
  1788. grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
  1789. result = self.assertRaises(TypeError, grouped.agg,
  1790. lambda x: x.sum(0, numeric_only=False))
  1791. def test_omit_nuisance_python_multiple(self):
  1792. grouped = self.three_group.groupby(['A', 'B'])
  1793. agged = grouped.agg(np.mean)
  1794. exp = grouped.mean()
  1795. assert_frame_equal(agged, exp)
    def test_empty_groups_corner(self):
        """Empty groups are handled by agg/apply without error."""
        # handle empty groups
        df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                        'k2': np.array(['1', '1', '1', '2', '2', '2']),
                        'k3': ['foo', 'bar'] * 3,
                        'v1': np.random.randn(6),
                        'v2': np.random.randn(6)})

        grouped = df.groupby(['k1', 'k2'])
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

        # a two-row level-grouped slice: frame apply matches column apply
        grouped = self.mframe[3:5].groupby(level=0)
        agged = grouped.apply(lambda x: x.mean())
        agged_A = grouped['A'].apply(np.mean)
        assert_series_equal(agged['A'], agged_A)
        self.assertEqual(agged.index.name, 'first')
    def test_apply_concat_preserve_names(self):
        """``apply`` preserves per-group index names when they agree, and
        drops the name to None when groups disagree."""
        grouped = self.three_group.groupby(['A', 'B'])

        def desc(group):
            result = group.describe()
            result.index.name = 'stat'
            return result

        def desc2(group):
            result = group.describe()
            result.index.name = 'stat'
            result = result[:len(group)]
            # weirdo
            return result

        def desc3(group):
            result = group.describe()

            # names are different
            result.index.name = 'stat_%d' % len(group)
            result = result[:len(group)]
            # weirdo
            return result

        result = grouped.apply(desc)
        self.assertEqual(result.index.names, ('A', 'B', 'stat'))

        result2 = grouped.apply(desc2)
        self.assertEqual(result2.index.names, ('A', 'B', 'stat'))

        result3 = grouped.apply(desc3)
        # per-group names differ, so the concatenated level name is None
        self.assertEqual(result3.index.names, ('A', 'B', None))
  1837. def test_nonsense_func(self):
  1838. df = DataFrame([0])
  1839. self.assertRaises(Exception, df.groupby, lambda x: x + 'foo')
  1840. def test_builtins_apply(self): # GH8155
  1841. df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
  1842. columns=['jim', 'joe'])
  1843. df['jolie'] = np.random.randn(1000)
  1844. for keys in ['jim', ['jim', 'joe']]: # single key & multi-key
  1845. if keys == 'jim':
  1846. continue
  1847. for f in [max, min, sum]:
  1848. fname = f.__name__
  1849. result = df.groupby(keys).apply(f)
  1850. result.shape
  1851. ngroups = len(df.drop_duplicates(subset=keys))
  1852. assert result.shape == (ngroups, 3), 'invalid frame shape: '\
  1853. '{} (expected ({}, 3))'.format(result.shape, ngroups)
  1854. assert_frame_equal(result, # numpy's equivalent function
  1855. df.groupby(keys).apply(getattr(np, fname)))
  1856. if f != sum:
  1857. expected = df.groupby(keys).agg(fname).reset_index()
  1858. expected.set_index(keys, inplace=True, drop=False)
  1859. assert_frame_equal(result, expected, check_dtype=False)
  1860. assert_series_equal(getattr(result, fname)(),
  1861. getattr(df, fname)())
    def test_cythonized_aggers(self):
        """Each cython aggregator matches a per-group python computation, for
        single- and multi-key groupbys with NaNs in both keys and values."""
        data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan],
                'B': ['A', 'B'] * 6,
                'C': np.random.randn(12)}
        df = DataFrame(data)
        df.loc[2:10:2, 'C'] = nan

        def _testit(name):

            op = lambda x: getattr(x, name)()

            # single column
            grouped = df.drop(['B'], axis=1).groupby('A')
            exp = {}
            for cat, group in grouped:
                exp[cat] = op(group['C'])
            exp = DataFrame({'C': exp})
            exp.index.name = 'A'
            result = op(grouped)
            assert_frame_equal(result, exp)

            # multiple columns
            grouped = df.groupby(['A', 'B'])
            expd = {}
            for (cat1, cat2), group in grouped:
                expd.setdefault(cat1, {})[cat2] = op(group['C'])
            exp = DataFrame(expd).T.stack(dropna=False)
            exp.index.names = ['A', 'B']
            exp.name = 'C'

            result = op(grouped)['C']
            # skip comparisons known to disagree with old bottleneck builds
            if not tm._incompat_bottleneck_version(name):
                assert_series_equal(result, exp)

        _testit('count')
        _testit('sum')
        _testit('std')
        _testit('var')
        _testit('sem')
        _testit('mean')
        _testit('median')
        _testit('prod')
        _testit('min')
        _testit('max')
  1900. def test_max_min_non_numeric(self):
  1901. # #2700
  1902. aa = DataFrame({'nn': [11, 11, 22, 22],
  1903. 'ii': [1, 2, 3, 4],
  1904. 'ss': 4 * ['mama']})
  1905. result = aa.groupby('nn').max()
  1906. self.assertTrue('ss' in result)
  1907. result = aa.groupby('nn').min()
  1908. self.assertTrue('ss' in result)
  1909. def test_cython_agg_boolean(self):
  1910. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  1911. 'b': np.random.randint(0, 2, 50).astype('bool')})
  1912. result = frame.groupby('a')['b'].mean()
  1913. expected = frame.groupby('a')['b'].agg(np.mean)
  1914. assert_series_equal(result, expected)
  1915. def test_cython_agg_nothing_to_agg(self):
  1916. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  1917. 'b': ['foo', 'bar'] * 25})
  1918. self.assertRaises(DataError, frame.groupby('a')['b'].mean)
  1919. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  1920. 'b': ['foo', 'bar'] * 25})
  1921. self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
    def test_cython_agg_nothing_to_agg_with_dates(self):
        """mean over an object column raises DataError even when a datetime
        column is also present in the frame."""
        frame = DataFrame({'a': np.random.randint(0, 5, 50),
                           'b': ['foo', 'bar'] * 25,
                           'dates': pd.date_range('now', periods=50,
                                                  freq='T')})
        with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"):
            frame.groupby('b').dates.mean()
  1929. def test_groupby_timedelta_cython_count(self):
  1930. df = DataFrame({'g': list('ab' * 2),
  1931. 'delt': np.arange(4).astype('timedelta64[ns]')})
  1932. expected = Series([
  1933. 2, 2
  1934. ], index=pd.Index(['a', 'b'], name='g'), name='delt')
  1935. result = df.groupby('g').delt.count()
  1936. tm.assert_series_equal(expected, result)
  1937. def test_cython_agg_frame_columns(self):
  1938. # #2113
  1939. df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
  1940. df.groupby(level=0, axis='columns').mean()
  1941. df.groupby(level=0, axis='columns').mean()
  1942. df.groupby(level=0, axis='columns').mean()
  1943. df.groupby(level=0, axis='columns').mean()
    def test_wrap_aggregated_output_multindex(self):
        """Aggregated output keeps a MultiIndex on the columns; columns whose
        aggfunc raises TypeError are dropped from the result."""
        df = self.mframe.T
        df['baz', 'two'] = 'peekaboo'

        keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
        agged = df.groupby(keys).agg(np.mean)
        tm.assertIsInstance(agged.columns, MultiIndex)

        def aggfun(ser):
            if ser.name == ('foo', 'one'):
                raise TypeError
            else:
                return ser.sum()

        agged2 = df.groupby(keys).aggregate(aggfun)
        # exactly one column -- ('foo', 'one') -- was dropped
        self.assertEqual(len(agged2.columns) + 1, len(df.columns))
    def test_groupby_level(self):
        """Grouping by MultiIndex level number or name matches grouping by
        the raw level values, on both axes; non-MultiIndex level>0 raises."""
        frame = self.mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0).sum()
        result1 = frame.groupby(level=1).sum()

        expected0 = frame.groupby(deleveled['first'].values).sum()
        expected1 = frame.groupby(deleveled['second'].values).sum()

        expected0 = expected0.reindex(frame.index.levels[0])
        expected1 = expected1.reindex(frame.index.levels[1])

        self.assertEqual(result0.index.name, 'first')
        self.assertEqual(result1.index.name, 'second')

        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)

        self.assertEqual(result0.index.name, frame.index.names[0])
        self.assertEqual(result1.index.name, frame.index.names[1])

        # groupby level name
        result0 = frame.groupby(level='first').sum()
        result1 = frame.groupby(level='second').sum()
        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)

        # axis=1
        result0 = frame.T.groupby(level=0, axis=1).sum()
        result1 = frame.T.groupby(level=1, axis=1).sum()
        assert_frame_equal(result0, expected0.T)
        assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        self.assertRaises(ValueError, self.df.groupby, level=1)
  1984. def test_groupby_level_index_names(self):
  1985. # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
  1986. df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3,
  1987. 'var1': lrange(6), }).set_index('exp')
  1988. df.groupby(level='exp')
  1989. self.assertRaises(ValueError, df.groupby, level='foo')
    def test_groupby_level_with_nas(self):
        """Level-based grouping sums correctly, and rows whose level label is
        -1 (missing) are excluded from the aggregation."""
        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
                                                              2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0).sum()
        expected = Series([22., 6.], index=[1, 0])
        assert_series_equal(result, expected)

        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0,
                                                               1, 2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0).sum()
        # the -1-labelled row (value 4.0) is dropped from the first total
        expected = Series([18., 6.], index=[1, 0])
        assert_series_equal(result, expected)
  2007. def test_groupby_level_apply(self):
  2008. frame = self.mframe
  2009. result = frame.groupby(level=0).count()
  2010. self.assertEqual(result.index.name, 'first')
  2011. result = frame.groupby(level=1).count()
  2012. self.assertEqual(result.index.name, 'second')
  2013. result = frame['A'].groupby(level=0).count()
  2014. self.assertEqual(result.index.name, 'first')
  2015. def test_groupby_args(self):
  2016. # PR8618 and issue 8015
  2017. frame = self.mframe
  2018. def j():
  2019. frame.groupby()
  2020. self.assertRaisesRegexp(TypeError,
  2021. "You have to supply one of 'by' and 'level'",
  2022. j)
  2023. def k():
  2024. frame.groupby(by=None, level=None)
  2025. self.assertRaisesRegexp(TypeError,
  2026. "You have to supply one of 'by' and 'level'",
  2027. k)
    def test_groupby_level_mapper(self):
        """A dict mapper applied to a MultiIndex level groups the same as
        mapping the raw level values by hand."""
        frame = self.mframe
        deleveled = frame.reset_index()

        mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
        mapper1 = {'one': 0, 'two': 0, 'three': 1}

        result0 = frame.groupby(mapper0, level=0).sum()
        result1 = frame.groupby(mapper1, level=1).sum()

        # apply the same mapping manually to the flattened level values
        mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
        mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
        expected0 = frame.groupby(mapped_level0).sum()
        expected1 = frame.groupby(mapped_level1).sum()
        expected0.index.name, expected1.index.name = 'first', 'second'

        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)
  2042. def test_groupby_level_0_nonmulti(self):
  2043. # #1313
  2044. a = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1,
  2045. 4, 5, 2, 6], name='foo'))
  2046. result = a.groupby(level=0).sum()
  2047. self.assertEqual(result.index.name, a.index.name)
  2048. def test_groupby_complex(self):
  2049. # GH 12902
  2050. a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
  2051. expected = Series((1 + 2j, 5 + 10j))
  2052. result = a.groupby(level=0).sum()
  2053. assert_series_equal(result, expected)
  2054. result = a.sum(level=0)
  2055. assert_series_equal(result, expected)
  2056. def test_level_preserve_order(self):
  2057. grouped = self.mframe.groupby(level=0)
  2058. exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3])
  2059. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  2060. def test_grouping_labels(self):
  2061. grouped = self.mframe.groupby(self.mframe.index.get_level_values(0))
  2062. exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3])
  2063. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  2064. def test_cython_fail_agg(self):
  2065. dr = bdate_range('1/1/2000', periods=50)
  2066. ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)
  2067. grouped = ts.groupby(lambda x: x.month)
  2068. summed = grouped.sum()
  2069. expected = grouped.agg(np.sum)
  2070. assert_series_equal(summed, expected)
    def test_apply_series_to_frame(self):
        """A frame-returning ``apply`` on a series groupby concatenates into
        a DataFrame aligned on the original index."""
        def f(piece):
            return DataFrame({'value': piece,
                              'demeaned': piece - piece.mean(),
                              'logged': np.log(piece)})

        dr = bdate_range('1/1/2000', periods=100)
        ts = Series(np.random.randn(100), index=dr)

        grouped = ts.groupby(lambda x: x.month)
        result = grouped.apply(f)

        tm.assertIsInstance(result, DataFrame)
        self.assert_index_equal(result.index, ts.index)
  2082. def test_apply_series_yield_constant(self):
  2083. result = self.df.groupby(['A', 'B'])['C'].apply(len)
  2084. self.assertEqual(result.index.names[:2], ('A', 'B'))
    def test_apply_frame_yield_constant(self):
        """GH13568: scalar-returning apply yields an unnamed Series, with or
        without column selection."""
        result = self.df.groupby(['A', 'B']).apply(len)
        self.assertTrue(isinstance(result, Series))
        self.assertIsNone(result.name)

        result = self.df.groupby(['A', 'B'])[['C', 'D']].apply(len)
        self.assertTrue(isinstance(result, Series))
        self.assertIsNone(result.name)
  2093. def test_apply_frame_to_series(self):
  2094. grouped = self.df.groupby(['A', 'B'])
  2095. result = grouped.apply(len)
  2096. expected = grouped.count()['C']
  2097. self.assert_index_equal(result.index, expected.index)
  2098. self.assert_numpy_array_equal(result.values, expected.values)
    def test_apply_frame_concat_series(self):
        """Series-returning apply on a frame groupby concatenates into one
        Series named after the source column."""
        def trans(group):
            return group.groupby('B')['C'].sum().sort_values()[:2]

        def trans2(group):
            # same computation, but grouping an already-selected series by
            # the aligned 'B' values
            grouped = group.groupby(df.reindex(group.index)['B'])
            return grouped.sum().sort_values()[:2]

        df = DataFrame({'A': np.random.randint(0, 5, 1000),
                        'B': np.random.randint(0, 5, 1000),
                        'C': np.random.randn(1000)})

        result = df.groupby('A').apply(trans)
        exp = df.groupby('A')['C'].apply(trans2)
        assert_series_equal(result, exp, check_names=False)
        self.assertEqual(result.name, 'C')
  2112. def test_apply_transform(self):
  2113. grouped = self.ts.groupby(lambda x: x.month)
  2114. result = grouped.apply(lambda x: x * 2)
  2115. expected = grouped.transform(lambda x: x * 2)
  2116. assert_series_equal(result, expected)
    def test_apply_multikey_corner(self):
        """apply on a two-function groupby is indexable by the group key."""
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])

        def f(group):
            # last five rows when sorted by column 'A'
            return group.sort_values('A')[-5:]

        result = grouped.apply(f)
        for key, group in grouped:
            assert_frame_equal(result.ix[key], f(group))
    def test_mutate_groups(self):
        """GH3380: mutating the passed group inside apply gives the same
        result as working on a copy."""
        mydf = DataFrame({
            'cat1': ['a'] * 8 + ['b'] * 6,
            'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
            ['d'] * 2 + ['e'] * 2,
            'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
            'val': np.random.randint(100, size=14),
        })

        def f_copy(x):
            x = x.copy()
            x['rank'] = x.val.rank(method='min')
            return x.groupby('cat2')['rank'].min()

        def f_no_copy(x):
            # mutates the group in place
            x['rank'] = x.val.rank(method='min')
            return x.groupby('cat2')['rank'].min()

        grpby_copy = mydf.groupby('cat1').apply(f_copy)
        grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy)
        assert_series_equal(grpby_copy, grpby_no_copy)
  2143. def test_no_mutate_but_looks_like(self):
  2144. # GH 8467
  2145. # first show's mutation indicator
  2146. # second does not, but should yield the same results
  2147. df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})
  2148. result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
  2149. result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key)
  2150. assert_series_equal(result1, result2)
  2151. def test_apply_chunk_view(self):
  2152. # Low level tinkering could be unsafe, make sure not
  2153. df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
  2154. 'value': lrange(9)})
  2155. # return view
  2156. f = lambda x: x[:2]
  2157. result = df.groupby('key', group_keys=False).apply(f)
  2158. expected = df.take([0, 1, 3, 4, 6, 7])
  2159. assert_frame_equal(result, expected)
    def test_apply_no_name_column_conflict(self):
        """GH 2605: a column literally named 'name' must not conflict with
        the internal group-name handling during apply (smoke test)."""
        df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
                        'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
                        'value': lrange(10)[::-1]})

        # it works! #2605
        grouped = df.groupby(['name', 'name2'])
        grouped.apply(lambda x: x.sort_values('value', inplace=True))
    def test_groupby_series_indexed_differently(self):
        """A grouping Series with a different index is aligned to the
        grouped Series' index before grouping."""
        s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
                    index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
        s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
                    index=Index(['a', 'b', 'd', 'f', 'g', 'h']))

        grouped = s1.groupby(s2)
        agged = grouped.mean()
        # explicit reindex + .get reproduces the implicit alignment
        exp = s1.groupby(s2.reindex(s1.index).get).mean()
        assert_series_equal(agged, exp)
    def test_groupby_with_hier_columns(self):
        """Grouping a frame with MultiIndex columns preserves the column
        MultiIndex on axis=0 operations and the row index on axis=1."""
        tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux',
                             'qux'], ['one', 'two', 'one', 'two', 'one', 'two',
                                      'one', 'two']]))
        index = MultiIndex.from_tuples(tuples)
        columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), (
            'B', 'cat'), ('A', 'dog')])
        df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

        result = df.groupby(level=0).mean()
        self.assert_index_equal(result.columns, columns)

        result = df.groupby(level=0, axis=1).mean()
        self.assert_index_equal(result.index, df.index)

        result = df.groupby(level=0).agg(np.mean)
        self.assert_index_equal(result.columns, columns)

        result = df.groupby(level=0).apply(lambda x: x.mean())
        self.assert_index_equal(result.columns, columns)

        # axis=1 grouping collapses columns to the outer level only
        result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
        self.assert_index_equal(result.columns, Index(['A', 'B']))
        self.assert_index_equal(result.index, df.index)

        # add a nuisance column
        sorted_columns, _ = columns.sortlevel(0)
        df['A', 'foo'] = 'bar'
        # the non-numeric nuisance column is dropped from mean()
        result = df.groupby(level=0).mean()
        self.assert_index_equal(result.columns, df.columns[:-1])
    def test_pass_args_kwargs(self):
        """Positional args and kwargs given to agg/apply/transform are
        forwarded to the aggregation function."""
        from numpy import percentile

        def f(x, q=None, axis=0):
            return percentile(x, q, axis=axis)

        g = lambda x: percentile(x, 80, axis=0)

        # Series: positional forwarding
        ts_grouped = self.ts.groupby(lambda x: x.month)
        agg_result = ts_grouped.agg(percentile, 80, axis=0)
        apply_result = ts_grouped.apply(percentile, 80, axis=0)
        trans_result = ts_grouped.transform(percentile, 80, axis=0)

        agg_expected = ts_grouped.quantile(.8)
        trans_expected = ts_grouped.transform(g)

        assert_series_equal(apply_result, agg_expected)
        assert_series_equal(agg_result, agg_expected, check_names=False)
        assert_series_equal(trans_result, trans_expected)

        # Series: keyword forwarding
        agg_result = ts_grouped.agg(f, q=80)
        apply_result = ts_grouped.apply(f, q=80)
        trans_result = ts_grouped.transform(f, q=80)
        assert_series_equal(agg_result, agg_expected)
        assert_series_equal(apply_result, agg_expected)
        assert_series_equal(trans_result, trans_expected)

        # DataFrame
        df_grouped = self.tsframe.groupby(lambda x: x.month)
        agg_result = df_grouped.agg(percentile, 80, axis=0)
        apply_result = df_grouped.apply(DataFrame.quantile, .8)
        expected = df_grouped.quantile(.8)
        assert_frame_equal(apply_result, expected)
        assert_frame_equal(agg_result, expected, check_names=False)

        agg_result = df_grouped.agg(f, q=80)
        apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
        assert_frame_equal(agg_result, expected, check_names=False)
        assert_frame_equal(apply_result, expected)
    def test_size(self):
        """GroupBy.size matches len() of each group, agrees with a
        per-column apply, and returns empty int64 for an empty frame."""
        grouped = self.df.groupby(['A', 'B'])
        result = grouped.size()
        for key, group in grouped:
            self.assertEqual(result[key], len(group))

        grouped = self.df.groupby('A')
        result = grouped.size()
        for key, group in grouped:
            self.assertEqual(result[key], len(group))

        grouped = self.df.groupby('B')
        result = grouped.size()
        for key, group in grouped:
            self.assertEqual(result[key], len(group))

        # size is consistent with an apply-based count, sorted or not
        df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
        for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
            left = df.groupby(key, sort=sort).size()
            right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
            assert_series_equal(left, right, check_names=False)

        # GH11699: size of an empty frame is an empty int64 Series
        df = DataFrame([], columns=['A', 'B'])
        out = Series([], dtype='int64', index=Index([], name='A'))
        assert_series_equal(df.groupby('A').size(), out)
    def test_count(self):
        """GroupBy.count across many dtypes (strings, ints, floats,
        datetimes, timedeltas, categoricals) matches an apply-based count,
        and GH5610: count only counts non-nulls."""
        from string import ascii_lowercase
        n = 1 << 15
        dr = date_range('2015-08-30', periods=n // 10, freq='T')
        df = DataFrame({
            '1st': np.random.choice(
                list(ascii_lowercase), n),
            '2nd': np.random.randint(0, 5, n),
            '3rd': np.random.randn(n).round(3),
            '4th': np.random.randint(-10, 10, n),
            '5th': np.random.choice(dr, n),
            '6th': np.random.randn(n).round(3),
            '7th': np.random.randn(n).round(3),
            '8th': np.random.choice(dr, n) - np.random.choice(dr, 1),
            '9th': np.random.choice(
                list(ascii_lowercase), n)
        })

        # sprinkle NaNs into everything except the grouping keys
        for col in df.columns.drop(['1st', '2nd', '4th']):
            df.loc[np.random.choice(n, n // 10), col] = np.nan

        df['9th'] = df['9th'].astype('category')

        for key in '1st', '2nd', ['1st', '2nd']:
            left = df.groupby(key).count()
            right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
            assert_frame_equal(left, right)

        # GH5610
        # count counts non-nulls
        df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]],
                          columns=['A', 'B', 'C'])

        count_as = df.groupby('A').count()
        count_not_as = df.groupby('A', as_index=False).count()

        expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
                             index=[1, 3])
        expected.index.name = 'A'
        assert_frame_equal(count_not_as, expected.reset_index())
        assert_frame_equal(count_as, expected)

        count_B = df.groupby('A')['B'].count()
        assert_series_equal(count_B, expected['B'])
  2291. def test_count_object(self):
  2292. df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3})
  2293. result = df.groupby('c').a.count()
  2294. expected = pd.Series([
  2295. 3, 3
  2296. ], index=pd.Index([2, 3], name='c'), name='a')
  2297. tm.assert_series_equal(result, expected)
  2298. df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
  2299. 'c': [2] * 3 + [3] * 3})
  2300. result = df.groupby('c').a.count()
  2301. expected = pd.Series([
  2302. 1, 3
  2303. ], index=pd.Index([2, 3], name='c'), name='a')
  2304. tm.assert_series_equal(result, expected)
    def test_count_cross_type(self):  # GH8169
        """GH8169: count is dtype-independent — casting the counted
        columns to float32/object must not change the result."""
        vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint(
            0, 2, (100, 2))))

        df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
        # turn every 2 into a missing value
        df[df == 2] = np.nan
        expected = df.groupby(['c', 'd']).count()

        for t in ['float32', 'object']:
            # NOTE: df is re-cast cumulatively across iterations; count
            # must still agree with the int-dtype expectation
            df['a'] = df['a'].astype(t)
            df['b'] = df['b'].astype(t)
            result = df.groupby(['c', 'd']).count()
            tm.assert_frame_equal(result, expected)
    def test_non_cython_api(self):
        """GH5610: non-cython reductions (mad, describe, any, idxmax) must
        not include the grouping column in their output."""
        # GH5610
        # non-cython calls should not include the grouper
        df = DataFrame(
            [[1, 2, 'foo'], [1,
                             nan,
                             'bar', ], [3, nan, 'baz']
             ], columns=['A', 'B', 'C'])
        g = df.groupby('A')
        gni = df.groupby('A', as_index=False)

        # mad
        expected = DataFrame([[0], [nan]], columns=['B'], index=[1, 3])
        expected.index.name = 'A'
        result = g.mad()
        assert_frame_equal(result, expected)

        # as_index=False keeps 'A' as a data column
        expected = DataFrame([[0., 0.], [0, nan]], columns=['A', 'B'],
                             index=[0, 1])
        result = gni.mad()
        assert_frame_equal(result, expected)

        # describe
        expected = DataFrame(dict(B=concat(
            [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
            keys=[1, 3])))
        expected.index.names = ['A', None]
        result = g.describe()
        assert_frame_equal(result, expected)

        expected = concat(
            [df.loc[[0, 1], ['A', 'B']].describe(),
             df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
        result = gni.describe()
        assert_frame_equal(result, expected)

        # any
        expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'],
                             index=[1, 3])
        expected.index.name = 'A'
        result = g.any()
        assert_frame_equal(result, expected)

        # idxmax
        expected = DataFrame([[0], [nan]], columns=['B'], index=[1, 3])
        expected.index.name = 'A'
        result = g.idxmax()
        assert_frame_equal(result, expected)
    def test_cython_api2(self):
        """cumsum/cumprod via the fast apply path: GH5614 (basic), GH5755
        (transformers ignore as_index), GH13994 (axis=1 works)."""
        # this takes the fast apply path

        # cumsum (GH5614)
        df = DataFrame(
            [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]
             ], columns=['A', 'B', 'C'])
        expected = DataFrame(
            [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
        result = df.groupby('A').cumsum()
        assert_frame_equal(result, expected)

        # GH 5755 - cumsum is a transformer and should ignore as_index
        result = df.groupby('A', as_index=False).cumsum()
        assert_frame_equal(result, expected)

        # GH 13994: axis=1 cum-ops act within each row, so grouping is a
        # no-op and the result equals the frame-wide cum-op
        result = df.groupby('A').cumsum(axis=1)
        expected = df.cumsum(axis=1)
        assert_frame_equal(result, expected)
        result = df.groupby('A').cumprod(axis=1)
        expected = df.cumprod(axis=1)
        assert_frame_equal(result, expected)
  2378. def test_grouping_ndarray(self):
  2379. grouped = self.df.groupby(self.df['A'].values)
  2380. result = grouped.sum()
  2381. expected = self.df.groupby('A').sum()
  2382. assert_frame_equal(result, expected, check_names=False
  2383. ) # Note: no names when grouping by value
  2384. def test_agg_consistency(self):
  2385. # agg with ([]) and () not consistent
  2386. # GH 6715
  2387. def P1(a):
  2388. try:
  2389. return np.percentile(a.dropna(), q=1)
  2390. except:
  2391. return np.nan
  2392. import datetime as dt
  2393. df = DataFrame({'col1': [1, 2, 3, 4],
  2394. 'col2': [10, 25, 26, 31],
  2395. 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10),
  2396. dt.date(2013, 2, 11), dt.date(2013, 2, 11)]})
  2397. g = df.groupby('date')
  2398. expected = g.agg([P1])
  2399. expected.columns = expected.columns.levels[0]
  2400. result = g.agg(P1)
  2401. assert_frame_equal(result, expected)
    def test_apply_typecast_fail(self):
        """apply that adds a derived column must not typecast the result
        incorrectly when reassembling groups."""
        df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
                        'c': np.tile(
                            ['a', 'b', 'c'], 2),
                        'v': np.arange(1., 7.)})

        def f(group):
            # min-max normalize 'v' within the group
            v = group['v']
            group['v2'] = (v - v.min()) / (v.max() - v.min())
            return group

        result = df.groupby('d').apply(f)

        expected = df.copy()
        expected['v2'] = np.tile([0., 0.5, 1], 2)

        assert_frame_equal(result, expected)
    def test_apply_multiindex_fail(self):
        """Same as test_apply_typecast_fail but with a MultiIndex on the
        rows: the index must survive the apply round-trip."""
        index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
                                        ])
        df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
                        'c': np.tile(['a', 'b', 'c'], 2),
                        'v': np.arange(1., 7.)}, index=index)

        def f(group):
            # min-max normalize 'v' within the group
            v = group['v']
            group['v2'] = (v - v.min()) / (v.max() - v.min())
            return group

        result = df.groupby('d').apply(f)

        expected = df.copy()
        expected['v2'] = np.tile([0., 0.5, 1], 2)

        assert_frame_equal(result, expected)
  2429. def test_apply_corner(self):
  2430. result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
  2431. expected = self.tsframe * 2
  2432. assert_frame_equal(result, expected)
    def test_apply_without_copy(self):
        """GH 5545: an applied function returning a non-copy (the group
        itself or a boolean-filtered view) must work and match the
        copy-returning variant."""
        # GH 5545
        # returning a non-copy in an applied function fails
        data = DataFrame({'id_field': [100, 100, 200, 300],
                          'category': ['a', 'b', 'c', 'c'],
                          'value': [1, 2, 3, 4]})

        def filt1(x):
            # returns an explicit copy for singleton groups
            if x.shape[0] == 1:
                return x.copy()
            else:
                return x[x.category == 'c']

        def filt2(x):
            # returns the group object itself (non-copy) for singletons
            if x.shape[0] == 1:
                return x
            else:
                return x[x.category == 'c']

        expected = data.groupby('id_field').apply(filt1)
        result = data.groupby('id_field').apply(filt2)
        assert_frame_equal(result, expected)
    def test_apply_use_categorical_name(self):
        """Grouping by a qcut categorical derived from column 'C' names
        the result index level 'C'."""
        from pandas import qcut
        cats = qcut(self.df.C, 4)

        def get_stats(group):
            # dict return -> one index level per stat under each bin
            return {'min': group.min(),
                    'max': group.max(),
                    'count': group.count(),
                    'mean': group.mean()}

        result = self.df.groupby(cats).D.apply(get_stats)
        self.assertEqual(result.index.names[0], 'C')
    def test_apply_categorical_data(self):
        """GH 10138: unused categories appear (as NaN rows) in aggregation
        output, but transform-like apply keeps only observed groups."""
        # GH 10138
        for ordered in [True, False]:
            dense = Categorical(list('abc'), ordered=ordered)
            # 'b' is in the categories but not in the list
            missing = Categorical(
                list('aaa'), categories=['a', 'b'], ordered=ordered)
            values = np.arange(len(dense))
            df = DataFrame({'missing': missing,
                            'dense': dense,
                            'values': values})
            grouped = df.groupby(['missing', 'dense'])

            # missing category 'b' should still exist in the output index
            idx = MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']],
                                          names=['missing', 'dense'])
            expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
                                 index=idx,
                                 columns=['values'])

            assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
            assert_frame_equal(grouped.mean(), expected)
            assert_frame_equal(grouped.agg(np.mean), expected)

            # but for transform we should still get back the original index
            idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
                                          names=['missing', 'dense'])
            expected = Series(1, index=idx)
            assert_series_equal(grouped.apply(lambda x: 1), expected)
    def test_apply_corner_cases(self):
        """GH 535: apply cannot use a sliding iterator; a column added
        inside the applied function must appear in the result."""
        # #535, can't use sliding iterator

        N = 1000
        labels = np.random.randint(0, 100, size=N)
        df = DataFrame({'key': labels,
                        'value1': np.random.randn(N),
                        'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})

        grouped = df.groupby('key')

        def f(g):
            g['value3'] = g['value1'] * 2
            return g

        result = grouped.apply(f)
        self.assertTrue('value3' in result)
    def test_transform_mixed_type(self):
        """apply on mixed-dtype groups keeps the float dtype of 'd', and
        each group's slice of the result matches re-running f."""
        index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
                                        ])
        df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
                        'c': np.tile(['a', 'b', 'c'], 2),
                        'v': np.arange(1., 7.)}, index=index)

        def f(group):
            # adds a derived column, returns only the first row
            group['g'] = group['d'] * 2
            return group[:1]

        grouped = df.groupby('c')
        result = grouped.apply(f)

        self.assertEqual(result['d'].dtype, np.float64)

        # this is by definition a mutating operation!
        with option_context('mode.chained_assignment', None):
            for key, group in grouped:
                res = f(group)
                assert_frame_equal(res, result.ix[key])
    def test_groupby_wrong_multi_labels(self):
        """agg(np.mean) and .mean() agree on a four-key grouping read from
        CSV text."""
        from pandas import read_csv
        data = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""
        data = read_csv(StringIO(data), index_col=0)

        grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)
  2531. def test_groupby_series_with_name(self):
  2532. result = self.df.groupby(self.df['A']).mean()
  2533. result2 = self.df.groupby(self.df['A'], as_index=False).mean()
  2534. self.assertEqual(result.index.name, 'A')
  2535. self.assertIn('A', result2)
  2536. result = self.df.groupby([self.df['A'], self.df['B']]).mean()
  2537. result2 = self.df.groupby([self.df['A'], self.df['B']],
  2538. as_index=False).mean()
  2539. self.assertEqual(result.index.names, ('A', 'B'))
  2540. self.assertIn('A', result2)
  2541. self.assertIn('B', result2)
  2542. def test_seriesgroupby_name_attr(self):
  2543. # GH 6265
  2544. result = self.df.groupby('A')['C']
  2545. self.assertEqual(result.count().name, 'C')
  2546. self.assertEqual(result.mean().name, 'C')
  2547. testFunc = lambda x: np.sum(x) * 2
  2548. self.assertEqual(result.agg(testFunc).name, 'C')
    def test_consistency_name(self):
        """GH 12363: df.groupby(key).B.count() and
        df.B.groupby(df.key).count() give the same named result."""
        # GH 12363
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        expected = df.groupby(['A']).B.count()
        result = df.B.groupby(df.A).count()
        assert_series_equal(result, expected)
    def test_groupby_name_propagation(self):
        """GH 6124: a consistent Series name across groups becomes the
        result's columns name; inconsistent names propagate nothing."""
        # GH 6124
        def summarize(df, name=None):
            return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)

        def summarize_random_name(df):
            # Provide a different name for each Series. In this case, groupby
            # should not attempt to propagate the Series name since they are
            # inconsistent.
            return Series({
                'count': 1,
                'mean': 2,
                'omissions': 3,
            }, name=df.iloc[0]['A'])

        metrics = self.df.groupby('A').apply(summarize)
        self.assertEqual(metrics.columns.name, None)
        metrics = self.df.groupby('A').apply(summarize, 'metrics')
        self.assertEqual(metrics.columns.name, 'metrics')
        metrics = self.df.groupby('A').apply(summarize_random_name)
        self.assertEqual(metrics.columns.name, None)
  2579. def test_groupby_nonstring_columns(self):
  2580. df = DataFrame([np.arange(10) for x in range(10)])
  2581. grouped = df.groupby(0)
  2582. result = grouped.mean()
  2583. expected = df.groupby(df[0]).mean()
  2584. assert_frame_equal(result, expected)
    def test_groupby_mixed_type_columns(self):
        """GH 13432: columns of mixed types (str and int labels), which
        are unorderable in py3, must not break first()/sum()."""
        # GH 13432, unorderable types in py3
        df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
        expected = DataFrame([[1, 2]], columns=['B', 0],
                             index=Index([0], name='A'))

        result = df.groupby('A').first()
        tm.assert_frame_equal(result, expected)

        result = df.groupby('A').sum()
        tm.assert_frame_equal(result, expected)
  2594. def test_cython_grouper_series_bug_noncontig(self):
  2595. arr = np.empty((100, 100))
  2596. arr.fill(np.nan)
  2597. obj = Series(arr[:, 0], index=lrange(100))
  2598. inds = np.tile(lrange(10), 10)
  2599. result = obj.groupby(inds).agg(Series.median)
  2600. self.assertTrue(result.isnull().all())
    def test_series_grouper_noncontig_index(self):
        """Regression: aggregating over a Series built from a strided
        index slice must not segfault when touching index elements."""
        index = Index(tm.rands_array(10, 100))

        values = Series(np.random.randn(50), index=index[::2])
        labels = np.random.randint(0, 5, 50)

        # it works!
        grouped = values.groupby(labels)

        # accessing the index elements causes segfault
        f = lambda x: len(set(map(id, x.index)))
        grouped.agg(f)
    def test_convert_objects_leave_decimal_alone(self):
        """agg results containing Decimal objects must keep object dtype
        (no lossy conversion), on both the fast and pure-python paths."""
        from decimal import Decimal

        s = Series(lrange(5))
        labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')

        def convert_fast(x):
            return Decimal(str(x.mean()))

        def convert_force_pure(x):
            # x is backed by a larger base array here (len(x.base) > 0),
            # which forces the pure-python aggregation path
            assert (len(x.base) > 0)
            return Decimal(str(x.mean()))

        grouped = s.groupby(labels)

        result = grouped.agg(convert_fast)
        self.assertEqual(result.dtype, np.object_)
        tm.assertIsInstance(result[0], Decimal)

        result = grouped.agg(convert_force_pure)
        self.assertEqual(result.dtype, np.object_)
        tm.assertIsInstance(result[0], Decimal)
    def test_fast_apply(self):
        """The cython fast-apply path is taken (no fallback) and reports
        the groups as unmutated for a read-only function."""
        # make sure that fast apply is correctly called
        # rather than raising any kind of error
        # otherwise the python path will be called
        # which slows things down
        N = 1000
        labels = np.random.randint(0, 2000, size=N)
        labels2 = np.random.randint(0, 3, size=N)
        df = DataFrame({'key': labels,
                        'key2': labels2,
                        'value1': np.random.randn(N),
                        'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})

        def f(g):
            return 1

        g = df.groupby(['key', 'key2'])

        # drive the splitter directly, bypassing GroupBy.apply
        grouper = g.grouper
        splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
        group_keys = grouper._get_group_keys()

        values, mutated = splitter.fast_apply(f, group_keys)
        self.assertFalse(mutated)
    def test_apply_with_mixed_dtype(self):
        """GH3480 / GH 3610: apply on mixed dtypes keeps dtype counts, and
        as_index=False must not change a float grouping key's dtype."""
        # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
        df = DataFrame({'foo1': ['one', 'two', 'two', 'three', 'one', 'two'],
                        'foo2': np.random.randn(6)})
        result = df.apply(lambda x: x, axis=1)
        assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())

        # GH 3610 incorrect dtype conversion with as_index=False
        df = DataFrame({"c1": [1, 2, 6, 6, 8]})
        df["c2"] = df.c1 / 2.0
        result1 = df.groupby("c2").mean().reset_index().c2
        result2 = df.groupby("c2", as_index=False).mean().c2
        assert_series_equal(result1, result2)
    def test_groupby_aggregation_mixed_dtype(self):
        """GH 6212: grouping on keys that mix strings, ints and NaN still
        aggregates correctly (NaN key rows are dropped)."""
        # GH 6212
        expected = DataFrame({
            'v1': [5, 5, 7, np.nan, 3, 3, 4, 1],
            'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]},
            index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99),
                                          ('big', 'damp'),
                                          ('blue', 'dry'),
                                          ('red', 'red'), ('red', 'wet')],
                                         names=['by1', 'by2']))

        df = DataFrame({
            'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
            'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
            'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan,
                    12],
            'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99,
                    np.nan, np.nan]
        })

        g = df.groupby(['by1', 'by2'])
        result = g[['v1', 'v2']].mean()
        assert_frame_equal(result, expected)
    def test_groupby_dtype_inference_empty(self):
        """GH 6733: grouping an empty frame keeps the non-key column's
        original dtype (int64) rather than inferring float."""
        # GH 6733
        df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
        self.assertEqual(df['x'].dtype, np.float64)

        result = df.groupby('x').first()
        exp_index = Index([], name='x', dtype=np.float64)
        expected = DataFrame({'range': Series(
            [], index=exp_index, dtype='int64')})
        assert_frame_equal(result, expected, by_blocks=True)
  2689. def test_groupby_list_infer_array_like(self):
  2690. result = self.df.groupby(list(self.df['A'])).mean()
  2691. expected = self.df.groupby(self.df['A']).mean()
  2692. assert_frame_equal(result, expected, check_names=False)
  2693. self.assertRaises(Exception, self.df.groupby, list(self.df['A'][:-1]))
  2694. # pathological case of ambiguity
  2695. df = DataFrame({'foo': [0, 1],
  2696. 'bar': [3, 4],
  2697. 'val': np.random.randn(2)})
  2698. result = df.groupby(['foo', 'bar']).mean()
  2699. expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
    def test_groupby_keys_same_size_as_index(self):
        """GH 11185: a Grouper whose freq equals the index freq plus a
        column key is equivalent to set_index on both."""
        # GH 11185
        freq = 's'
        index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'),
                              periods=2, freq=freq)
        df = pd.DataFrame([['A', 10], ['B', 15]], columns=[
            'metric', 'values'
        ], index=index)
        result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean()
        expected = df.set_index([df.index, 'metric'])

        assert_frame_equal(result, expected)
  2711. def test_groupby_one_row(self):
  2712. # GH 11741
  2713. df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD'))
  2714. self.assertRaises(KeyError, df1.groupby, 'Z')
  2715. df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD'))
  2716. self.assertRaises(KeyError, df2.groupby, 'Z')
    def test_groupby_nat_exclude(self):
        """GH 6992: NaT/NaN grouping keys are excluded from groups,
        indices and get_group, without filtering the underlying frame."""
        # GH 6992
        df = pd.DataFrame(
            {'values': np.random.randn(8),
             'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp(
                 '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan,
                 pd.Timestamp('2013-01-01')],
             'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
        grouped = df.groupby('dt')

        expected = [[1, 7], [3, 5]]
        keys = sorted(grouped.groups.keys())
        self.assertEqual(len(keys), 2)
        for k, e in zip(keys, expected):
            # grouped.groups keys are np.datetime64 with system tz
            # not to be affected by tz, only compare values
            self.assertEqual(grouped.groups[k], e)

        # confirm obj is not filtered
        tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
        self.assertEqual(grouped.ngroups, 2)

        expected = {
            Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
            Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
        }

        for k in grouped.indices:
            self.assert_numpy_array_equal(grouped.indices[k], expected[k])

        tm.assert_frame_equal(
            grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
        tm.assert_frame_equal(
            grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])

        # NaT itself is never a group key
        self.assertRaises(KeyError, grouped.get_group, pd.NaT)

        # all-null keys -> zero groups
        nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
                            'nat': [pd.NaT, pd.NaT, pd.NaT]})
        self.assertEqual(nan_df['nan'].dtype, 'float64')
        self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]')

        for key in ['nan', 'nat']:
            grouped = nan_df.groupby(key)
            self.assertEqual(grouped.groups, {})
            self.assertEqual(grouped.ngroups, 0)
            self.assertEqual(grouped.indices, {})
            self.assertRaises(KeyError, grouped.get_group, np.nan)
            self.assertRaises(KeyError, grouped.get_group, pd.NaT)
  2758. def test_dictify(self):
  2759. dict(iter(self.df.groupby('A')))
  2760. dict(iter(self.df.groupby(['A', 'B'])))
  2761. dict(iter(self.df['C'].groupby(self.df['A'])))
  2762. dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']])))
  2763. dict(iter(self.df.groupby('A')['C']))
  2764. dict(iter(self.df.groupby(['A', 'B'])['C']))
    def test_sparse_friendly(self):
        """Sparse frames/series support mean, agg and dict-iteration
        through groupby (smoke test)."""
        sdf = self.df[['C', 'D']].to_sparse()
        panel = tm.makePanel()
        tm.add_nans(panel)

        def _check_work(gp):
            gp.mean()
            gp.agg(np.mean)
            dict(iter(gp))

        # it works!
        _check_work(sdf.groupby(lambda x: x // 2))
        _check_work(sdf['C'].groupby(lambda x: x // 2))
        _check_work(sdf.groupby(self.df['A']))

        # do this someday
        # _check_work(panel.groupby(lambda x: x.month, axis=1))
    def test_panel_groupby(self):
        """Panel groupby over items, major and minor axes: mean and agg
        agree and the grouped axis collapses to the mapping's values."""
        self.panel = tm.makePanel()
        tm.add_nans(self.panel)
        grouped = self.panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1},
                                     axis='items')
        agged = grouped.mean()
        agged2 = grouped.agg(lambda x: x.mean('items'))

        tm.assert_panel_equal(agged, agged2)

        self.assert_index_equal(agged.items, Index([0, 1]))

        grouped = self.panel.groupby(lambda x: x.month, axis='major')
        agged = grouped.mean()

        exp = Index(sorted(list(set(self.panel.major_axis.month))))
        self.assert_index_equal(agged.major_axis, exp)

        grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                     axis='minor')
        agged = grouped.mean()
        self.assert_index_equal(agged.minor_axis, Index([0, 1]))
    def test_numpy_groupby(self):
        """DataFrame.groupby(labels).sum() matches the low-level
        numpy_groupby helper on both axes."""
        from pandas.core.groupby import numpy_groupby
        data = np.random.randn(100, 100)
        labels = np.random.randint(0, 10, size=100)

        df = DataFrame(data)

        result = df.groupby(labels).sum().values
        expected = numpy_groupby(data, labels)
        assert_almost_equal(result, expected)

        result = df.groupby(labels, axis=1).sum().values
        expected = numpy_groupby(data, labels, axis=1)
        assert_almost_equal(result, expected)
  2807. def test_groupby_2d_malformed(self):
  2808. d = DataFrame(index=lrange(2))
  2809. d['group'] = ['g1', 'g2']
  2810. d['zeros'] = [0, 0]
  2811. d['ones'] = [1, 1]
  2812. d['label'] = ['l1', 'l2']
  2813. tmp = d.groupby(['group']).mean()
  2814. res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
  2815. self.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
  2816. self.assert_numpy_array_equal(tmp.values, res_values)
  2817. def test_int32_overflow(self):
  2818. B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)
  2819. ))
  2820. A = np.arange(25000)
  2821. df = DataFrame({'A': A,
  2822. 'B': B,
  2823. 'C': A,
  2824. 'D': B,
  2825. 'E': np.random.randn(25000)})
  2826. left = df.groupby(['A', 'B', 'C', 'D']).sum()
  2827. right = df.groupby(['D', 'C', 'B', 'A']).sum()
  2828. self.assertEqual(len(left), len(right))
    def test_int64_overflow(self):
        """Groupings whose combined key space can overflow int64 must
        still produce correct, sorted results (also GH9096)."""
        from pandas.core.groupby import _int64_overflow_possible

        B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
        A = np.arange(2500)
        df = DataFrame({'A': A,
                        'B': B,
                        'C': A,
                        'D': B,
                        'E': A,
                        'F': B,
                        'G': A,
                        'H': B,
                        'values': np.random.randn(2500)})

        lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
        rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])

        left = lg.sum()['values']
        right = rg.sum()['values']

        # both results come back sorted by their own key order
        exp_index, _ = left.index.sortlevel(0)
        self.assert_index_equal(left.index, exp_index)

        exp_index, _ = right.index.sortlevel(0)
        self.assert_index_equal(right.index, exp_index)

        # grouping by the tuple array is the reference result
        tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'
                                   ]].values))
        tups = com._asarray_tuplesafe(tups)
        expected = df.groupby(tups).sum()['values']

        for k, v in compat.iteritems(expected):
            self.assertEqual(left[k], right[k[::-1]])
            self.assertEqual(left[k], v)
        self.assertEqual(len(left), len(right))

        # GH9096
        values = range(55109)
        data = pd.DataFrame.from_dict({'a': values,
                                       'b': values,
                                       'c': values,
                                       'd': values})
        grouped = data.groupby(['a', 'b', 'c', 'd'])
        self.assertEqual(len(grouped), len(values))

        arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
        i = np.random.choice(len(arr), len(arr) * 4)
        arr = np.vstack((arr, arr[i]))  # add some duplicate rows

        i = np.random.permutation(len(arr))
        arr = arr[i]  # shuffle rows

        df = DataFrame(arr, columns=list('abcde'))
        df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
        gr = df.groupby(list('abcde'))

        # verify this is testing what it is supposed to test!
        self.assertTrue(_int64_overflow_possible(gr.grouper.shape))

        # manually compute groupings
        jim, joe = defaultdict(list), defaultdict(list)
        for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
            jim[key].append(a)
            joe[key].append(b)

        self.assertEqual(len(gr), len(jim))
        mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))

        def aggr(func):
            # reference aggregation over the manual groupings
            f = lambda a: np.fromiter(map(func, a), dtype='f8')
            arr = np.vstack((f(jim.values()), f(joe.values()))).T
            res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
            return res.sort_index()

        assert_frame_equal(gr.mean(), aggr(np.mean))
        assert_frame_equal(gr.median(), aggr(np.median))
  2890. def test_groupby_sort_multi(self):
  2891. df = DataFrame({'a': ['foo', 'bar', 'baz'],
  2892. 'b': [3, 2, 1],
  2893. 'c': [0, 1, 2],
  2894. 'd': np.random.randn(3)})
  2895. tups = lmap(tuple, df[['a', 'b', 'c']].values)
  2896. tups = com._asarray_tuplesafe(tups)
  2897. result = df.groupby(['a', 'b', 'c'], sort=True).sum()
  2898. self.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])
  2899. tups = lmap(tuple, df[['c', 'a', 'b']].values)
  2900. tups = com._asarray_tuplesafe(tups)
  2901. result = df.groupby(['c', 'a', 'b'], sort=True).sum()
  2902. self.assert_numpy_array_equal(result.index.values, tups)
  2903. tups = lmap(tuple, df[['b', 'c', 'a']].values)
  2904. tups = com._asarray_tuplesafe(tups)
  2905. result = df.groupby(['b', 'c', 'a'], sort=True).sum()
  2906. self.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])
  2907. df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
  2908. 'b': [0, 0, 0, 1, 1, 1],
  2909. 'd': np.random.randn(6)})
  2910. grouped = df.groupby(['a', 'b'])['d']
  2911. result = grouped.sum()
  2912. _check_groupby(df, result, ['a', 'b'], 'd')
  2913. def test_intercept_builtin_sum(self):
  2914. s = Series([1., 2., np.nan, 3.])
  2915. grouped = s.groupby([0, 1, 2, 2])
  2916. result = grouped.agg(builtins.sum)
  2917. result2 = grouped.apply(builtins.sum)
  2918. expected = grouped.sum()
  2919. assert_series_equal(result, expected)
  2920. assert_series_equal(result2, expected)
  2921. def test_column_select_via_attr(self):
  2922. result = self.df.groupby('A').C.sum()
  2923. expected = self.df.groupby('A')['C'].sum()
  2924. assert_series_equal(result, expected)
  2925. self.df['mean'] = 1.5
  2926. result = self.df.groupby('A').mean()
  2927. expected = self.df.groupby('A').agg(np.mean)
  2928. assert_frame_equal(result, expected)
  2929. def test_rank_apply(self):
  2930. lev1 = tm.rands_array(10, 100)
  2931. lev2 = tm.rands_array(10, 130)
  2932. lab1 = np.random.randint(0, 100, size=500)
  2933. lab2 = np.random.randint(0, 130, size=500)
  2934. df = DataFrame({'value': np.random.randn(500),
  2935. 'key1': lev1.take(lab1),
  2936. 'key2': lev2.take(lab2)})
  2937. result = df.groupby(['key1', 'key2']).value.rank()
  2938. expected = []
  2939. for key, piece in df.groupby(['key1', 'key2']):
  2940. expected.append(piece.value.rank())
  2941. expected = concat(expected, axis=0)
  2942. expected = expected.reindex(result.index)
  2943. assert_series_equal(result, expected)
  2944. result = df.groupby(['key1', 'key2']).value.rank(pct=True)
  2945. expected = []
  2946. for key, piece in df.groupby(['key1', 'key2']):
  2947. expected.append(piece.value.rank(pct=True))
  2948. expected = concat(expected, axis=0)
  2949. expected = expected.reindex(result.index)
  2950. assert_series_equal(result, expected)
  2951. def test_dont_clobber_name_column(self):
  2952. df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
  2953. 'name': ['foo', 'bar', 'baz'] * 2})
  2954. result = df.groupby('key').apply(lambda x: x)
  2955. assert_frame_equal(result, df)
  2956. def test_skip_group_keys(self):
  2957. from pandas import concat
  2958. tsf = tm.makeTimeDataFrame()
  2959. grouped = tsf.groupby(lambda x: x.month, group_keys=False)
  2960. result = grouped.apply(lambda x: x.sort_values(by='A')[:3])
  2961. pieces = []
  2962. for key, group in grouped:
  2963. pieces.append(group.sort_values(by='A')[:3])
  2964. expected = concat(pieces)
  2965. assert_frame_equal(result, expected)
  2966. grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
  2967. result = grouped.apply(lambda x: x.sort_values()[:3])
  2968. pieces = []
  2969. for key, group in grouped:
  2970. pieces.append(group.sort_values()[:3])
  2971. expected = concat(pieces)
  2972. assert_series_equal(result, expected)
  2973. def test_no_nonsense_name(self):
  2974. # GH #995
  2975. s = self.frame['C'].copy()
  2976. s.name = None
  2977. result = s.groupby(self.frame['A']).agg(np.sum)
  2978. self.assertIsNone(result.name)
  2979. def test_wrap_agg_out(self):
  2980. grouped = self.three_group.groupby(['A', 'B'])
  2981. def func(ser):
  2982. if ser.dtype == np.object:
  2983. raise TypeError
  2984. else:
  2985. return ser.sum()
  2986. result = grouped.aggregate(func)
  2987. exp_grouped = self.three_group.ix[:, self.three_group.columns != 'C']
  2988. expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
  2989. assert_frame_equal(result, expected)
  2990. def test_multifunc_sum_bug(self):
  2991. # GH #1065
  2992. x = DataFrame(np.arange(9).reshape(3, 3))
  2993. x['test'] = 0
  2994. x['fl'] = [1.3, 1.5, 1.6]
  2995. grouped = x.groupby('test')
  2996. result = grouped.agg({'fl': 'sum', 2: 'size'})
  2997. self.assertEqual(result['fl'].dtype, np.float64)
  2998. def test_handle_dict_return_value(self):
  2999. def f(group):
  3000. return {'min': group.min(), 'max': group.max()}
  3001. def g(group):
  3002. return Series({'min': group.min(), 'max': group.max()})
  3003. result = self.df.groupby('A')['C'].apply(f)
  3004. expected = self.df.groupby('A')['C'].apply(g)
  3005. tm.assertIsInstance(result, Series)
  3006. assert_series_equal(result, expected)
  3007. def test_getitem_list_of_columns(self):
  3008. df = DataFrame(
  3009. {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
  3010. 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
  3011. 'C': np.random.randn(8),
  3012. 'D': np.random.randn(8),
  3013. 'E': np.random.randn(8)})
  3014. result = df.groupby('A')[['C', 'D']].mean()
  3015. result2 = df.groupby('A')['C', 'D'].mean()
  3016. result3 = df.groupby('A')[df.columns[2:4]].mean()
  3017. expected = df.ix[:, ['A', 'C', 'D']].groupby('A').mean()
  3018. assert_frame_equal(result, expected)
  3019. assert_frame_equal(result2, expected)
  3020. assert_frame_equal(result3, expected)
  3021. def test_getitem_numeric_column_names(self):
  3022. # GH #13731
  3023. df = DataFrame({0: list('abcd') * 2,
  3024. 2: np.random.randn(8),
  3025. 4: np.random.randn(8),
  3026. 6: np.random.randn(8)})
  3027. result = df.groupby(0)[df.columns[1:3]].mean()
  3028. result2 = df.groupby(0)[2, 4].mean()
  3029. result3 = df.groupby(0)[[2, 4]].mean()
  3030. expected = df.ix[:, [0, 2, 4]].groupby(0).mean()
  3031. assert_frame_equal(result, expected)
  3032. assert_frame_equal(result2, expected)
  3033. assert_frame_equal(result3, expected)
  3034. def test_agg_multiple_functions_maintain_order(self):
  3035. # GH #610
  3036. funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
  3037. result = self.df.groupby('A')['C'].agg(funcs)
  3038. exp_cols = Index(['mean', 'max', 'min'])
  3039. self.assert_index_equal(result.columns, exp_cols)
  3040. def test_multiple_functions_tuples_and_non_tuples(self):
  3041. # #1359
  3042. funcs = [('foo', 'mean'), 'std']
  3043. ex_funcs = [('foo', 'mean'), ('std', 'std')]
  3044. result = self.df.groupby('A')['C'].agg(funcs)
  3045. expected = self.df.groupby('A')['C'].agg(ex_funcs)
  3046. assert_frame_equal(result, expected)
  3047. result = self.df.groupby('A').agg(funcs)
  3048. expected = self.df.groupby('A').agg(ex_funcs)
  3049. assert_frame_equal(result, expected)
  3050. def test_agg_multiple_functions_too_many_lambdas(self):
  3051. grouped = self.df.groupby('A')
  3052. funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
  3053. self.assertRaises(SpecificationError, grouped.agg, funcs)
    def test_more_flexible_frame_multi_function(self):
        """Dict-of-column -> function(s) specs for DataFrameGroupBy.aggregate."""
        from pandas import concat

        grouped = self.df.groupby('A')

        # build the expected frame one statistic at a time, then interleave
        # the (column, stat) levels to match aggregate()'s output layout
        exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
        exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))
        expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
        expected = expected.swaplevel(0, 1, axis=1).sortlevel(0, axis=1)

        d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
        result = grouped.aggregate(d)
        assert_frame_equal(result, expected)

        # be careful
        # NOTE(review): 'result' and 'expected' below are produced from
        # identical specs, so this assertion is tautological -- it only
        # verifies that a mixed scalar/list spec does not raise.
        result = grouped.aggregate(OrderedDict([['C', np.mean],
                                                ['D', [np.mean, np.std]]]))
        expected = grouped.aggregate(OrderedDict([['C', np.mean],
                                                  ['D', [np.mean, np.std]]]))
        assert_frame_equal(result, expected)

        def foo(x):
            return np.mean(x)

        def bar(x):
            return np.std(x, ddof=1)

        # renaming spec: the inner dict maps output name -> function, which
        # should equal aggregating with the identically-named plain functions
        d = OrderedDict([['C', np.mean], ['D', OrderedDict(
            [['foo', np.mean], ['bar', np.std]])]])
        result = grouped.aggregate(d)

        d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
        expected = grouped.aggregate(d)
        assert_frame_equal(result, expected)
    def test_multi_function_flexible_mix(self):
        # GH #1268: mixing a renaming dict, a bare string and a list of
        # strings in one aggregation spec must all normalize identically
        grouped = self.df.groupby('A')

        # dict spec for 'C', bare string for 'D'
        d = OrderedDict([['C', OrderedDict([['foo', 'mean'], [
            'bar', 'std'
        ]])], ['D', 'sum']])
        result = grouped.aggregate(d)

        # same, but 'D' given as a one-element list
        d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [
            'bar', 'std'
        ]])], ['D', ['sum']]])
        result2 = grouped.aggregate(d2)

        # same, but 'D' given as a renaming dict -- the reference layout
        d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [
            'bar', 'std'
        ]])], ['D', {'sum': 'sum'}]])
        expected = grouped.aggregate(d3)

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
  3097. def test_agg_callables(self):
  3098. # GH 7929
  3099. df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64)
  3100. class fn_class(object):
  3101. def __call__(self, x):
  3102. return sum(x)
  3103. equiv_callables = [sum, np.sum, lambda x: sum(x), lambda x: x.sum(),
  3104. partial(sum), fn_class()]
  3105. expected = df.groupby("foo").agg(sum)
  3106. for ecall in equiv_callables:
  3107. result = df.groupby('foo').agg(ecall)
  3108. assert_frame_equal(result, expected)
  3109. def test_set_group_name(self):
  3110. def f(group):
  3111. assert group.name is not None
  3112. return group
  3113. def freduce(group):
  3114. assert group.name is not None
  3115. return group.sum()
  3116. def foo(x):
  3117. return freduce(x)
  3118. def _check_all(grouped):
  3119. # make sure all these work
  3120. grouped.apply(f)
  3121. grouped.aggregate(freduce)
  3122. grouped.aggregate({'C': freduce, 'D': freduce})
  3123. grouped.transform(f)
  3124. grouped['C'].apply(f)
  3125. grouped['C'].aggregate(freduce)
  3126. grouped['C'].aggregate([freduce, foo])
  3127. grouped['C'].transform(f)
  3128. _check_all(self.df.groupby('A'))
  3129. _check_all(self.df.groupby(['A', 'B']))
  3130. def test_no_dummy_key_names(self):
  3131. # GH #1291
  3132. result = self.df.groupby(self.df['A'].values).sum()
  3133. self.assertIsNone(result.index.name)
  3134. result = self.df.groupby([self.df['A'].values, self.df['B'].values
  3135. ]).sum()
  3136. self.assertEqual(result.index.names, (None, None))
    def test_groupby_sort_categorical(self):
        # dataframe groupby sort was being ignored # GH 8868
        df = DataFrame([['(7.5, 10]', 10, 10],
                        ['(7.5, 10]', 8, 20],
                        ['(2.5, 5]', 5, 30],
                        ['(5, 7.5]', 6, 40],
                        ['(2.5, 5]', 4, 50],
                        ['(0, 2.5]', 1, 60],
                        ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
        df['range'] = Categorical(df['range'], ordered=True)

        # expected result with groups in category (sorted) order
        index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                                  '(7.5, 10]'], name='range', ordered=True)
        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                                columns=['foo', 'bar'], index=index)

        col = 'range'
        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        # when categories is ordered, group is ordered by category's order
        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

        df['range'] = Categorical(df['range'], ordered=False)
        index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                                  '(7.5, 10]'], name='range')
        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                                columns=['foo', 'bar'], index=index)

        # with sort=False the groups come out in order of first appearance
        index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
                                  '(0, 2.5]'],
                                 categories=['(7.5, 10]', '(2.5, 5]',
                                             '(5, 7.5]', '(0, 2.5]'],
                                 name='range')
        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                  index=index, columns=['foo', 'bar'])

        col = 'range'
        # this is an unordered categorical, but we allow this ####
        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
    def test_groupby_sort_categorical_datetimelike(self):
        # GH 10505
        # use same data as test_groupby_sort_categorical, which category is
        # corresponding to datetime.month
        df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
                               datetime(2011, 2, 1), datetime(2011, 5, 1),
                               datetime(2011, 2, 1), datetime(2011, 1, 1),
                               datetime(2011, 5, 1)],
                        'foo': [10, 8, 5, 6, 4, 1, 7],
                        'bar': [10, 20, 30, 40, 50, 60, 70]},
                       columns=['dt', 'foo', 'bar'])

        # ordered=True
        df['dt'] = Categorical(df['dt'], ordered=True)
        index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
                 datetime(2011, 5, 1), datetime(2011, 7, 1)]
        result_sort = DataFrame(
            [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
        result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

        # sort=False expected: groups in order of first appearance in data
        index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                  columns=['foo', 'bar'])
        result_nosort.index = CategoricalIndex(index, categories=index,
                                               name='dt', ordered=True)

        col = 'dt'
        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        # when categories is ordered, group is ordered by category's order
        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

        # ordered = False
        df['dt'] = Categorical(df['dt'], ordered=False)
        index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
                 datetime(2011, 5, 1), datetime(2011, 7, 1)]
        result_sort = DataFrame(
            [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
        result_sort.index = CategoricalIndex(index, name='dt')

        index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                  columns=['foo', 'bar'])
        result_nosort.index = CategoricalIndex(index, categories=index,
                                               name='dt')

        col = 'dt'
        # unordered categories: sort=True sorts, sort=False keeps appearance
        # order
        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
    def test_groupby_sort_multiindex_series(self):
        # series multiindex groupby sort argument was not being passed through
        # _compress_group_index
        # GH 9444
        index = MultiIndex(levels=[[1, 2], [1, 2]],
                           labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
                           names=['a', 'b'])
        mseries = Series([0, 1, 2, 3, 4, 5], index=index)

        # expected first() values, keyed by (a, b) in order of appearance
        index = MultiIndex(levels=[[1, 2], [1, 2]],
                           labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b'])
        mseries_result = Series([0, 2, 4], index=index)

        # sort=False keeps appearance order; sort=True lexsorts the keys
        result = mseries.groupby(level=['a', 'b'], sort=False).first()
        assert_series_equal(result, mseries_result)
        result = mseries.groupby(level=['a', 'b'], sort=True).first()
        assert_series_equal(result, mseries_result.sort_index())
    def test_groupby_categorical(self):
        """Grouping by a Categorical yields a CategoricalIndex over all levels."""
        levels = ['foo', 'bar', 'baz', 'qux']
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))

        result = data.groupby(cats).mean()

        # same as grouping by the raw values, then reindexing onto a
        # CategoricalIndex that covers every category
        expected = data.groupby(np.asarray(cats)).mean()
        exp_idx = CategoricalIndex(levels, categories=cats.categories,
                                   ordered=True)
        expected = expected.reindex(exp_idx)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        # describe() should match describing the data pre-sorted by category
        idx = cats.codes.argsort()
        ord_labels = np.asarray(cats).take(idx)
        ord_data = data.take(idx)

        exp_cats = Categorical(ord_labels, ordered=True,
                               categories=['foo', 'bar', 'baz', 'qux'])
        expected = ord_data.groupby(exp_cats, sort=False).describe()
        expected.index.names = [None, None]
        assert_frame_equal(desc_result, expected)

        # GH 10460: outer index level is the categorical, inner is the
        # per-group describe() statistic names
        expc = Categorical.from_codes(np.arange(4).repeat(8),
                                      levels, ordered=True)
        exp = CategoricalIndex(expc)
        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
        exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                     '75%', 'max'] * 4)
        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
    def test_groupby_datetime_categorical(self):
        # GH9049: ensure backward compatibility
        # same checks as test_groupby_categorical, but with datetime categories
        levels = pd.date_range('2014-01-01', periods=4)
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))
        result = data.groupby(cats).mean()

        # equivalent to grouping by raw values then reindexing onto a
        # CategoricalIndex over all four dates
        expected = data.groupby(np.asarray(cats)).mean()
        expected = expected.reindex(levels)
        expected.index = CategoricalIndex(expected.index,
                                          categories=expected.index,
                                          ordered=True)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        # describe() should match describing the data pre-sorted by category
        idx = cats.codes.argsort()
        ord_labels = cats.take_nd(idx)
        ord_data = data.take(idx)
        expected = ord_data.groupby(ord_labels).describe()
        expected.index.names = [None, None]
        assert_frame_equal(desc_result, expected)
        tm.assert_index_equal(desc_result.index, expected.index)
        tm.assert_index_equal(
            desc_result.index.get_level_values(0),
            expected.index.get_level_values(0))

        # GH 10460: outer index level is the categorical, inner is the
        # per-group describe() statistic names
        expc = Categorical.from_codes(
            np.arange(4).repeat(8), levels, ordered=True)
        exp = CategoricalIndex(expc)
        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
        exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                     '75%', 'max'] * 4)
        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
  3292. def test_groupby_categorical_index(self):
  3293. levels = ['foo', 'bar', 'baz', 'qux']
  3294. codes = np.random.randint(0, 4, size=20)
  3295. cats = Categorical.from_codes(codes, levels, ordered=True)
  3296. df = DataFrame(
  3297. np.repeat(
  3298. np.arange(20), 4).reshape(-1, 4), columns=list('abcd'))
  3299. df['cats'] = cats
  3300. # with a cat index
  3301. result = df.set_index('cats').groupby(level=0).sum()
  3302. expected = df[list('abcd')].groupby(cats.codes).sum()
  3303. expected.index = CategoricalIndex(
  3304. Categorical.from_codes(
  3305. [0, 1, 2, 3], levels, ordered=True), name='cats')
  3306. assert_frame_equal(result, expected)
  3307. # with a cat column, should produce a cat index
  3308. result = df.groupby('cats').sum()
  3309. expected = df[list('abcd')].groupby(cats.codes).sum()
  3310. expected.index = CategoricalIndex(
  3311. Categorical.from_codes(
  3312. [0, 1, 2, 3], levels, ordered=True), name='cats')
  3313. assert_frame_equal(result, expected)
  3314. def test_groupby_describe_categorical_columns(self):
  3315. # GH 11558
  3316. cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
  3317. categories=['foo', 'bar', 'baz', 'qux'],
  3318. ordered=True)
  3319. df = DataFrame(np.random.randn(20, 4), columns=cats)
  3320. result = df.groupby([1, 2, 3, 4] * 5).describe()
  3321. tm.assert_index_equal(result.columns, cats)
  3322. tm.assert_categorical_equal(result.columns.values, cats.values)
  3323. def test_groupby_unstack_categorical(self):
  3324. # GH11558 (example is taken from the original issue)
  3325. df = pd.DataFrame({'a': range(10),
  3326. 'medium': ['A', 'B'] * 5,
  3327. 'artist': list('XYXXY') * 2})
  3328. df['medium'] = df['medium'].astype('category')
  3329. gcat = df.groupby(['artist', 'medium'])['a'].count().unstack()
  3330. result = gcat.describe()
  3331. exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
  3332. name='medium')
  3333. tm.assert_index_equal(result.columns, exp_columns)
  3334. tm.assert_categorical_equal(result.columns.values, exp_columns.values)
  3335. result = gcat['A'] + gcat['B']
  3336. expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
  3337. tm.assert_series_equal(result, expected)
  3338. def test_groupby_groups_datetimeindex(self):
  3339. # #1430
  3340. from pandas.tseries.api import DatetimeIndex
  3341. periods = 1000
  3342. ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
  3343. df = DataFrame({'high': np.arange(periods),
  3344. 'low': np.arange(periods)}, index=ind)
  3345. grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
  3346. # it works!
  3347. groups = grouped.groups
  3348. tm.assertIsInstance(list(groups.keys())[0], datetime)
    def test_groupby_groups_datetimeindex_tz(self):
        # GH 3950: tz-aware datetime grouping keys must keep their timezone
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'datetime': dates,
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})
        df['datetime'] = df['datetime'].apply(
            lambda d: Timestamp(d, tz='US/Pacific'))

        # expected index: (datetime, label) pairs, tz preserved on level 0
        exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                     '2011-07-19 07:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 09:00:00',
                                     '2011-07-19 09:00:00'],
                                    tz='US/Pacific', name='datetime')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(['datetime', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)

        # grouping by index level keeps the tz on the result index too
        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                    '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='Asia/Tokyo')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)
    def test_groupby_multi_timezone(self):
        # combining multiple / different timezones yields UTC
        data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

        df = pd.read_csv(StringIO(data), header=None,
                         names=['value', 'date', 'tz'])

        # localize each date to the timezone of its own group (x.name is
        # the tz string because we group by the 'tz' column)
        result = df.groupby('tz').date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name))

        # mixed timezones cannot share a DatetimeIndex dtype, so the
        # combined result is object dtype holding tz-aware Timestamps
        expected = Series([Timestamp('2000-01-28 16:47:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-29 16:48:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-30 16:49:00-0800',
                                     tz='America/Los_Angeles'),
                           Timestamp('2000-01-31 16:50:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-01 16:50:00-0500',
                                     tz='America/New_York')],
                          name='date',
                          dtype=object)
        assert_series_equal(result, expected)

        # get_group on a single tz preserves the original row positions
        tz = 'America/Chicago'
        res_values = df.groupby('tz').date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00',
                             '2000-01-31 16:50:00'],
                            index=[0, 1, 3], name='date')
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        assert_series_equal(result, expected)
    def test_groupby_groups_periods(self):
        # Period keys: same layout as the tz-aware datetime test, but the
        # grouping column / index holds hourly Periods
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'period': [pd.Period(d, freq='H') for d in dates],
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})

        # expected index: (period, label) pairs with the 'H' freq preserved
        exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00',
                                   '2011-07-19 07:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 09:00:00',
                                   '2011-07-19 09:00:00'],
                                  freq='H', name='period')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(['period', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq='H')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)

        exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00',
                                  '2011-07-19 08:00:00',
                                  '2011-07-19 09:00:00'], freq='H')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)
  3452. def test_groupby_reindex_inside_function(self):
  3453. from pandas.tseries.api import DatetimeIndex
  3454. periods = 1000
  3455. ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
  3456. df = DataFrame({'high': np.arange(
  3457. periods), 'low': np.arange(periods)}, index=ind)
  3458. def agg_before(hour, func, fix=False):
  3459. """
  3460. Run an aggregate func on the subset of data.
  3461. """
  3462. def _func(data):
  3463. d = data.select(lambda x: x.hour < 11).dropna()
  3464. if fix:
  3465. data[data.index[0]]
  3466. if len(d) == 0:
  3467. return None
  3468. return func(d)
  3469. return _func
  3470. def afunc(data):
  3471. d = data.select(lambda x: x.hour < 11).dropna()
  3472. return np.max(d)
  3473. grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
  3474. closure_bad = grouped.agg({'high': agg_before(11, np.max)})
  3475. closure_good = grouped.agg({'high': agg_before(11, np.max, True)})
  3476. assert_frame_equal(closure_bad, closure_good)
  3477. def test_multiindex_columns_empty_level(self):
  3478. l = [['count', 'values'], ['to filter', '']]
  3479. midx = MultiIndex.from_tuples(l)
  3480. df = DataFrame([[long(1), 'A']], columns=midx)
  3481. grouped = df.groupby('to filter').groups
  3482. self.assertEqual(grouped['A'], [0])
  3483. grouped = df.groupby([('to filter', '')]).groups
  3484. self.assertEqual(grouped['A'], [0])
  3485. df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx)
  3486. expected = df.groupby('to filter').groups
  3487. result = df.groupby([('to filter', '')]).groups
  3488. self.assertEqual(result, expected)
  3489. df = DataFrame([[long(1), 'A'], [long(2), 'A']], columns=midx)
  3490. expected = df.groupby('to filter').groups
  3491. result = df.groupby([('to filter', '')]).groups
  3492. self.assertEqual(result, expected)
  3493. def test_cython_median(self):
  3494. df = DataFrame(np.random.randn(1000))
  3495. df.values[::2] = np.nan
  3496. labels = np.random.randint(0, 50, size=1000).astype(float)
  3497. labels[::17] = np.nan
  3498. result = df.groupby(labels).median()
  3499. exp = df.groupby(labels).agg(nanops.nanmedian)
  3500. assert_frame_equal(result, exp)
  3501. df = DataFrame(np.random.randn(1000, 5))
  3502. rs = df.groupby(labels).agg(np.median)
  3503. xp = df.groupby(labels).median()
  3504. assert_frame_equal(rs, xp)
    def test_groupby_categorical_no_compress(self):
        """Categorical group keys are not compressed to the observed values."""
        data = Series(np.random.randn(9))
        codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
        cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean()

        exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        # category 2 has no observations but must still appear in the result
        codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
        cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean().reindex(cats.categories)
        exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c", "d"], ordered=True)
        data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

        result = data.groupby("b").mean()
        result = result["a"].values
        # the unused category 'd' yields NaN rather than being dropped
        exp = np.array([1, 2, 4, np.nan])
        self.assert_numpy_array_equal(result, exp)
    def test_groupby_non_arithmetic_agg_types(self):
        # GH9311, GH6620
        # non-arithmetic aggregations (first/last/min/max/nth/count) must
        # preserve the input dtype rather than upcasting
        df = pd.DataFrame([{'a': 1,
                            'b': 1}, {'a': 1,
                                      'b': 2}, {'a': 2,
                                                'b': 3}, {'a': 2,
                                                          'b': 4}])

        dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']

        # per-method expected rows; 'args' are extra positional arguments
        # for the method, 'out_type' overrides the expected result dtype
        # (count always returns int64)
        grp_exp = {'first': {'df': [{'a': 1,
                                     'b': 1}, {'a': 2,
                                               'b': 3}]},
                   'last': {'df': [{'a': 1,
                                    'b': 2}, {'a': 2,
                                              'b': 4}]},
                   'min': {'df': [{'a': 1,
                                   'b': 1}, {'a': 2,
                                             'b': 3}]},
                   'max': {'df': [{'a': 1,
                                   'b': 2}, {'a': 2,
                                             'b': 4}]},
                   'nth': {'df': [{'a': 1,
                                   'b': 2}, {'a': 2,
                                             'b': 4}],
                           'args': [1]},
                   'count': {'df': [{'a': 1,
                                     'b': 2}, {'a': 2,
                                               'b': 2}],
                             'out_type': 'int64'}}

        for dtype in dtypes:
            df_in = df.copy()
            df_in['b'] = df_in.b.astype(dtype)

            for method, data in compat.iteritems(grp_exp):
                if 'args' not in data:
                    data['args'] = []

                if 'out_type' in data:
                    out_type = data['out_type']
                else:
                    out_type = dtype

                exp = data['df']
                df_out = pd.DataFrame(exp)

                df_out['b'] = df_out.b.astype(out_type)
                df_out.set_index('a', inplace=True)

                grpd = df_in.groupby('a')
                t = getattr(grpd, method)(*data['args'])
                assert_frame_equal(t, df_out)
    def test_groupby_non_arithmetic_agg_intlike_precision(self):
        # GH9311, GH6620
        # int-like values above 2**53 (and datetime64 nanoseconds) would be
        # corrupted by a round-trip through float64; these aggregations must
        # keep them exact
        c = 24650000000000000

        inputs = ((Timestamp('2011-01-15 12:50:28.502376'),
                   Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c))

        for i in inputs:
            df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}])

            grp_exp = {'first': {'expected': i[0]},
                       'last': {'expected': i[1]},
                       'min': {'expected': i[0]},
                       'max': {'expected': i[1]},
                       'nth': {'expected': i[1],
                               'args': [1]},
                       'count': {'expected': 2}}

            for method, data in compat.iteritems(grp_exp):
                if 'args' not in data:
                    data['args'] = []

                grpd = df.groupby('a')
                res = getattr(grpd, method)(*data['args'])
                self.assertEqual(res.iloc[0].b, data['expected'])
  3593. def test_groupby_first_datetime64(self):
  3594. df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
  3595. df[1] = df[1].view('M8[ns]')
  3596. self.assertTrue(issubclass(df[1].dtype.type, np.datetime64))
  3597. result = df.groupby(level=0).first()
  3598. got_dt = result[1].dtype
  3599. self.assertTrue(issubclass(got_dt.type, np.datetime64))
  3600. result = df[1].groupby(level=0).first()
  3601. got_dt = result.dtype
  3602. self.assertTrue(issubclass(got_dt.type, np.datetime64))
  3603. def test_groupby_max_datetime64(self):
  3604. # GH 5869
  3605. # datetimelike dtype conversion from int
  3606. df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
  3607. expected = df.groupby('A')['A'].apply(lambda x: x.max())
  3608. result = df.groupby('A')['A'].max()
  3609. assert_series_equal(result, expected)
  3610. def test_groupby_datetime64_32_bit(self):
  3611. # GH 6410 / numpy 4328
  3612. # 32-bit under 1.9-dev indexing issue
  3613. df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2})
  3614. result = df.groupby("A")["B"].transform(min)
  3615. expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B')
  3616. assert_series_equal(result, expected)
  3617. def test_groupby_categorical_unequal_len(self):
  3618. # GH3011
  3619. series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
  3620. # The raises only happens with categorical, not with series of types
  3621. # category
  3622. bins = pd.cut(series.dropna().values, 4)
  3623. # len(bins) != len(series) here
  3624. self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
  3625. def test_groupby_multiindex_missing_pair(self):
  3626. # GH9049
  3627. df = DataFrame({'group1': ['a', 'a', 'a', 'b'],
  3628. 'group2': ['c', 'c', 'd', 'c'],
  3629. 'value': [1, 1, 1, 5]})
  3630. df = df.set_index(['group1', 'group2'])
  3631. df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
  3632. res = df_grouped.agg('sum')
  3633. idx = MultiIndex.from_tuples(
  3634. [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
  3635. exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
  3636. tm.assert_frame_equal(res, exp)
  3637. def test_groupby_multiindex_not_lexsorted(self):
  3638. # GH 11640
  3639. # define the lexsorted version
  3640. lexsorted_mi = MultiIndex.from_tuples(
  3641. [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
  3642. lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
  3643. self.assertTrue(lexsorted_df.columns.is_lexsorted())
  3644. # define the non-lexsorted version
  3645. not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
  3646. data=[[1, 'b1', 'c1', 3],
  3647. [1, 'b2', 'c2', 4]])
  3648. not_lexsorted_df = not_lexsorted_df.pivot_table(
  3649. index='a', columns=['b', 'c'], values='d')
  3650. not_lexsorted_df = not_lexsorted_df.reset_index()
  3651. self.assertFalse(not_lexsorted_df.columns.is_lexsorted())
  3652. # compare the results
  3653. tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
  3654. expected = lexsorted_df.groupby('a').mean()
  3655. with tm.assert_produces_warning(com.PerformanceWarning):
  3656. result = not_lexsorted_df.groupby('a').mean()
  3657. tm.assert_frame_equal(expected, result)
  3658. def test_groupby_levels_and_columns(self):
  3659. # GH9344, GH9049
  3660. idx_names = ['x', 'y']
  3661. idx = pd.MultiIndex.from_tuples(
  3662. [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
  3663. df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
  3664. by_levels = df.groupby(level=idx_names).mean()
  3665. # reset_index changes columns dtype to object
  3666. by_columns = df.reset_index().groupby(idx_names).mean()
  3667. tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)
  3668. by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
  3669. tm.assert_frame_equal(by_levels, by_columns)
  3670. def test_gb_apply_list_of_unequal_len_arrays(self):
  3671. # GH1738
  3672. df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a',
  3673. 'b', 'b', 'b'],
  3674. 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd',
  3675. 'd', 'd', 'e'],
  3676. 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
  3677. 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]})
  3678. df = df.set_index(['group1', 'group2'])
  3679. df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
  3680. def noddy(value, weight):
  3681. out = np.array(value * weight).repeat(3)
  3682. return out
  3683. # the kernel function returns arrays of unequal length
  3684. # pandas sniffs the first one, sees it's an array and not
  3685. # a list, and assumed the rest are of equal length
  3686. # and so tries a vstack
  3687. # don't die
  3688. df_grouped.apply(lambda x: noddy(x.value, x.weight))
  3689. def test_groupby_with_empty(self):
  3690. index = pd.DatetimeIndex(())
  3691. data = ()
  3692. series = pd.Series(data, index)
  3693. grouper = pd.tseries.resample.TimeGrouper('D')
  3694. grouped = series.groupby(grouper)
  3695. assert next(iter(grouped), None) is None
  3696. def test_groupby_with_single_column(self):
  3697. df = pd.DataFrame({'a': list('abssbab')})
  3698. tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]])
  3699. # GH 13530
  3700. exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a'))
  3701. tm.assert_frame_equal(df.groupby('a').count(), exp)
  3702. tm.assert_frame_equal(df.groupby('a').sum(), exp)
  3703. tm.assert_frame_equal(df.groupby('a').nth(1), exp)
    def test_groupby_with_small_elem(self):
        """GH 8542: groupby([TimeGrouper, column]) with very few rows.

        Checks group count, membership of (month-end Timestamp, label)
        keys, and get_group round-trips for two- and three-row frames.
        """
        # two rows in two different months -> two groups
        df = pd.DataFrame({'event': ['start', 'start'],
                           'change': [1234, 5678]},
                          index=pd.DatetimeIndex(['2014-09-10',
                                                  '2013-10-10']))
        grouped = df.groupby([pd.TimeGrouper(freq='M'), 'event'])
        self.assertEqual(len(grouped.groups), 2)
        self.assertEqual(grouped.ngroups, 2)
        self.assertIn((pd.Timestamp('2014-09-30'), 'start'), grouped.groups)
        self.assertIn((pd.Timestamp('2013-10-31'), 'start'), grouped.groups)

        res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[0], :])
        res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[1], :])

        # three rows, two of them in the same month -> still two groups
        df = pd.DataFrame({'event': ['start', 'start', 'start'],
                           'change': [1234, 5678, 9123]},
                          index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
                                                  '2014-09-15']))
        grouped = df.groupby([pd.TimeGrouper(freq='M'), 'event'])
        self.assertEqual(len(grouped.groups), 2)
        self.assertEqual(grouped.ngroups, 2)
        self.assertIn((pd.Timestamp('2014-09-30'), 'start'), grouped.groups)
        self.assertIn((pd.Timestamp('2013-10-31'), 'start'), grouped.groups)

        res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[0, 2], :])
        res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[1], :])

        # three rows in three distinct months -> three groups
        df = pd.DataFrame({'event': ['start', 'start', 'start'],
                           'change': [1234, 5678, 9123]},
                          index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
                                                  '2014-08-05']))
        grouped = df.groupby([pd.TimeGrouper(freq='M'), 'event'])
        self.assertEqual(len(grouped.groups), 3)
        self.assertEqual(grouped.ngroups, 3)
        self.assertIn((pd.Timestamp('2014-09-30'), 'start'), grouped.groups)
        self.assertIn((pd.Timestamp('2013-10-31'), 'start'), grouped.groups)
        self.assertIn((pd.Timestamp('2014-08-31'), 'start'), grouped.groups)

        res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[0], :])
        res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[1], :])
        res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[2], :])
  3749. def test_groupby_with_timezone_selection(self):
  3750. # GH 11616
  3751. # Test that column selection returns output in correct timezone.
  3752. np.random.seed(42)
  3753. df = pd.DataFrame({
  3754. 'factor': np.random.randint(0, 3, size=60),
  3755. 'time': pd.date_range('01/01/2000 00:00', periods=60,
  3756. freq='s', tz='UTC')
  3757. })
  3758. df1 = df.groupby('factor').max()['time']
  3759. df2 = df.groupby('factor')['time'].max()
  3760. tm.assert_series_equal(df1, df2)
  3761. def test_timezone_info(self):
  3762. # GH 11682
  3763. # Timezone info lost when broadcasting scalar datetime to DataFrame
  3764. tm._skip_if_no_pytz()
  3765. import pytz
  3766. df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]})
  3767. self.assertEqual(df['b'][0].tzinfo, pytz.utc)
  3768. df = pd.DataFrame({'a': [1, 2, 3]})
  3769. df['b'] = datetime.now(pytz.utc)
  3770. self.assertEqual(df['b'][0].tzinfo, pytz.utc)
    def test_groupby_with_timegrouper(self):
        """GH 4161: TimeGrouper requires a sorted index.

        resample, groupby on a pre-sorted index, and groupby on an
        unsorted index must all produce the same 5-day sums, and the
        resulting index must keep its name ('Date').
        """
        import datetime as DT
        df_original = DataFrame({
            'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [
                DT.datetime(2013, 9, 1, 13, 0),
                DT.datetime(2013, 9, 1, 13, 5),
                DT.datetime(2013, 10, 1, 20, 0),
                DT.datetime(2013, 10, 3, 10, 0),
                DT.datetime(2013, 12, 2, 12, 0),
                DT.datetime(2013, 9, 2, 14, 0),
            ]
        })

        # GH 6908: also exercise a frame whose rows are out of date order
        df_reordered = df_original.sort_values(by='Quantity')

        for df in [df_original, df_reordered]:
            df = df.set_index(['Date'])

            # NaN in every 5-day bin except the three that contain data
            expected = DataFrame(
                {'Quantity': np.nan},
                index=date_range('20130901 13:00:00',
                                 '20131205 13:00:00', freq='5D',
                                 name='Date', closed='left'))
            expected.iloc[[0, 6, 18], 0] = np.array(
                [24., 6., 9.], dtype='float64')

            result1 = df.resample('5D').sum()
            assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
            assert_frame_equal(result2, expected)

            # unsorted index goes through the same path successfully
            result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
            assert_frame_equal(result3, expected)
  3806. def test_groupby_with_timegrouper_methods(self):
  3807. # GH 3881
  3808. # make sure API of timegrouper conforms
  3809. import datetime as DT
  3810. df_original = pd.DataFrame({
  3811. 'Branch': 'A A A A A B'.split(),
  3812. 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
  3813. 'Quantity': [1, 3, 5, 8, 9, 3],
  3814. 'Date': [
  3815. DT.datetime(2013, 1, 1, 13, 0),
  3816. DT.datetime(2013, 1, 1, 13, 5),
  3817. DT.datetime(2013, 10, 1, 20, 0),
  3818. DT.datetime(2013, 10, 2, 10, 0),
  3819. DT.datetime(2013, 12, 2, 12, 0),
  3820. DT.datetime(2013, 12, 2, 14, 0),
  3821. ]
  3822. })
  3823. df_sorted = df_original.sort_values(by='Quantity', ascending=False)
  3824. for df in [df_original, df_sorted]:
  3825. df = df.set_index('Date', drop=False)
  3826. g = df.groupby(pd.TimeGrouper('6M'))
  3827. self.assertTrue(g.group_keys)
  3828. self.assertTrue(isinstance(g.grouper, pd.core.groupby.BinGrouper))
  3829. groups = g.groups
  3830. self.assertTrue(isinstance(groups, dict))
  3831. self.assertTrue(len(groups) == 3)
    def test_timegrouper_with_reg_groups(self):
        """GH 3794: allow combination of a TimeGrouper with regular groups.

        Exercises Grouper-frequency + column groupings on sorted and
        unsorted input, the key=/level= spellings, their error cases,
        single groupers, and (GH 6764) multiple groupings with and
        without a pre-sorted index.
        """
        import datetime as DT

        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                DT.datetime(2013, 1, 1, 13, 0),
                DT.datetime(2013, 1, 1, 13, 5),
                DT.datetime(2013, 10, 1, 20, 0),
                DT.datetime(2013, 10, 2, 10, 0),
                DT.datetime(2013, 10, 1, 20, 0),
                DT.datetime(2013, 10, 2, 10, 0),
                DT.datetime(2013, 12, 2, 12, 0),
                DT.datetime(2013, 12, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)

        for df in [df_original, df_sorted]:
            # annual bins: everything lands in the 2013 year-end bucket
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    DT.datetime(2013, 12, 31, 0, 0),
                    DT.datetime(2013, 12, 31, 0, 0),
                    DT.datetime(2013, 12, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            # six-month, start-anchored bins
            expected = DataFrame({
                'Buyer': 'Carl Mark Carl Joe'.split(),
                'Quantity': [1, 3, 9, 18],
                'Date': [
                    DT.datetime(2013, 1, 1, 0, 0),
                    DT.datetime(2013, 1, 1, 0, 0),
                    DT.datetime(2013, 7, 1, 0, 0),
                    DT.datetime(2013, 7, 1, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

        # same exercise on a frame spanning only two October days
        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                DT.datetime(2013, 10, 1, 13, 0),
                DT.datetime(2013, 10, 1, 13, 5),
                DT.datetime(2013, 10, 1, 20, 0),
                DT.datetime(2013, 10, 2, 10, 0),
                DT.datetime(2013, 10, 1, 20, 0),
                DT.datetime(2013, 10, 2, 10, 0),
                DT.datetime(2013, 10, 2, 12, 0),
                DT.datetime(2013, 10, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
        for df in [df_original, df_sorted]:
            # daily bins
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark Carl Joe'.split(),
                'Quantity': [6, 8, 3, 4, 10],
                'Date': [
                    DT.datetime(2013, 10, 1, 0, 0),
                    DT.datetime(2013, 10, 1, 0, 0),
                    DT.datetime(2013, 10, 1, 0, 0),
                    DT.datetime(2013, 10, 2, 0, 0),
                    DT.datetime(2013, 10, 2, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            # monthly bins
            result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    DT.datetime(2013, 10, 31, 0, 0),
                    DT.datetime(2013, 10, 31, 0, 0),
                    DT.datetime(2013, 10, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # passing the name (key=) after moving dates into a column
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            assert_frame_equal(result, expected)

            # an unknown key raises KeyError
            with self.assertRaises(KeyError):
                df.groupby([pd.Grouper(freq='1M', key='foo'),
                            'Buyer']).sum()

            # passing the level, by name or by position
            df = df.set_index('Date')
            result = df.groupby([pd.Grouper(freq='1M', level='Date'),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M', level=0),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            # an unknown level raises ValueError
            with self.assertRaises(ValueError):
                df.groupby([pd.Grouper(freq='1M', level='foo'),
                            'Buyer']).sum()

            # multi names: 'Date' exists both as index level and as a
            # column (shifted two month-ends ahead); key= must pick the
            # column
            df = df.copy()
            df['Date'] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    DT.datetime(2013, 11, 30, 0, 0),
                    DT.datetime(2013, 11, 30, 0, 0),
                    DT.datetime(2013, 11, 30, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with self.assertRaises(ValueError):
                df.groupby([pd.Grouper(freq='1M', key='Date',
                                       level='Date'), 'Buyer']).sum()

            # single groupers: level-based (index dates -> October bucket)
            expected = DataFrame({'Quantity': [31],
                                  'Date': [DT.datetime(2013, 10, 31, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M')]).sum()
            assert_frame_equal(result, expected)

            # ...and key-based (the shifted column -> November bucket)
            expected = DataFrame({'Quantity': [31],
                                  'Date': [DT.datetime(2013, 11, 30, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
            assert_frame_equal(result, expected)

        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        for freq in ['D', 'M', 'A', 'Q-APR']:
            # resample-based reference result
            expected = df.groupby('user_id')[
                'whole_cost'].resample(
                    freq).sum().dropna().reorder_levels(
                        ['date', 'user_id']).sortlevel().astype('int64')
            expected.name = 'whole_cost'

            result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq),
                                               'user_id'])['whole_cost'].sum()
            assert_series_equal(result1, expected)

            # same grouping without pre-sorting the index
            result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])[
                'whole_cost'].sum()
            assert_series_equal(result2, expected)
    def test_timegrouper_get_group(self):
        """GH 6914: get_group works with a Grouper key/level, whether the
        frame is sorted or not, for single and multiple groupings."""
        df_original = DataFrame({
            'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [datetime(2013, 9, 1, 13, 0),
                     datetime(2013, 9, 1, 13, 5),
                     datetime(2013, 10, 1, 20, 0),
                     datetime(2013, 10, 3, 10, 0),
                     datetime(2013, 12, 2, 12, 0),
                     datetime(2013, 9, 2, 14, 0), ]
        })
        df_reordered = df_original.sort_values(by='Quantity')

        # single grouping: the group label is the month-end Timestamp
        expected_list = [df_original.iloc[[0, 1, 5]],
                         df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]
        dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping: the label is a (Buyer, month-end) tuple
        expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                         df_original.iloc[[4]]]
        g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
                  ('Joe', '2013-12-31')]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with the dates as the index (Grouper by level rather than key)
        df_original = df_original.set_index('Date')
        df_reordered = df_original.sort_values(by='Quantity')

        expected_list = [df_original.iloc[[0, 1, 5]],
                         df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)
  4038. def test_timegrouper_apply_return_type_series(self):
  4039. # Using `apply` with the `TimeGrouper` should give the
  4040. # same return type as an `apply` with a `Grouper`.
  4041. # Issue #11742
  4042. df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
  4043. 'value': [10, 13]})
  4044. df_dt = df.copy()
  4045. df_dt['date'] = pd.to_datetime(df_dt['date'])
  4046. def sumfunc_series(x):
  4047. return pd.Series([x['value'].sum()], ('sum',))
  4048. expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series)
  4049. result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date'))
  4050. .apply(sumfunc_series))
  4051. assert_frame_equal(result.reset_index(drop=True),
  4052. expected.reset_index(drop=True))
  4053. def test_timegrouper_apply_return_type_value(self):
  4054. # Using `apply` with the `TimeGrouper` should give the
  4055. # same return type as an `apply` with a `Grouper`.
  4056. # Issue #11742
  4057. df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
  4058. 'value': [10, 13]})
  4059. df_dt = df.copy()
  4060. df_dt['date'] = pd.to_datetime(df_dt['date'])
  4061. def sumfunc_value(x):
  4062. return x.value.sum()
  4063. expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value)
  4064. result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date'))
  4065. .apply(sumfunc_value))
  4066. assert_series_equal(result.reset_index(drop=True),
  4067. expected.reset_index(drop=True))
  4068. def test_cumcount(self):
  4069. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
  4070. g = df.groupby('A')
  4071. sg = g.A
  4072. expected = Series([0, 1, 2, 0, 3])
  4073. assert_series_equal(expected, g.cumcount())
  4074. assert_series_equal(expected, sg.cumcount())
  4075. def test_cumcount_empty(self):
  4076. ge = DataFrame().groupby(level=0)
  4077. se = Series().groupby(level=0)
  4078. # edge case, as this is usually considered float
  4079. e = Series(dtype='int64')
  4080. assert_series_equal(e, ge.cumcount())
  4081. assert_series_equal(e, se.cumcount())
  4082. def test_cumcount_dupe_index(self):
  4083. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
  4084. index=[0] * 5)
  4085. g = df.groupby('A')
  4086. sg = g.A
  4087. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  4088. assert_series_equal(expected, g.cumcount())
  4089. assert_series_equal(expected, sg.cumcount())
  4090. def test_cumcount_mi(self):
  4091. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  4092. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
  4093. index=mi)
  4094. g = df.groupby('A')
  4095. sg = g.A
  4096. expected = Series([0, 1, 2, 0, 3], index=mi)
  4097. assert_series_equal(expected, g.cumcount())
  4098. assert_series_equal(expected, sg.cumcount())
  4099. def test_cumcount_groupby_not_col(self):
  4100. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
  4101. index=[0] * 5)
  4102. g = df.groupby([0, 0, 0, 1, 0])
  4103. sg = g.A
  4104. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  4105. assert_series_equal(expected, g.cumcount())
  4106. assert_series_equal(expected, sg.cumcount())
  4107. def test_filter_series(self):
  4108. s = pd.Series([1, 3, 20, 5, 22, 24, 7])
  4109. expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
  4110. expected_even = pd.Series([20, 22, 24], index=[2, 4, 5])
  4111. grouper = s.apply(lambda x: x % 2)
  4112. grouped = s.groupby(grouper)
  4113. assert_series_equal(
  4114. grouped.filter(lambda x: x.mean() < 10), expected_odd)
  4115. assert_series_equal(
  4116. grouped.filter(lambda x: x.mean() > 10), expected_even)
  4117. # Test dropna=False.
  4118. assert_series_equal(
  4119. grouped.filter(lambda x: x.mean() < 10, dropna=False),
  4120. expected_odd.reindex(s.index))
  4121. assert_series_equal(
  4122. grouped.filter(lambda x: x.mean() > 10, dropna=False),
  4123. expected_even.reindex(s.index))
  4124. def test_filter_single_column_df(self):
  4125. df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
  4126. expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
  4127. expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5])
  4128. grouper = df[0].apply(lambda x: x % 2)
  4129. grouped = df.groupby(grouper)
  4130. assert_frame_equal(
  4131. grouped.filter(lambda x: x.mean() < 10), expected_odd)
  4132. assert_frame_equal(
  4133. grouped.filter(lambda x: x.mean() > 10), expected_even)
  4134. # Test dropna=False.
  4135. assert_frame_equal(
  4136. grouped.filter(lambda x: x.mean() < 10, dropna=False),
  4137. expected_odd.reindex(df.index))
  4138. assert_frame_equal(
  4139. grouped.filter(lambda x: x.mean() > 10, dropna=False),
  4140. expected_even.reindex(df.index))
  4141. def test_filter_multi_column_df(self):
  4142. df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]})
  4143. grouper = df['A'].apply(lambda x: x % 2)
  4144. grouped = df.groupby(grouper)
  4145. expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2])
  4146. assert_frame_equal(
  4147. grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10),
  4148. expected)
  4149. def test_filter_mixed_df(self):
  4150. df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
  4151. grouper = df['A'].apply(lambda x: x % 2)
  4152. grouped = df.groupby(grouper)
  4153. expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2])
  4154. assert_frame_equal(
  4155. grouped.filter(lambda x: x['A'].sum() > 10), expected)
  4156. def test_filter_out_all_groups(self):
  4157. s = pd.Series([1, 3, 20, 5, 22, 24, 7])
  4158. grouper = s.apply(lambda x: x % 2)
  4159. grouped = s.groupby(grouper)
  4160. assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
  4161. df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
  4162. grouper = df['A'].apply(lambda x: x % 2)
  4163. grouped = df.groupby(grouper)
  4164. assert_frame_equal(
  4165. grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]])
  4166. def test_filter_out_no_groups(self):
  4167. s = pd.Series([1, 3, 20, 5, 22, 24, 7])
  4168. grouper = s.apply(lambda x: x % 2)
  4169. grouped = s.groupby(grouper)
  4170. filtered = grouped.filter(lambda x: x.mean() > 0)
  4171. assert_series_equal(filtered, s)
  4172. df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
  4173. grouper = df['A'].apply(lambda x: x % 2)
  4174. grouped = df.groupby(grouper)
  4175. filtered = grouped.filter(lambda x: x['A'].mean() > 0)
  4176. assert_frame_equal(filtered, df)
  4177. def test_filter_out_all_groups_in_df(self):
  4178. # GH12768
  4179. df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
  4180. res = df.groupby('a')
  4181. res = res.filter(lambda x: x['b'].sum() > 5, dropna=False)
  4182. expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3})
  4183. assert_frame_equal(expected, res)
  4184. df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
  4185. res = df.groupby('a')
  4186. res = res.filter(lambda x: x['b'].sum() > 5, dropna=True)
  4187. expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64")
  4188. assert_frame_equal(expected, res)
  4189. def test_filter_condition_raises(self):
  4190. def raise_if_sum_is_zero(x):
  4191. if x.sum() == 0:
  4192. raise ValueError
  4193. else:
  4194. return x.sum() > 0
  4195. s = pd.Series([-1, 0, 1, 2])
  4196. grouper = s.apply(lambda x: x % 2)
  4197. grouped = s.groupby(grouper)
  4198. self.assertRaises(TypeError,
  4199. lambda: grouped.filter(raise_if_sum_is_zero))
  4200. def test_filter_with_axis_in_groupby(self):
  4201. # issue 11041
  4202. index = pd.MultiIndex.from_product([range(10), [0, 1]])
  4203. data = pd.DataFrame(
  4204. np.arange(100).reshape(-1, 20), columns=index, dtype='int64')
  4205. result = data.groupby(level=0,
  4206. axis=1).filter(lambda x: x.iloc[0, 0] > 10)
  4207. expected = data.iloc[:, 12:20]
  4208. assert_frame_equal(result, expected)
  4209. def test_filter_bad_shapes(self):
  4210. df = DataFrame({'A': np.arange(8),
  4211. 'B': list('aabbbbcc'),
  4212. 'C': np.arange(8)})
  4213. s = df['B']
  4214. g_df = df.groupby('B')
  4215. g_s = s.groupby(s)
  4216. f = lambda x: x
  4217. self.assertRaises(TypeError, lambda: g_df.filter(f))
  4218. self.assertRaises(TypeError, lambda: g_s.filter(f))
  4219. f = lambda x: x == 1
  4220. self.assertRaises(TypeError, lambda: g_df.filter(f))
  4221. self.assertRaises(TypeError, lambda: g_s.filter(f))
  4222. f = lambda x: np.outer(x, x)
  4223. self.assertRaises(TypeError, lambda: g_df.filter(f))
  4224. self.assertRaises(TypeError, lambda: g_s.filter(f))
  4225. def test_filter_nan_is_false(self):
  4226. df = DataFrame({'A': np.arange(8),
  4227. 'B': list('aabbbbcc'),
  4228. 'C': np.arange(8)})
  4229. s = df['B']
  4230. g_df = df.groupby(df['B'])
  4231. g_s = s.groupby(s)
  4232. f = lambda x: np.nan
  4233. assert_frame_equal(g_df.filter(f), df.loc[[]])
  4234. assert_series_equal(g_s.filter(f), s[[]])
    def test_filter_against_workaround(self):
        """filter() must agree with the transform(...).astype('bool') mask
        workaround, for int series, float series, and mixed frames.

        NOTE: the RNG is seeded and the statements consume draws in order,
        so the statement sequence must not be reordered.
        """
        np.random.seed(0)
        # Series of ints
        s = Series(np.random.randint(0, 100, 1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10
        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Series of floats
        s = 100 * Series(np.random.random(1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10
        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Set up DataFrame of ints, floats, strings.
        from string import ascii_lowercase
        letters = np.array(list(ascii_lowercase))
        N = 1000
        random_letters = letters.take(np.random.randint(0, 26, N))
        df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                        'floats': N / 10 * Series(np.random.random(N)),
                        'letters': Series(random_letters)})

        # Group by ints; filter on floats.
        grouped = df.groupby('ints')
        old_way = df[grouped.floats.
                     transform(lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)

        # Group by floats (rounded); filter on strings.
        grouper = df.floats.apply(lambda x: np.round(x, -1))
        grouped = df.groupby(grouper)
        old_way = df[grouped.letters.
                     transform(lambda x: len(x) < N / 10).astype('bool')]
        new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
        assert_frame_equal(new_way, old_way)

        # Group by strings; filter on ints.
        grouped = df.groupby('letters')
        old_way = df[grouped.ints.
                     transform(lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)
  4280. def test_filter_using_len(self):
  4281. # BUG GH4447
  4282. df = DataFrame({'A': np.arange(8),
  4283. 'B': list('aabbbbcc'),
  4284. 'C': np.arange(8)})
  4285. grouped = df.groupby('B')
  4286. actual = grouped.filter(lambda x: len(x) > 2)
  4287. expected = DataFrame(
  4288. {'A': np.arange(2, 6),
  4289. 'B': list('bbbb'),
  4290. 'C': np.arange(2, 6)}, index=np.arange(2, 6))
  4291. assert_frame_equal(actual, expected)
  4292. actual = grouped.filter(lambda x: len(x) > 4)
  4293. expected = df.ix[[]]
  4294. assert_frame_equal(actual, expected)
  4295. # Series have always worked properly, but we'll test anyway.
  4296. s = df['B']
  4297. grouped = s.groupby(s)
  4298. actual = grouped.filter(lambda x: len(x) > 2)
  4299. expected = Series(4 * ['b'], index=np.arange(2, 6), name='B')
  4300. assert_series_equal(actual, expected)
  4301. actual = grouped.filter(lambda x: len(x) > 4)
  4302. expected = s[[]]
  4303. assert_series_equal(actual, expected)
  4304. def test_filter_maintains_ordering(self):
  4305. # Simple case: index is sequential. #4621
  4306. df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
  4307. 'tag': [23, 45, 62, 24, 45, 34, 25, 62]})
  4308. s = df['pid']
  4309. grouped = df.groupby('tag')
  4310. actual = grouped.filter(lambda x: len(x) > 1)
  4311. expected = df.iloc[[1, 2, 4, 7]]
  4312. assert_frame_equal(actual, expected)
  4313. grouped = s.groupby(df['tag'])
  4314. actual = grouped.filter(lambda x: len(x) > 1)
  4315. expected = s.iloc[[1, 2, 4, 7]]
  4316. assert_series_equal(actual, expected)
  4317. # Now index is sequentially decreasing.
  4318. df.index = np.arange(len(df) - 1, -1, -1)
  4319. s = df['pid']
  4320. grouped = df.groupby('tag')
  4321. actual = grouped.filter(lambda x: len(x) > 1)
  4322. expected = df.iloc[[1, 2, 4, 7]]
  4323. assert_frame_equal(actual, expected)
  4324. grouped = s.groupby(df['tag'])
  4325. actual = grouped.filter(lambda x: len(x) > 1)
  4326. expected = s.iloc[[1, 2, 4, 7]]
  4327. assert_series_equal(actual, expected)
  4328. # Index is shuffled.
  4329. SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
  4330. df.index = df.index[SHUFFLED]
  4331. s = df['pid']
  4332. grouped = df.groupby('tag')
  4333. actual = grouped.filter(lambda x: len(x) > 1)
  4334. expected = df.iloc[[1, 2, 4, 7]]
  4335. assert_frame_equal(actual, expected)
  4336. grouped = s.groupby(df['tag'])
  4337. actual = grouped.filter(lambda x: len(x) > 1)
  4338. expected = s.iloc[[1, 2, 4, 7]]
  4339. assert_series_equal(actual, expected)
  4340. def test_filter_multiple_timestamp(self):
  4341. # GH 10114
  4342. df = DataFrame({'A': np.arange(5, dtype='int64'),
  4343. 'B': ['foo', 'bar', 'foo', 'bar', 'bar'],
  4344. 'C': Timestamp('20130101')})
  4345. grouped = df.groupby(['B', 'C'])
  4346. result = grouped['A'].filter(lambda x: True)
  4347. assert_series_equal(df['A'], result)
  4348. result = grouped['A'].transform(len)
  4349. expected = Series([2, 3, 2, 3, 3], name='A')
  4350. assert_series_equal(result, expected)
  4351. result = grouped.filter(lambda x: True)
  4352. assert_frame_equal(df, result)
  4353. result = grouped.transform('sum')
  4354. expected = DataFrame({'A': [2, 8, 2, 8, 8]})
  4355. assert_frame_equal(result, expected)
  4356. result = grouped.transform(len)
  4357. expected = DataFrame({'A': [2, 3, 2, 3, 3]})
  4358. assert_frame_equal(result, expected)
  4359. def test_filter_and_transform_with_non_unique_int_index(self):
  4360. # GH4620
  4361. index = [1, 1, 1, 2, 1, 1, 0, 1]
  4362. df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
  4363. 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
  4364. grouped_df = df.groupby('tag')
  4365. ser = df['pid']
  4366. grouped_ser = ser.groupby(df['tag'])
  4367. expected_indexes = [1, 2, 4, 7]
  4368. # Filter DataFrame
  4369. actual = grouped_df.filter(lambda x: len(x) > 1)
  4370. expected = df.iloc[expected_indexes]
  4371. assert_frame_equal(actual, expected)
  4372. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  4373. expected = df.copy()
  4374. expected.iloc[[0, 3, 5, 6]] = np.nan
  4375. assert_frame_equal(actual, expected)
  4376. # Filter Series
  4377. actual = grouped_ser.filter(lambda x: len(x) > 1)
  4378. expected = ser.take(expected_indexes)
  4379. assert_series_equal(actual, expected)
  4380. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  4381. NA = np.nan
  4382. expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
  4383. # ^ made manually because this can get confusing!
  4384. assert_series_equal(actual, expected)
  4385. # Transform Series
  4386. actual = grouped_ser.transform(len)
  4387. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
  4388. assert_series_equal(actual, expected)
  4389. # Transform (a column from) DataFrameGroupBy
  4390. actual = grouped_df.pid.transform(len)
  4391. assert_series_equal(actual, expected)
  4392. def test_filter_and_transform_with_multiple_non_unique_int_index(self):
  4393. # GH4620
  4394. index = [1, 1, 1, 2, 0, 0, 0, 1]
  4395. df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
  4396. 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
  4397. grouped_df = df.groupby('tag')
  4398. ser = df['pid']
  4399. grouped_ser = ser.groupby(df['tag'])
  4400. expected_indexes = [1, 2, 4, 7]
  4401. # Filter DataFrame
  4402. actual = grouped_df.filter(lambda x: len(x) > 1)
  4403. expected = df.iloc[expected_indexes]
  4404. assert_frame_equal(actual, expected)
  4405. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  4406. expected = df.copy()
  4407. expected.iloc[[0, 3, 5, 6]] = np.nan
  4408. assert_frame_equal(actual, expected)
  4409. # Filter Series
  4410. actual = grouped_ser.filter(lambda x: len(x) > 1)
  4411. expected = ser.take(expected_indexes)
  4412. assert_series_equal(actual, expected)
  4413. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  4414. NA = np.nan
  4415. expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
  4416. # ^ made manually because this can get confusing!
  4417. assert_series_equal(actual, expected)
  4418. # Transform Series
  4419. actual = grouped_ser.transform(len)
  4420. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
  4421. assert_series_equal(actual, expected)
  4422. # Transform (a column from) DataFrameGroupBy
  4423. actual = grouped_df.pid.transform(len)
  4424. assert_series_equal(actual, expected)
  4425. def test_filter_and_transform_with_non_unique_float_index(self):
  4426. # GH4620
  4427. index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
  4428. df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
  4429. 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
  4430. grouped_df = df.groupby('tag')
  4431. ser = df['pid']
  4432. grouped_ser = ser.groupby(df['tag'])
  4433. expected_indexes = [1, 2, 4, 7]
  4434. # Filter DataFrame
  4435. actual = grouped_df.filter(lambda x: len(x) > 1)
  4436. expected = df.iloc[expected_indexes]
  4437. assert_frame_equal(actual, expected)
  4438. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  4439. expected = df.copy()
  4440. expected.iloc[[0, 3, 5, 6]] = np.nan
  4441. assert_frame_equal(actual, expected)
  4442. # Filter Series
  4443. actual = grouped_ser.filter(lambda x: len(x) > 1)
  4444. expected = ser.take(expected_indexes)
  4445. assert_series_equal(actual, expected)
  4446. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  4447. NA = np.nan
  4448. expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
  4449. # ^ made manually because this can get confusing!
  4450. assert_series_equal(actual, expected)
  4451. # Transform Series
  4452. actual = grouped_ser.transform(len)
  4453. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
  4454. assert_series_equal(actual, expected)
  4455. # Transform (a column from) DataFrameGroupBy
  4456. actual = grouped_df.pid.transform(len)
  4457. assert_series_equal(actual, expected)
  4458. def test_filter_and_transform_with_non_unique_timestamp_index(self):
  4459. # GH4620
  4460. t0 = Timestamp('2013-09-30 00:05:00')
  4461. t1 = Timestamp('2013-10-30 00:05:00')
  4462. t2 = Timestamp('2013-11-30 00:05:00')
  4463. index = [t1, t1, t1, t2, t1, t1, t0, t1]
  4464. df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
  4465. 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
  4466. grouped_df = df.groupby('tag')
  4467. ser = df['pid']
  4468. grouped_ser = ser.groupby(df['tag'])
  4469. expected_indexes = [1, 2, 4, 7]
  4470. # Filter DataFrame
  4471. actual = grouped_df.filter(lambda x: len(x) > 1)
  4472. expected = df.iloc[expected_indexes]
  4473. assert_frame_equal(actual, expected)
  4474. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  4475. expected = df.copy()
  4476. expected.iloc[[0, 3, 5, 6]] = np.nan
  4477. assert_frame_equal(actual, expected)
  4478. # Filter Series
  4479. actual = grouped_ser.filter(lambda x: len(x) > 1)
  4480. expected = ser.take(expected_indexes)
  4481. assert_series_equal(actual, expected)
  4482. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  4483. NA = np.nan
  4484. expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
  4485. # ^ made manually because this can get confusing!
  4486. assert_series_equal(actual, expected)
  4487. # Transform Series
  4488. actual = grouped_ser.transform(len)
  4489. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
  4490. assert_series_equal(actual, expected)
  4491. # Transform (a column from) DataFrameGroupBy
  4492. actual = grouped_df.pid.transform(len)
  4493. assert_series_equal(actual, expected)
  4494. def test_filter_and_transform_with_non_unique_string_index(self):
  4495. # GH4620
  4496. index = list('bbbcbbab')
  4497. df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
  4498. 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
  4499. grouped_df = df.groupby('tag')
  4500. ser = df['pid']
  4501. grouped_ser = ser.groupby(df['tag'])
  4502. expected_indexes = [1, 2, 4, 7]
  4503. # Filter DataFrame
  4504. actual = grouped_df.filter(lambda x: len(x) > 1)
  4505. expected = df.iloc[expected_indexes]
  4506. assert_frame_equal(actual, expected)
  4507. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  4508. expected = df.copy()
  4509. expected.iloc[[0, 3, 5, 6]] = np.nan
  4510. assert_frame_equal(actual, expected)
  4511. # Filter Series
  4512. actual = grouped_ser.filter(lambda x: len(x) > 1)
  4513. expected = ser.take(expected_indexes)
  4514. assert_series_equal(actual, expected)
  4515. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  4516. NA = np.nan
  4517. expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
  4518. # ^ made manually because this can get confusing!
  4519. assert_series_equal(actual, expected)
  4520. # Transform Series
  4521. actual = grouped_ser.transform(len)
  4522. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
  4523. assert_series_equal(actual, expected)
  4524. # Transform (a column from) DataFrameGroupBy
  4525. actual = grouped_df.pid.transform(len)
  4526. assert_series_equal(actual, expected)
  4527. def test_filter_has_access_to_grouped_cols(self):
  4528. df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B'])
  4529. g = df.groupby('A')
  4530. # previously didn't have access to col A #????
  4531. filt = g.filter(lambda x: x['A'].sum() == 2)
  4532. assert_frame_equal(filt, df.iloc[[0, 1]])
  4533. def test_filter_enforces_scalarness(self):
  4534. df = pd.DataFrame([
  4535. ['best', 'a', 'x'],
  4536. ['worst', 'b', 'y'],
  4537. ['best', 'c', 'x'],
  4538. ['best', 'd', 'y'],
  4539. ['worst', 'd', 'y'],
  4540. ['worst', 'd', 'y'],
  4541. ['best', 'd', 'z'],
  4542. ], columns=['a', 'b', 'c'])
  4543. with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'):
  4544. df.groupby('c').filter(lambda g: g['a'] == 'best')
  4545. def test_filter_non_bool_raises(self):
  4546. df = pd.DataFrame([
  4547. ['best', 'a', 1],
  4548. ['worst', 'b', 1],
  4549. ['best', 'c', 1],
  4550. ['best', 'd', 1],
  4551. ['worst', 'd', 1],
  4552. ['worst', 'd', 1],
  4553. ['best', 'd', 1],
  4554. ], columns=['a', 'b', 'c'])
  4555. with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'):
  4556. df.groupby('a').filter(lambda g: g.c.mean())
  4557. def test_fill_constistency(self):
  4558. # GH9221
  4559. # pass thru keyword arguments to the generated wrapper
  4560. # are set if the passed kw is None (only)
  4561. df = DataFrame(index=pd.MultiIndex.from_product(
  4562. [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]),
  4563. columns=Index(
  4564. ['1', '2'], name='id'))
  4565. df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
  4566. np.nan, 22, np.nan]
  4567. df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
  4568. np.nan, 44, np.nan]
  4569. expected = df.groupby(level=0, axis=0).fillna(method='ffill')
  4570. result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T
  4571. assert_frame_equal(result, expected)
  4572. def test_index_label_overlaps_location(self):
  4573. # checking we don't have any label/location confusion in the
  4574. # the wake of GH5375
  4575. df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
  4576. g = df.groupby(list('ababb'))
  4577. actual = g.filter(lambda x: len(x) > 2)
  4578. expected = df.iloc[[1, 3, 4]]
  4579. assert_frame_equal(actual, expected)
  4580. ser = df[0]
  4581. g = ser.groupby(list('ababb'))
  4582. actual = g.filter(lambda x: len(x) > 2)
  4583. expected = ser.take([1, 3, 4])
  4584. assert_series_equal(actual, expected)
  4585. # ... and again, with a generic Index of floats
  4586. df.index = df.index.astype(float)
  4587. g = df.groupby(list('ababb'))
  4588. actual = g.filter(lambda x: len(x) > 2)
  4589. expected = df.iloc[[1, 3, 4]]
  4590. assert_frame_equal(actual, expected)
  4591. ser = df[0]
  4592. g = ser.groupby(list('ababb'))
  4593. actual = g.filter(lambda x: len(x) > 2)
  4594. expected = ser.take([1, 3, 4])
  4595. assert_series_equal(actual, expected)
  4596. def test_groupby_selection_with_methods(self):
  4597. # some methods which require DatetimeIndex
  4598. rng = pd.date_range('2014', periods=len(self.df))
  4599. self.df.index = rng
  4600. g = self.df.groupby(['A'])[['C']]
  4601. g_exp = self.df[['C']].groupby(self.df['A'])
  4602. # TODO check groupby with > 1 col ?
  4603. # methods which are called as .foo()
  4604. methods = ['count',
  4605. 'corr',
  4606. 'cummax',
  4607. 'cummin',
  4608. 'cumprod',
  4609. 'describe',
  4610. 'rank',
  4611. 'quantile',
  4612. 'diff',
  4613. 'shift',
  4614. 'all',
  4615. 'any',
  4616. 'idxmin',
  4617. 'idxmax',
  4618. 'ffill',
  4619. 'bfill',
  4620. 'pct_change',
  4621. 'tshift']
  4622. for m in methods:
  4623. res = getattr(g, m)()
  4624. exp = getattr(g_exp, m)()
  4625. assert_frame_equal(res, exp) # should always be frames!
  4626. # methods which aren't just .foo()
  4627. assert_frame_equal(g.fillna(0), g_exp.fillna(0))
  4628. assert_frame_equal(g.dtypes, g_exp.dtypes)
  4629. assert_frame_equal(g.apply(lambda x: x.sum()),
  4630. g_exp.apply(lambda x: x.sum()))
  4631. assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean())
  4632. assert_frame_equal(g.resample('D').ohlc(),
  4633. g_exp.resample('D').ohlc())
  4634. assert_frame_equal(g.filter(lambda x: len(x) == 3),
  4635. g_exp.filter(lambda x: len(x) == 3))
  4636. def test_groupby_whitelist(self):
  4637. from string import ascii_lowercase
  4638. letters = np.array(list(ascii_lowercase))
  4639. N = 10
  4640. random_letters = letters.take(np.random.randint(0, 26, N))
  4641. df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
  4642. 'letters': Series(random_letters)})
  4643. s = df.floats
  4644. df_whitelist = frozenset([
  4645. 'last',
  4646. 'first',
  4647. 'mean',
  4648. 'sum',
  4649. 'min',
  4650. 'max',
  4651. 'head',
  4652. 'tail',
  4653. 'cumsum',
  4654. 'cumprod',
  4655. 'cummin',
  4656. 'cummax',
  4657. 'cumcount',
  4658. 'resample',
  4659. 'describe',
  4660. 'rank',
  4661. 'quantile',
  4662. 'fillna',
  4663. 'mad',
  4664. 'any',
  4665. 'all',
  4666. 'take',
  4667. 'idxmax',
  4668. 'idxmin',
  4669. 'shift',
  4670. 'tshift',
  4671. 'ffill',
  4672. 'bfill',
  4673. 'pct_change',
  4674. 'skew',
  4675. 'plot',
  4676. 'boxplot',
  4677. 'hist',
  4678. 'median',
  4679. 'dtypes',
  4680. 'corrwith',
  4681. 'corr',
  4682. 'cov',
  4683. 'diff',
  4684. ])
  4685. s_whitelist = frozenset([
  4686. 'last',
  4687. 'first',
  4688. 'mean',
  4689. 'sum',
  4690. 'min',
  4691. 'max',
  4692. 'head',
  4693. 'tail',
  4694. 'cumsum',
  4695. 'cumprod',
  4696. 'cummin',
  4697. 'cummax',
  4698. 'cumcount',
  4699. 'resample',
  4700. 'describe',
  4701. 'rank',
  4702. 'quantile',
  4703. 'fillna',
  4704. 'mad',
  4705. 'any',
  4706. 'all',
  4707. 'take',
  4708. 'idxmax',
  4709. 'idxmin',
  4710. 'shift',
  4711. 'tshift',
  4712. 'ffill',
  4713. 'bfill',
  4714. 'pct_change',
  4715. 'skew',
  4716. 'plot',
  4717. 'hist',
  4718. 'median',
  4719. 'dtype',
  4720. 'corr',
  4721. 'cov',
  4722. 'diff',
  4723. 'unique',
  4724. # 'nlargest', 'nsmallest',
  4725. ])
  4726. for obj, whitelist in zip((df, s), (df_whitelist, s_whitelist)):
  4727. gb = obj.groupby(df.letters)
  4728. self.assertEqual(whitelist, gb._apply_whitelist)
  4729. for m in whitelist:
  4730. getattr(type(gb), m)
    # Aggregation names exercised by test_regression_whitelist_methods below.
    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var', 'sem']
    # Subset of AGG_FUNCTIONS whose methods accept a ``skipna`` keyword.
    AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad']
  4734. def test_groupby_whitelist_deprecations(self):
  4735. from string import ascii_lowercase
  4736. letters = np.array(list(ascii_lowercase))
  4737. N = 10
  4738. random_letters = letters.take(np.random.randint(0, 26, N))
  4739. df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
  4740. 'letters': Series(random_letters)})
  4741. # 10711 deprecated
  4742. with tm.assert_produces_warning(FutureWarning):
  4743. df.groupby('letters').irow(0)
  4744. with tm.assert_produces_warning(FutureWarning):
  4745. df.groupby('letters').floats.irow(0)
  4746. def test_regression_whitelist_methods(self):
  4747. # GH6944
  4748. # explicity test the whitelest methods
  4749. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
  4750. 'three']],
  4751. labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  4752. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  4753. names=['first', 'second'])
  4754. raw_frame = DataFrame(np.random.randn(10, 3), index=index,
  4755. columns=Index(['A', 'B', 'C'], name='exp'))
  4756. raw_frame.ix[1, [1, 2]] = np.nan
  4757. raw_frame.ix[7, [0, 1]] = np.nan
  4758. for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
  4759. lrange(2), lrange(2),
  4760. [True, False]):
  4761. if axis == 0:
  4762. frame = raw_frame
  4763. else:
  4764. frame = raw_frame.T
  4765. if op in self.AGG_FUNCTIONS_WITH_SKIPNA:
  4766. grouped = frame.groupby(level=level, axis=axis)
  4767. result = getattr(grouped, op)(skipna=skipna)
  4768. expected = getattr(frame, op)(level=level, axis=axis,
  4769. skipna=skipna)
  4770. assert_frame_equal(result, expected)
  4771. else:
  4772. grouped = frame.groupby(level=level, axis=axis)
  4773. result = getattr(grouped, op)()
  4774. expected = getattr(frame, op)(level=level, axis=axis)
  4775. assert_frame_equal(result, expected)
  4776. def test_groupby_blacklist(self):
  4777. from string import ascii_lowercase
  4778. letters = np.array(list(ascii_lowercase))
  4779. N = 10
  4780. random_letters = letters.take(np.random.randint(0, 26, N))
  4781. df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
  4782. 'letters': Series(random_letters)})
  4783. s = df.floats
  4784. blacklist = [
  4785. 'eval', 'query', 'abs', 'where',
  4786. 'mask', 'align', 'groupby', 'clip', 'astype',
  4787. 'at', 'combine', 'consolidate', 'convert_objects',
  4788. ]
  4789. to_methods = [method for method in dir(df) if method.startswith('to_')]
  4790. blacklist.extend(to_methods)
  4791. # e.g., to_csv
  4792. defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the "
  4793. "'apply' method$)")
  4794. # e.g., query, eval
  4795. not_defined = "(?:^{1!r} object has no attribute {0!r}$)"
  4796. fmt = defined_but_not_allowed + '|' + not_defined
  4797. for bl in blacklist:
  4798. for obj in (df, s):
  4799. gb = obj.groupby(df.letters)
  4800. msg = fmt.format(bl, type(gb).__name__)
  4801. with tm.assertRaisesRegexp(AttributeError, msg):
  4802. getattr(gb, bl)
  4803. def test_tab_completion(self):
  4804. grp = self.mframe.groupby(level='second')
  4805. results = set([v for v in dir(grp) if not v.startswith('_')])
  4806. expected = set(
  4807. ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
  4808. 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
  4809. 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',
  4810. 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
  4811. 'head', 'irow', 'describe', 'cummax', 'quantile', 'rank',
  4812. 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum',
  4813. 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take',
  4814. 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov',
  4815. 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
  4816. 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
  4817. self.assertEqual(results, expected)
  4818. def test_lexsort_indexer(self):
  4819. keys = [[nan] * 5 + list(range(100)) + [nan] * 5]
  4820. # orders=True, na_position='last'
  4821. result = _lexsort_indexer(keys, orders=True, na_position='last')
  4822. exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  4823. tm.assert_numpy_array_equal(result, np.array(exp))
  4824. # orders=True, na_position='first'
  4825. result = _lexsort_indexer(keys, orders=True, na_position='first')
  4826. exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  4827. tm.assert_numpy_array_equal(result, np.array(exp))
  4828. # orders=False, na_position='last'
  4829. result = _lexsort_indexer(keys, orders=False, na_position='last')
  4830. exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  4831. tm.assert_numpy_array_equal(result, np.array(exp))
  4832. # orders=False, na_position='first'
  4833. result = _lexsort_indexer(keys, orders=False, na_position='first')
  4834. exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  4835. tm.assert_numpy_array_equal(result, np.array(exp))
  4836. def test_nargsort(self):
  4837. # np.argsort(items) places NaNs last
  4838. items = [nan] * 5 + list(range(100)) + [nan] * 5
  4839. # np.argsort(items2) may not place NaNs first
  4840. items2 = np.array(items, dtype='O')
  4841. try:
  4842. # GH 2785; due to a regression in NumPy1.6.2
  4843. np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i'))
  4844. np.argsort(items2, kind='mergesort')
  4845. except TypeError:
  4846. raise nose.SkipTest('requested sort not available for type')
  4847. # mergesort is the most difficult to get right because we want it to be
  4848. # stable.
  4849. # According to numpy/core/tests/test_multiarray, """The number of
  4850. # sorted items must be greater than ~50 to check the actual algorithm
  4851. # because quick and merge sort fall over to insertion sort for small
  4852. # arrays."""
  4853. # mergesort, ascending=True, na_position='last'
  4854. result = _nargsort(items, kind='mergesort', ascending=True,
  4855. na_position='last')
  4856. exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  4857. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4858. # mergesort, ascending=True, na_position='first'
  4859. result = _nargsort(items, kind='mergesort', ascending=True,
  4860. na_position='first')
  4861. exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  4862. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4863. # mergesort, ascending=False, na_position='last'
  4864. result = _nargsort(items, kind='mergesort', ascending=False,
  4865. na_position='last')
  4866. exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  4867. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4868. # mergesort, ascending=False, na_position='first'
  4869. result = _nargsort(items, kind='mergesort', ascending=False,
  4870. na_position='first')
  4871. exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  4872. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4873. # mergesort, ascending=True, na_position='last'
  4874. result = _nargsort(items2, kind='mergesort', ascending=True,
  4875. na_position='last')
  4876. exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  4877. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4878. # mergesort, ascending=True, na_position='first'
  4879. result = _nargsort(items2, kind='mergesort', ascending=True,
  4880. na_position='first')
  4881. exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  4882. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4883. # mergesort, ascending=False, na_position='last'
  4884. result = _nargsort(items2, kind='mergesort', ascending=False,
  4885. na_position='last')
  4886. exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  4887. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4888. # mergesort, ascending=False, na_position='first'
  4889. result = _nargsort(items2, kind='mergesort', ascending=False,
  4890. na_position='first')
  4891. exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  4892. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  4893. def test_datetime_count(self):
  4894. df = DataFrame({'a': [1, 2, 3] * 2,
  4895. 'dates': pd.date_range('now', periods=6, freq='T')})
  4896. result = df.groupby('a').dates.count()
  4897. expected = Series([
  4898. 2, 2, 2
  4899. ], index=Index([1, 2, 3], name='a'), name='dates')
  4900. tm.assert_series_equal(result, expected)
  4901. def test_lower_int_prec_count(self):
  4902. df = DataFrame({'a': np.array(
  4903. [0, 1, 2, 100], np.int8),
  4904. 'b': np.array(
  4905. [1, 2, 3, 6], np.uint32),
  4906. 'c': np.array(
  4907. [4, 5, 6, 8], np.int16),
  4908. 'grp': list('ab' * 2)})
  4909. result = df.groupby('grp').count()
  4910. expected = DataFrame({'a': [2, 2],
  4911. 'b': [2, 2],
  4912. 'c': [2, 2]}, index=pd.Index(list('ab'),
  4913. name='grp'))
  4914. tm.assert_frame_equal(result, expected)
    def test_count_uses_size_on_exception(self):
        # count() must still produce per-group sizes when operating on the
        # underlying objects raises inside the cython path.

        class RaisingObjectException(Exception):
            pass

        class RaisingObject(object):

            def __init__(self, msg='I will raise inside Cython'):
                super(RaisingObject, self).__init__()
                self.msg = msg

            def __eq__(self, other):
                # gets called in Cython to check that raising calls the method
                raise RaisingObjectException(self.msg)

        df = DataFrame({'a': [RaisingObject() for _ in range(4)],
                        'grp': list('ab' * 2)})
        result = df.groupby('grp').count()
        # two non-null rows per group -> count of 2 for each
        expected = DataFrame({'a': [2, 2]}, index=pd.Index(
            list('ab'), name='grp'))
        tm.assert_frame_equal(result, expected)
  4931. def test__cython_agg_general(self):
  4932. ops = [('mean', np.mean),
  4933. ('median', np.median),
  4934. ('var', np.var),
  4935. ('add', np.sum),
  4936. ('prod', np.prod),
  4937. ('min', np.min),
  4938. ('max', np.max),
  4939. ('first', lambda x: x.iloc[0]),
  4940. ('last', lambda x: x.iloc[-1]), ]
  4941. df = DataFrame(np.random.randn(1000))
  4942. labels = np.random.randint(0, 50, size=1000).astype(float)
  4943. for op, targop in ops:
  4944. result = df.groupby(labels)._cython_agg_general(op)
  4945. expected = df.groupby(labels).agg(targop)
  4946. try:
  4947. tm.assert_frame_equal(result, expected)
  4948. except BaseException as exc:
  4949. exc.args += ('operation: %s' % op, )
  4950. raise
    def test_cython_group_transform_algos(self):
        # GH 4095: exercise the low-level cython cumulative kernels
        # (group_cumprod_float64 / group_cumsum) directly against numpy.
        # The kernels write into a preallocated output array and use
        # ``accum`` as the per-group running accumulator buffer.
        dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
                  np.uint64, np.float32, np.float64]

        # cumprod only supports float64; cumsum supports all listed dtypes
        ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
               (pd.algos.group_cumsum, np.cumsum, dtypes)]

        for pd_op, np_op, dtypes in ops:
            for dtype in dtypes:
                # single group (all labels 0) so the kernel reduces to a
                # plain cumulative op over the column
                data = np.array([[1], [2], [3], [4]], dtype=dtype)
                ans = np.zeros_like(data)
                accum = np.array([[0]], dtype=dtype)
                labels = np.array([0, 0, 0, 0], dtype=np.int64)
                pd_op(ans, data, labels, accum)
                self.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                              check_dtype=False)

        # with nans: the NaN row stays NaN in the output but must not
        # poison the running accumulator for later rows
        labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)

        data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
        accum = np.array([[0.0]])
        actual = np.zeros_like(data)
        actual.fill(np.nan)
        pd.algos.group_cumprod_float64(actual, data, labels, accum)
        expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
        self.assert_numpy_array_equal(actual[:, 0], expected)

        accum = np.array([[0.0]])
        actual = np.zeros_like(data)
        actual.fill(np.nan)
        pd.algos.group_cumsum(actual, data, labels, accum)
        expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
        self.assert_numpy_array_equal(actual[:, 0], expected)

        # timedelta: summed through an int64 view of the m8[ns] data,
        # then viewed back to timedelta for comparison
        data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
        accum = np.array([[0]], dtype='int64')
        actual = np.zeros_like(data, dtype='int64')
        pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
        expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
            2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
            np.timedelta64(5, 'ns')])
        self.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)
    def test_cython_transform(self):
        # GH 4095: cythonized transforms (cumprod/cumsum/shift) must match
        # the equivalent python transform, on Series and DataFrames, for
        # several grouping targets and index shapes.
        # Each entry is ((method name, positional args), python equivalent).
        ops = [(('cumprod',
                 ()), lambda x: x.cumprod()), (('cumsum', ()),
                                               lambda x: x.cumsum()),
               (('shift', (-1, )),
                lambda x: x.shift(-1)), (('shift',
                                          (1, )), lambda x: x.shift())]

        s = Series(np.random.randn(1000))
        s_missing = s.copy()
        s_missing.iloc[2:10] = np.nan
        labels = np.random.randint(0, 50, size=1000).astype(float)

        # series: transform via callable, via method name, and via the
        # direct groupby method must all agree
        for (op, args), targop in ops:
            for data in [s, s_missing]:
                # print(data.head())
                expected = data.groupby(labels).transform(targop)

                tm.assert_series_equal(expected,
                                       data.groupby(labels).transform(op,
                                                                      *args))
                tm.assert_series_equal(expected, getattr(
                    data.groupby(labels), op)(*args))

        # frame with one column of each kind the cython paths care about
        strings = list('qwertyuiopasdfghjklz')
        strings_missing = strings[:]
        strings_missing[5] = np.nan
        df = DataFrame({'float': s,
                        'float_missing': s_missing,
                        'int': [1, 1, 1, 1, 2] * 200,
                        'datetime': pd.date_range('1990-1-1', periods=1000),
                        'timedelta': pd.timedelta_range(1, freq='s',
                                                        periods=1000),
                        'string': strings * 50,
                        'string_missing': strings_missing * 50})
        df['cat'] = df['string'].astype('category')

        df2 = df.copy()
        df2.index = pd.MultiIndex.from_product([range(100), range(10)])

        # DataFrame - Single and MultiIndex,
        # group by values, index level, columns
        for df in [df, df2]:
            for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
                              ]:  # dict(by='string_missing')]:
                # dict(by=['int','string'])]:

                gb = df.groupby(**gb_target)
                # whitelisted methods set the selection before applying
                # bit of a hack to make sure the cythonized shift
                # is equivalent to pre 0.17.1 behavior
                # NOTE(review): ``op`` here is the stale loop variable left
                # over from the series loop above (its last value is
                # 'shift'), not a value from the ops loop below -- so this
                # branch is always taken. Confirm whether
                # _set_group_selection() was meant to run unconditionally.
                if op == 'shift':
                    gb._set_group_selection()

                for (op, args), targop in ops:
                    if op != 'shift' and 'int' not in gb_target:
                        # numeric apply fastpath promotes dtype so have
                        # to apply seperately and concat
                        i = gb[['int']].apply(targop)
                        f = gb[['float', 'float_missing']].apply(targop)
                        expected = pd.concat([f, i], axis=1)
                    else:
                        expected = gb.apply(targop)

                    expected = expected.sort_index(axis=1)
                    tm.assert_frame_equal(expected,
                                          gb.transform(op, *args).sort_index(
                                              axis=1))
                    tm.assert_frame_equal(expected, getattr(gb, op)(*args))
                    # individual columns
                    for c in df:
                        if c not in ['float', 'int', 'float_missing'
                                     ] and op != 'shift':
                            # non-numeric columns only support shift;
                            # everything else must raise DataError
                            self.assertRaises(DataError, gb[c].transform, op)
                            self.assertRaises(DataError, getattr(gb[c], op))
                        else:
                            expected = gb[c].apply(targop)
                            expected.name = c
                            tm.assert_series_equal(expected,
                                                   gb[c].transform(op, *args))
                            tm.assert_series_equal(expected,
                                                   getattr(gb[c], op)(*args))
  5065. def test_groupby_cumprod(self):
  5066. # GH 4095
  5067. df = pd.DataFrame({'key': ['b'] * 10, 'value': 2})
  5068. actual = df.groupby('key')['value'].cumprod()
  5069. expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
  5070. expected.name = 'value'
  5071. tm.assert_series_equal(actual, expected)
  5072. df = pd.DataFrame({'key': ['b'] * 100, 'value': 2})
  5073. actual = df.groupby('key')['value'].cumprod()
  5074. # if overflows, groupby product casts to float
  5075. # while numpy passes back invalid values
  5076. df['value'] = df['value'].astype(float)
  5077. expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
  5078. expected.name = 'value'
  5079. tm.assert_series_equal(actual, expected)
  5080. def test_ops_general(self):
  5081. ops = [('mean', np.mean),
  5082. ('median', np.median),
  5083. ('std', np.std),
  5084. ('var', np.var),
  5085. ('sum', np.sum),
  5086. ('prod', np.prod),
  5087. ('min', np.min),
  5088. ('max', np.max),
  5089. ('first', lambda x: x.iloc[0]),
  5090. ('last', lambda x: x.iloc[-1]),
  5091. ('count', np.size), ]
  5092. try:
  5093. from scipy.stats import sem
  5094. except ImportError:
  5095. pass
  5096. else:
  5097. ops.append(('sem', sem))
  5098. df = DataFrame(np.random.randn(1000))
  5099. labels = np.random.randint(0, 50, size=1000).astype(float)
  5100. for op, targop in ops:
  5101. result = getattr(df.groupby(labels), op)().astype(float)
  5102. expected = df.groupby(labels).agg(targop)
  5103. try:
  5104. tm.assert_frame_equal(result, expected)
  5105. except BaseException as exc:
  5106. exc.args += ('operation: %s' % op, )
  5107. raise
  5108. def test_max_nan_bug(self):
  5109. raw = """,Date,app,File
  5110. 2013-04-23,2013-04-23 00:00:00,,log080001.log
  5111. 2013-05-06,2013-05-06 00:00:00,,log.log
  5112. 2013-05-07,2013-05-07 00:00:00,OE,xlsx"""
  5113. df = pd.read_csv(StringIO(raw), parse_dates=[0])
  5114. gb = df.groupby('Date')
  5115. r = gb[['File']].max()
  5116. e = gb['File'].max().to_frame()
  5117. tm.assert_frame_equal(r, e)
  5118. self.assertFalse(r['File'].isnull().any())
  5119. def test_nlargest(self):
  5120. a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
  5121. b = Series(list('a' * 5 + 'b' * 5))
  5122. gb = a.groupby(b)
  5123. r = gb.nlargest(3)
  5124. e = Series([
  5125. 7, 5, 3, 10, 9, 6
  5126. ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]]))
  5127. tm.assert_series_equal(r, e)
  5128. a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
  5129. gb = a.groupby(b)
  5130. e = Series([
  5131. 3, 2, 1, 3, 3, 2
  5132. ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]]))
  5133. assert_series_equal(gb.nlargest(3, keep='last'), e)
  5134. with tm.assert_produces_warning(FutureWarning):
  5135. assert_series_equal(gb.nlargest(3, take_last=True), e)
  5136. def test_nsmallest(self):
  5137. a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
  5138. b = Series(list('a' * 5 + 'b' * 5))
  5139. gb = a.groupby(b)
  5140. r = gb.nsmallest(3)
  5141. e = Series([
  5142. 1, 2, 3, 0, 4, 6
  5143. ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]]))
  5144. tm.assert_series_equal(r, e)
  5145. a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
  5146. gb = a.groupby(b)
  5147. e = Series([
  5148. 0, 1, 1, 0, 1, 2
  5149. ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]]))
  5150. assert_series_equal(gb.nsmallest(3, keep='last'), e)
  5151. with tm.assert_produces_warning(FutureWarning):
  5152. assert_series_equal(gb.nsmallest(3, take_last=True), e)
  5153. def test_transform_doesnt_clobber_ints(self):
  5154. # GH 7972
  5155. n = 6
  5156. x = np.arange(n)
  5157. df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
  5158. df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})
  5159. gb = df.groupby('a')
  5160. result = gb.transform('mean')
  5161. gb2 = df2.groupby('a')
  5162. expected = gb2.transform('mean')
  5163. tm.assert_frame_equal(result, expected)
    def test_groupby_categorical_two_columns(self):
        """Grouping with a Categorical key keeps unobserved categories.

        Unobserved categories appear as NaN rows in aggregations, both for
        single- and multi-key groupbys.
        """
        # https://github.com/pydata/pandas/issues/8138
        d = {'cat':
             pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
                            ordered=True),
             'ints': [1, 1, 2, 2],
             'val': [10, 20, 30, 40]}
        test = pd.DataFrame(d)

        # Grouping on a single column: unobserved category "c" yields a
        # NaN row with a CategoricalIndex result
        groups_single_key = test.groupby("cat")
        res = groups_single_key.agg('mean')

        exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat",
                                        ordered=True)
        exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
                        index=exp_index)
        tm.assert_frame_equal(res, exp)

        # Grouping on two columns: full cartesian product of categories
        # and int keys, NaN for combinations not present in the data
        groups_double_key = test.groupby(["cat", "ints"])
        res = groups_double_key.agg('mean')
        exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan],
                         "cat": ["a", "a", "b", "b", "c", "c"],
                         "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints"
                                                                 ])
        tm.assert_frame_equal(res, exp)

        # GH 10132: get_group must accept the tuple key directly
        for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
            c, i = key
            result = groups_double_key.get_group(key)
            expected = test[(test.cat == c) & (test.ints == i)]
            assert_frame_equal(result, expected)

        # Same behavior when the categorical comes from pd.cut rather than
        # an explicit Categorical column
        d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
        test = pd.DataFrame(d)
        values = pd.cut(test['C1'], [1, 2, 3, 6])
        values.name = "cat"
        groups_double_key = test.groupby([values, 'C2'])
        res = groups_double_key.agg('mean')
        nan = np.nan  # local alias for readability of the expected layout
        idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"],
                                       [1, 2, 3, 4]],
                                      names=["cat", "C2"])
        exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3,
                                nan, nan, nan, nan, 4, 5],
                         "C3": [nan, nan, nan, nan, 10, 100,
                                nan, nan, nan, nan, 200, 34]}, index=idx)
        tm.assert_frame_equal(res, exp)
    def test_groupby_multi_categorical_as_index(self):
        """GH13204: as_index=False with a categorical key among multiple keys.

        The result must enumerate all categories (including unobserved
        ones, as NaN rows) and drop the original index regardless of any
        name clash between the index and the grouping columns.
        """
        # GH13204
        df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                        'A': [10, 11, 11],
                        'B': [101, 102, 103]})

        # column grouper alongside the categorical
        result = df.groupby(['cat', 'A'], as_index=False).sum()
        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
                              'A': [10, 11, 10, 11, 10, 11],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # function grouper
        f = lambda r: df.loc[r, 'A']
        result = df.groupby(['cat', f], as_index=False).sum()
        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
                              'A': [10.0, nan, nan, 22.0, nan, nan],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # another not in-axis grouper (conflicting names in index)
        s = Series(['a', 'b', 'b'], name='cat')
        result = df.groupby(['cat', s], as_index=False).sum()
        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
                              'A': [10.0, nan, nan, 22.0, nan, nan],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # is original index dropped?
        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
                              'A': [10, 11, 10, 11, 10, 11],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])

        # index names that collide with column names must not matter
        for name in [None, 'X', 'B', 'cat']:
            df.index = Index(list("abc"), name=name)
            result = df.groupby(['cat', 'A'], as_index=False).sum()
            tm.assert_frame_equal(result, expected, check_index_type=True)
  5245. def test_groupby_apply_all_none(self):
  5246. # Tests to make sure no errors if apply function returns all None
  5247. # values. Issue 9684.
  5248. test_df = DataFrame({'groups': [0, 0, 1, 1],
  5249. 'random_vars': [8, 7, 4, 5]})
  5250. def test_func(x):
  5251. pass
  5252. result = test_df.groupby('groups').apply(test_func)
  5253. expected = DataFrame()
  5254. tm.assert_frame_equal(result, expected)
  5255. def test_groupby_apply_none_first(self):
  5256. # GH 12824. Tests if apply returns None first.
  5257. test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})
  5258. test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]})
  5259. def test_func(x):
  5260. if x.shape[0] < 2:
  5261. return None
  5262. return x.iloc[[0, -1]]
  5263. result1 = test_df1.groupby('groups').apply(test_func)
  5264. result2 = test_df2.groupby('groups').apply(test_func)
  5265. index1 = MultiIndex.from_arrays([[1, 1], [0, 2]],
  5266. names=['groups', None])
  5267. index2 = MultiIndex.from_arrays([[2, 2], [1, 3]],
  5268. names=['groups', None])
  5269. expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]},
  5270. index=index1)
  5271. expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]},
  5272. index=index2)
  5273. tm.assert_frame_equal(result1, expected1)
  5274. tm.assert_frame_equal(result2, expected2)
  5275. def test_first_last_max_min_on_time_data(self):
  5276. # GH 10295
  5277. # Verify that NaT is not in the result of max, min, first and last on
  5278. # Dataframe with datetime or timedelta values.
  5279. from datetime import timedelta as td
  5280. df_test = DataFrame(
  5281. {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11',
  5282. '2015-07-23 12:12', nan],
  5283. 'td': [nan, td(days=1), td(days=2), td(days=3), nan]})
  5284. df_test.dt = pd.to_datetime(df_test.dt)
  5285. df_test['group'] = 'A'
  5286. df_ref = df_test[df_test.dt.notnull()]
  5287. grouped_test = df_test.groupby('group')
  5288. grouped_ref = df_ref.groupby('group')
  5289. assert_frame_equal(grouped_ref.max(), grouped_test.max())
  5290. assert_frame_equal(grouped_ref.min(), grouped_test.min())
  5291. assert_frame_equal(grouped_ref.first(), grouped_test.first())
  5292. assert_frame_equal(grouped_ref.last(), grouped_test.last())
    def test_groupby_preserves_sort(self):
        # Test to ensure that groupby always preserves sort order of original
        # object. Issue #8588 and #9651
        df = DataFrame(
            {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
             'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
             'ints': [8, 7, 4, 5, 2, 9, 1, 1],
             'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
             'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})

        # Try sorting on different types and with different group types
        for sort_column in ['ints', 'floats', 'strings', ['ints', 'floats'],
                            ['ints', 'strings']]:
            for group_column in ['int_groups', 'string_groups',
                                 ['int_groups', 'string_groups']]:

                df = df.sort_values(by=sort_column)

                g = df.groupby(group_column)

                def test_sort(x):
                    # each group must come back in sort_column order.
                    # NOTE: the closure reads sort_column late-bound; this
                    # is safe only because g.apply(test_sort) runs before
                    # the next iteration rebinds sort_column.
                    assert_frame_equal(x, x.sort_values(by=sort_column))

                g.apply(test_sort)
  5312. def test_nunique_with_object(self):
  5313. # GH 11077
  5314. data = pd.DataFrame(
  5315. [[100, 1, 'Alice'],
  5316. [200, 2, 'Bob'],
  5317. [300, 3, 'Charlie'],
  5318. [-400, 4, 'Dan'],
  5319. [500, 5, 'Edith']],
  5320. columns=['amount', 'id', 'name']
  5321. )
  5322. result = data.groupby(['id', 'amount'])['name'].nunique()
  5323. index = MultiIndex.from_arrays([data.id, data.amount])
  5324. expected = pd.Series([1] * 5, name='name', index=index)
  5325. tm.assert_series_equal(result, expected)
    def test_transform_with_non_scalar_group(self):
        # GH 10165: transform over axis=1 groups must raise when the
        # applied function returns a non-scalar per group.
        cols = pd.MultiIndex.from_tuples([
            ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
            ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
            ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
            ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
        df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
                          columns=cols,
                          index=['A', 'C', 'G', 'T'])
        # the lambda returns a frame slice, not a scalar, per group
        self.assertRaisesRegexp(ValueError, 'transform must return a scalar '
                                'value for each group.*', df.groupby
                                (axis=1, level=1).transform,
                                lambda z: z.div(z.sum(axis=1), axis=0))
  5340. def test_numpy_compat(self):
  5341. # see gh-12811
  5342. df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
  5343. g = df.groupby('A')
  5344. msg = "numpy operations are not valid with groupby"
  5345. for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'):
  5346. tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
  5347. getattr(g, func), 1, 2, 3)
  5348. tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
  5349. getattr(g, func), foo=1)
  5350. def test_grouping_string_repr(self):
  5351. # GH 13394
  5352. mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
  5353. df = DataFrame([[1, 2, 3]], columns=mi)
  5354. gr = df.groupby(df[('A', 'a')])
  5355. result = gr.grouper.groupings[0].__repr__()
  5356. expected = "Grouping(('A', 'a'))"
  5357. tm.assert_equal(result, expected)
    def test_group_shift_with_null_key(self):
        # This test is designed to replicate the segfault in issue #13813.
        n_rows = 1200

        # Generate a moderately large dataframe with occasional missing
        # values in column `B`, and then group by [`A`, `B`]. This should
        # force `-1` in `labels` array of `g.grouper.group_info` exactly
        # at those places, where the group-by key is partilly missing.
        df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i)
                        for i in range(n_rows)], dtype=float,
                       columns=["A", "B", "Z"], index=None)
        g = df.groupby(["A", "B"])

        # shift(-1): each row takes the Z value of the next member of its
        # group (i + 12 apart); rows with a null key or no successor
        # become NaN
        expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12
                               else np.nan)
                              for i in range(n_rows)], dtype=float,
                             columns=["Z"], index=None)
        result = g.shift(-1)

        assert_frame_equal(result, expected)
  5375. def assert_fp_equal(a, b):
  5376. assert (np.abs(a - b) < 1e-12).all()
  5377. def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
  5378. tups = lmap(tuple, df[keys].values)
  5379. tups = com._asarray_tuplesafe(tups)
  5380. expected = f(df.groupby(tups)[field])
  5381. for k, v in compat.iteritems(expected):
  5382. assert (result[k] == v)
  5383. def test_decons():
  5384. from pandas.core.groupby import decons_group_index, get_group_index
  5385. def testit(label_list, shape):
  5386. group_index = get_group_index(label_list, shape, sort=True, xnull=True)
  5387. label_list2 = decons_group_index(group_index, shape)
  5388. for a, b in zip(label_list, label_list2):
  5389. assert (np.array_equal(a, b))
  5390. shape = (4, 5, 6)
  5391. label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile(
  5392. [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile(
  5393. [5, 1, 0, 2, 3, 0, 5, 4], 100)]
  5394. testit(label_list, shape)
  5395. shape = (10000, 10000)
  5396. label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)]
  5397. testit(label_list, shape)
if __name__ == '__main__':
    # Run this module's tests under nose: verbose, stop on first failure,
    # and drop into pdb on errors/failures.
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
                         ], exit=False)