PageRenderTime 84ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/tests/test_groupby.py

http://github.com/pydata/pandas
Python | 4473 lines | 4292 code | 132 blank | 49 comment | 24 complexity | 8f76f988646643acde67dd7273b6636f MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. from __future__ import print_function
  2. import nose
  3. from numpy.testing.decorators import slow
  4. from datetime import datetime
  5. from numpy import nan
  6. from pandas import date_range,bdate_range, Timestamp
  7. from pandas.core.index import Index, MultiIndex, Int64Index
  8. from pandas.core.common import rands
  9. from pandas.core.api import Categorical, DataFrame
  10. from pandas.core.groupby import (SpecificationError, DataError,
  11. _nargsort, _lexsort_indexer)
  12. from pandas.core.series import Series
  13. from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
  14. assert_series_equal, assert_almost_equal,
  15. assert_index_equal, assertRaisesRegexp)
  16. from pandas.compat import(
  17. range, long, lrange, StringIO, lmap, lzip, map,
  18. zip, builtins, OrderedDict
  19. )
  20. from pandas import compat
  21. from pandas.core.panel import Panel
  22. from pandas.tools.merge import concat
  23. from collections import defaultdict
  24. import pandas.core.common as com
  25. import numpy as np
  26. import pandas.core.nanops as nanops
  27. import pandas.util.testing as tm
  28. import pandas as pd
  29. from numpy.testing import assert_equal
def _skip_if_mpl_not_installed():
    """Skip the calling test (via nose.SkipTest) when matplotlib is absent."""
    try:
        import matplotlib.pyplot as plt  # noqa: import only to probe availability
    except ImportError:
        raise nose.SkipTest("matplotlib not installed")
def commonSetUp(self):
    """Build shared fixtures: a business-day range, a random string index,
    a first-letter group mapping, and two 250x5 random frames (string- and
    date-indexed) backed by the same random matrix.
    """
    self.dateRange = bdate_range('1/1/2005', periods=250)
    self.stringIndex = Index([rands(8).upper() for x in range(250)])
    # group id is the first character of each index label
    self.groupId = Series([x[0] for x in self.stringIndex],
                          index=self.stringIndex)
    self.groupDict = dict((k, v) for k, v in compat.iteritems(self.groupId))
    self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])
    randMat = np.random.randn(250, 5)
    # same data, two different indexes
    self.stringMatrix = DataFrame(randMat, columns=self.columnIndex,
                                  index=self.stringIndex)
    self.timeMatrix = DataFrame(randMat, columns=self.columnIndex,
                                index=self.dateRange)
class TestGroupBy(tm.TestCase):
    """Core groupby behaviour tests: aggregate, transform, apply, nth, etc."""
    _multiprocess_can_split_ = True

    def setUp(self):
        """Create the fixtures shared by the tests in this class."""
        self.ts = tm.makeTimeSeries()
        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)
        # small mixed frame: duplicate keys in 'A'/'B', random floats in 'C'/'D'
        self.df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                   'foo', 'bar', 'foo', 'foo'],
                             'B': ['one', 'one', 'two', 'three',
                                   'two', 'two', 'one', 'three'],
                             'C': np.random.randn(8),
                             'D': np.random.randn(8)})
        # same layout, but 'D' is float32 to exercise mixed-precision paths
        self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                                'foo', 'bar', 'foo', 'foo'],
                                          'B': ['one', 'one', 'two', 'three',
                                                'two', 'two', 'one', 'three'],
                                          'C': np.random.randn(8),
                                          'D': np.array(np.random.randn(8),
                                                        dtype='float32')})
        # 10-row frame with a two-level MultiIndex for level-based grouping
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                                columns=['A', 'B', 'C'])
        # three categorical keys plus three numeric value columns
        self.three_group = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                            'bar', 'bar', 'bar', 'bar',
                                            'foo', 'foo', 'foo'],
                                      'B': ['one', 'one', 'one', 'two',
                                            'one', 'one', 'one', 'two',
                                            'two', 'two', 'one'],
                                      'C': ['dull', 'dull', 'shiny', 'dull',
                                            'dull', 'shiny', 'shiny', 'dull',
                                            'shiny', 'shiny', 'shiny'],
                                      'D': np.random.randn(11),
                                      'E': np.random.randn(11),
                                      'F': np.random.randn(11)})
    def test_basic(self):
        """Exercise the core Series groupby contract across numeric dtypes."""
        def checkit(dtype):
            data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
            index = np.arange(9)
            np.random.shuffle(index)
            data = data.reindex(index)
            grouped = data.groupby(lambda x: x // 3)
            # labels 0..8 grouped by //3 -> three groups of three members each
            for k, v in grouped:
                self.assertEqual(len(v), 3)
            agged = grouped.aggregate(np.mean)
            self.assertEqual(agged[1], 1)
            assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
            assert_series_equal(agged, grouped.mean())
            assert_series_equal(grouped.agg(np.sum), grouped.sum())
            # apply and transform agree when the function is shape-preserving
            expected = grouped.apply(lambda x: x * x.sum())
            transformed = grouped.transform(lambda x: x * x.sum())
            self.assertEqual(transformed[7], 12)
            assert_series_equal(transformed, expected)
            # grouping by the values themselves
            value_grouped = data.groupby(data)
            assert_series_equal(value_grouped.aggregate(np.mean), agged)
            # complex agg
            agged = grouped.aggregate([np.mean, np.std])
            agged = grouped.aggregate({'one': np.mean,
                                       'two': np.std})
            group_constants = {
                0: 10,
                1: 20,
                2: 30
            }
            # x.name inside agg is the group key
            agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
            self.assertEqual(agged[1], 21)
            # corner cases
            self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
        for dtype in ['int64', 'int32', 'float64', 'float32']:
            checkit(dtype)
  122. def test_select_bad_cols(self):
  123. df = DataFrame([[1, 2]], columns=['A', 'B'])
  124. g = df.groupby('A')
  125. self.assertRaises(KeyError, g.__getitem__, ['C']) # g[['C']]
  126. self.assertRaises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']]
  127. with assertRaisesRegexp(KeyError, '^[^A]+$'):
  128. # A should not be referenced as a bad column...
  129. # will have to rethink regex if you change message!
  130. g[['A', 'C']]
    def test_first_last_nth(self):
        """first/last/nth agree with hand-picked rows, including NaN handling."""
        # tests for first / last / nth
        grouped = self.df.groupby('A')
        first = grouped.first()
        expected = self.df.ix[[1, 0], ['B','C','D']]
        expected.index = Index(['bar', 'foo'],name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)
        # nth(0) matches first()
        nth = grouped.nth(0)
        assert_frame_equal(nth, expected)
        last = grouped.last()
        expected = self.df.ix[[5, 7], ['B','C','D']]
        expected.index = Index(['bar', 'foo'],name='A')
        assert_frame_equal(last, expected)
        # nth(-1) matches last()
        nth = grouped.nth(-1)
        assert_frame_equal(nth, expected)
        nth = grouped.nth(1)
        expected = self.df.ix[[2, 3],['B','C','D']].copy()
        expected.index = Index(['foo', 'bar'],name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)
        # it works!
        grouped['B'].first()
        grouped['B'].last()
        grouped['B'].nth(0)
        # NaN out 'B' for the 'foo' group; first/last/nth then see NaN there
        self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
        self.assertTrue(com.isnull(grouped['B'].first()['foo']))
        self.assertTrue(com.isnull(grouped['B'].last()['foo']))
        self.assertTrue(com.isnull(grouped['B'].nth(0)[0]))  # not sure what this is testing
        # v0.14.0 whatsnew
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        g = df.groupby('A')
        result = g.first()
        expected = df.iloc[[1,2]].set_index('A')
        assert_frame_equal(result, expected)
        # nth(0, dropna='any') skips the NaN row in group 1
        expected = df.iloc[[1,2]].set_index('A')
        result = g.nth(0,dropna='any')
        assert_frame_equal(result, expected)
    def test_first_last_nth_dtypes(self):
        """first/last/nth must not shift bool/int/float32 dtypes (GH 2763)."""
        df = self.df_mixed_floats.copy()
        df['E'] = True
        df['F'] = 1
        # tests for first / last / nth
        grouped = df.groupby('A')
        first = grouped.first()
        expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)
        last = grouped.last()
        expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(last, expected)
        nth = grouped.nth(1)
        expected = df.ix[[3, 2],['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)
        # GH 2763, first/last shifting dtypes
        idx = lrange(10)
        idx.append(9)  # duplicate label so level-0 groupby has a 2-row group
        s = Series(data=lrange(11), index=idx, name='IntCol')
        self.assertEqual(s.dtype, 'int64')
        f = s.groupby(level=0).first()
        self.assertEqual(f.dtype, 'int64')
    def test_nth(self):
        """nth(): positional selection, dropna semantics, out-of-bounds
        behaviour (GH 6621) and agreement with first() (GH 7559/7287).
        """
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        g = df.groupby('A')
        assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
        assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
        assert_frame_equal(g.nth(2), df.loc[[],['B']])
        assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
        assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
        assert_frame_equal(g.nth(-3), df.loc[[],['B']])
        assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
        assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
        assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['A', 'B']].set_index('A'))
        exp = df.set_index('A')
        assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
        assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
        # with dropna, an out-of-range n yields all-NaN rows per group
        exp['B'] = np.nan
        assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
        assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
        # out of bounds, regression from 0.13.1
        # GH 6621
        df = DataFrame({'color': {0: 'green', 1: 'green', 2: 'red', 3: 'red', 4: 'red'},
                        'food': {0: 'ham', 1: 'eggs', 2: 'eggs', 3: 'ham', 4: 'pork'},
                        'two': {0: 1.5456590000000001, 1: -0.070345000000000005, 2: -2.4004539999999999, 3: 0.46206000000000003, 4: 0.52350799999999997},
                        'one': {0: 0.56573799999999996, 1: -0.9742360000000001, 2: 1.033801, 3: -0.78543499999999999, 4: 0.70422799999999997}}).set_index(['color', 'food'])
        result = df.groupby(level=0).nth(2)
        expected = df.iloc[[-1]]
        assert_frame_equal(result,expected)
        result = df.groupby(level=0).nth(3)
        expected = df.loc[[]]
        assert_frame_equal(result,expected)
        # GH 7559
        # from the vbench
        df = DataFrame(np.random.randint(1, 10, (100, 2)),dtype='int64')
        s = df[1]
        g = df[0]
        expected = s.groupby(g).first()
        expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
        assert_series_equal(expected2,expected)
        # validate first
        v = s[g==1].iloc[0]
        self.assertEqual(expected.iloc[0],v)
        self.assertEqual(expected2.iloc[0],v)
        # this is NOT the same as .first (as sorted is default!)
        # as it keeps the order in the series (and not the group order)
        # related GH 7287
        expected = s.groupby(g,sort=False).first()
        expected.index = range(1,10)
        result = s.groupby(g).nth(0,dropna='all')
        assert_series_equal(result,expected)
        # doc example
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        g = df.groupby('A')
        result = g.B.nth(0, dropna=True)
        expected = g.B.first()
        assert_series_equal(result,expected)
  252. def test_grouper_index_types(self):
  253. # related GH5375
  254. # groupby misbehaving when using a Floatlike index
  255. df = DataFrame(np.arange(10).reshape(5,2),columns=list('AB'))
  256. for index in [ tm.makeFloatIndex, tm.makeStringIndex,
  257. tm.makeUnicodeIndex, tm.makeIntIndex,
  258. tm.makeDateIndex, tm.makePeriodIndex ]:
  259. df.index = index(len(df))
  260. df.groupby(list('abcde')).apply(lambda x: x)
  261. df.index = list(reversed(df.index.tolist()))
  262. df.groupby(list('abcde')).apply(lambda x: x)
  263. def test_grouper_iter(self):
  264. self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo'])
  265. def test_empty_groups(self):
  266. # GH # 1048
  267. self.assertRaises(ValueError, self.df.groupby, [])
  268. def test_groupby_grouper(self):
  269. grouped = self.df.groupby('A')
  270. result = self.df.groupby(grouped.grouper).mean()
  271. expected = grouped.mean()
  272. assert_frame_equal(result, expected)
  273. def test_groupby_dict_mapping(self):
  274. # GH #679
  275. from pandas import Series
  276. s = Series({'T1': 5})
  277. result = s.groupby({'T1': 'T2'}).agg(sum)
  278. expected = s.groupby(['T2']).agg(sum)
  279. assert_series_equal(result, expected)
  280. s = Series([1., 2., 3., 4.], index=list('abcd'))
  281. mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
  282. result = s.groupby(mapping).mean()
  283. result2 = s.groupby(mapping).agg(np.mean)
  284. expected = s.groupby([0, 0, 1, 1]).mean()
  285. expected2 = s.groupby([0, 0, 1, 1]).mean()
  286. assert_series_equal(result, expected)
  287. assert_series_equal(result, result2)
  288. assert_series_equal(result, expected2)
  289. def test_groupby_bounds_check(self):
  290. import pandas as pd
  291. # groupby_X is code-generated, so if one variant
  292. # does, the rest probably do to
  293. a = np.array([1,2],dtype='object')
  294. b = np.array([1,2,3],dtype='object')
  295. self.assertRaises(AssertionError, pd.algos.groupby_object,a, b)
    def test_groupby_grouper_f_sanity_checked(self):
        """A grouping function that fails per-element must not silently be
        applied to the whole index instead (GH3035).
        """
        import pandas as pd
        dates = date_range('01-Jan-2013', periods=12, freq='MS')
        ts = pd.TimeSeries(np.random.randn(12), index=dates)
        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather then str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.
        self.assertRaises(AssertionError, ts.groupby,lambda key: key[0:6])
    def test_groupby_nonobject_dtype(self):
        """Grouping by a non-object key array matches its object-cast twin,
        and apply must not coerce dtypes in a mixed frame (GH 3911).
        """
        key = self.mframe.index.labels[0]
        grouped = self.mframe.groupby(key)
        result = grouped.sum()
        expected = self.mframe.groupby(key.astype('O')).sum()
        assert_frame_equal(result, expected)
        # GH 3911, mixed frame non-conversion
        df = self.df_mixed_floats.copy()
        df['value'] = lrange(len(df))
        def max_value(group):
            # row of the group holding the largest 'value'
            return group.ix[group['value'].idxmax()]
        applied = df.groupby('A').apply(max_value)
        result = applied.get_dtype_counts()
        result.sort()
        # dtype census of the applied result must match the input frame's
        expected = Series({ 'object' : 2, 'float64' : 2, 'int64' : 1 })
        expected.sort()
        assert_series_equal(result,expected)
    def test_groupby_return_type(self):
        """apply/count return the expected container type, including the
        squeeze=True reduction (GH2893, GH3596, GH5592).
        """
        # GH2893, return a reduced type
        df1 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
                         {"val1":2, "val2": 27}, {"val1":2, "val2": 12}])
        def func(dataf):
            return dataf["val2"] - dataf["val2"].mean()
        result = df1.groupby("val1", squeeze=True).apply(func)
        tm.assert_isinstance(result,Series)
        df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
                         {"val1":1, "val2": 27}, {"val1":1, "val2": 12}])
        def func(dataf):
            return dataf["val2"] - dataf["val2"].mean()
        result = df2.groupby("val1", squeeze=True).apply(func)
        tm.assert_isinstance(result,Series)
        # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
        df = DataFrame([[1,1],[1,1]],columns=['X','Y'])
        result = df.groupby('X',squeeze=False).count()
        tm.assert_isinstance(result,DataFrame)
        # GH5592
        # inconsistent return type
        df = DataFrame(dict(A = [ 'Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', 'Pony', 'Pony' ],
                            B = Series(np.arange(7),dtype='int64'),
                            C = date_range('20130101',periods=7)))
        def f(grp):
            return grp.iloc[0]
        expected = df.groupby('A').first()[['B']]
        result = df.groupby('A').apply(f)[['B']]
        assert_frame_equal(result,expected)
        # a group for which f returns None comes back as NaN in the result
        def f(grp):
            if grp.name == 'Tiger':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['B']]
        e = expected.copy()
        e.loc['Tiger'] = np.nan
        assert_frame_equal(result,e)
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['B']]
        e = expected.copy()
        e.loc['Pony'] = np.nan
        assert_frame_equal(result,e)
        # 5592 revisited, with datetimes
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['C']]
        e = df.groupby('A').first()[['C']]
        e.loc['Pony'] = np.nan
        assert_frame_equal(result,e)
        # scalar outputs
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0].loc['C']
        result = df.groupby('A').apply(f)
        e = df.groupby('A').first()['C'].copy()
        e.loc['Pony'] = np.nan
        e.name = None
        assert_series_equal(result,e)
  389. def test_agg_api(self):
  390. # GH 6337
  391. # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
  392. # different api for agg when passed custom function with mixed frame
  393. df = DataFrame({'data1':np.random.randn(5),
  394. 'data2':np.random.randn(5),
  395. 'key1':['a','a','b','b','a'],
  396. 'key2':['one','two','one','two','one']})
  397. grouped = df.groupby('key1')
  398. def peak_to_peak(arr):
  399. return arr.max() - arr.min()
  400. expected = grouped.agg([peak_to_peak])
  401. expected.columns=['data1','data2']
  402. result = grouped.agg(peak_to_peak)
  403. assert_frame_equal(result,expected)
  404. def test_agg_regression1(self):
  405. grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
  406. result = grouped.agg(np.mean)
  407. expected = grouped.mean()
  408. assert_frame_equal(result, expected)
  409. def test_agg_datetimes_mixed(self):
  410. data = [[1, '2012-01-01', 1.0],
  411. [2, '2012-01-02', 2.0],
  412. [3, None, 3.0]]
  413. df1 = DataFrame({'key': [x[0] for x in data],
  414. 'date': [x[1] for x in data],
  415. 'value': [x[2] for x in data]})
  416. data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date()
  417. if row[1] else None, row[2]] for row in data]
  418. df2 = DataFrame({'key': [x[0] for x in data],
  419. 'date': [x[1] for x in data],
  420. 'value': [x[2] for x in data]})
  421. df1['weights'] = df1['value'] / df1['value'].sum()
  422. gb1 = df1.groupby('date').aggregate(np.sum)
  423. df2['weights'] = df1['value'] / df1['value'].sum()
  424. gb2 = df2.groupby('date').aggregate(np.sum)
  425. assert(len(gb1) == len(gb2))
    def test_agg_period_index(self):
        """Aggregation keeps a PeriodIndex as the result index (GH 3579)."""
        from pandas import period_range, PeriodIndex
        prng = period_range('2012-1-1', freq='M', periods=3)
        df = DataFrame(np.random.randn(3, 2), index=prng)
        rs = df.groupby(level=0).sum()
        tm.assert_isinstance(rs.index, PeriodIndex)
        # GH 3579
        index = period_range(start='1999-01', periods=5, freq='M')
        s1 = Series(np.random.rand(len(index)), index=index)
        s2 = Series(np.random.rand(len(index)), index=index)
        series = [('s1', s1), ('s2',s2)]
        df = DataFrame.from_items(series)
        grouped = df.groupby(df.index.month)
        # smoke: iterating the groups must not raise
        list(grouped)
  440. def test_agg_must_agg(self):
  441. grouped = self.df.groupby('A')['C']
  442. self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
  443. self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])
  444. def test_agg_ser_multi_key(self):
  445. ser = self.df.C
  446. f = lambda x: x.sum()
  447. results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
  448. expected = self.df.groupby(['A', 'B']).sum()['C']
  449. assert_series_equal(results, expected)
    def test_get_group(self):
        """get_group works on axis groupings and accepts datelike keys in
        several spellings (GH 5267); multi-key lookups need full tuples.
        """
        wp = tm.makePanel()
        grouped = wp.groupby(lambda x: x.month, axis='major')
        gp = grouped.get_group(1)
        expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
        assert_panel_equal(gp, expected)
        # GH 5267
        # be datelike friendly
        df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013',
                                                 '11-Oct-2013', '11-Oct-2013', '11-Oct-2013']),
                        'label' : ['foo','foo','bar','foo','foo','bar'],
                        'VAL' : [1,2,3,4,5,6]})
        g = df.groupby('DATE')
        key = list(g.groups)[0]
        # Timestamp, datetime and string spellings of the key all resolve
        result1 = g.get_group(key)
        result2 = g.get_group(Timestamp(key).to_datetime())
        result3 = g.get_group(str(Timestamp(key)))
        assert_frame_equal(result1,result2)
        assert_frame_equal(result1,result3)
        g = df.groupby(['DATE','label'])
        key = list(g.groups)[0]
        result1 = g.get_group(key)
        result2 = g.get_group((Timestamp(key[0]).to_datetime(),key[1]))
        result3 = g.get_group((str(Timestamp(key[0])),key[1]))
        assert_frame_equal(result1,result2)
        assert_frame_equal(result1,result3)
        # must pass a same-length tuple with multiple keys
        self.assertRaises(ValueError, lambda : g.get_group('foo'))
        self.assertRaises(ValueError, lambda : g.get_group(('foo')))
        self.assertRaises(ValueError, lambda : g.get_group(('foo','bar','baz')))
    def test_agg_apply_corner(self):
        """Grouping on an all-NaN key leaves nothing to group: results are empty."""
        # nothing to group, all NA
        grouped = self.ts.groupby(self.ts * np.nan)
        assert_series_equal(grouped.sum(), Series([]))
        assert_series_equal(grouped.agg(np.sum), Series([]))
        assert_series_equal(grouped.apply(np.sum), Series([]))
        # DataFrame
        grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
        exp_df = DataFrame(columns=self.tsframe.columns, dtype=float)
        assert_frame_equal(grouped.sum(), exp_df, check_names=False)
        assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
        assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float))
    def test_agg_grouping_is_list_tuple(self):
        """A Grouping backed by a plain list or tuple still aggregates correctly."""
        from pandas.core.groupby import Grouping
        df = tm.makeTimeDataFrame()
        grouped = df.groupby(lambda x: x.year)
        grouper = grouped.grouper.groupings[0].grouper
        # swap in a list-backed grouping
        grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
        # and a tuple-backed grouping
        grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
  505. def test_agg_python_multiindex(self):
  506. grouped = self.mframe.groupby(['A', 'B'])
  507. result = grouped.agg(np.mean)
  508. expected = grouped.mean()
  509. tm.assert_frame_equal(result, expected)
  510. def test_apply_describe_bug(self):
  511. grouped = self.mframe.groupby(level='first')
  512. result = grouped.describe() # it works!
    def test_apply_issues(self):
        """apply with datelike group keys: idxmax via apply matches the direct
        call (GH 5788), and string dates are not auto-coerced (GH 5789).
        """
        # GH 5788
        s="""2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""
        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], parse_dates=[['date', 'time']])
        df = df.set_index('date_time')
        expected = df.groupby(df.index.date).idxmax()
        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
        assert_frame_equal(result,expected)
        # GH 5789
        # don't auto coerce dates
        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'])
        expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18'])
        result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
        assert_series_equal(result,expected)
  538. def test_len(self):
  539. df = tm.makeTimeDataFrame()
  540. grouped = df.groupby([lambda x: x.year,
  541. lambda x: x.month,
  542. lambda x: x.day])
  543. self.assertEqual(len(grouped), len(df))
  544. grouped = df.groupby([lambda x: x.year,
  545. lambda x: x.month])
  546. expected = len(set([(x.year, x.month) for x in df.index]))
  547. self.assertEqual(len(grouped), expected)
    def test_groups(self):
        """.groups is cached and maps each key to the labels of its members."""
        grouped = self.df.groupby(['A'])
        groups = grouped.groups
        self.assertIs(groups, grouped.groups)  # caching works
        for k, v in compat.iteritems(grouped.groups):
            self.assertTrue((self.df.ix[v]['A'] == k).all())
        grouped = self.df.groupby(['A', 'B'])
        groups = grouped.groups
        self.assertIs(groups, grouped.groups)  # caching works
        for k, v in compat.iteritems(grouped.groups):
            # multi-key grouping keys on tuples
            self.assertTrue((self.df.ix[v]['A'] == k[0]).all())
            self.assertTrue((self.df.ix[v]['B'] == k[1]).all())
    def test_aggregate_str_func(self):
        """String function names ('std', 'var', ...) behave like the methods,
        for single series, whole frames, and per-column dicts.
        """
        def _check_results(grouped):
            # single series
            result = grouped['A'].agg('std')
            expected = grouped['A'].std()
            assert_series_equal(result, expected)
            # group frame by function name
            result = grouped.aggregate('var')
            expected = grouped.var()
            assert_frame_equal(result, expected)
            # group frame by function dict
            result = grouped.agg(OrderedDict([['A', 'var'],
                                              ['B', 'std'],
                                              ['C', 'mean'],
                                              ['D', 'sem']]))
            expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
                                              ['B', grouped['B'].std()],
                                              ['C', grouped['C'].mean()],
                                              ['D', grouped['D'].sem()]]))
            assert_frame_equal(result, expected)
        by_weekday = self.tsframe.groupby(lambda x: x.weekday())
        _check_results(by_weekday)
        by_mwkday = self.tsframe.groupby([lambda x: x.month,
                                          lambda x: x.weekday()])
        _check_results(by_mwkday)
    def test_aggregate_item_by_item(self):
        """Column-by-column aggregation path for a non-cython function."""
        df = self.df.copy()
        df['E'] = ['a'] * len(self.df)
        # NOTE(review): this groups the ORIGINAL self.df, so the modified copy
        # with the 'E' column above is never exercised — looks like a leftover;
        # confirm intent before changing.
        grouped = self.df.groupby('A')
        # API change in 0.11
        # def aggfun(ser):
        #     return len(ser + 'a')
        # result = grouped.agg(aggfun)
        # self.assertEqual(len(result.columns), 1)
        aggfun = lambda ser: ser.size
        result = grouped.agg(aggfun)
        foo = (self.df.A == 'foo').sum()
        bar = (self.df.A == 'bar').sum()
        K = len(result.columns)
        # GH5782
        # odd comparisons can result here, so cast to make easy
        assert_almost_equal(result.xs('foo'), np.array([foo] * K).astype('float64'))
        assert_almost_equal(result.xs('bar'), np.array([bar] * K).astype('float64'))
        def aggfun(ser):
            return ser.size
        # aggregating an empty frame yields an empty (but DataFrame) result
        result = DataFrame().groupby(self.df.A).agg(aggfun)
        tm.assert_isinstance(result, DataFrame)
        self.assertEqual(len(result), 0)
  608. def test_agg_item_by_item_raise_typeerror(self):
  609. from numpy.random import randint
  610. df = DataFrame(randint(10, size=(20, 10)))
  611. def raiseException(df):
  612. com.pprint_thing('----------------------------------------')
  613. com.pprint_thing(df.to_string())
  614. raise TypeError
  615. self.assertRaises(TypeError, df.groupby(0).agg,
  616. raiseException)
    def test_basic_regression(self):
        """Smoke regression: mean over a float-valued grouping must not raise."""
        # regression
        T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
        result = Series(T, lrange(0, len(T)))
        # grouper is longer than the series; groupby aligns on the index
        groupings = np.random.random((1100,))
        groupings = Series(groupings, lrange(0, len(groupings))) * 10.
        grouped = result.groupby(groupings)
        grouped.mean()
  625. def test_transform(self):
  626. data = Series(np.arange(9) // 3, index=np.arange(9))
  627. index = np.arange(9)
  628. np.random.shuffle(index)
  629. data = data.reindex(index)
  630. grouped = data.groupby(lambda x: x // 3)
  631. transformed = grouped.transform(lambda x: x * x.sum())
  632. self.assertEqual(transformed[7], 12)
    def test_transform_broadcast(self):
        """transform(np.mean) broadcasts each group's mean over the original
        index, on both axes.
        """
        grouped = self.ts.groupby(lambda x: x.month)
        result = grouped.transform(np.mean)
        self.assertTrue(result.index.equals(self.ts.index))
        for _, gp in grouped:
            assert_fp_equal(result.reindex(gp.index), gp.mean())
        grouped = self.tsframe.groupby(lambda x: x.month)
        result = grouped.transform(np.mean)
        self.assertTrue(result.index.equals(self.tsframe.index))
        for _, gp in grouped:
            agged = gp.mean()
            res = result.reindex(gp.index)
            for col in self.tsframe:
                assert_fp_equal(res[col], agged[col])
        # group columns
        grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                       axis=1)
        result = grouped.transform(np.mean)
        self.assertTrue(result.index.equals(self.tsframe.index))
        self.assertTrue(result.columns.equals(self.tsframe.columns))
        for _, gp in grouped:
            agged = gp.mean(1)
            res = result.reindex(columns=gp.columns)
            for idx in gp.index:
                assert_fp_equal(res.xs(idx), agged[idx])
  658. def test_transform_bug(self):
  659. # GH 5712
  660. # transforming on a datetime column
  661. df = DataFrame(dict(A = Timestamp('20130101'), B = np.arange(5)))
  662. result = df.groupby('A')['B'].transform(lambda x: x.rank(ascending=False))
  663. expected = Series(np.arange(5,0,step=-1),name='B')
  664. assert_series_equal(result,expected)
  665. def test_transform_multiple(self):
  666. grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month])
  667. transformed = grouped.transform(lambda x: x * 2)
  668. broadcasted = grouped.transform(np.mean)
  669. def test_dispatch_transform(self):
  670. df = self.tsframe[::5].reindex(self.tsframe.index)
  671. grouped = df.groupby(lambda x: x.month)
  672. filled = grouped.fillna(method='pad')
  673. fillit = lambda x: x.fillna(method='pad')
  674. expected = df.groupby(lambda x: x.month).transform(fillit)
  675. assert_frame_equal(filled, expected)
  676. def test_transform_select_columns(self):
  677. f = lambda x: x.mean()
  678. result = self.df.groupby('A')['C', 'D'].transform(f)
  679. selection = self.df[['C', 'D']]
  680. expected = selection.groupby(self.df['A']).transform(f)
  681. assert_frame_equal(result, expected)
  682. def test_transform_exclude_nuisance(self):
  683. expected = {}
  684. grouped = self.df.groupby('A')
  685. expected['C'] = grouped['C'].transform(np.mean)
  686. expected['D'] = grouped['D'].transform(np.mean)
  687. expected = DataFrame(expected)
  688. result = self.df.groupby('A').transform(np.mean)
  689. assert_frame_equal(result, expected)
  690. def test_transform_function_aliases(self):
  691. result = self.df.groupby('A').transform('mean')
  692. expected = self.df.groupby('A').transform(np.mean)
  693. assert_frame_equal(result, expected)
  694. result = self.df.groupby('A')['C'].transform('mean')
  695. expected = self.df.groupby('A')['C'].transform(np.mean)
  696. assert_series_equal(result, expected)
    def test_with_na(self):
        """NaN group labels are dropped, for every numeric values dtype."""
        index = Index(np.arange(10))
        for dtype in ['float64','float32','int64','int32','int16','int8']:
            values = Series(np.ones(10), index, dtype=dtype)
            labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
                             'bar', nan, 'foo'], index=index)
            # this SHOULD be an int
            grouped = values.groupby(labels)
            agged = grouped.agg(len)
            expected = Series([4, 2], index=['bar', 'foo'])
            assert_series_equal(agged, expected, check_dtype=False)
            #self.assertTrue(issubclass(agged.dtype.type, np.integer))
            # explicitly return a float from my function
            def f(x):
                return float(len(x))
            agged = grouped.agg(f)
            expected = Series([4, 2], index=['bar', 'foo'])
            assert_series_equal(agged, expected, check_dtype=False)
            self.assertTrue(issubclass(agged.dtype.type, np.dtype(dtype).type))
    def test_groupby_transform_with_int(self):
        # GH 3740, make sure that we might upcast on item-by-item transform

        # floats
        df = DataFrame(dict(A = [1,1,1,2,2,2], B = Series(1,dtype='float64'), C = Series([1,2,3,1,2,3],dtype='float64'), D = 'foo'))
        result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std())
        # B is constant per group -> std is 0 -> NaN after standardizing
        expected = DataFrame(dict(B = np.nan, C = Series([-1,0,1,-1,0,1],dtype='float64')))
        assert_frame_equal(result,expected)

        # int case
        df = DataFrame(dict(A = [1,1,1,2,2,2], B = 1, C = [1,2,3,1,2,3], D = 'foo'))
        result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std())
        expected = DataFrame(dict(B = np.nan, C = [-1,0,1,-1,0,1]))
        assert_frame_equal(result,expected)

        # int that needs float conversion
        s = Series([2,3,4,10,5,-1])
        df = DataFrame(dict(A = [1,1,1,2,2,2], B = 1, C = s, D = 'foo'))
        result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std())

        # expected: standardize each 3-row half of s independently
        s1 = s.iloc[0:3]
        s1 = (s1-s1.mean())/s1.std()
        s2 = s.iloc[3:6]
        s2 = (s2-s2.mean())/s2.std()
        expected = DataFrame(dict(B = np.nan, C = concat([s1,s2])))
        assert_frame_equal(result,expected)

        # int downcasting: x*2/2 round-trips, result should stay int-valued
        result = df.groupby('A').transform(lambda x: x*2/2)
        expected = DataFrame(dict(B = 1, C = [2,3,4,10,5,-1]))
        assert_frame_equal(result,expected)
    def test_indices_concatenation_order(self):
        # GH 2808
        # apply() results with inconsistent index structure across groups
        # must raise rather than concatenate silently.

        # f1: empty groups return an empty frame with a matching
        # 2-level MultiIndex ('b', 'c') -> consistent, should work
        def f1(x):
            y = x[(x.b % 2) == 1]**2
            if y.empty:
                multiindex = MultiIndex(
                    levels = [[]]*2,
                    labels = [[]]*2,
                    names = ['b', 'c']
                )
                res = DataFrame(None,
                                columns=['a'],
                                index=multiindex)
                return res
            else:
                y = y.set_index(['b','c'])
                return y

        # f2: empty groups return a flat empty frame -> level count
        # mismatch with non-empty groups
        def f2(x):
            y = x[(x.b % 2) == 1]**2
            if y.empty:
                return DataFrame()
            else:
                y = y.set_index(['b','c'])
                return y

        # f3: empty groups return a MultiIndex with different names and a
        # different column set -> shape mismatch
        def f3(x):
            y = x[(x.b % 2) == 1]**2
            if y.empty:
                multiindex = MultiIndex(
                    levels = [[]]*2,
                    labels = [[]]*2,
                    names = ['foo', 'bar']
                )
                res = DataFrame(None,
                                columns=['a','b'],
                                index=multiindex)
                return res
            else:
                return y

        df = DataFrame({'a':[1,2,2,2],
                        'b':lrange(4),
                        'c':lrange(5,9)})

        df2 = DataFrame({'a':[3,2,2,2],
                         'b':lrange(4),
                         'c':lrange(5,9)})

        # correct result
        result1 = df.groupby('a').apply(f1)
        result2 = df2.groupby('a').apply(f1)
        assert_frame_equal(result1, result2)

        # should fail (not the same number of levels)
        self.assertRaises(AssertionError, df.groupby('a').apply, f2)
        self.assertRaises(AssertionError, df2.groupby('a').apply, f2)

        # should fail (incorrect shape)
        self.assertRaises(AssertionError, df.groupby('a').apply, f3)
        self.assertRaises(AssertionError, df2.groupby('a').apply, f3)
  796. def test_attr_wrapper(self):
  797. grouped = self.ts.groupby(lambda x: x.weekday())
  798. result = grouped.std()
  799. expected = grouped.agg(lambda x: np.std(x, ddof=1))
  800. assert_series_equal(result, expected)
  801. # this is pretty cool
  802. result = grouped.describe()
  803. expected = {}
  804. for name, gp in grouped:
  805. expected[name] = gp.describe()
  806. expected = DataFrame(expected).T
  807. assert_frame_equal(result.unstack(), expected)
  808. # get attribute
  809. result = grouped.dtype
  810. expected = grouped.agg(lambda x: x.dtype)
  811. # make sure raises error
  812. self.assertRaises(AttributeError, getattr, grouped, 'foo')
  813. def test_series_describe_multikey(self):
  814. ts = tm.makeTimeSeries()
  815. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  816. result = grouped.describe().unstack()
  817. assert_series_equal(result['mean'], grouped.mean())
  818. assert_series_equal(result['std'], grouped.std())
  819. assert_series_equal(result['min'], grouped.min())
  820. def test_series_describe_single(self):
  821. ts = tm.makeTimeSeries()
  822. grouped = ts.groupby(lambda x: x.month)
  823. result = grouped.apply(lambda x: x.describe())
  824. expected = grouped.describe()
  825. assert_series_equal(result, expected)
  826. def test_series_agg_multikey(self):
  827. ts = tm.makeTimeSeries()
  828. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  829. result = grouped.agg(np.sum)
  830. expected = grouped.sum()
  831. assert_series_equal(result, expected)
    def test_series_agg_multi_pure_python(self):
        data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                'bar', 'bar', 'bar', 'bar',
                                'foo', 'foo', 'foo'],
                          'B': ['one', 'one', 'one', 'two',
                                'one', 'one', 'one', 'two',
                                'two', 'two', 'one'],
                          'C': ['dull', 'dull', 'shiny', 'dull',
                                'dull', 'shiny', 'shiny', 'dull',
                                'shiny', 'shiny', 'shiny'],
                          'D': np.random.randn(11),
                          'E': np.random.randn(11),
                          'F': np.random.randn(11)})

        # the aggregator inspects x.base to check the values passed in are
        # views onto a base array (pure-python agg path); it must still
        # produce the same result as a plain constant aggregator
        def bad(x):
            assert(len(x.base) > 0)
            return 'foo'

        result = data.groupby(['A', 'B']).agg(bad)
        expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
        assert_frame_equal(result, expected)
  851. def test_series_index_name(self):
  852. grouped = self.df.ix[:, ['C']].groupby(self.df['A'])
  853. result = grouped.agg(lambda x: x.mean())
  854. self.assertEqual(result.index.name, 'A')
  855. def test_frame_describe_multikey(self):
  856. grouped = self.tsframe.groupby([lambda x: x.year,
  857. lambda x: x.month])
  858. result = grouped.describe()
  859. for col in self.tsframe:
  860. expected = grouped[col].describe()
  861. assert_series_equal(result[col], expected)
  862. groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
  863. 'C': 1, 'D': 1}, axis=1)
  864. result = groupedT.describe()
  865. for name, group in groupedT:
  866. assert_frame_equal(result[name], group.describe())
    def test_frame_groupby(self):
        # End-to-end smoke test of the frame groupby API: aggregate,
        # string-key grouping, transform, iteration, groups/indices.
        grouped = self.tsframe.groupby(lambda x: x.weekday())

        # aggregate: one row per weekday, all four columns kept
        aggregated = grouped.aggregate(np.mean)
        self.assertEqual(len(aggregated), 5)
        self.assertEqual(len(aggregated.columns), 4)

        # by string: grouping on a column gives the same aggregation
        tscopy = self.tsframe.copy()
        tscopy['weekday'] = [x.weekday() for x in tscopy.index]
        stragged = tscopy.groupby('weekday').aggregate(np.mean)
        assert_frame_equal(stragged, aggregated, check_names=False)

        # transform: shape is preserved
        grouped = self.tsframe.head(30).groupby(lambda x: x.weekday())
        transformed = grouped.transform(lambda x: x - x.mean())
        self.assertEqual(len(transformed), 30)
        self.assertEqual(len(transformed.columns), 4)

        # transform propagate: a scalar-per-group result is broadcast
        # back to every row of the group
        transformed = grouped.transform(lambda x: x.mean())
        for name, group in grouped:
            mean = group.mean()
            for idx in group.index:
                assert_almost_equal(transformed.xs(idx), mean)

        # iterate: each group contains only rows of its weekday
        for weekday, group in grouped:
            self.assertEqual(group.index[0].weekday(), weekday)

        # groups / group_indices agree with each other
        groups = grouped.groups
        indices = grouped.indices

        for k, v in compat.iteritems(groups):
            samething = self.tsframe.index.take(indices[k])
            self.assertTrue((samething == v).all())
  898. def test_grouping_is_iterable(self):
  899. # this code path isn't used anywhere else
  900. # not sure it's useful
  901. grouped = self.tsframe.groupby([lambda x: x.weekday(),
  902. lambda x: x.year])
  903. # test it works
  904. for g in grouped.grouper.groupings[0]:
  905. pass
    def test_frame_groupby_columns(self):
        # Grouping over the columns (axis=1) with a mapping dict.
        mapping = {
            'A': 0, 'B': 0, 'C': 1, 'D': 1
        }
        grouped = self.tsframe.groupby(mapping, axis=1)

        # aggregate: row count unchanged, columns collapsed to 2 groups
        aggregated = grouped.aggregate(np.mean)
        self.assertEqual(len(aggregated), len(self.tsframe))
        self.assertEqual(len(aggregated.columns), 2)

        # transform over axis=1 equals transform of the transpose over axis=0
        tf = lambda x: x - x.mean()
        groupedT = self.tsframe.T.groupby(mapping, axis=0)
        assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))

        # iterate: each column-group holds two of the original columns
        for k, v in grouped:
            self.assertEqual(len(v.columns), 2)
  922. def test_frame_set_name_single(self):
  923. grouped = self.df.groupby('A')
  924. result = grouped.mean()
  925. self.assertEqual(result.index.name, 'A')
  926. result = self.df.groupby('A', as_index=False).mean()
  927. self.assertNotEqual(result.index.name, 'A')
  928. result = grouped.agg(np.mean)
  929. self.assertEqual(result.index.name, 'A')
  930. result = grouped.agg({'C': np.mean, 'D': np.std})
  931. self.assertEqual(result.index.name, 'A')
  932. result = grouped['C'].mean()
  933. self.assertEqual(result.index.name, 'A')
  934. result = grouped['C'].agg(np.mean)
  935. self.assertEqual(result.index.name, 'A')
  936. result = grouped['C'].agg([np.mean, np.std])
  937. self.assertEqual(result.index.name, 'A')
  938. result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
  939. self.assertEqual(result.index.name, 'A')
  940. def test_multi_iter(self):
  941. s = Series(np.arange(6))
  942. k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
  943. k2 = np.array(['1', '2', '1', '2', '1', '2'])
  944. grouped = s.groupby([k1, k2])
  945. iterated = list(grouped)
  946. expected = [('a', '1', s[[0, 2]]),
  947. ('a', '2', s[[1]]),
  948. ('b', '1', s[[4]]),
  949. ('b', '2', s[[3, 5]])]
  950. for i, ((one, two), three) in enumerate(iterated):
  951. e1, e2, e3 = expected[i]
  952. self.assertEqual(e1, one)
  953. self.assertEqual(e2, two)
  954. assert_series_equal(three, e3)
    def test_multi_iter_frame(self):
        k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        k2 = np.array(['1', '2', '1', '2', '1', '2'])
        df = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': k1, 'k2': k2},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])

        grouped = df.groupby(['k1', 'k2'])

        # things get sorted!
        iterated = list(grouped)
        idx = df.index
        expected = [('a', '1', df.ix[idx[[4]]]),
                    ('a', '2', df.ix[idx[[3, 5]]]),
                    ('b', '1', df.ix[idx[[0, 2]]]),
                    ('b', '2', df.ix[idx[[1]]])]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            self.assertEqual(e1, one)
            self.assertEqual(e2, two)
            assert_frame_equal(three, e3)

        # don't iterate through groups with no data
        # (keys only co-occur as ('b','1') and ('a','2'))
        df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
        grouped = df.groupby(['k1', 'k2'])
        groups = {}
        for key, gp in grouped:
            groups[key] = gp
        self.assertEqual(len(groups), 2)

        # axis = 1 with level-based grouping: smoke test only
        three_levels = self.three_group.groupby(['A', 'B', 'C']).mean()
        grouped = three_levels.T.groupby(axis=1, level=(1, 2))
        for key, group in grouped:
            pass
  988. def test_multi_iter_panel(self):
  989. wp = tm.makePanel()
  990. grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()],
  991. axis=1)
  992. for (month, wd), group in grouped:
  993. exp_axis = [x for x in wp.major_axis
  994. if x.month == month and x.weekday() == wd]
  995. expected = wp.reindex(major=exp_axis)
  996. assert_panel_equal(group, expected)
    def test_multi_func(self):
        # grouping by two callables (Series.get) should be equivalent to
        # grouping by the two columns directly
        col1 = self.df['A']
        col2 = self.df['B']

        grouped = self.df.groupby([col1.get, col2.get])
        agged = grouped.mean()
        expected = self.df.groupby(['A', 'B']).mean()
        assert_frame_equal(agged.ix[:, ['C', 'D']],
                           expected.ix[:, ['C', 'D']],
                           check_names=False)  # TODO groupby get drops names

        # some "groups" with no data
        df = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                        'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])
        # only verify that it works for now
        grouped = df.groupby(['k1', 'k2'])
        grouped.agg(np.sum)
  1015. def test_multi_key_multiple_functions(self):
  1016. grouped = self.df.groupby(['A', 'B'])['C']
  1017. agged = grouped.agg([np.mean, np.std])
  1018. expected = DataFrame({'mean': grouped.agg(np.mean),
  1019. 'std': grouped.agg(np.std)})
  1020. assert_frame_equal(agged, expected)
    def test_frame_multi_key_function_list(self):
        data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                'bar', 'bar', 'bar', 'bar',
                                'foo', 'foo', 'foo'],
                          'B': ['one', 'one', 'one', 'two',
                                'one', 'one', 'one', 'two',
                                'two', 'two', 'one'],
                          'C': ['dull', 'dull', 'shiny', 'dull',
                                'dull', 'shiny', 'shiny', 'dull',
                                'shiny', 'shiny', 'shiny'],
                          'D': np.random.randn(11),
                          'E': np.random.randn(11),
                          'F': np.random.randn(11)})

        grouped = data.groupby(['A', 'B'])
        funcs = [np.mean, np.std]
        agged = grouped.agg(funcs)
        # frame-level agg with a function list should equal per-column
        # aggs concatenated under a column MultiIndex keyed by column name
        expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
                           grouped['F'].agg(funcs)],
                          keys=['D', 'E', 'F'], axis=1)
        assert(isinstance(agged.index, MultiIndex))
        assert(isinstance(expected.index, MultiIndex))
        assert_frame_equal(agged, expected)
    def test_groupby_multiple_columns(self):
        data = self.df
        grouped = data.groupby(['A', 'B'])

        # compare a two-key groupby op against nested single-key groupbys
        # assembled into a Panel
        def _check_op(op):

            result1 = op(grouped)

            expected = defaultdict(dict)
            for n1, gp1 in data.groupby('A'):
                for n2, gp2 in gp1.groupby('B'):
                    expected[n1][n2] = op(gp2.ix[:, ['C', 'D']])
            expected = dict((k, DataFrame(v)) for k, v in compat.iteritems(expected))
            expected = Panel.fromDict(expected).swapaxes(0, 1)
            expected.major_axis.name, expected.minor_axis.name = 'A', 'B'

            # a little bit crude
            for col in ['C', 'D']:
                result_col = op(grouped[col])
                exp = expected[col]
                pivoted = result1[col].unstack()
                pivoted2 = result_col.unstack()
                assert_frame_equal(pivoted.reindex_like(exp), exp)
                assert_frame_equal(pivoted2.reindex_like(exp), exp)

        _check_op(lambda x: x.sum())
        _check_op(lambda x: x.mean())

        # test single series works the same
        result = data['C'].groupby([data['A'], data['B']]).mean()
        expected = data.groupby(['A', 'B']).mean()['C']

        assert_series_equal(result, expected)
    def test_groupby_as_index_agg(self):
        grouped = self.df.groupby('A', as_index=False)

        # single-key
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

        # dict-of-funcs agg: each output column uses its own function
        result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
        expected2 = grouped.mean()
        expected2['D'] = grouped.sum()['D']
        assert_frame_equal(result2, expected2)

        # renaming via a dict on a selected column (as_index=True path)
        grouped = self.df.groupby('A', as_index=True)
        expected3 = grouped['C'].sum()
        expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
        result3 = grouped['C'].agg({'Q': np.sum})
        assert_frame_equal(result3, expected3)

        # multi-key

        grouped = self.df.groupby(['A', 'B'], as_index=False)

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

        result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
        expected2 = grouped.mean()
        expected2['D'] = grouped.sum()['D']
        assert_frame_equal(result2, expected2)

        expected3 = grouped['C'].sum()
        expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
        result3 = grouped['C'].agg({'Q': np.sum})
        assert_frame_equal(result3, expected3)
  1097. def test_multifunc_select_col_integer_cols(self):
  1098. df = self.df
  1099. df.columns = np.arange(len(df.columns))
  1100. # it works!
  1101. result = df.groupby(1, as_index=False)[2].agg({'Q': np.mean})
    def test_as_index_series_return_frame(self):
        # with as_index=False, a selected-column aggregation still
        # returns a DataFrame that includes the grouping key column(s)
        grouped = self.df.groupby('A', as_index=False)
        grouped2 = self.df.groupby(['A', 'B'], as_index=False)

        result = grouped['C'].agg(np.sum)
        expected = grouped.agg(np.sum).ix[:, ['A', 'C']]
        tm.assert_isinstance(result, DataFrame)
        assert_frame_equal(result, expected)

        result2 = grouped2['C'].agg(np.sum)
        expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']]
        tm.assert_isinstance(result2, DataFrame)
        assert_frame_equal(result2, expected2)

        # same contract for the .sum() shortcut
        result = grouped['C'].sum()
        expected = grouped.sum().ix[:, ['A', 'C']]
        tm.assert_isinstance(result, DataFrame)
        assert_frame_equal(result, expected)

        result2 = grouped2['C'].sum()
        expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']]
        tm.assert_isinstance(result2, DataFrame)
        assert_frame_equal(result2, expected2)

        # corner case: cannot select a column from an already-selected series
        self.assertRaises(Exception, grouped['C'].__getitem__,
                          'D')
    def test_groupby_as_index_cython(self):
        data = self.df

        # single-key: as_index=False puts the key in a leading column
        # with a fresh integer index
        grouped = data.groupby('A', as_index=False)
        result = grouped.mean()
        expected = data.groupby(['A']).mean()
        expected.insert(0, 'A', expected.index)
        expected.index = np.arange(len(expected))
        assert_frame_equal(result, expected)

        # multi-key: both key levels become leading columns
        grouped = data.groupby(['A', 'B'], as_index=False)
        result = grouped.mean()
        expected = data.groupby(['A', 'B']).mean()
        arrays = lzip(*expected.index._tuple_index)
        expected.insert(0, 'A', arrays[0])
        expected.insert(1, 'B', arrays[1])
        expected.index = np.arange(len(expected))
        assert_frame_equal(result, expected)
  1142. def test_groupby_as_index_series_scalar(self):
  1143. grouped = self.df.groupby(['A', 'B'], as_index=False)
  1144. # GH #421
  1145. result = grouped['C'].agg(len)
  1146. expected = grouped.agg(len).ix[:, ['A', 'B', 'C']]
  1147. assert_frame_equal(result, expected)
  1148. def test_groupby_as_index_corner(self):
  1149. self.assertRaises(TypeError, self.ts.groupby,
  1150. lambda x: x.weekday(), as_index=False)
  1151. self.assertRaises(ValueError, self.df.groupby,
  1152. lambda x: x.lower(), as_index=False, axis=1)
    def test_groupby_as_index_apply(self):
        # GH #4648 and #3417
        df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                        'user_id': [1,2,1,1,3,1],
                        'time': range(6)})

        g_as = df.groupby('user_id', as_index=True)
        g_not_as = df.groupby('user_id', as_index=False)

        # head() preserves the original index either way
        res_as = g_as.head(2).index
        res_not_as = g_not_as.head(2).index
        exp = Index([0, 1, 2, 4])
        assert_index_equal(res_as, exp)
        assert_index_equal(res_not_as, exp)

        res_as_apply = g_as.apply(lambda x: x.head(2)).index
        res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

        # apply doesn't maintain the original ordering
        # changed in GH5610 as the as_index=False returns a MI here
        exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)])
        exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])

        assert_index_equal(res_as_apply, exp_as_apply)
        assert_index_equal(res_not_as_apply, exp_not_as_apply)

        # identity apply with as_index=False keeps the original index
        ind = Index(list('abcde'))
        df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
        res = df.groupby(0, as_index=False).apply(lambda x: x).index
        assert_index_equal(res, ind)
    def test_groupby_head_tail(self):
        df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
        g_as = df.groupby('A', as_index=True)
        g_not_as = df.groupby('A', as_index=False)

        # as_index= False, much easier
        assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
        assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))

        # n=0 and negative n yield an empty frame
        empty_not_as = DataFrame(columns=df.columns)
        assert_frame_equal(empty_not_as, g_not_as.head(0))
        assert_frame_equal(empty_not_as, g_not_as.tail(0))
        assert_frame_equal(empty_not_as, g_not_as.head(-1))
        assert_frame_equal(empty_not_as, g_not_as.tail(-1))

        assert_frame_equal(df, g_not_as.head(7)) # contains all
        assert_frame_equal(df, g_not_as.tail(7))

        # as_index=True, (used to be different)
        df_as = df

        assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
        assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))

        empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
        assert_frame_equal(empty_as, g_as.head(0))
        assert_frame_equal(empty_as, g_as.tail(0))
        assert_frame_equal(empty_as, g_as.head(-1))
        assert_frame_equal(empty_as, g_as.tail(-1))

        assert_frame_equal(df_as, g_as.head(7)) # contains all
        assert_frame_equal(df_as, g_as.tail(7))

        # test with selection
        assert_frame_equal(g_as[[]].head(1), df_as.loc[[0,2], []])
        assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0,2], ['A']])
        assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0,2], ['B']])
        assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0,2]])

        assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0,2], []])
        assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0,2], ['A']])
        assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']])
        assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]])
    def test_groupby_multiple_key(self):
        df = tm.makeTimeDataFrame()
        # grouping a daily frame by (year, month, day) makes every group a
        # single row, so sum() reproduces the original values
        grouped = df.groupby([lambda x: x.year,
                              lambda x: x.month,
                              lambda x: x.day])
        agged = grouped.sum()
        assert_almost_equal(df.values, agged.values)

        # same idea over the columns of the transpose (axis=1)
        grouped = df.T.groupby([lambda x: x.year,
                                lambda x: x.month,
                                lambda x: x.day], axis=1)

        agged = grouped.agg(lambda x: x.sum(1))
        self.assertTrue(agged.index.equals(df.columns))
        assert_almost_equal(df.T.values, agged.values)

        agged = grouped.agg(lambda x: x.sum(1))
        assert_almost_equal(df.T.values, agged.values)
  1226. def test_groupby_multi_corner(self):
  1227. # test that having an all-NA column doesn't mess you up
  1228. df = self.df.copy()
  1229. df['bad'] = np.nan
  1230. agged = df.groupby(['A', 'B']).mean()
  1231. expected = self.df.groupby(['A', 'B']).mean()
  1232. expected['bad'] = np.nan
  1233. assert_frame_equal(agged, expected)
    def test_omit_nuisance(self):
        # non-numeric columns are dropped ("nuisance") from cython aggs
        grouped = self.df.groupby('A')

        result = grouped.mean()
        expected = self.df.ix[:, ['A', 'C', 'D']].groupby('A').mean()
        assert_frame_equal(result, expected)

        agged = grouped.agg(np.mean)
        exp = grouped.mean()
        assert_frame_equal(agged, exp)

        # datetime columns are also treated as nuisance for sum
        df = self.df.ix[:, ['A', 'C', 'D']]
        df['E'] = datetime.now()
        grouped = df.groupby('A')
        result = grouped.agg(np.sum)
        expected = grouped.sum()
        assert_frame_equal(result, expected)

        # won't work with axis = 1
        grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
        # NOTE(review): assigning assertRaises' return value is pointless;
        # kept for byte-compatibility
        result = self.assertRaises(TypeError, grouped.agg,
                                   lambda x: x.sum(1, numeric_only=False))
  1252. def test_omit_nuisance_python_multiple(self):
  1253. grouped = self.three_group.groupby(['A', 'B'])
  1254. agged = grouped.agg(np.mean)
  1255. exp = grouped.mean()
  1256. assert_frame_equal(agged, exp)
    def test_empty_groups_corner(self):
        # handle empty groups
        df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                        'k2': np.array(['1', '1', '1', '2', '2', '2']),
                        'k3': ['foo', 'bar'] * 3,
                        'v1': np.random.randn(6),
                        'v2': np.random.randn(6)})

        grouped = df.groupby(['k1', 'k2'])
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

        # level-based groupby on a 2-row slice: apply path must still
        # propagate the level name to the result index
        grouped = self.mframe[3:5].groupby(level=0)
        agged = grouped.apply(lambda x: x.mean())
        agged_A = grouped['A'].apply(np.mean)
        assert_series_equal(agged['A'], agged_A)
        self.assertEqual(agged.index.name, 'first')
    def test_apply_concat_preserve_names(self):
        grouped = self.three_group.groupby(['A', 'B'])

        # index name set on every group -> preserved in concatenated result
        def desc(group):
            result = group.describe()
            result.index.name = 'stat'
            return result

        # same, but the result is truncated per group
        def desc2(group):
            result = group.describe()
            result.index.name = 'stat'
            result = result[:len(group)]
            # weirdo
            return result

        # index names differ across groups -> name cannot be preserved
        def desc3(group):
            result = group.describe()

            # names are different
            result.index.name = 'stat_%d' % len(group)

            result = result[:len(group)]
            # weirdo
            return result

        result = grouped.apply(desc)
        self.assertEqual(result.index.names, ('A', 'B', 'stat'))

        result2 = grouped.apply(desc2)
        self.assertEqual(result2.index.names, ('A', 'B', 'stat'))

        result3 = grouped.apply(desc3)
        self.assertEqual(result3.index.names, ('A', 'B', None))
  1298. def test_nonsense_func(self):
  1299. df = DataFrame([0])
  1300. self.assertRaises(Exception, df.groupby, lambda x: x + 'foo')
    def test_cythonized_aggers(self):
        # every cython-accelerated aggregation must agree with applying
        # the same op group-by-group in python, with NaNs in both the
        # values and the keys
        data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan],
                'B': ['A', 'B'] * 6,
                'C': np.random.randn(12)}
        df = DataFrame(data)
        df['C'][2:10:2] = nan

        def _testit(op):
            # single column
            grouped = df.drop(['B'], axis=1).groupby('A')
            exp = {}
            for cat, group in grouped:
                exp[cat] = op(group['C'])
            exp = DataFrame({'C': exp})
            exp.index.name = 'A'
            result = op(grouped)
            assert_frame_equal(result, exp)

            # multiple columns
            grouped = df.groupby(['A', 'B'])
            expd = {}
            for (cat1, cat2), group in grouped:
                expd.setdefault(cat1, {})[cat2] = op(group['C'])
            exp = DataFrame(expd).T.stack(dropna=False)
            result = op(grouped)['C']
            assert_series_equal(result, exp)

        _testit(lambda x: x.count())
        _testit(lambda x: x.sum())
        _testit(lambda x: x.std())
        _testit(lambda x: x.var())
        _testit(lambda x: x.sem())
        _testit(lambda x: x.mean())
        _testit(lambda x: x.median())
        _testit(lambda x: x.prod())
        _testit(lambda x: x.min())
        _testit(lambda x: x.max())
  1335. def test_max_min_non_numeric(self):
  1336. # #2700
  1337. aa = DataFrame({'nn':[11,11,22,22],'ii':[1,2,3,4],'ss':4*['mama']})
  1338. result = aa.groupby('nn').max()
  1339. self.assertTrue('ss' in result)
  1340. result = aa.groupby('nn').min()
  1341. self.assertTrue('ss' in result)
  1342. def test_cython_agg_boolean(self):
  1343. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  1344. 'b': np.random.randint(0, 2, 50).astype('bool')})
  1345. result = frame.groupby('a')['b'].mean()
  1346. expected = frame.groupby('a')['b'].agg(np.mean)
  1347. assert_series_equal(result, expected)
  1348. def test_cython_agg_nothing_to_agg(self):
  1349. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  1350. 'b': ['foo', 'bar'] * 25})
  1351. self.assertRaises(DataError, frame.groupby('a')['b'].mean)
  1352. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  1353. 'b': ['foo', 'bar'] * 25})
  1354. self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
  1355. def test_cython_agg_nothing_to_agg_with_dates(self):
  1356. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  1357. 'b': ['foo', 'bar'] * 25,
  1358. 'dates': pd.date_range('now', periods=50,
  1359. freq='T')})
  1360. with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"):
  1361. frame.groupby('b').dates.mean()
  1362. def test_groupby_timedelta_cython_count(self):
  1363. df = DataFrame({'g': list('ab' * 2),
  1364. 'delt': np.arange(4).astype('timedelta64[ns]')})
  1365. expected = Series([2, 2], index=['a', 'b'], name='delt')
  1366. result = df.groupby('g').delt.count()
  1367. tm.assert_series_equal(expected, result)
    def test_cython_agg_frame_columns(self):
        # #2113
        # the call is repeated deliberately to exercise any caching in the
        # column-axis groupby path
        df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})

        result = df.groupby(level=0, axis='columns').mean()
        result = df.groupby(level=0, axis='columns').mean()
        result = df.groupby(level=0, axis='columns').mean()
        _ = df.groupby(level=0, axis='columns').mean()
    def test_wrap_aggregated_output_multindex(self):
        df = self.mframe.T
        df['baz', 'two'] = 'peekaboo'

        keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
        # aggregated output keeps the MultiIndex on the columns
        agged = df.groupby(keys).agg(np.mean)
        tm.assert_isinstance(agged.columns, MultiIndex)

        # an aggregator that raises TypeError for one column should cause
        # just that column to be dropped from the output
        def aggfun(ser):
            if ser.name == ('foo', 'one'):
                raise TypeError
            else:
                return ser.sum()
        agged2 = df.groupby(keys).aggregate(aggfun)
        self.assertEqual(len(agged2.columns) + 1, len(df.columns))
    def test_groupby_level(self):
        # groupby(level=...) should equal grouping by the level's values,
        # and should carry the level name to the result index
        frame = self.mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0).sum()
        result1 = frame.groupby(level=1).sum()

        expected0 = frame.groupby(deleveled['first'].values).sum()
        expected1 = frame.groupby(deleveled['second'].values).sum()

        expected0 = expected0.reindex(frame.index.levels[0])
        expected1 = expected1.reindex(frame.index.levels[1])

        self.assertEqual(result0.index.name, 'first')
        self.assertEqual(result1.index.name, 'second')

        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)
        self.assertEqual(result0.index.name, frame.index.names[0])
        self.assertEqual(result1.index.name, frame.index.names[1])

        # groupby level name
        result0 = frame.groupby(level='first').sum()
        result1 = frame.groupby(level='second').sum()
        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)

        # axis=1
        result0 = frame.T.groupby(level=0, axis=1).sum()
        result1 = frame.T.groupby(level=1, axis=1).sum()
        assert_frame_equal(result0, expected0.T)
        assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        self.assertRaises(ValueError, self.df.groupby, level=1)
  1415. def test_groupby_level_index_names(self):
  1416. ## GH4014 this used to raise ValueError since 'exp'>1 (in py2)
  1417. df = DataFrame({'exp' : ['A']*3 + ['B']*3, 'var1' : lrange(6),}).set_index('exp')
  1418. df.groupby(level='exp')
  1419. self.assertRaises(ValueError, df.groupby, level='foo')
    def test_groupby_level_with_nas(self):
        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           labels=[[1, 1, 1, 1, 0, 0, 0, 0],
                                   [0, 1, 2, 3, 0, 1, 2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0).sum()
        expected = Series([22., 6.], index=[1, 0])
        assert_series_equal(result, expected)

        # a -1 label marks a missing key; that row (value 4.0) is dropped,
        # so group 1 sums to 18 instead of 22
        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           labels=[[1, 1, 1, 1, -1, 0, 0, 0],
                                   [0, 1, 2, 3, 0, 1, 2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0).sum()
        expected = Series([18., 6.], index=[1, 0])
        assert_series_equal(result, expected)
  1437. def test_groupby_level_apply(self):
  1438. frame = self.mframe
  1439. result = frame.groupby(level=0).count()
  1440. self.assertEqual(result.index.name, 'first')
  1441. result = frame.groupby(level=1).count()
  1442. self.assertEqual(result.index.name, 'second')
  1443. result = frame['A'].groupby(level=0).count()
  1444. self.assertEqual(result.index.name, 'first')
    def test_groupby_level_mapper(self):
        # a dict mapper combined with level= maps the level values before
        # grouping; compare against mapping the values by hand
        frame = self.mframe
        deleveled = frame.reset_index()

        mapper0 = {'foo': 0, 'bar': 0,
                   'baz': 1, 'qux': 1}
        mapper1 = {'one': 0, 'two': 0, 'three': 1}

        result0 = frame.groupby(mapper0, level=0).sum()
        result1 = frame.groupby(mapper1, level=1).sum()

        mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
        mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
        expected0 = frame.groupby(mapped_level0).sum()
        expected1 = frame.groupby(mapped_level1).sum()
        expected0.index.name, expected1.index.name = 'first', 'second'

        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)
  1460. def test_groupby_level_0_nonmulti(self):
  1461. # #1313
  1462. a = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1,
  1463. 4, 5, 2, 6], name='foo'))
  1464. result = a.groupby(level=0).sum()
  1465. self.assertEqual(result.index.name, a.index.name)
  1466. def test_level_preserve_order(self):
  1467. grouped = self.mframe.groupby(level=0)
  1468. exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3])
  1469. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  1470. def test_grouping_labels(self):
  1471. grouped = self.mframe.groupby(self.mframe.index.get_level_values(0))
  1472. exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3])
  1473. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  1474. def test_cython_fail_agg(self):
  1475. dr = bdate_range('1/1/2000', periods=50)
  1476. ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)
  1477. grouped = ts.groupby(lambda x: x.month)
  1478. summed = grouped.sum()
  1479. expected = grouped.agg(np.sum)
  1480. assert_series_equal(summed, expected)
  1481. def test_apply_series_to_frame(self):
  1482. def f(piece):
  1483. return DataFrame({'value': piece,
  1484. 'demeaned': piece - piece.mean(),
  1485. 'logged': np.log(piece)})
  1486. dr = bdate_range('1/1/2000', periods=100)
  1487. ts = Series(np.random.randn(100), index=dr)
  1488. grouped = ts.groupby(lambda x: x.month)
  1489. result = grouped.apply(f)
  1490. tm.assert_isinstance(result, DataFrame)
  1491. self.assertTrue(result.index.equals(ts.index))
  1492. def test_apply_series_yield_constant(self):
  1493. result = self.df.groupby(['A', 'B'])['C'].apply(len)
  1494. self.assertEqual(result.index.names[:2], ('A', 'B'))
  1495. def test_apply_frame_to_series(self):
  1496. grouped = self.df.groupby(['A', 'B'])
  1497. result = grouped.apply(len)
  1498. expected = grouped.count()['C']
  1499. self.assertTrue(result.index.equals(expected.index))
  1500. self.assert_numpy_array_equal(result.values, expected.values)
  1501. def test_apply_frame_concat_series(self):
  1502. def trans(group):
  1503. return group.groupby('B')['C'].sum().order()[:2]
  1504. def trans2(group):
  1505. grouped = group.groupby(df.reindex(group.index)['B'])
  1506. return grouped.sum().order()[:2]
  1507. df = DataFrame({'A': np.random.randint(0, 5, 1000),
  1508. 'B': np.random.randint(0, 5, 1000),
  1509. 'C': np.random.randn(1000)})
  1510. result = df.groupby('A').apply(trans)
  1511. exp = df.groupby('A')['C'].apply(trans2)
  1512. assert_series_equal(result, exp)
  1513. def test_apply_transform(self):
  1514. grouped = self.ts.groupby(lambda x: x.month)
  1515. result = grouped.apply(lambda x: x * 2)
  1516. expected = grouped.transform(lambda x: x * 2)
  1517. assert_series_equal(result, expected)
  1518. def test_apply_multikey_corner(self):
  1519. grouped = self.tsframe.groupby([lambda x: x.year,
  1520. lambda x: x.month])
  1521. def f(group):
  1522. return group.sort('A')[-5:]
  1523. result = grouped.apply(f)
  1524. for key, group in grouped:
  1525. assert_frame_equal(result.ix[key], f(group))
  1526. def test_mutate_groups(self):
  1527. # GH3380
  1528. mydf = DataFrame({
  1529. 'cat1' : ['a'] * 8 + ['b'] * 6,
  1530. 'cat2' : ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + ['d'] * 2 + ['e'] * 2,
  1531. 'cat3' : lmap(lambda x: 'g%s' % x, lrange(1,15)),
  1532. 'val' : np.random.randint(100, size=14),
  1533. })
  1534. def f_copy(x):
  1535. x = x.copy()
  1536. x['rank'] = x.val.rank(method='min')
  1537. return x.groupby('cat2')['rank'].min()
  1538. def f_no_copy(x):
  1539. x['rank'] = x.val.rank(method='min')
  1540. return x.groupby('cat2')['rank'].min()
  1541. grpby_copy = mydf.groupby('cat1').apply(f_copy)
  1542. grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy)
  1543. assert_series_equal(grpby_copy,grpby_no_copy)
  1544. def test_apply_chunk_view(self):
  1545. # Low level tinkering could be unsafe, make sure not
  1546. df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
  1547. 'value': lrange(9)})
  1548. # return view
  1549. f = lambda x: x[:2]
  1550. result = df.groupby('key', group_keys=False).apply(f)
  1551. expected = df.take([0, 1, 3, 4, 6, 7])
  1552. assert_frame_equal(result, expected)
  1553. def test_apply_no_name_column_conflict(self):
  1554. df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
  1555. 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
  1556. 'value': lrange(10)[::-1]})
  1557. # it works! #2605
  1558. grouped = df.groupby(['name', 'name2'])
  1559. grouped.apply(lambda x: x.sort('value'))
  1560. def test_groupby_series_indexed_differently(self):
  1561. s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
  1562. index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
  1563. s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
  1564. index=Index(['a', 'b', 'd', 'f', 'g', 'h']))
  1565. grouped = s1.groupby(s2)
  1566. agged = grouped.mean()
  1567. exp = s1.groupby(s2.reindex(s1.index).get).mean()
  1568. assert_series_equal(agged, exp)
    def test_groupby_with_hier_columns(self):
        """Grouping a frame with hierarchical (MultiIndex) columns must
        preserve the column MultiIndex through mean/agg/apply."""
        tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                             'foo', 'foo', 'qux', 'qux'],
                            ['one', 'two', 'one', 'two',
                             'one', 'two', 'one', 'two']]))
        index = MultiIndex.from_tuples(tuples)
        columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                          ('B', 'cat'), ('A', 'dog')])
        df = DataFrame(np.random.randn(8, 4), index=index,
                       columns=columns)
        # row grouping keeps the full hierarchical columns ...
        result = df.groupby(level=0).mean()
        self.assertTrue(result.columns.equals(columns))
        # ... while column grouping keeps the row index
        result = df.groupby(level=0, axis=1).mean()
        self.assertTrue(result.index.equals(df.index))
        result = df.groupby(level=0).agg(np.mean)
        self.assertTrue(result.columns.equals(columns))
        result = df.groupby(level=0).apply(lambda x: x.mean())
        self.assertTrue(result.columns.equals(columns))
        # aggregating along axis=1 collapses to the top column level
        result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
        self.assertTrue(result.columns.equals(Index(['A', 'B'])))
        self.assertTrue(result.index.equals(df.index))
        # add a nuisance column; mean() should silently drop it
        sorted_columns, _ = columns.sortlevel(0)
        df['A', 'foo'] = 'bar'
        result = df.groupby(level=0).mean()
        self.assertTrue(result.columns.equals(df.columns[:-1]))
    def test_pass_args_kwargs(self):
        """Positional and keyword arguments flow through agg/apply/transform
        to the passed function identically."""
        from numpy import percentile
        def f(x, q=None, axis=0):
            return percentile(x, q, axis=axis)
        g = lambda x: percentile(x, 80, axis=0)
        # Series
        ts_grouped = self.ts.groupby(lambda x: x.month)
        agg_result = ts_grouped.agg(percentile, 80, axis=0)
        apply_result = ts_grouped.apply(percentile, 80, axis=0)
        trans_result = ts_grouped.transform(percentile, 80, axis=0)
        agg_expected = ts_grouped.quantile(.8)
        trans_expected = ts_grouped.transform(g)
        assert_series_equal(apply_result, agg_expected)
        assert_series_equal(agg_result, agg_expected)
        assert_series_equal(trans_result, trans_expected)
        # same answers when the args are passed as keywords
        agg_result = ts_grouped.agg(f, q=80)
        apply_result = ts_grouped.apply(f, q=80)
        trans_result = ts_grouped.transform(f, q=80)
        assert_series_equal(agg_result, agg_expected)
        assert_series_equal(apply_result, agg_expected)
        assert_series_equal(trans_result, trans_expected)
        # DataFrame
        df_grouped = self.tsframe.groupby(lambda x: x.month)
        agg_result = df_grouped.agg(percentile, 80, axis=0)
        apply_result = df_grouped.apply(DataFrame.quantile, .8)
        expected = df_grouped.quantile(.8)
        assert_frame_equal(apply_result, expected)
        assert_frame_equal(agg_result, expected)
        agg_result = df_grouped.agg(f, q=80)
        apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
        assert_frame_equal(agg_result, expected)
        assert_frame_equal(apply_result, expected)
  1627. # def test_cython_na_bug(self):
  1628. # values = np.random.randn(10)
  1629. # shape = (5, 5)
  1630. # label_list = [np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype=np.int32),
  1631. # np.array([1, 2, 3, 4, 0, 1, 2, 3, 3, 4], dtype=np.int32)]
  1632. # lib.group_aggregate(values, label_list, shape)
  1633. def test_size(self):
  1634. grouped = self.df.groupby(['A', 'B'])
  1635. result = grouped.size()
  1636. for key, group in grouped:
  1637. self.assertEqual(result[key], len(group))
  1638. grouped = self.df.groupby('A')
  1639. result = grouped.size()
  1640. for key, group in grouped:
  1641. self.assertEqual(result[key], len(group))
  1642. grouped = self.df.groupby('B')
  1643. result = grouped.size()
  1644. for key, group in grouped:
  1645. self.assertEqual(result[key], len(group))
  1646. def test_count(self):
  1647. # GH5610
  1648. # count counts non-nulls
  1649. df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]],
  1650. columns=['A', 'B', 'C'])
  1651. count_as = df.groupby('A').count()
  1652. count_not_as = df.groupby('A', as_index=False).count()
  1653. expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], index=[1,3])
  1654. expected.index.name='A'
  1655. assert_frame_equal(count_not_as, expected.reset_index())
  1656. assert_frame_equal(count_as, expected)
  1657. count_B = df.groupby('A')['B'].count()
  1658. assert_series_equal(count_B, expected['B'])
  1659. def test_count_object(self):
  1660. df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3,
  1661. 'c': [2] * 3 + [3] * 3})
  1662. result = df.groupby('c').a.count()
  1663. expected = pd.Series([3, 3], index=[2, 3], name='a')
  1664. tm.assert_series_equal(result, expected)
  1665. df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
  1666. 'c': [2] * 3 + [3] * 3})
  1667. result = df.groupby('c').a.count()
  1668. expected = pd.Series([1, 3], index=[2, 3], name='a')
  1669. tm.assert_series_equal(result, expected)
    def test_non_cython_api(self):
        """GH5610: python-path (non-cython) aggregations must not include
        the grouper column in the result; cumsum covers GH5614."""
        df = DataFrame([[1, 2, 'foo'], [1, nan, 'bar',], [3, nan, 'baz']], columns=['A', 'B','C'])
        g = df.groupby('A')
        gni = df.groupby('A',as_index=False)
        # mad
        expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3])
        expected.index.name = 'A'
        result = g.mad()
        assert_frame_equal(result,expected)
        # with as_index=False the key stays as a data column
        expected = DataFrame([[0.,0.],[0,nan]],columns=['A','B'],index=[0,1])
        result = gni.mad()
        assert_frame_equal(result,expected)
        # describe
        expected = DataFrame(dict(B = concat([df.loc[[0,1],'B'].describe(),df.loc[[2],'B'].describe()],keys=[1,3])))
        expected.index.names = ['A',None]
        result = g.describe()
        assert_frame_equal(result,expected)
        expected = concat([df.loc[[0,1],['A','B']].describe(),df.loc[[2],['A','B']].describe()],keys=[0,1])
        result = gni.describe()
        assert_frame_equal(result,expected)
        # any
        expected = DataFrame([[True, True],[False, True]],columns=['B','C'],index=[1,3])
        expected.index.name = 'A'
        result = g.any()
        assert_frame_equal(result,expected)
        # idxmax
        expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3])
        expected.index.name = 'A'
        result = g.idxmax()
        assert_frame_equal(result,expected)
        # cumsum (GH5614)
        df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=['A', 'B', 'C'])
        expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
        result = df.groupby('A').cumsum()
        assert_frame_equal(result,expected)
        # as_index=False keeps 'A' and upcasts everything to float64
        expected = DataFrame([[1, 2, np.nan], [2, np.nan, 9], [3, 4, 9]], columns=['A', 'B', 'C']).astype('float64')
        result = df.groupby('A', as_index=False).cumsum()
        assert_frame_equal(result,expected)
  1710. def test_grouping_ndarray(self):
  1711. grouped = self.df.groupby(self.df['A'].values)
  1712. result = grouped.sum()
  1713. expected = self.df.groupby('A').sum()
  1714. assert_frame_equal(result, expected, check_names=False) # Note: no names when grouping by value
  1715. def test_agg_consistency(self):
  1716. # agg with ([]) and () not consistent
  1717. # GH 6715
  1718. def P1(a):
  1719. try:
  1720. return np.percentile(a.dropna(), q=1)
  1721. except:
  1722. return np.nan
  1723. import datetime as dt
  1724. df = DataFrame({'col1':[1,2,3,4],
  1725. 'col2':[10,25,26,31],
  1726. 'date':[dt.date(2013,2,10),dt.date(2013,2,10),dt.date(2013,2,11),dt.date(2013,2,11)]})
  1727. g = df.groupby('date')
  1728. expected = g.agg([P1])
  1729. expected.columns = expected.columns.levels[0]
  1730. result = g.agg(P1)
  1731. assert_frame_equal(result, expected)
  1732. def test_apply_typecast_fail(self):
  1733. df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
  1734. 'c': np.tile(['a', 'b', 'c'], 2),
  1735. 'v': np.arange(1., 7.)})
  1736. def f(group):
  1737. v = group['v']
  1738. group['v2'] = (v - v.min()) / (v.max() - v.min())
  1739. return group
  1740. result = df.groupby('d').apply(f)
  1741. expected = df.copy()
  1742. expected['v2'] = np.tile([0., 0.5, 1], 2)
  1743. assert_frame_equal(result, expected)
  1744. def test_apply_multiindex_fail(self):
  1745. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  1746. [1, 2, 3, 1, 2, 3]])
  1747. df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
  1748. 'c': np.tile(['a', 'b', 'c'], 2),
  1749. 'v': np.arange(1., 7.)}, index=index)
  1750. def f(group):
  1751. v = group['v']
  1752. group['v2'] = (v - v.min()) / (v.max() - v.min())
  1753. return group
  1754. result = df.groupby('d').apply(f)
  1755. expected = df.copy()
  1756. expected['v2'] = np.tile([0., 0.5, 1], 2)
  1757. assert_frame_equal(result, expected)
  1758. def test_apply_corner(self):
  1759. result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
  1760. expected = self.tsframe * 2
  1761. assert_frame_equal(result, expected)
  1762. def test_apply_without_copy(self):
  1763. # GH 5545
  1764. # returning a non-copy in an applied function fails
  1765. data = DataFrame({'id_field' : [100, 100, 200, 300], 'category' : ['a','b','c','c'], 'value' : [1,2,3,4]})
  1766. def filt1(x):
  1767. if x.shape[0] == 1:
  1768. return x.copy()
  1769. else:
  1770. return x[x.category == 'c']
  1771. def filt2(x):
  1772. if x.shape[0] == 1:
  1773. return x
  1774. else:
  1775. return x[x.category == 'c']
  1776. expected = data.groupby('id_field').apply(filt1)
  1777. result = data.groupby('id_field').apply(filt2)
  1778. assert_frame_equal(result,expected)
  1779. def test_apply_use_categorical_name(self):
  1780. from pandas import qcut
  1781. cats = qcut(self.df.C, 4)
  1782. def get_stats(group):
  1783. return {'min': group.min(), 'max': group.max(),
  1784. 'count': group.count(), 'mean': group.mean()}
  1785. result = self.df.groupby(cats).D.apply(get_stats)
  1786. self.assertEqual(result.index.names[0], 'C')
  1787. def test_apply_corner_cases(self):
  1788. # #535, can't use sliding iterator
  1789. N = 1000
  1790. labels = np.random.randint(0, 100, size=N)
  1791. df = DataFrame({'key': labels,
  1792. 'value1': np.random.randn(N),
  1793. 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
  1794. grouped = df.groupby('key')
  1795. def f(g):
  1796. g['value3'] = g['value1'] * 2
  1797. return g
  1798. result = grouped.apply(f)
  1799. self.assertTrue('value3' in result)
  1800. def test_transform_mixed_type(self):
  1801. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  1802. [1, 2, 3, 1, 2, 3]])
  1803. df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
  1804. 'c': np.tile(['a', 'b', 'c'], 2),
  1805. 'v': np.arange(1., 7.)}, index=index)
  1806. def f(group):
  1807. group['g'] = group['d'] * 2
  1808. return group[:1]
  1809. grouped = df.groupby('c')
  1810. result = grouped.apply(f)
  1811. self.assertEqual(result['d'].dtype, np.float64)
  1812. for key, group in grouped:
  1813. res = f(group)
  1814. assert_frame_equal(res, result.ix[key])
  1815. def test_groupby_wrong_multi_labels(self):
  1816. from pandas import read_csv
  1817. data = """index,foo,bar,baz,spam,data
  1818. 0,foo1,bar1,baz1,spam2,20
  1819. 1,foo1,bar2,baz1,spam3,30
  1820. 2,foo2,bar2,baz1,spam2,40
  1821. 3,foo1,bar1,baz2,spam1,50
  1822. 4,foo3,bar1,baz2,spam1,60"""
  1823. data = read_csv(StringIO(data), index_col=0)
  1824. grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])
  1825. result = grouped.agg(np.mean)
  1826. expected = grouped.mean()
  1827. assert_frame_equal(result, expected)
  1828. def test_groupby_series_with_name(self):
  1829. result = self.df.groupby(self.df['A']).mean()
  1830. result2 = self.df.groupby(self.df['A'], as_index=False).mean()
  1831. self.assertEqual(result.index.name, 'A')
  1832. self.assertIn('A', result2)
  1833. result = self.df.groupby([self.df['A'], self.df['B']]).mean()
  1834. result2 = self.df.groupby([self.df['A'], self.df['B']],
  1835. as_index=False).mean()
  1836. self.assertEqual(result.index.names, ('A', 'B'))
  1837. self.assertIn('A', result2)
  1838. self.assertIn('B', result2)
  1839. def test_seriesgroupby_name_attr(self):
  1840. # GH 6265
  1841. result = self.df.groupby('A')['C']
  1842. self.assertEqual(result.count().name, 'C')
  1843. self.assertEqual(result.mean().name, 'C')
  1844. testFunc = lambda x: np.sum(x)*2
  1845. self.assertEqual(result.agg(testFunc).name, 'C')
  1846. def test_groupby_name_propagation(self):
  1847. # GH 6124
  1848. def summarize(df, name=None):
  1849. return Series({
  1850. 'count': 1,
  1851. 'mean': 2,
  1852. 'omissions': 3,
  1853. }, name=name)
  1854. def summarize_random_name(df):
  1855. # Provide a different name for each Series. In this case, groupby
  1856. # should not attempt to propagate the Series name since they are
  1857. # inconsistent.
  1858. return Series({
  1859. 'count': 1,
  1860. 'mean': 2,
  1861. 'omissions': 3,
  1862. }, name=df.iloc[0]['A'])
  1863. metrics = self.df.groupby('A').apply(summarize)
  1864. self.assertEqual(metrics.columns.name, None)
  1865. metrics = self.df.groupby('A').apply(summarize, 'metrics')
  1866. self.assertEqual(metrics.columns.name, 'metrics')
  1867. metrics = self.df.groupby('A').apply(summarize_random_name)
  1868. self.assertEqual(metrics.columns.name, None)
  1869. def test_groupby_nonstring_columns(self):
  1870. df = DataFrame([np.arange(10) for x in range(10)])
  1871. grouped = df.groupby(0)
  1872. result = grouped.mean()
  1873. expected = df.groupby(df[0]).mean()
  1874. del expected[0]
  1875. assert_frame_equal(result, expected)
  1876. def test_cython_grouper_series_bug_noncontig(self):
  1877. arr = np.empty((100, 100))
  1878. arr.fill(np.nan)
  1879. obj = Series(arr[:, 0], index=lrange(100))
  1880. inds = np.tile(lrange(10), 10)
  1881. result = obj.groupby(inds).agg(Series.median)
  1882. self.assertTrue(result.isnull().all())
  1883. def test_series_grouper_noncontig_index(self):
  1884. index = Index([tm.rands(10) for _ in range(100)])
  1885. values = Series(np.random.randn(50), index=index[::2])
  1886. labels = np.random.randint(0, 5, 50)
  1887. # it works!
  1888. grouped = values.groupby(labels)
  1889. # accessing the index elements causes segfault
  1890. f = lambda x: len(set(map(id, x.index)))
  1891. grouped.agg(f)
  1892. def test_convert_objects_leave_decimal_alone(self):
  1893. from decimal import Decimal
  1894. s = Series(lrange(5))
  1895. labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')
  1896. def convert_fast(x):
  1897. return Decimal(str(x.mean()))
  1898. def convert_force_pure(x):
  1899. # base will be length 0
  1900. assert(len(x.base) > 0)
  1901. return Decimal(str(x.mean()))
  1902. grouped = s.groupby(labels)
  1903. result = grouped.agg(convert_fast)
  1904. self.assertEqual(result.dtype, np.object_)
  1905. tm.assert_isinstance(result[0], Decimal)
  1906. result = grouped.agg(convert_force_pure)
  1907. self.assertEqual(result.dtype, np.object_)
  1908. tm.assert_isinstance(result[0], Decimal)
  1909. def test_apply_with_mixed_dtype(self):
  1910. # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
  1911. df = DataFrame({'foo1' : ['one', 'two', 'two', 'three', 'one', 'two'],
  1912. 'foo2' : np.random.randn(6)})
  1913. result = df.apply(lambda x: x, axis=1)
  1914. assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())
  1915. # GH 3610 incorrect dtype conversion with as_index=False
  1916. df = DataFrame({"c1" : [1,2,6,6,8]})
  1917. df["c2"] = df.c1/2.0
  1918. result1 = df.groupby("c2").mean().reset_index().c2
  1919. result2 = df.groupby("c2", as_index=False).mean().c2
  1920. assert_series_equal(result1,result2)
  1921. def test_groupby_aggregation_mixed_dtype(self):
  1922. # GH 6212
  1923. expected = DataFrame({
  1924. 'v1': [5,5,7,np.nan,3,3,4,1],
  1925. 'v2': [55,55,77,np.nan,33,33,44,11]},
  1926. index=MultiIndex.from_tuples([(1,95),(1,99),(2,95),(2,99),('big','damp'),
  1927. ('blue','dry'),('red','red'),('red','wet')],
  1928. names=['by1','by2']))
  1929. df = DataFrame({
  1930. 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9],
  1931. 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99],
  1932. 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
  1933. 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan,
  1934. np.nan]
  1935. })
  1936. g = df.groupby(['by1','by2'])
  1937. result = g[['v1','v2']].mean()
  1938. assert_frame_equal(result,expected)
  1939. def test_groupby_dtype_inference_empty(self):
  1940. # GH 6733
  1941. df = DataFrame({'x': [], 'range': np.arange(0,dtype='int64')})
  1942. result = df.groupby('x').first()
  1943. expected = DataFrame({'range' : Series([],index=Index([],name='x'),dtype='int64') })
  1944. assert_frame_equal(result,expected,by_blocks=True)
  1945. def test_groupby_list_infer_array_like(self):
  1946. result = self.df.groupby(list(self.df['A'])).mean()
  1947. expected = self.df.groupby(self.df['A']).mean()
  1948. assert_frame_equal(result, expected, check_names=False)
  1949. self.assertRaises(Exception, self.df.groupby, list(self.df['A'][:-1]))
  1950. # pathological case of ambiguity
  1951. df = DataFrame({'foo': [0, 1], 'bar': [3, 4],
  1952. 'val': np.random.randn(2)})
  1953. result = df.groupby(['foo', 'bar']).mean()
  1954. expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
  1955. def test_dictify(self):
  1956. dict(iter(self.df.groupby('A')))
  1957. dict(iter(self.df.groupby(['A', 'B'])))
  1958. dict(iter(self.df['C'].groupby(self.df['A'])))
  1959. dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']])))
  1960. dict(iter(self.df.groupby('A')['C']))
  1961. dict(iter(self.df.groupby(['A', 'B'])['C']))
  1962. def test_sparse_friendly(self):
  1963. sdf = self.df[['C', 'D']].to_sparse()
  1964. panel = tm.makePanel()
  1965. tm.add_nans(panel)
  1966. def _check_work(gp):
  1967. gp.mean()
  1968. gp.agg(np.mean)
  1969. dict(iter(gp))
  1970. # it works!
  1971. _check_work(sdf.groupby(lambda x: x // 2))
  1972. _check_work(sdf['C'].groupby(lambda x: x // 2))
  1973. _check_work(sdf.groupby(self.df['A']))
  1974. # do this someday
  1975. # _check_work(panel.groupby(lambda x: x.month, axis=1))
    def test_panel_groupby(self):
        """Panel groupby over the items, major and minor axes."""
        self.panel = tm.makePanel()
        tm.add_nans(self.panel)
        # group items into two buckets; mean() and agg(mean) must agree
        grouped = self.panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1},
                                     axis='items')
        agged = grouped.mean()
        agged2 = grouped.agg(lambda x: x.mean('items'))
        tm.assert_panel_equal(agged, agged2)
        self.assert_numpy_array_equal(agged.items, [0, 1])
        # group the major axis by month
        grouped = self.panel.groupby(lambda x: x.month, axis='major')
        agged = grouped.mean()
        self.assert_numpy_array_equal(agged.major_axis, sorted(list(set(self.panel.major_axis.month))))
        # group minor-axis columns into two buckets
        grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                     axis='minor')
        agged = grouped.mean()
        self.assert_numpy_array_equal(agged.minor_axis, [0, 1])
  1992. def test_numpy_groupby(self):
  1993. from pandas.core.groupby import numpy_groupby
  1994. data = np.random.randn(100, 100)
  1995. labels = np.random.randint(0, 10, size=100)
  1996. df = DataFrame(data)
  1997. result = df.groupby(labels).sum().values
  1998. expected = numpy_groupby(data, labels)
  1999. assert_almost_equal(result, expected)
  2000. result = df.groupby(labels, axis=1).sum().values
  2001. expected = numpy_groupby(data, labels, axis=1)
  2002. assert_almost_equal(result, expected)
  2003. def test_groupby_2d_malformed(self):
  2004. d = DataFrame(index=lrange(2))
  2005. d['group'] = ['g1', 'g2']
  2006. d['zeros'] = [0, 0]
  2007. d['ones'] = [1, 1]
  2008. d['label'] = ['l1', 'l2']
  2009. tmp = d.groupby(['group']).mean()
  2010. res_values = np.array([[0., 1.], [0., 1.]])
  2011. self.assert_numpy_array_equal(tmp.columns, ['zeros', 'ones'])
  2012. self.assert_numpy_array_equal(tmp.values, res_values)
  2013. def test_int32_overflow(self):
  2014. B = np.concatenate((np.arange(10000), np.arange(10000),
  2015. np.arange(5000)))
  2016. A = np.arange(25000)
  2017. df = DataFrame({'A': A, 'B': B,
  2018. 'C': A, 'D': B,
  2019. 'E': np.random.randn(25000)})
  2020. left = df.groupby(['A', 'B', 'C', 'D']).sum()
  2021. right = df.groupby(['D', 'C', 'B', 'A']).sum()
  2022. self.assertEqual(len(left), len(right))
    def test_int64_overflow(self):
        """Eight-key groupby exercises the int64 group-code path; results
        must be lexsorted and agree with grouping by materialized tuples."""
        B = np.concatenate((np.arange(1000), np.arange(1000),
                            np.arange(500)))
        A = np.arange(2500)
        df = DataFrame({'A': A, 'B': B,
                        'C': A, 'D': B,
                        'E': A, 'F': B,
                        'G': A, 'H': B,
                        'values': np.random.randn(2500)})
        lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
        rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])
        left = lg.sum()['values']
        right = rg.sum()['values']
        # results come out lexsorted regardless of key order
        exp_index, _ = left.index.sortlevel(0)
        self.assertTrue(left.index.equals(exp_index))
        exp_index, _ = right.index.sortlevel(0)
        self.assertTrue(right.index.equals(exp_index))
        # compare against grouping by materialized key tuples
        tups = list(map(tuple, df[['A', 'B', 'C', 'D',
                                   'E', 'F', 'G', 'H']].values))
        tups = com._asarray_tuplesafe(tups)
        expected = df.groupby(tups).sum()['values']
        # the reversed grouping's key is the reversed tuple
        for k, v in compat.iteritems(expected):
            self.assertEqual(left[k], right[k[::-1]])
            self.assertEqual(left[k], v)
        self.assertEqual(len(left), len(right))
    def test_groupby_sort_multi(self):
        """sort=True must order multi-key results lexicographically by key,
        whatever the key-column order."""
        df = DataFrame({'a': ['foo', 'bar', 'baz'],
                        'b': [3, 2, 1],
                        'c': [0, 1, 2],
                        'd': np.random.randn(3)})
        tups = lmap(tuple, df[['a', 'b', 'c']].values)
        tups = com._asarray_tuplesafe(tups)
        result = df.groupby(['a', 'b', 'c'], sort=True).sum()
        # expected order: bar < baz < foo
        self.assert_numpy_array_equal(result.index.values,
                                      tups[[1, 2, 0]])
        tups = lmap(tuple, df[['c', 'a', 'b']].values)
        tups = com._asarray_tuplesafe(tups)
        result = df.groupby(['c', 'a', 'b'], sort=True).sum()
        # rows are already sorted by 'c'
        self.assert_numpy_array_equal(result.index.values, tups)
        tups = lmap(tuple, df[['b', 'c', 'a']].values)
        tups = com._asarray_tuplesafe(tups)
        result = df.groupby(['b', 'c', 'a'], sort=True).sum()
        # 'b' descends in the data, so the order reverses
        self.assert_numpy_array_equal(result.index.values,
                                      tups[[2, 1, 0]])
        df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
                        'b': [0, 0, 0, 1, 1, 1],
                        'd': np.random.randn(6)})
        grouped = df.groupby(['a', 'b'])['d']
        result = grouped.sum()
        # module-level helper validates each group's sum independently
        _check_groupby(df, result, ['a', 'b'], 'd')
  2073. def test_intercept_builtin_sum(self):
  2074. s = Series([1., 2., np.nan, 3.])
  2075. grouped = s.groupby([0, 1, 2, 2])
  2076. result = grouped.agg(builtins.sum)
  2077. result2 = grouped.apply(builtins.sum)
  2078. expected = grouped.sum()
  2079. assert_series_equal(result, expected)
  2080. assert_series_equal(result2, expected)
  2081. def test_column_select_via_attr(self):
  2082. result = self.df.groupby('A').C.sum()
  2083. expected = self.df.groupby('A')['C'].sum()
  2084. assert_series_equal(result, expected)
  2085. self.df['mean'] = 1.5
  2086. result = self.df.groupby('A').mean()
  2087. expected = self.df.groupby('A').agg(np.mean)
  2088. assert_frame_equal(result, expected)
  2089. def test_rank_apply(self):
  2090. lev1 = np.array([rands(10) for _ in range(100)], dtype=object)
  2091. lev2 = np.array([rands(10) for _ in range(130)], dtype=object)
  2092. lab1 = np.random.randint(0, 100, size=500)
  2093. lab2 = np.random.randint(0, 130, size=500)
  2094. df = DataFrame({'value': np.random.randn(500),
  2095. 'key1': lev1.take(lab1),
  2096. 'key2': lev2.take(lab2)})
  2097. result = df.groupby(['key1', 'key2']).value.rank()
  2098. expected = []
  2099. for key, piece in df.groupby(['key1', 'key2']):
  2100. expected.append(piece.value.rank())
  2101. expected = concat(expected, axis=0)
  2102. expected = expected.reindex(result.index)
  2103. assert_series_equal(result, expected)
  2104. result = df.groupby(['key1', 'key2']).value.rank(pct=True)
  2105. expected = []
  2106. for key, piece in df.groupby(['key1', 'key2']):
  2107. expected.append(piece.value.rank(pct=True))
  2108. expected = concat(expected, axis=0)
  2109. expected = expected.reindex(result.index)
  2110. assert_series_equal(result, expected)
  2111. def test_dont_clobber_name_column(self):
  2112. df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
  2113. 'name': ['foo', 'bar', 'baz'] * 2})
  2114. result = df.groupby('key').apply(lambda x: x)
  2115. assert_frame_equal(result, df)
  2116. def test_skip_group_keys(self):
  2117. from pandas import concat
  2118. tsf = tm.makeTimeDataFrame()
  2119. grouped = tsf.groupby(lambda x: x.month, group_keys=False)
  2120. result = grouped.apply(lambda x: x.sort_index(by='A')[:3])
  2121. pieces = []
  2122. for key, group in grouped:
  2123. pieces.append(group.sort_index(by='A')[:3])
  2124. expected = concat(pieces)
  2125. assert_frame_equal(result, expected)
  2126. grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
  2127. result = grouped.apply(lambda x: x.order()[:3])
  2128. pieces = []
  2129. for key, group in grouped:
  2130. pieces.append(group.order()[:3])
  2131. expected = concat(pieces)
  2132. assert_series_equal(result, expected)
  2133. def test_no_nonsense_name(self):
  2134. # GH #995
  2135. s = self.frame['C'].copy()
  2136. s.name = None
  2137. result = s.groupby(self.frame['A']).agg(np.sum)
  2138. self.assertIsNone(result.name)
  2139. def test_wrap_agg_out(self):
  2140. grouped = self.three_group.groupby(['A', 'B'])
  2141. def func(ser):
  2142. if ser.dtype == np.object:
  2143. raise TypeError
  2144. else:
  2145. return ser.sum()
  2146. result = grouped.aggregate(func)
  2147. exp_grouped = self.three_group.ix[:, self.three_group.columns != 'C']
  2148. expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
  2149. assert_frame_equal(result, expected)
  2150. def test_multifunc_sum_bug(self):
  2151. # GH #1065
  2152. x = DataFrame(np.arange(9).reshape(3, 3))
  2153. x['test'] = 0
  2154. x['fl'] = [1.3, 1.5, 1.6]
  2155. grouped = x.groupby('test')
  2156. result = grouped.agg({'fl': 'sum', 2: 'size'})
  2157. self.assertEqual(result['fl'].dtype, np.float64)
def test_handle_dict_return_value(self):
    """apply() on a SeriesGroupBy whose function returns a dict should
    behave identically to returning an equivalent Series (result is a
    Series keyed by group key x dict key)."""
    def f(group):
        return {'min': group.min(), 'max': group.max()}
    def g(group):
        return Series({'min': group.min(), 'max': group.max()})
    result = self.df.groupby('A')['C'].apply(f)
    expected = self.df.groupby('A')['C'].apply(g)
    tm.assert_isinstance(result, Series)
    assert_series_equal(result, expected)
def test_getitem_list_of_columns(self):
    """Selecting multiple columns from a grouped frame — via a list, a
    bare tuple, or an Index slice — must all give the same result."""
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8),
                    'E': np.random.randn(8)})
    result = df.groupby('A')[['C', 'D']].mean()
    result2 = df.groupby('A')['C', 'D'].mean()
    result3 = df.groupby('A')[df.columns[2:4]].mean()
    expected = df.ix[:, ['A', 'C', 'D']].groupby('A').mean()
    assert_frame_equal(result, expected)
    assert_frame_equal(result2, expected)
    assert_frame_equal(result3, expected)
def test_agg_multiple_functions_maintain_order(self):
    """Result columns of a multi-function agg must keep the order in
    which the (name, func) pairs were supplied."""
    # GH #610
    funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
    result = self.df.groupby('A')['C'].agg(funcs)
    exp_cols = ['mean', 'max', 'min']
    self.assert_numpy_array_equal(result.columns, exp_cols)
  2188. def test_multiple_functions_tuples_and_non_tuples(self):
  2189. # #1359
  2190. funcs = [('foo', 'mean'), 'std']
  2191. ex_funcs = [('foo', 'mean'), ('std', 'std')]
  2192. result = self.df.groupby('A')['C'].agg(funcs)
  2193. expected = self.df.groupby('A')['C'].agg(ex_funcs)
  2194. assert_frame_equal(result, expected)
  2195. result = self.df.groupby('A').agg(funcs)
  2196. expected = self.df.groupby('A').agg(ex_funcs)
  2197. assert_frame_equal(result, expected)
  2198. def test_agg_multiple_functions_too_many_lambdas(self):
  2199. grouped = self.df.groupby('A')
  2200. funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
  2201. self.assertRaises(SpecificationError, grouped.agg, funcs)
def test_more_flexible_frame_multi_function(self):
    """Dict-of-column -> function(s) aggregation: list values, mixed
    scalar/list values, and nested OrderedDict renaming must all line up
    with the equivalent manual concat of single-function aggregations."""
    from pandas import concat
    grouped = self.df.groupby('A')
    exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
    exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))
    expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sortlevel(0, axis=1)
    d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
    result = grouped.aggregate(d)
    assert_frame_equal(result, expected)
    # be careful
    # NOTE(review): result and expected below are built from identical
    # specs, so this only checks that the mixed scalar/list spec works
    # at all and is self-consistent.
    result = grouped.aggregate(OrderedDict([['C', np.mean],
                                            ['D', [np.mean, np.std]]]))
    expected = grouped.aggregate(OrderedDict([['C', np.mean],
                                              ['D', [np.mean, np.std]]]))
    assert_frame_equal(result, expected)
    def foo(x):
        return np.mean(x)
    def bar(x):
        return np.std(x, ddof=1)
    # nested OrderedDict renames the outputs of the 'D' aggregations;
    # equivalent to passing functions literally named foo/bar
    d = OrderedDict([['C', np.mean],
                     ['D', OrderedDict([['foo', np.mean],
                                        ['bar', np.std]])]])
    result = grouped.aggregate(d)
    d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
    expected = grouped.aggregate(d)
    assert_frame_equal(result, expected)
def test_multi_function_flexible_mix(self):
    """Mixing a nested rename-dict for one column with a plain string,
    a one-element list, and a rename-dict for another column must all
    produce the same aggregate."""
    # GH #1268
    grouped = self.df.groupby('A')
    d = OrderedDict([['C', OrderedDict([['foo', 'mean'],
                                        ['bar', 'std']])],
                     ['D', 'sum']])
    result = grouped.aggregate(d)
    d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'],
                                         ['bar', 'std']])],
                      ['D', ['sum']]])
    result2 = grouped.aggregate(d2)
    d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'],
                                         ['bar', 'std']])],
                      ['D', {'sum': 'sum'}]])
    expected = grouped.aggregate(d3)
    assert_frame_equal(result, expected)
    assert_frame_equal(result2, expected)
  2249. def test_set_group_name(self):
  2250. def f(group):
  2251. assert group.name is not None
  2252. return group
  2253. def freduce(group):
  2254. assert group.name is not None
  2255. return group.sum()
  2256. def foo(x):
  2257. return freduce(x)
  2258. def _check_all(grouped):
  2259. # make sure all these work
  2260. grouped.apply(f)
  2261. grouped.aggregate(freduce)
  2262. grouped.aggregate({'C': freduce, 'D': freduce})
  2263. grouped.transform(f)
  2264. grouped['C'].apply(f)
  2265. grouped['C'].aggregate(freduce)
  2266. grouped['C'].aggregate([freduce, foo])
  2267. grouped['C'].transform(f)
  2268. _check_all(self.df.groupby('A'))
  2269. _check_all(self.df.groupby(['A', 'B']))
  2270. def test_no_dummy_key_names(self):
  2271. # GH #1291
  2272. result = self.df.groupby(self.df['A'].values).sum()
  2273. self.assertIsNone(result.index.name)
  2274. result = self.df.groupby([self.df['A'].values,
  2275. self.df['B'].values]).sum()
  2276. self.assertEqual(result.index.names, (None, None))
def test_groupby_categorical(self):
    """Grouping by a Categorical: the result is reindexed onto the full
    level set and the factor's name becomes the result index name.  Also
    checks describe() against an explicitly ordered plain groupby."""
    levels = ['foo', 'bar', 'baz', 'qux']
    labels = np.random.randint(0, 4, size=100)
    # legacy Categorical API (positional levels, name=) — pre-0.15 pandas
    cats = Categorical(labels, levels, name='myfactor')
    data = DataFrame(np.random.randn(100, 4))
    result = data.groupby(cats).mean()
    expected = data.groupby(np.asarray(cats)).mean()
    expected = expected.reindex(levels)
    expected.index.name = 'myfactor'
    assert_frame_equal(result, expected)
    self.assertEqual(result.index.name, cats.name)
    grouped = data.groupby(cats)
    desc_result = grouped.describe()
    # order the rows by factor code so a plain (sort=False) groupby
    # reproduces the categorical ordering
    idx = cats.labels.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)
    expected = ord_data.groupby(ord_labels, sort=False).describe()
    expected.index.names = ['myfactor', None]
    assert_frame_equal(desc_result, expected)
def test_groupby_groups_datetimeindex(self):
    """groupby over a DatetimeIndex with a date-extracting key function:
    .groups keys should be the datetime objects the mapper produced."""
    # #1430
    from pandas.tseries.api import DatetimeIndex
    periods = 1000
    ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(periods),
                    'low': np.arange(periods)}, index=ind)
    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    # it works!
    groups = grouped.groups
    tm.assert_isinstance(list(groups.keys())[0], datetime)
def test_groupby_groups_datetimeindex_tz(self):
    """Grouping on tz-aware datetimes — as a column key and as an index
    level — must preserve the timezone in the result index."""
    # GH 3950
    dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
             '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
    df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                    'datetime': dates,
                    'value1': np.arange(6,dtype='int64'),
                    'value2': [1, 2] * 3})
    df['datetime'] = df['datetime'].apply(lambda d: Timestamp(d, tz='US/Pacific'))
    exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 07:00:00',
                                 '2011-07-19 08:00:00', '2011-07-19 08:00:00',
                                 '2011-07-19 09:00:00', '2011-07-19 09:00:00'],
                                tz='US/Pacific', name='datetime')
    exp_idx2 = Index(['a', 'b'] * 3, name='label')
    exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
    expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], 'value2': [1, 2, 2, 1, 1, 2]},
                         index=exp_idx, columns=['value1', 'value2'])
    result = df.groupby(['datetime', 'label']).sum()
    assert_frame_equal(result, expected)
    # by level
    didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
    df = DataFrame({'value1': np.arange(6,dtype='int64'),
                    'value2': [1, 2, 3, 1, 2, 3]},
                   index=didx)
    exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                                '2011-07-19 09:00:00'], tz='Asia/Tokyo')
    expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                         index=exp_idx, columns=['value1', 'value2'])
    result = df.groupby(level=0).sum()
    assert_frame_equal(result, expected)
  2337. def test_groupby_reindex_inside_function(self):
  2338. from pandas.tseries.api import DatetimeIndex
  2339. periods = 1000
  2340. ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
  2341. df = DataFrame({'high': np.arange(
  2342. periods), 'low': np.arange(periods)}, index=ind)
  2343. def agg_before(hour, func, fix=False):
  2344. """
  2345. Run an aggregate func on the subset of data.
  2346. """
  2347. def _func(data):
  2348. d = data.select(lambda x: x.hour < 11).dropna()
  2349. if fix:
  2350. data[data.index[0]]
  2351. if len(d) == 0:
  2352. return None
  2353. return func(d)
  2354. return _func
  2355. def afunc(data):
  2356. d = data.select(lambda x: x.hour < 11).dropna()
  2357. return np.max(d)
  2358. grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
  2359. closure_bad = grouped.agg({'high': agg_before(11, np.max)})
  2360. closure_good = grouped.agg({'high': agg_before(11, np.max, True)})
  2361. assert_frame_equal(closure_bad, closure_good)
def test_multiindex_columns_empty_level(self):
    """Grouping by a MultiIndex column whose second level is the empty
    string: the partial label and the full tuple must work and agree."""
    l = [['count', 'values'], ['to filter', '']]
    midx = MultiIndex.from_tuples(l)
    df = DataFrame([[long(1), 'A']], columns=midx)
    grouped = df.groupby('to filter').groups
    self.assert_numpy_array_equal(grouped['A'], [0])
    grouped = df.groupby([('to filter', '')]).groups
    self.assert_numpy_array_equal(grouped['A'], [0])
    df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx)
    expected = df.groupby('to filter').groups
    result = df.groupby([('to filter', '')]).groups
    self.assertEqual(result, expected)
    # same check when the groups are not all distinct
    df = DataFrame([[long(1), 'A'], [long(2), 'A']], columns=midx)
    expected = df.groupby('to filter').groups
    result = df.groupby([('to filter', '')]).groups
    self.assertEqual(result, expected)
def test_cython_median(self):
    """The cython groupby median must agree with nanops.nanmedian and
    np.median aggregation, with NaNs in both values and group keys."""
    df = DataFrame(np.random.randn(1000))
    df.values[::2] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    labels[::17] = np.nan
    result = df.groupby(labels).median()
    exp = df.groupby(labels).agg(nanops.nanmedian)
    assert_frame_equal(result, exp)
    # multi-column frame, no NaNs in values
    df = DataFrame(np.random.randn(1000, 5))
    rs = df.groupby(labels).agg(np.median)
    xp = df.groupby(labels).median()
    assert_frame_equal(rs, xp)
def test_groupby_categorical_no_compress(self):
    """Grouping by a Categorical must not compress away unused levels:
    the result is reindexed onto the full level set."""
    data = Series(np.random.randn(9))
    labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    cats = Categorical(labels, [0, 1, 2])
    result = data.groupby(cats).mean()
    exp = data.groupby(labels).mean()
    assert_series_equal(result, exp)
    # level 2 is unused below; it should still appear in the result
    labels = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    cats = Categorical(labels, [0, 1, 2, 3])
    result = data.groupby(cats).mean()
    exp = data.groupby(labels).mean().reindex(cats.levels)
    assert_series_equal(result, exp)
  2402. def test_groupby_first_datetime64(self):
  2403. df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
  2404. df[1] = df[1].view('M8[ns]')
  2405. self.assertTrue(issubclass(df[1].dtype.type, np.datetime64))
  2406. result = df.groupby(level=0).first()
  2407. got_dt = result[1].dtype
  2408. self.assertTrue(issubclass(got_dt.type, np.datetime64))
  2409. result = df[1].groupby(level=0).first()
  2410. got_dt = result.dtype
  2411. self.assertTrue(issubclass(got_dt.type, np.datetime64))
  2412. def test_groupby_max_datetime64(self):
  2413. # GH 5869
  2414. # datetimelike dtype conversion from int
  2415. df = DataFrame(dict(A = Timestamp('20130101'), B = np.arange(5)))
  2416. expected = df.groupby('A')['A'].apply(lambda x: x.max())
  2417. result = df.groupby('A')['A'].max()
  2418. assert_series_equal(result,expected)
  2419. def test_groupby_datetime64_32_bit(self):
  2420. # GH 6410 / numpy 4328
  2421. # 32-bit under 1.9-dev indexing issue
  2422. df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')]*2})
  2423. result = df.groupby("A")["B"].transform(min)
  2424. expected = Series([pd.Timestamp('2000-01-1')]*2)
  2425. assert_series_equal(result,expected)
def test_groupby_categorical_unequal_len(self):
    """A grouper shorter than the series (bins built from dropna'd data)
    must raise rather than silently misalign."""
    import pandas as pd
    #GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    bins = pd.cut(series.dropna(), 4)
    # len(bins) != len(series) here
    self.assertRaises(AssertionError,lambda : series.groupby(bins).mean())
def test_gb_apply_list_of_unequal_len_arrays(self):
    """apply() whose kernel returns ndarrays of unequal length must not
    blow up during result assembly (the vstack fast path)."""
    # GH1738
    df = DataFrame({'group1': ['a','a','a','b','b','b','a','a','a','b','b','b'],
                    'group2': ['c','c','d','d','d','e','c','c','d','d','d','e'],
                    'weight': [1.1,2,3,4,5,6,2,4,6,8,1,2],
                    'value': [7.1,8,9,10,11,12,8,7,6,5,4,3]
                    })
    df = df.set_index(['group1', 'group2'])
    df_grouped = df.groupby(level=['group1','group2'], sort=True)
    def noddy(value, weight):
        out = np.array( value * weight ).repeat(3)
        return out
    # the kernel function returns arrays of unequal length
    # pandas sniffs the first one, sees it's an array and not
    # a list, and assumed the rest are of equal length
    # and so tries a vstack
    # don't die
    no_toes = df_grouped.apply(lambda x: noddy(x.value, x.weight ))
  2451. def test_groupby_with_empty(self):
  2452. import pandas as pd
  2453. index = pd.DatetimeIndex(())
  2454. data = ()
  2455. series = pd.Series(data, index)
  2456. grouper = pd.tseries.resample.TimeGrouper('D')
  2457. grouped = series.groupby(grouper)
  2458. assert next(iter(grouped), None) is None
def test_groupby_with_timegrouper(self):
    """TimeGrouper groupby must match resample('5D'), on both sorted and
    unsorted input, and must name the resulting index correctly."""
    # GH 4161
    # TimeGrouper requires a sorted index
    # also verifies that the resultant index has the correct name
    import datetime as DT
    df_original = DataFrame({
        'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
        'Quantity': [18,3,5,1,9,3],
        'Date' : [
            DT.datetime(2013,9,1,13,0),
            DT.datetime(2013,9,1,13,5),
            DT.datetime(2013,10,1,20,0),
            DT.datetime(2013,10,3,10,0),
            DT.datetime(2013,12,2,12,0),
            DT.datetime(2013,9,2,14,0),
            ]})
    # GH 6908 change target column's order
    df_reordered = df_original.sort(columns='Quantity')
    for df in [df_original, df_reordered]:
        df = df.set_index(['Date'])
        # mostly-NaN 5-day grid; only three bins carry data
        expected = DataFrame({ 'Quantity' : np.nan },
                             index=date_range('20130901 13:00:00','20131205 13:00:00',
                                              freq='5D',name='Date',closed='left'))
        expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')
        result1 = df.resample('5D',how=sum)
        assert_frame_equal(result1, expected)
        df_sorted = df.sort_index()
        result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
        assert_frame_equal(result2, expected)
        # unsorted index must give the same answer
        result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
        assert_frame_equal(result3, expected)
def test_groupby_with_timegrouper_methods(self):
    """A TimeGrouper groupby exposes group_keys, a BinGrouper grouper,
    and a dict of groups, regardless of input row order."""
    # GH 3881
    # make sure API of timegrouper conforms
    import datetime as DT
    df_original = pd.DataFrame({
        'Branch' : 'A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
        'Quantity': [1,3,5,8,9,3],
        'Date' : [
            DT.datetime(2013,1,1,13,0),
            DT.datetime(2013,1,1,13,5),
            DT.datetime(2013,10,1,20,0),
            DT.datetime(2013,10,2,10,0),
            DT.datetime(2013,12,2,12,0),
            DT.datetime(2013,12,2,14,0),
            ]})
    df_sorted = df_original.sort(columns='Quantity', ascending=False)
    for df in [df_original, df_sorted]:
        df = df.set_index('Date', drop=False)
        g = df.groupby(pd.TimeGrouper('6M'))
        self.assertTrue(g.group_keys)
        self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper))
        groups = g.groups
        self.assertTrue(isinstance(groups,dict))
        # data spans three 6-month bins
        self.assertTrue(len(groups) == 3)
def test_timegrouper_with_reg_groups(self):
    """pd.Grouper(freq=...) may be combined with ordinary column
    groupers — addressed through the index, a key, or a level — across a
    variety of frequencies, and independent of input row order."""
    # GH 3794
    # allow combinateion of timegrouper/reg groups
    import datetime as DT
    df_original = DataFrame({
        'Branch' : 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1,3,5,1,8,1,9,3],
        'Date' : [
            DT.datetime(2013,1,1,13,0),
            DT.datetime(2013,1,1,13,5),
            DT.datetime(2013,10,1,20,0),
            DT.datetime(2013,10,2,10,0),
            DT.datetime(2013,10,1,20,0),
            DT.datetime(2013,10,2,10,0),
            DT.datetime(2013,12,2,12,0),
            DT.datetime(2013,12,2,14,0),
            ]}).set_index('Date')
    df_sorted = df_original.sort(columns='Quantity', ascending=False)
    for df in [df_original, df_sorted]:
        # annual grouping on the index plus the 'Buyer' column
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10,18,3],
            'Date' : [
                DT.datetime(2013,12,31,0,0),
                DT.datetime(2013,12,31,0,0),
                DT.datetime(2013,12,31,0,0),
                ]}).set_index(['Date','Buyer'])
        result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum()
        assert_frame_equal(result,expected)
        # semiannual (month-start) grouping
        expected = DataFrame({
            'Buyer': 'Carl Mark Carl Joe'.split(),
            'Quantity': [1,3,9,18],
            'Date' : [
                DT.datetime(2013,1,1,0,0),
                DT.datetime(2013,1,1,0,0),
                DT.datetime(2013,7,1,0,0),
                DT.datetime(2013,7,1,0,0),
                ]}).set_index(['Date','Buyer'])
        result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum()
        assert_frame_equal(result,expected)
    # second fixture: all dates within a single month
    df_original = DataFrame({
        'Branch' : 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1,3,5,1,8,1,9,3],
        'Date' : [
            DT.datetime(2013,10,1,13,0),
            DT.datetime(2013,10,1,13,5),
            DT.datetime(2013,10,1,20,0),
            DT.datetime(2013,10,2,10,0),
            DT.datetime(2013,10,1,20,0),
            DT.datetime(2013,10,2,10,0),
            DT.datetime(2013,10,2,12,0),
            DT.datetime(2013,10,2,14,0),
            ]}).set_index('Date')
    df_sorted = df_original.sort(columns='Quantity', ascending=False)
    for df in [df_original, df_sorted]:
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark Carl Joe'.split(),
            'Quantity': [6,8,3,4,10],
            'Date' : [
                DT.datetime(2013,10,1,0,0),
                DT.datetime(2013,10,1,0,0),
                DT.datetime(2013,10,1,0,0),
                DT.datetime(2013,10,2,0,0),
                DT.datetime(2013,10,2,0,0),
                ]}).set_index(['Date','Buyer'])
        result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum()
        assert_frame_equal(result,expected)
        result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10,18,3],
            'Date' : [
                DT.datetime(2013,10,31,0,0),
                DT.datetime(2013,10,31,0,0),
                DT.datetime(2013,10,31,0,0),
                ]}).set_index(['Date','Buyer'])
        assert_frame_equal(result,expected)
        # passing the name
        df = df.reset_index()
        result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
        assert_frame_equal(result,expected)
        # an unknown key must raise
        self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum())
        # passing the level
        df = df.set_index('Date')
        result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum()
        assert_frame_equal(result,expected)
        result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum()
        assert_frame_equal(result,expected)
        # an unknown level must raise
        self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum())
        # multi names
        df = df.copy()
        df['Date'] = df.index + pd.offsets.MonthEnd(2)
        result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10,18,3],
            'Date' : [
                DT.datetime(2013,11,30,0,0),
                DT.datetime(2013,11,30,0,0),
                DT.datetime(2013,11,30,0,0),
                ]}).set_index(['Date','Buyer'])
        assert_frame_equal(result,expected)
        # error as we have both a level and a name!
        self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())
        # single groupers
        expected = DataFrame({ 'Quantity' : [31],
                               'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M')).sum()
        assert_frame_equal(result, expected)
        result = df.groupby([pd.Grouper(freq='1M')]).sum()
        assert_frame_equal(result, expected)
        expected = DataFrame({ 'Quantity' : [31],
                               'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum()
        assert_frame_equal(result, expected)
        result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum()
        assert_frame_equal(result, expected)
    # GH 6764 multiple grouping with/without sort
    df = DataFrame({
        'date' : pd.to_datetime([
            '20121002','20121007','20130130','20130202','20130305','20121002',
            '20121207','20130130','20130202','20130305','20130202','20130305']),
        'user_id' : [1,1,1,1,1,3,3,3,5,5,5,5],
        'whole_cost' : [1790,364,280,259,201,623,90,312,359,301,359,801],
        'cost1' : [12,15,10,24,39,1,0,90,45,34,1,12] }).set_index('date')
    for freq in ['D', 'M', 'A', 'Q-APR']:
        # baseline: per-user resample, reshaped to match the groupby form
        expected = df.groupby('user_id')['whole_cost'].resample(
            freq, how='sum').dropna().reorder_levels(
            ['date','user_id']).sortlevel().astype('int64')
        expected.name = 'whole_cost'
        result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)
        # unsorted input must give the same answer
        result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
        assert_series_equal(result2, expected)
def test_timegrouper_get_group(self):
    """get_group works for a Grouper keyed on a column, combined with a
    regular column grouper, and on the index — independent of row
    order."""
    # GH 6914
    df_original = DataFrame({
        'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
        'Quantity': [18,3,5,1,9,3],
        'Date' : [datetime(2013,9,1,13,0), datetime(2013,9,1,13,5),
                  datetime(2013,10,1,20,0), datetime(2013,10,3,10,0),
                  datetime(2013,12,2,12,0), datetime(2013,9,2,14,0),]})
    df_reordered = df_original.sort(columns='Quantity')
    # single grouping
    expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                     df_original.iloc[[4]]]
    dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']
    for df in [df_original, df_reordered]:
        grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
        for t, expected in zip(dt_list, expected_list):
            dt = pd.Timestamp(t)
            result = grouped.get_group(dt)
            assert_frame_equal(result, expected)
    # multiple grouping
    expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                     df_original.iloc[[4]]]
    g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'), ('Joe', '2013-12-31')]
    for df in [df_original, df_reordered]:
        grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
        for (b, t), expected in zip(g_list, expected_list):
            dt = pd.Timestamp(t)
            result = grouped.get_group((b, dt))
            assert_frame_equal(result, expected)
    # with index
    df_original = df_original.set_index('Date')
    df_reordered = df_original.sort(columns='Quantity')
    expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                     df_original.iloc[[4]]]
    for df in [df_original, df_reordered]:
        grouped = df.groupby(pd.Grouper(freq='M'))
        for t, expected in zip(dt_list, expected_list):
            dt = pd.Timestamp(t)
            result = grouped.get_group(dt)
            assert_frame_equal(result, expected)
  2691. def test_cumcount(self):
  2692. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
  2693. g = df.groupby('A')
  2694. sg = g.A
  2695. expected = Series([0, 1, 2, 0, 3])
  2696. assert_series_equal(expected, g.cumcount())
  2697. assert_series_equal(expected, sg.cumcount())
  2698. def test_cumcount_empty(self):
  2699. ge = DataFrame().groupby()
  2700. se = Series().groupby()
  2701. e = Series(dtype='int64') # edge case, as this is usually considered float
  2702. assert_series_equal(e, ge.cumcount())
  2703. assert_series_equal(e, se.cumcount())
  2704. def test_cumcount_dupe_index(self):
  2705. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5)
  2706. g = df.groupby('A')
  2707. sg = g.A
  2708. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  2709. assert_series_equal(expected, g.cumcount())
  2710. assert_series_equal(expected, sg.cumcount())
  2711. def test_cumcount_mi(self):
  2712. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  2713. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=mi)
  2714. g = df.groupby('A')
  2715. sg = g.A
  2716. expected = Series([0, 1, 2, 0, 3], index=mi)
  2717. assert_series_equal(expected, g.cumcount())
  2718. assert_series_equal(expected, sg.cumcount())
  2719. def test_cumcount_groupby_not_col(self):
  2720. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5)
  2721. g = df.groupby([0, 0, 0, 1, 0])
  2722. sg = g.A
  2723. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  2724. assert_series_equal(expected, g.cumcount())
  2725. assert_series_equal(expected, sg.cumcount())
  2726. def test_filter_series(self):
  2727. import pandas as pd
  2728. s = pd.Series([1, 3, 20, 5, 22, 24, 7])
  2729. expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
  2730. expected_even = pd.Series([20, 22, 24], index=[2, 4, 5])
  2731. grouper = s.apply(lambda x: x % 2)
  2732. grouped = s.groupby(grouper)
  2733. assert_series_equal(
  2734. grouped.filter(lambda x: x.mean() < 10), expected_odd)
  2735. assert_series_equal(
  2736. grouped.filter(lambda x: x.mean() > 10), expected_even)
  2737. # Test dropna=False.
  2738. assert_series_equal(
  2739. grouped.filter(lambda x: x.mean() < 10, dropna=False),
  2740. expected_odd.reindex(s.index))
  2741. assert_series_equal(
  2742. grouped.filter(lambda x: x.mean() > 10, dropna=False),
  2743. expected_even.reindex(s.index))
  2744. def test_filter_single_column_df(self):
  2745. import pandas as pd
  2746. df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
  2747. expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
  2748. expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5])
  2749. grouper = df[0].apply(lambda x: x % 2)
  2750. grouped = df.groupby(grouper)
  2751. assert_frame_equal(
  2752. grouped.filter(lambda x: x.mean() < 10), expected_odd)
  2753. assert_frame_equal(
  2754. grouped.filter(lambda x: x.mean() > 10), expected_even)
  2755. # Test dropna=False.
  2756. assert_frame_equal(
  2757. grouped.filter(lambda x: x.mean() < 10, dropna=False),
  2758. expected_odd.reindex(df.index))
  2759. assert_frame_equal(
  2760. grouped.filter(lambda x: x.mean() > 10, dropna=False),
  2761. expected_even.reindex(df.index))
  2762. def test_filter_multi_column_df(self):
  2763. import pandas as pd
  2764. df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]})
  2765. grouper = df['A'].apply(lambda x: x % 2)
  2766. grouped = df.groupby(grouper)
  2767. expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2])
  2768. assert_frame_equal(
  2769. grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), expected)
  2770. def test_filter_mixed_df(self):
  2771. import pandas as pd
  2772. df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
  2773. grouper = df['A'].apply(lambda x: x % 2)
  2774. grouped = df.groupby(grouper)
  2775. expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']},
  2776. index=[1, 2])
  2777. assert_frame_equal(
  2778. grouped.filter(lambda x: x['A'].sum() > 10), expected)
def test_filter_out_all_groups(self):
    """A predicate no group passes returns an empty object of the same
    kind (series and frame paths)."""
    import pandas as pd
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    assert_series_equal(
        grouped.filter(lambda x: x.mean() > 1000), s[[]])
    df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
    grouper = df['A'].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    assert_frame_equal(
        grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]])
  2791. def test_filter_out_no_groups(self):
  2792. import pandas as pd
  2793. s = pd.Series([1, 3, 20, 5, 22, 24, 7])
  2794. grouper = s.apply(lambda x: x % 2)
  2795. grouped = s.groupby(grouper)
  2796. filtered = grouped.filter(lambda x: x.mean() > 0)
  2797. assert_series_equal(filtered, s)
  2798. df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
  2799. grouper = df['A'].apply(lambda x: x % 2)
  2800. grouped = df.groupby(grouper)
  2801. filtered = grouped.filter(lambda x: x['A'].mean() > 0)
  2802. assert_frame_equal(filtered, df)
  2803. def test_filter_condition_raises(self):
  2804. import pandas as pd
  2805. def raise_if_sum_is_zero(x):
  2806. if x.sum() == 0:
  2807. raise ValueError
  2808. else:
  2809. return x.sum() > 0
  2810. s = pd.Series([-1,0,1,2])
  2811. grouper = s.apply(lambda x: x % 2)
  2812. grouped = s.groupby(grouper)
  2813. self.assertRaises(TypeError,
  2814. lambda: grouped.filter(raise_if_sum_is_zero))
  2815. def test_filter_bad_shapes(self):
  2816. df = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc'), 'C': np.arange(8)})
  2817. s = df['B']
  2818. g_df = df.groupby('B')
  2819. g_s = s.groupby(s)
  2820. f = lambda x: x
  2821. self.assertRaises(TypeError, lambda: g_df.filter(f))
  2822. self.assertRaises(TypeError, lambda: g_s.filter(f))
  2823. f = lambda x: x == 1
  2824. self.assertRaises(TypeError, lambda: g_df.filter(f))
  2825. self.assertRaises(TypeError, lambda: g_s.filter(f))
  2826. f = lambda x: np.outer(x, x)
  2827. self.assertRaises(TypeError, lambda: g_df.filter(f))
  2828. self.assertRaises(TypeError, lambda: g_s.filter(f))
  2829. def test_filter_nan_is_false(self):
  2830. df = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc'), 'C': np.arange(8)})
  2831. s = df['B']
  2832. g_df = df.groupby(df['B'])
  2833. g_s = s.groupby(s)
  2834. f = lambda x: np.nan
  2835. assert_frame_equal(g_df.filter(f), df.loc[[]])
  2836. assert_series_equal(g_s.filter(f), s[[]])
def test_filter_against_workaround(self):
    """filter() must agree with the old workaround of transforming a
    boolean mask and indexing — for int series, float series, and a
    mixed-dtype frame grouped three different ways."""
    np.random.seed(0)
    # Series of ints
    s = Series(np.random.randint(0,100,1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    assert_series_equal(new_way.order(), old_way.order())
    # Series of floats
    s = 100*Series(np.random.random(1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    assert_series_equal(new_way.order(), old_way.order())
    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase
    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                    'floats': N/10*Series(np.random.random(N)),
                    'letters': Series(random_letters)})
    # Group by ints; filter on floats.
    grouped = df.groupby('ints')
    old_way = df[grouped.floats.\
        transform(lambda x: x.mean() > N/20).astype('bool')]
    new_way = grouped.filter(lambda x: x['floats'].mean() > N/20)
    assert_frame_equal(new_way, old_way)
    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.\
        transform(lambda x: len(x) < N/10).astype('bool')]
    new_way = grouped.filter(
        lambda x: len(x.letters) < N/10)
    assert_frame_equal(new_way, old_way)
    # Group by strings; filter on ints.
    grouped = df.groupby('letters')
    old_way = df[grouped.ints.\
        transform(lambda x: x.mean() > N/20).astype('bool')]
    new_way = grouped.filter(lambda x: x['ints'].mean() > N/20)
    assert_frame_equal(new_way, old_way)
  2883. def test_filter_using_len(self):
  2884. # BUG GH4447
  2885. df = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc'), 'C': np.arange(8)})
  2886. grouped = df.groupby('B')
  2887. actual = grouped.filter(lambda x: len(x) > 2)
  2888. expected = DataFrame({'A': np.arange(2, 6), 'B': list('bbbb'), 'C': np.arange(2, 6)}, index=np.arange(2, 6))
  2889. assert_frame_equal(actual, expected)
  2890. actual = grouped.filter(lambda x: len(x) > 4)
  2891. expected = df.ix[[]]
  2892. assert_frame_equal(actual, expected)
  2893. # Series have always worked properly, but we'll test anyway.
  2894. s = df['B']
  2895. grouped = s.groupby(s)
  2896. actual = grouped.filter(lambda x: len(x) > 2)
  2897. expected = Series(4*['b'], index=np.arange(2, 6))
  2898. assert_series_equal(actual, expected)
  2899. actual = grouped.filter(lambda x: len(x) > 4)
  2900. expected = s[[]]
  2901. assert_series_equal(actual, expected)
  2902. def test_filter_maintains_ordering(self):
  2903. # Simple case: index is sequential. #4621
  2904. df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
  2905. 'tag' : [23,45,62,24,45,34,25,62]})
  2906. s = df['pid']
  2907. grouped = df.groupby('tag')
  2908. actual = grouped.filter(lambda x: len(x) > 1)
  2909. expected = df.iloc[[1, 2, 4, 7]]
  2910. assert_frame_equal(actual, expected)
  2911. grouped = s.groupby(df['tag'])
  2912. actual = grouped.filter(lambda x: len(x) > 1)
  2913. expected = s.iloc[[1, 2, 4, 7]]
  2914. assert_series_equal(actual, expected)
  2915. # Now index is sequentially decreasing.
  2916. df.index = np.arange(len(df) - 1, -1, -1)
  2917. s = df['pid']
  2918. grouped = df.groupby('tag')
  2919. actual = grouped.filter(lambda x: len(x) > 1)
  2920. expected = df.iloc[[1, 2, 4, 7]]
  2921. assert_frame_equal(actual, expected)
  2922. grouped = s.groupby(df['tag'])
  2923. actual = grouped.filter(lambda x: len(x) > 1)
  2924. expected = s.iloc[[1, 2, 4, 7]]
  2925. assert_series_equal(actual, expected)
  2926. # Index is shuffled.
  2927. SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
  2928. df.index = df.index[SHUFFLED]
  2929. s = df['pid']
  2930. grouped = df.groupby('tag')
  2931. actual = grouped.filter(lambda x: len(x) > 1)
  2932. expected = df.iloc[[1, 2, 4, 7]]
  2933. assert_frame_equal(actual, expected)
  2934. grouped = s.groupby(df['tag'])
  2935. actual = grouped.filter(lambda x: len(x) > 1)
  2936. expected = s.iloc[[1, 2, 4, 7]]
  2937. assert_series_equal(actual, expected)
  2938. def test_filter_and_transform_with_non_unique_int_index(self):
  2939. # GH4620
  2940. index = [1, 1, 1, 2, 1, 1, 0, 1]
  2941. df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
  2942. 'tag' : [23,45,62,24,45,34,25,62]}, index=index)
  2943. grouped_df = df.groupby('tag')
  2944. ser = df['pid']
  2945. grouped_ser = ser.groupby(df['tag'])
  2946. expected_indexes = [1, 2, 4, 7]
  2947. # Filter DataFrame
  2948. actual = grouped_df.filter(lambda x: len(x) > 1)
  2949. expected = df.iloc[expected_indexes]
  2950. assert_frame_equal(actual, expected)
  2951. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  2952. expected = df.copy()
  2953. expected.iloc[[0, 3, 5, 6]] = np.nan
  2954. assert_frame_equal(actual, expected)
  2955. # Filter Series
  2956. actual = grouped_ser.filter(lambda x: len(x) > 1)
  2957. expected = ser.take(expected_indexes)
  2958. assert_series_equal(actual, expected)
  2959. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  2960. NA = np.nan
  2961. expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
  2962. # ^ made manually because this can get confusing!
  2963. assert_series_equal(actual, expected)
  2964. # Transform Series
  2965. actual = grouped_ser.transform(len)
  2966. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
  2967. assert_series_equal(actual, expected)
  2968. # Transform (a column from) DataFrameGroupBy
  2969. actual = grouped_df.pid.transform(len)
  2970. assert_series_equal(actual, expected)
  2971. def test_filter_and_transform_with_multiple_non_unique_int_index(self):
  2972. # GH4620
  2973. index = [1, 1, 1, 2, 0, 0, 0, 1]
  2974. df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
  2975. 'tag' : [23,45,62,24,45,34,25,62]}, index=index)
  2976. grouped_df = df.groupby('tag')
  2977. ser = df['pid']
  2978. grouped_ser = ser.groupby(df['tag'])
  2979. expected_indexes = [1, 2, 4, 7]
  2980. # Filter DataFrame
  2981. actual = grouped_df.filter(lambda x: len(x) > 1)
  2982. expected = df.iloc[expected_indexes]
  2983. assert_frame_equal(actual, expected)
  2984. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  2985. expected = df.copy()
  2986. expected.iloc[[0, 3, 5, 6]] = np.nan
  2987. assert_frame_equal(actual, expected)
  2988. # Filter Series
  2989. actual = grouped_ser.filter(lambda x: len(x) > 1)
  2990. expected = ser.take(expected_indexes)
  2991. assert_series_equal(actual, expected)
  2992. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  2993. NA = np.nan
  2994. expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
  2995. # ^ made manually because this can get confusing!
  2996. assert_series_equal(actual, expected)
  2997. # Transform Series
  2998. actual = grouped_ser.transform(len)
  2999. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
  3000. assert_series_equal(actual, expected)
  3001. # Transform (a column from) DataFrameGroupBy
  3002. actual = grouped_df.pid.transform(len)
  3003. assert_series_equal(actual, expected)
  3004. def test_filter_and_transform_with_non_unique_float_index(self):
  3005. # GH4620
  3006. index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
  3007. df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
  3008. 'tag' : [23,45,62,24,45,34,25,62]}, index=index)
  3009. grouped_df = df.groupby('tag')
  3010. ser = df['pid']
  3011. grouped_ser = ser.groupby(df['tag'])
  3012. expected_indexes = [1, 2, 4, 7]
  3013. # Filter DataFrame
  3014. actual = grouped_df.filter(lambda x: len(x) > 1)
  3015. expected = df.iloc[expected_indexes]
  3016. assert_frame_equal(actual, expected)
  3017. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  3018. expected = df.copy()
  3019. expected.iloc[[0, 3, 5, 6]] = np.nan
  3020. assert_frame_equal(actual, expected)
  3021. # Filter Series
  3022. actual = grouped_ser.filter(lambda x: len(x) > 1)
  3023. expected = ser.take(expected_indexes)
  3024. assert_series_equal(actual, expected)
  3025. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  3026. NA = np.nan
  3027. expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
  3028. # ^ made manually because this can get confusing!
  3029. assert_series_equal(actual, expected)
  3030. # Transform Series
  3031. actual = grouped_ser.transform(len)
  3032. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
  3033. assert_series_equal(actual, expected)
  3034. # Transform (a column from) DataFrameGroupBy
  3035. actual = grouped_df.pid.transform(len)
  3036. assert_series_equal(actual, expected)
  3037. def test_filter_and_transform_with_non_unique_float_index(self):
  3038. # GH4620
  3039. index = np.array([1, 1, 1, 2, 0, 0, 0, 1], dtype=float)
  3040. df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
  3041. 'tag' : [23,45,62,24,45,34,25,62]}, index=index)
  3042. grouped_df = df.groupby('tag')
  3043. ser = df['pid']
  3044. grouped_ser = ser.groupby(df['tag'])
  3045. expected_indexes = [1, 2, 4, 7]
  3046. # Filter DataFrame
  3047. actual = grouped_df.filter(lambda x: len(x) > 1)
  3048. expected = df.iloc[expected_indexes]
  3049. assert_frame_equal(actual, expected)
  3050. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  3051. expected = df.copy()
  3052. expected.iloc[[0, 3, 5, 6]] = np.nan
  3053. assert_frame_equal(actual, expected)
  3054. # Filter Series
  3055. actual = grouped_ser.filter(lambda x: len(x) > 1)
  3056. expected = ser.take(expected_indexes)
  3057. assert_series_equal(actual, expected)
  3058. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  3059. NA = np.nan
  3060. expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
  3061. # ^ made manually because this can get confusing!
  3062. assert_series_equal(actual, expected)
  3063. # Transform Series
  3064. actual = grouped_ser.transform(len)
  3065. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
  3066. assert_series_equal(actual, expected)
  3067. # Transform (a column from) DataFrameGroupBy
  3068. actual = grouped_df.pid.transform(len)
  3069. assert_series_equal(actual, expected)
  3070. def test_filter_and_transform_with_non_unique_timestamp_index(self):
  3071. # GH4620
  3072. t0 = Timestamp('2013-09-30 00:05:00')
  3073. t1 = Timestamp('2013-10-30 00:05:00')
  3074. t2 = Timestamp('2013-11-30 00:05:00')
  3075. index = [t1, t1, t1, t2, t1, t1, t0, t1]
  3076. df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
  3077. 'tag' : [23,45,62,24,45,34,25,62]}, index=index)
  3078. grouped_df = df.groupby('tag')
  3079. ser = df['pid']
  3080. grouped_ser = ser.groupby(df['tag'])
  3081. expected_indexes = [1, 2, 4, 7]
  3082. # Filter DataFrame
  3083. actual = grouped_df.filter(lambda x: len(x) > 1)
  3084. expected = df.iloc[expected_indexes]
  3085. assert_frame_equal(actual, expected)
  3086. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  3087. expected = df.copy()
  3088. expected.iloc[[0, 3, 5, 6]] = np.nan
  3089. assert_frame_equal(actual, expected)
  3090. # Filter Series
  3091. actual = grouped_ser.filter(lambda x: len(x) > 1)
  3092. expected = ser.take(expected_indexes)
  3093. assert_series_equal(actual, expected)
  3094. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  3095. NA = np.nan
  3096. expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
  3097. # ^ made manually because this can get confusing!
  3098. assert_series_equal(actual, expected)
  3099. # Transform Series
  3100. actual = grouped_ser.transform(len)
  3101. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
  3102. assert_series_equal(actual, expected)
  3103. # Transform (a column from) DataFrameGroupBy
  3104. actual = grouped_df.pid.transform(len)
  3105. assert_series_equal(actual, expected)
  3106. def test_filter_and_transform_with_non_unique_string_index(self):
  3107. # GH4620
  3108. index = list('bbbcbbab')
  3109. df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
  3110. 'tag' : [23,45,62,24,45,34,25,62]}, index=index)
  3111. grouped_df = df.groupby('tag')
  3112. ser = df['pid']
  3113. grouped_ser = ser.groupby(df['tag'])
  3114. expected_indexes = [1, 2, 4, 7]
  3115. # Filter DataFrame
  3116. actual = grouped_df.filter(lambda x: len(x) > 1)
  3117. expected = df.iloc[expected_indexes]
  3118. assert_frame_equal(actual, expected)
  3119. actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
  3120. expected = df.copy()
  3121. expected.iloc[[0, 3, 5, 6]] = np.nan
  3122. assert_frame_equal(actual, expected)
  3123. # Filter Series
  3124. actual = grouped_ser.filter(lambda x: len(x) > 1)
  3125. expected = ser.take(expected_indexes)
  3126. assert_series_equal(actual, expected)
  3127. actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
  3128. NA = np.nan
  3129. expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
  3130. # ^ made manually because this can get confusing!
  3131. assert_series_equal(actual, expected)
  3132. # Transform Series
  3133. actual = grouped_ser.transform(len)
  3134. expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
  3135. assert_series_equal(actual, expected)
  3136. # Transform (a column from) DataFrameGroupBy
  3137. actual = grouped_df.pid.transform(len)
  3138. assert_series_equal(actual, expected)
  3139. def test_filter_has_access_to_grouped_cols(self):
  3140. df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B'])
  3141. g = df.groupby('A')
  3142. # previously didn't have access to col A #????
  3143. filt = g.filter(lambda x: x['A'].sum() == 2)
  3144. assert_frame_equal(filt, df.iloc[[0, 1]])
  3145. def test_index_label_overlaps_location(self):
  3146. # checking we don't have any label/location confusion in the
  3147. # the wake of GH5375
  3148. df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
  3149. g = df.groupby(list('ababb'))
  3150. actual = g.filter(lambda x: len(x) > 2)
  3151. expected = df.iloc[[1, 3, 4]]
  3152. assert_frame_equal(actual, expected)
  3153. ser = df[0]
  3154. g = ser.groupby(list('ababb'))
  3155. actual = g.filter(lambda x: len(x) > 2)
  3156. expected = ser.take([1, 3, 4])
  3157. assert_series_equal(actual, expected)
  3158. # ... and again, with a generic Index of floats
  3159. df.index = df.index.astype(float)
  3160. g = df.groupby(list('ababb'))
  3161. actual = g.filter(lambda x: len(x) > 2)
  3162. expected = df.iloc[[1, 3, 4]]
  3163. assert_frame_equal(actual, expected)
  3164. ser = df[0]
  3165. g = ser.groupby(list('ababb'))
  3166. actual = g.filter(lambda x: len(x) > 2)
  3167. expected = ser.take([1, 3, 4])
  3168. assert_series_equal(actual, expected)
  3169. def test_groupby_selection_with_methods(self):
  3170. # some methods which require DatetimeIndex
  3171. rng = pd.date_range('2014', periods=len(self.df))
  3172. self.df.index = rng
  3173. g = self.df.groupby(['A'])[['C']]
  3174. g_exp = self.df[['C']].groupby(self.df['A'])
  3175. # TODO check groupby with > 1 col ?
  3176. # methods which are called as .foo()
  3177. methods = ['count',
  3178. 'corr',
  3179. 'cummax', 'cummin', 'cumprod',
  3180. 'describe', 'rank',
  3181. 'quantile',
  3182. 'diff', 'shift',
  3183. 'all', 'any',
  3184. 'idxmin', 'idxmax',
  3185. 'ffill', 'bfill',
  3186. 'pct_change',
  3187. 'tshift',
  3188. #'ohlc'
  3189. ]
  3190. for m in methods:
  3191. res = getattr(g, m)()
  3192. exp = getattr(g_exp, m)()
  3193. assert_frame_equal(res, exp) # should always be frames!
  3194. # methods which aren't just .foo()
  3195. assert_frame_equal(g.fillna(0), g_exp.fillna(0))
  3196. assert_frame_equal(g.dtypes, g_exp.dtypes)
  3197. assert_frame_equal(g.apply(lambda x: x.sum()),
  3198. g_exp.apply(lambda x: x.sum()))
  3199. assert_frame_equal(g.resample('D'), g_exp.resample('D'))
  3200. assert_frame_equal(g.resample('D', how='ohlc'),
  3201. g_exp.resample('D', how='ohlc'))
  3202. assert_frame_equal(g.filter(lambda x: len(x) == 3),
  3203. g_exp.filter(lambda x: len(x) == 3))
  3204. def test_groupby_whitelist(self):
  3205. from string import ascii_lowercase
  3206. letters = np.array(list(ascii_lowercase))
  3207. N = 10
  3208. random_letters = letters.take(np.random.randint(0, 26, N))
  3209. df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
  3210. 'letters': Series(random_letters)})
  3211. s = df.floats
  3212. df_whitelist = frozenset([
  3213. 'last', 'first',
  3214. 'mean', 'sum', 'min', 'max',
  3215. 'head', 'tail',
  3216. 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
  3217. 'resample',
  3218. 'describe',
  3219. 'rank', 'quantile', 'count',
  3220. 'fillna',
  3221. 'mad',
  3222. 'any', 'all',
  3223. 'irow', 'take',
  3224. 'idxmax', 'idxmin',
  3225. 'shift', 'tshift',
  3226. 'ffill', 'bfill',
  3227. 'pct_change', 'skew',
  3228. 'plot', 'boxplot', 'hist',
  3229. 'median', 'dtypes',
  3230. 'corrwith', 'corr', 'cov',
  3231. 'diff',
  3232. ])
  3233. s_whitelist = frozenset([
  3234. 'last', 'first',
  3235. 'mean', 'sum', 'min', 'max',
  3236. 'head', 'tail',
  3237. 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
  3238. 'resample',
  3239. 'describe',
  3240. 'rank', 'quantile', 'count',
  3241. 'fillna',
  3242. 'mad',
  3243. 'any', 'all',
  3244. 'irow', 'take',
  3245. 'idxmax', 'idxmin',
  3246. 'shift', 'tshift',
  3247. 'ffill', 'bfill',
  3248. 'pct_change', 'skew',
  3249. 'plot', 'hist',
  3250. 'median', 'dtype',
  3251. 'corr', 'cov',
  3252. 'value_counts',
  3253. 'diff',
  3254. 'unique', 'nunique',
  3255. 'nlargest', 'nsmallest',
  3256. ])
  3257. for obj, whitelist in zip((df, s),
  3258. (df_whitelist, s_whitelist)):
  3259. gb = obj.groupby(df.letters)
  3260. self.assertEqual(whitelist, gb._apply_whitelist)
  3261. for m in whitelist:
  3262. getattr(gb, m)
  3263. def test_groupby_blacklist(self):
  3264. from string import ascii_lowercase
  3265. letters = np.array(list(ascii_lowercase))
  3266. N = 10
  3267. random_letters = letters.take(np.random.randint(0, 26, N))
  3268. df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
  3269. 'letters': Series(random_letters)})
  3270. s = df.floats
  3271. blacklist = [
  3272. 'eval', 'query', 'abs', 'where',
  3273. 'mask', 'align', 'groupby', 'clip', 'astype',
  3274. 'at', 'combine', 'consolidate', 'convert_objects',
  3275. ]
  3276. to_methods = [method for method in dir(df) if method.startswith('to_')]
  3277. blacklist.extend(to_methods)
  3278. # e.g., to_csv
  3279. defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the "
  3280. "'apply' method$)")
  3281. # e.g., query, eval
  3282. not_defined = "(?:^{1!r} object has no attribute {0!r}$)"
  3283. fmt = defined_but_not_allowed + '|' + not_defined
  3284. for bl in blacklist:
  3285. for obj in (df, s):
  3286. gb = obj.groupby(df.letters)
  3287. msg = fmt.format(bl, type(gb).__name__)
  3288. with tm.assertRaisesRegexp(AttributeError, msg):
  3289. getattr(gb, bl)
  3290. def test_series_groupby_plotting_nominally_works(self):
  3291. _skip_if_mpl_not_installed()
  3292. n = 10
  3293. weight = Series(np.random.normal(166, 20, size=n))
  3294. height = Series(np.random.normal(60, 10, size=n))
  3295. with tm.RNGContext(42):
  3296. gender = tm.choice(['male', 'female'], size=n)
  3297. weight.groupby(gender).plot()
  3298. tm.close()
  3299. height.groupby(gender).hist()
  3300. tm.close()
  3301. def test_plotting_with_float_index_works(self):
  3302. _skip_if_mpl_not_installed()
  3303. # GH 7025
  3304. df = DataFrame({'def': [1,1,1,2,2,2,3,3,3],
  3305. 'val': np.random.randn(9)},
  3306. index=[1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0])
  3307. df.groupby('def')['val'].plot()
  3308. tm.close()
  3309. df.groupby('def')['val'].apply(lambda x: x.plot())
  3310. tm.close()
  3311. @slow
  3312. def test_frame_groupby_plot_boxplot(self):
  3313. _skip_if_mpl_not_installed()
  3314. import matplotlib.pyplot as plt
  3315. import matplotlib as mpl
  3316. mpl.use('Agg')
  3317. tm.close()
  3318. n = 10
  3319. weight = Series(np.random.normal(166, 20, size=n))
  3320. height = Series(np.random.normal(60, 10, size=n))
  3321. with tm.RNGContext(42):
  3322. gender = tm.choice(['male', 'female'], size=n)
  3323. df = DataFrame({'height': height, 'weight': weight, 'gender': gender})
  3324. gb = df.groupby('gender')
  3325. res = gb.plot()
  3326. self.assertEqual(len(plt.get_fignums()), 2)
  3327. self.assertEqual(len(res), 2)
  3328. tm.close()
  3329. res = gb.boxplot()
  3330. self.assertEqual(len(plt.get_fignums()), 1)
  3331. self.assertEqual(len(res), 2)
  3332. tm.close()
  3333. # now works with GH 5610 as gender is excluded
  3334. res = df.groupby('gender').hist()
  3335. tm.close()
  3336. @slow
  3337. def test_frame_groupby_hist(self):
  3338. _skip_if_mpl_not_installed()
  3339. import matplotlib.pyplot as plt
  3340. import matplotlib as mpl
  3341. mpl.use('Agg')
  3342. tm.close()
  3343. n = 10
  3344. weight = Series(np.random.normal(166, 20, size=n))
  3345. height = Series(np.random.normal(60, 10, size=n))
  3346. with tm.RNGContext(42):
  3347. gender_int = tm.choice([0, 1], size=n)
  3348. df_int = DataFrame({'height': height, 'weight': weight,
  3349. 'gender': gender_int})
  3350. gb = df_int.groupby('gender')
  3351. axes = gb.hist()
  3352. self.assertEqual(len(axes), 2)
  3353. self.assertEqual(len(plt.get_fignums()), 2)
  3354. tm.close()
  3355. def test_tab_completion(self):
  3356. grp = self.mframe.groupby(level='second')
  3357. results = set([v for v in dir(grp) if not v.startswith('_')])
  3358. expected = set(['A','B','C',
  3359. 'agg','aggregate','apply','boxplot','filter','first','get_group',
  3360. 'groups','hist','indices','last','max','mean','median',
  3361. 'min','name','ngroups','nth','ohlc','plot', 'prod',
  3362. 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'head',
  3363. 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail',
  3364. 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount',
  3365. 'all', 'shift', 'skew', 'bfill', 'irow', 'ffill',
  3366. 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
  3367. 'cov', 'dtypes', 'diff', 'idxmax', 'idxmin'
  3368. ])
  3369. self.assertEqual(results, expected)
  3370. def test_lexsort_indexer(self):
  3371. keys = [[nan]*5 + list(range(100)) + [nan]*5]
  3372. # orders=True, na_position='last'
  3373. result = _lexsort_indexer(keys, orders=True, na_position='last')
  3374. expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  3375. assert_equal(result, expected)
  3376. # orders=True, na_position='first'
  3377. result = _lexsort_indexer(keys, orders=True, na_position='first')
  3378. expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  3379. assert_equal(result, expected)
  3380. # orders=False, na_position='last'
  3381. result = _lexsort_indexer(keys, orders=False, na_position='last')
  3382. expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  3383. assert_equal(result, expected)
  3384. # orders=False, na_position='first'
  3385. result = _lexsort_indexer(keys, orders=False, na_position='first')
  3386. expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  3387. assert_equal(result, expected)
  3388. def test_nargsort(self):
  3389. # np.argsort(items) places NaNs last
  3390. items = [nan]*5 + list(range(100)) + [nan]*5
  3391. # np.argsort(items2) may not place NaNs first
  3392. items2 = np.array(items, dtype='O')
  3393. try:
  3394. # GH 2785; due to a regression in NumPy1.6.2
  3395. np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i'))
  3396. np.argsort(items2, kind='mergesort')
  3397. except TypeError as err:
  3398. raise nose.SkipTest('requested sort not available for type')
  3399. # mergesort is the most difficult to get right because we want it to be stable.
  3400. # According to numpy/core/tests/test_multiarray, """The number
  3401. # of sorted items must be greater than ~50 to check the actual algorithm
  3402. # because quick and merge sort fall over to insertion sort for small
  3403. # arrays."""
  3404. # mergesort, ascending=True, na_position='last'
  3405. result = _nargsort(
  3406. items, kind='mergesort', ascending=True, na_position='last')
  3407. expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  3408. assert_equal(result, expected)
  3409. # mergesort, ascending=True, na_position='first'
  3410. result = _nargsort(
  3411. items, kind='mergesort', ascending=True, na_position='first')
  3412. expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  3413. assert_equal(result, expected)
  3414. # mergesort, ascending=False, na_position='last'
  3415. result = _nargsort(
  3416. items, kind='mergesort', ascending=False, na_position='last')
  3417. expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  3418. assert_equal(result, expected)
  3419. # mergesort, ascending=False, na_position='first'
  3420. result = _nargsort(
  3421. items, kind='mergesort', ascending=False, na_position='first')
  3422. expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  3423. assert_equal(result, expected)
  3424. # mergesort, ascending=True, na_position='last'
  3425. result = _nargsort(
  3426. items2, kind='mergesort', ascending=True, na_position='last')
  3427. expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  3428. assert_equal(result, expected)
  3429. # mergesort, ascending=True, na_position='first'
  3430. result = _nargsort(
  3431. items2, kind='mergesort', ascending=True, na_position='first')
  3432. expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  3433. assert_equal(result, expected)
  3434. # mergesort, ascending=False, na_position='last'
  3435. result = _nargsort(
  3436. items2, kind='mergesort', ascending=False, na_position='last')
  3437. expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  3438. assert_equal(result, expected)
  3439. # mergesort, ascending=False, na_position='first'
  3440. result = _nargsort(
  3441. items2, kind='mergesort', ascending=False, na_position='first')
  3442. expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  3443. assert_equal(result, expected)
  3444. def test_datetime_count(self):
  3445. df = DataFrame({'a': [1,2,3] * 2,
  3446. 'dates': pd.date_range('now', periods=6, freq='T')})
  3447. result = df.groupby('a').dates.count()
  3448. expected = Series([2, 2, 2], index=Index([1, 2, 3], name='a'),
  3449. name='dates')
  3450. tm.assert_series_equal(result, expected)
  3451. def test_lower_int_prec_count(self):
  3452. df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
  3453. 'b': np.array([1, 2, 3, 6], np.uint32),
  3454. 'c': np.array([4, 5, 6, 8], np.int16),
  3455. 'grp': list('ab' * 2)})
  3456. result = df.groupby('grp').count()
  3457. expected = DataFrame({'a': [2, 2],
  3458. 'b': [2, 2],
  3459. 'c': [2, 2]}, index=pd.Index(list('ab'),
  3460. name='grp'))
  3461. tm.assert_frame_equal(result, expected)
  3462. def test_count_uses_size_on_exception(self):
  3463. class RaisingObjectException(Exception):
  3464. pass
  3465. class RaisingObject(object):
  3466. def __init__(self, msg='I will raise inside Cython'):
  3467. super(RaisingObject, self).__init__()
  3468. self.msg = msg
  3469. def __eq__(self, other):
  3470. # gets called in Cython to check that raising calls the method
  3471. raise RaisingObjectException(self.msg)
  3472. df = DataFrame({'a': [RaisingObject() for _ in range(4)],
  3473. 'grp': list('ab' * 2)})
  3474. result = df.groupby('grp').count()
  3475. expected = DataFrame({'a': [2, 2]}, index=pd.Index(list('ab'),
  3476. name='grp'))
  3477. tm.assert_frame_equal(result, expected)
  3478. def test__cython_agg_general(self):
  3479. ops = [('mean', np.mean),
  3480. ('median', np.median),
  3481. ('var', np.var),
  3482. ('add', np.sum),
  3483. ('prod', np.prod),
  3484. ('min', np.min),
  3485. ('max', np.max),
  3486. ('first', lambda x: x.iloc[0]),
  3487. ('last', lambda x: x.iloc[-1]),
  3488. ('count', np.size),
  3489. ]
  3490. df = DataFrame(np.random.randn(1000))
  3491. labels = np.random.randint(0, 50, size=1000).astype(float)
  3492. for op, targop in ops:
  3493. result = df.groupby(labels)._cython_agg_general(op)
  3494. expected = df.groupby(labels).agg(targop)
  3495. try:
  3496. tm.assert_frame_equal(result, expected)
  3497. except BaseException as exc:
  3498. exc.args += ('operation: %s' % op,)
  3499. raise
  3500. def test_ops_general(self):
  3501. ops = [('mean', np.mean),
  3502. ('median', np.median),
  3503. ('std', np.std),
  3504. ('var', np.var),
  3505. ('sum', np.sum),
  3506. ('prod', np.prod),
  3507. ('min', np.min),
  3508. ('max', np.max),
  3509. ('first', lambda x: x.iloc[0]),
  3510. ('last', lambda x: x.iloc[-1]),
  3511. ('count', np.size),
  3512. ]
  3513. try:
  3514. from scipy.stats import sem
  3515. except ImportError:
  3516. pass
  3517. else:
  3518. ops.append(('sem', sem))
  3519. df = DataFrame(np.random.randn(1000))
  3520. labels = np.random.randint(0, 50, size=1000).astype(float)
  3521. for op, targop in ops:
  3522. result = getattr(df.groupby(labels), op)().astype(float)
  3523. expected = df.groupby(labels).agg(targop)
  3524. try:
  3525. tm.assert_frame_equal(result, expected)
  3526. except BaseException as exc:
  3527. exc.args += ('operation: %s' % op,)
  3528. raise
  3529. def test_max_nan_bug(self):
  3530. raw = """,Date,app,File
  3531. 2013-04-23,2013-04-23 00:00:00,,log080001.log
  3532. 2013-05-06,2013-05-06 00:00:00,,log.log
  3533. 2013-05-07,2013-05-07 00:00:00,OE,xlsx"""
  3534. df = pd.read_csv(StringIO(raw), parse_dates=[0])
  3535. gb = df.groupby('Date')
  3536. r = gb[['File']].max()
  3537. e = gb['File'].max().to_frame()
  3538. tm.assert_frame_equal(r, e)
  3539. self.assertFalse(r['File'].isnull().any())
  3540. def test_nlargest(self):
  3541. a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
  3542. b = Series(list('a' * 5 + 'b' * 5))
  3543. gb = a.groupby(b)
  3544. r = gb.nlargest(3)
  3545. e = Series([7, 5, 3, 10, 9, 6],
  3546. index=MultiIndex.from_arrays([list('aaabbb'),
  3547. [3, 2, 1, 9, 5, 8]]))
  3548. tm.assert_series_equal(r, e)
  3549. def test_nsmallest(self):
  3550. a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
  3551. b = Series(list('a' * 5 + 'b' * 5))
  3552. gb = a.groupby(b)
  3553. r = gb.nsmallest(3)
  3554. e = Series([1, 2, 3, 0, 4, 6],
  3555. index=MultiIndex.from_arrays([list('aaabbb'),
  3556. [0, 4, 1, 6, 7, 8]]))
  3557. tm.assert_series_equal(r, e)
  3558. def assert_fp_equal(a, b):
  3559. assert (np.abs(a - b) < 1e-12).all()
  3560. def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
  3561. tups = lmap(tuple, df[keys].values)
  3562. tups = com._asarray_tuplesafe(tups)
  3563. expected = f(df.groupby(tups)[field])
  3564. for k, v in compat.iteritems(expected):
  3565. assert(result[k] == v)
  3566. def test_decons():
  3567. from pandas.core.groupby import decons_group_index, get_group_index
  3568. def testit(label_list, shape):
  3569. group_index = get_group_index(label_list, shape)
  3570. label_list2 = decons_group_index(group_index, shape)
  3571. for a, b in zip(label_list, label_list2):
  3572. assert(np.array_equal(a, b))
  3573. shape = (4, 5, 6)
  3574. label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100),
  3575. np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100),
  3576. np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100)]
  3577. testit(label_list, shape)
  3578. shape = (10000, 10000)
  3579. label_list = [np.tile(np.arange(10000), 5),
  3580. np.tile(np.arange(10000), 5)]
  3581. testit(label_list, shape)
  3582. if __name__ == '__main__':
  3583. nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',
  3584. '-s'], exit=False)