PageRenderTime 60ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/tests/test_groupby.py

http://github.com/wesm/pandas
Python | 6654 lines | 6415 code | 164 blank | 75 comment | 23 complexity | 7119f6a79ece6b6b900523ea4093a56a MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import nose
  4. from datetime import datetime
  5. from numpy import nan
  6. from pandas.types.common import _ensure_platform_int
  7. from pandas import date_range, bdate_range, Timestamp, isnull
  8. from pandas.core.index import Index, MultiIndex, CategoricalIndex
  9. from pandas.core.api import Categorical, DataFrame
  10. from pandas.core.common import UnsupportedFunctionCall
  11. from pandas.core.groupby import (SpecificationError, DataError, _nargsort,
  12. _lexsort_indexer)
  13. from pandas.core.series import Series
  14. from pandas.core.config import option_context
  15. from pandas.formats.printing import pprint_thing
  16. from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
  17. assert_series_equal, assert_almost_equal,
  18. assert_index_equal, assertRaisesRegexp)
  19. from pandas.compat import (range, long, lrange, StringIO, lmap, lzip, map, zip,
  20. builtins, OrderedDict, product as cart_product)
  21. from pandas import compat
  22. from pandas.core.panel import Panel
  23. from pandas.tools.merge import concat
  24. from collections import defaultdict
  25. from functools import partial
  26. import pandas.core.common as com
  27. import numpy as np
  28. import pandas.core.nanops as nanops
  29. import pandas.util.testing as tm
  30. import pandas as pd
  31. class TestGroupBy(tm.TestCase):
  32. _multiprocess_can_split_ = True
  33. def setUp(self):
  34. self.ts = tm.makeTimeSeries()
  35. self.seriesd = tm.getSeriesData()
  36. self.tsd = tm.getTimeSeriesData()
  37. self.frame = DataFrame(self.seriesd)
  38. self.tsframe = DataFrame(self.tsd)
  39. self.df = DataFrame(
  40. {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
  41. 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
  42. 'C': np.random.randn(8),
  43. 'D': np.random.randn(8)})
  44. self.df_mixed_floats = DataFrame(
  45. {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
  46. 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
  47. 'C': np.random.randn(8),
  48. 'D': np.array(
  49. np.random.randn(8), dtype='float32')})
  50. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
  51. 'three']],
  52. labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  53. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  54. names=['first', 'second'])
  55. self.mframe = DataFrame(np.random.randn(10, 3), index=index,
  56. columns=['A', 'B', 'C'])
  57. self.three_group = DataFrame(
  58. {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
  59. 'foo', 'foo', 'foo'],
  60. 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
  61. 'two', 'two', 'one'],
  62. 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
  63. 'dull', 'shiny', 'shiny', 'shiny'],
  64. 'D': np.random.randn(11),
  65. 'E': np.random.randn(11),
  66. 'F': np.random.randn(11)})
  67. def test_basic(self):
  68. def checkit(dtype):
  69. data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
  70. index = np.arange(9)
  71. np.random.shuffle(index)
  72. data = data.reindex(index)
  73. grouped = data.groupby(lambda x: x // 3)
  74. for k, v in grouped:
  75. self.assertEqual(len(v), 3)
  76. agged = grouped.aggregate(np.mean)
  77. self.assertEqual(agged[1], 1)
  78. assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
  79. assert_series_equal(agged, grouped.mean())
  80. assert_series_equal(grouped.agg(np.sum), grouped.sum())
  81. expected = grouped.apply(lambda x: x * x.sum())
  82. transformed = grouped.transform(lambda x: x * x.sum())
  83. self.assertEqual(transformed[7], 12)
  84. assert_series_equal(transformed, expected)
  85. value_grouped = data.groupby(data)
  86. assert_series_equal(value_grouped.aggregate(np.mean), agged,
  87. check_index_type=False)
  88. # complex agg
  89. agged = grouped.aggregate([np.mean, np.std])
  90. agged = grouped.aggregate({'one': np.mean, 'two': np.std})
  91. group_constants = {0: 10, 1: 20, 2: 30}
  92. agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
  93. self.assertEqual(agged[1], 21)
  94. # corner cases
  95. self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
  96. for dtype in ['int64', 'int32', 'float64', 'float32']:
  97. checkit(dtype)
  98. def test_select_bad_cols(self):
  99. df = DataFrame([[1, 2]], columns=['A', 'B'])
  100. g = df.groupby('A')
  101. self.assertRaises(KeyError, g.__getitem__, ['C']) # g[['C']]
  102. self.assertRaises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']]
  103. with assertRaisesRegexp(KeyError, '^[^A]+$'):
  104. # A should not be referenced as a bad column...
  105. # will have to rethink regex if you change message!
  106. g[['A', 'C']]
  107. def test_first_last_nth(self):
  108. # tests for first / last / nth
  109. grouped = self.df.groupby('A')
  110. first = grouped.first()
  111. expected = self.df.ix[[1, 0], ['B', 'C', 'D']]
  112. expected.index = Index(['bar', 'foo'], name='A')
  113. expected = expected.sort_index()
  114. assert_frame_equal(first, expected)
  115. nth = grouped.nth(0)
  116. assert_frame_equal(nth, expected)
  117. last = grouped.last()
  118. expected = self.df.ix[[5, 7], ['B', 'C', 'D']]
  119. expected.index = Index(['bar', 'foo'], name='A')
  120. assert_frame_equal(last, expected)
  121. nth = grouped.nth(-1)
  122. assert_frame_equal(nth, expected)
  123. nth = grouped.nth(1)
  124. expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy()
  125. expected.index = Index(['foo', 'bar'], name='A')
  126. expected = expected.sort_index()
  127. assert_frame_equal(nth, expected)
  128. # it works!
  129. grouped['B'].first()
  130. grouped['B'].last()
  131. grouped['B'].nth(0)
  132. self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
  133. self.assertTrue(isnull(grouped['B'].first()['foo']))
  134. self.assertTrue(isnull(grouped['B'].last()['foo']))
  135. self.assertTrue(isnull(grouped['B'].nth(0)['foo']))
  136. # v0.14.0 whatsnew
  137. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  138. g = df.groupby('A')
  139. result = g.first()
  140. expected = df.iloc[[1, 2]].set_index('A')
  141. assert_frame_equal(result, expected)
  142. expected = df.iloc[[1, 2]].set_index('A')
  143. result = g.nth(0, dropna='any')
  144. assert_frame_equal(result, expected)
  145. def test_first_last_nth_dtypes(self):
  146. df = self.df_mixed_floats.copy()
  147. df['E'] = True
  148. df['F'] = 1
  149. # tests for first / last / nth
  150. grouped = df.groupby('A')
  151. first = grouped.first()
  152. expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
  153. expected.index = Index(['bar', 'foo'], name='A')
  154. expected = expected.sort_index()
  155. assert_frame_equal(first, expected)
  156. last = grouped.last()
  157. expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
  158. expected.index = Index(['bar', 'foo'], name='A')
  159. expected = expected.sort_index()
  160. assert_frame_equal(last, expected)
  161. nth = grouped.nth(1)
  162. expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
  163. expected.index = Index(['bar', 'foo'], name='A')
  164. expected = expected.sort_index()
  165. assert_frame_equal(nth, expected)
  166. # GH 2763, first/last shifting dtypes
  167. idx = lrange(10)
  168. idx.append(9)
  169. s = Series(data=lrange(11), index=idx, name='IntCol')
  170. self.assertEqual(s.dtype, 'int64')
  171. f = s.groupby(level=0).first()
  172. self.assertEqual(f.dtype, 'int64')
  173. def test_nth(self):
  174. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  175. g = df.groupby('A')
  176. assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
  177. assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
  178. assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
  179. assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
  180. assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
  181. assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
  182. assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
  183. assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
  184. assert_frame_equal(g[['B']].nth(0),
  185. df.ix[[0, 2], ['A', 'B']].set_index('A'))
  186. exp = df.set_index('A')
  187. assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
  188. assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
  189. exp['B'] = np.nan
  190. assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
  191. assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
  192. # out of bounds, regression from 0.13.1
  193. # GH 6621
  194. df = DataFrame({'color': {0: 'green',
  195. 1: 'green',
  196. 2: 'red',
  197. 3: 'red',
  198. 4: 'red'},
  199. 'food': {0: 'ham',
  200. 1: 'eggs',
  201. 2: 'eggs',
  202. 3: 'ham',
  203. 4: 'pork'},
  204. 'two': {0: 1.5456590000000001,
  205. 1: -0.070345000000000005,
  206. 2: -2.4004539999999999,
  207. 3: 0.46206000000000003,
  208. 4: 0.52350799999999997},
  209. 'one': {0: 0.56573799999999996,
  210. 1: -0.9742360000000001,
  211. 2: 1.033801,
  212. 3: -0.78543499999999999,
  213. 4: 0.70422799999999997}}).set_index(['color',
  214. 'food'])
  215. result = df.groupby(level=0, as_index=False).nth(2)
  216. expected = df.iloc[[-1]]
  217. assert_frame_equal(result, expected)
  218. result = df.groupby(level=0, as_index=False).nth(3)
  219. expected = df.loc[[]]
  220. assert_frame_equal(result, expected)
  221. # GH 7559
  222. # from the vbench
  223. df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
  224. s = df[1]
  225. g = df[0]
  226. expected = s.groupby(g).first()
  227. expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
  228. assert_series_equal(expected2, expected, check_names=False)
  229. self.assertTrue(expected.name, 0)
  230. self.assertEqual(expected.name, 1)
  231. # validate first
  232. v = s[g == 1].iloc[0]
  233. self.assertEqual(expected.iloc[0], v)
  234. self.assertEqual(expected2.iloc[0], v)
  235. # this is NOT the same as .first (as sorted is default!)
  236. # as it keeps the order in the series (and not the group order)
  237. # related GH 7287
  238. expected = s.groupby(g, sort=False).first()
  239. result = s.groupby(g, sort=False).nth(0, dropna='all')
  240. assert_series_equal(result, expected)
  241. # doc example
  242. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  243. g = df.groupby('A')
  244. result = g.B.nth(0, dropna=True)
  245. expected = g.B.first()
  246. assert_series_equal(result, expected)
  247. # test multiple nth values
  248. df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
  249. columns=['A', 'B'])
  250. g = df.groupby('A')
  251. assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
  252. assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
  253. assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
  254. assert_frame_equal(
  255. g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
  256. assert_frame_equal(
  257. g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
  258. assert_frame_equal(
  259. g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
  260. assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
  261. assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
  262. business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
  263. freq='B')
  264. df = DataFrame(1, index=business_dates, columns=['a', 'b'])
  265. # get the first, fourth and last two business days for each month
  266. key = (df.index.year, df.index.month)
  267. result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
  268. expected_dates = pd.to_datetime(
  269. ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
  270. '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
  271. '2014/6/27', '2014/6/30'])
  272. expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
  273. assert_frame_equal(result, expected)
  274. def test_nth_multi_index(self):
  275. # PR 9090, related to issue 8979
  276. # test nth on MultiIndex, should match .first()
  277. grouped = self.three_group.groupby(['A', 'B'])
  278. result = grouped.nth(0)
  279. expected = grouped.first()
  280. assert_frame_equal(result, expected)
  281. def test_nth_multi_index_as_expected(self):
  282. # PR 9090, related to issue 8979
  283. # test nth on MultiIndex
  284. three_group = DataFrame(
  285. {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
  286. 'foo', 'foo', 'foo'],
  287. 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
  288. 'two', 'two', 'one'],
  289. 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
  290. 'dull', 'shiny', 'shiny', 'shiny']})
  291. grouped = three_group.groupby(['A', 'B'])
  292. result = grouped.nth(0)
  293. expected = DataFrame(
  294. {'C': ['dull', 'dull', 'dull', 'dull']},
  295. index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
  296. ['one', 'two', 'one', 'two']],
  297. names=['A', 'B']))
  298. assert_frame_equal(result, expected)
  299. def test_group_selection_cache(self):
  300. # GH 12839 nth, head, and tail should return same result consistently
  301. df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
  302. expected = df.iloc[[0, 2]].set_index('A')
  303. g = df.groupby('A')
  304. result1 = g.head(n=2)
  305. result2 = g.nth(0)
  306. assert_frame_equal(result1, df)
  307. assert_frame_equal(result2, expected)
  308. g = df.groupby('A')
  309. result1 = g.tail(n=2)
  310. result2 = g.nth(0)
  311. assert_frame_equal(result1, df)
  312. assert_frame_equal(result2, expected)
  313. g = df.groupby('A')
  314. result1 = g.nth(0)
  315. result2 = g.head(n=2)
  316. assert_frame_equal(result1, expected)
  317. assert_frame_equal(result2, df)
  318. g = df.groupby('A')
  319. result1 = g.nth(0)
  320. result2 = g.tail(n=2)
  321. assert_frame_equal(result1, expected)
  322. assert_frame_equal(result2, df)
  323. def test_grouper_index_types(self):
  324. # related GH5375
  325. # groupby misbehaving when using a Floatlike index
  326. df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
  327. for index in [tm.makeFloatIndex, tm.makeStringIndex,
  328. tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
  329. tm.makePeriodIndex]:
  330. df.index = index(len(df))
  331. df.groupby(list('abcde')).apply(lambda x: x)
  332. df.index = list(reversed(df.index.tolist()))
  333. df.groupby(list('abcde')).apply(lambda x: x)
  334. def test_grouper_multilevel_freq(self):
  335. # GH 7885
  336. # with level and freq specified in a pd.Grouper
  337. from datetime import date, timedelta
  338. d0 = date.today() - timedelta(days=14)
  339. dates = date_range(d0, date.today())
  340. date_index = pd.MultiIndex.from_product(
  341. [dates, dates], names=['foo', 'bar'])
  342. df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)
  343. # Check string level
  344. expected = df.reset_index().groupby([pd.Grouper(
  345. key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
  346. # reset index changes columns dtype to object
  347. expected.columns = pd.Index([0], dtype='int64')
  348. result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
  349. level='bar', freq='W')]).sum()
  350. assert_frame_equal(result, expected)
  351. # Check integer level
  352. result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
  353. level=1, freq='W')]).sum()
  354. assert_frame_equal(result, expected)
  355. def test_grouper_creation_bug(self):
  356. # GH 8795
  357. df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
  358. g = df.groupby('A')
  359. expected = g.sum()
  360. g = df.groupby(pd.Grouper(key='A'))
  361. result = g.sum()
  362. assert_frame_equal(result, expected)
  363. result = g.apply(lambda x: x.sum())
  364. assert_frame_equal(result, expected)
  365. g = df.groupby(pd.Grouper(key='A', axis=0))
  366. result = g.sum()
  367. assert_frame_equal(result, expected)
  368. # GH8866
  369. s = Series(np.arange(8, dtype='int64'),
  370. index=pd.MultiIndex.from_product(
  371. [list('ab'), range(2),
  372. date_range('20130101', periods=2)],
  373. names=['one', 'two', 'three']))
  374. result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
  375. expected = Series([28], index=Index(
  376. [Timestamp('2013-01-31')], freq='M', name='three'))
  377. assert_series_equal(result, expected)
  378. # just specifying a level breaks
  379. result = s.groupby(pd.Grouper(level='one')).sum()
  380. expected = s.groupby(level='one').sum()
  381. assert_series_equal(result, expected)
  382. def test_grouper_getting_correct_binner(self):
  383. # GH 10063
  384. # using a non-time-based grouper and a time-based grouper
  385. # and specifying levels
  386. df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product(
  387. [list('ab'), date_range('20130101', periods=80)], names=['one',
  388. 'two']))
  389. result = df.groupby([pd.Grouper(level='one'), pd.Grouper(
  390. level='two', freq='M')]).sum()
  391. expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]},
  392. index=MultiIndex.from_product(
  393. [list('ab'),
  394. date_range('20130101', freq='M', periods=3)],
  395. names=['one', 'two']))
  396. assert_frame_equal(result, expected)
  397. def test_grouper_iter(self):
  398. self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo'])
  399. def test_empty_groups(self):
  400. # GH # 1048
  401. self.assertRaises(ValueError, self.df.groupby, [])
  402. def test_groupby_grouper(self):
  403. grouped = self.df.groupby('A')
  404. result = self.df.groupby(grouped.grouper).mean()
  405. expected = grouped.mean()
  406. assert_frame_equal(result, expected)
  407. def test_groupby_duplicated_column_errormsg(self):
  408. # GH7511
  409. df = DataFrame(columns=['A', 'B', 'A', 'C'],
  410. data=[range(4), range(2, 6), range(0, 8, 2)])
  411. self.assertRaises(ValueError, df.groupby, 'A')
  412. self.assertRaises(ValueError, df.groupby, ['A', 'B'])
  413. grouped = df.groupby('B')
  414. c = grouped.count()
  415. self.assertTrue(c.columns.nlevels == 1)
  416. self.assertTrue(c.columns.size == 3)
  417. def test_groupby_dict_mapping(self):
  418. # GH #679
  419. from pandas import Series
  420. s = Series({'T1': 5})
  421. result = s.groupby({'T1': 'T2'}).agg(sum)
  422. expected = s.groupby(['T2']).agg(sum)
  423. assert_series_equal(result, expected)
  424. s = Series([1., 2., 3., 4.], index=list('abcd'))
  425. mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
  426. result = s.groupby(mapping).mean()
  427. result2 = s.groupby(mapping).agg(np.mean)
  428. expected = s.groupby([0, 0, 1, 1]).mean()
  429. expected2 = s.groupby([0, 0, 1, 1]).mean()
  430. assert_series_equal(result, expected)
  431. assert_series_equal(result, result2)
  432. assert_series_equal(result, expected2)
  433. def test_groupby_bounds_check(self):
  434. # groupby_X is code-generated, so if one variant
  435. # does, the rest probably do to
  436. a = np.array([1, 2], dtype='object')
  437. b = np.array([1, 2, 3], dtype='object')
  438. self.assertRaises(AssertionError, pd.algos.groupby_object, a, b)
  439. def test_groupby_grouper_f_sanity_checked(self):
  440. dates = date_range('01-Jan-2013', periods=12, freq='MS')
  441. ts = Series(np.random.randn(12), index=dates)
  442. # GH3035
  443. # index.map is used to apply grouper to the index
  444. # if it fails on the elements, map tries it on the entire index as
  445. # a sequence. That can yield invalid results that cause trouble
  446. # down the line.
  447. # the surprise comes from using key[0:6] rather then str(key)[0:6]
  448. # when the elements are Timestamp.
  449. # the result is Index[0:6], very confusing.
  450. self.assertRaises(AssertionError, ts.groupby, lambda key: key[0:6])
  451. def test_groupby_nonobject_dtype(self):
  452. key = self.mframe.index.labels[0]
  453. grouped = self.mframe.groupby(key)
  454. result = grouped.sum()
  455. expected = self.mframe.groupby(key.astype('O')).sum()
  456. assert_frame_equal(result, expected)
  457. # GH 3911, mixed frame non-conversion
  458. df = self.df_mixed_floats.copy()
  459. df['value'] = lrange(len(df))
  460. def max_value(group):
  461. return group.ix[group['value'].idxmax()]
  462. applied = df.groupby('A').apply(max_value)
  463. result = applied.get_dtype_counts().sort_values()
  464. expected = Series({'object': 2,
  465. 'float64': 2,
  466. 'int64': 1}).sort_values()
  467. assert_series_equal(result, expected)
  468. def test_groupby_return_type(self):
  469. # GH2893, return a reduced type
  470. df1 = DataFrame([{"val1": 1,
  471. "val2": 20}, {"val1": 1,
  472. "val2": 19}, {"val1": 2,
  473. "val2": 27}, {"val1": 2,
  474. "val2": 12}
  475. ])
  476. def func(dataf):
  477. return dataf["val2"] - dataf["val2"].mean()
  478. result = df1.groupby("val1", squeeze=True).apply(func)
  479. tm.assertIsInstance(result, Series)
  480. df2 = DataFrame([{"val1": 1,
  481. "val2": 20}, {"val1": 1,
  482. "val2": 19}, {"val1": 1,
  483. "val2": 27}, {"val1": 1,
  484. "val2": 12}
  485. ])
  486. def func(dataf):
  487. return dataf["val2"] - dataf["val2"].mean()
  488. result = df2.groupby("val1", squeeze=True).apply(func)
  489. tm.assertIsInstance(result, Series)
  490. # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
  491. df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
  492. result = df.groupby('X', squeeze=False).count()
  493. tm.assertIsInstance(result, DataFrame)
  494. # GH5592
  495. # inconcistent return type
  496. df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
  497. 'Pony', 'Pony'], B=Series(
  498. np.arange(7), dtype='int64'), C=date_range(
  499. '20130101', periods=7)))
  500. def f(grp):
  501. return grp.iloc[0]
  502. expected = df.groupby('A').first()[['B']]
  503. result = df.groupby('A').apply(f)[['B']]
  504. assert_frame_equal(result, expected)
  505. def f(grp):
  506. if grp.name == 'Tiger':
  507. return None
  508. return grp.iloc[0]
  509. result = df.groupby('A').apply(f)[['B']]
  510. e = expected.copy()
  511. e.loc['Tiger'] = np.nan
  512. assert_frame_equal(result, e)
  513. def f(grp):
  514. if grp.name == 'Pony':
  515. return None
  516. return grp.iloc[0]
  517. result = df.groupby('A').apply(f)[['B']]
  518. e = expected.copy()
  519. e.loc['Pony'] = np.nan
  520. assert_frame_equal(result, e)
  521. # 5592 revisited, with datetimes
  522. def f(grp):
  523. if grp.name == 'Pony':
  524. return None
  525. return grp.iloc[0]
  526. result = df.groupby('A').apply(f)[['C']]
  527. e = df.groupby('A').first()[['C']]
  528. e.loc['Pony'] = pd.NaT
  529. assert_frame_equal(result, e)
  530. # scalar outputs
  531. def f(grp):
  532. if grp.name == 'Pony':
  533. return None
  534. return grp.iloc[0].loc['C']
  535. result = df.groupby('A').apply(f)
  536. e = df.groupby('A').first()['C'].copy()
  537. e.loc['Pony'] = np.nan
  538. e.name = None
  539. assert_series_equal(result, e)
  540. def test_agg_api(self):
  541. # GH 6337
  542. # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
  543. # different api for agg when passed custom function with mixed frame
  544. df = DataFrame({'data1': np.random.randn(5),
  545. 'data2': np.random.randn(5),
  546. 'key1': ['a', 'a', 'b', 'b', 'a'],
  547. 'key2': ['one', 'two', 'one', 'two', 'one']})
  548. grouped = df.groupby('key1')
  549. def peak_to_peak(arr):
  550. return arr.max() - arr.min()
  551. expected = grouped.agg([peak_to_peak])
  552. expected.columns = ['data1', 'data2']
  553. result = grouped.agg(peak_to_peak)
  554. assert_frame_equal(result, expected)
  555. def test_agg_regression1(self):
  556. grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
  557. result = grouped.agg(np.mean)
  558. expected = grouped.mean()
  559. assert_frame_equal(result, expected)
  560. def test_agg_datetimes_mixed(self):
  561. data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]]
  562. df1 = DataFrame({'key': [x[0] for x in data],
  563. 'date': [x[1] for x in data],
  564. 'value': [x[2] for x in data]})
  565. data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1]
  566. else None, row[2]] for row in data]
  567. df2 = DataFrame({'key': [x[0] for x in data],
  568. 'date': [x[1] for x in data],
  569. 'value': [x[2] for x in data]})
  570. df1['weights'] = df1['value'] / df1['value'].sum()
  571. gb1 = df1.groupby('date').aggregate(np.sum)
  572. df2['weights'] = df1['value'] / df1['value'].sum()
  573. gb2 = df2.groupby('date').aggregate(np.sum)
  574. assert (len(gb1) == len(gb2))
  575. def test_agg_period_index(self):
  576. from pandas import period_range, PeriodIndex
  577. prng = period_range('2012-1-1', freq='M', periods=3)
  578. df = DataFrame(np.random.randn(3, 2), index=prng)
  579. rs = df.groupby(level=0).sum()
  580. tm.assertIsInstance(rs.index, PeriodIndex)
  581. # GH 3579
  582. index = period_range(start='1999-01', periods=5, freq='M')
  583. s1 = Series(np.random.rand(len(index)), index=index)
  584. s2 = Series(np.random.rand(len(index)), index=index)
  585. series = [('s1', s1), ('s2', s2)]
  586. df = DataFrame.from_items(series)
  587. grouped = df.groupby(df.index.month)
  588. list(grouped)
  589. def test_agg_dict_parameter_cast_result_dtypes(self):
  590. # GH 12821
  591. df = DataFrame(
  592. {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
  593. 'time': date_range('1/1/2011', periods=8, freq='H')})
  594. df.loc[[0, 1, 2, 5], 'time'] = None
  595. # test for `first` function
  596. exp = df.loc[[0, 3, 4, 6]].set_index('class')
  597. grouped = df.groupby('class')
  598. assert_frame_equal(grouped.first(), exp)
  599. assert_frame_equal(grouped.agg('first'), exp)
  600. assert_frame_equal(grouped.agg({'time': 'first'}), exp)
  601. assert_series_equal(grouped.time.first(), exp['time'])
  602. assert_series_equal(grouped.time.agg('first'), exp['time'])
  603. # test for `last` function
  604. exp = df.loc[[0, 3, 4, 7]].set_index('class')
  605. grouped = df.groupby('class')
  606. assert_frame_equal(grouped.last(), exp)
  607. assert_frame_equal(grouped.agg('last'), exp)
  608. assert_frame_equal(grouped.agg({'time': 'last'}), exp)
  609. assert_series_equal(grouped.time.last(), exp['time'])
  610. assert_series_equal(grouped.time.agg('last'), exp['time'])
  611. def test_agg_must_agg(self):
  612. grouped = self.df.groupby('A')['C']
  613. self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
  614. self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])
  615. def test_agg_ser_multi_key(self):
  616. # TODO(wesm): unused
  617. ser = self.df.C # noqa
  618. f = lambda x: x.sum()
  619. results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
  620. expected = self.df.groupby(['A', 'B']).sum()['C']
  621. assert_series_equal(results, expected)
  622. def test_get_group(self):
  623. wp = tm.makePanel()
  624. grouped = wp.groupby(lambda x: x.month, axis='major')
  625. gp = grouped.get_group(1)
  626. expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
  627. assert_panel_equal(gp, expected)
  628. # GH 5267
  629. # be datelike friendly
  630. df = DataFrame({'DATE': pd.to_datetime(
  631. ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013',
  632. '11-Oct-2013', '11-Oct-2013']),
  633. 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
  634. 'VAL': [1, 2, 3, 4, 5, 6]})
  635. g = df.groupby('DATE')
  636. key = list(g.groups)[0]
  637. result1 = g.get_group(key)
  638. result2 = g.get_group(Timestamp(key).to_datetime())
  639. result3 = g.get_group(str(Timestamp(key)))
  640. assert_frame_equal(result1, result2)
  641. assert_frame_equal(result1, result3)
  642. g = df.groupby(['DATE', 'label'])
  643. key = list(g.groups)[0]
  644. result1 = g.get_group(key)
  645. result2 = g.get_group((Timestamp(key[0]).to_datetime(), key[1]))
  646. result3 = g.get_group((str(Timestamp(key[0])), key[1]))
  647. assert_frame_equal(result1, result2)
  648. assert_frame_equal(result1, result3)
  649. # must pass a same-length tuple with multiple keys
  650. self.assertRaises(ValueError, lambda: g.get_group('foo'))
  651. self.assertRaises(ValueError, lambda: g.get_group(('foo')))
  652. self.assertRaises(ValueError,
  653. lambda: g.get_group(('foo', 'bar', 'baz')))
  654. def test_get_group_grouped_by_tuple(self):
  655. # GH 8121
  656. df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
  657. gr = df.groupby('ids')
  658. expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2])
  659. result = gr.get_group((1, ))
  660. assert_frame_equal(result, expected)
  661. dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01',
  662. '2010-01-02'])
  663. df = DataFrame({'ids': [(x, ) for x in dt]})
  664. gr = df.groupby('ids')
  665. result = gr.get_group(('2010-01-01', ))
  666. expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2])
  667. assert_frame_equal(result, expected)
  668. def test_agg_apply_corner(self):
  669. # nothing to group, all NA
  670. grouped = self.ts.groupby(self.ts * np.nan)
  671. self.assertEqual(self.ts.dtype, np.float64)
  672. # groupby float64 values results in Float64Index
  673. exp = Series([], dtype=np.float64, index=pd.Index(
  674. [], dtype=np.float64))
  675. assert_series_equal(grouped.sum(), exp)
  676. assert_series_equal(grouped.agg(np.sum), exp)
  677. assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)
  678. # DataFrame
  679. grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
  680. exp_df = DataFrame(columns=self.tsframe.columns, dtype=float,
  681. index=pd.Index([], dtype=np.float64))
  682. assert_frame_equal(grouped.sum(), exp_df, check_names=False)
  683. assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
  684. assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
  685. check_names=False)
  686. def test_agg_grouping_is_list_tuple(self):
  687. from pandas.core.groupby import Grouping
  688. df = tm.makeTimeDataFrame()
  689. grouped = df.groupby(lambda x: x.year)
  690. grouper = grouped.grouper.groupings[0].grouper
  691. grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))
  692. result = grouped.agg(np.mean)
  693. expected = grouped.mean()
  694. tm.assert_frame_equal(result, expected)
  695. grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))
  696. result = grouped.agg(np.mean)
  697. expected = grouped.mean()
  698. tm.assert_frame_equal(result, expected)
  699. def test_grouping_error_on_multidim_input(self):
  700. from pandas.core.groupby import Grouping
  701. self.assertRaises(ValueError,
  702. Grouping, self.df.index, self.df[['A', 'A']])
  703. def test_agg_python_multiindex(self):
  704. grouped = self.mframe.groupby(['A', 'B'])
  705. result = grouped.agg(np.mean)
  706. expected = grouped.mean()
  707. tm.assert_frame_equal(result, expected)
  708. def test_apply_describe_bug(self):
  709. grouped = self.mframe.groupby(level='first')
  710. grouped.describe() # it works!
  def test_apply_issues(self):
      # GH 5788
      # Headerless "date,time,value" rows: four quotes on each of three days.
      rows = ['2011.05.16,00:00,1.40893',
              '2011.05.16,01:00,1.40760',
              '2011.05.16,02:00,1.40750',
              '2011.05.16,03:00,1.40649',
              '2011.05.17,02:00,1.40893',
              '2011.05.17,03:00,1.40760',
              '2011.05.17,04:00,1.40750',
              '2011.05.17,05:00,1.40649',
              '2011.05.18,02:00,1.40893',
              '2011.05.18,03:00,1.40760',
              '2011.05.18,04:00,1.40750',
              '2011.05.18,05:00,1.40649']
      s = '\n'.join(rows)

      # date and time are parsed into one 'date_time' column, used as index
      parsed = pd.read_csv(
          StringIO(s), header=None, names=['date', 'time', 'value'],
          parse_dates=[['date', 'time']])
      parsed = parsed.set_index('date_time')

      # idxmax called directly and through apply must give the same frame
      expected = parsed.groupby(parsed.index.date).idxmax()
      result = parsed.groupby(parsed.index.date).apply(
          lambda x: x.idxmax())
      assert_frame_equal(result, expected)

      # GH 5789
      # don't auto coerce dates
      raw = pd.read_csv(
          StringIO(s), header=None, names=['date', 'time', 'value'])
      exp_idx = pd.Index(['2011.05.16', '2011.05.17', '2011.05.18'],
                         dtype=object, name='date')
      expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
      result = raw.groupby('date').apply(
          lambda x: x['time'][x['value'].idxmax()])
      assert_series_equal(result, expected)
  def test_time_field_bug(self):
      # Regression test related to GH issue 11324: an exception used to be
      # raised when a non-key column of the grouped frame held time-based
      # values and the applied function did not return that column.
      stamps = [datetime.now() for _ in range(10)]
      df = pd.DataFrame({'a': 1, 'b': stamps})

      def func_with_no_date(batch):
          return pd.Series({'c': 2})

      def func_with_date(batch):
          return pd.Series({'c': 2, 'b': datetime(2015, 1, 1)})

      # applied function drops the datetime column
      dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
      dfg_no_conversion_expected = pd.DataFrame(
          {'c': 2}, index=pd.Index([1], name='a'))

      # applied function returns a datetime value
      dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
      dfg_conversion_expected = pd.DataFrame(
          {'b': datetime(2015, 1, 1), 'c': 2},
          index=pd.Index([1], name='a'))

      self.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
      self.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
  def test_len(self):
      df = tm.makeTimeDataFrame()

      # grouping by (year, month, day) is expected to give one group per row
      by_day = df.groupby([lambda x: x.year, lambda x: x.month,
                           lambda x: x.day])
      self.assertEqual(len(by_day), len(df))

      # grouping by (year, month): one group per distinct pair in the index
      by_month = df.groupby([lambda x: x.year, lambda x: x.month])
      distinct_months = {(x.year, x.month) for x in df.index}
      self.assertEqual(len(by_month), len(distinct_months))

      # issue 11016
      df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
      for key, n_groups in [('a', 0), ('b', 3), (('a', 'b'), 3)]:
          self.assertEqual(len(df.groupby(key)), n_groups)
  def test_groups(self):
      # single key: every row listed under a label carries that label in 'A'
      by_a = self.df.groupby(['A'])
      cached = by_a.groups
      self.assertIs(cached, by_a.groups)  # caching works
      for label, rows in compat.iteritems(by_a.groups):
          selected = self.df.ix[rows]
          self.assertTrue((selected['A'] == label).all())

      # two keys: labels are (A, B) pairs
      by_ab = self.df.groupby(['A', 'B'])
      cached = by_ab.groups
      self.assertIs(cached, by_ab.groups)  # caching works
      for label, rows in compat.iteritems(by_ab.groups):
          self.assertTrue((self.df.ix[rows]['A'] == label[0]).all())
          self.assertTrue((self.df.ix[rows]['B'] == label[1]).all())
  def test_aggregate_str_func(self):
      # Aggregating by function *name* must match calling the method itself.
      def _check_results(grouped):
          # single series
          result = grouped['A'].agg('std')
          expected = grouped['A'].std()
          assert_series_equal(result, expected)

          # group frame by function name
          result = grouped.aggregate('var')
          expected = grouped.var()
          assert_frame_equal(result, expected)

          # group frame by function dict
          pairs = [('A', 'var'), ('B', 'std'), ('C', 'mean'), ('D', 'sem')]
          result = grouped.agg(OrderedDict(pairs))
          expected = DataFrame(OrderedDict(
              (col, getattr(grouped[col], how)()) for col, how in pairs))
          assert_frame_equal(result, expected)

      # one key, then two keys
      _check_results(self.tsframe.groupby(lambda x: x.weekday()))
      _check_results(self.tsframe.groupby([lambda x: x.month,
                                           lambda x: x.weekday()]))
  def test_aggregate_item_by_item(self):
      # Aggregate self.df grouped by 'A' with a function returning the size
      # of what it is given, and compare the 'foo' / 'bar' rows of the
      # result with the number of rows carrying that label.
      def aggfun(ser):
          return ser.size

      grouped = self.df.groupby('A')
      result = grouped.agg(aggfun)
      foo = (self.df.A == 'foo').sum()
      bar = (self.df.A == 'bar').sum()
      K = len(result.columns)

      # GH5782
      # odd comparisons can result here, so cast to make easy
      exp = pd.Series(np.array([foo] * K), index=list('BCD'),
                      dtype=np.float64, name='foo')
      tm.assert_series_equal(result.xs('foo'), exp)

      exp = pd.Series(np.array([bar] * K), index=list('BCD'),
                      dtype=np.float64, name='bar')
      tm.assert_almost_equal(result.xs('bar'), exp)

      # an empty frame grouped by the same key still yields a DataFrame,
      # with no rows
      result = DataFrame().groupby(self.df.A).agg(aggfun)
      tm.assertIsInstance(result, DataFrame)
      self.assertEqual(len(result), 0)
  def test_agg_item_by_item_raise_typeerror(self):
      # A TypeError raised inside the aggregation function must reach the
      # caller of agg().
      from numpy.random import randint

      frame = DataFrame(randint(10, size=(20, 10)))

      def raiseException(df):
          pprint_thing('----------------------------------------')
          pprint_thing(df.to_string())
          raise TypeError

      agg = frame.groupby(0).agg
      self.assertRaises(TypeError, agg, raiseException)
  def test_basic_regression(self):
      # regression
      # No assertion: grouping by the random float keys and taking the mean
      # only has to run without raising.
      values = [1.0 * x for x in lrange(1, 10) * 10][:1095]
      result = Series(values, lrange(0, len(values)))

      keys = np.random.random((1100, ))
      keys = Series(keys, lrange(0, len(keys))) * 10.

      result.groupby(keys).mean()
  def test_transform(self):
      # values 0,0,0,1,1,1,2,2,2 on labels 0..8, rows then put in random order
      data = Series(np.arange(9) // 3, index=np.arange(9))
      shuffled = np.arange(9)
      np.random.shuffle(shuffled)
      data = data.reindex(shuffled)

      by_block = data.groupby(lambda x: x // 3)
      transformed = by_block.transform(lambda x: x * x.sum())
      # label 7 sits in block 2, whose three values are all 2: 2 * 6 == 12
      self.assertEqual(transformed[7], 12)

      # GH 8046
      # make sure that we preserve the input order
      df = DataFrame(np.arange(6, dtype='int64').reshape(3, 2),
                     columns=["a", "b"], index=[0, 2, 1])
      key = [0, 0, 1]
      expected = (df.sort_index()
                  .groupby(key).transform(lambda x: x - x.mean())
                  .groupby(key).mean())
      result = (df.groupby(key).transform(lambda x: x - x.mean())
                .groupby(key).mean())
      assert_frame_equal(result, expected)

      # transform and apply must agree for a de-meaning function
      def demean(arr):
          return arr - arr.mean()

      people = DataFrame(np.random.randn(5, 5),
                         columns=list('abcde'),
                         index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
      key = ['one', 'two', 'one', 'two', 'one']
      result = people.groupby(key).transform(demean).groupby(key).mean()
      expected = people.groupby(key).apply(demean).groupby(key).mean()
      assert_frame_equal(result, expected)

      # GH 8430
      # no assertion: transform on a TimeGrouper groupby only has to run
      df = tm.makeTimeDataFrame()
      monthly = df.groupby(pd.TimeGrouper('M'))
      monthly.transform(lambda x: x - 1)

      # GH 9700
      df = DataFrame({'a': range(5, 10), 'b': range(5)})
      result = df.groupby('a').transform(max)
      expected = DataFrame({'b': range(5)})
      tm.assert_frame_equal(result, expected)
  def test_transform_fast(self):
      # A Series transform given a reduction (callable or its name) must
      # repeat each group's value over the rows of that group.
      #
      # The ids use floor division: with true division (the Python 3 meaning
      # of `/`) every id is a distinct float, each group holds a single row
      # and the comparison below no longer exercises any broadcasting.
      df = DataFrame({'id': np.arange(100000) // 3,
                      'val': np.random.randn(100000)})

      grp = df.groupby('id')['val']

      # ids are non-decreasing, so repeating each group mean by the group
      # size lines up with the original row order
      values = np.repeat(grp.mean().values,
                         _ensure_platform_int(grp.count().values))
      expected = pd.Series(values, index=df.index, name='val')

      result = grp.transform(np.mean)
      assert_series_equal(result, expected)

      result = grp.transform('mean')
      assert_series_equal(result, expected)

      # GH 12737
      df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
                         'd': pd.date_range('2014-1-1', '2014-1-4'),
                         'i': [1, 2, 3, 4]},
                        columns=['grouping', 'f', 'i', 'd'])
      result = df.groupby('grouping').transform('first')

      # rows 1 and 2 share group 1, so row 2 takes row 1's values
      dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
               pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
      expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
                               'd': dates,
                               'i': [1, 2, 2, 4]},
                              columns=['f', 'i', 'd'])
      assert_frame_equal(result, expected)

      # selection
      result = df.groupby('grouping')[['f', 'i']].transform('first')
      expected = expected[['f', 'i']]
      assert_frame_equal(result, expected)

      # dup columns
      df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
      result = df.groupby('g').transform('first')
      expected = df.drop('g', axis=1)
      assert_frame_equal(result, expected)
  def test_transform_broadcast(self):
      # Series: the transformed result keeps the original index, and each
      # group's slice of it is compared with that group's mean
      ts_groups = self.ts.groupby(lambda x: x.month)
      broadcast = ts_groups.transform(np.mean)
      self.assert_index_equal(broadcast.index, self.ts.index)
      for _, chunk in ts_groups:
          assert_fp_equal(broadcast.reindex(chunk.index), chunk.mean())

      # DataFrame grouped along the rows, checked column by column
      frame_groups = self.tsframe.groupby(lambda x: x.month)
      broadcast = frame_groups.transform(np.mean)
      self.assert_index_equal(broadcast.index, self.tsframe.index)
      for _, chunk in frame_groups:
          col_means = chunk.mean()
          rows = broadcast.reindex(chunk.index)
          for col in self.tsframe:
              assert_fp_equal(rows[col], col_means[col])

      # group columns
      col_groups = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                        axis=1)
      broadcast = col_groups.transform(np.mean)
      self.assert_index_equal(broadcast.index, self.tsframe.index)
      self.assert_index_equal(broadcast.columns, self.tsframe.columns)
      for _, chunk in col_groups:
          row_means = chunk.mean(1)
          cols = broadcast.reindex(columns=chunk.columns)
          for idx in chunk.index:
              assert_fp_equal(cols.xs(idx), row_means[idx])
  def test_transform_axis(self):
      # make sure that we are setting the axes
      # correctly when on axis=0 or 1
      # in the presence of a non-monotonic indexer
      # GH12713
      base = self.tsframe.iloc[0:5]
      n_rows = len(base.index)
      n_cols = len(base.columns)
      tso = DataFrame(np.random.randn(n_rows, n_cols),
                      index=base.index,
                      columns=base.columns,
                      dtype='float64')

      def check_both_axes(ts):
          # rows grouped by weekday: frame minus transform('mean') must
          # equal the de-meaning done through apply
          grouped = ts.groupby(lambda x: x.weekday())
          result = ts - grouped.transform('mean')
          expected = grouped.apply(lambda x: x - x.mean())
          assert_frame_equal(result, expected)

          # same check on the transposed frame, grouping the columns
          ts = ts.T
          grouped = ts.groupby(lambda x: x.weekday(), axis=1)
          result = ts - grouped.transform('mean')
          expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
          assert_frame_equal(result, expected)

      # monotonic
      check_both_axes(tso)

      # non-monotonic: first two rows swapped
      check_both_axes(tso.iloc[[1, 0] + list(range(2, len(base)))])
  982. def test_transform_dtype(self):
  983. # GH 9807
  984. # Check transform dtype output is preserved
  985. df = DataFrame([[1, 3], [2, 3]])
  986. result = df.groupby(1).transform('mean')
  987. expected = DataFrame([[1.5], [1.5]])
  988. assert_frame_equal(result, expected)
  def test_transform_bug(self):
      # GH 5712
      # transforming on a datetime column
      frame = DataFrame({'A': Timestamp('20130101'), 'B': np.arange(5)})
      by_stamp = frame.groupby('A')['B']
      result = by_stamp.transform(lambda x: x.rank(ascending=False))

      # every row shares the same 'A', so B = 0..4 is ranked as one group:
      # descending ranks 5..1
      expected = Series(np.arange(5, 0, step=-1), name='B')
      assert_series_equal(result, expected)
  def test_transform_multiple(self):
      # No assertion: both transforms on a two-key groupby only have to run.
      by_year_month = self.ts.groupby([lambda x: x.year,
                                       lambda x: x.month])
      by_year_month.transform(lambda x: x * 2)
      by_year_month.transform(np.mean)
  def test_dispatch_transform(self):
      # keep every 5th row of tsframe, then reindex back to the full index
      sparse = self.tsframe[::5].reindex(self.tsframe.index)

      # fillna called on the groupby must match the same fill run through
      # transform
      filled = sparse.groupby(lambda x: x.month).fillna(method='pad')
      expected = sparse.groupby(lambda x: x.month).transform(
          lambda x: x.fillna(method='pad'))
      assert_frame_equal(filled, expected)
  def test_transform_select_columns(self):
      # Transforming a column selection of the groupby must match
      # transforming the pre-selected frame grouped by the same key.
      f = lambda x: x.mean()

      # select with a list of names (not a bare tuple), the same form used
      # for the frame selection below
      result = self.df.groupby('A')[['C', 'D']].transform(f)

      selection = self.df[['C', 'D']]
      expected = selection.groupby(self.df['A']).transform(f)

      assert_frame_equal(result, expected)
  def test_transform_exclude_nuisance(self):
      # this also tests orderings in transform between
      # series/frame to make sure it's consistent
      grouped = self.df.groupby('A')
      # expected frame assembled from per-column Series transforms
      expected = DataFrame({'C': grouped['C'].transform(np.mean),
                            'D': grouped['D'].transform(np.mean)})

      result = self.df.groupby('A').transform(np.mean)
      assert_frame_equal(result, expected)
  def test_transform_function_aliases(self):
      # the string alias 'mean' must behave like np.mean ...
      # ... on the whole frame
      by_name = self.df.groupby('A').transform('mean')
      by_func = self.df.groupby('A').transform(np.mean)
      assert_frame_equal(by_name, by_func)

      # ... and on a single selected column
      by_name = self.df.groupby('A')['C'].transform('mean')
      by_func = self.df.groupby('A')['C'].transform(np.mean)
      assert_series_equal(by_name, by_func)
  def test_series_fast_transform_date(self):
      # GH 13191
      frame = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
                            'd': pd.date_range('2014-1-1', '2014-1-4')})
      result = frame.groupby('grouping')['d'].transform('first')

      # expected: NaT for the row whose key is NaN; rows 1 and 2 share
      # group 1 and both get that group's first date
      second = pd.Timestamp('2014-1-2')
      dates = [pd.NaT, second, second, pd.Timestamp('2014-1-4')]
      expected = pd.Series(dates, name='d')
      assert_series_equal(result, expected)
  1040. def test_transform_length(self):
  1041. # GH 9697
  1042. df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
  1043. expected = pd.Series([3.0] * 4)
  1044. def nsum(x):
  1045. return np.nansum(x)
  1046. results = [df.groupby('col1…

Large files are truncated, but you can click here to view the full file