/pandas/tests/test_groupby.py
Python | 6654 lines | 6415 code | 164 blank | 75 comment | 23 complexity | 7119f6a79ece6b6b900523ea4093a56a MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import nose
- from datetime import datetime
- from numpy import nan
- from pandas.types.common import _ensure_platform_int
- from pandas import date_range, bdate_range, Timestamp, isnull
- from pandas.core.index import Index, MultiIndex, CategoricalIndex
- from pandas.core.api import Categorical, DataFrame
- from pandas.core.common import UnsupportedFunctionCall
- from pandas.core.groupby import (SpecificationError, DataError, _nargsort,
- _lexsort_indexer)
- from pandas.core.series import Series
- from pandas.core.config import option_context
- from pandas.formats.printing import pprint_thing
- from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
- assert_series_equal, assert_almost_equal,
- assert_index_equal, assertRaisesRegexp)
- from pandas.compat import (range, long, lrange, StringIO, lmap, lzip, map, zip,
- builtins, OrderedDict, product as cart_product)
- from pandas import compat
- from pandas.core.panel import Panel
- from pandas.tools.merge import concat
- from collections import defaultdict
- from functools import partial
- import pandas.core.common as com
- import numpy as np
- import pandas.core.nanops as nanops
- import pandas.util.testing as tm
- import pandas as pd
- class TestGroupBy(tm.TestCase):
- _multiprocess_can_split_ = True
    def setUp(self):
        """Build the shared fixtures used by most tests in this class.

        Note: some tests mutate these (e.g. test_first_last_nth sets part
        of self.df['B'] to NaN), so each test gets fresh copies here.
        """
        self.ts = tm.makeTimeSeries()
        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)
        # small mixed frame with duplicated group labels in 'A' and 'B'
        self.df = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.random.randn(8)})
        # same layout, but with a float32 column to exercise dtype handling
        self.df_mixed_floats = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.array(
                 np.random.randn(8), dtype='float32')})
        # 10-row frame indexed by a two-level MultiIndex
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                                columns=['A', 'B', 'C'])
        # three candidate key columns (A/B/C) plus three value columns
        self.three_group = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny'],
             'D': np.random.randn(11),
             'E': np.random.randn(11),
             'F': np.random.randn(11)})
    def test_basic(self):
        """Core Series groupby contract: iterate, agg, transform, apply."""
        def checkit(dtype):
            data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
            index = np.arange(9)
            np.random.shuffle(index)
            data = data.reindex(index)
            grouped = data.groupby(lambda x: x // 3)
            # three groups of three elements each
            for k, v in grouped:
                self.assertEqual(len(v), 3)
            agged = grouped.aggregate(np.mean)
            self.assertEqual(agged[1], 1)
            assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
            assert_series_equal(agged, grouped.mean())
            assert_series_equal(grouped.agg(np.sum), grouped.sum())
            # transform broadcasts the per-group result back onto the
            # original (shuffled) index, matching apply here
            expected = grouped.apply(lambda x: x * x.sum())
            transformed = grouped.transform(lambda x: x * x.sum())
            self.assertEqual(transformed[7], 12)
            assert_series_equal(transformed, expected)
            value_grouped = data.groupby(data)
            assert_series_equal(value_grouped.aggregate(np.mean), agged,
                                check_index_type=False)
            # complex agg
            agged = grouped.aggregate([np.mean, np.std])
            agged = grouped.aggregate({'one': np.mean, 'two': np.std})
            # x.name inside the agg func is the group key
            group_constants = {0: 10, 1: 20, 2: 30}
            agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
            self.assertEqual(agged[1], 21)
            # corner cases: a non-reducing lambda must raise
            self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
        for dtype in ['int64', 'int32', 'float64', 'float32']:
            checkit(dtype)
- def test_select_bad_cols(self):
- df = DataFrame([[1, 2]], columns=['A', 'B'])
- g = df.groupby('A')
- self.assertRaises(KeyError, g.__getitem__, ['C']) # g[['C']]
- self.assertRaises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']]
- with assertRaisesRegexp(KeyError, '^[^A]+$'):
- # A should not be referenced as a bad column...
- # will have to rethink regex if you change message!
- g[['A', 'C']]
    def test_first_last_nth(self):
        """first()/last()/nth() select the expected per-group rows.

        Note: mutates self.df partway through (sets B to NaN for 'foo').
        """
        # tests for first / last / nth
        grouped = self.df.groupby('A')
        first = grouped.first()
        expected = self.df.ix[[1, 0], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)
        nth = grouped.nth(0)
        assert_frame_equal(nth, expected)
        last = grouped.last()
        expected = self.df.ix[[5, 7], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        assert_frame_equal(last, expected)
        nth = grouped.nth(-1)
        assert_frame_equal(nth, expected)
        nth = grouped.nth(1)
        expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy()
        expected.index = Index(['foo', 'bar'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)
        # it works!
        grouped['B'].first()
        grouped['B'].last()
        grouped['B'].nth(0)
        # groups that are all-NaN in B yield NaN from first/last/nth
        self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
        self.assertTrue(isnull(grouped['B'].first()['foo']))
        self.assertTrue(isnull(grouped['B'].last()['foo']))
        self.assertTrue(isnull(grouped['B'].nth(0)['foo']))
        # v0.14.0 whatsnew: nth(0, dropna='any') matches first()
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        g = df.groupby('A')
        result = g.first()
        expected = df.iloc[[1, 2]].set_index('A')
        assert_frame_equal(result, expected)
        expected = df.iloc[[1, 2]].set_index('A')
        result = g.nth(0, dropna='any')
        assert_frame_equal(result, expected)
    def test_first_last_nth_dtypes(self):
        """first/last/nth must preserve column dtypes (bool, int, float32)."""
        df = self.df_mixed_floats.copy()
        df['E'] = True
        df['F'] = 1
        # tests for first / last / nth
        grouped = df.groupby('A')
        first = grouped.first()
        expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)
        last = grouped.last()
        expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(last, expected)
        nth = grouped.nth(1)
        expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)
        # GH 2763, first/last shifting dtypes
        idx = lrange(10)
        idx.append(9)  # duplicate label so groups have unequal sizes
        s = Series(data=lrange(11), index=idx, name='IntCol')
        self.assertEqual(s.dtype, 'int64')
        f = s.groupby(level=0).first()
        self.assertEqual(f.dtype, 'int64')
- def test_nth(self):
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
- assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
- assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
- assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
- assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
- assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
- assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
- assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
- assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
- assert_frame_equal(g[['B']].nth(0),
- df.ix[[0, 2], ['A', 'B']].set_index('A'))
- exp = df.set_index('A')
- assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
- assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
- exp['B'] = np.nan
- assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
- assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
- # out of bounds, regression from 0.13.1
- # GH 6621
- df = DataFrame({'color': {0: 'green',
- 1: 'green',
- 2: 'red',
- 3: 'red',
- 4: 'red'},
- 'food': {0: 'ham',
- 1: 'eggs',
- 2: 'eggs',
- 3: 'ham',
- 4: 'pork'},
- 'two': {0: 1.5456590000000001,
- 1: -0.070345000000000005,
- 2: -2.4004539999999999,
- 3: 0.46206000000000003,
- 4: 0.52350799999999997},
- 'one': {0: 0.56573799999999996,
- 1: -0.9742360000000001,
- 2: 1.033801,
- 3: -0.78543499999999999,
- 4: 0.70422799999999997}}).set_index(['color',
- 'food'])
- result = df.groupby(level=0, as_index=False).nth(2)
- expected = df.iloc[[-1]]
- assert_frame_equal(result, expected)
- result = df.groupby(level=0, as_index=False).nth(3)
- expected = df.loc[[]]
- assert_frame_equal(result, expected)
- # GH 7559
- # from the vbench
- df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
- s = df[1]
- g = df[0]
- expected = s.groupby(g).first()
- expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
- assert_series_equal(expected2, expected, check_names=False)
- self.assertTrue(expected.name, 0)
- self.assertEqual(expected.name, 1)
- # validate first
- v = s[g == 1].iloc[0]
- self.assertEqual(expected.iloc[0], v)
- self.assertEqual(expected2.iloc[0], v)
- # this is NOT the same as .first (as sorted is default!)
- # as it keeps the order in the series (and not the group order)
- # related GH 7287
- expected = s.groupby(g, sort=False).first()
- result = s.groupby(g, sort=False).nth(0, dropna='all')
- assert_series_equal(result, expected)
- # doc example
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
- result = g.B.nth(0, dropna=True)
- expected = g.B.first()
- assert_series_equal(result, expected)
- # test multiple nth values
- df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
- columns=['A', 'B'])
- g = df.groupby('A')
- assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
- assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
- assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
- assert_frame_equal(
- g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
- assert_frame_equal(
- g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
- assert_frame_equal(
- g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
- assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
- assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
- business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
- freq='B')
- df = DataFrame(1, index=business_dates, columns=['a', 'b'])
- # get the first, fourth and last two business days for each month
- key = (df.index.year, df.index.month)
- result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
- expected_dates = pd.to_datetime(
- ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
- '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
- '2014/6/27', '2014/6/30'])
- expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
- assert_frame_equal(result, expected)
- def test_nth_multi_index(self):
- # PR 9090, related to issue 8979
- # test nth on MultiIndex, should match .first()
- grouped = self.three_group.groupby(['A', 'B'])
- result = grouped.nth(0)
- expected = grouped.first()
- assert_frame_equal(result, expected)
- def test_nth_multi_index_as_expected(self):
- # PR 9090, related to issue 8979
- # test nth on MultiIndex
- three_group = DataFrame(
- {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
- 'foo', 'foo', 'foo'],
- 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
- 'two', 'two', 'one'],
- 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
- 'dull', 'shiny', 'shiny', 'shiny']})
- grouped = three_group.groupby(['A', 'B'])
- result = grouped.nth(0)
- expected = DataFrame(
- {'C': ['dull', 'dull', 'dull', 'dull']},
- index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
- ['one', 'two', 'one', 'two']],
- names=['A', 'B']))
- assert_frame_equal(result, expected)
- def test_group_selection_cache(self):
- # GH 12839 nth, head, and tail should return same result consistently
- df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
- expected = df.iloc[[0, 2]].set_index('A')
- g = df.groupby('A')
- result1 = g.head(n=2)
- result2 = g.nth(0)
- assert_frame_equal(result1, df)
- assert_frame_equal(result2, expected)
- g = df.groupby('A')
- result1 = g.tail(n=2)
- result2 = g.nth(0)
- assert_frame_equal(result1, df)
- assert_frame_equal(result2, expected)
- g = df.groupby('A')
- result1 = g.nth(0)
- result2 = g.head(n=2)
- assert_frame_equal(result1, expected)
- assert_frame_equal(result2, df)
- g = df.groupby('A')
- result1 = g.nth(0)
- result2 = g.tail(n=2)
- assert_frame_equal(result1, expected)
- assert_frame_equal(result2, df)
- def test_grouper_index_types(self):
- # related GH5375
- # groupby misbehaving when using a Floatlike index
- df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
- for index in [tm.makeFloatIndex, tm.makeStringIndex,
- tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
- tm.makePeriodIndex]:
- df.index = index(len(df))
- df.groupby(list('abcde')).apply(lambda x: x)
- df.index = list(reversed(df.index.tolist()))
- df.groupby(list('abcde')).apply(lambda x: x)
    def test_grouper_multilevel_freq(self):
        """GH 7885: pd.Grouper with level= and freq= on a MultiIndex."""
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta
        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product(
            [dates, dates], names=['foo', 'bar'])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)
        # Check string level: baseline built via reset_index + key=
        expected = df.reset_index().groupby([pd.Grouper(
            key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype='int64')
        result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
            level='bar', freq='W')]).sum()
        assert_frame_equal(result, expected)
        # Check integer level
        result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
            level=1, freq='W')]).sum()
        assert_frame_equal(result, expected)
    def test_grouper_creation_bug(self):
        """GH 8795 / GH 8866: pd.Grouper(key=...) and Grouper(level=...)."""
        # GH 8795: Grouper(key='A') is equivalent to groupby('A')
        df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('A')
        expected = g.sum()
        g = df.groupby(pd.Grouper(key='A'))
        result = g.sum()
        assert_frame_equal(result, expected)
        result = g.apply(lambda x: x.sum())
        assert_frame_equal(result, expected)
        # explicit axis=0 behaves the same
        g = df.groupby(pd.Grouper(key='A', axis=0))
        result = g.sum()
        assert_frame_equal(result, expected)
        # GH8866: Grouper with level + freq on a MultiIndex datetime level
        s = Series(np.arange(8, dtype='int64'),
                   index=pd.MultiIndex.from_product(
                       [list('ab'), range(2),
                        date_range('20130101', periods=2)],
                       names=['one', 'two', 'three']))
        result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
        expected = Series([28], index=Index(
            [Timestamp('2013-01-31')], freq='M', name='three'))
        assert_series_equal(result, expected)
        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level='one')).sum()
        expected = s.groupby(level='one').sum()
        assert_series_equal(result, expected)
    def test_grouper_getting_correct_binner(self):
        """GH 10063: mix a plain level grouper with a freq-based one."""
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product(
            [list('ab'), date_range('20130101', periods=80)], names=['one',
                                                                     'two']))
        result = df.groupby([pd.Grouper(level='one'), pd.Grouper(
            level='two', freq='M')]).sum()
        # 80 daily points starting 2013-01-01 -> 31 + 28 + 21 per key
        expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]},
                             index=MultiIndex.from_product(
                                 [list('ab'),
                                  date_range('20130101', freq='M', periods=3)],
                                 names=['one', 'two']))
        assert_frame_equal(result, expected)
- def test_grouper_iter(self):
- self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo'])
- def test_empty_groups(self):
- # GH # 1048
- self.assertRaises(ValueError, self.df.groupby, [])
- def test_groupby_grouper(self):
- grouped = self.df.groupby('A')
- result = self.df.groupby(grouped.grouper).mean()
- expected = grouped.mean()
- assert_frame_equal(result, expected)
- def test_groupby_duplicated_column_errormsg(self):
- # GH7511
- df = DataFrame(columns=['A', 'B', 'A', 'C'],
- data=[range(4), range(2, 6), range(0, 8, 2)])
- self.assertRaises(ValueError, df.groupby, 'A')
- self.assertRaises(ValueError, df.groupby, ['A', 'B'])
- grouped = df.groupby('B')
- c = grouped.count()
- self.assertTrue(c.columns.nlevels == 1)
- self.assertTrue(c.columns.size == 3)
- def test_groupby_dict_mapping(self):
- # GH #679
- from pandas import Series
- s = Series({'T1': 5})
- result = s.groupby({'T1': 'T2'}).agg(sum)
- expected = s.groupby(['T2']).agg(sum)
- assert_series_equal(result, expected)
- s = Series([1., 2., 3., 4.], index=list('abcd'))
- mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
- result = s.groupby(mapping).mean()
- result2 = s.groupby(mapping).agg(np.mean)
- expected = s.groupby([0, 0, 1, 1]).mean()
- expected2 = s.groupby([0, 0, 1, 1]).mean()
- assert_series_equal(result, expected)
- assert_series_equal(result, result2)
- assert_series_equal(result, expected2)
    def test_groupby_bounds_check(self):
        """Mismatched lengths in the cython groupby kernels must raise."""
        # groupby_X is code-generated, so if one variant
        # does, the rest probably do to
        a = np.array([1, 2], dtype='object')
        b = np.array([1, 2, 3], dtype='object')
        self.assertRaises(AssertionError, pd.algos.groupby_object, a, b)
    def test_groupby_grouper_f_sanity_checked(self):
        """GH 3035: a key function failing per-element must not silently
        be applied to the whole index instead."""
        dates = date_range('01-Jan-2013', periods=12, freq='MS')
        ts = Series(np.random.randn(12), index=dates)
        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather then str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.
        self.assertRaises(AssertionError, ts.groupby, lambda key: key[0:6])
    def test_groupby_nonobject_dtype(self):
        """Grouping by a non-object key array matches the object-cast key."""
        key = self.mframe.index.labels[0]
        grouped = self.mframe.groupby(key)
        result = grouped.sum()
        expected = self.mframe.groupby(key.astype('O')).sum()
        assert_frame_equal(result, expected)
        # GH 3911, mixed frame non-conversion: apply must not coerce the
        # float32/int64/object columns of the mixed frame
        df = self.df_mixed_floats.copy()
        df['value'] = lrange(len(df))
        def max_value(group):
            return group.ix[group['value'].idxmax()]
        applied = df.groupby('A').apply(max_value)
        result = applied.get_dtype_counts().sort_values()
        expected = Series({'object': 2,
                           'float64': 2,
                           'int64': 1}).sort_values()
        assert_series_equal(result, expected)
    def test_groupby_return_type(self):
        """GH 2893 / 3596 / 5592: apply must return a consistent, reduced
        type, including when some groups return None (-> NaN/NaT rows)."""
        # GH2893, return a reduced type
        df1 = DataFrame([{"val1": 1,
                          "val2": 20}, {"val1": 1,
                                        "val2": 19}, {"val1": 2,
                                                      "val2": 27}, {"val1": 2,
                                                                    "val2": 12}
                         ])
        def func(dataf):
            return dataf["val2"] - dataf["val2"].mean()
        result = df1.groupby("val1", squeeze=True).apply(func)
        tm.assertIsInstance(result, Series)
        # same with a single group
        df2 = DataFrame([{"val1": 1,
                          "val2": 20}, {"val1": 1,
                                        "val2": 19}, {"val1": 1,
                                                      "val2": 27}, {"val1": 1,
                                                                    "val2": 12}
                         ])
        def func(dataf):
            return dataf["val2"] - dataf["val2"].mean()
        result = df2.groupby("val1", squeeze=True).apply(func)
        tm.assertIsInstance(result, Series)
        # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
        df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
        result = df.groupby('X', squeeze=False).count()
        tm.assertIsInstance(result, DataFrame)
        # GH5592
        # inconcistent return type
        df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
                               'Pony', 'Pony'], B=Series(
                                   np.arange(7), dtype='int64'), C=date_range(
                                       '20130101', periods=7)))
        def f(grp):
            return grp.iloc[0]
        expected = df.groupby('A').first()[['B']]
        result = df.groupby('A').apply(f)[['B']]
        assert_frame_equal(result, expected)
        # a group returning None becomes a NaN row
        def f(grp):
            if grp.name == 'Tiger':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['B']]
        e = expected.copy()
        e.loc['Tiger'] = np.nan
        assert_frame_equal(result, e)
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['B']]
        e = expected.copy()
        e.loc['Pony'] = np.nan
        assert_frame_equal(result, e)
        # 5592 revisited, with datetimes: missing group becomes NaT
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0]
        result = df.groupby('A').apply(f)[['C']]
        e = df.groupby('A').first()[['C']]
        e.loc['Pony'] = pd.NaT
        assert_frame_equal(result, e)
        # scalar outputs
        def f(grp):
            if grp.name == 'Pony':
                return None
            return grp.iloc[0].loc['C']
        result = df.groupby('A').apply(f)
        e = df.groupby('A').first()['C'].copy()
        e.loc['Pony'] = np.nan
        e.name = None
        assert_series_equal(result, e)
- def test_agg_api(self):
- # GH 6337
- # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
- # different api for agg when passed custom function with mixed frame
- df = DataFrame({'data1': np.random.randn(5),
- 'data2': np.random.randn(5),
- 'key1': ['a', 'a', 'b', 'b', 'a'],
- 'key2': ['one', 'two', 'one', 'two', 'one']})
- grouped = df.groupby('key1')
- def peak_to_peak(arr):
- return arr.max() - arr.min()
- expected = grouped.agg([peak_to_peak])
- expected.columns = ['data1', 'data2']
- result = grouped.agg(peak_to_peak)
- assert_frame_equal(result, expected)
- def test_agg_regression1(self):
- grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
- result = grouped.agg(np.mean)
- expected = grouped.mean()
- assert_frame_equal(result, expected)
- def test_agg_datetimes_mixed(self):
- data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]]
- df1 = DataFrame({'key': [x[0] for x in data],
- 'date': [x[1] for x in data],
- 'value': [x[2] for x in data]})
- data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1]
- else None, row[2]] for row in data]
- df2 = DataFrame({'key': [x[0] for x in data],
- 'date': [x[1] for x in data],
- 'value': [x[2] for x in data]})
- df1['weights'] = df1['value'] / df1['value'].sum()
- gb1 = df1.groupby('date').aggregate(np.sum)
- df2['weights'] = df1['value'] / df1['value'].sum()
- gb2 = df2.groupby('date').aggregate(np.sum)
- assert (len(gb1) == len(gb2))
    def test_agg_period_index(self):
        """Grouping on a PeriodIndex keeps a PeriodIndex in the result."""
        from pandas import period_range, PeriodIndex
        prng = period_range('2012-1-1', freq='M', periods=3)
        df = DataFrame(np.random.randn(3, 2), index=prng)
        rs = df.groupby(level=0).sum()
        tm.assertIsInstance(rs.index, PeriodIndex)
        # GH 3579: iterating the groups of a period-indexed frame works
        index = period_range(start='1999-01', periods=5, freq='M')
        s1 = Series(np.random.rand(len(index)), index=index)
        s2 = Series(np.random.rand(len(index)), index=index)
        series = [('s1', s1), ('s2', s2)]
        df = DataFrame.from_items(series)
        grouped = df.groupby(df.index.month)
        list(grouped)
    def test_agg_dict_parameter_cast_result_dtypes(self):
        """GH 12821: agg with a dict must not lose the datetime dtype."""
        df = DataFrame(
            {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
             'time': date_range('1/1/2011', periods=8, freq='H')})
        df.loc[[0, 1, 2, 5], 'time'] = None  # inject NaT values
        # test for `first` function
        exp = df.loc[[0, 3, 4, 6]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.first(), exp)
        assert_frame_equal(grouped.agg('first'), exp)
        assert_frame_equal(grouped.agg({'time': 'first'}), exp)
        assert_series_equal(grouped.time.first(), exp['time'])
        assert_series_equal(grouped.time.agg('first'), exp['time'])
        # test for `last` function
        exp = df.loc[[0, 3, 4, 7]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.last(), exp)
        assert_frame_equal(grouped.agg('last'), exp)
        assert_frame_equal(grouped.agg({'time': 'last'}), exp)
        assert_series_equal(grouped.time.last(), exp['time'])
        assert_series_equal(grouped.time.agg('last'), exp['time'])
- def test_agg_must_agg(self):
- grouped = self.df.groupby('A')['C']
- self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
- self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])
- def test_agg_ser_multi_key(self):
- # TODO(wesm): unused
- ser = self.df.C # noqa
- f = lambda x: x.sum()
- results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
- expected = self.df.groupby(['A', 'B']).sum()['C']
- assert_series_equal(results, expected)
    def test_get_group(self):
        """get_group: Panel axis grouping, datelike keys, tuple key checks."""
        wp = tm.makePanel()
        grouped = wp.groupby(lambda x: x.month, axis='major')
        gp = grouped.get_group(1)
        expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
        assert_panel_equal(gp, expected)
        # GH 5267
        # be datelike friendly: the key may be a Timestamp, datetime or str
        df = DataFrame({'DATE': pd.to_datetime(
            ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013',
             '11-Oct-2013', '11-Oct-2013']),
            'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
            'VAL': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('DATE')
        key = list(g.groups)[0]
        result1 = g.get_group(key)
        result2 = g.get_group(Timestamp(key).to_datetime())
        result3 = g.get_group(str(Timestamp(key)))
        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        # same with a composite (date, label) key
        g = df.groupby(['DATE', 'label'])
        key = list(g.groups)[0]
        result1 = g.get_group(key)
        result2 = g.get_group((Timestamp(key[0]).to_datetime(), key[1]))
        result3 = g.get_group((str(Timestamp(key[0])), key[1]))
        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        # must pass a same-length tuple with multiple keys
        self.assertRaises(ValueError, lambda: g.get_group('foo'))
        self.assertRaises(ValueError, lambda: g.get_group(('foo')))
        self.assertRaises(ValueError,
                          lambda: g.get_group(('foo', 'bar', 'baz')))
- def test_get_group_grouped_by_tuple(self):
- # GH 8121
- df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
- gr = df.groupby('ids')
- expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2])
- result = gr.get_group((1, ))
- assert_frame_equal(result, expected)
- dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01',
- '2010-01-02'])
- df = DataFrame({'ids': [(x, ) for x in dt]})
- gr = df.groupby('ids')
- result = gr.get_group(('2010-01-01', ))
- expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2])
- assert_frame_equal(result, expected)
    def test_agg_apply_corner(self):
        """All-NaN group keys: nothing to group, results are empty."""
        # nothing to group, all NA
        grouped = self.ts.groupby(self.ts * np.nan)
        self.assertEqual(self.ts.dtype, np.float64)
        # groupby float64 values results in Float64Index
        exp = Series([], dtype=np.float64, index=pd.Index(
            [], dtype=np.float64))
        assert_series_equal(grouped.sum(), exp)
        assert_series_equal(grouped.agg(np.sum), exp)
        assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)
        # DataFrame: same, keeping the original columns
        grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
        exp_df = DataFrame(columns=self.tsframe.columns, dtype=float,
                           index=pd.Index([], dtype=np.float64))
        assert_frame_equal(grouped.sum(), exp_df, check_names=False)
        assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
        assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
                           check_names=False)
    def test_agg_grouping_is_list_tuple(self):
        """A Grouping backed by a list or tuple (not ndarray) still aggs."""
        from pandas.core.groupby import Grouping
        df = tm.makeTimeDataFrame()
        grouped = df.groupby(lambda x: x.year)
        grouper = grouped.grouper.groupings[0].grouper
        # swap the internal grouper for a plain-list-backed Grouping
        grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
        # and a tuple-backed one
        grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
- def test_grouping_error_on_multidim_input(self):
- from pandas.core.groupby import Grouping
- self.assertRaises(ValueError,
- Grouping, self.df.index, self.df[['A', 'A']])
- def test_agg_python_multiindex(self):
- grouped = self.mframe.groupby(['A', 'B'])
- result = grouped.agg(np.mean)
- expected = grouped.mean()
- tm.assert_frame_equal(result, expected)
- def test_apply_describe_bug(self):
- grouped = self.mframe.groupby(level='first')
- grouped.describe() # it works!
    def test_apply_issues(self):
        """GH 5788 / GH 5789: apply on date groups, without coercing the
        'date' strings to datetimes in the second case."""
        # GH 5788
        s = """2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""
        df = pd.read_csv(
            StringIO(s), header=None, names=['date', 'time', 'value'],
            parse_dates=[['date', 'time']])
        df = df.set_index('date_time')
        expected = df.groupby(df.index.date).idxmax()
        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
        assert_frame_equal(result, expected)
        # GH 5789
        # don't auto coerce dates
        df = pd.read_csv(
            StringIO(s), header=None, names=['date', 'time', 'value'])
        exp_idx = pd.Index(
            ['2011.05.16', '2011.05.17', '2011.05.18'
             ], dtype=object, name='date')
        expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
        result = df.groupby('date').apply(
            lambda x: x['time'][x['value'].idxmax()])
        assert_series_equal(result, expected)
    def test_time_field_bug(self):
        """GH 11324: apply must work whether or not the returned Series
        includes the non-key datetime column."""
        # Test a fix for the following error related to GH issue 11324 When
        # non-key fields in a group-by dataframe contained time-based fields
        # that were not returned by the apply function, an exception would be
        # raised.
        df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})
        def func_with_no_date(batch):
            return pd.Series({'c': 2})
        def func_with_date(batch):
            return pd.Series({'c': 2, 'b': datetime(2015, 1, 1)})
        dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
        dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
        dfg_no_conversion_expected.index.name = 'a'
        dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
        dfg_conversion_expected = pd.DataFrame(
            {'b': datetime(2015, 1, 1),
             'c': 2}, index=[1])
        dfg_conversion_expected.index.name = 'a'
        self.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
        self.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
- def test_len(self):
- df = tm.makeTimeDataFrame()
- grouped = df.groupby([lambda x: x.year, lambda x: x.month,
- lambda x: x.day])
- self.assertEqual(len(grouped), len(df))
- grouped = df.groupby([lambda x: x.year, lambda x: x.month])
- expected = len(set([(x.year, x.month) for x in df.index]))
- self.assertEqual(len(grouped), expected)
- # issue 11016
- df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
- self.assertEqual(len(df.groupby(('a'))), 0)
- self.assertEqual(len(df.groupby(('b'))), 3)
- self.assertEqual(len(df.groupby(('a', 'b'))), 3)
    def test_groups(self):
        """grouped.groups maps each key to the row labels of its group,
        and is cached across accesses."""
        grouped = self.df.groupby(['A'])
        groups = grouped.groups
        self.assertIs(groups, grouped.groups)  # caching works
        for k, v in compat.iteritems(grouped.groups):
            self.assertTrue((self.df.ix[v]['A'] == k).all())
        # two-key grouping: the mapping keys are tuples
        grouped = self.df.groupby(['A', 'B'])
        groups = grouped.groups
        self.assertIs(groups, grouped.groups)  # caching works
        for k, v in compat.iteritems(grouped.groups):
            self.assertTrue((self.df.ix[v]['A'] == k[0]).all())
            self.assertTrue((self.df.ix[v]['B'] == k[1]).all())
    def test_aggregate_str_func(self):
        """Aggregation by function *name* ('std', 'var', dict of names)
        matches calling the methods directly."""
        def _check_results(grouped):
            # single series
            result = grouped['A'].agg('std')
            expected = grouped['A'].std()
            assert_series_equal(result, expected)
            # group frame by function name
            result = grouped.aggregate('var')
            expected = grouped.var()
            assert_frame_equal(result, expected)
            # group frame by function dict
            result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'],
                                              ['C', 'mean'], ['D', 'sem']]))
            expected = DataFrame(OrderedDict([['A', grouped['A'].var(
            )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()],
                ['D', grouped['D'].sem()]]))
            assert_frame_equal(result, expected)
        by_weekday = self.tsframe.groupby(lambda x: x.weekday())
        _check_results(by_weekday)
        by_mwkday = self.tsframe.groupby([lambda x: x.month,
                                          lambda x: x.weekday()])
        _check_results(by_mwkday)
- def test_aggregate_item_by_item(self):
- df = self.df.copy()
- df['E'] = ['a'] * len(self.df)
- grouped = self.df.groupby('A')
- # API change in 0.11
- # def aggfun(ser):
- # return len(ser + 'a')
- # result = grouped.agg(aggfun)
- # self.assertEqual(len(result.columns), 1)
- aggfun = lambda ser: ser.size
- result = grouped.agg(aggfun)
- foo = (self.df.A == 'foo').sum()
- bar = (self.df.A == 'bar').sum()
- K = len(result.columns)
- # GH5782
- # odd comparisons can result here, so cast to make easy
- exp = pd.Series(np.array([foo] * K), index=list('BCD'),
- dtype=np.float64, name='foo')
- tm.assert_series_equal(result.xs('foo'), exp)
- exp = pd.Series(np.array([bar] * K), index=list('BCD'),
- dtype=np.float64, name='bar')
- tm.assert_almost_equal(result.xs('bar'), exp)
- def aggfun(ser):
- return ser.size
- result = DataFrame().groupby(self.df.A).agg(aggfun)
- tm.assertIsInstance(result, DataFrame)
- self.assertEqual(len(result), 0)
- def test_agg_item_by_item_raise_typeerror(self):
- from numpy.random import randint
- df = DataFrame(randint(10, size=(20, 10)))
- def raiseException(df):
- pprint_thing('----------------------------------------')
- pprint_thing(df.to_string())
- raise TypeError
- self.assertRaises(TypeError, df.groupby(0).agg, raiseException)
- def test_basic_regression(self):
- # regression
- T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
- result = Series(T, lrange(0, len(T)))
- groupings = np.random.random((1100, ))
- groupings = Series(groupings, lrange(0, len(groupings))) * 10.
- grouped = result.groupby(groupings)
- grouped.mean()
    def test_transform(self):
        """GroupBy.transform checks: results align to the original
        (possibly shuffled / non-monotonic) index, input order is
        preserved, and aggregating transforms broadcast per group."""
        data = Series(np.arange(9) // 3, index=np.arange(9))
        index = np.arange(9)
        np.random.shuffle(index)
        # reindex with a shuffled index so alignment is actually exercised
        data = data.reindex(index)
        grouped = data.groupby(lambda x: x // 3)
        transformed = grouped.transform(lambda x: x * x.sum())
        # label 7 is in group {6, 7, 8}, all values 2 -> 2 * sum(2,2,2) = 12
        self.assertEqual(transformed[7], 12)
        # GH 8046
        # make sure that we preserve the input order
        df = DataFrame(
            np.arange(6, dtype='int64').reshape(
                3, 2), columns=["a", "b"], index=[0, 2, 1])
        key = [0, 0, 1]
        # transform on a sorted copy then re-grouping must match doing it
        # directly on the unsorted frame
        expected = df.sort_index().groupby(key).transform(
            lambda x: x - x.mean()).groupby(key).mean()
        result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
            key).mean()
        assert_frame_equal(result, expected)
        def demean(arr):
            # subtract the group mean from every element
            return arr - arr.mean()
        # transform and apply of the same demeaning function must agree
        people = DataFrame(np.random.randn(5, 5),
                           columns=['a', 'b', 'c', 'd', 'e'],
                           index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
        key = ['one', 'two', 'one', 'two', 'one']
        result = people.groupby(key).transform(demean).groupby(key).mean()
        expected = people.groupby(key).apply(demean).groupby(key).mean()
        assert_frame_equal(result, expected)
        # GH 8430
        # smoke test: transform through a time-based grouper
        df = tm.makeTimeDataFrame()
        g = df.groupby(pd.TimeGrouper('M'))
        g.transform(lambda x: x - 1)
        # GH 9700
        # grouping column 'a' is excluded from the transformed result
        df = DataFrame({'a': range(5, 10), 'b': range(5)})
        result = df.groupby('a').transform(max)
        expected = DataFrame({'b': range(5)})
        tm.assert_frame_equal(result, expected)
- def test_transform_fast(self):
- df = DataFrame({'id': np.arange(100000) / 3,
- 'val': np.random.randn(100000)})
- grp = df.groupby('id')['val']
- values = np.repeat(grp.mean().values,
- _ensure_platform_int(grp.count().values))
- expected = pd.Series(values, index=df.index, name='val')
- result = grp.transform(np.mean)
- assert_series_equal(result, expected)
- result = grp.transform('mean')
- assert_series_equal(result, expected)
- # GH 12737
- df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
- 'd': pd.date_range('2014-1-1', '2014-1-4'),
- 'i': [1, 2, 3, 4]},
- columns=['grouping', 'f', 'i', 'd'])
- result = df.groupby('grouping').transform('first')
- dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
- pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
- expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
- 'd': dates,
- 'i': [1, 2, 2, 4]},
- columns=['f', 'i', 'd'])
- assert_frame_equal(result, expected)
- # selection
- result = df.groupby('grouping')[['f', 'i']].transform('first')
- expected = expected[['f', 'i']]
- assert_frame_equal(result, expected)
- # dup columns
- df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
- result = df.groupby('g').transform('first')
- expected = df.drop('g', axis=1)
- assert_frame_equal(result, expected)
- def test_transform_broadcast(self):
- grouped = self.ts.groupby(lambda x: x.month)
- result = grouped.transform(np.mean)
- self.assert_index_equal(result.index, self.ts.index)
- for _, gp in grouped:
- assert_fp_equal(result.reindex(gp.index), gp.mean())
- grouped = self.tsframe.groupby(lambda x: x.month)
- result = grouped.transform(np.mean)
- self.assert_index_equal(result.index, self.tsframe.index)
- for _, gp in grouped:
- agged = gp.mean()
- res = result.reindex(gp.index)
- for col in self.tsframe:
- assert_fp_equal(res[col], agged[col])
- # group columns
- grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
- axis=1)
- result = grouped.transform(np.mean)
- self.assert_index_equal(result.index, self.tsframe.index)
- self.assert_index_equal(result.columns, self.tsframe.columns)
- for _, gp in grouped:
- agged = gp.mean(1)
- res = result.reindex(columns=gp.columns)
- for idx in gp.index:
- assert_fp_equal(res.xs(idx), agged[idx])
- def test_transform_axis(self):
- # make sure that we are setting the axes
- # correctly when on axis=0 or 1
- # in the presence of a non-monotonic indexer
- # GH12713
- base = self.tsframe.iloc[0:5]
- r = len(base.index)
- c = len(base.columns)
- tso = DataFrame(np.random.randn(r, c),
- index=base.index,
- columns=base.columns,
- dtype='float64')
- # monotonic
- ts = tso
- grouped = ts.groupby(lambda x: x.weekday())
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: x - x.mean())
- assert_frame_equal(result, expected)
- ts = ts.T
- grouped = ts.groupby(lambda x: x.weekday(), axis=1)
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
- assert_frame_equal(result, expected)
- # non-monotonic
- ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
- grouped = ts.groupby(lambda x: x.weekday())
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: x - x.mean())
- assert_frame_equal(result, expected)
- ts = ts.T
- grouped = ts.groupby(lambda x: x.weekday(), axis=1)
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
- assert_frame_equal(result, expected)
- def test_transform_dtype(self):
- # GH 9807
- # Check transform dtype output is preserved
- df = DataFrame([[1, 3], [2, 3]])
- result = df.groupby(1).transform('mean')
- expected = DataFrame([[1.5], [1.5]])
- assert_frame_equal(result, expected)
- def test_transform_bug(self):
- # GH 5712
- # transforming on a datetime column
- df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
- result = df.groupby('A')['B'].transform(
- lambda x: x.rank(ascending=False))
- expected = Series(np.arange(5, 0, step=-1), name='B')
- assert_series_equal(result, expected)
- def test_transform_multiple(self):
- grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month])
- grouped.transform(lambda x: x * 2)
- grouped.transform(np.mean)
- def test_dispatch_transform(self):
- df = self.tsframe[::5].reindex(self.tsframe.index)
- grouped = df.groupby(lambda x: x.month)
- filled = grouped.fillna(method='pad')
- fillit = lambda x: x.fillna(method='pad')
- expected = df.groupby(lambda x: x.month).transform(fillit)
- assert_frame_equal(filled, expected)
- def test_transform_select_columns(self):
- f = lambda x: x.mean()
- result = self.df.groupby('A')['C', 'D'].transform(f)
- selection = self.df[['C', 'D']]
- expected = selection.groupby(self.df['A']).transform(f)
- assert_frame_equal(result, expected)
- def test_transform_exclude_nuisance(self):
- # this also tests orderings in transform between
- # series/frame to make sure it's consistent
- expected = {}
- grouped = self.df.groupby('A')
- expected['C'] = grouped['C'].transform(np.mean)
- expected['D'] = grouped['D'].transform(np.mean)
- expected = DataFrame(expected)
- result = self.df.groupby('A').transform(np.mean)
- assert_frame_equal(result, expected)
- def test_transform_function_aliases(self):
- result = self.df.groupby('A').transform('mean')
- expected = self.df.groupby('A').transform(np.mean)
- assert_frame_equal(result, expected)
- result = self.df.groupby('A')['C'].transform('mean')
- expected = self.df.groupby('A')['C'].transform(np.mean)
- assert_series_equal(result, expected)
- def test_series_fast_transform_date(self):
- # GH 13191
- df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
- 'd': pd.date_range('2014-1-1', '2014-1-4')})
- result = df.groupby('grouping')['d'].transform('first')
- dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
- pd.Timestamp('2014-1-4')]
- expected = pd.Series(dates, name='d')
- assert_series_equal(result, expected)
- def test_transform_length(self):
- # GH 9697
- df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
- expected = pd.Series([3.0] * 4)
- def nsum(x):
- return np.nansum(x)
- results = [df.groupby('col1…
Large files files are truncated, but you can click here to view the full file