/pandas/tests/test_groupby.py
Python | 4473 lines | 4292 code | 132 blank | 49 comment | 24 complexity | 8f76f988646643acde67dd7273b6636f MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- from __future__ import print_function
- import nose
- from numpy.testing.decorators import slow
- from datetime import datetime
- from numpy import nan
- from pandas import date_range,bdate_range, Timestamp
- from pandas.core.index import Index, MultiIndex, Int64Index
- from pandas.core.common import rands
- from pandas.core.api import Categorical, DataFrame
- from pandas.core.groupby import (SpecificationError, DataError,
- _nargsort, _lexsort_indexer)
- from pandas.core.series import Series
- from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
- assert_series_equal, assert_almost_equal,
- assert_index_equal, assertRaisesRegexp)
- from pandas.compat import(
- range, long, lrange, StringIO, lmap, lzip, map,
- zip, builtins, OrderedDict
- )
- from pandas import compat
- from pandas.core.panel import Panel
- from pandas.tools.merge import concat
- from collections import defaultdict
- import pandas.core.common as com
- import numpy as np
- import pandas.core.nanops as nanops
- import pandas.util.testing as tm
- import pandas as pd
- from numpy.testing import assert_equal
def _skip_if_mpl_not_installed():
    # Probe for matplotlib; tests that need plotting are skipped (not
    # failed) when the import is unavailable.
    try:
        import matplotlib.pyplot  # noqa -- imported only to test availability
    except ImportError:
        raise nose.SkipTest("matplotlib not installed")
def commonSetUp(self):
    # Attach shared fixtures to ``self``: a business-day range, a random
    # upper-case string index, a first-letter grouping (as Series and as
    # dict), and two frames that share one random matrix.
    n_rows = 250

    self.dateRange = bdate_range('1/1/2005', periods=n_rows)
    self.stringIndex = Index([rands(8).upper() for _ in range(n_rows)])

    first_letters = [label[0] for label in self.stringIndex]
    self.groupId = Series(first_letters, index=self.stringIndex)
    self.groupDict = dict(compat.iteritems(self.groupId))

    self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])

    values = np.random.randn(n_rows, 5)
    self.stringMatrix = DataFrame(values, columns=self.columnIndex,
                                  index=self.stringIndex)
    self.timeMatrix = DataFrame(values, columns=self.columnIndex,
                                index=self.dateRange)
- class TestGroupBy(tm.TestCase):
- _multiprocess_can_split_ = True
def setUp(self):
    # Fixtures used throughout the test case: random series/frames from
    # ``tm``, two small keyed frames, a MultiIndex-ed frame and a
    # three-key frame.
    self.ts = tm.makeTimeSeries()

    self.seriesd = tm.getSeriesData()
    self.tsd = tm.getTimeSeriesData()
    self.frame = DataFrame(self.seriesd)
    self.tsframe = DataFrame(self.tsd)

    key_a = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
    key_b = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']

    self.df = DataFrame({'A': list(key_a),
                         'B': list(key_b),
                         'C': np.random.randn(8),
                         'D': np.random.randn(8)})

    # same keys as ``self.df`` but column D is stored as float32
    self.df_mixed_floats = DataFrame(
        {'A': list(key_a),
         'B': list(key_b),
         'C': np.random.randn(8),
         'D': np.random.randn(8).astype('float32')})

    mi_levels = [['foo', 'bar', 'baz', 'qux'],
                 ['one', 'two', 'three']]
    mi_labels = [[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                 [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]
    index = MultiIndex(levels=mi_levels, labels=mi_labels,
                       names=['first', 'second'])
    self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['A', 'B', 'C'])

    self.three_group = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo',
               'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two',
               'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull',
               'dull', 'shiny', 'shiny', 'dull',
               'shiny', 'shiny', 'shiny'],
         'D': np.random.randn(11),
         'E': np.random.randn(11),
         'F': np.random.randn(11)})
def test_basic(self):
    # Core Series groupby behaviour (iterate, aggregate, transform),
    # repeated for several numeric dtypes.

    def checkit(dtype):
        # nine values forming three equal groups, in shuffled order
        data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

        shuffled = np.arange(9)
        np.random.shuffle(shuffled)
        data = data.reindex(shuffled)
        grouped = data.groupby(lambda label: label // 3)

        for _, chunk in grouped:
            self.assertEqual(len(chunk), 3)

        means = grouped.aggregate(np.mean)
        self.assertEqual(means[1], 1)

        assert_series_equal(means, grouped.agg(np.mean))  # shorthand
        assert_series_equal(means, grouped.mean())
        assert_series_equal(grouped.agg(np.sum), grouped.sum())

        scale_by_sum = lambda x: x * x.sum()
        expected = grouped.apply(scale_by_sum)
        transformed = grouped.transform(scale_by_sum)
        self.assertEqual(transformed[7], 12)
        assert_series_equal(transformed, expected)

        # grouping by the values themselves gives the same means
        value_grouped = data.groupby(data)
        assert_series_equal(value_grouped.aggregate(np.mean), means)

        # complex agg -- only checked for not raising
        grouped.aggregate([np.mean, np.std])
        grouped.aggregate({'one': np.mean,
                           'two': np.std})

        group_constants = {0: 10, 1: 20, 2: 30}
        shifted = grouped.agg(
            lambda x: group_constants[x.name] + x.mean())
        self.assertEqual(shifted[1], 21)

        # corner cases: a non-reducing function must be rejected
        self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)

    for dtype in ['int64', 'int32', 'float64', 'float32']:
        checkit(dtype)
def test_select_bad_cols(self):
    # Selecting columns that do not exist on a groupby must raise KeyError.
    frame = DataFrame([[1, 2]], columns=['A', 'B'])
    g = frame.groupby('A')

    for missing in (['C'], ['A', 'C']):
        # equivalent to g[missing]
        self.assertRaises(KeyError, g.__getitem__, missing)

    with assertRaisesRegexp(KeyError, '^[^A]+$'):
        # A should not be referenced as a bad column...
        # will have to rethink regex if you change message!
        g[['A', 'C']]
def test_first_last_nth(self):
    # tests for first / last / nth
    value_cols = ['B', 'C', 'D']
    grouped = self.df.groupby('A')

    # first row of each group; nth(0) must agree with first()
    expected = self.df.ix[[1, 0], value_cols]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(grouped.first(), expected)
    assert_frame_equal(grouped.nth(0), expected)

    # last row of each group; nth(-1) must agree with last()
    expected = self.df.ix[[5, 7], value_cols]
    expected.index = Index(['bar', 'foo'], name='A')
    assert_frame_equal(grouped.last(), expected)
    assert_frame_equal(grouped.nth(-1), expected)

    # second row of each group
    second = grouped.nth(1)
    expected = self.df.ix[[2, 3], value_cols].copy()
    expected.index = Index(['foo', 'bar'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(second, expected)

    # it works!
    col_b = grouped['B']
    col_b.first()
    col_b.last()
    col_b.nth(0)

    # blank out B for every 'foo' row, then re-check through the
    # groupby object created before the mutation
    self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
    self.assertTrue(com.isnull(grouped['B'].first()['foo']))
    self.assertTrue(com.isnull(grouped['B'].last()['foo']))
    # original author's remark: not sure what this is testing
    self.assertTrue(com.isnull(grouped['B'].nth(0)[0]))

    # v0.14.0 whatsnew
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')
    expected = df.iloc[[1, 2]].set_index('A')

    assert_frame_equal(g.first(), expected)
    assert_frame_equal(g.nth(0, dropna='any'), expected)
def test_first_last_nth_dtypes(self):
    # first / last / nth on a frame mixing float64, float32, bool and
    # int columns.
    df = self.df_mixed_floats.copy()
    df['E'] = True
    df['F'] = 1

    value_cols = ['B', 'C', 'D', 'E', 'F']
    grouped = df.groupby('A')

    def expected_rows(rows):
        # rows are given in (bar, foo) order
        frame = df.ix[rows, value_cols]
        frame.index = Index(['bar', 'foo'], name='A')
        return frame.sort_index()

    # tests for first / last / nth
    first = grouped.first()
    assert_frame_equal(first, expected_rows([1, 0]))

    last = grouped.last()
    assert_frame_equal(last, expected_rows([5, 7]))

    nth = grouped.nth(1)
    assert_frame_equal(nth, expected_rows([3, 2]))

    # GH 2763, first/last shifting dtypes
    idx = lrange(10) + [9]
    s = Series(data=lrange(11), index=idx, name='IntCol')
    self.assertEqual(s.dtype, 'int64')
    f = s.groupby(level=0).first()
    self.assertEqual(f.dtype, 'int64')
def test_nth(self):
    # Positional nth() on frame and series groupbys, with and without
    # dropna, plus out-of-bounds and ordering regressions.
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')

    # in-range positions pick the matching original rows
    for n, rows in [(0, [0, 2]), (1, [1]), (-1, [1, 2]), (-2, [0])]:
        assert_frame_equal(g.nth(n), df.iloc[rows].set_index('A'))

    # positions no group can satisfy give an empty frame
    for n in (2, -3):
        assert_frame_equal(g.nth(n), df.loc[[], ['B']])

    assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
    assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
    assert_frame_equal(g[['B']].nth(0),
                       df.ix[[0, 2], ['A', 'B']].set_index('A'))

    exp = df.set_index('A')
    assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])

    exp['B'] = np.nan
    assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])

    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame({'color': {0: 'green', 1: 'green', 2: 'red', 3: 'red', 4: 'red'},
                    'food': {0: 'ham', 1: 'eggs', 2: 'eggs', 3: 'ham', 4: 'pork'},
                    'two': {0: 1.5456590000000001, 1: -0.070345000000000005, 2: -2.4004539999999999, 3: 0.46206000000000003, 4: 0.52350799999999997},
                    'one': {0: 0.56573799999999996, 1: -0.9742360000000001, 2: 1.033801, 3: -0.78543499999999999, 4: 0.70422799999999997}}).set_index(['color', 'food'])

    by_color = df.groupby(level=0)
    assert_frame_equal(by_color.nth(2), df.iloc[[-1]])

    by_color = df.groupby(level=0)
    assert_frame_equal(by_color.nth(3), df.loc[[]])

    # GH 7559
    # from the vbench
    df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
    s = df[1]
    g = df[0]
    expected = s.groupby(g).first()
    expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
    assert_series_equal(expected2, expected)

    # validate first
    v = s[g == 1].iloc[0]
    self.assertEqual(expected.iloc[0], v)
    self.assertEqual(expected2.iloc[0], v)

    # this is NOT the same as .first (as sorted is default!)
    # as it keeps the order in the series (and not the group order)
    # related GH 7287
    expected = s.groupby(g, sort=False).first()
    expected.index = range(1, 10)
    result = s.groupby(g).nth(0, dropna='all')
    assert_series_equal(result, expected)

    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')
    assert_series_equal(g.B.nth(0, dropna=True), g.B.first())
def test_grouper_index_types(self):
    # related GH5375
    # groupby misbehaving when using a Floatlike index
    df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
    identity = lambda x: x

    index_makers = [tm.makeFloatIndex, tm.makeStringIndex,
                    tm.makeUnicodeIndex, tm.makeIntIndex,
                    tm.makeDateIndex, tm.makePeriodIndex]

    for make_index in index_makers:
        # smoke test with the index as generated ...
        df.index = make_index(len(df))
        df.groupby(list('abcde')).apply(identity)

        # ... and again with the same labels in reverse order
        df.index = list(reversed(df.index.tolist()))
        df.groupby(list('abcde')).apply(identity)
def test_grouper_iter(self):
    # Iterating the grouper object yields the group keys.
    grouper = self.df.groupby('A').grouper
    keys = sorted(grouper)
    self.assertEqual(keys, ['bar', 'foo'])
def test_empty_groups(self):
    # GH # 1048
    # grouping by an empty list of keys is rejected
    no_keys = []
    self.assertRaises(ValueError, self.df.groupby, no_keys)
def test_groupby_grouper(self):
    # A grouper taken from one groupby can be used as the key of another.
    by_a = self.df.groupby('A')
    regrouped_mean = self.df.groupby(by_a.grouper).mean()
    assert_frame_equal(regrouped_mean, by_a.mean())
def test_groupby_dict_mapping(self):
    # GH #679
    # grouping by a dict maps index labels to group keys
    from pandas import Series

    single = Series({'T1': 5})
    via_dict = single.groupby({'T1': 'T2'}).agg(sum)
    via_list = single.groupby(['T2']).agg(sum)
    assert_series_equal(via_dict, via_list)

    s = Series([1., 2., 3., 4.], index=list('abcd'))
    mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
    keys = [0, 0, 1, 1]

    result = s.groupby(mapping).mean()
    result2 = s.groupby(mapping).agg(np.mean)
    expected = s.groupby(keys).mean()
    expected2 = s.groupby(keys).mean()

    assert_series_equal(result, expected)
    assert_series_equal(result, result2)
    assert_series_equal(result, expected2)
def test_groupby_bounds_check(self):
    import pandas as pd
    # groupby_X is code-generated, so if one variant
    # does, the rest probably do too
    short = np.array([1, 2], dtype='object')
    longer = np.array([1, 2, 3], dtype='object')

    # mismatched lengths must trip the internal assertion
    self.assertRaises(AssertionError, pd.algos.groupby_object,
                      short, longer)
def test_groupby_grouper_f_sanity_checked(self):
    import pandas as pd
    month_starts = date_range('01-Jan-2013', periods=12, freq='MS')
    ts = pd.TimeSeries(np.random.randn(12), index=month_starts)

    # GH3035
    # index.map is used to apply grouper to the index
    # if it fails on the elements, map tries it on the entire index as
    # a sequence. That can yield invalid results that cause trouble
    # down the line.
    # the surprise comes from using key[0:6] rather than str(key)[0:6]
    # when the elements are Timestamp.
    # the result is Index[0:6], very confusing.
    slice_key = lambda key: key[0:6]
    self.assertRaises(AssertionError, ts.groupby, slice_key)
def test_groupby_nonobject_dtype(self):
    # Grouping by an integer label array must match grouping by the
    # same labels cast to object.
    key = self.mframe.index.labels[0]
    by_int = self.mframe.groupby(key).sum()
    by_obj = self.mframe.groupby(key.astype('O')).sum()
    assert_frame_equal(by_int, by_obj)

    # GH 3911, mixed frame non-conversion
    df = self.df_mixed_floats.copy()
    df['value'] = lrange(len(df))

    def max_value(group):
        # row of the group holding the largest 'value'
        return group.ix[group['value'].idxmax()]

    applied = df.groupby('A').apply(max_value)

    result = applied.get_dtype_counts()
    result.sort()
    expected = Series({'object': 2, 'float64': 2, 'int64': 1})
    expected.sort()
    assert_series_equal(result, expected)
def test_groupby_return_type(self):
    # Return-type consistency of groupby.apply / count.

    def demean(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    # GH2893, return a reduced type
    df1 = DataFrame([{"val1": 1, "val2": 20}, {"val1": 1, "val2": 19},
                     {"val1": 2, "val2": 27}, {"val1": 2, "val2": 12}])
    result = df1.groupby("val1", squeeze=True).apply(demean)
    tm.assert_isinstance(result, Series)

    # same check when every row falls in a single group
    df2 = DataFrame([{"val1": 1, "val2": 20}, {"val1": 1, "val2": 19},
                     {"val1": 1, "val2": 27}, {"val1": 1, "val2": 12}])
    result = df2.groupby("val1", squeeze=True).apply(demean)
    tm.assert_isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
    result = df.groupby('X', squeeze=False).count()
    tm.assert_isinstance(result, DataFrame)

    # GH5592
    # inconsistent return type
    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', 'Pony', 'Pony'],
                        B=Series(np.arange(7), dtype='int64'),
                        C=date_range('20130101', periods=7)))

    def first_row(grp):
        return grp.iloc[0]

    def first_row_unless(skipped):
        # build an applier returning None for the group named ``skipped``
        def pick(grp):
            if grp.name == skipped:
                return None
            return grp.iloc[0]
        return pick

    expected = df.groupby('A').first()[['B']]
    result = df.groupby('A').apply(first_row)[['B']]
    assert_frame_equal(result, expected)

    for skipped in ('Tiger', 'Pony'):
        result = df.groupby('A').apply(first_row_unless(skipped))[['B']]
        e = expected.copy()
        e.loc[skipped] = np.nan
        assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    result = df.groupby('A').apply(first_row_unless('Pony'))[['C']]
    e = df.groupby('A').first()[['C']]
    e.loc['Pony'] = np.nan
    assert_frame_equal(result, e)

    # scalar outputs
    def first_c_unless_pony(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0].loc['C']

    result = df.groupby('A').apply(first_c_unless_pony)
    e = df.groupby('A').first()['C'].copy()
    e.loc['Pony'] = np.nan
    e.name = None
    assert_series_equal(result, e)
def test_agg_api(self):
    # GH 6337
    # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    # different api for agg when passed custom function with mixed frame
    frame = DataFrame({'data1': np.random.randn(5),
                       'data2': np.random.randn(5),
                       'key1': ['a', 'a', 'b', 'b', 'a'],
                       'key2': ['one', 'two', 'one', 'two', 'one']})
    by_key1 = frame.groupby('key1')

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    # the list form is the reference; its columns are flattened to
    # the plain data column names before comparing
    expected = by_key1.agg([peak_to_peak])
    expected.columns = ['data1', 'data2']

    result = by_key1.agg(peak_to_peak)
    assert_frame_equal(result, expected)
def test_agg_regression1(self):
    # agg(np.mean) must match .mean() when grouping by (year, month).
    year_of = lambda stamp: stamp.year
    month_of = lambda stamp: stamp.month
    grouped = self.tsframe.groupby([year_of, month_of])

    assert_frame_equal(grouped.agg(np.mean), grouped.mean())
def test_agg_datetimes_mixed(self):
    # Grouping on a date column holding a missing value must produce
    # the same number of groups whether the dates are stored as strings
    # or as ``datetime.date`` objects.
    data = [[1, '2012-01-01', 1.0],
            [2, '2012-01-02', 2.0],
            [3, None, 3.0]]

    df1 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    # same rows, with the date strings parsed into date objects
    data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date()
             if row[1] else None, row[2]] for row in data]

    df2 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    df1['weights'] = df1['value'] / df1['value'].sum()
    gb1 = df1.groupby('date').aggregate(np.sum)

    # the original derived these weights from df1; the 'value' columns
    # are identical, so using df2 here only makes the intent explicit
    df2['weights'] = df2['value'] / df2['value'].sum()
    gb2 = df2.groupby('date').aggregate(np.sum)

    # assertEqual (not a bare assert) so the check survives ``python -O``
    # and reports both lengths on failure
    self.assertEqual(len(gb1), len(gb2))
def test_agg_period_index(self):
    # Aggregating on a PeriodIndex level keeps a PeriodIndex result.
    from pandas import period_range, PeriodIndex

    months = period_range('2012-1-1', freq='M', periods=3)
    frame = DataFrame(np.random.randn(3, 2), index=months)
    summed = frame.groupby(level=0).sum()
    tm.assert_isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start='1999-01', periods=5, freq='M')
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    frame = DataFrame.from_items([('s1', s1), ('s2', s2)])

    # iterating the groups must simply not raise
    by_month = frame.groupby(frame.index.month)
    list(by_month)
def test_agg_must_agg(self):
    # agg() rejects callables that do not reduce a group to a scalar.
    grouped = self.df.groupby('A')['C']

    non_reducers = [lambda x: x.describe(),
                    lambda x: x.index[:2]]
    for func in non_reducers:
        self.assertRaises(Exception, grouped.agg, func)
def test_agg_ser_multi_key(self):
    # Aggregating a Series grouped by two key Series must match the
    # same column taken from the two-key frame groupby sum.
    ser = self.df.C
    f = lambda x: x.sum()

    # use the ``ser`` local (previously assigned but never read)
    results = ser.groupby([self.df.A, self.df.B]).aggregate(f)
    expected = self.df.groupby(['A', 'B']).sum()['C']
    assert_series_equal(results, expected)
def test_get_group(self):
    # get_group on a Panel groupby, then date-like key handling on
    # frame groupbys.
    wp = tm.makePanel()
    grouped = wp.groupby(lambda x: x.month, axis='major')

    january = [x for x in wp.major_axis if x.month == 1]
    assert_panel_equal(grouped.get_group(1), wp.reindex(major=january))

    # GH 5267
    # be datelike friendly
    df = DataFrame({'DATE': pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013',
                                            '11-Oct-2013', '11-Oct-2013', '11-Oct-2013']),
                    'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
                    'VAL': [1, 2, 3, 4, 5, 6]})

    # single key: native key, datetime and string forms are equivalent
    g = df.groupby('DATE')
    key = list(g.groups)[0]
    result1 = g.get_group(key)
    result2 = g.get_group(Timestamp(key).to_datetime())
    result3 = g.get_group(str(Timestamp(key)))
    assert_frame_equal(result1, result2)
    assert_frame_equal(result1, result3)

    # two keys: same equivalence on the date component of the tuple
    g = df.groupby(['DATE', 'label'])
    key = list(g.groups)[0]
    date_part, label_part = key[0], key[1]
    result1 = g.get_group(key)
    result2 = g.get_group((Timestamp(date_part).to_datetime(), label_part))
    result3 = g.get_group((str(Timestamp(date_part)), label_part))
    assert_frame_equal(result1, result2)
    assert_frame_equal(result1, result3)

    # must pass a same-length tuple with multiple keys
    # NOTE(review): ('foo') is a plain string, not a 1-tuple, so the
    # second call repeats the first -- confirm whether ('foo',) was meant
    self.assertRaises(ValueError, lambda: g.get_group('foo'))
    self.assertRaises(ValueError, lambda: g.get_group(('foo')))
    self.assertRaises(ValueError, lambda: g.get_group(('foo', 'bar', 'baz')))
def test_agg_apply_corner(self):
    # nothing to group, all NA
    all_nan_keys = self.ts * np.nan
    grouped = self.ts.groupby(all_nan_keys)
    assert_series_equal(grouped.sum(), Series([]))
    assert_series_equal(grouped.agg(np.sum), Series([]))
    assert_series_equal(grouped.apply(np.sum), Series([]))

    # DataFrame
    all_nan_keys = self.tsframe['A'] * np.nan
    grouped = self.tsframe.groupby(all_nan_keys)
    exp_df = DataFrame(columns=self.tsframe.columns, dtype=float)
    assert_frame_equal(grouped.sum(), exp_df, check_names=False)
    assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
    assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float))
def test_agg_grouping_is_list_tuple(self):
    # agg must work when the Grouping's grouper is a plain list or tuple.
    from pandas.core.groupby import Grouping

    df = tm.makeTimeDataFrame()
    grouped = df.groupby(lambda x: x.year)

    # captured once, before the grouping is swapped out below
    grouper = grouped.grouper.groupings[0].grouper

    for container in (list, tuple):
        grouped.grouper.groupings[0] = Grouping(self.ts.index,
                                                container(grouper))
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
def test_agg_python_multiindex(self):
    # agg(np.mean) must match .mean() for a two-column key on mframe.
    by_ab = self.mframe.groupby(['A', 'B'])
    tm.assert_frame_equal(by_ab.agg(np.mean), by_ab.mean())
def test_apply_describe_bug(self):
    # Smoke test: describe() on a level-based groupby must not raise.
    by_first = self.mframe.groupby(level='first')
    by_first.describe()  # it works!
def test_apply_issues(self):
    # GH 5788
    rows = ['2011.05.16,00:00,1.40893',
            '2011.05.16,01:00,1.40760',
            '2011.05.16,02:00,1.40750',
            '2011.05.16,03:00,1.40649',
            '2011.05.17,02:00,1.40893',
            '2011.05.17,03:00,1.40760',
            '2011.05.17,04:00,1.40750',
            '2011.05.17,05:00,1.40649',
            '2011.05.18,02:00,1.40893',
            '2011.05.18,03:00,1.40760',
            '2011.05.18,04:00,1.40750',
            '2011.05.18,05:00,1.40649']
    s = '\n'.join(rows)
    names = ['date', 'time', 'value']

    df = pd.read_csv(StringIO(s), header=None, names=names,
                     parse_dates=[['date', 'time']])
    df = df.set_index('date_time')

    # apply(idxmax) must agree with the direct idxmax per calendar day
    expected = df.groupby(df.index.date).idxmax()
    result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
    assert_frame_equal(result, expected)

    # GH 5789
    # don't auto coerce dates
    df = pd.read_csv(StringIO(s), header=None, names=names)
    expected = Series(['00:00', '02:00', '02:00'],
                      index=['2011.05.16', '2011.05.17', '2011.05.18'])
    result = df.groupby('date').apply(
        lambda x: x['time'][x['value'].idxmax()])
    assert_series_equal(result, expected)
def test_len(self):
    # len() of a groupby equals the number of distinct keys.
    df = tm.makeTimeDataFrame()

    year_of = lambda x: x.year
    month_of = lambda x: x.month
    day_of = lambda x: x.day

    # (year, month, day) grouping: one group per row
    by_day = df.groupby([year_of, month_of, day_of])
    self.assertEqual(len(by_day), len(df))

    by_month = df.groupby([year_of, month_of])
    expected = len(set((x.year, x.month) for x in df.index))
    self.assertEqual(len(by_month), expected)
def test_groups(self):
    # .groups is cached and maps each key to labels of matching rows.
    by_a = self.df.groupby(['A'])
    groups = by_a.groups
    self.assertIs(groups, by_a.groups)  # caching works

    for key, labels in compat.iteritems(by_a.groups):
        rows = self.df.ix[labels]
        self.assertTrue((rows['A'] == key).all())

    by_ab = self.df.groupby(['A', 'B'])
    groups = by_ab.groups
    self.assertIs(groups, by_ab.groups)  # caching works

    for key, labels in compat.iteritems(by_ab.groups):
        self.assertTrue((self.df.ix[labels]['A'] == key[0]).all())
        self.assertTrue((self.df.ix[labels]['B'] == key[1]).all())
def test_aggregate_str_func(self):
    # Aggregation functions named by string must match the method calls.

    def _check_results(grouped):
        # single series
        assert_series_equal(grouped['A'].agg('std'), grouped['A'].std())

        # group frame by function name
        assert_frame_equal(grouped.aggregate('var'), grouped.var())

        # group frame by function dict
        spec = OrderedDict([('A', 'var'),
                            ('B', 'std'),
                            ('C', 'mean'),
                            ('D', 'sem')])
        result = grouped.agg(spec)
        expected = DataFrame(OrderedDict([('A', grouped['A'].var()),
                                          ('B', grouped['B'].std()),
                                          ('C', grouped['C'].mean()),
                                          ('D', grouped['D'].sem())]))
        assert_frame_equal(result, expected)

    weekday_of = lambda x: x.weekday()

    by_weekday = self.tsframe.groupby(weekday_of)
    _check_results(by_weekday)

    by_mwkday = self.tsframe.groupby([lambda x: x.month, weekday_of])
    _check_results(by_mwkday)
def test_aggregate_item_by_item(self):
    # Column-by-column aggregation with a size-returning callable.
    # NOTE(review): ``df`` (the copy with the extra 'E' column) is never
    # used -- the groupby below is built from self.df. Confirm whether
    # ``df.groupby('A')`` was intended.
    df = self.df.copy()
    df['E'] = ['a'] * len(self.df)
    grouped = self.df.groupby('A')

    # API change in 0.11
    # def aggfun(ser):
    #     return len(ser + 'a')
    # result = grouped.agg(aggfun)
    # self.assertEqual(len(result.columns), 1)

    result = grouped.agg(lambda ser: ser.size)
    n_foo = (self.df.A == 'foo').sum()
    n_bar = (self.df.A == 'bar').sum()
    n_cols = len(result.columns)

    # GH5782
    # odd comparisons can result here, so cast to make easy
    assert_almost_equal(result.xs('foo'),
                        np.array([n_foo] * n_cols).astype('float64'))
    assert_almost_equal(result.xs('bar'),
                        np.array([n_bar] * n_cols).astype('float64'))

    def aggfun(ser):
        return ser.size

    # an empty frame still aggregates to an (empty) DataFrame
    result = DataFrame().groupby(self.df.A).agg(aggfun)
    tm.assert_isinstance(result, DataFrame)
    self.assertEqual(len(result), 0)
def test_agg_item_by_item_raise_typeerror(self):
    # A TypeError raised inside the aggregating function must reach
    # the caller.
    from numpy.random import randint

    frame = DataFrame(randint(10, size=(20, 10)))

    def raiseException(df):
        com.pprint_thing('----------------------------------------')
        com.pprint_thing(df.to_string())
        raise TypeError

    by_first_col = frame.groupby(0)
    self.assertRaises(TypeError, by_first_col.agg, raiseException)
def test_basic_regression(self):
    # regression
    # smoke test: the grouping Series is longer than the data and
    # holds float keys; mean() must simply not raise
    T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
    values = Series(T, lrange(0, len(T)))

    raw_keys = np.random.random((1100,))
    keys = Series(raw_keys, lrange(0, len(raw_keys))) * 10.

    values.groupby(keys).mean()
def test_transform(self):
    # transform() keeps the input's labels; spot-check one element.
    data = Series(np.arange(9) // 3, index=np.arange(9))

    shuffled = np.arange(9)
    np.random.shuffle(shuffled)
    data = data.reindex(shuffled)

    grouped = data.groupby(lambda label: label // 3)
    transformed = grouped.transform(lambda x: x * x.sum())

    # label 7 is in group 2: value 2 times the group sum 6
    self.assertEqual(transformed[7], 12)
def test_transform_broadcast(self):
    # transform(np.mean) broadcasts each group's mean back onto the
    # original labels, for a series, a frame, and along columns.
    month_of = lambda x: x.month

    # series
    grouped = self.ts.groupby(month_of)
    result = grouped.transform(np.mean)

    self.assertTrue(result.index.equals(self.ts.index))
    for _, piece in grouped:
        assert_fp_equal(result.reindex(piece.index), piece.mean())

    # frame, grouped by rows
    grouped = self.tsframe.groupby(month_of)
    result = grouped.transform(np.mean)
    self.assertTrue(result.index.equals(self.tsframe.index))
    for _, piece in grouped:
        agged = piece.mean()
        res = result.reindex(piece.index)
        for col in self.tsframe:
            assert_fp_equal(res[col], agged[col])

    # group columns
    column_groups = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    grouped = self.tsframe.groupby(column_groups, axis=1)
    result = grouped.transform(np.mean)
    self.assertTrue(result.index.equals(self.tsframe.index))
    self.assertTrue(result.columns.equals(self.tsframe.columns))
    for _, piece in grouped:
        agged = piece.mean(1)
        res = result.reindex(columns=piece.columns)
        for idx in piece.index:
            assert_fp_equal(res.xs(idx), agged[idx])
def test_transform_bug(self):
    # GH 5712
    # transforming on a datetime column
    frame = DataFrame({'A': Timestamp('20130101'), 'B': np.arange(5)})

    descending_rank = lambda x: x.rank(ascending=False)
    result = frame.groupby('A')['B'].transform(descending_rank)

    expected = Series(np.arange(5, 0, step=-1), name='B')
    assert_series_equal(result, expected)
def test_transform_multiple(self):
    # Smoke test: transform with two grouping functions must not raise.
    keys = [lambda x: x.year, lambda x: x.month]
    grouped = self.ts.groupby(keys)

    grouped.transform(lambda x: x * 2)
    grouped.transform(np.mean)
def test_dispatch_transform(self):
    # fillna called on the groupby must match an explicit transform.
    month_of = lambda x: x.month

    # keep every fifth row, reindexed back to the full index
    sparse = self.tsframe[::5].reindex(self.tsframe.index)

    filled = sparse.groupby(month_of).fillna(method='pad')

    pad = lambda x: x.fillna(method='pad')
    expected = sparse.groupby(month_of).transform(pad)
    assert_frame_equal(filled, expected)
def test_transform_select_columns(self):
    # Selecting columns on the groupby before transform must match
    # selecting them on the frame first.
    mean_of = lambda x: x.mean()

    result = self.df.groupby('A')['C', 'D'].transform(mean_of)

    subset = self.df[['C', 'D']]
    expected = subset.groupby(self.df['A']).transform(mean_of)

    assert_frame_equal(result, expected)
def test_transform_exclude_nuisance(self):
    # The frame-level transform must equal the per-column transforms
    # of C and D only.
    grouped = self.df.groupby('A')

    per_column = {}
    for col in ('C', 'D'):
        per_column[col] = grouped[col].transform(np.mean)
    expected = DataFrame(per_column)

    result = self.df.groupby('A').transform(np.mean)
    assert_frame_equal(result, expected)
def test_transform_function_aliases(self):
    # The string alias 'mean' must behave like np.mean in transform().
    frame_gb = self.df.groupby('A')
    assert_frame_equal(frame_gb.transform('mean'),
                       self.df.groupby('A').transform(np.mean))

    series_gb = self.df.groupby('A')['C']
    assert_series_equal(series_gb.transform('mean'),
                        self.df.groupby('A')['C'].transform(np.mean))
def test_with_na(self):
    # Rows whose group label is NaN are left out of the aggregation.
    index = Index(np.arange(10))
    labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
                     'bar', nan, 'foo'], index=index)

    # explicitly return a float from my function
    def f(x):
        return float(len(x))

    for dtype in ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']:
        values = Series(np.ones(10), index, dtype=dtype)
        grouped = values.groupby(labels)

        # this SHOULD be an int
        agged = grouped.agg(len)
        expected = Series([4, 2], index=['bar', 'foo'])
        assert_series_equal(agged, expected, check_dtype=False)
        #self.assertTrue(issubclass(agged.dtype.type, np.integer))

        agged = grouped.agg(f)
        expected = Series([4, 2], index=['bar', 'foo'])
        assert_series_equal(agged, expected, check_dtype=False)
        self.assertTrue(issubclass(agged.dtype.type,
                                   np.dtype(dtype).type))
def test_groupby_transform_with_int(self):
    # GH 3740, make sure that we might upcast on item-by-item transform
    keys = [1, 1, 1, 2, 2, 2]
    zscore = lambda x: (x - x.mean()) / x.std()

    # floats
    df = DataFrame({'A': keys,
                    'B': Series(1, dtype='float64'),
                    'C': Series([1, 2, 3, 1, 2, 3], dtype='float64'),
                    'D': 'foo'})
    result = df.groupby('A').transform(zscore)
    expected = DataFrame({'B': np.nan,
                          'C': Series([-1, 0, 1, -1, 0, 1],
                                      dtype='float64')})
    assert_frame_equal(result, expected)

    # int case
    df = DataFrame({'A': keys, 'B': 1, 'C': [1, 2, 3, 1, 2, 3],
                    'D': 'foo'})
    result = df.groupby('A').transform(zscore)
    expected = DataFrame({'B': np.nan, 'C': [-1, 0, 1, -1, 0, 1]})
    assert_frame_equal(result, expected)

    # int that needs float conversion
    s = Series([2, 3, 4, 10, 5, -1])
    df = DataFrame({'A': keys, 'B': 1, 'C': s, 'D': 'foo'})
    result = df.groupby('A').transform(zscore)

    # expected C: each half standardised on its own
    s1 = s.iloc[0:3]
    s1 = (s1 - s1.mean()) / s1.std()
    s2 = s.iloc[3:6]
    s2 = (s2 - s2.mean()) / s2.std()
    expected = DataFrame({'B': np.nan, 'C': concat([s1, s2])})
    assert_frame_equal(result, expected)

    # int downcasting
    result = df.groupby('A').transform(lambda x: x * 2 / 2)
    expected = DataFrame({'B': 1, 'C': [2, 3, 4, 10, 5, -1]})
    assert_frame_equal(result, expected)
def test_indices_concatenation_order(self):
    # GH 2808

    def odd_b_squared(x):
        # rows whose 'b' is odd, with every value squared
        return x[(x.b % 2) == 1] ** 2

    def empty_indexed_frame(names, columns):
        # empty frame carrying an empty two-level MultiIndex
        multiindex = MultiIndex(levels=[[]] * 2,
                                labels=[[]] * 2,
                                names=names)
        return DataFrame(None, columns=columns, index=multiindex)

    def f1(x):
        # empty result keeps the same index levels as the non-empty one
        y = odd_b_squared(x)
        if y.empty:
            return empty_indexed_frame(['b', 'c'], ['a'])
        return y.set_index(['b', 'c'])

    def f2(x):
        # empty result is a bare frame, unlike the non-empty one
        y = odd_b_squared(x)
        if y.empty:
            return DataFrame()
        return y.set_index(['b', 'c'])

    def f3(x):
        # empty result is MultiIndex-ed but the non-empty one is not
        y = odd_b_squared(x)
        if y.empty:
            return empty_indexed_frame(['foo', 'bar'], ['a', 'b'])
        return y

    df = DataFrame({'a': [1, 2, 2, 2],
                    'b': lrange(4),
                    'c': lrange(5, 9)})

    df2 = DataFrame({'a': [3, 2, 2, 2],
                     'b': lrange(4),
                     'c': lrange(5, 9)})

    # correct result
    result1 = df.groupby('a').apply(f1)
    result2 = df2.groupby('a').apply(f1)
    assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    self.assertRaises(AssertionError, df.groupby('a').apply, f2)
    self.assertRaises(AssertionError, df2.groupby('a').apply, f2)

    # should fail (incorrect shape)
    self.assertRaises(AssertionError, df.groupby('a').apply, f3)
    self.assertRaises(AssertionError, df2.groupby('a').apply, f3)
def test_attr_wrapper(self):
    # Methods and attributes of the grouped object are forwarded to
    # each group.
    grouped = self.ts.groupby(lambda x: x.weekday())

    sample_std = lambda x: np.std(x, ddof=1)
    assert_series_equal(grouped.std(), grouped.agg(sample_std))

    # this is pretty cool
    result = grouped.describe()
    per_group = {}
    for name, piece in grouped:
        per_group[name] = piece.describe()
    expected = DataFrame(per_group).T
    assert_frame_equal(result.unstack(), expected)

    # get attribute
    # NOTE(review): these two values are computed but never compared;
    # as written this only checks that neither access raises
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)

    # make sure raises error
    self.assertRaises(AttributeError, getattr, grouped, 'foo')
def test_series_describe_multikey(self):
    # describe() on a two-key groupby must agree with the individual
    # statistics.
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    described = grouped.describe().unstack()

    for stat in ('mean', 'std', 'min'):
        assert_series_equal(described[stat], getattr(grouped, stat)())
def test_series_describe_single(self):
    # For a single (month) key, apply(describe) and describe() must agree.
    series = tm.makeTimeSeries()
    by_month = series.groupby(lambda d: d.month)
    via_apply = by_month.apply(lambda grp: grp.describe())
    direct = by_month.describe()
    assert_series_equal(via_apply, direct)
def test_series_agg_multikey(self):
    # agg(np.sum) on a (year, month) groupby must equal the built-in sum().
    series = tm.makeTimeSeries()
    by_year_month = series.groupby([lambda d: d.year, lambda d: d.month])
    via_agg = by_year_month.agg(np.sum)
    builtin = by_year_month.sum()
    assert_series_equal(via_agg, builtin)
def test_series_agg_multi_pure_python(self):
    # Aggregating with a Python function that inspects ``x.base`` must give
    # the same frame as aggregating with a function returning the constant.
    key_a = ['foo', 'foo', 'foo', 'foo',
             'bar', 'bar', 'bar', 'bar',
             'foo', 'foo', 'foo']
    key_b = ['one', 'one', 'one', 'two',
             'one', 'one', 'one', 'two',
             'two', 'two', 'one']
    key_c = ['dull', 'dull', 'shiny', 'dull',
             'dull', 'shiny', 'shiny', 'dull',
             'shiny', 'shiny', 'shiny']
    n_rows = len(key_a)
    data = DataFrame({'A': key_a,
                      'B': key_b,
                      'C': key_c,
                      'D': np.random.randn(n_rows),
                      'E': np.random.randn(n_rows),
                      'F': np.random.randn(n_rows)})

    def bad(x):
        # fails if the object handed to the aggregator has an empty ``base``
        assert len(x.base) > 0
        return 'foo'

    checked = data.groupby(['A', 'B']).agg(bad)
    constant = data.groupby(['A', 'B']).agg(lambda x: 'foo')
    assert_frame_equal(checked, constant)
def test_series_index_name(self):
    # Grouping a one-column frame by the Series self.df['A'] must name the
    # aggregated index after that Series.
    only_c = self.df.ix[:, ['C']]
    means = only_c.groupby(self.df['A']).agg(lambda col: col.mean())
    self.assertEqual(means.index.name, 'A')
def test_frame_describe_multikey(self):
    # Row-wise: describe() on a (year, month) groupby of self.tsframe must
    # match describe() on each selected column.
    by_year_month = self.tsframe.groupby([lambda d: d.year,
                                          lambda d: d.month])
    described = by_year_month.describe()
    for column in self.tsframe:
        column_summary = by_year_month[column].describe()
        assert_series_equal(described[column], column_summary)

    # Column-wise: group the columns into two buckets and compare each
    # bucket's slice of the result with describe() on that group.
    bucket_of = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    by_bucket = self.tsframe.groupby(bucket_of, axis=1)
    described = by_bucket.describe()
    for bucket, block in by_bucket:
        assert_frame_equal(described[bucket], block.describe())
def test_frame_groupby(self):
    # End-to-end pass over aggregate / transform / iteration / groups on
    # the self.tsframe fixture grouped by weekday.
    def weekday_of(stamp):
        return stamp.weekday()

    by_weekday = self.tsframe.groupby(weekday_of)

    # aggregate
    means = by_weekday.aggregate(np.mean)
    self.assertEqual(len(means), 5)
    self.assertEqual(len(means.columns), 4)

    # by string: same grouping expressed as a column of the frame
    with_key = self.tsframe.copy()
    with_key['weekday'] = [stamp.weekday() for stamp in with_key.index]
    means_by_column = with_key.groupby('weekday').aggregate(np.mean)
    assert_frame_equal(means_by_column, means, check_names=False)

    # transform
    by_weekday = self.tsframe.head(30).groupby(weekday_of)
    demeaned = by_weekday.transform(lambda col: col - col.mean())
    self.assertEqual(len(demeaned), 30)
    self.assertEqual(len(demeaned.columns), 4)

    # transform propagate: every row must carry its group's mean
    broadcast = by_weekday.transform(lambda col: col.mean())
    for _, members in by_weekday:
        group_mean = members.mean()
        for label in members.index:
            assert_almost_equal(broadcast.xs(label), group_mean)

    # iterate
    for weekday, members in by_weekday:
        self.assertEqual(members.index[0].weekday(), weekday)

    # groups / group_indices: positions in ``indices`` must select the
    # labels stored under the same key in ``groups``
    label_map = by_weekday.groups
    position_map = by_weekday.indices
    for key, labels in compat.iteritems(label_map):
        taken = self.tsframe.index.take(position_map[key])
        self.assertTrue((taken == labels).all())
def test_grouping_is_iterable(self):
    # this code path isn't used anywhere else
    # not sure it's useful
    by_weekday_year = self.tsframe.groupby([lambda d: d.weekday(),
                                            lambda d: d.year])
    first_grouping = by_weekday_year.grouper.groupings[0]

    # smoke test: walking the first grouping must not raise
    for _ in first_grouping:
        pass
def test_frame_groupby_columns(self):
    # Grouping the columns of self.tsframe (axis=1) through a dict mapping.
    bucket_of = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    by_bucket = self.tsframe.groupby(bucket_of, axis=1)

    # aggregate: rows are kept, columns collapse to the two buckets
    means = by_bucket.aggregate(np.mean)
    self.assertEqual(len(means), len(self.tsframe))
    self.assertEqual(len(means.columns), 2)

    # transform: must match the row-wise transform of the transposed frame
    def demean(values):
        return values - values.mean()

    by_bucket_rows = self.tsframe.T.groupby(bucket_of, axis=0)
    assert_frame_equal(by_bucket_rows.transform(demean).T,
                       by_bucket.transform(demean))

    # iterate
    for _, block in by_bucket:
        self.assertEqual(len(block.columns), 2)
def test_frame_set_name_single(self):
    # The result index of a groupby on column 'A' must be named 'A' for
    # every aggregation form below, except when as_index=False is used.
    by_a = self.df.groupby('A')

    self.assertEqual(by_a.mean().index.name, 'A')

    flat = self.df.groupby('A', as_index=False).mean()
    self.assertNotEqual(flat.index.name, 'A')

    # evaluated lazily, in order, so each form builds its own result
    aggregations = [
        lambda: by_a.agg(np.mean),
        lambda: by_a.agg({'C': np.mean, 'D': np.std}),
        lambda: by_a['C'].mean(),
        lambda: by_a['C'].agg(np.mean),
        lambda: by_a['C'].agg([np.mean, np.std]),
        lambda: by_a['C'].agg({'foo': np.mean, 'bar': np.std}),
    ]
    for aggregate in aggregations:
        self.assertEqual(aggregate().index.name, 'A')
def test_multi_iter(self):
    # Iterating a Series grouped by two key arrays must yield
    # ((key1, key2), sub-series) pairs matching ``expected`` one for one.
    s = Series(np.arange(6))
    k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
    k2 = np.array(['1', '2', '1', '2', '1', '2'])

    grouped = s.groupby([k1, k2])

    iterated = list(grouped)
    expected = [('a', '1', s[[0, 2]]),
                ('a', '2', s[[1]]),
                ('b', '1', s[[4]]),
                ('b', '2', s[[3, 5]])]

    # The loop below is driven by ``iterated`` alone, so without this check
    # a groupby that yields too few (or zero) groups would pass vacuously.
    self.assertEqual(len(iterated), len(expected))

    for ((one, two), three), (e1, e2, e3) in zip(iterated, expected):
        self.assertEqual(e1, one)
        self.assertEqual(e2, two)
        assert_series_equal(three, e3)
def test_multi_iter_frame(self):
    # Iterating a DataFrame grouped by two key columns must yield
    # ((key1, key2), sub-frame) pairs in sorted key order, must not yield
    # key combinations that have no rows, and must work along axis=1.
    k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
    k2 = np.array(['1', '2', '1', '2', '1', '2'])
    df = DataFrame({'v1': np.random.randn(6),
                    'v2': np.random.randn(6),
                    'k1': k1, 'k2': k2},
                   index=['one', 'two', 'three', 'four', 'five', 'six'])

    grouped = df.groupby(['k1', 'k2'])

    # things get sorted!
    iterated = list(grouped)
    idx = df.index
    expected = [('a', '1', df.ix[idx[[4]]]),
                ('a', '2', df.ix[idx[[3, 5]]]),
                ('b', '1', df.ix[idx[[0, 2]]]),
                ('b', '2', df.ix[idx[[1]]])]

    # The loop below is driven by ``iterated`` alone, so without this check
    # a groupby that yields too few (or zero) groups would pass vacuously.
    self.assertEqual(len(iterated), len(expected))

    for ((one, two), three), (e1, e2, e3) in zip(iterated, expected):
        self.assertEqual(e1, one)
        self.assertEqual(e2, two)
        assert_frame_equal(three, e3)

    # don't iterate through groups with no data: only ('b', '1') and
    # ('a', '2') occur after the keys are overwritten
    df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
    df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
    grouped = df.groupby(['k1', 'k2'])
    groups = {}
    for key, gp in grouped:
        groups[key] = gp
    self.assertEqual(len(groups), 2)

    # axis = 1: smoke test, iteration over column levels must not raise
    three_levels = self.three_group.groupby(['A', 'B', 'C']).mean()
    grouped = three_levels.T.groupby(axis=1, level=(1, 2))
    for key, group in grouped:
        pass
def test_multi_iter_panel(self):
    # Each (month, weekday) group of a Panel grouped along axis=1 must equal
    # the panel reindexed to the major-axis labels with that month/weekday.
    panel = tm.makePanel()
    by_month_weekday = panel.groupby([lambda d: d.month,
                                      lambda d: d.weekday()],
                                     axis=1)

    for (month, weekday), chunk in by_month_weekday:
        wanted = []
        for stamp in panel.major_axis:
            if stamp.month == month and stamp.weekday() == weekday:
                wanted.append(stamp)
        assert_panel_equal(chunk, panel.reindex(major=wanted))
def test_multi_func(self):
    # Grouping by two lookup functions (Series.get of columns A and B) must
    # give the same C/D means as grouping by the column names.
    lookup_a = self.df['A'].get
    lookup_b = self.df['B'].get
    via_functions = self.df.groupby([lookup_a, lookup_b]).mean()
    via_names = self.df.groupby(['A', 'B']).mean()

    value_columns = ['C', 'D']
    # TODO groupby get drops names
    assert_frame_equal(via_functions.ix[:, value_columns],
                       via_names.ix[:, value_columns],
                       check_names=False)

    # some "groups" with no data
    labels = ['one', 'two', 'three', 'four', 'five', 'six']
    sparse = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                        'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                       index=labels)

    # only verify that it works for now
    sparse.groupby(['k1', 'k2']).agg(np.sum)
def test_multi_key_multiple_functions(self):
    # agg([mean, std]) on column C must equal a frame assembled from the two
    # single-function aggregations under the keys 'mean' and 'std'.
    c_by_ab = self.df.groupby(['A', 'B'])['C']

    combined = c_by_ab.agg([np.mean, np.std])
    one_by_one = DataFrame({'mean': c_by_ab.agg(np.mean),
                            'std': c_by_ab.agg(np.std)})
    assert_frame_equal(combined, one_by_one)
def test_frame_multi_key_function_list(self):
    # agg([mean, std]) on a two-key frame groupby must equal the per-column
    # aggregations of D, E and F concatenated side by side under those keys.
    key_a = ['foo', 'foo', 'foo', 'foo',
             'bar', 'bar', 'bar', 'bar',
             'foo', 'foo', 'foo']
    key_b = ['one', 'one', 'one', 'two',
             'one', 'one', 'one', 'two',
             'two', 'two', 'one']
    key_c = ['dull', 'dull', 'shiny', 'dull',
             'dull', 'shiny', 'shiny', 'dull',
             'shiny', 'shiny', 'shiny']
    n_rows = len(key_a)
    data = DataFrame({'A': key_a,
                      'B': key_b,
                      'C': key_c,
                      'D': np.random.randn(n_rows),
                      'E': np.random.randn(n_rows),
                      'F': np.random.randn(n_rows)})

    grouped = data.groupby(['A', 'B'])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)

    value_columns = ['D', 'E', 'F']
    pieces = [grouped[name].agg(funcs) for name in value_columns]
    expected = concat(pieces, keys=value_columns, axis=1)

    # both sides must be indexed by the (A, B) MultiIndex
    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    assert_frame_equal(agged, expected)
- def test_groupby_multiple_columns(self):
- data = self.df
- grouped = data.groupby(['A', 'B'])
- def _check_op(op):
- result1 = op(grouped)
- expected = defaultdict(dict)
- for n1, gp1 in data.groupby('A'):
- for n2, gp2 in gp1.groupby('B'):
- expected[n1][n2] = op(gp2.ix[:, ['C', 'D']])
- expected = dict((k, DataFrame(v)) for k, v in compat.iteritems(expected))
- expected = Panel.fromDict(expected).swapaxes(0,…
Large files files are truncated, but you can click here to view the full file