/pandas/core/groupby.py
Python | 4441 lines | 4436 code | 5 blank | 0 comment | 14 complexity | 18d0687b836be8d203e1d5948ec00b74 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- import types
- from functools import wraps
- import numpy as np
- import datetime
- import collections
- import warnings
- import copy
- from pandas.compat import(
- zip, range, long, lzip,
- callable, map
- )
- from pandas import compat
- from pandas.compat.numpy import function as nv
- from pandas.compat.numpy import _np_version_under1p8
- from pandas.types.common import (_DATELIKE_DTYPES,
- is_numeric_dtype,
- is_timedelta64_dtype, is_datetime64_dtype,
- is_categorical_dtype,
- is_datetime_or_timedelta_dtype,
- is_bool, is_integer_dtype,
- is_complex_dtype,
- is_bool_dtype,
- is_scalar,
- _ensure_float64,
- _ensure_platform_int,
- _ensure_int64,
- _ensure_object,
- _ensure_float)
- from pandas.types.cast import _possibly_downcast_to_dtype
- from pandas.types.missing import isnull, notnull, _maybe_fill
- from pandas.core.common import _values_from_object, AbstractMethodError
- from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
- DataError, SpecificationError)
- from pandas.core.categorical import Categorical
- from pandas.core.frame import DataFrame
- from pandas.core.generic import NDFrame
- from pandas.core.index import (Index, MultiIndex, CategoricalIndex,
- _ensure_index)
- from pandas.core.internals import BlockManager, make_block
- from pandas.core.series import Series
- from pandas.core.panel import Panel
- from pandas.util.decorators import (cache_readonly, Substitution, Appender,
- make_signature, deprecate_kwarg)
- from pandas.formats.printing import pprint_thing
- from pandas.util.validators import validate_kwargs
- import pandas.core.algorithms as algos
- import pandas.core.common as com
- from pandas.core.config import option_context
- import pandas.lib as lib
- from pandas.lib import Timestamp
- import pandas.tslib as tslib
- import pandas.algos as _algos
- import pandas.hashtable as _hash
# Shared "See also" section appended to groupby method docstrings via the
# Appender/Substitution decorators (``%(name)s`` is filled per method).
_doc_template = """
See also
--------
pandas.Series.%(name)s
pandas.DataFrame.%(name)s
pandas.Panel.%(name)s
"""

# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
_plotting_methods = frozenset(['plot', 'boxplot', 'hist'])

# NDFrame methods that may be dispatched ("whitelisted") through a groupby
# object instead of raising AttributeError
_common_apply_whitelist = frozenset([
    'last', 'first',
    'head', 'tail', 'median',
    'mean', 'sum', 'min', 'max',
    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
    'resample',
    'describe',
    'rank', 'quantile',
    'fillna',
    'mad',
    'any', 'all',
    'take',
    'idxmax', 'idxmin',
    'shift', 'tshift',
    'ffill', 'bfill',
    'pct_change', 'skew',
    'corr', 'cov', 'diff',
]) | _plotting_methods

# Series groupby has no 'boxplot' but additionally exposes 'dtype'/'unique'
_series_apply_whitelist = \
    (_common_apply_whitelist - set(['boxplot'])) | \
    frozenset(['dtype', 'unique'])

# DataFrame groupby additionally exposes 'dtypes'/'corrwith'
_dataframe_apply_whitelist = \
    _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])

# Cython kernels that transform (return an object indexed like the input,
# not one row per group)
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
def _groupby_function(name, alias, npfunc, numeric_only=True,
                      _convert=False):
    """Build a groupby aggregation method (used for sum/prod/min/max/...).

    The generated method tries the Cython kernel ``alias`` first and falls
    back to a Python-level ``aggregate`` using ``npfunc`` when the Cython
    path cannot handle the data.
    """
    _local_template = "Compute %(f)s of group values"

    @Substitution(name='groupby', f=name)
    @Appender(_doc_template)
    @Appender(_local_template)
    def wrapped(self):
        self._set_group_selection()
        try:
            # fast path: Cython aggregation kernel
            return self._cython_agg_general(alias, numeric_only=numeric_only)
        except AssertionError as e:
            raise SpecificationError(str(e))
        except Exception:
            # slow path: apply npfunc group-by-group in Python
            out = self.aggregate(lambda x: npfunc(x, axis=self.axis))
            if _convert:
                out = out._convert(datetime=True)
            return out

    wrapped.__name__ = name
    return wrapped
def _first_compat(x, axis=0):
    """Return the first non-null value of ``x`` (NaN when all values are
    null).  DataFrames are reduced along ``axis`` via ``apply``; anything
    else is treated as a 1-d array.
    """
    def _first(arr):
        arr = np.asarray(arr)
        arr = arr[notnull(arr)]
        return arr[0] if len(arr) else np.nan

    if isinstance(x, DataFrame):
        return x.apply(_first, axis=axis)
    return _first(x)
def _last_compat(x, axis=0):
    """Return the last non-null value of ``x`` (NaN when all values are
    null).  DataFrames are reduced along ``axis`` via ``apply``; anything
    else is treated as a 1-d array.
    """
    def _last(arr):
        arr = np.asarray(arr)
        arr = arr[notnull(arr)]
        return arr[-1] if len(arr) else np.nan

    if isinstance(x, DataFrame):
        return x.apply(_last, axis=axis)
    return _last(x)
class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction for a target
    object

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    These are local specifications and will override 'global' settings,
    that is the parameters axis and level which are passed to the groupby
    itself.

    Parameters
    ----------
    key : string, defaults to None
        groupby key, which selects the grouping column of the target
    level : name/number, defaults to None
        the level for the target index
    freq : string / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see
        `here <http://pandas.pydata.org/pandas-docs/stable/timeseries.html>`_.
    axis : number/name of the axis, defaults to 0
    sort : boolean, default to False
        whether to sort the resulting labels

    additional kwargs to control time-like groupers (when freq is passed)

    closed : closed end of interval; left or right
    label : interval boundary to use for labeling; left or right
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    Syntactic sugar for ``df.groupby('A')``

    >>> df.groupby(Grouper(key='A'))

    Specify a resample operation on the column 'date'

    >>> df.groupby(Grouper(key='date', freq='60s'))

    Specify a resample operation on the level 'date' on the columns axis
    with a frequency of 60s

    >>> df.groupby(Grouper(level='date', freq='60s', axis=1))
    """

    def __new__(cls, *args, **kwargs):
        # a Grouper constructed with a ``freq`` is really a resampling
        # specification; transparently construct a TimeGrouper instead
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper
        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        # these are resolved lazily by _set_grouper/_get_grouper
        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None

    @property
    def ax(self):
        # the Index being grouped on, available once _set_grouper has run
        return self.grouper

    def _get_grouper(self, obj):
        """
        Resolve this specification against ``obj``.

        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
                                                          axis=self.axis,
                                                          level=self.level,
                                                          sort=self.sort)
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj, sort=False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : the subject object
        sort : bool, default False
            whether to force-sort the grouping axis
        """
        if self.key is not None and self.level is not None:
            raise ValueError(
                "The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)
        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax.get_level_values(
                        level), name=ax.names[level])
                else:
                    if level not in (0, ax.name):
                        raise ValueError(
                            "The level {0} is not valid".format(level))

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            # use stable sort to support first, last, nth
            indexer = self.indexer = ax.argsort(kind='mergesort')
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis,
                           convert=False, is_copy=False)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    def _get_binner_for_grouping(self, obj):
        """ default to the standard binner here """
        group_axis = obj._get_axis(self.axis)
        return Grouping(group_axis, None, obj=obj, name=self.key,
                        level=self.level, sort=self.sort, in_axis=False)

    @property
    def groups(self):
        # dict of {group name -> group labels}
        return self.grouper.groups
class GroupByPlot(PandasObject):
    """
    Adapter implementing the ``.plot`` attribute for groupby objects.

    Calls and attribute lookups are forwarded to each group's ``.plot``
    through ``GroupBy.apply``.
    """

    def __init__(self, groupby):
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        # grouped.plot(...) -> group.plot(...) applied per group
        def f(group):
            return group.plot(*args, **kwargs)

        f.__name__ = 'plot'
        return self._groupby.apply(f)

    def __getattr__(self, name):
        # grouped.plot.<name>(...) -> group.plot.<name>(...) per group
        def attr(*args, **kwargs):
            def f(group):
                return getattr(group.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr
- class _GroupBy(PandasObject, SelectionMixin):
    # selection derived from the grouping keys (set lazily); see
    # _set_group_selection / _reset_group_selection
    _group_selection = None
    # NDFrame methods dispatchable through the groupby; subclasses override
    _apply_whitelist = frozenset([])

    def __init__(self, obj, keys=None, axis=0, level=None,
                 grouper=None, exclusions=None, selection=None, as_index=True,
                 sort=True, group_keys=True, squeeze=False, **kwargs):
        self._selection = selection

        if isinstance(obj, NDFrame):
            obj._consolidate_inplace()

        self.level = level

        # as_index=False is only meaningful for frame-like, axis=0 groupbys
        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError('as_index=False only valid with DataFrame')
            if axis != 0:
                raise ValueError('as_index=False only valid for axis=0')

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.mutated = kwargs.pop('mutated', False)

        # resolve the grouping specification unless one was handed in
        if grouper is None:
            grouper, exclusions, obj = _get_grouper(obj, keys,
                                                    axis=axis,
                                                    level=level,
                                                    sort=sort,
                                                    mutated=self.mutated)

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

        # we accept no other args
        validate_kwargs('group', kwargs, {})
    def __len__(self):
        # number of groups
        return len(self.groups)

    def __unicode__(self):
        # TODO: Better unicode/repr for GroupBy object
        return object.__repr__(self)

    def _assure_grouper(self):
        """
        we create the grouper on instantiation
        sub-classes may have a different policy
        """
        pass

    @property
    def groups(self):
        """ dict {group name -> group labels} """
        self._assure_grouper()
        return self.grouper.groups

    @property
    def ngroups(self):
        # number of distinct groups
        self._assure_grouper()
        return self.grouper.ngroups

    @property
    def indices(self):
        """ dict {group name -> group indices} """
        self._assure_grouper()
        return self.grouper.indices
- def _get_indices(self, names):
- """
- safe get multiple indices, translate keys for
- datelike to underlying repr
- """
- def get_converter(s):
- # possibly convert to the actual key types
- # in the indices, could be a Timestamp or a np.datetime64
- if isinstance(s, (Timestamp, datetime.datetime)):
- return lambda key: Timestamp(key)
- elif isinstance(s, np.datetime64):
- return lambda key: Timestamp(key).asm8
- else:
- return lambda key: key
- if len(names) == 0:
- return []
- if len(self.indices) > 0:
- index_sample = next(iter(self.indices))
- else:
- index_sample = None # Dummy sample
- name_sample = names[0]
- if isinstance(index_sample, tuple):
- if not isinstance(name_sample, tuple):
- msg = ("must supply a tuple to get_group with multiple"
- " grouping keys")
- raise ValueError(msg)
- if not len(name_sample) == len(index_sample):
- try:
- # If the original grouper was a tuple
- return [self.indices[name] for name in names]
- except KeyError:
- # turns out it wasn't a tuple
- msg = ("must supply a a same-length tuple to get_group"
- " with multiple grouping keys")
- raise ValueError(msg)
- converters = [get_converter(s) for s in index_sample]
- names = [tuple([f(n) for f, n in zip(converters, name)])
- for name in names]
- else:
- converter = get_converter(index_sample)
- names = [converter(name) for name in names]
- return [self.indices.get(name, []) for name in names]
- def _get_index(self, name):
- """ safe get index, translate keys for datelike to underlying repr """
- return self._get_indices([name])[0]
    @cache_readonly
    def _selected_obj(self):
        # the object operated on: honors an explicit selection first, then
        # any group-based selection, otherwise the full object
        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    def _reset_group_selection(self):
        """
        Clear group based selection. Used for methods needing to return info on
        each group regardless of whether a group selection was previously set.
        """
        if self._group_selection is not None:
            self._group_selection = None
            # GH12839 clear cached selection too when changing group selection
            self._reset_cache('_selected_obj')

    def _set_group_selection(self):
        """
        Create group based selection. Used when selection is not passed
        directly but instead via a grouper.
        """
        grp = self.grouper
        if self.as_index and getattr(grp, 'groupings', None) is not None and \
           self.obj.ndim > 1:
            ax = self.obj._info_axis
            # select everything except the in-axis grouping columns
            groupers = [g.name for g in grp.groupings
                        if g.level is None and g.in_axis]

            if len(groupers):
                self._group_selection = ax.difference(Index(groupers)).tolist()
                # GH12839 clear selected obj cache when group selection changes
                self._reset_cache('_selected_obj')
    def _set_result_index_ordered(self, result):
        # set the result index on the passed values object and
        # return the new object, xref 8046

        # the values/counts are repeated according to the group index
        # shortcut if we have an already ordered grouper
        if not self.grouper.is_monotonic:
            # restore the original (pre-groupby) row order via the group
            # indices, then sort back
            index = Index(np.concatenate(
                self._get_indices(self.grouper.result_index)))
            result.set_axis(self.axis, index)
            result = result.sort_index(axis=self.axis)

        result.set_axis(self.axis, self.obj._get_axis(self.axis))
        return result
    def _dir_additions(self):
        # expose whitelisted NDFrame methods through tab-completion
        return self.obj._dir_additions() | self._apply_whitelist

    def __getattr__(self, attr):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            # column selection, e.g. ``grouped.A``
            return self[attr]
        if hasattr(self.obj, attr):
            # dispatch a whitelisted NDFrame method group-wise
            return self._make_wrapper(attr)

        raise AttributeError("%r object has no attribute %r" %
                             (type(self).__name__, attr))

    plot = property(GroupByPlot)
    def _make_wrapper(self, name):
        """
        Wrap the NDFrame method/attribute ``name`` so it is applied
        group-wise; raises AttributeError when ``name`` is not whitelisted.
        """
        if name not in self._apply_whitelist:
            is_callable = callable(getattr(self._selected_obj, name, None))
            kind = ' callable ' if is_callable else ' '
            msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
                   "using the 'apply' method".format(kind, name,
                                                     type(self).__name__))
            raise AttributeError(msg)

        # need to setup the selection
        # as are not passed directly but in the grouper
        self._set_group_selection()

        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            # a plain attribute/property: fetch it per group
            return self.apply(lambda self: getattr(self, name))

        # unbound method, so the group object can be passed explicitly
        f = getattr(type(self._selected_obj), name)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            kwargs_with_axis = kwargs.copy()
            if 'axis' not in kwargs_with_axis or \
               kwargs_with_axis['axis'] is None:
                kwargs_with_axis['axis'] = self.axis

            def curried_with_axis(x):
                return f(x, *args, **kwargs_with_axis)

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = curried_with_axis.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in _plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried_with_axis)
            except Exception:
                try:
                    return self.apply(curried)
                except Exception:
                    # related to : GH3688
                    # try item-by-item
                    # this can be called recursively, so need to raise
                    # ValueError
                    # if we don't have this method to indicated to aggregate to
                    # mark this column as an error
                    try:
                        return self._aggregate_item_by_item(name,
                                                            *args, **kwargs)
                    except (AttributeError):
                        raise ValueError

        return wrapper
- def get_group(self, name, obj=None):
- """
- Constructs NDFrame from group with provided name
- Parameters
- ----------
- name : object
- the name of the group to get as a DataFrame
- obj : NDFrame, default None
- the NDFrame to take the DataFrame out of. If
- it is None, the object groupby was called on will
- be used
- Returns
- -------
- group : type of obj
- """
- if obj is None:
- obj = self._selected_obj
- inds = self._get_index(name)
- if not len(inds):
- raise KeyError(name)
- return obj.take(inds, axis=self.axis, convert=False)
- def __iter__(self):
- """
- Groupby iterator
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- return self.grouper.get_iterator(self.obj, axis=self.axis)
    @Substitution(name='groupby')
    def apply(self, func, *args, **kwargs):
        """
        Apply function and combine results together in an intelligent way. The
        split-apply-combine combination rules attempt to be as common sense
        based as possible. For example:

        case 1:
        group DataFrame
        apply aggregation function (f(chunk) -> Series)
        yield DataFrame, with group axis having group labels

        case 2:
        group DataFrame
        apply transform function ((f(chunk) -> DataFrame with same indexes)
        yield DataFrame with resulting chunks glued together

        case 3:
        group Series
        apply function with f(chunk) -> DataFrame
        yield DataFrame with result of chunks glued together

        Parameters
        ----------
        func : function

        Notes
        -----
        See online documentation for full exposition on how to use apply.

        In the current implementation apply calls func twice on the
        first group to decide whether it can take a fast or slow code
        path. This can lead to unexpected behavior if func has
        side-effects, as they will take effect twice for the first
        group.

        See also
        --------
        aggregate, transform"""
        func = self._is_builtin_func(func)

        # this is needed so we don't try and wrap strings. If we could
        # resolve functions to their callable functions prior, this
        # wouldn't be needed
        if args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    return func(g, *args, **kwargs)
            else:
                raise ValueError('func must be a callable if args or '
                                 'kwargs are supplied')
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context('mode.chained_assignment', None):
            return self._python_apply_general(f)

    def _python_apply_general(self, f):
        # run ``f`` over the groups, then let the subclass stitch the
        # per-group results back together
        keys, values, mutated = self.grouper.apply(f, self._selected_obj,
                                                   self.axis)

        return self._wrap_applied_output(
            keys,
            values,
            not_indexed_same=mutated or self.mutated)
    def _iterate_slices(self):
        # yield (name, values) pairs to operate on; overridden by
        # NDFrame-specific subclasses
        yield self.name, self._selected_obj

    def transform(self, func, *args, **kwargs):
        # abstract; implemented by subclasses
        raise AbstractMethodError(self)
    def _cumcount_array(self, ascending=True):
        """
        Number the rows within each group (0-based cumulative count).

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Note
        ----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = _get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        # run: True at the first row of each group block (after sorting)
        run = np.r_[True, ids[:-1] != ids[1:]]
        # rep: the size of each group block
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            # restart the count at 0 on every group boundary
            out -= np.repeat(out[run], rep)
        else:
            # count down from group size - 1 to 0
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        # invert the sort so results align with the original row order
        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev].astype(np.int64, copy=False)
    def _index_with_as_index(self, b):
        """
        Take boolean mask of index to be returned from apply, if as_index=True
        """
        # TODO perf, it feels like this should already be somewhere...
        from itertools import chain
        original = self._selected_obj.index
        gp = self.grouper
        # group-key levels first, then the original index levels
        levels = chain((gp.levels[i][gp.labels[i][b]]
                        for i in range(len(gp.groupings))),
                       (original.get_level_values(i)[b]
                        for i in range(original.nlevels)))
        new = MultiIndex.from_arrays(list(levels))
        new.names = gp.names + original.names
        return new
- def _try_cast(self, result, obj):
- """
- try to cast the result to our obj original type,
- we may have roundtripped thru object in the mean-time
- """
- if obj.ndim > 1:
- dtype = obj.values.dtype
- else:
- dtype = obj.dtype
- if not is_scalar(result):
- result = _possibly_downcast_to_dtype(result, dtype)
- return result
    def _cython_transform(self, how, numeric_only=True):
        # group-wise transform through the Cython kernel named ``how``
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.transform(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_transformed_output(output, names)

    def _cython_agg_general(self, how, numeric_only=True):
        # group-wise aggregation through the Cython kernel named ``how``
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.aggregate(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_aggregated_output(output, names)
    def _python_agg_general(self, func, *args, **kwargs):
        # pure-Python aggregation path (used when no Cython kernel applies)
        func = self._is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output = {}
        for name, obj in self._iterate_slices():
            try:
                result, counts = self.grouper.agg_series(obj, f)
                output[name] = self._try_cast(result, obj)
            except TypeError:
                # column not aggregatable with this func; skip it
                continue

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:
            mask = counts.ravel() > 0
            for name, result in compat.iteritems(output):

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = _ensure_float(values)

                output[name] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)

    def _wrap_applied_output(self, *args, **kwargs):
        # abstract; implemented by subclasses
        raise AbstractMethodError(self)
    def _concat_objects(self, keys, values, not_indexed_same=False):
        """
        Glue the per-group results ``values`` back into a single object,
        optionally keyed by the group ``keys``.
        """
        from pandas.tools.merge import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in values:
                if v is not None:
                    ax = v._get_axis(self.axis)
                    ax._reset_identity()
            return values

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            # reindex back to the original ordering
            if isinstance(result, Series):
                result = result.reindex(ax)
            else:
                result = result.reindex_axis(ax, axis=self.axis)

        elif self.group_keys:
            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(values, axis=self.axis, keys=group_keys,
                                levels=group_levels, names=group_names)
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        if (isinstance(result, Series) and
                getattr(self, 'name', None) is not None):
            result.name = self.name

        return result
    def _apply_filter(self, indices, dropna):
        # given per-group row positions that passed the filter, build the
        # filtered result: drop the rest (dropna=True) or NaN-mask them
        if len(indices) == 0:
            indices = np.array([], dtype='int64')
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered
- class GroupBy(_GroupBy):
- """
- Class for grouping and aggregating relational data. See aggregate,
- transform, and apply functions on this object.
- It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
- ::
- grouped = groupby(obj, ...)
- Parameters
- ----------
- obj : pandas object
- axis : int, default 0
- level : int, default None
- Level of MultiIndex
- groupings : list of Grouping objects
- Most users should ignore this
- exclusions : array-like, optional
- List of columns to exclude
- name : string
- Most users should ignore this
- Notes
- -----
- After grouping, see aggregate, apply, and transform functions. Here are
- some other brief notes about usage. When grouping by multiple groups, the
- result index will be a MultiIndex (hierarchical) by default.
- Iteration produces (key, group) tuples, i.e. chunking the data by group. So
- you can write code like:
- ::
- grouped = obj.groupby(keys, axis=axis)
- for key, group in grouped:
- # do something with the data
- Function calls on GroupBy, if not specially implemented, "dispatch" to the
- grouped data. So if you group a DataFrame and wish to invoke the std()
- method on each group, you can simply do:
- ::
- df.groupby(mapper).std()
- rather than
- ::
- df.groupby(mapper).aggregate(np.std)
- You can pass arguments to these "wrapped" functions, too.
- See the online documentation for full exposition on these topics and much
- more
- Returns
- -------
- **Attributes**
- groups : dict
- {group name -> group labels}
- len(grouped) : int
- Number of groups
- """
    _apply_whitelist = _common_apply_whitelist

    def irow(self, i):
        """
        DEPRECATED. Use ``.nth(i)`` instead
        """

        # 10177
        warnings.warn("irow(i) is deprecated. Please use .nth(i)",
                      FutureWarning, stacklevel=2)
        return self.nth(i)
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def count(self):
        """Compute count of group, excluding missing values"""

        # defined here for API doc; subclasses provide the implementation
        raise NotImplementedError
- @Substitution(name='groupby')
- @Appender(_doc_template)
- def mean(self, *args, **kwargs):
- """
- Compute mean of groups, excluding missing values
- For multiple groupings, the result index will be a MultiIndex
- """
- nv.validate_groupby_func('mean', args, kwargs)
- try:
- return self._cython_agg_general('mean')
- except GroupByError:
- raise
- except Exception: # pragma: no cover
- self._set_group_selection()
- f = lambda x: x.mean(axis=self.axis)
- return self._python_agg_general(f)
- @Substitution(name='groupby')
- @Appender(_doc_template)
- def median(self):
- """
- Compute median of groups, excluding missing values
- For multiple groupings, the result index will be a MultiIndex
- """
- try:
- return self._cython_agg_general('median')
- except GroupByError:
- raise
- except Exception: # pragma: no cover
- self._set_group_selection()
- def f(x):
- if isinstance(x, np.ndarray):
- x = Series(x)
- return x.median(axis=self.axis)
- return self._python_agg_general(f)
- @Substitution(name='groupby')
- @Appender(_doc_template)
- def std(self, ddof=1, *args, **kwargs):
- """
- Compute standard deviation of groups, excluding missing values
- For multiple groupings, the result index will be a MultiIndex
- Parameters
- ----------
- ddof : integer, default 1
- degrees of freedom
- """
- # TODO: implement at Cython level?
- nv.validate_groupby_func('std', args, kwargs)
- return np.sqrt(self.var(ddof=ddof))
- @Substitution(name='groupby')
- @Appender(_doc_template)
- def var(self, ddof=1, *args, **kwargs):
- """
- Compute variance of groups, excluding missing values
- For multiple groupings, the result index will be a MultiIndex
- Parameters
- ----------
- ddof : integer, default 1
- degrees of freedom
- """
- nv.validate_groupby_func('var', args, kwargs)
- if ddof == 1:
- return self._cython_agg_general('var')
- else:
- self._set_group_selection()
- f = lambda x: x.var(ddof=ddof)
- return self._python_agg_general(f)
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def sem(self, ddof=1):
        """
        Compute standard error of the mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex

        Parameters
        ----------
        ddof : integer, default 1
            degrees of freedom
        """
        # standard error of the mean = std / sqrt(group size)
        return self.std(ddof=ddof) / np.sqrt(self.count())

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def size(self):
        """Compute group sizes"""
        return self.grouper.size()
    # aggregation methods generated by _groupby_function; first/last also
    # soft-convert datetimes on the fallback path (_convert=True)
    sum = _groupby_function('sum', 'add', np.sum)
    prod = _groupby_function('prod', 'prod', np.prod)
    min = _groupby_function('min', 'min', np.min, numeric_only=False)
    max = _groupby_function('max', 'max', np.max, numeric_only=False)
    first = _groupby_function('first', 'first', _first_compat,
                              numeric_only=False, _convert=True)
    last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                             _convert=True)
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def ohlc(self):
        """
        Compute open, high, low and close values of a group, excluding
        missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper
        Return a new grouper with our resampler appended
        """
        from pandas.tseries.resample import get_resampler_for_grouping
        return get_resampler_for_grouping(self, rule, *args, **kwargs)
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def rolling(self, *args, **kwargs):
        """
        Return a rolling grouper, providing rolling
        functionality per group
        """
        from pandas.core.window import RollingGroupby
        return RollingGroupby(self, *args, **kwargs)
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def expanding(self, *args, **kwargs):
        """
        Return an expanding grouper, providing expanding
        functionality per group
        """
        from pandas.core.window import ExpandingGroupby
        return ExpandingGroupby(self, *args, **kwargs)
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def pad(self, limit=None):
        """
        Forward fill the values

        Parameters
        ----------
        limit : integer, optional
            limit of how many values to fill

        See Also
        --------
        Series.fillna
        DataFrame.fillna
        """
        # forward-fill within each group, never across group boundaries
        return self.apply(lambda x: x.ffill(limit=limit))
    ffill = pad
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def backfill(self, limit=None):
        """
        Backward fill the values

        Parameters
        ----------
        limit : integer, optional
            limit of how many values to fill

        See Also
        --------
        Series.fillna
        DataFrame.fillna
        """
        # back-fill within each group, never across group boundaries
        return self.apply(lambda x: x.bfill(limit=limit))
    bfill = backfill
@Substitution(name='groupby')
@Appender(_doc_template)
def nth(self, n, dropna=None):
    """
    Take the nth row from each group if n is an int, or a subset of rows
    if n is a list of ints.

    If dropna, will take the nth non-null row, dropna is either
    Truthy (if a Series) or 'all', 'any' (if a DataFrame);
    this is equivalent to calling dropna(how=dropna) before the
    groupby.

    Parameters
    ----------
    n : int or list of ints
        a single nth value for the row or a list of nth values
    dropna : None or str, optional
        apply the specified dropna operation before counting which row is
        the nth row. Needs to be None, 'any' or 'all'

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
    ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
    >>> g = df.groupby('A')
    >>> g.nth(0)
         B
    A
    1  NaN
    2  3.0
    >>> g.nth(1)
         B
    A
    1  2.0
    2  5.0
    >>> g.nth(-1)
         B
    A
    1  4.0
    2  5.0
    >>> g.nth([0, 1])
         B
    A
    1  NaN
    1  2.0
    2  3.0
    2  5.0

    Specifying ``dropna`` allows count ignoring NaN

    >>> g.nth(0, dropna='any')
         B
    A
    1  2.0
    2  3.0

    NaNs denote group exhausted when using dropna

    >>> g.nth(3, dropna='any')
        B
    A
    1 NaN
    2 NaN

    Specifying ``as_index=False`` in ``groupby`` keeps the original index.

    >>> df.groupby('A', as_index=False).nth(1)
       A    B
    1  1  2.0
    4  2  5.0
    """

    # Normalize ``n`` to a list of distinct integer positions; the
    # dropna path below only supports a single scalar ``n``.
    if isinstance(n, int):
        nth_values = [n]
    elif isinstance(n, (set, list, tuple)):
        nth_values = list(set(n))
        if dropna is not None:
            raise ValueError(
                "dropna option with a list of nth values is not supported")
    else:
        raise TypeError("n needs to be an int or a list/set/tuple of ints")

    nth_values = np.array(nth_values, dtype=np.intp)
    self._set_group_selection()

    if not dropna:
        # Fast path (no dropna): a row is kept if its position counted
        # from the front matches a requested n, or its position counted
        # from the back matches a requested negative n.
        mask = np.in1d(self._cumcount_array(), nth_values) | \
            np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values)

        out = self._selected_obj[mask]
        if not self.as_index:
            return out

        # Re-key the selected rows by their group label.
        ids, _, _ = self.grouper.group_info
        out.index = self.grouper.result_index[ids[mask]]

        return out.sort_index() if self.sort else out

    if isinstance(self._selected_obj, DataFrame) and \
            dropna not in ['any', 'all']:
        # Note: when agg-ing picker doesn't raise this, just returns NaN
        raise ValueError("For a DataFrame groupby, dropna must be "
                         "either None, 'any' or 'all', "
                         "(was passed %s)." % (dropna),)

    # old behaviour, but with all and any support for DataFrames.
    # modified in GH 7559 to have better perf
    max_len = n if n >= 0 else - 1 - n
    dropped = self.obj.dropna(how=dropna, axis=self.axis)

    # get a new grouper for our dropped obj
    if self.keys is None and self.level is None:

        # we don't have the grouper info available
        # (e.g. we have selected out
        # a column that is not in the current object)
        axis = self.grouper.axis
        grouper = axis[axis.isin(dropped.index)]

    else:

        # create a grouper with the original parameters, but on the dropped
        # object
        grouper, _, _ = _get_grouper(dropped, key=self.keys,
                                     axis=self.axis, level=self.level,
                                     sort=self.sort,
                                     mutated=self.mutated)

    # Group the NA-dropped object and take the nth row per group there;
    # ``sizes`` lets us detect groups whose size is below max_len.
    grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
    sizes, result = grb.size(), grb.nth(n)
    mask = (sizes < max_len).values

    # set the results which don't meet the criteria
    if len(result) and mask.any():
        result.loc[mask] = np.nan

    # reset/reindex to the original groups
    if len(self.obj) == len(dropped) or \
            len(result) == len(self.grouper.result_index):
        result.index = self.grouper.result_index
    else:
        # some groups vanished entirely after dropna; reindex fills
        # them back in as NaN
        result = result.reindex(self.grouper.result_index)

    return result
@Substitution(name='groupby')
@Appender(_doc_template)
def cumcount(self, ascending=True):
    """
    Number each item in each group from 0 to the length of that group - 1.

    Essentially this is equivalent to

    >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

    Parameters
    ----------
    ascending : bool, default True
        If False, number in reverse, from length of group - 1 to 0.

    Examples
    --------
    >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
    ...                   columns=['A'])
    >>> df
       A
    0  a
    1  a
    2  a
    3  b
    4  b
    5  a
    >>> df.groupby('A').cumcount()
    0    0
    1    1
    2    2
    3    0
    4    1
    5    3
    dtype: int64
    >>> df.groupby('A').cumcount(ascending=False)
    0    3
    1    2
    2    1
    3    1
    4    0
    5    0
    dtype: int64
    """
    self._set_group_selection()

    # per-row position within its group, aligned on the selected index
    positions = self._cumcount_array(ascending=ascending)
    return Series(positions, self._selected_obj.index)
@Substitution(name='groupby')
@Appender(_doc_template)
def cumprod(self, axis=0, *args, **kwargs):
    """Cumulative product for each group"""
    nv.validate_groupby_func('cumprod', args, kwargs)
    if axis == 0:
        # default axis: grouped cumprod runs through the cython path
        return self._cython_transform('cumprod')
    # non-default axis falls back to a group-wise apply
    return self.apply(lambda x: x.cumprod(axis=axis))
@Substitution(name='groupby')
@Appender(_doc_template)
def cumsum(self, axis=0, *args, **kwargs):
    """Cumulative sum for each group"""
    nv.validate_groupby_func('cumsum', args, kwargs)
    if axis == 0:
        # default axis: grouped cumsum runs through the cython path
        return self._cython_transform('cumsum')
    # non-default axis falls back to a group-wise apply
    return self.apply(lambda x: x.cumsum(axis=axis))
@Substitution(name='groupby')
@Appender(_doc_template)
def shift(self, periods=1, freq=None, axis=0):
    """
    Shift each group by periods observations

    Parameters
    ----------
    periods : integer, default 1
        number of periods to shift
    freq : frequency string
    axis : axis to shift, default 0
    """
    # anything beyond a plain integer shift along axis 0 is handled
    # by a group-wise apply
    if freq is not None or axis != 0:
        return self.apply(lambda x: x.shift(periods, freq, axis))

    labels, _, ngroups = self.grouper.group_info

    # filled in by Cython
    indexer = np.zeros_like(labels)
    _algos.group_shift_indexer(indexer, labels, ngroups, periods)

    output = dict((name, algos.take_nd(obj.values, indexer))
                  for name, obj in self._iterate_slices())
    return self._wrap_transformed_output(output)
@Substitution(name='groupby')
@Appender(_doc_template)
def head(self, n=5):
    """
    Returns first n rows of each group.

    Essentially equivalent to ``.apply(lambda x: x.head(n))``,
    except ignores as_index flag.

    Examples
    --------
    >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                       columns=['A', 'B'])
    >>> df.groupby('A', as_index=False).head(1)
       A  B
    0  1  2
    2  5  6
    >>> df.groupby('A').head(1)
       A  B
    0  1  2
    2  5  6
    """
    self._reset_group_selection()

    # keep rows whose within-group position is below n
    in_head = self._cumcount_array() < n
    return self._selected_obj[in_head]
@Substitution(name='groupby')
@Appender(_doc_template)
def tail(self, n=5):
    """
    Returns last n rows of each group

    Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
    except ignores as_index flag.

    Examples
    --------
    >>> df = DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
                       columns=['A', 'B'])
    >>> df.groupby('A').tail(1)
       A  B
    1  a  2
    3  b  2
    >>> df.groupby('A').head(1)
       A  B
    0  a  1
    2  b  1
    """
    self._reset_group_selection()

    # keep rows whose position counted from the end of the group is below n
    in_tail = self._cumcount_array(ascending=False) < n
    return self._selected_obj[in_tail]
@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
    # dispatch on the concrete pandas container type
    for container, grouped in ((Series, SeriesGroupBy),
                               (DataFrame, DataFrameGroupBy)):
        if isinstance(obj, container):
            return grouped(obj, by, **kwds)
    raise TypeError('invalid type: %s' % type(obj))  # pragma: no cover
- def _get_axes(group):
- if isinstance(group, Series):
- return [group.index]
- else:
- return group.axes
- def _is_indexed_like(obj, axes):
- if isinstance(obj, Series):
- if len(axes) > 1:
- return False
- return obj.index.equals(axes[0])
- elif isinstance(obj, DataFrame):
- return obj.index.equals(axes[0])
- return False
class BaseGrouper(object):
    """
    This is an internal Grouper class, which actually holds
    the generated groups
    """

    def __init__(self, axis, groupings, sort=True, group_keys=True,
                 mutated=False):
        # with more than one (or zero) groupings we both compress the
        # group ids and filter out empty groups
        multi_key = len(groupings) != 1
        self._filter_empty_groups = multi_key
        self.compressed = multi_key
        self.axis = axis
        self.groupings = groupings
        self.sort = sort
        self.group_keys = group_keys
        self.mutated = mutated
@property
def shape(self):
    """Tuple with one group count per grouping level."""
    counts = [grouping.ngroups for grouping in self.groupings]
    return tuple(counts)
def __iter__(self):
    # Iterating a grouper yields the group keys (the keys of the
    # ``indices`` mapping), one per group.
    return iter(self.indices)
@property
def nkeys(self):
    # Number of grouping levels (one per key grouped on).
    return len(self.groupings)
def get_iterator(self, data, axis=0):
    """
    Groupby iterator

    Returns
    -------
    Generator yielding sequence of (name, subsetted object)
    for each group
    """
    splitter = self._get_splitter(data, axis=axis)
    # the splitter yields (position, group); the position is discarded
    # in favour of the group key
    for key, (_, group) in zip(self._get_group_keys(), splitter):
        yield key, group
def _get_splitter(self, data, axis=0):
    """Build a splitter over *data* using this grouper's comp ids."""
    ids, _, ngroups = self.group_info
    return get_splitter(data, ids, ngroups, axis=axis)
def _get_group_keys(self):
    """Return the key identifying each group, flattened for multi-key
    groupers."""
    if len(self.groupings) != 1:
        comp_ids, _, ngroups = self.group_info
        # provide "flattened" iterator for multi-group setting
        mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
        return [mapper.get_key(i) for i in range(ngroups)]
    # single grouping: the keys are simply its levels
    return self.levels[0]
- def apply(self, f, data, axis=0):
- mutated = self.mutated
- splitter = self._get_splitter(data, axis=axis)
- group_keys = self._get_group_keys()
- # oh boy
- f_name = com._get_callable_name(f)
- if (f_name not in _plotting_methods and
- hasattr(splitter, 'fast_apply') and axis == 0):
- try:
- values, mutated = splitter.fast_apply(f, group_keys)
- return group_keys, values, mutated
- except (lib.InvalidApply):
- # we detect a mutation of some kind
- # so take slow path
- pass
- except Exception:
- # raise this error to the caller
- pass
- result_values = []
- for key, (i, group) in zip(…
Large files are truncated, but you can click here to view the full file