/pandas/core/groupby.py
Python | 3509 lines | 3505 code | 4 blank | 0 comment | 9 complexity | 3a4baa974c53d4b762d06020f453f60c MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- import types
- from functools import wraps
- import numpy as np
- import datetime
- import collections
- from pandas.compat import(
- zip, builtins, range, long, lzip,
- OrderedDict, callable
- )
- from pandas import compat
- from pandas.core.base import PandasObject
- from pandas.core.categorical import Categorical
- from pandas.core.frame import DataFrame
- from pandas.core.generic import NDFrame
- from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes
- from pandas.core.internals import BlockManager, make_block
- from pandas.core.series import Series
- from pandas.core.panel import Panel
- from pandas.util.decorators import cache_readonly, Appender
- import pandas.core.algorithms as algos
- import pandas.core.common as com
- from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
- notnull, _DATELIKE_DTYPES, is_numeric_dtype,
- is_timedelta64_dtype, is_datetime64_dtype)
- from pandas import _np_version_under1p7
- import pandas.lib as lib
- from pandas.lib import Timestamp
- import pandas.tslib as tslib
- import pandas.algos as _algos
- import pandas.hashtable as _hash
# Shared docstring for GroupBy.agg / .aggregate, attached below via the
# @Appender decorator.
_agg_doc = """Aggregate using input function or dict of {column -> function}

Parameters
----------
arg : function or dict
    Function to use for aggregating groups. If a function, must either
    work when passed a DataFrame or when passed to DataFrame.apply. If
    passed a dict, the keys must be DataFrame column names.

Notes
-----
Numpy functions mean/median/prod/sum/std/var are special cased so the
default behavior is applying the function along axis=0
(e.g., np.mean(arr_2d, axis=0)) as opposed to
mimicking the default Numpy behavior (e.g., np.mean(arr_2d)).

Returns
-------
aggregated : DataFrame
"""
# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
_plotting_methods = frozenset(['plot', 'boxplot', 'hist'])

# NDFrame methods that GroupBy.__getattr__ is allowed to forward to the
# grouped object (see _make_wrapper); everything else must go through apply.
_common_apply_whitelist = frozenset([
    'last', 'first',
    'head', 'tail', 'median',
    'mean', 'sum', 'min', 'max',
    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
    'resample',
    'describe',
    'rank', 'quantile', 'count',
    'fillna',
    'mad',
    'any', 'all',
    'irow', 'take',
    'idxmax', 'idxmin',
    'shift', 'tshift',
    'ffill', 'bfill',
    'pct_change', 'skew',
    'corr', 'cov', 'diff',
]) | _plotting_methods

# Series groups have no boxplot, but do expose several Series-only methods.
_series_apply_whitelist = \
    (_common_apply_whitelist - set(['boxplot'])) | \
    frozenset(['dtype', 'value_counts', 'unique', 'nunique',
               'nlargest', 'nsmallest'])

_dataframe_apply_whitelist = \
    _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
class GroupByError(Exception):
    """Base class for errors raised during groupby operations."""
    pass


class DataError(GroupByError):
    """Raised when no valid (e.g. numeric) data is available to aggregate."""
    pass


class SpecificationError(GroupByError):
    """Raised when an aggregation specification is invalid."""
    pass
- def _groupby_function(name, alias, npfunc, numeric_only=True,
- _convert=False):
- def f(self):
- self._set_selection_from_grouper()
- try:
- return self._cython_agg_general(alias, numeric_only=numeric_only)
- except AssertionError as e:
- raise SpecificationError(str(e))
- except Exception:
- result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
- if _convert:
- result = result.convert_objects()
- return result
- f.__doc__ = "Compute %s of group values" % name
- f.__name__ = name
- return f
- def _first_compat(x, axis=0):
- def _first(x):
- x = np.asarray(x)
- x = x[notnull(x)]
- if len(x) == 0:
- return np.nan
- return x[0]
- if isinstance(x, DataFrame):
- return x.apply(_first, axis=axis)
- else:
- return _first(x)
- def _last_compat(x, axis=0):
- def _last(x):
- x = np.asarray(x)
- x = x[notnull(x)]
- if len(x) == 0:
- return np.nan
- return x[-1]
- if isinstance(x, DataFrame):
- return x.apply(_last, axis=axis)
- else:
- return _last(x)
def _count_compat(x, axis=0):
    # Element count of the grouped slice. `axis` is accepted only for
    # signature compatibility with the other *_compat helpers; it is unused.
    return x.size
class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction for a target
    object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    These are local specifications and will override 'global' settings, that
    is the parameters axis and level which are passed to the groupby itself.

    Parameters
    ----------
    key : string, defaults to None
        groupby key, which selects the grouping column of the target
    level : name/number, defaults to None
        the level for the target index
    freq : string / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object
    axis : number/name of the axis, defaults to None
    sort : boolean, default to False
        whether to sort the resulting labels

    additional kwargs to control time-like groupers (when freq is passed)

    closed : closed end of interval; left or right
    label : interval boundary to use for labeling; left or right
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A')
    >>> df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date'
    >>> df.groupby(Grouper(level='date',freq='60s',axis=1)) :
        specify a resample on the level 'date' on the columns axis with a frequency of 60s
    """

    def __new__(cls, *args, **kwargs):
        # a frequency implies resample-like behavior; transparently construct
        # a TimeGrouper instead
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper
        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=None, sort=False):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        # filled in lazily by _set_grouper; the original assigned
        # self.grouper = None twice — the duplicate has been removed
        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None

    @property
    def ax(self):
        return self.grouper

    def _get_grouper(self, obj):
        """
        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj, sort=False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : the subject object
        """
        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)
        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    if isinstance(level, compat.string_types):
                        if obj.index.name != level:
                            raise ValueError('level name %s is not the name of the '
                                             'index' % level)
                    elif level > 0:
                        raise ValueError('level > 0 only valid with MultiIndex')
                    ax = Index(ax.get_level_values(level), name=level)
                else:
                    if not (level == 0 or level == ax.name):
                        raise ValueError("The grouper level {0} is not valid".format(level))

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            indexer = self.indexer = ax.argsort(kind='quicksort')
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    def _get_binner_for_grouping(self, obj):
        raise NotImplementedError

    @property
    def groups(self):
        return self.grouper.groups
class GroupBy(PandasObject):
    """
    Class for grouping and aggregating relational data. See aggregate,
    transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : string
        Most users should ignore this

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups
    """
    # methods that may be forwarded to the underlying object via __getattr__
    _apply_whitelist = _common_apply_whitelist
    _internal_names = ['_cache']
    _internal_names_set = set(_internal_names)
    # implicit column selection installed by _set_selection_from_grouper
    _group_selection = None
    def __init__(self, obj, keys=None, axis=0, level=None,
                 grouper=None, exclusions=None, selection=None, as_index=True,
                 sort=True, group_keys=True, squeeze=False):
        self._selection = selection

        # consolidate the block manager up front so later ops are cheap
        if isinstance(obj, NDFrame):
            obj._consolidate_inplace()

        self.level = level

        # as_index=False (SQL-style grouped output) only makes sense for
        # row-wise DataFrame groupbys
        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError('as_index=False only valid with DataFrame')
            if axis != 0:
                raise ValueError('as_index=False only valid for axis=0')

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze

        # build the internal grouper from the keys unless one was supplied;
        # this may also re-sort/replace obj
        if grouper is None:
            grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
                                                    level=level, sort=sort)

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

    def __len__(self):
        # number of groups
        return len(self.indices)

    def __unicode__(self):
        # TODO: Better unicode/repr for GroupBy object
        return object.__repr__(self)
    @property
    def groups(self):
        """ dict {group name -> group labels} """
        return self.grouper.groups

    @property
    def ngroups(self):
        # number of distinct groups
        return self.grouper.ngroups

    @property
    def indices(self):
        """ dict {group name -> group indices} """
        return self.grouper.indices
- def _get_index(self, name):
- """ safe get index, translate keys for datelike to underlying repr """
- def convert(key, s):
- # possibly convert to they actual key types
- # in the indices, could be a Timestamp or a np.datetime64
- if isinstance(s, (Timestamp,datetime.datetime)):
- return Timestamp(key)
- elif isinstance(s, np.datetime64):
- return Timestamp(key).asm8
- return key
- sample = next(iter(self.indices))
- if isinstance(sample, tuple):
- if not isinstance(name, tuple):
- raise ValueError("must supply a tuple to get_group with multiple grouping keys")
- if not len(name) == len(sample):
- raise ValueError("must supply a a same-length tuple to get_group with multiple grouping keys")
- name = tuple([ convert(n, k) for n, k in zip(name,sample) ])
- else:
- name = convert(name, sample)
- return self.indices[name]
    @property
    def name(self):
        # name of the selected column; None when operating on the whole object
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        # normalize the selection to a list for uniform downstream handling
        if not isinstance(self._selection, (list, tuple, Series, np.ndarray)):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        # the object this groupby actually operates on: honors an explicit
        # selection, or the implicit grouper-column exclusion
        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]
    def _set_selection_from_grouper(self):
        """ we may need create a selection if we have non-level groupers """
        grp = self.grouper
        if self.as_index and getattr(grp, 'groupings', None) is not None:
            ax = self.obj._info_axis
            # column-based groupers that live on the info axis
            groupers = [g.name for g in grp.groupings
                        if g.level is None and g.name is not None and g.name in ax]
            if len(groupers):
                # Index subtraction here is set-difference: select every
                # column except the grouper columns
                self._group_selection = (ax - Index(groupers)).tolist()
    def _local_dir(self):
        # expose the whitelist in tab-completion alongside the obj's names
        return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))

    def __getattr__(self, attr):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        # a column name resolves to a column-selected groupby
        if attr in self.obj:
            return self[attr]
        # otherwise try forwarding the method to the grouped object
        if hasattr(self.obj, attr):
            return self._make_wrapper(attr)

        raise AttributeError("%r object has no attribute %r" %
                             (type(self).__name__, attr))

    def __getitem__(self, key):
        # implemented by Series/DataFrame subclasses
        raise NotImplementedError('Not implemented: %s' % key)
    def _make_wrapper(self, name):
        """Return a callable that applies the object's method ``name``
        group-wise, falling back through progressively slower paths."""
        if name not in self._apply_whitelist:
            is_callable = callable(getattr(self._selected_obj, name, None))
            kind = ' callable ' if is_callable else ' '
            msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
                   "using the 'apply' method".format(kind, name,
                                                     type(self).__name__))
            raise AttributeError(msg)

        # need to setup the selection
        # as are not passed directly but in the grouper
        self._set_selection_from_grouper()

        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            # a plain attribute/property: fetch it per group
            return self.apply(lambda self: getattr(self, name))

        # unbound method so it can be called on each group chunk
        f = getattr(type(self._selected_obj), name)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            kwargs_with_axis = kwargs.copy()
            if 'axis' not in kwargs_with_axis:
                kwargs_with_axis['axis'] = self.axis

            def curried_with_axis(x):
                return f(x, *args, **kwargs_with_axis)

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = curried_with_axis.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in _plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried_with_axis)
            except Exception:
                try:
                    return self.apply(curried)
                except Exception:

                    # related to : GH3688
                    # try item-by-item
                    # this can be called recursively, so need to raise ValueError if
                    # we don't have this method to indicated to aggregate to
                    # mark this column as an error
                    try:
                        return self._aggregate_item_by_item(name, *args, **kwargs)
                    except (AttributeError):
                        raise ValueError

        return wrapper
    def get_group(self, name, obj=None):
        """
        Constructs NDFrame from group with provided name

        Parameters
        ----------
        name : object
            the name of the group to get as a DataFrame
        obj : NDFrame, default None
            the NDFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used

        Returns
        -------
        group : type of obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        return obj.take(inds, axis=self.axis, convert=False)
    def __iter__(self):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        return self.grouper.get_iterator(self.obj, axis=self.axis)
    def apply(self, func, *args, **kwargs):
        """
        Apply function and combine results together in an intelligent way. The
        split-apply-combine combination rules attempt to be as common sense
        based as possible. For example:

        case 1:
        group DataFrame
        apply aggregation function (f(chunk) -> Series)
        yield DataFrame, with group axis having group labels

        case 2:
        group DataFrame
        apply transform function ((f(chunk) -> DataFrame with same indexes)
        yield DataFrame with resulting chunks glued together

        case 3:
        group Series
        apply function with f(chunk) -> DataFrame
        yield DataFrame with result of chunks glued together

        Parameters
        ----------
        func : function

        Notes
        -----
        See online documentation for full exposition on how to use apply.

        In the current implementation apply calls func twice on the
        first group to decide whether it can take a fast or slow code
        path. This can lead to unexpected behavior if func has
        side-effects, as they will take effect twice for the first
        group.

        See also
        --------
        aggregate, transform

        Returns
        -------
        applied : type depending on grouped object and function
        """
        # swap numpy funcs (np.sum etc.) for their pandas equivalents
        func = _intercept_function(func)

        @wraps(func)
        def f(g):
            return func(g, *args, **kwargs)

        return self._python_apply_general(f)

    def _python_apply_general(self, f):
        # run f over each group and let the subclass assemble the output
        keys, values, mutated = self.grouper.apply(f, self._selected_obj,
                                                   self.axis)

        return self._wrap_applied_output(keys, values,
                                         not_indexed_same=mutated)
    def aggregate(self, func, *args, **kwargs):
        # implemented by subclasses
        raise NotImplementedError

    @Appender(_agg_doc)
    def agg(self, func, *args, **kwargs):
        # alias for aggregate
        return self.aggregate(func, *args, **kwargs)

    def _iterate_slices(self):
        # yield (name, values) pairs to aggregate; overridden for frames
        yield self.name, self._selected_obj

    def transform(self, func, *args, **kwargs):
        # implemented by subclasses
        raise NotImplementedError
    def mean(self):
        """
        Compute mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('mean')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            # fall back to a pure-python aggregation
            self._set_selection_from_grouper()
            f = lambda x: x.mean(axis=self.axis)
            return self._python_agg_general(f)

    def median(self):
        """
        Compute median of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('median')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            # fall back to a pure-python aggregation
            self._set_selection_from_grouper()

            def f(x):
                if isinstance(x, np.ndarray):
                    x = Series(x)
                return x.median(axis=self.axis)
            return self._python_agg_general(f)
    def std(self, ddof=1):
        """
        Compute standard deviation of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        # todo, implement at cython level?
        return np.sqrt(self.var(ddof=ddof))

    def var(self, ddof=1):
        """
        Compute variance of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        # only the default ddof has a cython fast path
        if ddof == 1:
            return self._cython_agg_general('var')
        else:
            self._set_selection_from_grouper()
            f = lambda x: x.var(ddof=ddof)
            return self._python_agg_general(f)

    def sem(self, ddof=1):
        """
        Compute standard error of the mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self.std(ddof=ddof) / np.sqrt(self.count())

    def size(self):
        """
        Compute group sizes
        """
        return self.grouper.size()

    # standard aggregations, built by the _groupby_function factory
    sum = _groupby_function('sum', 'add', np.sum)
    prod = _groupby_function('prod', 'prod', np.prod)
    min = _groupby_function('min', 'min', np.min, numeric_only=False)
    max = _groupby_function('max', 'max', np.max, numeric_only=False)
    first = _groupby_function('first', 'first', _first_compat,
                              numeric_only=False, _convert=True)
    last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                             _convert=True)
    _count = _groupby_function('_count', 'count', _count_compat,
                               numeric_only=False)

    def count(self, axis=0):
        # NOTE(review): the `axis` argument is accepted but ignored here —
        # confirm against callers before relying on it
        return self._count().astype('int64')
    def ohlc(self):
        """
        Compute open, high, low and close values of a group, excluding
        missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))
    def nth(self, n, dropna=None):
        """
        Take the nth row from each group.

        If dropna, will not show nth non-null row, dropna is either
        Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent
        to calling dropna(how=dropna) before the groupby.

        Examples
        --------
        >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
           A   B
        0  1 NaN
        2  5   6
        >>> g.nth(1)
           A  B
        1  1  4
        >>> g.nth(-1)
           A  B
        1  1  4
        2  5  6
        >>> g.nth(0, dropna='any')
           B
        A
        1  4
        5  6
        >>> g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
            B
        A
        1 NaN
        5 NaN
        """
        self._set_selection_from_grouper()
        if not dropna:  # good choice
            m = self.grouper._max_groupsize
            # out-of-bounds n can never match any row
            if n >= m or n < -m:
                return self._selected_obj.loc[[]]

            # boolean mask selecting exactly the nth row of each group
            rng = np.zeros(m, dtype=bool)
            if n >= 0:
                rng[n] = True
                is_nth = self._cumcount_array(rng)
            else:
                rng[- n - 1] = True
                is_nth = self._cumcount_array(rng, ascending=False)

            result = self._selected_obj[is_nth]

            # the result index
            if self.as_index:
                ax = self.obj._info_axis
                names = self.grouper.names
                if all([n in ax for n in names]):
                    result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
                elif self._group_selection is not None:
                    result.index = self.obj._get_axis(self.axis)[is_nth]

                result = result.sort_index()

            return result

        if (isinstance(self._selected_obj, DataFrame)
                and dropna not in ['any', 'all']):
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError("For a DataFrame groupby, dropna must be "
                             "either None, 'any' or 'all', "
                             "(was passed %s)." % (dropna),)

        # old behaviour, but with all and any support for DataFrames.

        # negative n is counted from the end of the group
        max_len = n if n >= 0 else - 1 - n

        def picker(x):
            x = x.dropna(how=dropna)  # Note: how is ignored if Series
            if len(x) <= max_len:
                return np.nan
            else:
                return x.iloc[n]

        return self.agg(picker)
    def cumcount(self, **kwargs):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Example
        -------
        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """
        self._set_selection_from_grouper()
        ascending = kwargs.pop('ascending', True)

        index = self._selected_obj.index
        cumcounts = self._cumcount_array(ascending=ascending)
        return Series(cumcounts, index)
    def head(self, n=5):
        """
        Returns first n rows of each group.

        Essentially equivalent to ``.apply(lambda x: x.head(n))``,
        except ignores as_index flag.

        Example
        -------
        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        """
        obj = self._selected_obj
        # rows whose within-group position is < n
        in_head = self._cumcount_array() < n
        head = obj[in_head]
        return head
    def tail(self, n=5):
        """
        Returns last n rows of each group

        Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
        except ignores as_index flag.

        Example
        -------
        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).tail(1)
           A  B
        1  1  4
        2  5  6
        >>> df.groupby('A').tail(1)
           A  B
        1  1  4
        2  5  6
        """
        obj = self._selected_obj
        # reversed within-group position: 0, -1, -2, ... from the group's end
        rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
        in_tail = self._cumcount_array(rng, ascending=False) > -n
        tail = obj[in_tail]
        return tail
    def _cumcount_array(self, arr=None, **kwargs):
        """
        arr is where cumcount gets its values from

        Scatter arr's leading values into each group's positions, either
        front-to-back (ascending) or back-to-front.
        """
        ascending = kwargs.pop('ascending', True)

        if arr is None:
            arr = np.arange(self.grouper._max_groupsize, dtype='int64')

        len_index = len(self._selected_obj.index)
        cumcounts = np.empty(len_index, dtype=arr.dtype)

        if ascending:
            for v in self.indices.values():
                cumcounts[v] = arr[:len(v)]
        else:
            for v in self.indices.values():
                cumcounts[v] = arr[len(v) - 1::-1]
        return cumcounts
    def _index_with_as_index(self, b):
        """
        Take boolean mask of index to be returned from apply, if as_index=True

        Builds a MultiIndex combining the group levels with the original
        index levels, restricted to the rows selected by mask ``b``.
        """
        # TODO perf, it feels like this should already be somewhere...
        from itertools import chain
        original = self._selected_obj.index
        gp = self.grouper
        levels = chain((gp.levels[i][gp.labels[i][b]]
                        for i in range(len(gp.groupings))),
                       (original.get_level_values(i)[b]
                        for i in range(original.nlevels)))
        new = MultiIndex.from_arrays(list(levels))
        new.names = gp.names + original.names
        return new
    def _try_cast(self, result, obj):
        """
        try to cast the result to our obj original type,
        we may have roundtripped thru object in the mean-time
        """
        if obj.ndim > 1:
            dtype = obj.values.dtype
        else:
            dtype = obj.dtype

        # scalars are left untouched
        if not np.isscalar(result):
            result = _possibly_downcast_to_dtype(result, dtype)

        return result
    def _cython_agg_general(self, how, numeric_only=True):
        """Aggregate each slice with the cython kernel named ``how``."""
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.aggregate(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_aggregated_output(output, names)
    def _python_agg_general(self, func, *args, **kwargs):
        """Aggregate each slice in pure python via grouper.agg_series."""
        func = _intercept_function(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output = {}
        for name, obj in self._iterate_slices():
            try:
                result, counts = self.grouper.agg_series(obj, f)
                output[name] = self._try_cast(result, obj)
            except TypeError:
                # slice not aggregatable with f; skip it
                continue

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:
            # NOTE(review): `counts` comes from the last successful loop
            # iteration above — verify all slices share the same counts
            mask = counts.ravel() > 0
            for name, result in compat.iteritems(output):

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = com.ensure_float(values)

                output[name] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)
    def _wrap_applied_output(self, *args, **kwargs):
        # implemented by subclasses
        raise NotImplementedError

    def _concat_objects(self, keys, values, not_indexed_same=False):
        """Glue per-group results back together along the group axis."""
        from pandas.tools.merge import concat

        if not not_indexed_same:
            # results kept the original index: restore the original order
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:
                result = result.reindex_axis(ax, axis=self.axis)
        elif self.group_keys:
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names
                result = concat(values, axis=self.axis, keys=group_keys,
                                levels=group_levels, names=group_names)
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            result = concat(values, axis=self.axis)

        return result
    def _apply_filter(self, indices, dropna):
        """Select the rows at ``indices``; with dropna=False the dropped
        rows are kept as NaN instead of being removed."""
        if len(indices) == 0:
            indices = []
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered
@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
    """Dispatch to the GroupBy subclass matching the type of ``obj``."""
    if isinstance(obj, Series):
        cls = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        cls = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError('invalid type: %s' % type(obj))

    return cls(obj, by, **kwds)
- def _get_axes(group):
- if isinstance(group, Series):
- return [group.index]
- else:
- return group.axes
- def _is_indexed_like(obj, axes):
- if isinstance(obj, Series):
- if len(axes) > 1:
- return False
- return obj.index.equals(axes[0])
- elif isinstance(obj, DataFrame):
- return obj.index.equals(axes[0])
- return False
- class BaseGrouper(object):
- """
- This is an internal Grouper class, which actually holds the generated groups
- """
- def __init__(self, axis, groupings, sort=True, group_keys=True):
- self.axis = axis
- self.groupings = groupings
- self.sort = sort
- self.group_keys = group_keys
- self.compressed = True
- @property
- def shape(self):
- return tuple(ping.ngroups for ping in self.groupings)
- def __iter__(self):
- return iter(self.indices)
- @property
- def nkeys(self):
- return len(self.groupings)
- def get_iterator(self, data, axis=0):
- """
- Groupby iterator
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- splitter = self._get_splitter(data, axis=axis)
- keys = self._get_group_keys()
- for key, (i, group) in zip(keys, splitter):
- yield key, group
- def _get_splitter(self, data, axis=0):
- comp_ids, _, ngroups = self.group_info
- return get_splitter(data, comp_ids, ngroups, axis=axis)
- def _get_group_keys(self):
- if len(self.groupings) == 1:
- return self.levels[0]
- else:
- comp_ids, _, ngroups = self.group_info
- # provide "flattened" iterator for multi-group setting
- mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
- return [mapper.get_key(i) for i in range(ngroups)]
- def apply(self, f, data, axis=0):
- mutated = False
- splitter = self._get_splitter(data, axis=axis)
- group_keys = self._get_group_keys()
- # oh boy
- if (f.__name__ not in _plotting_methods and
- hasattr(splitter, 'fast_apply') and axis == 0):
- try:
- values, mutated = splitter.fast_apply(f, group_keys)
- return group_keys, values, mutated
- except (lib.InvalidApply):
- # we detect a mutation of some kind
- # so take slow path
- pass
- except (Exception) as e:
- # raise this error to the caller
- pass
- result_values = []
- for key, (i, group) in zip(group_keys, splitter):
- object.__setattr__(group, 'name', key)
- # group might be modified
- group_axes = _get_axes(group)
- res = f(group)
- if not _is_indexed_like(res, group_axes):
- mutated = True
- result_values.append(res)
- return group_keys, result_values, mutated
- @cache_readonly
- def indices(self):
- """ dict {group name -> group indices} """
- if len(self.groupings) == 1:
- return self.groupings[0].indices
- else:
- label_list = [ping.labels for ping in self.groupings]
- keys = [ping.group_index for ping in self.groupings]
- return _get_indices_dict(label_list, keys)
- @property
- def labels(self):
- return [ping.labels for ping in self.groupings]
- @property
- def levels(self):
- return [ping.group_index for ping in self.groupings]
- @property
- def names(self):
- return [ping.name for ping in self.groupings]
- def size(self):
- """
- Compute group sizes
- """
- # TODO: better impl
- labels, _, ngroups = self.group_info
- bin_counts = algos.value_counts(labels, sort=False)
- bin_counts = bin_counts.reindex(np.arange(ngroups))
- bin_counts.index = self.result_index
- return bin_counts
- @cache_readonly
- def _max_groupsize(self):
- '''
- Compute size of largest group
- '''
- # For many items in each group this is much faster than
- # self.size().max(), in worst case marginally slower
- if self.indices:
- return max(len(v) for v in self.indices.values())
- else:
- return 0
- @cache_readonly
- def groups(self):
- """ dict {group name -> group labels} """
- if len(self.groupings) == 1:
- return self.groupings[0].groups
- else:
- to_groupby = lzip(*(ping.grouper for ping in self.groupings))
- to_groupby = Index(to_groupby)
- return self.axis.groupby(to_groupby.values)
- @cache_readonly
- def group_info(self):
- comp_ids, obs_group_ids = self._get_compressed_labels()
- ngroups = len(obs_group_ids)
- comp_ids = com._ensure_int64(comp_ids)
- return comp_ids, obs_group_ids, ngroups
- def _get_compressed_labels(self):
- all_labels = [ping.labels for ping in self.groupings]
- if self._overflow_possible:
- tups = lib.fast_zip(all_labels)
- labs, uniques = algos.factorize(tups)
- if self.sort:
- uniques, labs = _reorder_by_uniques(uniques, labs)
- return labs, uniques
- else:
- if len(all_labels) > 1:
- group_index = get_group_index(all_labels, self.shape)
- comp_ids, obs_group_ids = _compress_group_index(group_index)
- else:
- ping = self.groupings[0]
- comp_ids = ping.labels
- obs_group_ids = np.arange(len(ping.group_index))
- self.compressed = False
- self._filter_empty_groups = False
- return comp_ids, obs_group_ids
    @cache_readonly
    def _overflow_possible(self):
        # True when the product of per-grouping level sizes could overflow
        # int64; _get_compressed_labels then factorizes label tuples
        # instead of building a flat group index.
        return _int64_overflow_possible(self.shape)
    @cache_readonly
    def ngroups(self):
        # number of observed groups, i.e. length of the result index
        return len(self.result_index)
- @cache_readonly
- def result_index(self):
- recons = self.get_group_levels()
- return MultiIndex.from_arrays(recons, names=self.names)
- def get_group_levels(self):
- obs_ids = self.group_info[1]
- if not self.compressed and len(self.groupings) == 1:
- return [self.groupings[0].group_index]
- if self._overflow_possible:
- recons_labels = [np.array(x) for x in zip(*obs_ids)]
- else:
- recons_labels = decons_group_index(obs_ids, self.shape)
- name_list = []
- for ping, labels in zip(self.groupings, recons_labels):
- labels = com._ensure_platform_int(labels)
- name_list.append(ping.group_index.take(labels))
- return name_list
    #------------------------------------------------------------
    # Aggregation functions

    # Map of aggregation kind -> Cython kernel name. A dict value gives
    # the kernel name plus an optional 'f' wrapper used by
    # _get_aggregate_function to curry extra arguments into the kernel
    # (e.g. 'first' calls group_nth with n=1).
    _cython_functions = {
        'add': 'group_add',
        'prod': 'group_prod',
        'min': 'group_min',
        'max': 'group_max',
        'mean': 'group_mean',
        'median': {
            'name': 'group_median'
        },
        'var': 'group_var',
        'first': {
            'name': 'group_nth',
            'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
        },
        'last': 'group_last',
        'count': 'group_count',
    }

    # number of output columns produced per input column by a kernel
    _cython_arity = {
        'ohlc': 4,  # OHLC
    }

    # optional per-kind callables producing output names (see aggregate)
    _name_functions = {}

    # drop groups with zero count from Cython aggregation results
    _filter_empty_groups = True
- def _get_aggregate_function(self, how, values):
- dtype_str = values.dtype.name
- def get_func(fname):
- # find the function, or use the object function, or return a
- # generic
- for dt in [dtype_str, 'object']:
- f = getattr(_algos, "%s_%s" % (fname, dtype_str), None)
- if f is not None:
- return f
- return getattr(_algos, fname, None)
- ftype = self._cython_functions[how]
- if isinstance(ftype, dict):
- func = afunc = get_func(ftype['name'])
- # a sub-function
- f = ftype.get('f')
- if f is not None:
- def wrapper(*args, **kwargs):
- return f(afunc, *args, **kwargs)
- # need to curry our sub-function
- func = wrapper
- else:
- func = get_func(ftype)
- if func is None:
- raise NotImplementedError("function is not implemented for this"
- "dtype: [how->%s,dtype->%s]" %
- (how, dtype_str))
- return func, dtype_str
    def aggregate(self, values, how, axis=0):
        """
        Aggregate ``values`` group-wise using the Cython kernel for ``how``.

        Parameters
        ----------
        values : ndarray (1, 2 or 3 dimensional)
        how : str
            Aggregation kind; a key of ``_cython_functions``.
        axis : int, default 0
            Axis holding the values to aggregate; swapped to the front
            for the kernel and swapped back afterwards.

        Returns
        -------
        (result, names) : aggregated ndarray and optional output names
            produced by ``_name_functions[how]`` (None otherwise).
        """
        # some kernels (e.g. ohlc) emit more than one output per column
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            # kernels operate on 2-d data; add a dummy column axis
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError
            out_shape = (self.ngroups,) + values.shape[1:]

        if is_numeric_dtype(values.dtype):
            values = com.ensure_float(values)
            is_numeric = True
            out_dtype = 'f%d' % values.dtype.itemsize
        else:
            # datetime64/timedelta64 are aggregated via their int64 view
            is_numeric = issubclass(values.dtype.type, (np.datetime64,
                                                        np.timedelta64))
            if is_numeric:
                out_dtype = 'float64'
                values = values.view('int64')
            else:
                out_dtype = 'object'
                values = values.astype(object)

        # will be filled in Cython function
        result = np.empty(out_shape, dtype=out_dtype)
        result.fill(np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)

        result = self._aggregate(result, counts, values, how, is_numeric)

        if self._filter_empty_groups:
            # drop rows for groups that received no values; the object
            # path raises ValueError from row_bool_subset, hence fallback
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        result, (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        # undo the dummy column axis added for 1-d input
        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
- def _aggregate(self, result, counts, values, how, is_numeric):
- agg_func, dtype = self._get_aggregate_function(how, values)
- comp_ids, _, ngroups = self.group_info
- if values.ndim > 3:
- # punting for now
- raise NotImplementedError
- elif values.ndim > 2:
- for i, chunk in enumerate(values.transpose(2, 0, 1)):
- chunk = chunk.squeeze()
- agg_func(result[:, :, i], counts, chunk, comp_ids)
- else:
- agg_func(result, counts, values, comp_ids)
- return result
    def agg_series(self, obj, func):
        """
        Apply ``func`` to each group of Series ``obj``.

        Tries the fast Cython SeriesGrouper path first and falls back to
        the pure-python path on any failure (e.g. an incompatible index
        raising TypeError in _aggregate_series_fast).
        """
        try:
            return self._aggregate_series_fast(obj, func)
        except Exception:
            return self._aggregate_series_pure_python(obj, func)
    def _aggregate_series_fast(self, obj, func):
        """
        Aggregate Series ``obj`` per group via the Cython SeriesGrouper.

        Returns (result, counts). Raises TypeError when the index has
        complex internals that the Cython path cannot handle.
        """
        func = _intercept_function(func)

        if obj.index._has_complex_internals:
            raise TypeError('Incompatible index for Cython grouper')

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0)).to_dense()
        # sort by group label so every group is a contiguous slice
        indexer = _algos.groupsort_indexer(group_index, ngroups)[0]
        obj = obj.take(indexer, convert=False)
        group_index = com.take_nd(group_index, indexer, allow_fill=False)

        grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
                                    dummy)
        result, counts = grouper.get_result()
        return result, counts
- def _aggregate_series_pure_python(self, obj, func):
- group_index, _, ngroups = self.group_info
- counts = np.zeros(ngroups, dtype=int)
- result = None
- splitter = get_splitter(obj, group_index, ngroups, axis=self.axis)
- for label, group in splitter:
- res = func(group)
- if result is None:
- if (isinstance(res, (Series, np.ndarray)) or
- isinstance(res, list)):
- raise ValueError('Function does not reduce')
- result = np.empty(ngroups, dtype='O')
- counts[label] = group.shape[0]
- result[label] = res
- result = lib.maybe_convert_objects(result, try_float=0)
- return result, counts
def generate_bins_generic(values, binner, closed):
    """
    Generate bin edge offsets and bin labels for one array using another array
    which has bin edge values. Both arrays must be sorted.

    Parameters
    ----------
    values : array of values
    binner : a comparable array of values representing bins into which to bin
        the first array. Note, 'values' end-points must fall within 'binner'
        end-points.
    closed : which end of bin is closed; left (default), right

    Returns
    -------
    bins : array of offsets (into 'values' argument) of bins.
        Zero and last edge are excluded in result, so for instance the first
        bin is values[0:bin[0]] and the last is values[bin[-1]:]
    """
    n_values = len(values)
    n_edges = len(binner)

    if n_values <= 0 or n_edges <= 0:
        raise ValueError("Invalid length for values or for binner")

    # check binner fits data
    if values[0] < binner[0]:
        raise ValueError("Values falls before first bin")
    if values[n_values - 1] > binner[n_edges - 1]:
        raise ValueError("Values falls after last bin")

    bins = np.empty(n_edges - 1, dtype=np.int64)

    # linear scan, presume nothing about values/binner except that it fits ok
    j = 0  # index into values
    for bc, right_edge in enumerate(binner[1:]):
        # advance j past every value belonging to the current bin
        while j < n_values and (values[j] < right_edge or
                                (closed == 'right' and
                                 values[j] == right_edge)):
            j += 1
        bins[bc] = j

    return bins
- class BinGrouper(BaseGrouper):
    def __init__(self, bins, binlabels, filter_empty=False):
        # bins: right-edge offsets into the data (cf. generate_bins_generic)
        # binlabels: one label per bin; NaT-labelled bins are skipped when
        #   building .groups and when iterating
        # filter_empty: whether Cython aggregation drops empty groups
        self.bins = com._ensure_int64(bins)
        self.binlabels = _ensure_index(binlabels)
        self._filter_empty_groups = filter_empty
- @cache_readonly
- def groups(self):
- """ dict {group name -> group labels} """
- # this is mainly for compat
- # GH 3881
- result = {}
- for key, value in zip(self.binlabels, self.bins):
- if key is not tslib.NaT:
- result[key] = value
- return result
    @property
    def nkeys(self):
        # a binned grouping always behaves as a single grouping key
        return 1
- def get_iterator(self, data, axis=0):
- """
- Groupby iterator
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- if isinstance(data, NDFrame):
- slicer = lambda start,edge: data._slice(slice(start,edge),axis=axis)
- length = len(data.axes[axis])
- else:
- slicer = lambda start,edge: data[slice(start,edge)]
- length = len(data)
- start = 0
- for edge, label in zip(self.bins, self.binlabels):
- if label is not tslib.NaT:
- yield label, slicer(start,edge)
- start = e…
Large files files are truncated, but you can click here to view the full file