# /pandas/core/groupby.py
# from http://github.com/wesm/pandas
# Possible license(s): BSD-3-Clause, Apache-2.0

import types
from functools import wraps
import numpy as np
import datetime
import collections
import warnings
import copy

from pandas.compat import (
    zip, range, long, lzip,
    callable, map
)
from pandas import compat
from pandas.compat.numpy import function as nv
from pandas.compat.numpy import _np_version_under1p8
from pandas.types.common import (_DATELIKE_DTYPES,
                                 is_numeric_dtype,
                                 is_timedelta64_dtype, is_datetime64_dtype,
                                 is_categorical_dtype,
                                 is_datetime_or_timedelta_dtype,
                                 is_bool, is_integer_dtype,
                                 is_complex_dtype,
                                 is_bool_dtype,
                                 is_scalar,
                                 _ensure_float64,
                                 _ensure_platform_int,
                                 _ensure_int64,
                                 _ensure_object,
                                 _ensure_float)
from pandas.types.cast import _possibly_downcast_to_dtype
from pandas.types.missing import isnull, notnull, _maybe_fill

from pandas.core.common import _values_from_object, AbstractMethodError
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
                              DataError, SpecificationError)
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.index import (Index, MultiIndex, CategoricalIndex,
                               _ensure_index)
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
from pandas.core.panel import Panel
from pandas.util.decorators import (cache_readonly, Substitution, Appender,
                                    make_signature, deprecate_kwarg)
from pandas.formats.printing import pprint_thing
from pandas.util.validators import validate_kwargs

import pandas.core.algorithms as algos
import pandas.core.common as com
from pandas.core.config import option_context
import pandas.lib as lib
from pandas.lib import Timestamp
import pandas.tslib as tslib
import pandas.algos as _algos
import pandas.hashtable as _hash

_doc_template = """

See also
--------
pandas.Series.%(name)s
pandas.DataFrame.%(name)s
pandas.Panel.%(name)s
"""

# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
_plotting_methods = frozenset(['plot', 'boxplot', 'hist'])

_common_apply_whitelist = frozenset([
    'last', 'first',
    'head', 'tail', 'median',
    'mean', 'sum', 'min', 'max',
    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
    'resample',
    'describe',
    'rank', 'quantile',
    'fillna',
    'mad',
    'any', 'all',
    'take',
    'idxmax', 'idxmin',
    'shift', 'tshift',
    'ffill', 'bfill',
    'pct_change', 'skew',
    'corr', 'cov', 'diff',
]) | _plotting_methods

_series_apply_whitelist = \
    (_common_apply_whitelist - set(['boxplot'])) | \
    frozenset(['dtype', 'unique'])

_dataframe_apply_whitelist = \
    _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])

_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])


def _groupby_function(name, alias, npfunc, numeric_only=True,
                      _convert=False):

    _local_template = "Compute %(f)s of group values"

    @Substitution(name='groupby', f=name)
    @Appender(_doc_template)
    @Appender(_local_template)
    def f(self):
        self._set_group_selection()
        try:
            return self._cython_agg_general(alias, numeric_only=numeric_only)
        except AssertionError as e:
            raise SpecificationError(str(e))
        except Exception:
            result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
            if _convert:
                result = result._convert(datetime=True)
            return result

    f.__name__ = name

    return f

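
# Illustrative sketch (not part of the original module): how the factory above
# is used. GroupBy.sum/prod/min/max/first/last further down are all generated
# this way; each first tries the Cython kernel named by ``alias`` and falls
# back to aggregating with ``npfunc``. The toy frame below is hypothetical.
def _example_generated_reduction():
    import pandas as pd
    df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1.0, 2.0, 4.0]})
    # GroupBy.sum is created via _groupby_function('sum', 'add', np.sum)
    return df.groupby('key')['val'].sum()  # a -> 3.0, b -> 4.0
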

def _first_compat(x, axis=0):
    def _first(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[0]

    if isinstance(x, DataFrame):
        return x.apply(_first, axis=axis)
    else:
        return _first(x)


def _last_compat(x, axis=0):
    def _last(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[-1]

    if isinstance(x, DataFrame):
        return x.apply(_last, axis=axis)
    else:
        return _last(x)

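
# Illustrative sketch (not part of the original module): _first_compat and
# _last_compat pick the first/last *non-null* value rather than the purely
# positional [0]/[-1]. The toy Series is hypothetical.
def _example_first_last_compat():
    s = Series([np.nan, 1.0, 2.0, np.nan])
    return _first_compat(s), _last_compat(s)  # (1.0, 2.0)
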

class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction for a target
    object

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    These are local specifications and will override 'global' settings,
    that is the parameters axis and level which are passed to the groupby
    itself.

    Parameters
    ----------
    key : string, defaults to None
        groupby key, which selects the grouping column of the target
    level : name/number, defaults to None
        the level for the target index
    freq : string / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see
        `here <http://pandas.pydata.org/pandas-docs/stable/timeseries.html>`_.
    axis : number/name of the axis, defaults to 0
    sort : boolean, default to False
        whether to sort the resulting labels

    additional kwargs to control time-like groupers (when freq is passed)

    closed : closed end of interval; left or right
    label : interval boundary to use for labeling; left or right
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------

    Syntactic sugar for ``df.groupby('A')``

    >>> df.groupby(Grouper(key='A'))

    Specify a resample operation on the column 'date'

    >>> df.groupby(Grouper(key='date', freq='60s'))

    Specify a resample operation on the level 'date' on the columns axis
    with a frequency of 60s

    >>> df.groupby(Grouper(level='date', freq='60s', axis=1))
    """

    def __new__(cls, *args, **kwargs):
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper

        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None

    @property
    def ax(self):
        return self.grouper

    def _get_grouper(self, obj):
        """
        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        self.grouper, exclusions, self.obj = _get_grouper(self.obj,
                                                          [self.key],
                                                          axis=self.axis,
                                                          level=self.level,
                                                          sort=self.sort)
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj, sort=False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : the subject object
        """

        if self.key is not None and self.level is not None:
            raise ValueError(
                "The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax.get_level_values(level),
                               name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(
                            "The level {0} is not valid".format(level))

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            # use stable sort to support first, last, nth
            indexer = self.indexer = ax.argsort(kind='mergesort')
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis,
                           convert=False, is_copy=False)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    def _get_binner_for_grouping(self, obj):
        """ default to the standard binner here """
        group_axis = obj._get_axis(self.axis)
        return Grouping(group_axis, None, obj=obj, name=self.key,
                        level=self.level, sort=self.sort, in_axis=False)

    @property
    def groups(self):
        return self.grouper.groups

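
# Illustrative sketch (not part of the original module): passing freq makes
# __new__ above dispatch to TimeGrouper, so a time-based grouping is roughly
# equivalent to a resample. Frame and column names here are hypothetical.
def _example_time_grouper():
    import pandas as pd
    df = pd.DataFrame({'date': pd.date_range('2016-01-01', periods=4,
                                             freq='30s'),
                       'val': [1, 2, 3, 4]})
    # roughly df.set_index('date').resample('60s').sum()
    return df.groupby(Grouper(key='date', freq='60s')).sum()
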

class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects
    """

    def __init__(self, groupby):
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)
        f.__name__ = 'plot'
        return self._groupby.apply(f)

    def __getattr__(self, name):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)
            return self._groupby.apply(f)
        return attr


class _GroupBy(PandasObject, SelectionMixin):
    _group_selection = None
    _apply_whitelist = frozenset([])

    def __init__(self, obj, keys=None, axis=0, level=None,
                 grouper=None, exclusions=None, selection=None, as_index=True,
                 sort=True, group_keys=True, squeeze=False, **kwargs):

        self._selection = selection

        if isinstance(obj, NDFrame):
            obj._consolidate_inplace()

        self.level = level

        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError('as_index=False only valid with DataFrame')
            if axis != 0:
                raise ValueError('as_index=False only valid for axis=0')

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.mutated = kwargs.pop('mutated', False)

        if grouper is None:
            grouper, exclusions, obj = _get_grouper(obj, keys,
                                                    axis=axis,
                                                    level=level,
                                                    sort=sort,
                                                    mutated=self.mutated)

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

        # we accept no other args
        validate_kwargs('group', kwargs, {})

    def __len__(self):
        return len(self.groups)

    def __unicode__(self):
        # TODO: Better unicode/repr for GroupBy object
        return object.__repr__(self)

    def _assure_grouper(self):
        """
        we create the grouper on instantiation
        sub-classes may have a different policy
        """
        pass

    @property
    def groups(self):
        """ dict {group name -> group labels} """
        self._assure_grouper()
        return self.grouper.groups

    @property
    def ngroups(self):
        self._assure_grouper()
        return self.grouper.ngroups

    @property
    def indices(self):
        """ dict {group name -> group indices} """
        self._assure_grouper()
        return self.grouper.indices

    def _get_indices(self, names):
        """
        safe get multiple indices, translate keys for
        datelike to underlying repr
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, (Timestamp, datetime.datetime)):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None     # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = ("must supply a tuple to get_group with multiple"
                       " grouping keys")
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError:
                    # turns out it wasn't a tuple
                    msg = ("must supply a same-length tuple to get_group"
                           " with multiple grouping keys")
                    raise ValueError(msg)

            converters = [get_converter(s) for s in index_sample]
            names = [tuple([f(n) for f, n in zip(converters, name)])
                     for name in names]

        else:
            converter = get_converter(index_sample)
            names = [converter(name) for name in names]

        return [self.indices.get(name, []) for name in names]

    def _get_index(self, name):
        """ safe get index, translate keys for datelike to underlying repr """
        return self._get_indices([name])[0]
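
    # Illustrative note (not in the original source): the converters above let
    # callers pass friendly keys for datelike groups; a hypothetical sketch:
    #
    #   >>> s = Series(range(2), index=pd.to_datetime(['2016-01-01',
    #   ...                                            '2016-01-02']))
    #   >>> s.groupby(s.index).get_group('2016-01-01')
    #
    # the string key is converted with Timestamp(key) before the lookup in
    # the indices dict.
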
    @cache_readonly
    def _selected_obj(self):

        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    def _reset_group_selection(self):
        """
        Clear group based selection. Used for methods needing to return info
        on each group regardless of whether a group selection was previously
        set.
        """
        if self._group_selection is not None:
            self._group_selection = None
            # GH12839 clear cached selection too when changing group selection
            self._reset_cache('_selected_obj')

    def _set_group_selection(self):
        """
        Create group based selection. Used when selection is not passed
        directly but instead via a grouper.
        """
        grp = self.grouper
        if self.as_index and getattr(grp, 'groupings', None) is not None and \
                self.obj.ndim > 1:
            ax = self.obj._info_axis
            groupers = [g.name for g in grp.groupings
                        if g.level is None and g.in_axis]

            if len(groupers):
                self._group_selection = ax.difference(Index(groupers)).tolist()
                # GH12839 clear selected obj cache when group selection changes
                self._reset_cache('_selected_obj')

    def _set_result_index_ordered(self, result):
        # set the result index on the passed values object and
        # return the new object, xref 8046

        # the values/counts are repeated according to the group index
        # shortcut if we have an already ordered grouper
        if not self.grouper.is_monotonic:
            index = Index(np.concatenate(
                self._get_indices(self.grouper.result_index)))
            result.set_axis(self.axis, index)
            result = result.sort_index(axis=self.axis)

        result.set_axis(self.axis, self.obj._get_axis(self.axis))
        return result

    def _dir_additions(self):
        return self.obj._dir_additions() | self._apply_whitelist

    def __getattr__(self, attr):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]
        if hasattr(self.obj, attr):
            return self._make_wrapper(attr)

        raise AttributeError("%r object has no attribute %r" %
                             (type(self).__name__, attr))

    plot = property(GroupByPlot)

    def _make_wrapper(self, name):
        if name not in self._apply_whitelist:
            is_callable = callable(getattr(self._selected_obj, name, None))
            kind = ' callable ' if is_callable else ' '
            msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
                   "using the 'apply' method".format(kind, name,
                                                     type(self).__name__))
            raise AttributeError(msg)

        # need to set up the selection;
        # these are not passed directly but via the grouper
        self._set_group_selection()

        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            return self.apply(lambda self: getattr(self, name))

        f = getattr(type(self._selected_obj), name)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            kwargs_with_axis = kwargs.copy()
            if 'axis' not in kwargs_with_axis or \
                    kwargs_with_axis['axis'] is None:
                kwargs_with_axis['axis'] = self.axis

            def curried_with_axis(x):
                return f(x, *args, **kwargs_with_axis)

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = curried_with_axis.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in _plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried_with_axis)
            except Exception:
                try:
                    return self.apply(curried)
                except Exception:

                    # related to GH3688: try item-by-item. This can be called
                    # recursively, so we need to raise ValueError when the
                    # method is missing, to signal the aggregation code to
                    # mark this column as an error
                    try:
                        return self._aggregate_item_by_item(name,
                                                            *args, **kwargs)
                    except (AttributeError):
                        raise ValueError

        return wrapper

    def get_group(self, name, obj=None):
        """
        Constructs NDFrame from group with provided name

        Parameters
        ----------
        name : object
            the name of the group to get as a DataFrame
        obj : NDFrame, default None
            the NDFrame to take the DataFrame out of.  If
            it is None, the object groupby was called on will
            be used

        Returns
        -------
        group : type of obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        return obj.take(inds, axis=self.axis, convert=False)
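
    # Illustrative note (not in the original source), with a hypothetical
    # frame:
    #
    #   >>> df = DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})
    #   >>> df.groupby('A').get_group('a')
    #      A  B
    #   0  a  1
    #   1  a  2
    #
    # the group's positional indices come from _get_index and are pulled out
    # with .take along self.axis.
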
    def __iter__(self):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        return self.grouper.get_iterator(self.obj, axis=self.axis)

    @Substitution(name='groupby')
    def apply(self, func, *args, **kwargs):
        """
        Apply function and combine results together in an intelligent way.
        The split-apply-combine combination rules attempt to be as common
        sense based as possible. For example:

        case 1:
        group DataFrame
        apply aggregation function (f(chunk) -> Series)
        yield DataFrame, with group axis having group labels

        case 2:
        group DataFrame
        apply transform function (f(chunk) -> DataFrame with same indexes)
        yield DataFrame with resulting chunks glued together

        case 3:
        group Series
        apply function with f(chunk) -> DataFrame
        yield DataFrame with result of chunks glued together

        Parameters
        ----------
        func : function

        Notes
        -----
        See online documentation for full exposition on how to use apply.

        In the current implementation apply calls func twice on the
        first group to decide whether it can take a fast or slow code
        path. This can lead to unexpected behavior if func has
        side-effects, as they will take effect twice for the first
        group.

        See also
        --------
        aggregate, transform"""
        func = self._is_builtin_func(func)

        # this is needed so we don't try and wrap strings. If we could
        # resolve functions to their callable functions prior, this
        # wouldn't be needed
        if args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    return func(g, *args, **kwargs)
            else:
                raise ValueError('func must be a callable if args or '
                                 'kwargs are supplied')
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context('mode.chained_assignment', None):
            return self._python_apply_general(f)
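
    # Illustrative note (not in the original source): because apply evaluates
    # func on the first group twice (fast-path detection), side effects fire
    # twice for that group. A hypothetical sketch:
    #
    #   >>> seen = []
    #   >>> df.groupby('A').apply(lambda g: seen.append(g.name) or g.sum())
    #   >>> seen  # the first group's name appears twice
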
    def _python_apply_general(self, f):
        keys, values, mutated = self.grouper.apply(f, self._selected_obj,
                                                   self.axis)

        return self._wrap_applied_output(
            keys,
            values,
            not_indexed_same=mutated or self.mutated)

    def _iterate_slices(self):
        yield self.name, self._selected_obj

    def transform(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    def _cumcount_array(self, ascending=True):
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Note
        ----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = _get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev].astype(np.int64, copy=False)
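
    # Illustrative walk-through (not in the original source) of the vectorized
    # counting above, for sorted ids = [0, 0, 1, 1, 1]:
    #
    #   run = [True, False, True, False, False]    # True at each group start
    #   rep = [2, 3]                               # group lengths
    #   out = (~run).cumsum() = [0, 1, 1, 2, 3]
    #   out -= np.repeat(out[run], rep)            # -> [0, 1, 0, 1, 2]
    #
    # i.e. a per-group 0..len(group)-1 counter, mapped back to the original
    # row order through rev.
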
    def _index_with_as_index(self, b):
        """
        Take boolean mask of index to be returned from apply, if as_index=True
        """
        # TODO perf, it feels like this should already be somewhere...
        from itertools import chain
        original = self._selected_obj.index
        gp = self.grouper
        levels = chain((gp.levels[i][gp.labels[i][b]]
                        for i in range(len(gp.groupings))),
                       (original.get_level_values(i)[b]
                        for i in range(original.nlevels)))
        new = MultiIndex.from_arrays(list(levels))
        new.names = gp.names + original.names
        return new

    def _try_cast(self, result, obj):
        """
        try to cast the result to our obj original type,
        we may have roundtripped thru object in the mean-time
        """
        if obj.ndim > 1:
            dtype = obj.values.dtype
        else:
            dtype = obj.dtype

        if not is_scalar(result):
            result = _possibly_downcast_to_dtype(result, dtype)

        return result

    def _cython_transform(self, how, numeric_only=True):
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.transform(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_transformed_output(output, names)

    def _cython_agg_general(self, how, numeric_only=True):
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.aggregate(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_aggregated_output(output, names)

    def _python_agg_general(self, func, *args, **kwargs):
        func = self._is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns", excluding exclusions, to populate the
        # output dict
        output = {}
        for name, obj in self._iterate_slices():
            try:
                result, counts = self.grouper.agg_series(obj, f)
                output[name] = self._try_cast(result, obj)
            except TypeError:
                continue

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:

            mask = counts.ravel() > 0
            for name, result in compat.iteritems(output):

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = _ensure_float(values)

                output[name] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)

    def _wrap_applied_output(self, *args, **kwargs):
        raise AbstractMethodError(self)

    def _concat_objects(self, keys, values, not_indexed_same=False):
        from pandas.tools.merge import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in values:
                if v is not None:
                    ax = v._get_axis(self.axis)
                    ax._reset_identity()
            return values

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:
                result = result.reindex_axis(ax, axis=self.axis)

        elif self.group_keys:

            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(values, axis=self.axis, keys=group_keys,
                                levels=group_levels, names=group_names)
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        if (isinstance(result, Series) and
                getattr(self, 'name', None) is not None):

            result.name = self.name

        return result

    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = np.array([], dtype='int64')
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered

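
# Illustrative sketch (not part of the original module): _apply_filter above
# backs DataFrameGroupBy.filter. With dropna=True the failing groups' rows are
# dropped; with dropna=False they are left in place as NaN. Toy data is
# hypothetical.
def _example_filter_semantics():
    import pandas as pd
    df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 1, 5]})
    kept = df.groupby('A').filter(lambda g: g['B'].sum() > 4)
    padded = df.groupby('A').filter(lambda g: g['B'].sum() > 4, dropna=False)
    return kept, padded
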

class GroupBy(_GroupBy):
    """
    Class for grouping and aggregating relational data. See aggregate,
    transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : string
        Most users should ignore this

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group.
    So you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups
    """
    _apply_whitelist = _common_apply_whitelist

    def irow(self, i):
        """
        DEPRECATED. Use ``.nth(i)`` instead
        """

        # 10177
        warnings.warn("irow(i) is deprecated. Please use .nth(i)",
                      FutureWarning, stacklevel=2)
        return self.nth(i)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def count(self):
        """Compute count of group, excluding missing values"""

        # defined here for API doc
        raise NotImplementedError

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def mean(self, *args, **kwargs):
        """
        Compute mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        nv.validate_groupby_func('mean', args, kwargs)
        try:
            return self._cython_agg_general('mean')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            self._set_group_selection()
            f = lambda x: x.mean(axis=self.axis)
            return self._python_agg_general(f)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def median(self):
        """
        Compute median of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('median')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover

            self._set_group_selection()

            def f(x):
                if isinstance(x, np.ndarray):
                    x = Series(x)
                return x.median(axis=self.axis)
            return self._python_agg_general(f)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def std(self, ddof=1, *args, **kwargs):
        """
        Compute standard deviation of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex

        Parameters
        ----------
        ddof : integer, default 1
            degrees of freedom
        """

        # TODO: implement at Cython level?
        nv.validate_groupby_func('std', args, kwargs)
        return np.sqrt(self.var(ddof=ddof))

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def var(self, ddof=1, *args, **kwargs):
        """
        Compute variance of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex

        Parameters
        ----------
        ddof : integer, default 1
            degrees of freedom
        """
        nv.validate_groupby_func('var', args, kwargs)
        if ddof == 1:
            return self._cython_agg_general('var')
        else:
            self._set_group_selection()
            f = lambda x: x.var(ddof=ddof)
            return self._python_agg_general(f)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def sem(self, ddof=1):
        """
        Compute standard error of the mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex

        Parameters
        ----------
        ddof : integer, default 1
            degrees of freedom
        """
        return self.std(ddof=ddof) / np.sqrt(self.count())

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def size(self):
        """Compute group sizes"""
        return self.grouper.size()

    sum = _groupby_function('sum', 'add', np.sum)
    prod = _groupby_function('prod', 'prod', np.prod)
    min = _groupby_function('min', 'min', np.min, numeric_only=False)
    max = _groupby_function('max', 'max', np.max, numeric_only=False)
    first = _groupby_function('first', 'first', _first_compat,
                              numeric_only=False, _convert=True)
    last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                             _convert=True)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def ohlc(self):
        """
        Compute open, high, low and close values of a group, excluding
        missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper
        Return a new grouper with our resampler appended
        """
        from pandas.tseries.resample import get_resampler_for_grouping
        return get_resampler_for_grouping(self, rule, *args, **kwargs)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def rolling(self, *args, **kwargs):
        """
        Return a rolling grouper, providing rolling
        functionality per group
        """
        from pandas.core.window import RollingGroupby
        return RollingGroupby(self, *args, **kwargs)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def expanding(self, *args, **kwargs):
        """
        Return an expanding grouper, providing expanding
        functionality per group
        """
        from pandas.core.window import ExpandingGroupby
        return ExpandingGroupby(self, *args, **kwargs)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def pad(self, limit=None):
        """
        Forward fill the values

        Parameters
        ----------
        limit : integer, optional
            limit of how many values to fill

        See Also
        --------
        Series.fillna
        DataFrame.fillna
        """
        return self.apply(lambda x: x.ffill(limit=limit))
    ffill = pad

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def backfill(self, limit=None):
        """
        Backward fill the values

        Parameters
        ----------
        limit : integer, optional
            limit of how many values to fill

        See Also
        --------
        Series.fillna
        DataFrame.fillna
        """
        return self.apply(lambda x: x.bfill(limit=limit))
    bfill = backfill

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def nth(self, n, dropna=None):
        """
        Take the nth row from each group if n is an int, or a subset of rows
        if n is a list of ints.

        If dropna, will take the nth non-null row, dropna is either
        Truthy (if a Series) or 'all', 'any' (if a DataFrame);
        this is equivalent to calling dropna(how=dropna) before the
        groupby.

        Parameters
        ----------
        n : int or list of ints
            a single nth value for the row or a list of nth values
        dropna : None or str, optional
            apply the specified dropna operation before counting which row is
            the nth row. Needs to be None, 'any' or 'all'

        Examples
        --------

        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
             B
        A
        1  NaN
        2  3.0
        >>> g.nth(1)
             B
        A
        1  2.0
        2  5.0
        >>> g.nth(-1)
             B
        A
        1  4.0
        2  5.0
        >>> g.nth([0, 1])
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0

        Specifying ``dropna`` allows count ignoring NaN

        >>> g.nth(0, dropna='any')
             B
        A
        1  2.0
        2  3.0

        NaNs denote group exhausted when using dropna

        >>> g.nth(3, dropna='any')
            B
        A
        1 NaN
        2 NaN

        Specifying ``as_index=False`` in ``groupby`` keeps the original index.

        >>> df.groupby('A', as_index=False).nth(1)
           A    B
        1  1  2.0
        4  2  5.0
        """

        if isinstance(n, int):
            nth_values = [n]
        elif isinstance(n, (set, list, tuple)):
            nth_values = list(set(n))
            if dropna is not None:
                raise ValueError(
                    "dropna option with a list of nth values is not supported")
        else:
            raise TypeError("n needs to be an int or a list/set/tuple of ints")

        nth_values = np.array(nth_values, dtype=np.intp)
        self._set_group_selection()

        if not dropna:
            mask = np.in1d(self._cumcount_array(), nth_values) | \
                np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values)

            out = self._selected_obj[mask]
            if not self.as_index:
                return out

            ids, _, _ = self.grouper.group_info
            out.index = self.grouper.result_index[ids[mask]]

            return out.sort_index() if self.sort else out

        if isinstance(self._selected_obj, DataFrame) and \
                dropna not in ['any', 'all']:
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError("For a DataFrame groupby, dropna must be "
                             "either None, 'any' or 'all', "
                             "(was passed %s)." % (dropna),)

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        max_len = n if n >= 0 else -1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:

            # we don't have the grouper info available
            # (e.g. we have selected out
            # a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]

        else:

            # create a grouper with the original parameters, but on the
            # dropped object
            grouper, _, _ = _get_grouper(dropped, key=self.keys,
                                         axis=self.axis, level=self.level,
                                         sort=self.sort,
                                         mutated=self.mutated)

        grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
        sizes, result = grb.size(), grb.nth(n)
        mask = (sizes < max_len).values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or \
                len(result) == len(self.grouper.result_index):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def cumcount(self, ascending=True):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Examples
        --------

        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """
        self._set_group_selection()

        index = self._selected_obj.index
        cumcounts = self._cumcount_array(ascending=ascending)
        return Series(cumcounts, index)

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def cumprod(self, axis=0, *args, **kwargs):
        """Cumulative product for each group"""
        nv.validate_groupby_func('cumprod', args, kwargs)
        if axis != 0:
            return self.apply(lambda x: x.cumprod(axis=axis))

        return self._cython_transform('cumprod')

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def cumsum(self, axis=0, *args, **kwargs):
        """Cumulative sum for each group"""
        nv.validate_groupby_func('cumsum', args, kwargs)
        if axis != 0:
            return self.apply(lambda x: x.cumsum(axis=axis))

        return self._cython_transform('cumsum')

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def shift(self, periods=1, freq=None, axis=0):
        """
        Shift each group by periods observations

        Parameters
        ----------
        periods : integer, default 1
            number of periods to shift
        freq : frequency string
        axis : axis to shift, default 0
        """

        if freq is not None or axis != 0:
            return self.apply(lambda x: x.shift(periods, freq, axis))

        labels, _, ngroups = self.grouper.group_info

        # filled in by Cython
        indexer = np.zeros_like(labels)
        _algos.group_shift_indexer(indexer, labels, ngroups, periods)

        output = {}
        for name, obj in self._iterate_slices():
            output[name] = algos.take_nd(obj.values, indexer)

        return self._wrap_transformed_output(output)
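
    # Illustrative note (not in the original source): shift operates within
    # each group, so the first row of every group becomes NaN. Hypothetical
    # sketch:
    #
    #   >>> df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})
    #   >>> df.groupby('A')['B'].shift(1)
    #   0    NaN
    #   1    1.0
    #   2    NaN
    #   Name: B, dtype: float64
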
    @Substitution(name='groupby')
    @Appender(_doc_template)
    def head(self, n=5):
        """
        Returns first n rows of each group.

        Essentially equivalent to ``.apply(lambda x: x.head(n))``,
        except ignores as_index flag.

        Examples
        --------

        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        """
        self._reset_group_selection()
        mask = self._cumcount_array() < n
        return self._selected_obj[mask]

    @Substitution(name='groupby')
    @Appender(_doc_template)
    def tail(self, n=5):
        """
        Returns last n rows of each group

        Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
        except ignores as_index flag.

        Examples
        --------

        >>> df = DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
                           columns=['A', 'B'])
        >>> df.groupby('A').tail(1)
           A  B
        1  a  2
        3  b  2
        >>> df.groupby('A').head(1)
           A  B
        0  a  1
        2  b  1
        """
        self._reset_group_selection()
        mask = self._cumcount_array(ascending=False) < n
        return self._selected_obj[mask]


@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
    if isinstance(obj, Series):
        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        klass = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError('invalid type: %s' % type(obj))

    return klass(obj, by, **kwds)


def _get_axes(group):
    if isinstance(group, Series):
        return [group.index]
    else:
        return group.axes


def _is_indexed_like(obj, axes):
    if isinstance(obj, Series):
        if len(axes) > 1:
            return False
        return obj.index.equals(axes[0])
    elif isinstance(obj, DataFrame):
        return obj.index.equals(axes[0])

    return False


class BaseGrouper(object):
    """
    This is an internal Grouper class, which actually holds
    the generated groups
    """

    def __init__(self, axis, groupings, sort=True, group_keys=True,
                 mutated=False):
        self._filter_empty_groups = self.compressed = len(groupings) != 1
        self.axis = axis
        self.groupings = groupings
        self.sort = sort
        self.group_keys = group_keys
        self.mutated = mutated

    @property
    def shape(self):
        return tuple(ping.ngroups for ping in self.groupings)

    def __iter__(self):
        return iter(self.indices)

    @property
    def nkeys(self):
        return len(self.groupings)

    def get_iterator(self, data, axis=0):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        splitter = self._get_splitter(data, axis=axis)
        keys = self._get_group_keys()
        for key, (i, group) in zip(keys, splitter):
            yield key, group

    def _get_splitter(self, data, axis=0):
        comp_ids, _, ngroups = self.group_info
        return get_splitter(data, comp_ids, ngroups, axis=axis)

    def _get_group_keys(self):
        if len(self.groupings) == 1:
            return self.levels[0]
        else:
            comp_ids, _, ngroups = self.group_info

            # provide "flattened" iterator for multi-group setting
            mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
            return [mapper.get_key(i) for i in range(ngroups)]

    def apply(self, f, data, axis=0):
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()

        # oh boy
        f_name = com._get_callable_name(f)
        if (f_name not in _plotting_methods and
                hasattr(splitter, 'fast_apply') and axis == 0):
            try:
                values, mutated = splitter.fast_apply(f, group_keys)
                return group_keys, values, mutated
            except (lib.InvalidApply):
                # we detect a mutation of some kind
                # so take slow path
                pass
            except Exception:
                # raise this error to the caller
                pass

        result_values = []
        for key, (i, group) in zip(group_keys, splitter):
            object.__setattr__(group, 'name', key)

            # group might be modified
            group_axes = _get_axes(group)
            res = f(group)
            if not _is_indexed_like(res, group_axes):
                mutated = True
            result_values.append(res)

        return group_keys, result_values, mutated

    @cache_readonly
    def indices(self):
        """ dict {group name -> group indices} """
        if len(self.groupings) == 1:
            return self.groupings[0].indices
        else:
            label_list = [ping.labels for ping in self.groupings]
            keys = [_values_from_object(ping.group_index)
                    for ping in self.groupings]
            return _get_indices_dict(label_list, keys)

    @property
    def labels(self):
        return [ping.labels for ping in self.groupings]

    @property
    def levels(self):
        return [ping.group_index for ping in self.groupings]

    @property
    def names(self):
        return [ping.name for ping in self.groupings]

    def size(self):
        """
        Compute group sizes
        """
        ids, _, ngroup = self.group_info
        ids = _ensure_platform_int(ids)
        out = np.bincount(ids[ids != -1], minlength=ngroup or None)
        return Series(out, index=self.result_index, dtype='int64')

    @cache_readonly
    def _max_groupsize(self):
        """
        Compute size of largest group
        """
        # For many items in each group this is much faster than
        # self.size().max(), in worst case marginally slower
        if self.indices:
            return max(len(v) for v in self.indices.values())
        else:
            return 0

    @cache_readonly
    def groups(self):
        """ dict {group name -> group labels} """
        if len(self.groupings) == 1:
            return self.groupings[0].groups
        else:
            to_groupby = lzip(*(ping.grouper for ping in self.groupings))
            to_groupby = Index(to_groupby)
            return self.axis.groupby(to_groupby.values)

    @cache_readonly
    def is_monotonic(self):
        # return if my group orderings are monotonic
        return Index(self.group_info[0]).is_monotonic

    @cache_readonly
    def group_info(self):
        comp_ids, obs_group_ids = self._get_compressed_labels()

        ngroups = len(obs_group_ids)
        comp_ids = _ensure_int64(comp_ids)
        return comp_ids, obs_group_ids, ngroups

    def _get_compressed_labels(self):
        all_labels = [ping.labels for ping in self.groupings]
        if len(all_labels) > 1:
            group_index = get_group_index(all_labels, self.shape,
                                          sort=True, xnull=True)
            return _compress_group_index(group_index, sort=self.sort)

        ping = self.groupings[0]
        return ping.labels, np.arange(len(ping.group_index))

    @cache_readonly
    def ngroups(self):
        return len(self.result_index)

    @property
    def recons_labels(self):
        comp_ids, obs_ids, _ = self.group_info
        labels = (ping.labels for ping in self.groupings)
        return decons_obs_group_ids(comp_ids,
                                    obs_ids, self.shape, labels, xnull=True)

    @cache_readonly
    def result_index(self):
        if not self.compressed and len(self.groupings) == 1:
            return self.groupings[0].group_index.rename(self.names[0])

        return MultiIndex(levels=[ping.group_index
                                  for ping in self.groupings],
                          labels=self.recons_labels,
                          verify_integrity=False,
                          names=self.names)

    def get_group_levels(self):
        if not self.compressed and len(self.groupings) == 1:
            return [self.groupings[0].group_index]

        name_list = []
        for ping, labels in zip(self.groupings, self.recons_labels):
            labels = _ensure_platform_int(labels)
            levels = ping.group_index.take(labels)

            name_list.append(levels)

        return name_list

    # ------------------------------------------------------------
    # Aggregation functions

    _cython_functions = {
        'aggregate': {
            'add': 'group_add',
            'prod': 'group_prod',
            'min': 'group_min',
            'max': 'group_max',
            'mean': 'group_mean',
            'median': {
                'name': 'group_median'
            },
            'var': 'group_var',
            'first': {
                'name': 'group_nth',
                'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
            },
            'last': 'group_last',
            'ohlc': 'group_ohlc',
        },

        'transform': {
            'cumprod': 'group_cumprod',
            'cumsum': 'group_cumsum',
        }
    }

    _cython_arity = {
        'ohlc': 4,  # OHLC
    }

    _name_functions = {
        'ohlc': lambda *args: ['open', 'high', 'low', 'close']
    }

    def _get_cython_function(self, kind, how, values, is_numeric):

        dtype_str = values.dtype.name

        def get_func(fname):
            # see if there is a fused-type version of function
            # only valid for numeric
            f = getattr(_algos, fname, None)
            if f is not None and is_numeric:
                return f

            # otherwise find dtype-specific version, falling back to object
            for dt in [dtype_str, 'object']:
                f = getattr(_algos, "%s_%s" % (fname, dt), None)
                if f is not None:
                    return f

        ftype = self._cython_functions[kind][how]

        if isinstance(ftype, dict):
            func = afunc = get_func(ftype['name'])

            # a sub-function
            f = ftype.get('f')
            if f is not None:

                def wrapper(*args, **kwargs):
                    return f(afunc, *args, **kwargs)

                # need to curry our sub-function
                func = wrapper

        else:
            func = get_func(ftype)

        if func is None:
            raise NotImplementedError("function is not implemented for this "
                                      "dtype: [how->%s,dtype->%s]" %
                                      (how, dtype_str))

        return func, dtype_str

    def _cython_operation(self, kind, values, how, axis):
        assert kind in ['transform', 'aggregate']

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups,) + values.shape[1:]

        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetime_or_timedelta_dtype(values.dtype):
            values = values.view('int64')
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = _ensure_float64(values)
        elif is_integer_dtype(values):
            values = values.astype('int64', copy=False)
        elif is_numeric and not is_complex_dtype(values):
            values = _ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func, dtype_str = self._get_cython_function(
                kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = _ensure_float64(values)
                func, dtype_str = self._get_cython_function(
                    kind, how, values, is_numeric)
            else:
                raise

        if is_numeric:
            out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
        else:
            out_dtype = 'object'

        labels, _, _ = self.group_info

        if kind == 'aggregate':
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result, counts, values, labels, func, is_numeric)
        elif kind == 'transform':
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # temporary storage for running-total type transforms
            accum = np.empty(out_shape, dtype=out_dtype)
            result = self._transform(
                result, accum, values, labels, func, is_numeric)

        if is_integer_dtype(result):
            if len(result[result == tslib.iNaT]) > 0:
                result = result.astype('float64')
                result[result == tslib.iNaT] = np.nan

        if kind == 'aggregate' and \
                self._filter_empty_groups and not counts.all():
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        _ensure_object(result),
                        (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names

    def aggregate(self, values, how, axis=0):
        return self._cython_operation('aggregate', values, how, axis)

    def transform(self, values, how, axis=0):
        return self._cython_operation('transform', values, how, axis)

  1530. def _aggregate(self, result, counts, values, comp_ids, agg_func,
  1531. is_numeric):
  1532. if values.ndim > 3:
  1533. # punting for now
  1534. raise NotImplementedError("number of dimensions is currently "
  1535. "limited to 3")
  1536. elif values.ndim > 2:
  1537. for i, chunk in enumerate(values.transpose(2, 0, 1)):
  1538. chunk = chunk.squeeze()
  1539. agg_func(result[:, :, i], counts, chunk, comp_ids)
  1540. else:
  1541. agg_func(result, counts, values, comp_ids)
  1542. return result
  1543. def _transform(self, result, accum, values, comp_ids, transform_func,
  1544. is_numeric):
  1545. comp_ids, _, ngroups = self.group_info
  1546. if values.ndim > 3:
  1547. # punting for now
  1548. raise NotImplementedError("number of dimensions is currently "
  1549. "limited to 3")
  1550. elif values.ndim > 2:
  1551. for i, chunk in enumerate(values.transpose(2, 0, 1)):
  1552. chunk = chunk.squeeze()
1553. transform_func(result[:, :, i], chunk,
1554. comp_ids, accum)
  1555. else:
  1556. transform_func(result, values, comp_ids, accum)
  1557. return result
  1558. def agg_series(self, obj, func):
  1559. try:
  1560. return self._aggregate_series_fast(obj, func)
  1561. except Exception:
  1562. return self._aggregate_series_pure_python(obj, func)
  1563. def _aggregate_series_fast(self, obj, func):
  1564. func = self._is_builtin_func(func)
  1565. if obj.index._has_complex_internals:
  1566. raise TypeError('Incompatible index for Cython grouper')
  1567. group_index, _, ngroups = self.group_info
  1568. # avoids object / Series creation overhead
  1569. dummy = obj._get_values(slice(None, 0)).to_dense()
  1570. indexer = _get_group_index_sorter(group_index, ngroups)
  1571. obj = obj.take(indexer, convert=False)
  1572. group_index = algos.take_nd(group_index, indexer, allow_fill=False)
  1573. grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
  1574. dummy)
  1575. result, counts = grouper.get_result()
  1576. return result, counts
  1577. def _aggregate_series_pure_python(self, obj, func):
  1578. group_index, _, ngroups = self.group_info
  1579. counts = np.zeros(ngroups, dtype=int)
  1580. result = None
  1581. splitter = get_splitter(obj, group_index, ngroups, axis=self.axis)
  1582. for label, group in splitter:
  1583. res = func(group)
  1584. if result is None:
  1585. if (isinstance(res, (Series, Index, np.ndarray)) or
  1586. isinstance(res, list)):
  1587. raise ValueError('Function does not reduce')
  1588. result = np.empty(ngroups, dtype='O')
  1589. counts[label] = group.shape[0]
  1590. result[label] = res
  1591. result = lib.maybe_convert_objects(result, try_float=0)
  1592. return result, counts
  1593. def generate_bins_generic(values, binner, closed):
  1594. """
  1595. Generate bin edge offsets and bin labels for one array using another array
  1596. which has bin edge values. Both arrays must be sorted.
  1597. Parameters
  1598. ----------
  1599. values : array of values
  1600. binner : a comparable array of values representing bins into which to bin
  1601. the first array. Note, 'values' end-points must fall within 'binner'
  1602. end-points.
  1603. closed : which end of bin is closed; left (default), right
  1604. Returns
  1605. -------
  1606. bins : array of offsets (into 'values' argument) of bins.
1607. Zero and last edge are excluded in result, so for instance the first
1608. bin is values[0:bins[0]] and the last is values[bins[-1]:]
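Examples
--------
Illustrative, worked by hand (not run as a doctest):
>>> values = np.array([1, 2, 3, 4, 5, 6])
>>> binner = np.array([0, 3, 6])
>>> generate_bins_generic(values, binner, closed='left')
array([2, 5])
>>> generate_bins_generic(values, binner, closed='right')
array([3, 6])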
  1609. """
  1610. lenidx = len(values)
  1611. lenbin = len(binner)
  1612. if lenidx <= 0 or lenbin <= 0:
  1613. raise ValueError("Invalid length for values or for binner")
  1614. # check binner fits data
  1615. if values[0] < binner[0]:
  1616. raise ValueError("Values falls before first bin")
  1617. if values[lenidx - 1] > binner[lenbin - 1]:
  1618. raise ValueError("Values falls after last bin")
  1619. bins = np.empty(lenbin - 1, dtype=np.int64)
  1620. j = 0 # index into values
  1621. bc = 0 # bin count
  1622. # linear scan, presume nothing about values/binner except that it fits ok
  1623. for i in range(0, lenbin - 1):
  1624. r_bin = binner[i + 1]
  1625. # count values in current bin, advance to next bin
  1626. while j < lenidx and (values[j] < r_bin or
  1627. (closed == 'right' and values[j] == r_bin)):
  1628. j += 1
  1629. bins[bc] = j
  1630. bc += 1
  1631. return bins
  1632. class BinGrouper(BaseGrouper):
  1633. def __init__(self, bins, binlabels, filter_empty=False, mutated=False):
  1634. self.bins = _ensure_int64(bins)
  1635. self.binlabels = _ensure_index(binlabels)
  1636. self._filter_empty_groups = filter_empty
  1637. self.mutated = mutated
  1638. @cache_readonly
  1639. def groups(self):
  1640. """ dict {group name -> group labels} """
  1641. # this is mainly for compat
  1642. # GH 3881
  1643. result = {}
  1644. for key, value in zip(self.binlabels, self.bins):
  1645. if key is not tslib.NaT:
  1646. result[key] = value
  1647. return result
  1648. @property
  1649. def nkeys(self):
  1650. return 1
  1651. def get_iterator(self, data, axis=0):
  1652. """
  1653. Groupby iterator
  1654. Returns
  1655. -------
  1656. Generator yielding sequence of (name, subsetted object)
  1657. for each group
  1658. """
  1659. if isinstance(data, NDFrame):
  1660. slicer = lambda start, edge: data._slice(
  1661. slice(start, edge), axis=axis)
  1662. length = len(data.axes[axis])
  1663. else:
  1664. slicer = lambda start, edge: data[slice(start, edge)]
  1665. length = len(data)
  1666. start = 0
  1667. for edge, label in zip(self.bins, self.binlabels):
  1668. if label is not tslib.NaT:
  1669. yield label, slicer(start, edge)
  1670. start = edge
  1671. if start < length:
  1672. yield self.binlabels[-1], slicer(start, None)
  1673. @cache_readonly
  1674. def indices(self):
  1675. indices = collections.defaultdict(list)
  1676. i = 0
  1677. for label, bin in zip(self.binlabels, self.bins):
  1678. if i < bin:
  1679. if label is not tslib.NaT:
  1680. indices[label] = list(range(i, bin))
  1681. i = bin
  1682. return indices
  1683. @cache_readonly
  1684. def group_info(self):
  1685. ngroups = self.ngroups
  1686. obs_group_ids = np.arange(ngroups)
  1687. rep = np.diff(np.r_[0, self.bins])
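# e.g. (illustrative) bins = [2, 5, 8] over 8 rows gives
# rep = [2, 3, 3] and, when every bin has a label,
# comp_ids = [0, 0, 1, 1, 1, 2, 2, 2]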
  1688. rep = _ensure_platform_int(rep)
  1689. if ngroups == len(self.bins):
  1690. comp_ids = np.repeat(np.arange(ngroups), rep)
  1691. else:
  1692. comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)
  1693. return comp_ids.astype('int64', copy=False), \
  1694. obs_group_ids.astype('int64', copy=False), ngroups
  1695. @cache_readonly
  1696. def ngroups(self):
  1697. return len(self.result_index)
  1698. @cache_readonly
  1699. def result_index(self):
  1700. if len(self.binlabels) != 0 and isnull(self.binlabels[0]):
  1701. return self.binlabels[1:]
  1702. return self.binlabels
  1703. @property
  1704. def levels(self):
  1705. return [self.binlabels]
  1706. @property
  1707. def names(self):
  1708. return [self.binlabels.name]
  1709. @property
  1710. def groupings(self):
  1711. return [Grouping(lvl, lvl, in_axis=False, level=None, name=name)
  1712. for lvl, name in zip(self.levels, self.names)]
  1713. def agg_series(self, obj, func):
  1714. dummy = obj[:0]
  1715. grouper = lib.SeriesBinGrouper(obj, func, self.bins, dummy)
  1716. return grouper.get_result()
  1717. # ----------------------------------------------------------------------
  1718. # cython aggregation
  1719. _cython_functions = copy.deepcopy(BaseGrouper._cython_functions)
  1720. _cython_functions['aggregate'].pop('median')
  1721. class Grouping(object):
  1722. """
  1723. Holds the grouping information for a single key
  1724. Parameters
  1725. ----------
  1726. index : Index
  1727. grouper :
  1728. obj :
  1729. name :
  1730. level :
  1731. in_axis : if the Grouping is a column in self.obj and hence among
  1732. Groupby.exclusions list
  1733. Returns
  1734. -------
  1735. **Attributes**:
  1736. * indices : dict of {group -> index_list}
  1737. * labels : ndarray, group labels
  1738. * ids : mapping of label -> group
  1739. * counts : array of group counts
  1740. * group_index : unique groups
  1741. * groups : dict of {group -> label_list}
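Illustrative (hypothetical) usage:
>>> ping = Grouping(df.index, df['key'], obj=df, name='key',
...                 in_axis=True)
>>> ping.labels       # integer codes, one per row of df
>>> ping.group_index  # Index of the unique group values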
  1742. """
  1743. def __init__(self, index, grouper=None, obj=None, name=None, level=None,
  1744. sort=True, in_axis=False):
  1745. self.name = name
  1746. self.level = level
  1747. self.grouper = _convert_grouper(index, grouper)
  1748. self.index = index
  1749. self.sort = sort
  1750. self.obj = obj
  1751. self.in_axis = in_axis
  1752. # right place for this?
  1753. if isinstance(grouper, (Series, Index)) and name is None:
  1754. self.name = grouper.name
  1755. if isinstance(grouper, MultiIndex):
  1756. self.grouper = grouper.values
  1757. # pre-computed
  1758. self._should_compress = True
  1759. # we have a single grouper which may be a myriad of things,
1760. # some of which are dependent on the passed-in level
  1761. if level is not None:
  1762. if not isinstance(level, int):
  1763. if level not in index.names:
  1764. raise AssertionError('Level %s not in index' % str(level))
  1765. level = index.names.index(level)
  1766. inds = index.labels[level]
  1767. level_index = index.levels[level]
  1768. if self.name is None:
  1769. self.name = index.names[level]
  1770. # XXX complete hack
  1771. if grouper is not None:
  1772. level_values = index.levels[level].take(inds)
  1773. self.grouper = level_values.map(self.grouper)
  1774. else:
  1775. # all levels may not be observed
  1776. labels, uniques = algos.factorize(inds, sort=True)
  1777. if len(uniques) > 0 and uniques[0] == -1:
  1778. # handle NAs
  1779. mask = inds != -1
  1780. ok_labels, uniques = algos.factorize(inds[mask], sort=True)
  1781. labels = np.empty(len(inds), dtype=inds.dtype)
  1782. labels[mask] = ok_labels
  1783. labels[~mask] = -1
  1784. if len(uniques) < len(level_index):
  1785. level_index = level_index.take(uniques)
  1786. self._labels = labels
  1787. self._group_index = level_index
  1788. self.grouper = level_index.take(labels)
  1789. else:
  1790. if isinstance(self.grouper, (list, tuple)):
  1791. self.grouper = com._asarray_tuplesafe(self.grouper)
  1792. # a passed Categorical
  1793. elif is_categorical_dtype(self.grouper):
  1794. # must have an ordered categorical
  1795. if self.sort:
  1796. if not self.grouper.ordered:
  1797. # technically we cannot group on an unordered
  1798. # Categorical
1799. # but this is a user convenience to do so; the ordering
  1800. # is preserved and if it's a reduction it doesn't make
  1801. # any difference
  1802. pass
1803. # GH8868: fix sort=False being ignored in categorical
1804. # groupby
  1805. else:
  1806. cat = self.grouper.unique()
  1807. self.grouper = self.grouper.reorder_categories(
  1808. cat.categories)
  1809. # we make a CategoricalIndex out of the cat grouper
  1810. # preserving the categories / ordered attributes
  1811. self._labels = self.grouper.codes
  1812. c = self.grouper.categories
  1813. self._group_index = CategoricalIndex(
  1814. Categorical.from_codes(np.arange(len(c)),
  1815. categories=c,
  1816. ordered=self.grouper.ordered))
  1817. # a passed Grouper like
  1818. elif isinstance(self.grouper, Grouper):
  1819. # get the new grouper
  1820. grouper = self.grouper._get_binner_for_grouping(self.obj)
  1821. self.obj = self.grouper.obj
  1822. self.grouper = grouper
  1823. if self.name is None:
  1824. self.name = grouper.name
  1825. # we are done
  1826. if isinstance(self.grouper, Grouping):
  1827. self.grouper = self.grouper.grouper
  1828. # no level passed
  1829. elif not isinstance(self.grouper,
  1830. (Series, Index, Categorical, np.ndarray)):
  1831. if getattr(self.grouper, 'ndim', 1) != 1:
  1832. t = self.name or str(type(self.grouper))
  1833. raise ValueError("Grouper for '%s' not 1-dimensional" % t)
  1834. self.grouper = self.index.map(self.grouper)
  1835. if not (hasattr(self.grouper, "__len__") and
  1836. len(self.grouper) == len(self.index)):
  1837. errmsg = ('Grouper result violates len(labels) == '
  1838. 'len(data)\nresult: %s' %
  1839. pprint_thing(self.grouper))
  1840. self.grouper = None # Try for sanity
  1841. raise AssertionError(errmsg)
  1842. # if we have a date/time-like grouper, make sure that we have
  1843. # Timestamps like
  1844. if getattr(self.grouper, 'dtype', None) is not None:
  1845. if is_datetime64_dtype(self.grouper):
  1846. from pandas import to_datetime
  1847. self.grouper = to_datetime(self.grouper)
  1848. elif is_timedelta64_dtype(self.grouper):
  1849. from pandas import to_timedelta
  1850. self.grouper = to_timedelta(self.grouper)
  1851. def __repr__(self):
  1852. return 'Grouping({0})'.format(self.name)
  1853. def __iter__(self):
  1854. return iter(self.indices)
  1855. _labels = None
  1856. _group_index = None
  1857. @property
  1858. def ngroups(self):
  1859. return len(self.group_index)
  1860. @cache_readonly
  1861. def indices(self):
  1862. return _groupby_indices(self.grouper)
  1863. @property
  1864. def labels(self):
  1865. if self._labels is None:
  1866. self._make_labels()
  1867. return self._labels
  1868. @property
  1869. def group_index(self):
  1870. if self._group_index is None:
  1871. self._make_labels()
  1872. return self._group_index
  1873. def _make_labels(self):
  1874. if self._labels is None or self._group_index is None:
  1875. labels, uniques = algos.factorize(self.grouper, sort=self.sort)
  1876. uniques = Index(uniques, name=self.name)
  1877. self._labels = labels
  1878. self._group_index = uniques
  1879. @cache_readonly
  1880. def groups(self):
  1881. return self.index.groupby(self.grouper)
  1882. def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
  1883. mutated=False):
  1884. """
  1885. create and return a BaseGrouper, which is an internal
  1886. mapping of how to create the grouper indexers.
  1887. This may be composed of multiple Grouping objects, indicating
  1888. multiple groupers
  1889. Groupers are ultimately index mappings. They can originate as:
  1890. index mappings, keys to columns, functions, or Groupers
1891. Groupers enable local references to axis, level, and sort, while
1892. the passed-in axis, level, and sort are 'global'.
1893. This routine tries to figure out what the passed-in references
1894. are and then creates a Grouping for each one, combined into
  1895. a BaseGrouper.
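Illustrative key origins (hypothetical calls):
df.groupby('A')               # key is a column label
df.groupby(df['A'])           # key is a Series in the axis
df.groupby(level='A')         # key is an index level
df.groupby(Grouper(key='A'))  # key is a Grouper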
  1896. """
  1897. group_axis = obj._get_axis(axis)
  1898. # validate that the passed level is compatible with the passed
  1899. # axis of the object
  1900. if level is not None:
  1901. if not isinstance(group_axis, MultiIndex):
  1902. if isinstance(level, compat.string_types):
  1903. if obj.index.name != level:
  1904. raise ValueError('level name %s is not the name of the '
  1905. 'index' % level)
  1906. elif level > 0:
  1907. raise ValueError('level > 0 only valid with MultiIndex')
  1908. level = None
  1909. key = group_axis
  1910. # a passed-in Grouper, directly convert
  1911. if isinstance(key, Grouper):
  1912. binner, grouper, obj = key._get_grouper(obj)
  1913. if key.key is None:
  1914. return grouper, [], obj
  1915. else:
  1916. return grouper, set([key.key]), obj
  1917. # already have a BaseGrouper, just return it
  1918. elif isinstance(key, BaseGrouper):
  1919. return key, [], obj
  1920. if not isinstance(key, (tuple, list)):
  1921. keys = [key]
  1922. match_axis_length = False
  1923. else:
  1924. keys = key
  1925. match_axis_length = len(keys) == len(group_axis)
  1926. # what are we after, exactly?
  1927. any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
  1928. any_groupers = any(isinstance(g, Grouper) for g in keys)
  1929. any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray))
  1930. for g in keys)
  1931. try:
  1932. if isinstance(obj, DataFrame):
  1933. all_in_columns = all(g in obj.columns for g in keys)
  1934. else:
  1935. all_in_columns = False
  1936. except Exception:
  1937. all_in_columns = False
  1938. if not any_callable and not all_in_columns and \
  1939. not any_arraylike and not any_groupers and \
  1940. match_axis_length and level is None:
  1941. keys = [com._asarray_tuplesafe(keys)]
  1942. if isinstance(level, (tuple, list)):
  1943. if key is None:
  1944. keys = [None] * len(level)
  1945. levels = level
  1946. else:
  1947. levels = [level] * len(keys)
  1948. groupings = []
  1949. exclusions = []
  1950. # if the actual grouper should be obj[key]
  1951. def is_in_axis(key):
  1952. if not _is_label_like(key):
  1953. try:
  1954. obj._data.items.get_loc(key)
  1955. except Exception:
  1956. return False
  1957. return True
1958. # if the grouper is obj[name]
  1959. def is_in_obj(gpr):
  1960. try:
  1961. return id(gpr) == id(obj[gpr.name])
  1962. except Exception:
  1963. return False
  1964. for i, (gpr, level) in enumerate(zip(keys, levels)):
  1965. if is_in_obj(gpr): # df.groupby(df['name'])
  1966. in_axis, name = True, gpr.name
  1967. exclusions.append(name)
  1968. elif is_in_axis(gpr): # df.groupby('name')
  1969. in_axis, name, gpr = True, gpr, obj[gpr]
  1970. exclusions.append(name)
  1971. else:
  1972. in_axis, name = False, None
  1973. if is_categorical_dtype(gpr) and len(gpr) != len(obj):
  1974. raise ValueError("Categorical dtype grouper must "
  1975. "have len(grouper) == len(data)")
  1976. # create the Grouping
1977. # allow passing the actual Grouping as the gpr
  1978. ping = Grouping(group_axis,
  1979. gpr,
  1980. obj=obj,
  1981. name=name,
  1982. level=level,
  1983. sort=sort,
  1984. in_axis=in_axis) \
  1985. if not isinstance(gpr, Grouping) else gpr
  1986. groupings.append(ping)
  1987. if len(groupings) == 0:
  1988. raise ValueError('No group keys passed!')
  1989. # create the internals grouper
  1990. grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
  1991. return grouper, exclusions, obj
  1992. def _is_label_like(val):
  1993. return (isinstance(val, compat.string_types) or
  1994. (val is not None and is_scalar(val)))
  1995. def _convert_grouper(axis, grouper):
  1996. if isinstance(grouper, dict):
  1997. return grouper.get
  1998. elif isinstance(grouper, Series):
  1999. if grouper.index.equals(axis):
  2000. return grouper._values
  2001. else:
  2002. return grouper.reindex(axis)._values
  2003. elif isinstance(grouper, (list, Series, Index, np.ndarray)):
  2004. if len(grouper) != len(axis):
  2005. raise AssertionError('Grouper and axis must be same length')
  2006. return grouper
  2007. else:
  2008. return grouper
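# Illustrative (hypothetical) uses of _convert_grouper:
#   _convert_grouper(idx, {'a': 0, 'b': 1})  # -> the dict's .get method
#   _convert_grouper(idx, ser)               # -> ser._values, reindexed
#                                            #    to idx if needed
#   _convert_grouper(idx, len)               # -> len, passed through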
  2009. def _whitelist_method_generator(klass, whitelist):
  2010. """
  2011. Yields all GroupBy member defs for DataFrame/Series names in _whitelist.
  2012. Parameters
  2013. ----------
  2014. klass - class where members are defined. Should be Series or DataFrame
  2015. whitelist - list of names of klass methods to be constructed
  2016. Returns
  2017. -------
  2018. The generator yields a sequence of strings, each suitable for exec'ing,
  2019. that define implementations of the named methods for DataFrameGroupBy
  2020. or SeriesGroupBy.
  2021. Since we don't want to override methods explicitly defined in the
  2022. base class, any such name is skipped.
  2023. """
  2024. method_wrapper_template = \
  2025. """def %(name)s(%(sig)s) :
  2026. \"""
  2027. %(doc)s
  2028. \"""
  2029. f = %(self)s.__getattr__('%(name)s')
  2030. return f(%(args)s)"""
  2031. property_wrapper_template = \
  2032. """@property
  2033. def %(name)s(self) :
  2034. \"""
  2035. %(doc)s
  2036. \"""
  2037. return self.__getattr__('%(name)s')"""
  2038. for name in whitelist:
  2039. # don't override anything that was explicitly defined
  2040. # in the base class
  2041. if hasattr(GroupBy, name):
  2042. continue
  2043. # ugly, but we need the name string itself in the method.
  2044. f = getattr(klass, name)
  2045. doc = f.__doc__
  2046. doc = doc if type(doc) == str else ''
  2047. if isinstance(f, types.MethodType):
  2048. wrapper_template = method_wrapper_template
  2049. decl, args = make_signature(f)
  2050. # pass args by name to f because otherwise
  2051. # GroupBy._make_wrapper won't know whether
  2052. # we passed in an axis parameter.
  2053. args_by_name = ['{0}={0}'.format(arg) for arg in args[1:]]
  2054. params = {'name': name,
  2055. 'doc': doc,
  2056. 'sig': ','.join(decl),
  2057. 'self': args[0],
  2058. 'args': ','.join(args_by_name)}
  2059. else:
  2060. wrapper_template = property_wrapper_template
  2061. params = {'name': name, 'doc': doc}
  2062. yield wrapper_template % params
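# Illustrative (hypothetical) generated source for one whitelisted
# method, exec'd in the class bodies below:
#   def fillna(self, value=None, method=None, ...):
#       """<docstring copied from Series/DataFrame.fillna>"""
#       f = self.__getattr__('fillna')
#       return f(value=value, method=method, ...)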
  2063. class SeriesGroupBy(GroupBy):
  2064. #
  2065. # Make class defs of attributes on SeriesGroupBy whitelist
  2066. _apply_whitelist = _series_apply_whitelist
  2067. for _def_str in _whitelist_method_generator(Series,
  2068. _series_apply_whitelist):
  2069. exec(_def_str)
  2070. @property
  2071. def name(self):
  2072. """
2073. since we are a Series, we by definition only have
2074. a single name; it may be the result of a selection or
  2075. the name of our object
  2076. """
  2077. if self._selection is None:
  2078. return self.obj.name
  2079. else:
  2080. return self._selection
  2081. def aggregate(self, func_or_funcs, *args, **kwargs):
  2082. """
2083. Apply an aggregation function or functions to the groups, most likely
2084. yielding a Series, but in some cases a DataFrame, depending on the
2085. output of the aggregation function
  2086. Parameters
  2087. ----------
  2088. func_or_funcs : function or list / dict of functions
  2089. List/dict of functions will produce DataFrame with column names
  2090. determined by the function names themselves (list) or the keys in
  2091. the dict
  2092. Notes
  2093. -----
  2094. agg is an alias for aggregate. Use it.
  2095. Examples
  2096. --------
  2097. >>> series
  2098. bar 1.0
  2099. baz 2.0
  2100. qot 3.0
  2101. qux 4.0
  2102. >>> mapper = lambda x: x[0] # first letter
  2103. >>> grouped = series.groupby(mapper)
  2104. >>> grouped.aggregate(np.sum)
  2105. b 3.0
  2106. q 7.0
  2107. >>> grouped.aggregate([np.sum, np.mean, np.std])
  2108. mean std sum
  2109. b 1.5 0.5 3
  2110. q 3.5 0.5 7
  2111. >>> grouped.agg({'result' : lambda x: x.mean() / x.std(),
  2112. ... 'total' : np.sum})
  2113. result total
  2114. b 2.121 3
  2115. q 4.95 7
  2116. See also
  2117. --------
  2118. apply, transform
  2119. Returns
  2120. -------
  2121. Series or DataFrame
  2122. """
  2123. _level = kwargs.pop('_level', None)
  2124. if isinstance(func_or_funcs, compat.string_types):
  2125. return getattr(self, func_or_funcs)(*args, **kwargs)
  2126. if hasattr(func_or_funcs, '__iter__'):
  2127. ret = self._aggregate_multiple_funcs(func_or_funcs,
  2128. (_level or 0) + 1)
  2129. else:
  2130. cyfunc = self._is_cython_func(func_or_funcs)
  2131. if cyfunc and not args and not kwargs:
  2132. return getattr(self, cyfunc)()
  2133. if self.grouper.nkeys > 1:
  2134. return self._python_agg_general(func_or_funcs, *args, **kwargs)
  2135. try:
  2136. return self._python_agg_general(func_or_funcs, *args, **kwargs)
  2137. except Exception:
  2138. result = self._aggregate_named(func_or_funcs, *args, **kwargs)
  2139. index = Index(sorted(result), name=self.grouper.names[0])
  2140. ret = Series(result, index=index)
  2141. if not self.as_index: # pragma: no cover
2142. print('Warning, ignoring as_index=False')
  2143. # _level handled at higher
  2144. if not _level and isinstance(ret, dict):
  2145. from pandas import concat
  2146. ret = concat(ret, axis=1)
  2147. return ret
  2148. agg = aggregate
  2149. def _aggregate_multiple_funcs(self, arg, _level):
  2150. if isinstance(arg, dict):
  2151. columns = list(arg.keys())
  2152. arg = list(arg.items())
  2153. elif any(isinstance(x, (tuple, list)) for x in arg):
  2154. arg = [(x, x) if not isinstance(x, (tuple, list)) else x
  2155. for x in arg]
  2156. # indicated column order
  2157. columns = lzip(*arg)[0]
  2158. else:
  2159. # list of functions / function names
  2160. columns = []
  2161. for f in arg:
  2162. if isinstance(f, compat.string_types):
  2163. columns.append(f)
  2164. else:
  2165. # protect against callables without names
  2166. columns.append(com._get_callable_name(f))
  2167. arg = lzip(columns, arg)
  2168. results = {}
  2169. for name, func in arg:
  2170. obj = self
  2171. if name in results:
  2172. raise SpecificationError('Function names must be unique, '
  2173. 'found multiple named %s' % name)
  2174. # reset the cache so that we
  2175. # only include the named selection
  2176. if name in self._selected_obj:
  2177. obj = copy.copy(obj)
  2178. obj._reset_cache()
  2179. obj._selection = name
  2180. results[name] = obj.aggregate(func)
  2181. if isinstance(list(compat.itervalues(results))[0],
  2182. DataFrame):
  2183. # let higher level handle
  2184. if _level:
  2185. return results
  2186. return list(compat.itervalues(results))[0]
  2187. return DataFrame(results, columns=columns)
  2188. def _wrap_output(self, output, index, names=None):
  2189. """ common agg/transform wrapping logic """
  2190. output = output[self.name]
  2191. if names is not None:
  2192. return DataFrame(output, index=index, columns=names)
  2193. else:
  2194. name = self.name
  2195. if name is None:
  2196. name = self._selected_obj.name
  2197. return Series(output, index=index, name=name)
  2198. def _wrap_aggregated_output(self, output, names=None):
  2199. return self._wrap_output(output=output,
  2200. index=self.grouper.result_index,
  2201. names=names)
  2202. def _wrap_transformed_output(self, output, names=None):
  2203. return self._wrap_output(output=output,
  2204. index=self.obj.index,
  2205. names=names)
  2206. def _wrap_applied_output(self, keys, values, not_indexed_same=False):
  2207. if len(keys) == 0:
  2208. # GH #6265
  2209. return Series([], name=self.name, index=keys)
  2210. def _get_index():
  2211. if self.grouper.nkeys > 1:
  2212. index = MultiIndex.from_tuples(keys, names=self.grouper.names)
  2213. else:
  2214. index = Index(keys, name=self.grouper.names[0])
  2215. return index
  2216. if isinstance(values[0], dict):
  2217. # GH #823
  2218. index = _get_index()
  2219. result = DataFrame(values, index=index).stack()
  2220. result.name = self.name
  2221. return result
  2222. if isinstance(values[0], (Series, dict)):
  2223. return self._concat_objects(keys, values,
  2224. not_indexed_same=not_indexed_same)
  2225. elif isinstance(values[0], DataFrame):
  2226. # possible that Series -> DataFrame by applied function
  2227. return self._concat_objects(keys, values,
  2228. not_indexed_same=not_indexed_same)
  2229. else:
  2230. # GH #6265
  2231. return Series(values, index=_get_index(), name=self.name)
  2232. def _aggregate_named(self, func, *args, **kwargs):
  2233. result = {}
  2234. for name, group in self:
  2235. group.name = name
  2236. output = func(group, *args, **kwargs)
  2237. if isinstance(output, (Series, Index, np.ndarray)):
  2238. raise Exception('Must produce aggregated value')
  2239. result[name] = self._try_cast(output, group)
  2240. return result
  2241. def transform(self, func, *args, **kwargs):
  2242. """
  2243. Call function producing a like-indexed Series on each group and return
  2244. a Series with the transformed values
  2245. Parameters
  2246. ----------
  2247. func : function
  2248. To apply to each group. Should return a Series with the same index
  2249. Examples
  2250. --------
  2251. >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
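A string alias of a cythonized function is also accepted
(illustrative):
>>> grouped.transform('mean')  # broadcasts each group's mean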
  2252. Returns
  2253. -------
  2254. transformed : Series
  2255. """
  2256. func = self._is_cython_func(func) or func
  2257. # if string function
  2258. if isinstance(func, compat.string_types):
  2259. if func in _cython_transforms:
  2260. # cythonized transform
  2261. return getattr(self, func)(*args, **kwargs)
  2262. else:
  2263. # cythonized aggregation and merge
  2264. return self._transform_fast(
  2265. lambda: getattr(self, func)(*args, **kwargs))
  2266. # reg transform
  2267. dtype = self._selected_obj.dtype
  2268. result = self._selected_obj.values.copy()
  2269. wrapper = lambda x: func(x, *args, **kwargs)
  2270. for i, (name, group) in enumerate(self):
  2271. object.__setattr__(group, 'name', name)
  2272. res = wrapper(group)
  2273. if hasattr(res, 'values'):
  2274. res = res.values
  2275. # may need to astype
  2276. try:
  2277. common_type = np.common_type(np.array(res), result)
  2278. if common_type != result.dtype:
  2279. result = result.astype(common_type)
  2280. except:
  2281. pass
  2282. indexer = self._get_index(name)
  2283. result[indexer] = res
  2284. result = _possibly_downcast_to_dtype(result, dtype)
  2285. return self._selected_obj.__class__(result,
  2286. index=self._selected_obj.index,
  2287. name=self._selected_obj.name)
  2288. def _transform_fast(self, func):
  2289. """
  2290. fast version of transform, only applicable to
  2291. builtin/cythonizable functions
  2292. """
  2293. if isinstance(func, compat.string_types):
  2294. func = getattr(self, func)
  2295. ids, _, ngroup = self.grouper.group_info
  2296. cast = (self.size().fillna(0) > 0).any()
  2297. out = algos.take_1d(func().values, ids)
  2298. if cast:
  2299. out = self._try_cast(out, self.obj)
  2300. return Series(out, index=self.obj.index, name=self.obj.name)
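# Illustrative: with groups of sizes [2, 3], func() returns one value
# per group, and take_1d repeats those values back out to the five
# original rows via the integer codes in ``ids``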
  2301. def filter(self, func, dropna=True, *args, **kwargs): # noqa
  2302. """
  2303. Return a copy of a Series excluding elements from groups that
  2304. do not satisfy the boolean criterion specified by func.
  2305. Parameters
  2306. ----------
  2307. func : function
  2308. To apply to each group. Should return True or False.
  2309. dropna : Drop groups that do not pass the filter. True by default;
  2310. if False, groups that evaluate False are filled with NaNs.
  2311. Examples
  2312. --------
  2313. >>> grouped.filter(lambda x: x.mean() > 0)
  2314. Returns
  2315. -------
  2316. filtered : Series
  2317. """
  2318. if isinstance(func, compat.string_types):
  2319. wrapper = lambda x: getattr(x, func)(*args, **kwargs)
  2320. else:
  2321. wrapper = lambda x: func(x, *args, **kwargs)
  2322. # Interpret np.nan as False.
  2323. def true_and_notnull(x, *args, **kwargs):
  2324. b = wrapper(x, *args, **kwargs)
  2325. return b and notnull(b)
  2326. try:
  2327. indices = [self._get_index(name) for name, group in self
  2328. if true_and_notnull(group)]
  2329. except ValueError:
  2330. raise TypeError("the filter must return a boolean result")
  2331. except TypeError:
  2332. raise TypeError("the filter must return a boolean result")
  2333. filtered = self._apply_filter(indices, dropna)
  2334. return filtered
  2335. def nunique(self, dropna=True):
  2336. """ Returns number of unique elements in the group """
  2337. ids, _, _ = self.grouper.group_info
  2338. val = self.obj.get_values()
  2339. try:
  2340. sorter = np.lexsort((val, ids))
  2341. except TypeError: # catches object dtypes
  2342. assert val.dtype == object, \
  2343. 'val.dtype must be object, got %s' % val.dtype
  2344. val, _ = algos.factorize(val, sort=False)
  2345. sorter = np.lexsort((val, ids))
  2346. _isnull = lambda a: a == -1
  2347. else:
  2348. _isnull = isnull
  2349. ids, val = ids[sorter], val[sorter]
  2350. # group boundaries are where group ids change
  2351. # unique observations are where sorted values change
  2352. idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
  2353. inc = np.r_[1, val[1:] != val[:-1]]
  2354. # 1st item of each group is a new unique observation
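# e.g. (illustrative) ids = [0, 0, 1, 1] and val = [5, 5, 5, 7]
# give idx = [0, 2] and, once inc[idx] is set to 1 below,
# inc = [1, 0, 1, 1] -> np.add.reduceat counts [1, 2]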
  2355. mask = _isnull(val)
  2356. if dropna:
  2357. inc[idx] = 1
  2358. inc[mask] = 0
  2359. else:
  2360. inc[mask & np.r_[False, mask[:-1]]] = 0
  2361. inc[idx] = 1
  2362. out = np.add.reduceat(inc, idx).astype('int64', copy=False)
  2363. res = out if ids[0] != -1 else out[1:]
  2364. ri = self.grouper.result_index
  2365. # we might have duplications among the bins
  2366. if len(res) != len(ri):
  2367. res, out = np.zeros(len(ri), dtype=out.dtype), res
2368. res[ids[idx]] = out
  2369. return Series(res,
  2370. index=ri,
  2371. name=self.name)
  2372. @deprecate_kwarg('take_last', 'keep',
  2373. mapping={True: 'last', False: 'first'})
  2374. @Appender(Series.nlargest.__doc__)
  2375. def nlargest(self, n=5, keep='first'):
2376. # TODO: when we remove deprecate_kwarg, we can remove these methods
2377. # and include nlargest and nsmallest in _series_apply_whitelist
  2378. return self.apply(lambda x: x.nlargest(n=n, keep=keep))
  2379. @deprecate_kwarg('take_last', 'keep',
  2380. mapping={True: 'last', False: 'first'})
  2381. @Appender(Series.nsmallest.__doc__)
  2382. def nsmallest(self, n=5, keep='first'):
  2383. return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
  2384. def value_counts(self, normalize=False, sort=True, ascending=False,
  2385. bins=None, dropna=True):
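""" Compute value counts within each group, with the same
parameter semantics as Series.value_counts """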
  2386. from functools import partial
  2387. from pandas.tools.tile import cut
  2388. from pandas.tools.merge import _get_join_indexers
  2389. if bins is not None and not np.iterable(bins):
  2390. # scalar bins cannot be done at top level
  2391. # in a backward compatible way
  2392. return self.apply(Series.value_counts,
  2393. normalize=normalize,
  2394. sort=sort,
  2395. ascending=ascending,
  2396. bins=bins)
  2397. ids, _, _ = self.grouper.group_info
  2398. val = self.obj.get_values()
  2399. # groupby removes null keys from groupings
  2400. mask = ids != -1
  2401. ids, val = ids[mask], val[mask]
  2402. if bins is None:
  2403. lab, lev = algos.factorize(val, sort=True)
  2404. else:
  2405. cat, bins = cut(val, bins, retbins=True)
  2406. # bins[:-1] for backward compat;
  2407. # o.w. cat.categories could be better
  2408. lab, lev, dropna = cat.codes, bins[:-1], False
  2409. sorter = np.lexsort((lab, ids))
  2410. ids, lab = ids[sorter], lab[sorter]
  2411. # group boundaries are where group ids change
  2412. idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
  2413. # new values are where sorted labels change
  2414. inc = np.r_[True, lab[1:] != lab[:-1]]
  2415. inc[idx] = True # group boundaries are also new values
  2416. out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
  2417. # num. of times each group should be repeated
  2418. rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
  2419. # multi-index components
  2420. labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
  2421. levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
  2422. names = self.grouper.names + [self.name]
  2423. if dropna:
  2424. mask = labels[-1] != -1
  2425. if mask.all():
  2426. dropna = False
  2427. else:
  2428. out, labels = out[mask], [label[mask] for label in labels]
  2429. if normalize:
  2430. out = out.astype('float')
  2431. d = np.diff(np.r_[idx, len(ids)])
  2432. if dropna:
  2433. m = ids[lab == -1]
  2434. if _np_version_under1p8:
  2435. mi, ml = algos.factorize(m)
  2436. d[ml] = d[ml] - np.bincount(mi)
  2437. else:
  2438. np.add.at(d, m, -1)
  2439. acc = rep(d)[mask]
  2440. else:
  2441. acc = rep(d)
  2442. out /= acc
  2443. if sort and bins is None:
  2444. cat = ids[inc][mask] if dropna else ids[inc]
  2445. sorter = np.lexsort((out if ascending else -out, cat))
  2446. out, labels[-1] = out[sorter], labels[-1][sorter]
  2447. if bins is None:
  2448. mi = MultiIndex(levels=levels, labels=labels, names=names,
  2449. verify_integrity=False)
  2450. if is_integer_dtype(out):
  2451. out = _ensure_int64(out)
  2452. return Series(out, index=mi, name=self.name)
  2453. # for compat. with algos.value_counts need to ensure every
  2454. # bin is present at every index level, null filled with zeros
  2455. diff = np.zeros(len(out), dtype='bool')
  2456. for lab in labels[:-1]:
  2457. diff |= np.r_[True, lab[1:] != lab[:-1]]
  2458. ncat, nbin = diff.sum(), len(levels[-1])
  2459. left = [np.repeat(np.arange(ncat), nbin),
  2460. np.tile(np.arange(nbin), ncat)]
  2461. right = [diff.cumsum() - 1, labels[-1]]
  2462. _, idx = _get_join_indexers(left, right, sort=False, how='left')
  2463. out = np.where(idx != -1, out[idx], 0)
  2464. if sort:
  2465. sorter = np.lexsort((out if ascending else -out, left[0]))
  2466. out, left[-1] = out[sorter], left[-1][sorter]
  2467. # build the multi-index w/ full levels
  2468. labels = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
  2469. labels.append(left[-1])
  2470. mi = MultiIndex(levels=levels, labels=labels, names=names,
  2471. verify_integrity=False)
  2472. if is_integer_dtype(out):
  2473. out = _ensure_int64(out)
  2474. return Series(out, index=mi, name=self.name)
  2475. def count(self):
  2476. """ Compute count of group, excluding missing values """
  2477. ids, _, ngroups = self.grouper.group_info
  2478. val = self.obj.get_values()
  2479. mask = (ids != -1) & ~isnull(val)
  2480. ids = _ensure_platform_int(ids)
  2481. out = np.bincount(ids[mask], minlength=ngroups or None)
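# e.g. (illustrative) ids[mask] = [0, 0, 2] with ngroups = 3
# gives out = [2, 0, 1]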
  2482. return Series(out,
  2483. index=self.grouper.result_index,
  2484. name=self.name,
  2485. dtype='int64')
  2486. def _apply_to_column_groupbys(self, func):
  2487. """ return a pass thru """
  2488. return func(self)
  2489. class NDFrameGroupBy(GroupBy):
  2490. def _iterate_slices(self):
  2491. if self.axis == 0:
  2492. # kludge
  2493. if self._selection is None:
  2494. slice_axis = self.obj.columns
  2495. else:
  2496. slice_axis = self._selection_list
  2497. slicer = lambda x: self.obj[x]
  2498. else:
  2499. slice_axis = self.obj.index
  2500. slicer = self.obj.xs
  2501. for val in slice_axis:
  2502. if val in self.exclusions:
  2503. continue
  2504. yield val, slicer(val)
  2505. def _cython_agg_general(self, how, numeric_only=True):
  2506. new_items, new_blocks = self._cython_agg_blocks(
  2507. how, numeric_only=numeric_only)
  2508. return self._wrap_agged_blocks(new_items, new_blocks)
  2509. def _wrap_agged_blocks(self, items, blocks):
  2510. obj = self._obj_with_exclusions
  2511. new_axes = list(obj._data.axes)
  2512. # more kludge
  2513. if self.axis == 0:
  2514. new_axes[0], new_axes[1] = new_axes[1], self.grouper.result_index
  2515. else:
  2516. new_axes[self.axis] = self.grouper.result_index
  2517. # Make sure block manager integrity check passes.
  2518. assert new_axes[0].equals(items)
  2519. new_axes[0] = items
  2520. mgr = BlockManager(blocks, new_axes)
  2521. new_obj = type(obj)(mgr)
  2522. return self._post_process_cython_aggregate(new_obj)
  2523. _block_agg_axis = 0
  2524. def _cython_agg_blocks(self, how, numeric_only=True):
  2525. data, agg_axis = self._get_data_to_aggregate()
  2526. new_blocks = []
  2527. if numeric_only:
  2528. data = data.get_numeric_data(copy=False)
  2529. for block in data.blocks:
  2530. result, _ = self.grouper.aggregate(
  2531. block.values, how, axis=agg_axis)
  2532. # see if we can cast the block back to the original dtype
  2533. result = block._try_coerce_and_cast_result(result)
  2534. newb = make_block(result, placement=block.mgr_locs)
  2535. new_blocks.append(newb)
  2536. if len(new_blocks) == 0:
  2537. raise DataError('No numeric types to aggregate')
  2538. return data.items, new_blocks
  2539. def _get_data_to_aggregate(self):
  2540. obj = self._obj_with_exclusions
  2541. if self.axis == 0:
  2542. return obj.swapaxes(0, 1)._data, 1
  2543. else:
  2544. return obj._data, self.axis
  2545. def _post_process_cython_aggregate(self, obj):
  2546. # undoing kludge from below
  2547. if self.axis == 0:
  2548. obj = obj.swapaxes(0, 1)
  2549. return obj
  2550. def aggregate(self, arg, *args, **kwargs):
  2551. _level = kwargs.pop('_level', None)
  2552. result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
  2553. if how is None:
  2554. return result
  2555. if result is None:
  2556. # grouper specific aggregations
  2557. if self.grouper.nkeys > 1:
  2558. return self._python_agg_general(arg, *args, **kwargs)
  2559. else:
  2560. # try to treat as if we are passing a list
  2561. try:
  2562. assert not args and not kwargs
  2563. result = self._aggregate_multiple_funcs(
  2564. [arg], _level=_level)
  2565. result.columns = Index(
  2566. result.columns.levels[0],
  2567. name=self._selected_obj.columns.name)
  2568. except:
  2569. result = self._aggregate_generic(arg, *args, **kwargs)
  2570. if not self.as_index:
  2571. self._insert_inaxis_grouper_inplace(result)
  2572. result.index = np.arange(len(result))
  2573. return result._convert(datetime=True)
  2574. agg = aggregate
  2575. def _aggregate_generic(self, func, *args, **kwargs):
  2576. if self.grouper.nkeys != 1:
  2577. raise AssertionError('Number of keys must be 1')
  2578. axis = self.axis
  2579. obj = self._obj_with_exclusions
  2580. result = {}
  2581. if axis != obj._info_axis_number:
  2582. try:
  2583. for name, data in self:
  2584. result[name] = self._try_cast(func(data, *args, **kwargs),
  2585. data)
  2586. except Exception:
  2587. return self._aggregate_item_by_item(func, *args, **kwargs)
  2588. else:
  2589. for name in self.indices:
  2590. try:
  2591. data = self.get_group(name, obj=obj)
  2592. result[name] = self._try_cast(func(data, *args, **kwargs),
  2593. data)
  2594. except Exception:
  2595. wrapper = lambda x: func(x, *args, **kwargs)
  2596. result[name] = data.apply(wrapper, axis=axis)
  2597. return self._wrap_generic_output(result, obj)
  2598. def _wrap_aggregated_output(self, output, names=None):
  2599. raise AbstractMethodError(self)
  2600. def _aggregate_item_by_item(self, func, *args, **kwargs):
  2601. # only for axis==0
  2602. obj = self._obj_with_exclusions
  2603. result = {}
  2604. cannot_agg = []
  2605. errors = None
  2606. for item in obj:
  2607. try:
  2608. data = obj[item]
  2609. colg = SeriesGroupBy(data, selection=item,
  2610. grouper=self.grouper)
  2611. result[item] = self._try_cast(
  2612. colg.aggregate(func, *args, **kwargs), data)
  2613. except ValueError:
  2614. cannot_agg.append(item)
  2615. continue
  2616. except TypeError as e:
  2617. cannot_agg.append(item)
  2618. errors = e
  2619. continue
  2620. result_columns = obj.columns
  2621. if cannot_agg:
  2622. result_columns = result_columns.drop(cannot_agg)
  2623. # GH6337
  2624. if not len(result_columns) and errors is not None:
  2625. raise errors
  2626. return DataFrame(result, columns=result_columns)
  2627. def _decide_output_index(self, output, labels):
  2628. if len(output) == len(labels):
  2629. output_keys = labels
  2630. else:
  2631. output_keys = sorted(output)
  2632. try:
  2633. output_keys.sort()
  2634. except Exception: # pragma: no cover
  2635. pass
  2636. if isinstance(labels, MultiIndex):
  2637. output_keys = MultiIndex.from_tuples(output_keys,
  2638. names=labels.names)
  2639. return output_keys
  2640. def _wrap_applied_output(self, keys, values, not_indexed_same=False):
  2641. from pandas.core.index import _all_indexes_same
  2642. if len(keys) == 0:
  2643. return DataFrame(index=keys)
  2644. key_names = self.grouper.names
  2645. # GH12824.
  2646. def first_non_None_value(values):
  2647. try:
  2648. v = next(v for v in values if v is not None)
  2649. except StopIteration:
  2650. return None
  2651. return v
  2652. v = first_non_None_value(values)
  2653. if v is None:
  2654. # GH9684. If all values are None, then this will throw an error.
  2655. # We'd prefer it return an empty dataframe.
  2656. return DataFrame()
  2657. elif isinstance(v, DataFrame):
  2658. return self._concat_objects(keys, values,
  2659. not_indexed_same=not_indexed_same)
  2660. elif self.grouper.groupings is not None:
  2661. if len(self.grouper.groupings) > 1:
  2662. key_index = MultiIndex.from_tuples(keys, names=key_names)
  2663. else:
  2664. ping = self.grouper.groupings[0]
  2665. if len(keys) == ping.ngroups:
  2666. key_index = ping.group_index
  2667. key_index.name = key_names[0]
  2668. key_lookup = Index(keys)
  2669. indexer = key_lookup.get_indexer(key_index)
  2670. # reorder the values
  2671. values = [values[i] for i in indexer]
  2672. else:
  2673. key_index = Index(keys, name=key_names[0])
  2674. # don't use the key indexer
  2675. if not self.as_index:
  2676. key_index = None
  2677. # make Nones an empty object
  2678. v = first_non_None_value(values)
  2679. if v is None:
  2680. return DataFrame()
  2681. elif isinstance(v, NDFrame):
  2682. values = [
  2683. x if x is not None else
  2684. v._constructor(**v._construct_axes_dict())
  2685. for x in values
  2686. ]
  2687. v = values[0]
  2688. if isinstance(v, (np.ndarray, Index, Series)):
  2689. if isinstance(v, Series):
  2690. applied_index = self._selected_obj._get_axis(self.axis)
  2691. all_indexed_same = _all_indexes_same([
  2692. x.index for x in values
  2693. ])
  2694. singular_series = (len(values) == 1 and
  2695. applied_index.nlevels == 1)
  2696. # GH3596
  2697. # provide a reduction (Frame -> Series) if groups are
  2698. # unique
  2699. if self.squeeze:
  2700. # assign the name to this series
  2701. if singular_series:
  2702. values[0].name = keys[0]
  2703. # GH2893
  2704. # we have series in the values array, we want to
  2705. # produce a series:
  2706. # if any of the sub-series are not indexed the same
  2707. # OR we don't have a multi-index and we have only a
2708. # single value
  2709. return self._concat_objects(
  2710. keys, values, not_indexed_same=not_indexed_same
  2711. )
  2712. # still a series
  2713. # path added as of GH 5545
  2714. elif all_indexed_same:
  2715. from pandas.tools.merge import concat
  2716. return concat(values)
  2717. if not all_indexed_same:
  2718. # GH 8467
  2719. return self._concat_objects(
  2720. keys, values, not_indexed_same=True,
  2721. )
  2722. try:
  2723. if self.axis == 0:
  2724. # GH6124 if the list of Series have a consistent name,
  2725. # then propagate that name to the result.
  2726. index = v.index.copy()
  2727. if index.name is None:
  2728. # Only propagate the series name to the result
  2729. # if all series have a consistent name. If the
  2730. # series do not have a consistent name, do
  2731. # nothing.
  2732. names = set(v.name for v in values)
  2733. if len(names) == 1:
  2734. index.name = list(names)[0]
2735. # normally use vstack as it's faster than concat
  2736. # and if we have mi-columns
  2737. if isinstance(v.index,
  2738. MultiIndex) or key_index is None:
  2739. stacked_values = np.vstack(map(np.asarray, values))
  2740. result = DataFrame(stacked_values, index=key_index,
  2741. columns=index)
  2742. else:
  2743. # GH5788 instead of stacking; concat gets the
  2744. # dtypes correct
  2745. from pandas.tools.merge import concat
  2746. result = concat(values, keys=key_index,
  2747. names=key_index.names,
  2748. axis=self.axis).unstack()
  2749. result.columns = index
  2750. else:
  2751. stacked_values = np.vstack(map(np.asarray, values))
  2752. result = DataFrame(stacked_values.T, index=v.index,
  2753. columns=key_index)
  2754. except (ValueError, AttributeError):
2755. # GH1738: values is a list of arrays of unequal lengths; fall
2756. # through to the outer else clause
  2757. return Series(values, index=key_index, name=self.name)
  2758. # if we have date/time like in the original, then coerce dates
  2759. # as we are stacking can easily have object dtypes here
  2760. so = self._selected_obj
  2761. if (so.ndim == 2 and so.dtypes.isin(_DATELIKE_DTYPES).any()):
  2762. result = result._convert(numeric=True)
  2763. date_cols = self._selected_obj.select_dtypes(
  2764. include=list(_DATELIKE_DTYPES)).columns
  2765. date_cols = date_cols.intersection(result.columns)
  2766. result[date_cols] = (result[date_cols]
  2767. ._convert(datetime=True,
  2768. coerce=True))
  2769. else:
  2770. result = result._convert(datetime=True)
  2771. return self._reindex_output(result)
  2772. # values are not series or array-like but scalars
  2773. else:
  2774. # only coerce dates if we find at least 1 datetime
2775. coerce = any(isinstance(x, Timestamp)
2776. for x in values)
  2777. # self.name not passed through to Series as the result
  2778. # should not take the name of original selection of columns
  2779. return (Series(values, index=key_index)
  2780. ._convert(datetime=True,
  2781. coerce=coerce))
  2782. else:
  2783. # Handle cases like BinGrouper
  2784. return self._concat_objects(keys, values,
  2785. not_indexed_same=not_indexed_same)
  2786. def _transform_general(self, func, *args, **kwargs):
  2787. from pandas.tools.merge import concat
  2788. applied = []
  2789. obj = self._obj_with_exclusions
  2790. gen = self.grouper.get_iterator(obj, axis=self.axis)
  2791. fast_path, slow_path = self._define_paths(func, *args, **kwargs)
  2792. path = None
  2793. for name, group in gen:
  2794. object.__setattr__(group, 'name', name)
  2795. if path is None:
  2796. # Try slow path and fast path.
  2797. try:
  2798. path, res = self._choose_path(fast_path, slow_path, group)
  2799. except TypeError:
  2800. return self._transform_item_by_item(obj, fast_path)
  2801. except ValueError:
  2802. msg = 'transform must return a scalar value for each group'
  2803. raise ValueError(msg)
  2804. else:
  2805. res = path(group)
  2806. # broadcasting
  2807. if isinstance(res, Series):
  2808. if res.index.is_(obj.index):
  2809. group.T.values[:] = res
  2810. else:
  2811. group.values[:] = res
  2812. applied.append(group)
  2813. else:
  2814. applied.append(res)
  2815. concat_index = obj.columns if self.axis == 0 else obj.index
  2816. concatenated = concat(applied, join_axes=[concat_index],
  2817. axis=self.axis, verify_integrity=False)
  2818. return self._set_result_index_ordered(concatenated)
  2819. def transform(self, func, *args, **kwargs):
  2820. """
  2821. Call function producing a like-indexed DataFrame on each group and
  2822. return a DataFrame having the same indexes as the original object
  2823. filled with the transformed values
  2824. Parameters
  2825. ----------
2826. func : function
  2827. Function to apply to each subframe
  2828. Notes
  2829. -----
2830. Each subframe is endowed with the attribute 'name' in case you need to know
  2831. which group you are working on.
  2832. Examples
  2833. --------
  2834. >>> grouped = df.groupby(lambda x: mapping[x])
  2835. >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
  2836. """
  2837. # optimized transforms
  2838. func = self._is_cython_func(func) or func
  2839. if isinstance(func, compat.string_types):
  2840. if func in _cython_transforms:
  2841. # cythonized transform
  2842. return getattr(self, func)(*args, **kwargs)
  2843. else:
  2844. # cythonized aggregation and merge
  2845. result = getattr(self, func)(*args, **kwargs)
  2846. else:
  2847. return self._transform_general(func, *args, **kwargs)
  2848. # a reduction transform
  2849. if not isinstance(result, DataFrame):
  2850. return self._transform_general(func, *args, **kwargs)
  2851. obj = self._obj_with_exclusions
2852. # nuisance columns
  2853. if not result.columns.equals(obj.columns):
  2854. return self._transform_general(func, *args, **kwargs)
  2855. return self._transform_fast(result, obj)
  2856. def _transform_fast(self, result, obj):
  2857. """
  2858. Fast transform path for aggregations
  2859. """
  2860. # if there were groups with no observations (Categorical only?)
  2861. # try casting data to original dtype
  2862. cast = (self.size().fillna(0) > 0).any()
2863. # for each col, reshape to the size of the original frame
  2864. # by take operation
  2865. ids, _, ngroup = self.grouper.group_info
  2866. output = []
  2867. for i, _ in enumerate(result.columns):
  2868. res = algos.take_1d(result.iloc[:, i].values, ids)
  2869. if cast:
  2870. res = self._try_cast(res, obj.iloc[:, i])
  2871. output.append(res)
  2872. return DataFrame._from_arrays(output, columns=result.columns,
  2873. index=obj.index)
  2874. def _define_paths(self, func, *args, **kwargs):
  2875. if isinstance(func, compat.string_types):
  2876. fast_path = lambda group: getattr(group, func)(*args, **kwargs)
  2877. slow_path = lambda group: group.apply(
  2878. lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
  2879. else:
  2880. fast_path = lambda group: func(group, *args, **kwargs)
  2881. slow_path = lambda group: group.apply(
  2882. lambda x: func(x, *args, **kwargs), axis=self.axis)
  2883. return fast_path, slow_path
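# Illustrative: for func='mean', fast_path calls group.mean(...) on
# the whole group at once, while slow_path applies x.mean(...)
# column-by-column through group.apply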
  2884. def _choose_path(self, fast_path, slow_path, group):
  2885. path = slow_path
  2886. res = slow_path(group)
  2887. # if we make it here, test if we can use the fast path
  2888. try:
  2889. res_fast = fast_path(group)
  2890. # compare that we get the same results
  2891. if res.shape == res_fast.shape:
  2892. res_r = res.values.ravel()
  2893. res_fast_r = res_fast.values.ravel()
  2894. mask = notnull(res_r)
  2895. if (res_r[mask] == res_fast_r[mask]).all():
  2896. path = fast_path
  2897. except:
  2898. pass
  2899. return path, res
  2900. def _transform_item_by_item(self, obj, wrapper):
  2901. # iterate through columns
  2902. output = {}
  2903. inds = []
  2904. for i, col in enumerate(obj):
  2905. try:
  2906. output[col] = self[col].transform(wrapper)
  2907. inds.append(i)
  2908. except Exception:
  2909. pass
  2910. if len(output) == 0: # pragma: no cover
  2911. raise TypeError('Transform function invalid for data types')
  2912. columns = obj.columns
  2913. if len(output) < len(obj.columns):
  2914. columns = columns.take(inds)
  2915. return DataFrame(output, index=obj.index, columns=columns)
  2916. def filter(self, func, dropna=True, *args, **kwargs): # noqa
  2917. """
  2918. Return a copy of a DataFrame excluding elements from groups that
  2919. do not satisfy the boolean criterion specified by func.
  2920. Parameters
  2921. ----------
2922. func : function
  2923. Function to apply to each subframe. Should return True or False.
  2924. dropna : Drop groups that do not pass the filter. True by default;
  2925. if False, groups that evaluate False are filled with NaNs.
  2926. Notes
  2927. -----
2928. Each subframe is endowed with the attribute 'name' in case you need to know
  2929. which group you are working on.
  2930. Examples
  2931. --------
  2932. >>> grouped = df.groupby(lambda x: mapping[x])
  2933. >>> grouped.filter(lambda x: x['A'].sum() + x['B'].sum() > 0)
  2934. """
  2935. indices = []
  2936. obj = self._selected_obj
  2937. gen = self.grouper.get_iterator(obj, axis=self.axis)
  2938. for name, group in gen:
  2939. object.__setattr__(group, 'name', name)
  2940. res = func(group, *args, **kwargs)
  2941. try:
  2942. res = res.squeeze()
  2943. except AttributeError: # allow e.g., scalars and frames to pass
  2944. pass
  2945. # interpret the result of the filter
  2946. if is_bool(res) or (is_scalar(res) and isnull(res)):
  2947. if res and notnull(res):
  2948. indices.append(self._get_index(name))
  2949. else:
  2950. # non scalars aren't allowed
  2951. raise TypeError("filter function returned a %s, "
  2952. "but expected a scalar bool" %
  2953. type(res).__name__)
  2954. return self._apply_filter(indices, dropna)
  2955. class DataFrameGroupBy(NDFrameGroupBy):
  2956. _apply_whitelist = _dataframe_apply_whitelist
  2957. #
  2958. # Make class defs of attributes on DataFrameGroupBy whitelist.
  2959. for _def_str in _whitelist_method_generator(DataFrame, _apply_whitelist):
  2960. exec(_def_str)
  2961. _block_agg_axis = 1
  2962. @Substitution(name='groupby')
  2963. @Appender(SelectionMixin._see_also_template)
  2964. @Appender(SelectionMixin._agg_doc)
  2965. def aggregate(self, arg, *args, **kwargs):
  2966. return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
  2967. agg = aggregate
  2968. def _gotitem(self, key, ndim, subset=None):
  2969. """
  2970. sub-classes to define
  2971. return a sliced object
  2972. Parameters
  2973. ----------
  2974. key : string / list of selections
  2975. ndim : 1,2
  2976. requested ndim of result
  2977. subset : object, default None
  2978. subset to act on
  2979. """
  2980. if ndim == 2:
  2981. if subset is None:
  2982. subset = self.obj
  2983. return DataFrameGroupBy(subset, self.grouper, selection=key,
  2984. grouper=self.grouper,
  2985. exclusions=self.exclusions,
  2986. as_index=self.as_index)
  2987. elif ndim == 1:
  2988. if subset is None:
  2989. subset = self.obj[key]
  2990. return SeriesGroupBy(subset, selection=key,
  2991. grouper=self.grouper)
  2992. raise AssertionError("invalid ndim for _gotitem")
  2993. def _wrap_generic_output(self, result, obj):
  2994. result_index = self.grouper.levels[0]
  2995. if self.axis == 0:
  2996. return DataFrame(result, index=obj.columns,
  2997. columns=result_index).T
  2998. else:
  2999. return DataFrame(result, index=obj.index,
  3000. columns=result_index)
  3001. def _get_data_to_aggregate(self):
  3002. obj = self._obj_with_exclusions
  3003. if self.axis == 1:
  3004. return obj.T._data, 1
  3005. else:
  3006. return obj._data, 1
  3007. def _insert_inaxis_grouper_inplace(self, result):
  3008. # zip in reverse so we can always insert at loc 0
3009. izip = zip(*map(reversed, (
  3010. self.grouper.names,
  3011. self.grouper.get_group_levels(),
  3012. [grp.in_axis for grp in self.grouper.groupings])))
  3013. for name, lev, in_axis in izip:
  3014. if in_axis:
  3015. result.insert(0, name, lev)
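# The reversed iteration above is what keeps the grouper columns in their
# original order even though each is inserted at position 0; a standalone
# sketch of the same idea (names invented):
#
# >>> out = []
# >>> for name in reversed(['a', 'b', 'c']):
# ...     out.insert(0, name)
# >>> out
# ['a', 'b', 'c']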
  3016. def _wrap_aggregated_output(self, output, names=None):
  3017. agg_axis = 0 if self.axis == 1 else 1
  3018. agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
  3019. output_keys = self._decide_output_index(output, agg_labels)
  3020. if not self.as_index:
  3021. result = DataFrame(output, columns=output_keys)
  3022. self._insert_inaxis_grouper_inplace(result)
  3023. result = result.consolidate()
  3024. else:
  3025. index = self.grouper.result_index
  3026. result = DataFrame(output, index=index, columns=output_keys)
  3027. if self.axis == 1:
  3028. result = result.T
  3029. return self._reindex_output(result)._convert(datetime=True)
  3030. def _wrap_transformed_output(self, output, names=None):
  3031. return DataFrame(output, index=self.obj.index)
  3032. def _wrap_agged_blocks(self, items, blocks):
  3033. if not self.as_index:
  3034. index = np.arange(blocks[0].values.shape[1])
  3035. mgr = BlockManager(blocks, [items, index])
  3036. result = DataFrame(mgr)
  3037. self._insert_inaxis_grouper_inplace(result)
  3038. result = result.consolidate()
  3039. else:
  3040. index = self.grouper.result_index
  3041. mgr = BlockManager(blocks, [items, index])
  3042. result = DataFrame(mgr)
  3043. if self.axis == 1:
  3044. result = result.T
  3045. return self._reindex_output(result)._convert(datetime=True)
  3046. def _reindex_output(self, result):
  3047. """
3048. If we have categorical groupers, then we want to make sure the
3049. output is fully reindexed to the levels, since some of them may not
3050. have participated in the groupings (e.g. may have all been
3051. nan groups).
3052. This can re-expand the output space.
  3053. """
  3054. groupings = self.grouper.groupings
  3055. if groupings is None:
  3056. return result
  3057. elif len(groupings) == 1:
  3058. return result
  3059. elif not any([isinstance(ping.grouper, (Categorical, CategoricalIndex))
  3060. for ping in groupings]):
  3061. return result
  3062. levels_list = [ping.group_index for ping in groupings]
  3063. index, _ = MultiIndex.from_product(
  3064. levels_list, names=self.grouper.names).sortlevel()
  3065. if self.as_index:
  3066. d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
  3067. return result.reindex(**d)
  3068. # GH 13204
  3069. # Here, the categorical in-axis groupers, which need to be fully
  3070. # expanded, are columns in `result`. An idea is to do:
  3071. # result = result.set_index(self.grouper.names)
  3072. # .reindex(index).reset_index()
  3073. # but special care has to be taken because of possible not-in-axis
  3074. # groupers.
  3075. # So, we manually select and drop the in-axis grouper columns,
  3076. # reindex `result`, and then reset the in-axis grouper columns.
  3077. # Select in-axis groupers
  3078. in_axis_grps = [(i, ping.name) for (i, ping)
  3079. in enumerate(groupings) if ping.in_axis]
  3080. g_nums, g_names = zip(*in_axis_grps)
  3081. result = result.drop(labels=list(g_names), axis=1)
  3082. # Set a temp index and reindex (possibly expanding)
  3083. result = result.set_index(self.grouper.result_index
  3084. ).reindex(index, copy=False)
  3085. # Reset in-axis grouper columns
  3086. # (using level numbers `g_nums` because level names may not be unique)
  3087. result = result.reset_index(level=g_nums)
  3088. return result.reset_index(drop=True)
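# A hedged illustration of the re-expansion performed above (data
# invented): grouping on a categorical plus a second key reindexes the
# result to the full cartesian product of levels, so unobserved
# categories reappear as all-NaN rows.
#
# >>> cat = pd.Categorical(['a', 'a'], categories=['a', 'b'])
# >>> df = pd.DataFrame({'c1': cat, 'c2': [0, 1], 'v': [1.0, 2.0]})
# >>> df.groupby(['c1', 'c2']).sum()  # includes ('b', 0) and ('b', 1) as NaN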
  3089. def _iterate_column_groupbys(self):
  3090. for i, colname in enumerate(self._selected_obj.columns):
  3091. yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
  3092. selection=colname,
  3093. grouper=self.grouper,
  3094. exclusions=self.exclusions)
  3095. def _apply_to_column_groupbys(self, func):
  3096. from pandas.tools.merge import concat
  3097. return concat(
  3098. (func(col_groupby) for _, col_groupby
  3099. in self._iterate_column_groupbys()),
  3100. keys=self._selected_obj.columns, axis=1)
  3101. def count(self):
  3102. """ Compute count of group, excluding missing values """
  3103. from functools import partial
  3104. from pandas.lib import count_level_2d
  3105. from pandas.types.missing import _isnull_ndarraylike as isnull
  3106. data, _ = self._get_data_to_aggregate()
  3107. ids, _, ngroups = self.grouper.group_info
  3108. mask = ids != -1
  3109. val = ((mask & ~isnull(blk.get_values())) for blk in data.blocks)
  3110. loc = (blk.mgr_locs for blk in data.blocks)
  3111. counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1)
  3112. blk = map(make_block, map(counter, val), loc)
  3113. return self._wrap_agged_blocks(data.items, list(blk))
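# Hedged usage sketch for the block-wise count above (data invented);
# missing values are excluded per group:
#
# >>> df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1.0, None, 3.0]})
# >>> df.groupby('key').count()
#      val
# key
# a      1
# b      1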
  3114. from pandas.tools.plotting import boxplot_frame_groupby # noqa
  3115. DataFrameGroupBy.boxplot = boxplot_frame_groupby
  3116. class PanelGroupBy(NDFrameGroupBy):
  3117. @Substitution(name='groupby')
  3118. @Appender(SelectionMixin._see_also_template)
  3119. @Appender(SelectionMixin._agg_doc)
  3120. def aggregate(self, arg, *args, **kwargs):
  3121. return super(PanelGroupBy, self).aggregate(arg, *args, **kwargs)
  3122. agg = aggregate
  3123. def _iterate_slices(self):
  3124. if self.axis == 0:
  3125. # kludge
  3126. if self._selection is None:
  3127. slice_axis = self._selected_obj.items
  3128. else:
  3129. slice_axis = self._selection_list
  3130. slicer = lambda x: self._selected_obj[x]
  3131. else:
  3132. raise NotImplementedError("axis other than 0 is not supported")
  3133. for val in slice_axis:
  3134. if val in self.exclusions:
  3135. continue
  3136. yield val, slicer(val)
  3137. def aggregate(self, arg, *args, **kwargs):
  3138. """
  3139. Aggregate using input function or dict of {column -> function}
  3140. Parameters
  3141. ----------
  3142. arg : function or dict
  3143. Function to use for aggregating groups. If a function, must either
3144. work when passed a Panel or when passed to Panel.apply. If a
3145. dict is passed, the keys must be DataFrame column names.
  3146. Returns
  3147. -------
  3148. aggregated : Panel
  3149. """
  3150. if isinstance(arg, compat.string_types):
  3151. return getattr(self, arg)(*args, **kwargs)
  3152. return self._aggregate_generic(arg, *args, **kwargs)
  3153. def _wrap_generic_output(self, result, obj):
  3154. if self.axis == 0:
  3155. new_axes = list(obj.axes)
  3156. new_axes[0] = self.grouper.result_index
  3157. elif self.axis == 1:
  3158. x, y, z = obj.axes
  3159. new_axes = [self.grouper.result_index, z, x]
  3160. else:
  3161. x, y, z = obj.axes
  3162. new_axes = [self.grouper.result_index, y, x]
  3163. result = Panel._from_axes(result, new_axes)
  3164. if self.axis == 1:
  3165. result = result.swapaxes(0, 1).swapaxes(0, 2)
  3166. elif self.axis == 2:
  3167. result = result.swapaxes(0, 2)
  3168. return result
  3169. def _aggregate_item_by_item(self, func, *args, **kwargs):
  3170. obj = self._obj_with_exclusions
  3171. result = {}
  3172. if self.axis > 0:
  3173. for item in obj:
  3174. try:
  3175. itemg = DataFrameGroupBy(obj[item],
  3176. axis=self.axis - 1,
  3177. grouper=self.grouper)
  3178. result[item] = itemg.aggregate(func, *args, **kwargs)
  3179. except (ValueError, TypeError):
  3180. raise
  3181. new_axes = list(obj.axes)
  3182. new_axes[self.axis] = self.grouper.result_index
  3183. return Panel._from_axes(result, new_axes)
  3184. else:
  3185. raise ValueError("axis value must be greater than 0")
  3186. def _wrap_aggregated_output(self, output, names=None):
  3187. raise AbstractMethodError(self)
  3188. class NDArrayGroupBy(GroupBy):
  3189. pass
  3190. # ----------------------------------------------------------------------
  3191. # Splitting / application
  3192. class DataSplitter(object):
  3193. def __init__(self, data, labels, ngroups, axis=0):
  3194. self.data = data
  3195. self.labels = _ensure_int64(labels)
  3196. self.ngroups = ngroups
  3197. self.axis = axis
  3198. @cache_readonly
  3199. def slabels(self):
  3200. # Sorted labels
  3201. return algos.take_nd(self.labels, self.sort_idx, allow_fill=False)
  3202. @cache_readonly
  3203. def sort_idx(self):
  3204. # Counting sort indexer
  3205. return _get_group_index_sorter(self.labels, self.ngroups)
  3206. def __iter__(self):
  3207. sdata = self._get_sorted_data()
  3208. if self.ngroups == 0:
  3209. raise StopIteration
  3210. starts, ends = lib.generate_slices(self.slabels, self.ngroups)
  3211. for i, (start, end) in enumerate(zip(starts, ends)):
3212. # Since the group ids are now compressed, empty slices cannot
3213. # be produced, because such groups would not be observed
3214. # in the data
  3215. # if start >= end:
  3216. # raise AssertionError('Start %s must be less than end %s'
  3217. # % (str(start), str(end)))
  3218. yield i, self._chop(sdata, slice(start, end))
  3219. def _get_sorted_data(self):
  3220. return self.data.take(self.sort_idx, axis=self.axis, convert=False)
  3221. def _chop(self, sdata, slice_obj):
  3222. return sdata.iloc[slice_obj]
  3223. def apply(self, f):
  3224. raise AbstractMethodError(self)
  3225. class ArraySplitter(DataSplitter):
  3226. pass
  3227. class SeriesSplitter(DataSplitter):
  3228. def _chop(self, sdata, slice_obj):
  3229. return sdata._get_values(slice_obj).to_dense()
  3230. class FrameSplitter(DataSplitter):
  3231. def __init__(self, data, labels, ngroups, axis=0):
  3232. super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis)
  3233. def fast_apply(self, f, names):
3234. # must return values::list, mutated::bool
  3235. try:
  3236. starts, ends = lib.generate_slices(self.slabels, self.ngroups)
3237. except Exception:
  3238. # fails when all -1
  3239. return [], True
  3240. sdata = self._get_sorted_data()
  3241. results, mutated = lib.apply_frame_axis0(sdata, f, names, starts, ends)
  3242. return results, mutated
  3243. def _chop(self, sdata, slice_obj):
  3244. if self.axis == 0:
  3245. return sdata.iloc[slice_obj]
  3246. else:
  3247. return sdata._slice(slice_obj, axis=1) # ix[:, slice_obj]
  3248. class NDFrameSplitter(DataSplitter):
  3249. def __init__(self, data, labels, ngroups, axis=0):
  3250. super(NDFrameSplitter, self).__init__(data, labels, ngroups, axis=axis)
  3251. self.factory = data._constructor
  3252. def _get_sorted_data(self):
  3253. # this is the BlockManager
  3254. data = self.data._data
  3255. # this is sort of wasteful but...
  3256. sorted_axis = data.axes[self.axis].take(self.sort_idx)
  3257. sorted_data = data.reindex_axis(sorted_axis, axis=self.axis)
  3258. return sorted_data
  3259. def _chop(self, sdata, slice_obj):
  3260. return self.factory(sdata.get_slice(slice_obj, axis=self.axis))
  3261. def get_splitter(data, *args, **kwargs):
  3262. if isinstance(data, Series):
  3263. klass = SeriesSplitter
  3264. elif isinstance(data, DataFrame):
  3265. klass = FrameSplitter
  3266. else:
  3267. klass = NDFrameSplitter
  3268. return klass(data, *args, **kwargs)
  3269. # ----------------------------------------------------------------------
  3270. # Misc utilities
  3271. def get_group_index(labels, shape, sort, xnull):
  3272. """
  3273. For the particular label_list, gets the offsets into the hypothetical list
  3274. representing the totally ordered cartesian product of all possible label
  3275. combinations, *as long as* this space fits within int64 bounds;
  3276. otherwise, though group indices identify unique combinations of
  3277. labels, they cannot be deconstructed.
3278. - If `sort`, the ranks of the returned ids preserve the lexical ranks
3279. of the labels, i.e. the returned ids can be used to lexically sort the labels;
3280. - If `xnull`, nulls (-1 labels) are passed through.
  3281. Parameters
  3282. ----------
  3283. labels: sequence of arrays
  3284. Integers identifying levels at each location
  3285. shape: sequence of ints same length as labels
  3286. Number of unique levels at each location
  3287. sort: boolean
  3288. If the ranks of returned ids should match lexical ranks of labels
  3289. xnull: boolean
3290. If true, nulls are excluded, i.e. -1 values in the labels are
3291. passed through to the result unchanged
  3292. Returns
  3293. -------
  3294. An array of type int64 where two elements are equal if their corresponding
3295. labels are equal at all locations.
  3296. """
  3297. def _int64_cut_off(shape):
  3298. acc = long(1)
  3299. for i, mul in enumerate(shape):
  3300. acc *= long(mul)
  3301. if not acc < _INT64_MAX:
  3302. return i
  3303. return len(shape)
  3304. def loop(labels, shape):
  3305. # how many levels can be done without overflow:
  3306. nlev = _int64_cut_off(shape)
  3307. # compute flat ids for the first `nlev` levels
  3308. stride = np.prod(shape[1:nlev], dtype='i8')
  3309. out = stride * labels[0].astype('i8', subok=False, copy=False)
  3310. for i in range(1, nlev):
  3311. stride //= shape[i]
  3312. out += labels[i] * stride
  3313. if xnull: # exclude nulls
  3314. mask = labels[0] == -1
  3315. for lab in labels[1:nlev]:
  3316. mask |= lab == -1
  3317. out[mask] = -1
  3318. if nlev == len(shape): # all levels done!
  3319. return out
  3320. # compress what has been done so far in order to avoid overflow
  3321. # to retain lexical ranks, obs_ids should be sorted
  3322. comp_ids, obs_ids = _compress_group_index(out, sort=sort)
  3323. labels = [comp_ids] + labels[nlev:]
  3324. shape = [len(obs_ids)] + shape[nlev:]
  3325. return loop(labels, shape)
3326. def maybe_lift(lab, size): # promote nan values (shift labels so -1 becomes 0)
  3327. return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
  3328. labels = map(_ensure_int64, labels)
  3329. if not xnull:
  3330. labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
  3331. return loop(list(labels), list(shape))
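# What `loop` computes for a small case, sketched directly with numpy
# (values invented); each id is an offset into the cartesian product of
# levels, exactly like np.ravel_multi_index:
#
# >>> import numpy as np
# >>> labels = [np.array([0, 1, 1]), np.array([0, 2, 1])]
# >>> shape = [2, 3]
# >>> labels[0] * 3 + labels[1]            # stride = prod(shape[1:])
# array([0, 5, 4])
# >>> np.ravel_multi_index(labels, shape)  # numpy's equivalent
# array([0, 5, 4])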
  3332. _INT64_MAX = np.iinfo(np.int64).max
  3333. def _int64_overflow_possible(shape):
  3334. the_prod = long(1)
  3335. for x in shape:
  3336. the_prod *= long(x)
  3337. return the_prod >= _INT64_MAX
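# For scale: a 5-level key with 10,000 uniques per level spans
# 10000 ** 5 = 1e20 combinations, well past _INT64_MAX (~9.22e18),
# so the flat offsets above could not represent that space directly.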
  3338. def decons_group_index(comp_labels, shape):
  3339. # reconstruct labels
  3340. if _int64_overflow_possible(shape):
  3341. # at some point group indices are factorized,
  3342. # and may not be deconstructed here! wrong path!
  3343. raise ValueError('cannot deconstruct factorized group indices!')
  3344. label_list = []
  3345. factor = 1
  3346. y = 0
  3347. x = comp_labels
  3348. for i in reversed(range(len(shape))):
  3349. labels = (x - y) % (factor * shape[i]) // factor
  3350. np.putmask(labels, comp_labels < 0, -1)
  3351. label_list.append(labels)
  3352. y = labels * factor
  3353. factor *= shape[i]
  3354. return label_list[::-1]
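# A hedged round-trip check against the offsets from the
# get_group_index sketch above (values invented):
#
# >>> decons_group_index(np.array([0, 5, 4]), [2, 3])
# [array([0, 1, 1]), array([0, 2, 1])]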
  3355. def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
  3356. """
  3357. reconstruct labels from observed group ids
  3358. Parameters
  3359. ----------
  3360. xnull: boolean,
3361. if true, nulls were excluded, i.e. -1 labels were passed through
  3362. """
  3363. from pandas.hashtable import unique_label_indices
  3364. if not xnull:
  3365. lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
  3366. shape = np.asarray(shape, dtype='i8') + lift
  3367. if not _int64_overflow_possible(shape):
  3368. # obs ids are deconstructable! take the fast route!
  3369. out = decons_group_index(obs_ids, shape)
  3370. return out if xnull or not lift.any() \
  3371. else [x - y for x, y in zip(out, lift)]
  3372. i = unique_label_indices(comp_ids)
  3373. i8copy = lambda a: a.astype('i8', subok=False, copy=True)
  3374. return [i8copy(lab[i]) for lab in labels]
  3375. def _indexer_from_factorized(labels, shape, compress=True):
  3376. ids = get_group_index(labels, shape, sort=True, xnull=False)
  3377. if not compress:
  3378. ngroups = (ids.size and ids.max()) + 1
  3379. else:
  3380. ids, obs = _compress_group_index(ids, sort=True)
  3381. ngroups = len(obs)
  3382. return _get_group_index_sorter(ids, ngroups)
  3383. def _lexsort_indexer(keys, orders=None, na_position='last'):
  3384. labels = []
  3385. shape = []
  3386. if isinstance(orders, bool):
  3387. orders = [orders] * len(keys)
  3388. elif orders is None:
  3389. orders = [True] * len(keys)
  3390. for key, order in zip(keys, orders):
  3391. # we are already a Categorical
  3392. if is_categorical_dtype(key):
  3393. c = key
  3394. # create the Categorical
  3395. else:
  3396. c = Categorical(key, ordered=True)
  3397. if na_position not in ['last', 'first']:
  3398. raise ValueError('invalid na_position: {!r}'.format(na_position))
  3399. n = len(c.categories)
  3400. codes = c.codes.copy()
  3401. mask = (c.codes == -1)
  3402. if order: # ascending
  3403. if na_position == 'last':
  3404. codes = np.where(mask, n, codes)
  3405. elif na_position == 'first':
  3406. codes += 1
  3407. else: # not order means descending
  3408. if na_position == 'last':
  3409. codes = np.where(mask, n, n - codes - 1)
  3410. elif na_position == 'first':
  3411. codes = np.where(mask, 0, n - codes)
  3412. if mask.any():
  3413. n += 1
  3414. shape.append(n)
  3415. labels.append(codes)
  3416. return _indexer_from_factorized(labels, shape)
  3417. def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
  3418. """
  3419. This is intended to be a drop-in replacement for np.argsort which
  3420. handles NaNs. It adds ascending and na_position parameters.
  3421. GH #6399, #5231
  3422. """
  3423. # specially handle Categorical
  3424. if is_categorical_dtype(items):
  3425. return items.argsort(ascending=ascending)
  3426. items = np.asanyarray(items)
  3427. idx = np.arange(len(items))
  3428. mask = isnull(items)
  3429. non_nans = items[~mask]
  3430. non_nan_idx = idx[~mask]
  3431. nan_idx = np.nonzero(mask)[0]
  3432. if not ascending:
  3433. non_nans = non_nans[::-1]
  3434. non_nan_idx = non_nan_idx[::-1]
  3435. indexer = non_nan_idx[non_nans.argsort(kind=kind)]
  3436. if not ascending:
  3437. indexer = indexer[::-1]
  3438. # Finally, place the NaNs at the end or the beginning according to
  3439. # na_position
  3440. if na_position == 'last':
  3441. indexer = np.concatenate([indexer, nan_idx])
  3442. elif na_position == 'first':
  3443. indexer = np.concatenate([nan_idx, indexer])
  3444. else:
  3445. raise ValueError('invalid na_position: {!r}'.format(na_position))
  3446. return indexer
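# A hedged illustration of _nargsort on data containing NaN (values
# invented); plain np.argsort has no na_position handling:
#
# >>> x = np.array([3.0, np.nan, 1.0])
# >>> _nargsort(x)                        # NaN placed last by default
# array([2, 0, 1])
# >>> _nargsort(x, na_position='first')
# array([1, 2, 0])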
  3447. class _KeyMapper(object):
  3448. """
  3449. Ease my suffering. Map compressed group id -> key tuple
  3450. """
  3451. def __init__(self, comp_ids, ngroups, labels, levels):
  3452. self.levels = levels
  3453. self.labels = labels
  3454. self.comp_ids = comp_ids.astype(np.int64)
  3455. self.k = len(labels)
  3456. self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)]
  3457. self._populate_tables()
  3458. def _populate_tables(self):
  3459. for labs, table in zip(self.labels, self.tables):
  3460. table.map(self.comp_ids, labs.astype(np.int64))
  3461. def get_key(self, comp_id):
  3462. return tuple(level[table.get_item(comp_id)]
  3463. for table, level in zip(self.tables, self.levels))
  3464. def _get_indices_dict(label_list, keys):
  3465. shape = list(map(len, keys))
  3466. group_index = get_group_index(label_list, shape, sort=True, xnull=True)
  3467. ngroups = ((group_index.size and group_index.max()) + 1) \
  3468. if _int64_overflow_possible(shape) \
  3469. else np.prod(shape, dtype='i8')
  3470. sorter = _get_group_index_sorter(group_index, ngroups)
  3471. sorted_labels = [lab.take(sorter) for lab in label_list]
  3472. group_index = group_index.take(sorter)
  3473. return lib.indices_fast(sorter, group_index, keys, sorted_labels)
  3474. # ----------------------------------------------------------------------
  3475. # sorting levels...cleverly?
  3476. def _get_group_index_sorter(group_index, ngroups):
  3477. """
  3478. _algos.groupsort_indexer implements `counting sort` and it is at least
  3479. O(ngroups), where
  3480. ngroups = prod(shape)
  3481. shape = map(len, keys)
  3482. that is, linear in the number of combinations (cartesian product) of unique
  3483. values of groupby keys. This can be huge when doing multi-key groupby.
3484. np.argsort(kind='mergesort') is O(count * log(count)) where count is the
3485. length of the DataFrame;
3486. Both algorithms are stable sorts, which is necessary for the
3487. correctness of groupby operations, e.g. consider:
3488. df.groupby(key)[col].transform('first')
  3489. """
  3490. count = len(group_index)
  3491. alpha = 0.0 # taking complexities literally; there may be
  3492. beta = 1.0 # some room for fine-tuning these parameters
  3493. if alpha + beta * ngroups < count * np.log(count):
  3494. sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
  3495. ngroups)
  3496. return _ensure_platform_int(sorter)
  3497. else:
  3498. return group_index.argsort(kind='mergesort')
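# Putting numbers on the cutover above: with count = 1,000,000 rows,
# count * log(count) is about 1.38e7, so the O(ngroups) counting sort
# is preferred whenever ngroups stays below roughly 13.8 million
# distinct key combinations (with alpha = 0.0, beta = 1.0 as set above).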
  3499. def _compress_group_index(group_index, sort=True):
  3500. """
  3501. Group_index is offsets into cartesian product of all possible labels. This
  3502. space can be huge, so this function compresses it, by computing offsets
  3503. (comp_ids) into the list of unique labels (obs_group_ids).
  3504. """
  3505. size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT)
  3506. table = _hash.Int64HashTable(size_hint)
  3507. group_index = _ensure_int64(group_index)
  3508. # note, group labels come out ascending (ie, 1,2,3 etc)
  3509. comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
  3510. if sort and len(obs_group_ids) > 0:
  3511. obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
  3512. return comp_ids, obs_group_ids
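# Hedged sketch of the compression (offsets invented); comp_ids index
# into the sorted unique ids returned as obs_group_ids:
#
# >>> _compress_group_index(np.array([10, 3, 10, 7]))
# (array([2, 0, 2, 1]), array([ 3,  7, 10]))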
  3513. def _reorder_by_uniques(uniques, labels):
  3514. # sorter is index where elements ought to go
  3515. sorter = uniques.argsort()
  3516. # reverse_indexer is where elements came from
  3517. reverse_indexer = np.empty(len(sorter), dtype=np.int64)
  3518. reverse_indexer.put(sorter, np.arange(len(sorter)))
  3519. mask = labels < 0
  3520. # move labels to right locations (ie, unsort ascending labels)
  3521. labels = algos.take_nd(reverse_indexer, labels, allow_fill=False)
  3522. np.putmask(labels, mask, -1)
  3523. # sort observed ids
  3524. uniques = algos.take_nd(uniques, sorter, allow_fill=False)
  3525. return uniques, labels
  3526. def _groupby_indices(values):
  3527. if is_categorical_dtype(values):
  3528. # we have a categorical, so we can do quite a bit
3529. # better than factorizing again
  3530. reverse = dict(enumerate(values.categories))
  3531. codes = values.codes.astype('int64')
  3532. _, counts = _hash.value_count_int64(codes, False)
  3533. else:
  3534. reverse, codes, counts = _algos.group_labels(
  3535. _values_from_object(_ensure_object(values)))
  3536. return _algos.groupby_indices(reverse, codes, counts)
  3537. def numpy_groupby(data, labels, axis=0):
  3538. s = np.argsort(labels)
  3539. keys, inv = np.unique(labels, return_inverse=True)
  3540. i = inv.take(s)
  3541. groups_at = np.where(i != np.concatenate(([-1], i[:-1])))[0]
  3542. ordered_data = data.take(s, axis=axis)
  3543. group_sums = np.add.reduceat(ordered_data, groups_at, axis=axis)
  3544. return group_sums
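# Hedged usage of the reduceat-based path above (data invented):
#
# >>> data = np.array([1.0, 2.0, 3.0, 4.0])
# >>> labels = np.array([1, 0, 1, 0])
# >>> numpy_groupby(data, labels)   # sums per sorted unique label [0, 1]
# array([ 6.,  4.])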