PageRenderTime 51ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/core/groupby.py

http://github.com/wesm/pandas
Python | 4441 lines | 4436 code | 5 blank | 0 comment | 14 complexity | 18d0687b836be8d203e1d5948ec00b74 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. import types
  2. from functools import wraps
  3. import numpy as np
  4. import datetime
  5. import collections
  6. import warnings
  7. import copy
  8. from pandas.compat import(
  9. zip, range, long, lzip,
  10. callable, map
  11. )
  12. from pandas import compat
  13. from pandas.compat.numpy import function as nv
  14. from pandas.compat.numpy import _np_version_under1p8
  15. from pandas.types.common import (_DATELIKE_DTYPES,
  16. is_numeric_dtype,
  17. is_timedelta64_dtype, is_datetime64_dtype,
  18. is_categorical_dtype,
  19. is_datetime_or_timedelta_dtype,
  20. is_bool, is_integer_dtype,
  21. is_complex_dtype,
  22. is_bool_dtype,
  23. is_scalar,
  24. _ensure_float64,
  25. _ensure_platform_int,
  26. _ensure_int64,
  27. _ensure_object,
  28. _ensure_float)
  29. from pandas.types.cast import _possibly_downcast_to_dtype
  30. from pandas.types.missing import isnull, notnull, _maybe_fill
  31. from pandas.core.common import _values_from_object, AbstractMethodError
  32. from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
  33. DataError, SpecificationError)
  34. from pandas.core.categorical import Categorical
  35. from pandas.core.frame import DataFrame
  36. from pandas.core.generic import NDFrame
  37. from pandas.core.index import (Index, MultiIndex, CategoricalIndex,
  38. _ensure_index)
  39. from pandas.core.internals import BlockManager, make_block
  40. from pandas.core.series import Series
  41. from pandas.core.panel import Panel
  42. from pandas.util.decorators import (cache_readonly, Substitution, Appender,
  43. make_signature, deprecate_kwarg)
  44. from pandas.formats.printing import pprint_thing
  45. from pandas.util.validators import validate_kwargs
  46. import pandas.core.algorithms as algos
  47. import pandas.core.common as com
  48. from pandas.core.config import option_context
  49. import pandas.lib as lib
  50. from pandas.lib import Timestamp
  51. import pandas.tslib as tslib
  52. import pandas.algos as _algos
  53. import pandas.hashtable as _hash
  54. _doc_template = """
  55. See also
  56. --------
  57. pandas.Series.%(name)s
  58. pandas.DataFrame.%(name)s
  59. pandas.Panel.%(name)s
  60. """
  61. # special case to prevent duplicate plots when catching exceptions when
  62. # forwarding methods from NDFrames
  63. _plotting_methods = frozenset(['plot', 'boxplot', 'hist'])
  64. _common_apply_whitelist = frozenset([
  65. 'last', 'first',
  66. 'head', 'tail', 'median',
  67. 'mean', 'sum', 'min', 'max',
  68. 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
  69. 'resample',
  70. 'describe',
  71. 'rank', 'quantile',
  72. 'fillna',
  73. 'mad',
  74. 'any', 'all',
  75. 'take',
  76. 'idxmax', 'idxmin',
  77. 'shift', 'tshift',
  78. 'ffill', 'bfill',
  79. 'pct_change', 'skew',
  80. 'corr', 'cov', 'diff',
  81. ]) | _plotting_methods
  82. _series_apply_whitelist = \
  83. (_common_apply_whitelist - set(['boxplot'])) | \
  84. frozenset(['dtype', 'unique'])
  85. _dataframe_apply_whitelist = \
  86. _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
  87. _cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
  88. def _groupby_function(name, alias, npfunc, numeric_only=True,
  89. _convert=False):
  90. _local_template = "Compute %(f)s of group values"
  91. @Substitution(name='groupby', f=name)
  92. @Appender(_doc_template)
  93. @Appender(_local_template)
  94. def f(self):
  95. self._set_group_selection()
  96. try:
  97. return self._cython_agg_general(alias, numeric_only=numeric_only)
  98. except AssertionError as e:
  99. raise SpecificationError(str(e))
  100. except Exception:
  101. result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
  102. if _convert:
  103. result = result._convert(datetime=True)
  104. return result
  105. f.__name__ = name
  106. return f
  107. def _first_compat(x, axis=0):
  108. def _first(x):
  109. x = np.asarray(x)
  110. x = x[notnull(x)]
  111. if len(x) == 0:
  112. return np.nan
  113. return x[0]
  114. if isinstance(x, DataFrame):
  115. return x.apply(_first, axis=axis)
  116. else:
  117. return _first(x)
  118. def _last_compat(x, axis=0):
  119. def _last(x):
  120. x = np.asarray(x)
  121. x = x[notnull(x)]
  122. if len(x) == 0:
  123. return np.nan
  124. return x[-1]
  125. if isinstance(x, DataFrame):
  126. return x.apply(_last, axis=axis)
  127. else:
  128. return _last(x)
  129. class Grouper(object):
  130. """
  131. A Grouper allows the user to specify a groupby instruction for a target
  132. object
  133. This specification will select a column via the key parameter, or if the
  134. level and/or axis parameters are given, a level of the index of the target
  135. object.
  136. These are local specifications and will override 'global' settings,
  137. that is the parameters axis and level which are passed to the groupby
  138. itself.
  139. Parameters
  140. ----------
  141. key : string, defaults to None
  142. groupby key, which selects the grouping column of the target
  143. level : name/number, defaults to None
  144. the level for the target index
  145. freq : string / frequency object, defaults to None
  146. This will groupby the specified frequency if the target selection
  147. (via key or level) is a datetime-like object. For full specification
  148. of available frequencies, please see
  149. `here <http://pandas.pydata.org/pandas-docs/stable/timeseries.html>`_.
  150. axis : number/name of the axis, defaults to 0
  151. sort : boolean, default to False
  152. whether to sort the resulting labels
  153. additional kwargs to control time-like groupers (when freq is passed)
  154. closed : closed end of interval; left or right
  155. label : interval boundary to use for labeling; left or right
  156. convention : {'start', 'end', 'e', 's'}
  157. If grouper is PeriodIndex
  158. Returns
  159. -------
  160. A specification for a groupby instruction
  161. Examples
  162. --------
  163. Syntactic sugar for ``df.groupby('A')``
  164. >>> df.groupby(Grouper(key='A'))
  165. Specify a resample operation on the column 'date'
  166. >>> df.groupby(Grouper(key='date', freq='60s'))
  167. Specify a resample operation on the level 'date' on the columns axis
  168. with a frequency of 60s
  169. >>> df.groupby(Grouper(level='date', freq='60s', axis=1))
  170. """
  171. def __new__(cls, *args, **kwargs):
  172. if kwargs.get('freq') is not None:
  173. from pandas.tseries.resample import TimeGrouper
  174. cls = TimeGrouper
  175. return super(Grouper, cls).__new__(cls)
  176. def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
  177. self.key = key
  178. self.level = level
  179. self.freq = freq
  180. self.axis = axis
  181. self.sort = sort
  182. self.grouper = None
  183. self.obj = None
  184. self.indexer = None
  185. self.binner = None
  186. @property
  187. def ax(self):
  188. return self.grouper
  189. def _get_grouper(self, obj):
  190. """
  191. Parameters
  192. ----------
  193. obj : the subject object
  194. Returns
  195. -------
  196. a tuple of binner, grouper, obj (possibly sorted)
  197. """
  198. self._set_grouper(obj)
  199. self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
  200. axis=self.axis,
  201. level=self.level,
  202. sort=self.sort)
  203. return self.binner, self.grouper, self.obj
  204. def _set_grouper(self, obj, sort=False):
  205. """
  206. given an object and the specifications, setup the internal grouper
  207. for this particular specification
  208. Parameters
  209. ----------
  210. obj : the subject object
  211. """
  212. if self.key is not None and self.level is not None:
  213. raise ValueError(
  214. "The Grouper cannot specify both a key and a level!")
  215. # the key must be a valid info item
  216. if self.key is not None:
  217. key = self.key
  218. if key not in obj._info_axis:
  219. raise KeyError("The grouper name {0} is not found".format(key))
  220. ax = Index(obj[key], name=key)
  221. else:
  222. ax = obj._get_axis(self.axis)
  223. if self.level is not None:
  224. level = self.level
  225. # if a level is given it must be a mi level or
  226. # equivalent to the axis name
  227. if isinstance(ax, MultiIndex):
  228. level = ax._get_level_number(level)
  229. ax = Index(ax.get_level_values(
  230. level), name=ax.names[level])
  231. else:
  232. if level not in (0, ax.name):
  233. raise ValueError(
  234. "The level {0} is not valid".format(level))
  235. # possibly sort
  236. if (self.sort or sort) and not ax.is_monotonic:
  237. # use stable sort to support first, last, nth
  238. indexer = self.indexer = ax.argsort(kind='mergesort')
  239. ax = ax.take(indexer)
  240. obj = obj.take(indexer, axis=self.axis,
  241. convert=False, is_copy=False)
  242. self.obj = obj
  243. self.grouper = ax
  244. return self.grouper
  245. def _get_binner_for_grouping(self, obj):
  246. """ default to the standard binner here """
  247. group_axis = obj._get_axis(self.axis)
  248. return Grouping(group_axis, None, obj=obj, name=self.key,
  249. level=self.level, sort=self.sort, in_axis=False)
  250. @property
  251. def groups(self):
  252. return self.grouper.groups
  253. class GroupByPlot(PandasObject):
  254. """
  255. Class implementing the .plot attribute for groupby objects
  256. """
  257. def __init__(self, groupby):
  258. self._groupby = groupby
  259. def __call__(self, *args, **kwargs):
  260. def f(self):
  261. return self.plot(*args, **kwargs)
  262. f.__name__ = 'plot'
  263. return self._groupby.apply(f)
  264. def __getattr__(self, name):
  265. def attr(*args, **kwargs):
  266. def f(self):
  267. return getattr(self.plot, name)(*args, **kwargs)
  268. return self._groupby.apply(f)
  269. return attr
  270. class _GroupBy(PandasObject, SelectionMixin):
  271. _group_selection = None
  272. _apply_whitelist = frozenset([])
  273. def __init__(self, obj, keys=None, axis=0, level=None,
  274. grouper=None, exclusions=None, selection=None, as_index=True,
  275. sort=True, group_keys=True, squeeze=False, **kwargs):
  276. self._selection = selection
  277. if isinstance(obj, NDFrame):
  278. obj._consolidate_inplace()
  279. self.level = level
  280. if not as_index:
  281. if not isinstance(obj, DataFrame):
  282. raise TypeError('as_index=False only valid with DataFrame')
  283. if axis != 0:
  284. raise ValueError('as_index=False only valid for axis=0')
  285. self.as_index = as_index
  286. self.keys = keys
  287. self.sort = sort
  288. self.group_keys = group_keys
  289. self.squeeze = squeeze
  290. self.mutated = kwargs.pop('mutated', False)
  291. if grouper is None:
  292. grouper, exclusions, obj = _get_grouper(obj, keys,
  293. axis=axis,
  294. level=level,
  295. sort=sort,
  296. mutated=self.mutated)
  297. self.obj = obj
  298. self.axis = obj._get_axis_number(axis)
  299. self.grouper = grouper
  300. self.exclusions = set(exclusions) if exclusions else set()
  301. # we accept no other args
  302. validate_kwargs('group', kwargs, {})
  303. def __len__(self):
  304. return len(self.groups)
  305. def __unicode__(self):
  306. # TODO: Better unicode/repr for GroupBy object
  307. return object.__repr__(self)
  308. def _assure_grouper(self):
  309. """
  310. we create the grouper on instantiation
  311. sub-classes may have a different policy
  312. """
  313. pass
  314. @property
  315. def groups(self):
  316. """ dict {group name -> group labels} """
  317. self._assure_grouper()
  318. return self.grouper.groups
  319. @property
  320. def ngroups(self):
  321. self._assure_grouper()
  322. return self.grouper.ngroups
  323. @property
  324. def indices(self):
  325. """ dict {group name -> group indices} """
  326. self._assure_grouper()
  327. return self.grouper.indices
  328. def _get_indices(self, names):
  329. """
  330. safe get multiple indices, translate keys for
  331. datelike to underlying repr
  332. """
  333. def get_converter(s):
  334. # possibly convert to the actual key types
  335. # in the indices, could be a Timestamp or a np.datetime64
  336. if isinstance(s, (Timestamp, datetime.datetime)):
  337. return lambda key: Timestamp(key)
  338. elif isinstance(s, np.datetime64):
  339. return lambda key: Timestamp(key).asm8
  340. else:
  341. return lambda key: key
  342. if len(names) == 0:
  343. return []
  344. if len(self.indices) > 0:
  345. index_sample = next(iter(self.indices))
  346. else:
  347. index_sample = None # Dummy sample
  348. name_sample = names[0]
  349. if isinstance(index_sample, tuple):
  350. if not isinstance(name_sample, tuple):
  351. msg = ("must supply a tuple to get_group with multiple"
  352. " grouping keys")
  353. raise ValueError(msg)
  354. if not len(name_sample) == len(index_sample):
  355. try:
  356. # If the original grouper was a tuple
  357. return [self.indices[name] for name in names]
  358. except KeyError:
  359. # turns out it wasn't a tuple
  360. msg = ("must supply a a same-length tuple to get_group"
  361. " with multiple grouping keys")
  362. raise ValueError(msg)
  363. converters = [get_converter(s) for s in index_sample]
  364. names = [tuple([f(n) for f, n in zip(converters, name)])
  365. for name in names]
  366. else:
  367. converter = get_converter(index_sample)
  368. names = [converter(name) for name in names]
  369. return [self.indices.get(name, []) for name in names]
  370. def _get_index(self, name):
  371. """ safe get index, translate keys for datelike to underlying repr """
  372. return self._get_indices([name])[0]
  373. @cache_readonly
  374. def _selected_obj(self):
  375. if self._selection is None or isinstance(self.obj, Series):
  376. if self._group_selection is not None:
  377. return self.obj[self._group_selection]
  378. return self.obj
  379. else:
  380. return self.obj[self._selection]
  381. def _reset_group_selection(self):
  382. """
  383. Clear group based selection. Used for methods needing to return info on
  384. each group regardless of whether a group selection was previously set.
  385. """
  386. if self._group_selection is not None:
  387. self._group_selection = None
  388. # GH12839 clear cached selection too when changing group selection
  389. self._reset_cache('_selected_obj')
  390. def _set_group_selection(self):
  391. """
  392. Create group based selection. Used when selection is not passed
  393. directly but instead via a grouper.
  394. """
  395. grp = self.grouper
  396. if self.as_index and getattr(grp, 'groupings', None) is not None and \
  397. self.obj.ndim > 1:
  398. ax = self.obj._info_axis
  399. groupers = [g.name for g in grp.groupings
  400. if g.level is None and g.in_axis]
  401. if len(groupers):
  402. self._group_selection = ax.difference(Index(groupers)).tolist()
  403. # GH12839 clear selected obj cache when group selection changes
  404. self._reset_cache('_selected_obj')
  405. def _set_result_index_ordered(self, result):
  406. # set the result index on the passed values object and
  407. # return the new object, xref 8046
  408. # the values/counts are repeated according to the group index
  409. # shortcut if we have an already ordered grouper
  410. if not self.grouper.is_monotonic:
  411. index = Index(np.concatenate(
  412. self._get_indices(self.grouper.result_index)))
  413. result.set_axis(self.axis, index)
  414. result = result.sort_index(axis=self.axis)
  415. result.set_axis(self.axis, self.obj._get_axis(self.axis))
  416. return result
  417. def _dir_additions(self):
  418. return self.obj._dir_additions() | self._apply_whitelist
  419. def __getattr__(self, attr):
  420. if attr in self._internal_names_set:
  421. return object.__getattribute__(self, attr)
  422. if attr in self.obj:
  423. return self[attr]
  424. if hasattr(self.obj, attr):
  425. return self._make_wrapper(attr)
  426. raise AttributeError("%r object has no attribute %r" %
  427. (type(self).__name__, attr))
  428. plot = property(GroupByPlot)
  429. def _make_wrapper(self, name):
  430. if name not in self._apply_whitelist:
  431. is_callable = callable(getattr(self._selected_obj, name, None))
  432. kind = ' callable ' if is_callable else ' '
  433. msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
  434. "using the 'apply' method".format(kind, name,
  435. type(self).__name__))
  436. raise AttributeError(msg)
  437. # need to setup the selection
  438. # as are not passed directly but in the grouper
  439. self._set_group_selection()
  440. f = getattr(self._selected_obj, name)
  441. if not isinstance(f, types.MethodType):
  442. return self.apply(lambda self: getattr(self, name))
  443. f = getattr(type(self._selected_obj), name)
  444. def wrapper(*args, **kwargs):
  445. # a little trickery for aggregation functions that need an axis
  446. # argument
  447. kwargs_with_axis = kwargs.copy()
  448. if 'axis' not in kwargs_with_axis or \
  449. kwargs_with_axis['axis'] is None:
  450. kwargs_with_axis['axis'] = self.axis
  451. def curried_with_axis(x):
  452. return f(x, *args, **kwargs_with_axis)
  453. def curried(x):
  454. return f(x, *args, **kwargs)
  455. # preserve the name so we can detect it when calling plot methods,
  456. # to avoid duplicates
  457. curried.__name__ = curried_with_axis.__name__ = name
  458. # special case otherwise extra plots are created when catching the
  459. # exception below
  460. if name in _plotting_methods:
  461. return self.apply(curried)
  462. try:
  463. return self.apply(curried_with_axis)
  464. except Exception:
  465. try:
  466. return self.apply(curried)
  467. except Exception:
  468. # related to : GH3688
  469. # try item-by-item
  470. # this can be called recursively, so need to raise
  471. # ValueError
  472. # if we don't have this method to indicated to aggregate to
  473. # mark this column as an error
  474. try:
  475. return self._aggregate_item_by_item(name,
  476. *args, **kwargs)
  477. except (AttributeError):
  478. raise ValueError
  479. return wrapper
  480. def get_group(self, name, obj=None):
  481. """
  482. Constructs NDFrame from group with provided name
  483. Parameters
  484. ----------
  485. name : object
  486. the name of the group to get as a DataFrame
  487. obj : NDFrame, default None
  488. the NDFrame to take the DataFrame out of. If
  489. it is None, the object groupby was called on will
  490. be used
  491. Returns
  492. -------
  493. group : type of obj
  494. """
  495. if obj is None:
  496. obj = self._selected_obj
  497. inds = self._get_index(name)
  498. if not len(inds):
  499. raise KeyError(name)
  500. return obj.take(inds, axis=self.axis, convert=False)
  501. def __iter__(self):
  502. """
  503. Groupby iterator
  504. Returns
  505. -------
  506. Generator yielding sequence of (name, subsetted object)
  507. for each group
  508. """
  509. return self.grouper.get_iterator(self.obj, axis=self.axis)
  510. @Substitution(name='groupby')
  511. def apply(self, func, *args, **kwargs):
  512. """
  513. Apply function and combine results together in an intelligent way. The
  514. split-apply-combine combination rules attempt to be as common sense
  515. based as possible. For example:
  516. case 1:
  517. group DataFrame
  518. apply aggregation function (f(chunk) -> Series)
  519. yield DataFrame, with group axis having group labels
  520. case 2:
  521. group DataFrame
  522. apply transform function ((f(chunk) -> DataFrame with same indexes)
  523. yield DataFrame with resulting chunks glued together
  524. case 3:
  525. group Series
  526. apply function with f(chunk) -> DataFrame
  527. yield DataFrame with result of chunks glued together
  528. Parameters
  529. ----------
  530. func : function
  531. Notes
  532. -----
  533. See online documentation for full exposition on how to use apply.
  534. In the current implementation apply calls func twice on the
  535. first group to decide whether it can take a fast or slow code
  536. path. This can lead to unexpected behavior if func has
  537. side-effects, as they will take effect twice for the first
  538. group.
  539. See also
  540. --------
  541. aggregate, transform"""
  542. func = self._is_builtin_func(func)
  543. # this is needed so we don't try and wrap strings. If we could
  544. # resolve functions to their callable functions prior, this
  545. # wouldn't be needed
  546. if args or kwargs:
  547. if callable(func):
  548. @wraps(func)
  549. def f(g):
  550. return func(g, *args, **kwargs)
  551. else:
  552. raise ValueError('func must be a callable if args or '
  553. 'kwargs are supplied')
  554. else:
  555. f = func
  556. # ignore SettingWithCopy here in case the user mutates
  557. with option_context('mode.chained_assignment', None):
  558. return self._python_apply_general(f)
  559. def _python_apply_general(self, f):
  560. keys, values, mutated = self.grouper.apply(f, self._selected_obj,
  561. self.axis)
  562. return self._wrap_applied_output(
  563. keys,
  564. values,
  565. not_indexed_same=mutated or self.mutated)
  566. def _iterate_slices(self):
  567. yield self.name, self._selected_obj
  568. def transform(self, func, *args, **kwargs):
  569. raise AbstractMethodError(self)
  570. def _cumcount_array(self, ascending=True):
  571. """
  572. Parameters
  573. ----------
  574. ascending : bool, default True
  575. If False, number in reverse, from length of group - 1 to 0.
  576. Note
  577. ----
  578. this is currently implementing sort=False
  579. (though the default is sort=True) for groupby in general
  580. """
  581. ids, _, ngroups = self.grouper.group_info
  582. sorter = _get_group_index_sorter(ids, ngroups)
  583. ids, count = ids[sorter], len(ids)
  584. if count == 0:
  585. return np.empty(0, dtype=np.int64)
  586. run = np.r_[True, ids[:-1] != ids[1:]]
  587. rep = np.diff(np.r_[np.nonzero(run)[0], count])
  588. out = (~run).cumsum()
  589. if ascending:
  590. out -= np.repeat(out[run], rep)
  591. else:
  592. out = np.repeat(out[np.r_[run[1:], True]], rep) - out
  593. rev = np.empty(count, dtype=np.intp)
  594. rev[sorter] = np.arange(count, dtype=np.intp)
  595. return out[rev].astype(np.int64, copy=False)
  596. def _index_with_as_index(self, b):
  597. """
  598. Take boolean mask of index to be returned from apply, if as_index=True
  599. """
  600. # TODO perf, it feels like this should already be somewhere...
  601. from itertools import chain
  602. original = self._selected_obj.index
  603. gp = self.grouper
  604. levels = chain((gp.levels[i][gp.labels[i][b]]
  605. for i in range(len(gp.groupings))),
  606. (original.get_level_values(i)[b]
  607. for i in range(original.nlevels)))
  608. new = MultiIndex.from_arrays(list(levels))
  609. new.names = gp.names + original.names
  610. return new
  611. def _try_cast(self, result, obj):
  612. """
  613. try to cast the result to our obj original type,
  614. we may have roundtripped thru object in the mean-time
  615. """
  616. if obj.ndim > 1:
  617. dtype = obj.values.dtype
  618. else:
  619. dtype = obj.dtype
  620. if not is_scalar(result):
  621. result = _possibly_downcast_to_dtype(result, dtype)
  622. return result
  623. def _cython_transform(self, how, numeric_only=True):
  624. output = {}
  625. for name, obj in self._iterate_slices():
  626. is_numeric = is_numeric_dtype(obj.dtype)
  627. if numeric_only and not is_numeric:
  628. continue
  629. try:
  630. result, names = self.grouper.transform(obj.values, how)
  631. except AssertionError as e:
  632. raise GroupByError(str(e))
  633. output[name] = self._try_cast(result, obj)
  634. if len(output) == 0:
  635. raise DataError('No numeric types to aggregate')
  636. return self._wrap_transformed_output(output, names)
  637. def _cython_agg_general(self, how, numeric_only=True):
  638. output = {}
  639. for name, obj in self._iterate_slices():
  640. is_numeric = is_numeric_dtype(obj.dtype)
  641. if numeric_only and not is_numeric:
  642. continue
  643. try:
  644. result, names = self.grouper.aggregate(obj.values, how)
  645. except AssertionError as e:
  646. raise GroupByError(str(e))
  647. output[name] = self._try_cast(result, obj)
  648. if len(output) == 0:
  649. raise DataError('No numeric types to aggregate')
  650. return self._wrap_aggregated_output(output, names)
  651. def _python_agg_general(self, func, *args, **kwargs):
  652. func = self._is_builtin_func(func)
  653. f = lambda x: func(x, *args, **kwargs)
  654. # iterate through "columns" ex exclusions to populate output dict
  655. output = {}
  656. for name, obj in self._iterate_slices():
  657. try:
  658. result, counts = self.grouper.agg_series(obj, f)
  659. output[name] = self._try_cast(result, obj)
  660. except TypeError:
  661. continue
  662. if len(output) == 0:
  663. return self._python_apply_general(f)
  664. if self.grouper._filter_empty_groups:
  665. mask = counts.ravel() > 0
  666. for name, result in compat.iteritems(output):
  667. # since we are masking, make sure that we have a float object
  668. values = result
  669. if is_numeric_dtype(values.dtype):
  670. values = _ensure_float(values)
  671. output[name] = self._try_cast(values[mask], result)
  672. return self._wrap_aggregated_output(output)
  673. def _wrap_applied_output(self, *args, **kwargs):
  674. raise AbstractMethodError(self)
  675. def _concat_objects(self, keys, values, not_indexed_same=False):
  676. from pandas.tools.merge import concat
  677. def reset_identity(values):
  678. # reset the identities of the components
  679. # of the values to prevent aliasing
  680. for v in values:
  681. if v is not None:
  682. ax = v._get_axis(self.axis)
  683. ax._reset_identity()
  684. return values
  685. if not not_indexed_same:
  686. result = concat(values, axis=self.axis)
  687. ax = self._selected_obj._get_axis(self.axis)
  688. if isinstance(result, Series):
  689. result = result.reindex(ax)
  690. else:
  691. result = result.reindex_axis(ax, axis=self.axis)
  692. elif self.group_keys:
  693. values = reset_identity(values)
  694. if self.as_index:
  695. # possible MI return case
  696. group_keys = keys
  697. group_levels = self.grouper.levels
  698. group_names = self.grouper.names
  699. result = concat(values, axis=self.axis, keys=group_keys,
  700. levels=group_levels, names=group_names)
  701. else:
  702. # GH5610, returns a MI, with the first level being a
  703. # range index
  704. keys = list(range(len(values)))
  705. result = concat(values, axis=self.axis, keys=keys)
  706. else:
  707. values = reset_identity(values)
  708. result = concat(values, axis=self.axis)
  709. if (isinstance(result, Series) and
  710. getattr(self, 'name', None) is not None):
  711. result.name = self.name
  712. return result
  713. def _apply_filter(self, indices, dropna):
  714. if len(indices) == 0:
  715. indices = np.array([], dtype='int64')
  716. else:
  717. indices = np.sort(np.concatenate(indices))
  718. if dropna:
  719. filtered = self._selected_obj.take(indices, axis=self.axis)
  720. else:
  721. mask = np.empty(len(self._selected_obj.index), dtype=bool)
  722. mask.fill(False)
  723. mask[indices.astype(int)] = True
  724. # mask fails to broadcast when passed to where; broadcast manually.
  725. mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
  726. filtered = self._selected_obj.where(mask) # Fill with NaNs.
  727. return filtered
  728. class GroupBy(_GroupBy):
  729. """
  730. Class for grouping and aggregating relational data. See aggregate,
  731. transform, and apply functions on this object.
  732. It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
  733. ::
  734. grouped = groupby(obj, ...)
  735. Parameters
  736. ----------
  737. obj : pandas object
  738. axis : int, default 0
  739. level : int, default None
  740. Level of MultiIndex
  741. groupings : list of Grouping objects
  742. Most users should ignore this
  743. exclusions : array-like, optional
  744. List of columns to exclude
  745. name : string
  746. Most users should ignore this
  747. Notes
  748. -----
  749. After grouping, see aggregate, apply, and transform functions. Here are
  750. some other brief notes about usage. When grouping by multiple groups, the
  751. result index will be a MultiIndex (hierarchical) by default.
  752. Iteration produces (key, group) tuples, i.e. chunking the data by group. So
  753. you can write code like:
  754. ::
  755. grouped = obj.groupby(keys, axis=axis)
  756. for key, group in grouped:
  757. # do something with the data
  758. Function calls on GroupBy, if not specially implemented, "dispatch" to the
  759. grouped data. So if you group a DataFrame and wish to invoke the std()
  760. method on each group, you can simply do:
  761. ::
  762. df.groupby(mapper).std()
  763. rather than
  764. ::
  765. df.groupby(mapper).aggregate(np.std)
  766. You can pass arguments to these "wrapped" functions, too.
  767. See the online documentation for full exposition on these topics and much
  768. more
  769. Returns
  770. -------
  771. **Attributes**
  772. groups : dict
  773. {group name -> group labels}
  774. len(grouped) : int
  775. Number of groups
  776. """
  777. _apply_whitelist = _common_apply_whitelist
  778. def irow(self, i):
  779. """
  780. DEPRECATED. Use ``.nth(i)`` instead
  781. """
  782. # 10177
  783. warnings.warn("irow(i) is deprecated. Please use .nth(i)",
  784. FutureWarning, stacklevel=2)
  785. return self.nth(i)
@Substitution(name='groupby')
@Appender(_doc_template)
def count(self):
    """Compute count of group, excluding missing values"""
    # Placeholder that exists only so the method shows up in the API docs.
    # NOTE(review): presumably overridden by concrete subclasses - confirm.
    raise NotImplementedError
  792. @Substitution(name='groupby')
  793. @Appender(_doc_template)
  794. def mean(self, *args, **kwargs):
  795. """
  796. Compute mean of groups, excluding missing values
  797. For multiple groupings, the result index will be a MultiIndex
  798. """
  799. nv.validate_groupby_func('mean', args, kwargs)
  800. try:
  801. return self._cython_agg_general('mean')
  802. except GroupByError:
  803. raise
  804. except Exception: # pragma: no cover
  805. self._set_group_selection()
  806. f = lambda x: x.mean(axis=self.axis)
  807. return self._python_agg_general(f)
  808. @Substitution(name='groupby')
  809. @Appender(_doc_template)
  810. def median(self):
  811. """
  812. Compute median of groups, excluding missing values
  813. For multiple groupings, the result index will be a MultiIndex
  814. """
  815. try:
  816. return self._cython_agg_general('median')
  817. except GroupByError:
  818. raise
  819. except Exception: # pragma: no cover
  820. self._set_group_selection()
  821. def f(x):
  822. if isinstance(x, np.ndarray):
  823. x = Series(x)
  824. return x.median(axis=self.axis)
  825. return self._python_agg_general(f)
  826. @Substitution(name='groupby')
  827. @Appender(_doc_template)
  828. def std(self, ddof=1, *args, **kwargs):
  829. """
  830. Compute standard deviation of groups, excluding missing values
  831. For multiple groupings, the result index will be a MultiIndex
  832. Parameters
  833. ----------
  834. ddof : integer, default 1
  835. degrees of freedom
  836. """
  837. # TODO: implement at Cython level?
  838. nv.validate_groupby_func('std', args, kwargs)
  839. return np.sqrt(self.var(ddof=ddof))
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def var(self, ddof=1, *args, **kwargs):
      """
      Compute the variance of each group, excluding missing values

      For multiple groupings, the result index will be a MultiIndex

      Parameters
      ----------
      ddof : integer, default 1
          degrees of freedom
      """
      nv.validate_groupby_func('var', args, kwargs)

      # only ddof == 1 goes through the cython aggregation
      if ddof == 1:
          return self._cython_agg_general('var')

      # any other ddof is computed group by group in python
      self._set_group_selection()

      def _group_var(values):
          return values.var(ddof=ddof)

      return self._python_agg_general(_group_var)
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def sem(self, ddof=1):
      """
      Compute the standard error of the mean of each group, excluding
      missing values

      For multiple groupings, the result index will be a MultiIndex

      Parameters
      ----------
      ddof : integer, default 1
          degrees of freedom
      """
      # group standard deviation divided by sqrt(group count)
      deviation = self.std(ddof=ddof)
      counts = self.count()
      return deviation / np.sqrt(counts)
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def size(self):
      """
      Compute group sizes

      The work is delegated to the ``size`` method of the internal grouper.
      """
      grouper = self.grouper
      return grouper.size()
  # Aggregation methods generated by ``_groupby_function`` (defined elsewhere
  # in this file, not visible in this section).
  # NOTE(review): the positional arguments appear to be (method name,
  # cython aggregation alias, numpy/python fallback) -- confirm against the
  # factory before relying on this.
  # min/max/first/last pass numeric_only=False; first/last additionally
  # pass _convert=True.
  sum = _groupby_function('sum', 'add', np.sum)
  prod = _groupby_function('prod', 'prod', np.prod)
  min = _groupby_function('min', 'min', np.min, numeric_only=False)
  max = _groupby_function('max', 'max', np.max, numeric_only=False)
  first = _groupby_function('first', 'first', _first_compat,
                            numeric_only=False, _convert=True)
  last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                           _convert=True)
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def ohlc(self):
      """
      Compute open, high, low and close values of each group, excluding
      missing values

      For multiple groupings, the result index will be a MultiIndex
      """
      # run the cython 'ohlc' aggregation on each column's groupby
      return self._apply_to_column_groupbys(
          lambda x: x._cython_agg_general('ohlc'))
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def resample(self, rule, *args, **kwargs):
      """
      Provide resampling when using a TimeGrouper

      Returns a new grouper with our resampler appended; ``rule`` and any
      extra arguments are forwarded to ``get_resampler_for_grouping``.
      """
      # imported inside the method, as in the original code
      from pandas.tseries.resample import get_resampler_for_grouping

      resampler = get_resampler_for_grouping(self, rule, *args, **kwargs)
      return resampler
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def rolling(self, *args, **kwargs):
      """
      Return a rolling grouper, providing rolling
      functionality per group

      All arguments are forwarded to ``RollingGroupby``.
      """
      # imported inside the method, as in the original code
      from pandas.core.window import RollingGroupby

      windowed = RollingGroupby(self, *args, **kwargs)
      return windowed
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def expanding(self, *args, **kwargs):
      """
      Return an expanding grouper, providing expanding
      functionality per group

      All arguments are forwarded to ``ExpandingGroupby``.
      """
      # imported inside the method, as in the original code
      from pandas.core.window import ExpandingGroupby

      windowed = ExpandingGroupby(self, *args, **kwargs)
      return windowed
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def pad(self, limit=None):
      """
      Forward fill the values within each group

      Parameters
      ----------
      limit : integer, optional
          limit of how many values to fill

      See Also
      --------
      Series.fillna
      DataFrame.fillna
      """
      def _fill_forward(group):
          return group.ffill(limit=limit)

      return self.apply(_fill_forward)
  ffill = pad
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def backfill(self, limit=None):
      """
      Backward fill the values within each group

      Parameters
      ----------
      limit : integer, optional
          limit of how many values to fill

      See Also
      --------
      Series.fillna
      DataFrame.fillna
      """
      def _fill_backward(group):
          return group.bfill(limit=limit)

      return self.apply(_fill_backward)
  bfill = backfill
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def nth(self, n, dropna=None):
      """
      Take the nth row from each group if n is an int, or a subset of rows
      if n is a list of ints.

      If dropna, will take the nth non-null row, dropna is either
      Truthy (if a Series) or 'all', 'any' (if a DataFrame);
      this is equivalent to calling dropna(how=dropna) before the
      groupby.

      Parameters
      ----------
      n : int or list of ints
          a single nth value for the row or a list of nth values
      dropna : None or str, optional
          apply the specified dropna operation before counting which row is
          the nth row. Needs to be None, 'any' or 'all'

      Raises
      ------
      TypeError
          if n is neither an int nor a list/set/tuple
      ValueError
          if dropna is given together with a list of nth values, or if the
          selected object is a DataFrame and a truthy dropna is not
          'any' or 'all'

      Examples
      --------

      >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
      ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
      >>> g = df.groupby('A')
      >>> g.nth(0)
           B
      A
      1  NaN
      2  3.0
      >>> g.nth(1)
           B
      A
      1  2.0
      2  5.0
      >>> g.nth(-1)
           B
      A
      1  4.0
      2  5.0
      >>> g.nth([0, 1])
           B
      A
      1  NaN
      1  2.0
      2  3.0
      2  5.0

      Specifying ``dropna`` allows count ignoring NaN

      >>> g.nth(0, dropna='any')
           B
      A
      1  2.0
      2  3.0

      NaNs denote group exhausted when using dropna

      >>> g.nth(3, dropna='any')
          B
      A
      1 NaN
      2 NaN

      Specifying ``as_index=False`` in ``groupby`` keeps the original index.

      >>> df.groupby('A', as_index=False).nth(1)
         A    B
      1  1  2.0
      4  2  5.0
      """
      # normalise n into a list of positions; a collection is de-duplicated
      if isinstance(n, int):
          nth_values = [n]
      elif isinstance(n, (set, list, tuple)):
          nth_values = list(set(n))
          if dropna is not None:
              raise ValueError(
                  "dropna option with a list of nth values is not supported")
      else:
          raise TypeError("n needs to be an int or a list/set/tuple of ints")

      nth_values = np.array(nth_values, dtype=np.intp)
      self._set_group_selection()

      if not dropna:
          # non-negative positions are matched against the ascending
          # within-group count; negative ones against the descending
          # count shifted by one (so -1 matches the last row)
          mask = np.in1d(self._cumcount_array(), nth_values) | \
              np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values)

          out = self._selected_obj[mask]
          if not self.as_index:
              return out

          # relabel the selected rows with their group keys
          ids, _, _ = self.grouper.group_info
          out.index = self.grouper.result_index[ids[mask]]

          return out.sort_index() if self.sort else out

      if isinstance(self._selected_obj, DataFrame) and \
              dropna not in ['any', 'all']:
          # Note: when agg-ing picker doesn't raise this, just returns NaN
          # the argument is wrapped in a 1-tuple so that a tuple-valued
          # dropna is reported verbatim instead of being unpacked by the
          # string formatting
          raise ValueError("For a DataFrame groupby, dropna must be "
                           "either None, 'any' or 'all', "
                           "(was passed %s)." % (dropna,))

      # old behaviour, but with all and any support for DataFrames.
      # modified in GH 7559 to have better perf
      max_len = n if n >= 0 else - 1 - n
      dropped = self.obj.dropna(how=dropna, axis=self.axis)

      # get a new grouper for our dropped obj
      if self.keys is None and self.level is None:

          # we don't have the grouper info available
          # (e.g. we have selected out
          # a column that is not in the current object)
          axis = self.grouper.axis
          grouper = axis[axis.isin(dropped.index)]

      else:

          # create a grouper with the original parameters, but on the dropped
          # object
          grouper, _, _ = _get_grouper(dropped, key=self.keys,
                                       axis=self.axis, level=self.level,
                                       sort=self.sort,
                                       mutated=self.mutated)

      grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
      sizes, result = grb.size(), grb.nth(n)
      mask = (sizes < max_len).values

      # set the results which don't meet the criteria
      if len(result) and mask.any():
          result.loc[mask] = np.nan

      # reset/reindex to the original groups
      if len(self.obj) == len(dropped) or \
         len(result) == len(self.grouper.result_index):
          result.index = self.grouper.result_index
      else:
          result = result.reindex(self.grouper.result_index)

      return result
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def cumcount(self, ascending=True):
      """
      Number each item in each group from 0 to the length of that group - 1.

      Essentially this is equivalent to

      >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

      Parameters
      ----------
      ascending : bool, default True
          If False, number in reverse, from length of group - 1 to 0.

      Examples
      --------

      >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
      ...                   columns=['A'])
      >>> df
         A
      0  a
      1  a
      2  a
      3  b
      4  b
      5  a
      >>> df.groupby('A').cumcount()
      0    0
      1    1
      2    2
      3    0
      4    1
      5    3
      dtype: int64
      >>> df.groupby('A').cumcount(ascending=False)
      0    3
      1    2
      2    1
      3    1
      4    0
      5    0
      dtype: int64
      """
      self._set_group_selection()

      # the result is labelled like the selected object
      row_labels = self._selected_obj.index
      positions = self._cumcount_array(ascending=ascending)
      return Series(positions, row_labels)
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def cumprod(self, axis=0, *args, **kwargs):
      """
      Cumulative product for each group

      The cython transform is used for ``axis=0``; any other axis is
      computed by applying ``cumprod`` to every group.
      """
      nv.validate_groupby_func('cumprod', args, kwargs)

      if axis != 0:
          def _cumprod_along_axis(group):
              return group.cumprod(axis=axis)

          return self.apply(_cumprod_along_axis)

      return self._cython_transform('cumprod')
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def cumsum(self, axis=0, *args, **kwargs):
      """
      Cumulative sum for each group

      The cython transform is used for ``axis=0``; any other axis is
      computed by applying ``cumsum`` to every group.
      """
      nv.validate_groupby_func('cumsum', args, kwargs)

      if axis != 0:
          def _cumsum_along_axis(group):
              return group.cumsum(axis=axis)

          return self.apply(_cumsum_along_axis)

      return self._cython_transform('cumsum')
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def shift(self, periods=1, freq=None, axis=0):
      """
      Shift each group by periods observations

      Parameters
      ----------
      periods : integer, default 1
          number of periods to shift
      freq : frequency string
      axis : axis to shift, default 0
      """
      # a frequency or a non-zero axis is handled group by group
      if freq is not None or axis != 0:
          def _shift_group(group):
              return group.shift(periods, freq, axis)

          return self.apply(_shift_group)

      group_ids, _, n_groups = self.grouper.group_info

      # filled in by Cython
      take_positions = np.zeros_like(group_ids)
      _algos.group_shift_indexer(take_positions, group_ids, n_groups, periods)

      # reorder every slice with the same indexer
      shifted = {
          name: algos.take_nd(piece.values, take_positions)
          for name, piece in self._iterate_slices()
      }
      return self._wrap_transformed_output(shifted)
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def head(self, n=5):
      """
      Returns first n rows of each group.

      Essentially equivalent to ``.apply(lambda x: x.head(n))``,
      except ignores as_index flag.

      Examples
      --------

      >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
      ...                columns=['A', 'B'])
      >>> df.groupby('A', as_index=False).head(1)
         A  B
      0  1  2
      2  5  6
      >>> df.groupby('A').head(1)
         A  B
      0  1  2
      2  5  6
      """
      self._reset_group_selection()

      # keep the rows whose within-group position is below n
      in_head = self._cumcount_array() < n
      selected = self._selected_obj
      return selected[in_head]
  @Substitution(name='groupby')
  @Appender(_doc_template)
  def tail(self, n=5):
      """
      Returns last n rows of each group

      Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
      except ignores as_index flag.

      Examples
      --------

      >>> df = DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
      ...                columns=['A', 'B'])
      >>> df.groupby('A').tail(1)
         A  B
      1  a  2
      3  b  2
      >>> df.groupby('A').head(1)
         A  B
      0  a  1
      2  b  1
      """
      self._reset_group_selection()

      # keep the rows whose position counted from the group end is below n
      in_tail = self._cumcount_array(ascending=False) < n
      selected = self._selected_obj
      return selected[in_tail]
  # Module-level entry point: picks the groupby class matching the type of
  # ``obj`` and instantiates it. The docstring is supplied entirely by the
  # Appender decorator (GroupBy.__doc__), so none is written here.
  @Appender(GroupBy.__doc__)
  def groupby(obj, by, **kwds):
      if isinstance(obj, Series):
          return SeriesGroupBy(obj, by, **kwds)
      if isinstance(obj, DataFrame):
          return DataFrameGroupBy(obj, by, **kwds)
      # anything other than a Series or DataFrame is rejected
      raise TypeError('invalid type: %s' % type(obj))  # pragma: no cover
  1207. def _get_axes(group):
  1208. if isinstance(group, Series):
  1209. return [group.index]
  1210. else:
  1211. return group.axes
  1212. def _is_indexed_like(obj, axes):
  1213. if isinstance(obj, Series):
  1214. if len(axes) > 1:
  1215. return False
  1216. return obj.index.equals(axes[0])
  1217. elif isinstance(obj, DataFrame):
  1218. return obj.index.equals(axes[0])
  1219. return False
  1220. class BaseGrouper(object):
  1221. """
  1222. This is an internal Grouper class, which actually holds
  1223. the generated groups
  1224. """
  1225. def __init__(self, axis, groupings, sort=True, group_keys=True,
  1226. mutated=False):
  1227. self._filter_empty_groups = self.compressed = len(groupings) != 1
  1228. self.axis = axis
  1229. self.groupings = groupings
  1230. self.sort = sort
  1231. self.group_keys = group_keys
  1232. self.mutated = mutated
  1233. @property
  1234. def shape(self):
  1235. return tuple(ping.ngroups for ping in self.groupings)
  1236. def __iter__(self):
  1237. return iter(self.indices)
  1238. @property
  1239. def nkeys(self):
  1240. return len(self.groupings)
  1241. def get_iterator(self, data, axis=0):
  1242. """
  1243. Groupby iterator
  1244. Returns
  1245. -------
  1246. Generator yielding sequence of (name, subsetted object)
  1247. for each group
  1248. """
  1249. splitter = self._get_splitter(data, axis=axis)
  1250. keys = self._get_group_keys()
  1251. for key, (i, group) in zip(keys, splitter):
  1252. yield key, group
  def _get_splitter(self, data, axis=0):
      """
      Build a splitter for ``data`` via ``get_splitter``

      The first and third elements of ``self.group_info`` are passed along
      together with ``axis``.
      """
      group_ids, _, n_groups = self.group_info
      return get_splitter(data, group_ids, n_groups, axis=axis)
  1256. def _get_group_keys(self):
  1257. if len(self.groupings) == 1:
  1258. return self.levels[0]
  1259. else:
  1260. comp_ids, _, ngroups = self.group_info
  1261. # provide "flattened" iterator for multi-group setting
  1262. mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
  1263. return [mapper.get_key(i) for i in range(ngroups)]
  1264. def apply(self, f, data, axis=0):
  1265. mutated = self.mutated
  1266. splitter = self._get_splitter(data, axis=axis)
  1267. group_keys = self._get_group_keys()
  1268. # oh boy
  1269. f_name = com._get_callable_name(f)
  1270. if (f_name not in _plotting_methods and
  1271. hasattr(splitter, 'fast_apply') and axis == 0):
  1272. try:
  1273. values, mutated = splitter.fast_apply(f, group_keys)
  1274. return group_keys, values, mutated
  1275. except (lib.InvalidApply):
  1276. # we detect a mutation of some kind
  1277. # so take slow path
  1278. pass
  1279. except Exception:
  1280. # raise this error to the caller
  1281. pass
  1282. result_values = []
  1283. for key, (i, group) in zip(

Large files are truncated, but you can click here to view the full file