/pandas/core/groupby.py

http://github.com/pydata/pandas
import types
from functools import wraps
import numpy as np
import datetime
import collections

from pandas.compat import (
    zip, builtins, range, long, lzip,
    OrderedDict, callable
)
from pandas import compat

from pandas.core.base import PandasObject
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
from pandas.core.panel import Panel
from pandas.util.decorators import cache_readonly, Appender
import pandas.core.algorithms as algos
import pandas.core.common as com
from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype)
from pandas import _np_version_under1p7
import pandas.lib as lib
from pandas.lib import Timestamp
import pandas.tslib as tslib
import pandas.algos as _algos
import pandas.hashtable as _hash

_agg_doc = """Aggregate using input function or dict of {column -> function}

Parameters
----------
arg : function or dict
    Function to use for aggregating groups. If a function, must either
    work when passed a DataFrame or when passed to DataFrame.apply. If
    passed a dict, the keys must be DataFrame column names.

Notes
-----
Numpy functions mean/median/prod/sum/std/var are special cased so the
default behavior is applying the function along axis=0
(e.g., np.mean(arr_2d, axis=0)) as opposed to
mimicking the default Numpy behavior (e.g., np.mean(arr_2d)).

Returns
-------
aggregated : DataFrame
"""
# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
_plotting_methods = frozenset(['plot', 'boxplot', 'hist'])

_common_apply_whitelist = frozenset([
    'last', 'first',
    'head', 'tail', 'median',
    'mean', 'sum', 'min', 'max',
    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
    'resample',
    'describe',
    'rank', 'quantile', 'count',
    'fillna',
    'mad',
    'any', 'all',
    'irow', 'take',
    'idxmax', 'idxmin',
    'shift', 'tshift',
    'ffill', 'bfill',
    'pct_change', 'skew',
    'corr', 'cov', 'diff',
]) | _plotting_methods

_series_apply_whitelist = \
    (_common_apply_whitelist - set(['boxplot'])) | \
    frozenset(['dtype', 'value_counts', 'unique', 'nunique',
               'nlargest', 'nsmallest'])

_dataframe_apply_whitelist = \
    _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])


class GroupByError(Exception):
    pass


class DataError(GroupByError):
    pass


class SpecificationError(GroupByError):
    pass


def _groupby_function(name, alias, npfunc, numeric_only=True,
                      _convert=False):
    def f(self):
        self._set_selection_from_grouper()
        try:
            return self._cython_agg_general(alias, numeric_only=numeric_only)
        except AssertionError as e:
            raise SpecificationError(str(e))
        except Exception:
            result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
            if _convert:
                result = result.convert_objects()
            return result

    f.__doc__ = "Compute %s of group values" % name
    f.__name__ = name

    return f
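# Illustrative note (not part of the library): this factory is how the
# standard reductions further down in GroupBy are generated, e.g.
#
#     sum = _groupby_function('sum', 'add', np.sum)
#
# which first tries the Cython 'add' kernel and, on failure, falls back to
# aggregating with np.sum in Python.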
def _first_compat(x, axis=0):
    def _first(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[0]

    if isinstance(x, DataFrame):
        return x.apply(_first, axis=axis)
    else:
        return _first(x)


def _last_compat(x, axis=0):
    def _last(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[-1]

    if isinstance(x, DataFrame):
        return x.apply(_last, axis=axis)
    else:
        return _last(x)


def _count_compat(x, axis=0):
    return x.size


class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction for a target
    object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    These are local specifications and will override 'global' settings, that is
    the parameters axis and level which are passed to the groupby itself.

    Parameters
    ----------
    key : string, defaults to None
        groupby key, which selects the grouping column of the target
    level : name/number, defaults to None
        the level for the target index
    freq : string / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object
    axis : number/name of the axis, defaults to None
    sort : boolean, default to False
        whether to sort the resulting labels

    additional kwargs to control time-like groupers (when freq is passed):

    closed : closed end of interval; left or right
    label : interval boundary to use for labeling; left or right
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A')
    >>> df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date'
    >>> df.groupby(Grouper(level='date',freq='60s',axis=1)) :
        specify a resample on the level 'date' on the columns axis with a frequency of 60s
    """

    def __new__(cls, *args, **kwargs):
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper
        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=None, sort=False):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None

    @property
    def ax(self):
        return self.grouper

    def _get_grouper(self, obj):
        """
        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj, sort=False):
        """
        given an object and the specifications, setup the internal grouper for
        this particular specification

        Parameters
        ----------
        obj : the subject object
        """
        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)
        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    if isinstance(level, compat.string_types):
                        if obj.index.name != level:
                            raise ValueError('level name %s is not the name of the '
                                             'index' % level)
                    elif level > 0:
                        raise ValueError('level > 0 only valid with MultiIndex')

                    ax = Index(ax.get_level_values(level), name=level)
                else:
                    if not (level == 0 or level == ax.name):
                        raise ValueError("The grouper level {0} is not valid".format(level))

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            indexer = self.indexer = ax.argsort(kind='quicksort')
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    def _get_binner_for_grouping(self, obj):
        raise NotImplementedError

    @property
    def groups(self):
        return self.grouper.groups
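# Illustrative sketch (not part of the library): because Grouper.__new__ swaps
# in TimeGrouper whenever `freq` is passed, the two specifications from the
# class docstring are different types at construction time:
#
#     >>> type(Grouper(key='A')).__name__                   # plain Grouper
#     'Grouper'
#     >>> type(Grouper(key='date', freq='60s')).__name__    # time-based
#     'TimeGrouper'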
class GroupBy(PandasObject):
    """
    Class for grouping and aggregating relational data. See aggregate,
    transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : string
        Most users should ignore this

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups
    """
    _apply_whitelist = _common_apply_whitelist
    _internal_names = ['_cache']
    _internal_names_set = set(_internal_names)
    _group_selection = None

    def __init__(self, obj, keys=None, axis=0, level=None,
                 grouper=None, exclusions=None, selection=None, as_index=True,
                 sort=True, group_keys=True, squeeze=False):
        self._selection = selection

        if isinstance(obj, NDFrame):
            obj._consolidate_inplace()

        self.level = level

        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError('as_index=False only valid with DataFrame')
            if axis != 0:
                raise ValueError('as_index=False only valid for axis=0')

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze

        if grouper is None:
            grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
                                                    level=level, sort=sort)

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

    def __len__(self):
        return len(self.indices)

    def __unicode__(self):
        # TODO: Better unicode/repr for GroupBy object
        return object.__repr__(self)

    @property
    def groups(self):
        """ dict {group name -> group labels} """
        return self.grouper.groups

    @property
    def ngroups(self):
        return self.grouper.ngroups

    @property
    def indices(self):
        """ dict {group name -> group indices} """
        return self.grouper.indices

    def _get_index(self, name):
        """ safe get index, translate keys for datelike to underlying repr """

        def convert(key, s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, (Timestamp, datetime.datetime)):
                return Timestamp(key)
            elif isinstance(s, np.datetime64):
                return Timestamp(key).asm8
            return key

        sample = next(iter(self.indices))
        if isinstance(sample, tuple):
            if not isinstance(name, tuple):
                raise ValueError("must supply a tuple to get_group with multiple grouping keys")
            if not len(name) == len(sample):
                raise ValueError("must supply a same-length tuple to get_group with multiple grouping keys")

            name = tuple([convert(n, k) for n, k in zip(name, sample)])
        else:
            name = convert(name, sample)

        return self.indices[name]
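    # Illustrative note (not part of the library): the conversion above means
    # a caller can pass, e.g., the string '2014-01-01' to get_group even when
    # the underlying indices are keyed by Timestamp or np.datetime64 values;
    # the key is coerced to match the sampled key type before lookup.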
    @property
    def name(self):
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        if not isinstance(self._selection, (list, tuple, Series, np.ndarray)):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    def _set_selection_from_grouper(self):
        """ we may need to create a selection if we have non-level groupers """
        grp = self.grouper
        if self.as_index and getattr(grp, 'groupings', None) is not None and self.obj.ndim > 1:
            ax = self.obj._info_axis
            groupers = [g.name for g in grp.groupings
                        if g.level is None and g.name is not None and g.name in ax]
            if len(groupers):
                self._group_selection = (ax - Index(groupers)).tolist()

    def _local_dir(self):
        return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))

    def __getattr__(self, attr):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        if hasattr(self.obj, attr):
            return self._make_wrapper(attr)

        raise AttributeError("%r object has no attribute %r" %
                             (type(self).__name__, attr))

    def __getitem__(self, key):
        raise NotImplementedError('Not implemented: %s' % key)

    def _make_wrapper(self, name):
        if name not in self._apply_whitelist:
            is_callable = callable(getattr(self._selected_obj, name, None))
            kind = ' callable ' if is_callable else ' '
            msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
                   "using the 'apply' method".format(kind, name,
                                                     type(self).__name__))
            raise AttributeError(msg)

        # need to setup the selection, as the selected columns
        # are not passed directly but resolved via the grouper
        self._set_selection_from_grouper()

        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            return self.apply(lambda self: getattr(self, name))

        f = getattr(type(self._selected_obj), name)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            kwargs_with_axis = kwargs.copy()
            if 'axis' not in kwargs_with_axis:
                kwargs_with_axis['axis'] = self.axis

            def curried_with_axis(x):
                return f(x, *args, **kwargs_with_axis)

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = curried_with_axis.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in _plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried_with_axis)
            except Exception:
                try:
                    return self.apply(curried)
                except Exception:
                    # related to : GH3688
                    # try item-by-item
                    # this can be called recursively, so need to raise
                    # ValueError if we don't have this method, to indicate
                    # to aggregate to mark this column as an error
                    try:
                        return self._aggregate_item_by_item(name, *args, **kwargs)
                    except AttributeError:
                        raise ValueError

        return wrapper

    def get_group(self, name, obj=None):
        """
        Constructs NDFrame from group with provided name

        Parameters
        ----------
        name : object
            the name of the group to get as a DataFrame
        obj : NDFrame, default None
            the NDFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used

        Returns
        -------
        group : type of obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        return obj.take(inds, axis=self.axis, convert=False)
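    # Illustrative sketch (not part of the library):
    #
    #     >>> df = DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})
    #     >>> df.groupby('A').get_group('a')              # rows where A == 'a'
    #     >>> df.groupby(['A', 'B']).get_group(('a', 1))
    #
    # With multiple grouping keys, `name` must be a same-length tuple, as the
    # _get_index check above enforces.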
    def __iter__(self):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        return self.grouper.get_iterator(self.obj, axis=self.axis)

    def apply(self, func, *args, **kwargs):
        """
        Apply function and combine results together in an intelligent way. The
        split-apply-combine combination rules attempt to be as common sense
        based as possible. For example:

        case 1:
        group DataFrame
        apply aggregation function (f(chunk) -> Series)
        yield DataFrame, with group axis having group labels

        case 2:
        group DataFrame
        apply transform function (f(chunk) -> DataFrame with same indexes)
        yield DataFrame with resulting chunks glued together

        case 3:
        group Series
        apply function with f(chunk) -> DataFrame
        yield DataFrame with result of chunks glued together

        Parameters
        ----------
        func : function

        Notes
        -----
        See online documentation for full exposition on how to use apply.

        In the current implementation apply calls func twice on the
        first group to decide whether it can take a fast or slow code
        path. This can lead to unexpected behavior if func has
        side-effects, as they will take effect twice for the first
        group.

        See also
        --------
        aggregate, transform

        Returns
        -------
        applied : type depending on grouped object and function
        """
        func = _intercept_function(func)

        @wraps(func)
        def f(g):
            return func(g, *args, **kwargs)

        return self._python_apply_general(f)
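    # Illustrative sketch (not part of the library) of the double-call caveat
    # noted in the apply docstring: func runs twice on the first group, so
    # side effects fire twice for it.
    #
    #     >>> seen = []
    #     >>> def f(group):
    #     ...     seen.append(group.name)
    #     ...     return group.sum()
    #     >>> df.groupby('A').apply(f)   # hypothetical df grouped on 'A'
    #     >>> seen                       # first group's name appears twice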
    def _python_apply_general(self, f):
        keys, values, mutated = self.grouper.apply(f, self._selected_obj,
                                                   self.axis)

        return self._wrap_applied_output(keys, values,
                                         not_indexed_same=mutated)

    def aggregate(self, func, *args, **kwargs):
        raise NotImplementedError

    @Appender(_agg_doc)
    def agg(self, func, *args, **kwargs):
        return self.aggregate(func, *args, **kwargs)

    def _iterate_slices(self):
        yield self.name, self._selected_obj

    def transform(self, func, *args, **kwargs):
        raise NotImplementedError

    def mean(self):
        """
        Compute mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('mean')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            self._set_selection_from_grouper()
            f = lambda x: x.mean(axis=self.axis)
            return self._python_agg_general(f)

    def median(self):
        """
        Compute median of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('median')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            self._set_selection_from_grouper()

            def f(x):
                if isinstance(x, np.ndarray):
                    x = Series(x)
                return x.median(axis=self.axis)
            return self._python_agg_general(f)

    def std(self, ddof=1):
        """
        Compute standard deviation of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        # TODO: implement at Cython level?
        return np.sqrt(self.var(ddof=ddof))

    def var(self, ddof=1):
        """
        Compute variance of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        if ddof == 1:
            return self._cython_agg_general('var')
        else:
            self._set_selection_from_grouper()
            f = lambda x: x.var(ddof=ddof)
            return self._python_agg_general(f)

    def sem(self, ddof=1):
        """
        Compute standard error of the mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self.std(ddof=ddof) / np.sqrt(self.count())

    def size(self):
        """
        Compute group sizes
        """
        return self.grouper.size()

    sum = _groupby_function('sum', 'add', np.sum)
    prod = _groupby_function('prod', 'prod', np.prod)
    min = _groupby_function('min', 'min', np.min, numeric_only=False)
    max = _groupby_function('max', 'max', np.max, numeric_only=False)
    first = _groupby_function('first', 'first', _first_compat,
                              numeric_only=False, _convert=True)
    last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                             _convert=True)
    _count = _groupby_function('_count', 'count', _count_compat,
                               numeric_only=False)

    def count(self, axis=0):
        return self._count().astype('int64')

    def ohlc(self):
        """
        Compute open, high, low and close values within each group, excluding
        missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))

    def nth(self, n, dropna=None):
        """
        Take the nth row from each group.

        If dropna, will take the nth non-null row, dropna is either
        Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is
        equivalent to calling dropna(how=dropna) before the groupby.

        Examples
        --------
        >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
           A   B
        0  1 NaN
        2  5   6
        >>> g.nth(1)
           A  B
        1  1  4
        >>> g.nth(-1)
           A  B
        1  1  4
        2  5  6
        >>> g.nth(0, dropna='any')
           B
        A
        1  4
        5  6
        >>> g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
            B
        A
        1 NaN
        5 NaN
        """
        self._set_selection_from_grouper()

        if not dropna:  # good choice
            m = self.grouper._max_groupsize
            if n >= m or n < -m:
                return self._selected_obj.loc[[]]

            rng = np.zeros(m, dtype=bool)
            if n >= 0:
                rng[n] = True
                is_nth = self._cumcount_array(rng)
            else:
                rng[- n - 1] = True
                is_nth = self._cumcount_array(rng, ascending=False)

            result = self._selected_obj[is_nth]

            # the result index
            if self.as_index:
                ax = self.obj._info_axis
                names = self.grouper.names

                if self.obj.ndim == 1:
                    # this is a pass-thru
                    pass
                elif all([n in ax for n in names]):
                    result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
                elif self._group_selection is not None:
                    result.index = self.obj._get_axis(self.axis)[is_nth]

                result = result.sort_index()

            return result

        if (isinstance(self._selected_obj, DataFrame)
                and dropna not in ['any', 'all']):
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError("For a DataFrame groupby, dropna must be "
                             "either None, 'any' or 'all', "
                             "(was passed %s)." % (dropna),)

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        max_len = n if n >= 0 else - 1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:
            # we don't have the grouper info available (e.g. we have selected
            # out a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]
            keys = self.grouper.names
        else:
            # create a grouper with the original parameters, but on the
            # dropped object
            grouper, _, _ = _get_grouper(dropped, key=self.keys, axis=self.axis,
                                         level=self.level, sort=self.sort)

        sizes = dropped.groupby(grouper).size()
        result = dropped.groupby(grouper).nth(n)
        mask = (sizes < max_len).values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or len(result) == len(self.grouper.result_index):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result

    def cumcount(self, **kwargs):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Example
        -------

        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """
        self._set_selection_from_grouper()
        ascending = kwargs.pop('ascending', True)

        index = self._selected_obj.index
        cumcounts = self._cumcount_array(ascending=ascending)
        return Series(cumcounts, index)

    def head(self, n=5):
        """
        Returns first n rows of each group.

        Essentially equivalent to ``.apply(lambda x: x.head(n))``,
        except ignores as_index flag.

        Example
        -------

        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        """
        obj = self._selected_obj
        in_head = self._cumcount_array() < n
        head = obj[in_head]
        return head

    def tail(self, n=5):
        """
        Returns last n rows of each group

        Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
        except ignores as_index flag.

        Example
        -------

        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).tail(1)
           A  B
        1  1  4
        2  5  6
        >>> df.groupby('A').tail(1)
           A  B
        1  1  4
        2  5  6
        """
        obj = self._selected_obj
        rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
        in_tail = self._cumcount_array(rng, ascending=False) > -n
        tail = obj[in_tail]
        return tail

    def _cumcount_array(self, arr=None, **kwargs):
        """
        arr is where cumcount gets its values from

        note: this is currently implementing sort=False (though the default is
        sort=True) for groupby in general
        """
        ascending = kwargs.pop('ascending', True)

        if arr is None:
            arr = np.arange(self.grouper._max_groupsize, dtype='int64')

        len_index = len(self._selected_obj.index)
        cumcounts = np.zeros(len_index, dtype=arr.dtype)
        if not len_index:
            return cumcounts

        indices, values = [], []
        for v in self.indices.values():
            indices.append(v)

            if ascending:
                values.append(arr[:len(v)])
            else:
                values.append(arr[len(v) - 1::-1])

        indices = np.concatenate(indices)
        values = np.concatenate(values)
        cumcounts[indices] = values

        return cumcounts
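    # Illustrative sketch (not part of the library) of how _cumcount_array
    # scatters per-group positions back into original row order: with groups
    # A = rows [0, 1, 3] and B = rows [2, 4] and the default
    # arr = [0, 1, 2, ...], the ascending result is [0, 1, 0, 2, 1];
    # row 3 is the third member of group A, so it gets 2.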
    def _index_with_as_index(self, b):
        """
        Take boolean mask of index to be returned from apply, if as_index=True
        """
        # TODO perf, it feels like this should already be somewhere...
        from itertools import chain
        original = self._selected_obj.index
        gp = self.grouper
        levels = chain((gp.levels[i][gp.labels[i][b]]
                        for i in range(len(gp.groupings))),
                       (original.get_level_values(i)[b]
                        for i in range(original.nlevels)))
        new = MultiIndex.from_arrays(list(levels))
        new.names = gp.names + original.names
        return new

    def _try_cast(self, result, obj):
        """
        try to cast the result to our obj original type,
        we may have roundtripped through object in the meantime
        """
        if obj.ndim > 1:
            dtype = obj.values.dtype
        else:
            dtype = obj.dtype

        if not np.isscalar(result):
            result = _possibly_downcast_to_dtype(result, dtype)

        return result

    def _cython_agg_general(self, how, numeric_only=True):
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.aggregate(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_aggregated_output(output, names)

    def _python_agg_general(self, func, *args, **kwargs):
        func = _intercept_function(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output = {}
        for name, obj in self._iterate_slices():
            try:
                result, counts = self.grouper.agg_series(obj, f)
                output[name] = self._try_cast(result, obj)
            except TypeError:
                continue

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:
            mask = counts.ravel() > 0
            for name, result in compat.iteritems(output):

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = com.ensure_float(values)

                output[name] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)

    def _wrap_applied_output(self, *args, **kwargs):
        raise NotImplementedError

    def _concat_objects(self, keys, values, not_indexed_same=False):
        from pandas.tools.merge import concat

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:
                result = result.reindex_axis(ax, axis=self.axis)
        elif self.group_keys:
            if self.as_index:
                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names
                result = concat(values, axis=self.axis, keys=group_keys,
                                levels=group_levels, names=group_names)
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            result = concat(values, axis=self.axis)

        return result

    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = []
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered


@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
    if isinstance(obj, Series):
        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        klass = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError('invalid type: %s' % type(obj))

    return klass(obj, by, **kwds)


def _get_axes(group):
    if isinstance(group, Series):
        return [group.index]
    else:
        return group.axes


def _is_indexed_like(obj, axes):
    if isinstance(obj, Series):
        if len(axes) > 1:
            return False
        return obj.index.equals(axes[0])
    elif isinstance(obj, DataFrame):
        return obj.index.equals(axes[0])

    return False


class BaseGrouper(object):
    """
    This is an internal Grouper class, which actually holds the generated
    groups
    """

    def __init__(self, axis, groupings, sort=True, group_keys=True):
        self.axis = axis
        self.groupings = groupings
        self.sort = sort
        self.group_keys = group_keys
        self.compressed = True

    @property
    def shape(self):
        return tuple(ping.ngroups for ping in self.groupings)

    def __iter__(self):
        return iter(self.indices)

    @property
    def nkeys(self):
        return len(self.groupings)

    def get_iterator(self, data, axis=0):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        splitter = self._get_splitter(data, axis=axis)
        keys = self._get_group_keys()
        for key, (i, group) in zip(keys, splitter):
            yield key, group

    def _get_splitter(self, data, axis=0):
        comp_ids, _, ngroups = self.group_info
        return get_splitter(data, comp_ids, ngroups, axis=axis)

    def _get_group_keys(self):
        if len(self.groupings) == 1:
            return self.levels[0]
        else:
            comp_ids, _, ngroups = self.group_info
            # provide "flattened" iterator for multi-group setting
            mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
            return [mapper.get_key(i) for i in range(ngroups)]

    def apply(self, f, data, axis=0):
        mutated = False
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()

        # oh boy
        if (f.__name__ not in _plotting_methods and
                hasattr(splitter, 'fast_apply') and axis == 0):
            try:
                values, mutated = splitter.fast_apply(f, group_keys)
                return group_keys, values, mutated
            except lib.InvalidApply:
                # we detect a mutation of some kind
                # so take slow path
                pass
            except Exception:
                # raise this error to the caller
                pass

        result_values = []
        for key, (i, group) in zip(group_keys, splitter):
            object.__setattr__(group, 'name', key)

            # group might be modified
            group_axes = _get_axes(group)
            res = f(group)
            if not _is_indexed_like(res, group_axes):
                mutated = True

            result_values.append(res)

        return group_keys, result_values, mutated

    @cache_readonly
    def indices(self):
        """ dict {group name -> group indices} """
        if len(self.groupings) == 1:
            return self.groupings[0].indices
        else:
            label_list = [ping.labels for ping in self.groupings]
            keys = [ping.group_index for ping in self.groupings]
            return _get_indices_dict(label_list, keys)

    @property
    def labels(self):
        return [ping.labels for ping in self.groupings]

    @property
    def levels(self):
        return [ping.group_index for ping in self.groupings]

    @property
    def names(self):
        return [ping.name for ping in self.groupings]

    def size(self):
        """
        Compute group sizes
        """
        # TODO: better impl
        labels, _, ngroups = self.group_info
        bin_counts = algos.value_counts(labels, sort=False)
        bin_counts = bin_counts.reindex(np.arange(ngroups))
        bin_counts.index = self.result_index
        return bin_counts

    @cache_readonly
    def _max_groupsize(self):
        """
        Compute size of largest group
        """
        # For many items in each group this is much faster than
        # self.size().max(), in worst case marginally slower
        if self.indices:
            return max(len(v) for v in self.indices.values())
        else:
            return 0

    @cache_readonly
    def groups(self):
        """ dict {group name -> group labels} """
        if len(self.groupings) == 1:
            return self.groupings[0].groups
        else:
            to_groupby = lzip(*(ping.grouper for ping in self.groupings))
            to_groupby = Index(to_groupby)
            return self.axis.groupby(to_groupby.values)

    @cache_readonly
    def group_info(self):
        comp_ids, obs_group_ids = self._get_compressed_labels()

        ngroups = len(obs_group_ids)
        comp_ids = com._ensure_int64(comp_ids)
        return comp_ids, obs_group_ids, ngroups

    def _get_compressed_labels(self):
        all_labels = [ping.labels for ping in self.groupings]
        if self._overflow_possible:
            tups = lib.fast_zip(all_labels)
            labs, uniques = algos.factorize(tups)

            if self.sort:
                uniques, labs = _reorder_by_uniques(uniques, labs)

            return labs, uniques
        else:
            if len(all_labels) > 1:
                group_index = get_group_index(all_labels, self.shape)
                comp_ids, obs_group_ids = _compress_group_index(group_index)
            else:
                ping = self.groupings[0]
                comp_ids = ping.labels
                obs_group_ids = np.arange(len(ping.group_index))
                self.compressed = False
                self._filter_empty_groups = False

            return comp_ids, obs_group_ids

    @cache_readonly
    def _overflow_possible(self):
        return _int64_overflow_possible(self.shape)

    @cache_readonly
    def ngroups(self):
        return len(self.result_index)

    @cache_readonly
    def result_index(self):
        recons = self.get_group_levels()
        return MultiIndex.from_arrays(recons, names=self.names)

    def get_group_levels(self):
        obs_ids = self.group_info[1]

        if not self.compressed and len(self.groupings) == 1:
            return [self.groupings[0].group_index]

        if self._overflow_possible:
            recons_labels = [np.array(x) for x in zip(*obs_ids)]
        else:
            recons_labels = decons_group_index(obs_ids, self.shape)

        name_list = []
        for ping, labels in zip(self.groupings, recons_labels):
            labels = com._ensure_platform_int(labels)
            name_list.append(ping.group_index.take(labels))

        return name_list

    #------------------------------------------------------------
    # Aggregation functions

    _cython_functions = {
        'add': 'group_add',
        'prod': 'group_prod',
        'min': 'group_min',
        'max': 'group_max',
        'mean': 'group_mean',
        'median': {
            'name': 'group_median'
        },
        'var': 'group_var',
        'first': {
            'name': 'group_nth',
            'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
        },
        'last': 'group_last',
        'count': 'group_count',
    }

    _cython_arity = {
        'ohlc': 4,  # OHLC
    }

    _name_functions = {}

    _filter_empty_groups = True

    def _get_aggregate_function(self, how, values):

        dtype_str = values.dtype.name

        def get_func(fname):
            # find the function, or use the object function, or return a
            # generic
            for dt in [dtype_str, 'object']:
                f = getattr(_algos, "%s_%s" % (fname, dt), None)
                if f is not None:
                    return f
            return getattr(_algos, fname, None)

        ftype = self._cython_functions[how]

        if isinstance(ftype, dict):
            func = afunc = get_func(ftype['name'])

            # a sub-function
            f = ftype.get('f')
            if f is not None:

                def wrapper(*args, **kwargs):
                    return f(afunc, *args, **kwargs)

                # need to curry our sub-function
                func = wrapper
        else:
            func = get_func(ftype)

        if func is None:
            raise NotImplementedError("function is not implemented for this "
                                      "dtype: [how->%s,dtype->%s]" %
                                      (how, dtype_str))
        return func, dtype_str

    def aggregate(self, values, how, axis=0):
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError
            out_shape = (self.ngroups,) + values.shape[1:]

        if is_numeric_dtype(values.dtype):
            values = com.ensure_float(values)
            is_numeric = True
            out_dtype = 'f%d' % values.dtype.itemsize
        else:
            is_numeric = issubclass(values.dtype.type, (np.datetime64,
                                                        np.timedelta64))
            if is_numeric:
                out_dtype = 'float64'
                values = values.view('int64')
            else:
                out_dtype = 'object'
                values = values.astype(object)

        # will be filled in Cython function
        result = np.empty(out_shape, dtype=out_dtype)
        result.fill(np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)

        result = self._aggregate(result, counts, values, how, is_numeric)

        if self._filter_empty_groups:
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        result, (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names

    def _aggregate(self, result, counts, values, how, is_numeric):
        agg_func, dtype = self._get_aggregate_function(how, values)

        comp_ids, _, ngroups = self.group_info
        if values.ndim > 3:
            # punting for now
            raise NotImplementedError
        elif values.ndim > 2:
            for i, chunk in enumerate(values.transpose(2, 0, 1)):
                chunk = chunk.squeeze()
                agg_func(result[:, :, i], counts, chunk, comp_ids)
        else:
            agg_func(result, counts, values, comp_ids)

        return result

    def agg_series(self, obj, func):
        try:
            return self._aggregate_series_fast(obj, func)
        except Exception:
            return self._aggregate_series_pure_python(obj, func)

    def _aggregate_series_fast(self, obj, func):
        func = _intercept_function(func)

        if obj.index._has_complex_internals:
            raise TypeError('Incompatible index for Cython grouper')

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0)).to_dense()
        indexer = _algos.groupsort_indexer(group_index, ngroups)[0]
        obj = obj.take(indexer, convert=False)
        group_index = com.take_nd(group_index, indexer, allow_fill=False)
        grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
                                    dummy)
        result, counts = grouper.get_result()
        return result, counts

    def _aggregate_series_pure_python(self, obj, func):

        group_index, _, ngroups = self.group_info

        counts = np.zeros(ngroups, dtype=int)
        result = None

        splitter = get_splitter(obj, group_index, ngroups, axis=self.axis)

        for label, group in splitter:
            res = func(group)
            if result is None:
                if (isinstance(res, (Series, np.ndarray)) or
                        isinstance(res, list)):
                    raise ValueError('Function does not reduce')
                result = np.empty(ngroups, dtype='O')

            counts[label] = group.shape[0]
            result[label] = res

        result = lib.maybe_convert_objects(result, try_float=0)
        return result, counts


def generate_bins_generic(values, binner, closed):
    """
    Generate bin edge offsets and bin labels for one array using another array
    which has bin edge values. Both arrays must be sorted.

    Parameters
    ----------
    values : array of values
    binner : a comparable array of values representing bins into which to bin
        the first array. Note, 'values' end-points must fall within 'binner'
        end-points.
    closed : which end of bin is closed; left (default), right

    Returns
    -------
    bins : array of offsets (into 'values' argument) of bins.
        Zero and last edge are excluded in result, so for instance the first
        bin is values[0:bin[0]] and the last is values[bin[-1]:]
    """
    lenidx = len(values)
    lenbin = len(binner)

    if lenidx <= 0 or lenbin <= 0:
        raise ValueError("Invalid length for values or for binner")

    # check binner fits data
    if values[0] < binner[0]:
        raise ValueError("Values falls before first bin")

    if values[lenidx - 1] > binner[lenbin - 1]:
        raise ValueError("Values falls after last bin")

    bins = np.empty(lenbin - 1, dtype=np.int64)

    j = 0   # index into values
    bc = 0  # bin count

    # linear scan, presume nothing about values/binner except that it fits ok
    for i in range(0, lenbin - 1):
        r_bin = binner[i + 1]

        # count values in current bin, advance to next bin
        while j < lenidx and (values[j] < r_bin or
                              (closed == 'right' and values[j] == r_bin)):
            j += 1

        bins[bc] = j
        bc += 1

    return bins
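# Illustrative sketch (not part of the library): for sorted
# values = [1, 2, 4, 5] and binner = [0, 3, 6], generate_bins_generic
# returns offsets [2, 4] with closed='left' -- values[0:2] fall in the
# first bin and values[2:4] in the second.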
  1274. class BinGrouper(BaseGrouper):
  1275. def __init__(self, bins, binlabels, filter_empty=False):
  1276. self.bins = com._ensure_int64(bins)
  1277. self.binlabels = _ensure_index(binlabels)
  1278. self._filter_empty_groups = filter_empty
  1279. @cache_readonly
  1280. def groups(self):
  1281. """ dict {group name -> group labels} """
  1282. # this is mainly for compat
  1283. # GH 3881
  1284. result = {}
  1285. for key, value in zip(self.binlabels, self.bins):
  1286. if key is not tslib.NaT:
  1287. result[key] = value
  1288. return result
  1289. @property
  1290. def nkeys(self):
  1291. return 1
  1292. def get_iterator(self, data, axis=0):
  1293. """
  1294. Groupby iterator
  1295. Returns
  1296. -------
  1297. Generator yielding sequence of (name, subsetted object)
  1298. for each group
  1299. """
  1300. if isinstance(data, NDFrame):
  1301. slicer = lambda start,edge: data._slice(slice(start,edge),axis=axis)
  1302. length = len(data.axes[axis])
  1303. else:
  1304. slicer = lambda start,edge: data[slice(start,edge)]
  1305. length = len(data)
  1306. start = 0
  1307. for edge, label in zip(self.bins, self.binlabels):
  1308. if label is not tslib.NaT:
  1309. yield label, slicer(start,edge)
  1310. start = edge
  1311. if start < length:
  1312. yield self.binlabels[-1], slicer(start,None)
  1313. def apply(self, f, data, axis=0):
  1314. result_keys = []
  1315. result_values = []
  1316. mutated = False
  1317. for key, group in self.get_iterator(data, axis=axis):
  1318. object.__setattr__(group, 'name', key)
  1319. # group might be modified
  1320. group_axes = _get_axes(group)
  1321. res = f(group)
  1322. if not _is_indexed_like(res, group_axes):
  1323. mutated = True
  1324. result_keys.append(key)
  1325. result_values.append(res)
  1326. return result_keys, result_values, mutated
  1327. @cache_readonly
  1328. def indices(self):
  1329. indices = collections.defaultdict(list)
  1330. i = 0
  1331. for label, bin in zip(self.binlabels, self.bins):
  1332. if i < bin:
  1333. if label is not tslib.NaT:
  1334. indices[label] = list(range(i, bin))
  1335. i = bin
  1336. return indices
  1337. @cache_readonly
  1338. def ngroups(self):
  1339. return len(self.binlabels)
  1340. @cache_readonly
  1341. def result_index(self):
  1342. mask = self.binlabels.asi8 == tslib.iNaT
  1343. return self.binlabels[~mask]
  1344. @property
  1345. def levels(self):
  1346. return [self.binlabels]
  1347. @property
  1348. def names(self):
  1349. return [self.binlabels.name]
  1350. def size(self):
  1351. """
  1352. Compute group sizes
  1353. """
  1354. base = Series(np.zeros(len(self.result_index), dtype=np.int64),
  1355. index=self.result_index)
  1356. indices = self.indices
  1357. for k, v in compat.iteritems(indices):
  1358. indices[k] = len(v)
  1359. bin_counts = Series(indices, dtype=np.int64)
  1360. result = base.add(bin_counts, fill_value=0)
  1361. # addition with fill_value changes dtype to float64
  1362. result = result.astype(np.int64)
  1363. return result
  1364. #----------------------------------------------------------------------
  1365. # cython aggregation
  1366. _cython_functions = {
  1367. 'add': 'group_add_bin',
  1368. 'prod': 'group_prod_bin',
  1369. 'mean': 'group_mean_bin',
  1370. 'min': 'group_min_bin',
  1371. 'max': 'group_max_bin',
  1372. 'var': 'group_var_bin',
  1373. 'ohlc': 'group_ohlc',
  1374. 'first': {
  1375. 'name': 'group_nth_bin',
  1376. 'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
  1377. },
  1378. 'last': 'group_last_bin',
  1379. 'count': 'group_count_bin',
  1380. }
  1381. _name_functions = {
  1382. 'ohlc': lambda *args: ['open', 'high', 'low', 'close']
  1383. }
  1384. _filter_empty_groups = True
  1385. def _aggregate(self, result, counts, values, how, is_numeric=True):
  1386. agg_func, dtype = self._get_aggregate_function(how, values)
  1387. if values.ndim > 3:
  1388. # punting for now
  1389. raise NotImplementedError
  1390. elif values.ndim > 2:
  1391. for i, chunk in enumerate(values.transpose(2, 0, 1)):
  1392. agg_func(result[:, :, i], counts, chunk, self.bins)
  1393. else:
  1394. agg_func(result, counts, values, self.bins)
  1395. return result
  1396. def agg_series(self, obj, func):
  1397. dummy = obj[:0]
  1398. grouper = lib.SeriesBinGrouper(obj, func, self.bins, dummy)
  1399. return grouper.get_result()
  1400. class Grouping(object):
  1401. """
  1402. Holds the grouping information for a single key
  1403. Parameters
  1404. ----------
  1405. index : Index
  1406. grouper :
  1407. obj :
  1408. name :
  1409. level :
  1410. Returns
  1411. -------
  1412. **Attributes**:
  1413. * indices : dict of {group -> index_list}
  1414. * labels : ndarray, group labels
  1415. * ids : mapping of label -> group
  1416. * counts : array of group counts
  1417. * group_index : unique groups
  1418. * groups : dict of {group -> label_list}
  1419. """
  1420. def __init__(self, index, grouper=None, obj=None, name=None, level=None,
  1421. sort=True):
  1422. self.name = name
  1423. self.level = level
  1424. self.grouper = _convert_grouper(index, grouper)
  1425. self.index = index
  1426. self.sort = sort
  1427. self.obj = obj
  1428. # right place for this?
  1429. if isinstance(grouper, (Series, Index)) and name is None:
  1430. self.name = grouper.name
  1431. if isinstance(grouper, MultiIndex):
  1432. self.grouper = grouper.values
  1433. # pre-computed
  1434. self._was_factor = False
  1435. self._should_compress = True
  1436. # we have a single grouper which may be a myriad of things, some of which are
  1437. # dependent on the passing in level
  1438. #
  1439. if level is not None:
  1440. if not isinstance(level, int):
  1441. if level not in index.names:
  1442. raise AssertionError('Level %s not in index' % str(level))
  1443. level = index.names.index(level)
  1444. inds = index.labels[level]
  1445. level_index = index.levels[level]
  1446. if self.name is None:
  1447. self.name = index.names[level]
  1448. # XXX complete hack
  1449. if grouper is not None:
  1450. level_values = index.levels[level].take(inds)
  1451. self.grouper = level_values.map(self.grouper)
  1452. else:
  1453. self._was_factor = True
  1454. # all levels may not be observed
  1455. labels, uniques = algos.factorize(inds, sort=True)
  1456. if len(uniques) > 0 and uniques[0] == -1:
  1457. # handle NAs
  1458. mask = inds != -1
  1459. ok_labels, uniques = algos.factorize(inds[mask], sort=True)
  1460. labels = np.empty(len(inds), dtype=inds.dtype)
  1461. labels[mask] = ok_labels
  1462. labels[~mask] = -1
  1463. if len(uniques) < len(level_index):
  1464. level_index = level_index.take(uniques)
  1465. self._labels = labels
  1466. self._group_index = level_index
  1467. self.grouper = level_index.take(labels)
  1468. else:
  1469. if isinstance(self.grouper, (list, tuple)):
  1470. self.grouper = com._asarray_tuplesafe(self.grouper)
  1471. # a passed Categorical
  1472. elif isinstance(self.grouper, Categorical):
  1473. factor = self.grouper
  1474. self._was_factor = True
  1475. # Is there any way to avoid this?
  1476. self.grouper = np.asarray(factor)
  1477. self._labels = factor.labels
  1478. self._group_index = factor.levels
  1479. if self.name is None:
  1480. self.name = factor.name
  1481. # a passed Grouper like
  1482. elif isinstance(self.grouper, Grouper):
  1483. # get the new grouper
  1484. grouper = self.grouper._get_binner_for_grouping(self.obj)
  1485. self.obj = self.grouper.obj
  1486. self.grouper = grouper
  1487. if self.name is None:
  1488. self.name = grouper.name
  1489. # no level passed
  1490. if not isinstance(self.grouper, (Series, np.ndarray)):
  1491. self.grouper = self.index.map(self.grouper)
  1492. if not (hasattr(self.grouper, "__len__") and
  1493. len(self.grouper) == len(self.index)):
  1494. errmsg = ('Grouper result violates len(labels) == '
  1495. 'len(data)\nresult: %s' %
  1496. com.pprint_thing(self.grouper))
  1497. self.grouper = None # Try for sanity
  1498. raise AssertionError(errmsg)
  1499. # if we have a date/time-like grouper, make sure that we have Timestamps like
  1500. if getattr(self.grouper,'dtype',None) is not None:
  1501. if is_datetime64_dtype(self.grouper):
  1502. from pandas import to_datetime
  1503. self.grouper = to_datetime(self.grouper)
  1504. elif is_timedelta64_dtype(self.grouper):
  1505. from pandas import to_timedelta
  1506. self.grouper = to_timedelta(self.grouper)
  1507. def __repr__(self):
  1508. return 'Grouping(%s)' % self.name
  1509. def __iter__(self):
  1510. return iter(self.indices)
  1511. _labels = None
  1512. _group_index = None
  1513. @property
  1514. def ngroups(self):
  1515. return len(self.group_index)
  1516. @cache_readonly
  1517. def indices(self):
  1518. return _groupby_indices(self.grouper)
  1519. @property
  1520. def labels(self):
  1521. if self._labels is None:
  1522. self._make_labels()
  1523. return self._labels
  1524. @property
  1525. def group_index(self):
  1526. if self._group_index is None:
  1527. self._make_labels()
  1528. return self._group_index
  1529. def _make_labels(self):
  1530. if self._was_factor: # pragma: no cover
1531. raise Exception('Should not call this method when grouping by level')
  1532. else:
  1533. labels, uniques = algos.factorize(self.grouper, sort=self.sort)
  1534. uniques = Index(uniques, name=self.name)
  1535. self._labels = labels
  1536. self._group_index = uniques
  1537. _groups = None
  1538. @property
  1539. def groups(self):
  1540. if self._groups is None:
  1541. self._groups = self.index.groupby(self.grouper)
  1542. return self._groups
  1543. def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
  1544. """
  1545. create and return a BaseGrouper, which is an internal
  1546. mapping of how to create the grouper indexers.
  1547. This may be composed of multiple Grouping objects, indicating
  1548. multiple groupers
  1549. Groupers are ultimately index mappings. They can originate as:
  1550. index mappings, keys to columns, functions, or Groupers
1551. Groupers enable local references to axis, level, and sort, while
1552. the passed-in axis, level, and sort are 'global'.
1553. This routine tries to figure out what the passed-in references
1554. are and then creates a Grouping for each one, combined into
  1555. a BaseGrouper.
  1556. """
  1557. group_axis = obj._get_axis(axis)
1558. # validate that the passed level is compatible with the passed
  1559. # axis of the object
  1560. if level is not None:
  1561. if not isinstance(group_axis, MultiIndex):
  1562. if isinstance(level, compat.string_types):
  1563. if obj.index.name != level:
  1564. raise ValueError('level name %s is not the name of the '
  1565. 'index' % level)
  1566. elif level > 0:
  1567. raise ValueError('level > 0 only valid with MultiIndex')
  1568. level = None
  1569. key = group_axis
  1570. # a passed in Grouper, directly convert
  1571. if isinstance(key, Grouper):
  1572. binner, grouper, obj = key._get_grouper(obj)
  1573. if key.key is None:
  1574. return grouper, [], obj
  1575. else:
  1576. return grouper, set([key.key]), obj
  1577. # already have a BaseGrouper, just return it
  1578. elif isinstance(key, BaseGrouper):
  1579. return key, [], obj
  1580. if not isinstance(key, (tuple, list)):
  1581. keys = [key]
  1582. else:
  1583. keys = key
  1584. # what are we after, exactly?
  1585. match_axis_length = len(keys) == len(group_axis)
  1586. any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
  1587. any_arraylike = any(isinstance(g, (list, tuple, Series, np.ndarray))
  1588. for g in keys)
  1589. try:
  1590. if isinstance(obj, DataFrame):
  1591. all_in_columns = all(g in obj.columns for g in keys)
  1592. else:
  1593. all_in_columns = False
  1594. except Exception:
  1595. all_in_columns = False
  1596. if (not any_callable and not all_in_columns
  1597. and not any_arraylike and match_axis_length
  1598. and level is None):
  1599. keys = [com._asarray_tuplesafe(keys)]
  1600. if isinstance(level, (tuple, list)):
  1601. if key is None:
  1602. keys = [None] * len(level)
  1603. levels = level
  1604. else:
  1605. levels = [level] * len(keys)
  1606. groupings = []
  1607. exclusions = []
  1608. for i, (gpr, level) in enumerate(zip(keys, levels)):
  1609. name = None
  1610. try:
  1611. obj._data.items.get_loc(gpr)
  1612. in_axis = True
  1613. except Exception:
  1614. in_axis = False
  1615. if _is_label_like(gpr) or in_axis:
  1616. exclusions.append(gpr)
  1617. name = gpr
  1618. gpr = obj[gpr]
  1619. if isinstance(gpr, Categorical) and len(gpr) != len(obj):
  1620. errmsg = "Categorical grouper must have len(grouper) == len(data)"
  1621. raise AssertionError(errmsg)
  1622. ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
  1623. groupings.append(ping)
  1624. if len(groupings) == 0:
  1625. raise ValueError('No group keys passed!')
  1626. # create the internals grouper
  1627. grouper = BaseGrouper(group_axis, groupings, sort=sort)
  1628. return grouper, exclusions, obj
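# A sketch (assuming a toy frame, not part of the module) of how the
# different key types above are resolved:
#
# >>> df = DataFrame({'A': ['x', 'x', 'y'], 'B': [1, 2, 3]})
# >>> grouper, exclusions, _ = _get_grouper(df, key='A')
# >>> exclusions       # the key column is excluded from later aggregation
# ['A']
# >>> grouper.ngroups  # 'A' has two distinct values
# 2
#
# A callable key (e.g. key=len) would instead be mapped over the axis by
# the Grouping constructor, and level=0 would group on the index itself.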
  1629. def _is_label_like(val):
  1630. return isinstance(val, compat.string_types) or np.isscalar(val)
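# e.g. _is_label_like('a') and _is_label_like(3) are True (strings and
# scalars can name a column), while _is_label_like(['a']) is False.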
  1631. def _convert_grouper(axis, grouper):
  1632. if isinstance(grouper, dict):
  1633. return grouper.get
  1634. elif isinstance(grouper, Series):
  1635. if grouper.index.equals(axis):
  1636. return grouper.values
  1637. else:
  1638. return grouper.reindex(axis).values
  1639. elif isinstance(grouper, (list, Series, np.ndarray)):
  1640. if len(grouper) != len(axis):
  1641. raise AssertionError('Grouper and axis must be same length')
  1642. return grouper
  1643. else:
  1644. return grouper
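# A sketch of the conversions above, assuming ``axis`` is a 3-element Index:
#
# >>> _convert_grouper(axis, {'a': 1})                      # dict -> bound .get
# >>> _convert_grouper(axis, Series(range(3), index=axis))  # aligned -> .values
# >>> _convert_grouper(axis, [0, 1, 0])                     # same length -> as-is
#
# A Series whose index does not equal ``axis`` is reindexed first, so its
# values line up positionally with the grouped object.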
  1645. class SeriesGroupBy(GroupBy):
  1646. _apply_whitelist = _series_apply_whitelist
  1647. def aggregate(self, func_or_funcs, *args, **kwargs):
  1648. """
1649. Apply aggregation function or functions to groups, most likely
1650. yielding a Series, but in some cases a DataFrame, depending on the
1651. output of the aggregation function
  1652. Parameters
  1653. ----------
  1654. func_or_funcs : function or list / dict of functions
  1655. List/dict of functions will produce DataFrame with column names
  1656. determined by the function names themselves (list) or the keys in
  1657. the dict
  1658. Notes
  1659. -----
  1660. agg is an alias for aggregate. Use it.
  1661. Examples
  1662. --------
  1663. >>> series
  1664. bar 1.0
  1665. baz 2.0
  1666. qot 3.0
  1667. qux 4.0
  1668. >>> mapper = lambda x: x[0] # first letter
  1669. >>> grouped = series.groupby(mapper)
  1670. >>> grouped.aggregate(np.sum)
  1671. b 3.0
  1672. q 7.0
  1673. >>> grouped.aggregate([np.sum, np.mean, np.std])
  1674. mean std sum
  1675. b 1.5 0.5 3
  1676. q 3.5 0.5 7
  1677. >>> grouped.agg({'result' : lambda x: x.mean() / x.std(),
  1678. ... 'total' : np.sum})
  1679. result total
  1680. b 2.121 3
  1681. q 4.95 7
  1682. See also
  1683. --------
  1684. apply, transform
  1685. Returns
  1686. -------
  1687. Series or DataFrame
  1688. """
  1689. if isinstance(func_or_funcs, compat.string_types):
  1690. return getattr(self, func_or_funcs)(*args, **kwargs)
  1691. if hasattr(func_or_funcs, '__iter__'):
  1692. ret = self._aggregate_multiple_funcs(func_or_funcs)
  1693. else:
  1694. cyfunc = _intercept_cython(func_or_funcs)
  1695. if cyfunc and not args and not kwargs:
  1696. return getattr(self, cyfunc)()
  1697. if self.grouper.nkeys > 1:
  1698. return self._python_agg_general(func_or_funcs, *args, **kwargs)
  1699. try:
  1700. return self._python_agg_general(func_or_funcs, *args, **kwargs)
  1701. except Exception:
  1702. result = self._aggregate_named(func_or_funcs, *args, **kwargs)
  1703. index = Index(sorted(result), name=self.grouper.names[0])
  1704. ret = Series(result, index=index)
  1705. if not self.as_index: # pragma: no cover
  1706. print('Warning, ignoring as_index=True')
  1707. return ret
  1708. def _aggregate_multiple_funcs(self, arg):
  1709. if isinstance(arg, dict):
  1710. columns = list(arg.keys())
  1711. arg = list(arg.items())
  1712. elif any(isinstance(x, (tuple, list)) for x in arg):
  1713. arg = [(x, x) if not isinstance(x, (tuple, list)) else x
  1714. for x in arg]
  1715. # indicated column order
  1716. columns = lzip(*arg)[0]
  1717. else:
  1718. # list of functions / function names
  1719. columns = []
  1720. for f in arg:
  1721. if isinstance(f, compat.string_types):
  1722. columns.append(f)
  1723. else:
  1724. columns.append(f.__name__)
  1725. arg = lzip(columns, arg)
  1726. results = {}
  1727. for name, func in arg:
  1728. if name in results:
  1729. raise SpecificationError('Function names must be unique, '
  1730. 'found multiple named %s' % name)
  1731. results[name] = self.aggregate(func)
  1732. return DataFrame(results, columns=columns)
  1733. def _wrap_aggregated_output(self, output, names=None):
  1734. # sort of a kludge
  1735. output = output[self.name]
  1736. index = self.grouper.result_index
  1737. if names is not None:
  1738. return DataFrame(output, index=index, columns=names)
  1739. else:
  1740. name = self.name
  1741. if name is None:
  1742. name = self._selected_obj.name
  1743. return Series(output, index=index, name=name)
  1744. def _wrap_applied_output(self, keys, values, not_indexed_same=False):
  1745. if len(keys) == 0:
  1746. # GH #6265
  1747. return Series([], name=self.name)
  1748. def _get_index():
  1749. if self.grouper.nkeys > 1:
  1750. index = MultiIndex.from_tuples(keys, names=self.grouper.names)
  1751. else:
  1752. index = Index(keys, name=self.grouper.names[0])
  1753. return index
  1754. if isinstance(values[0], dict):
  1755. # GH #823
  1756. index = _get_index()
  1757. return DataFrame(values, index=index).stack()
  1758. if isinstance(values[0], (Series, dict)):
  1759. return self._concat_objects(keys, values,
  1760. not_indexed_same=not_indexed_same)
  1761. elif isinstance(values[0], DataFrame):
  1762. # possible that Series -> DataFrame by applied function
  1763. return self._concat_objects(keys, values,
  1764. not_indexed_same=not_indexed_same)
  1765. else:
  1766. # GH #6265
  1767. return Series(values, index=_get_index(), name=self.name)
  1768. def _aggregate_named(self, func, *args, **kwargs):
  1769. result = {}
  1770. for name, group in self:
  1771. group.name = name
  1772. output = func(group, *args, **kwargs)
  1773. if isinstance(output, (Series, np.ndarray)):
  1774. raise Exception('Must produce aggregated value')
  1775. result[name] = self._try_cast(output, group)
  1776. return result
  1777. def transform(self, func, *args, **kwargs):
  1778. """
  1779. Call function producing a like-indexed Series on each group and return
  1780. a Series with the transformed values
  1781. Parameters
  1782. ----------
  1783. func : function
  1784. To apply to each group. Should return a Series with the same index
  1785. Examples
  1786. --------
  1787. >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
  1788. Returns
  1789. -------
  1790. transformed : Series
  1791. """
  1792. dtype = self._selected_obj.dtype
  1793. if isinstance(func, compat.string_types):
  1794. wrapper = lambda x: getattr(x, func)(*args, **kwargs)
  1795. else:
  1796. wrapper = lambda x: func(x, *args, **kwargs)
  1797. result = self._selected_obj.values.copy()
  1798. for i, (name, group) in enumerate(self):
  1799. object.__setattr__(group, 'name', name)
  1800. res = wrapper(group)
  1801. if hasattr(res, 'values'):
  1802. res = res.values
  1803. # may need to astype
  1804. try:
  1805. common_type = np.common_type(np.array(res), result)
  1806. if common_type != result.dtype:
  1807. result = result.astype(common_type)
  1808. except:
  1809. pass
  1810. indexer = self._get_index(name)
  1811. result[indexer] = res
  1812. result = _possibly_downcast_to_dtype(result, dtype)
  1813. return self._selected_obj.__class__(result,
  1814. index=self._selected_obj.index,
  1815. name=self._selected_obj.name)
  1816. def filter(self, func, dropna=True, *args, **kwargs):
  1817. """
  1818. Return a copy of a Series excluding elements from groups that
  1819. do not satisfy the boolean criterion specified by func.
  1820. Parameters
  1821. ----------
  1822. func : function
  1823. To apply to each group. Should return True or False.
  1824. dropna : Drop groups that do not pass the filter. True by default;
  1825. if False, groups that evaluate False are filled with NaNs.
1826. Examples
1827. --------
  1828. >>> grouped.filter(lambda x: x.mean() > 0)
  1829. Returns
  1830. -------
  1831. filtered : Series
  1832. """
  1833. if isinstance(func, compat.string_types):
  1834. wrapper = lambda x: getattr(x, func)(*args, **kwargs)
  1835. else:
  1836. wrapper = lambda x: func(x, *args, **kwargs)
  1837. # Interpret np.nan as False.
  1838. def true_and_notnull(x, *args, **kwargs):
  1839. b = wrapper(x, *args, **kwargs)
  1840. return b and notnull(b)
  1841. try:
  1842. indices = [self._get_index(name) if true_and_notnull(group) else []
  1843. for name, group in self]
  1844. except ValueError:
  1845. raise TypeError("the filter must return a boolean result")
  1846. except TypeError:
  1847. raise TypeError("the filter must return a boolean result")
  1848. filtered = self._apply_filter(indices, dropna)
  1849. return filtered
  1850. def _apply_to_column_groupbys(self, func):
  1851. """ return a pass thru """
  1852. return func(self)
  1853. class NDFrameGroupBy(GroupBy):
  1854. def _iterate_slices(self):
  1855. if self.axis == 0:
  1856. # kludge
  1857. if self._selection is None:
  1858. slice_axis = self.obj.columns
  1859. else:
  1860. slice_axis = self._selection_list
  1861. slicer = lambda x: self.obj[x]
  1862. else:
  1863. slice_axis = self.obj.index
  1864. slicer = self.obj.xs
  1865. for val in slice_axis:
  1866. if val in self.exclusions:
  1867. continue
  1868. yield val, slicer(val)
  1869. def _cython_agg_general(self, how, numeric_only=True):
  1870. new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
  1871. return self._wrap_agged_blocks(new_items, new_blocks)
  1872. def _wrap_agged_blocks(self, items, blocks):
  1873. obj = self._obj_with_exclusions
  1874. new_axes = list(obj._data.axes)
  1875. # more kludge
  1876. if self.axis == 0:
  1877. new_axes[0], new_axes[1] = new_axes[1], self.grouper.result_index
  1878. else:
  1879. new_axes[self.axis] = self.grouper.result_index
  1880. # Make sure block manager integrity check passes.
  1881. assert new_axes[0].equals(items)
  1882. new_axes[0] = items
  1883. mgr = BlockManager(blocks, new_axes)
  1884. new_obj = type(obj)(mgr)
  1885. return self._post_process_cython_aggregate(new_obj)
  1886. _block_agg_axis = 0
  1887. def _cython_agg_blocks(self, how, numeric_only=True):
  1888. data, agg_axis = self._get_data_to_aggregate()
  1889. new_blocks = []
  1890. if numeric_only:
  1891. data = data.get_numeric_data(copy=False)
  1892. for block in data.blocks:
  1893. values = block._try_operate(block.values)
  1894. if block.is_numeric:
  1895. values = com.ensure_float(values)
  1896. result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
  1897. # see if we can cast the block back to the original dtype
  1898. result = block._try_coerce_and_cast_result(result)
  1899. newb = make_block(result, placement=block.mgr_locs)
  1900. new_blocks.append(newb)
  1901. if len(new_blocks) == 0:
  1902. raise DataError('No numeric types to aggregate')
  1903. return data.items, new_blocks
  1904. def _get_data_to_aggregate(self):
  1905. obj = self._obj_with_exclusions
  1906. if self.axis == 0:
  1907. return obj.swapaxes(0, 1)._data, 1
  1908. else:
  1909. return obj._data, self.axis
  1910. def _post_process_cython_aggregate(self, obj):
  1911. # undoing kludge from below
  1912. if self.axis == 0:
  1913. obj = obj.swapaxes(0, 1)
  1914. return obj
  1915. @cache_readonly
  1916. def _obj_with_exclusions(self):
  1917. if self._selection is not None:
  1918. return self.obj.reindex(columns=self._selection_list)
  1919. if len(self.exclusions) > 0:
  1920. return self.obj.drop(self.exclusions, axis=1)
  1921. else:
  1922. return self.obj
  1923. @Appender(_agg_doc)
  1924. def aggregate(self, arg, *args, **kwargs):
  1925. if isinstance(arg, compat.string_types):
  1926. return getattr(self, arg)(*args, **kwargs)
  1927. result = OrderedDict()
  1928. if isinstance(arg, dict):
  1929. if self.axis != 0: # pragma: no cover
  1930. raise ValueError('Can only pass dict with axis=0')
  1931. obj = self._selected_obj
  1932. if any(isinstance(x, (list, tuple, dict)) for x in arg.values()):
  1933. new_arg = OrderedDict()
  1934. for k, v in compat.iteritems(arg):
  1935. if not isinstance(v, (tuple, list, dict)):
  1936. new_arg[k] = [v]
  1937. else:
  1938. new_arg[k] = v
  1939. arg = new_arg
  1940. keys = []
  1941. if self._selection is not None:
  1942. subset = obj
  1943. if isinstance(subset, DataFrame):
  1944. raise NotImplementedError
  1945. for fname, agg_how in compat.iteritems(arg):
  1946. colg = SeriesGroupBy(subset, selection=self._selection,
  1947. grouper=self.grouper)
  1948. result[fname] = colg.aggregate(agg_how)
  1949. keys.append(fname)
  1950. else:
  1951. for col, agg_how in compat.iteritems(arg):
  1952. colg = SeriesGroupBy(obj[col], selection=col,
  1953. grouper=self.grouper)
  1954. result[col] = colg.aggregate(agg_how)
  1955. keys.append(col)
  1956. if isinstance(list(result.values())[0], DataFrame):
  1957. from pandas.tools.merge import concat
  1958. result = concat([result[k] for k in keys], keys=keys, axis=1)
  1959. else:
  1960. result = DataFrame(result)
  1961. elif isinstance(arg, list):
  1962. return self._aggregate_multiple_funcs(arg)
  1963. else:
  1964. cyfunc = _intercept_cython(arg)
  1965. if cyfunc and not args and not kwargs:
  1966. return getattr(self, cyfunc)()
  1967. if self.grouper.nkeys > 1:
  1968. return self._python_agg_general(arg, *args, **kwargs)
  1969. else:
  1970. # try to treat as if we are passing a list
  1971. try:
  1972. assert not args and not kwargs
  1973. result = self._aggregate_multiple_funcs([arg])
  1974. result.columns = Index(result.columns.levels[0],
  1975. name=self._selected_obj.columns.name)
  1976. except:
  1977. result = self._aggregate_generic(arg, *args, **kwargs)
  1978. if not self.as_index:
  1979. if isinstance(result.index, MultiIndex):
  1980. zipped = zip(result.index.levels, result.index.labels,
  1981. result.index.names)
  1982. for i, (lev, lab, name) in enumerate(zipped):
  1983. result.insert(i, name,
  1984. com.take_nd(lev.values, lab,
  1985. allow_fill=False))
  1986. result = result.consolidate()
  1987. else:
  1988. values = result.index.values
  1989. name = self.grouper.groupings[0].name
  1990. result.insert(0, name, values)
  1991. result.index = np.arange(len(result))
  1992. return result.convert_objects()
  1993. def _aggregate_multiple_funcs(self, arg):
  1994. from pandas.tools.merge import concat
  1995. if self.axis != 0:
  1996. raise NotImplementedError
  1997. obj = self._obj_with_exclusions
  1998. results = []
  1999. keys = []
  2000. for col in obj:
  2001. try:
  2002. colg = SeriesGroupBy(obj[col], selection=col,
  2003. grouper=self.grouper)
  2004. results.append(colg.aggregate(arg))
  2005. keys.append(col)
  2006. except (TypeError, DataError):
  2007. pass
  2008. except SpecificationError:
  2009. raise
  2010. result = concat(results, keys=keys, axis=1)
  2011. return result
  2012. def _aggregate_generic(self, func, *args, **kwargs):
  2013. if self.grouper.nkeys != 1:
  2014. raise AssertionError('Number of keys must be 1')
  2015. axis = self.axis
  2016. obj = self._obj_with_exclusions
  2017. result = {}
  2018. if axis != obj._info_axis_number:
  2019. try:
  2020. for name, data in self:
  2021. # for name in self.indices:
  2022. # data = self.get_group(name, obj=obj)
  2023. result[name] = self._try_cast(func(data, *args, **kwargs),
  2024. data)
  2025. except Exception:
  2026. return self._aggregate_item_by_item(func, *args, **kwargs)
  2027. else:
  2028. for name in self.indices:
  2029. try:
  2030. data = self.get_group(name, obj=obj)
  2031. result[name] = self._try_cast(func(data, *args, **kwargs),
  2032. data)
  2033. except Exception:
  2034. wrapper = lambda x: func(x, *args, **kwargs)
  2035. result[name] = data.apply(wrapper, axis=axis)
  2036. return self._wrap_generic_output(result, obj)
  2037. def _wrap_aggregated_output(self, output, names=None):
  2038. raise NotImplementedError
  2039. def _aggregate_item_by_item(self, func, *args, **kwargs):
  2040. # only for axis==0
  2041. obj = self._obj_with_exclusions
  2042. result = {}
  2043. cannot_agg = []
2044. errors = None
  2045. for item in obj:
  2046. try:
  2047. data = obj[item]
  2048. colg = SeriesGroupBy(data, selection=item,
  2049. grouper=self.grouper)
  2050. result[item] = self._try_cast(
  2051. colg.aggregate(func, *args, **kwargs), data)
  2052. except ValueError:
  2053. cannot_agg.append(item)
  2054. continue
  2055. except TypeError as e:
  2056. cannot_agg.append(item)
2057. errors = e
  2058. continue
  2059. result_columns = obj.columns
  2060. if cannot_agg:
  2061. result_columns = result_columns.drop(cannot_agg)
  2062. # GH6337
  2063. if not len(result_columns) and errors is not None:
  2064. raise errors
  2065. return DataFrame(result, columns=result_columns)
  2066. def _decide_output_index(self, output, labels):
  2067. if len(output) == len(labels):
  2068. output_keys = labels
  2069. else:
  2070. output_keys = sorted(output)
  2071. try:
  2072. output_keys.sort()
  2073. except Exception: # pragma: no cover
  2074. pass
  2075. if isinstance(labels, MultiIndex):
  2076. output_keys = MultiIndex.from_tuples(output_keys,
  2077. names=labels.names)
  2078. return output_keys
  2079. def _wrap_applied_output(self, keys, values, not_indexed_same=False):
  2080. from pandas.core.index import _all_indexes_same
  2081. if len(keys) == 0:
  2082. # XXX
  2083. return DataFrame({})
  2084. key_names = self.grouper.names
  2085. if isinstance(values[0], DataFrame):
  2086. return self._concat_objects(keys, values,
  2087. not_indexed_same=not_indexed_same)
  2088. elif hasattr(self.grouper, 'groupings'):
  2089. if len(self.grouper.groupings) > 1:
  2090. key_index = MultiIndex.from_tuples(keys, names=key_names)
  2091. else:
  2092. ping = self.grouper.groupings[0]
  2093. if len(keys) == ping.ngroups:
  2094. key_index = ping.group_index
  2095. key_index.name = key_names[0]
  2096. key_lookup = Index(keys)
  2097. indexer = key_lookup.get_indexer(key_index)
  2098. # reorder the values
  2099. values = [values[i] for i in indexer]
  2100. else:
  2101. key_index = Index(keys, name=key_names[0])
  2102. # don't use the key indexer
  2103. if not self.as_index:
  2104. key_index = None
  2105. # make Nones an empty object
  2106. if com._count_not_none(*values) != len(values):
  2107. v = next(v for v in values if v is not None)
  2108. if v is None:
  2109. return DataFrame()
  2110. elif isinstance(v, NDFrame):
  2111. values = [
  2112. x if x is not None else
  2113. v._constructor(**v._construct_axes_dict())
  2114. for x in values
  2115. ]
  2116. v = values[0]
  2117. if isinstance(v, (np.ndarray, Series)):
  2118. if isinstance(v, Series):
  2119. applied_index = self._selected_obj._get_axis(self.axis)
  2120. all_indexed_same = _all_indexes_same([
  2121. x.index for x in values
  2122. ])
  2123. singular_series = (len(values) == 1 and
  2124. applied_index.nlevels == 1)
  2125. # GH3596
  2126. # provide a reduction (Frame -> Series) if groups are
  2127. # unique
  2128. if self.squeeze:
  2129. # assign the name to this series
  2130. if singular_series:
  2131. values[0].name = keys[0]
  2132. # GH2893
  2133. # we have series in the values array, we want to
  2134. # produce a series:
  2135. # if any of the sub-series are not indexed the same
  2136. # OR we don't have a multi-index and we have only a
2137. # single value
  2138. return self._concat_objects(
  2139. keys, values, not_indexed_same=not_indexed_same
  2140. )
  2141. # still a series
  2142. # path added as of GH 5545
  2143. elif all_indexed_same:
  2144. from pandas.tools.merge import concat
  2145. return concat(values)
  2146. if not all_indexed_same:
  2147. return self._concat_objects(
  2148. keys, values, not_indexed_same=not_indexed_same
  2149. )
  2150. try:
  2151. if self.axis == 0:
  2152. # GH6124 if the list of Series have a consistent name,
  2153. # then propagate that name to the result.
  2154. index = v.index.copy()
  2155. if index.name is None:
  2156. # Only propagate the series name to the result
  2157. # if all series have a consistent name. If the
  2158. # series do not have a consistent name, do
  2159. # nothing.
  2160. names = set(v.name for v in values)
  2161. if len(names) == 1:
  2162. index.name = list(names)[0]
2163. # normally use vstack as it's faster than concat
  2164. # and if we have mi-columns
2165. if not _np_version_under1p7 or isinstance(v.index, MultiIndex) or key_index is None:
  2166. stacked_values = np.vstack([np.asarray(x) for x in values])
2167. result = DataFrame(stacked_values, index=key_index, columns=index)
  2168. else:
  2169. # GH5788 instead of stacking; concat gets the dtypes correct
  2170. from pandas.tools.merge import concat
2171. result = concat(values, keys=key_index, names=key_index.names,
  2172. axis=self.axis).unstack()
  2173. result.columns = index
  2174. else:
  2175. stacked_values = np.vstack([np.asarray(x) for x in values])
2176. result = DataFrame(stacked_values.T, index=v.index, columns=key_index)
  2177. except (ValueError, AttributeError):
2178. # GH1738: values is a list of arrays of unequal lengths; fall
2179. # through to the outer else clause
  2180. return Series(values, index=key_index)
  2181. # if we have date/time like in the original, then coerce dates
  2182. # as we are stacking can easily have object dtypes here
  2183. if (self._selected_obj.ndim == 2
  2184. and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()):
  2185. cd = 'coerce'
  2186. else:
  2187. cd = True
  2188. return result.convert_objects(convert_dates=cd)
  2189. else:
  2190. # only coerce dates if we find at least 1 datetime
2191. cd = 'coerce' if any(isinstance(v, Timestamp) for v in values) else False
  2192. return Series(values, index=key_index).convert_objects(convert_dates=cd)
  2193. else:
  2194. # Handle cases like BinGrouper
  2195. return self._concat_objects(keys, values,
  2196. not_indexed_same=not_indexed_same)
  2197. def _transform_general(self, func, *args, **kwargs):
  2198. from pandas.tools.merge import concat
  2199. applied = []
  2200. obj = self._obj_with_exclusions
  2201. gen = self.grouper.get_iterator(obj, axis=self.axis)
  2202. fast_path, slow_path = self._define_paths(func, *args, **kwargs)
  2203. path = None
  2204. for name, group in gen:
  2205. object.__setattr__(group, 'name', name)
  2206. if path is None:
  2207. # Try slow path and fast path.
  2208. try:
  2209. path, res = self._choose_path(fast_path, slow_path, group)
  2210. except TypeError:
  2211. return self._transform_item_by_item(obj, fast_path)
  2212. except Exception: # pragma: no cover
  2213. res = fast_path(group)
  2214. path = fast_path
  2215. else:
  2216. res = path(group)
  2217. # broadcasting
  2218. if isinstance(res, Series):
  2219. if res.index.is_(obj.index):
  2220. group.T.values[:] = res
  2221. else:
  2222. group.values[:] = res
  2223. applied.append(group)
  2224. else:
  2225. applied.append(res)
  2226. concat_index = obj.columns if self.axis == 0 else obj.index
  2227. concatenated = concat(applied, join_axes=[concat_index],
  2228. axis=self.axis, verify_integrity=False)
  2229. concatenated.sort_index(inplace=True)
  2230. return concatenated
  2231. def transform(self, func, *args, **kwargs):
  2232. """
  2233. Call function producing a like-indexed DataFrame on each group and
  2234. return a DataFrame having the same indexes as the original object
  2235. filled with the transformed values
  2236. Parameters
  2237. ----------
2238. func : function
  2239. Function to apply to each subframe
  2240. Notes
  2241. -----
  2242. Each subframe is endowed the attribute 'name' in case you need to know
  2243. which group you are working on.
  2244. Examples
  2245. --------
  2246. >>> grouped = df.groupby(lambda x: mapping[x])
  2247. >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
  2248. """
  2249. # try to do a fast transform via merge if possible
  2250. try:
  2251. obj = self._obj_with_exclusions
  2252. if isinstance(func, compat.string_types):
  2253. result = getattr(self, func)(*args, **kwargs)
  2254. else:
  2255. cyfunc = _intercept_cython(func)
  2256. if cyfunc and not args and not kwargs:
  2257. result = getattr(self, cyfunc)()
  2258. else:
  2259. return self._transform_general(func, *args, **kwargs)
  2260. except:
  2261. return self._transform_general(func, *args, **kwargs)
  2262. # a reduction transform
  2263. if not isinstance(result, DataFrame):
  2264. return self._transform_general(func, *args, **kwargs)
2265. # nuisance columns
  2266. if not result.columns.equals(obj.columns):
  2267. return self._transform_general(func, *args, **kwargs)
2268. # a grouped result that doesn't preserve the index; remap the index
2269. # based on the grouper and broadcast it
2270. if not isinstance(obj.index, MultiIndex) and type(result.index) != type(obj.index):
2271. results = obj.values.copy()
2272. for (name, group), (i, row) in zip(self, result.iterrows()):
2273. indexer = self._get_index(name)
2274. results[indexer] = np.tile(row.values, len(indexer)).reshape(len(indexer), -1)
2275. return DataFrame(results, columns=result.columns, index=obj.index).convert_objects()
  2276. # we can merge the result in
  2277. # GH 7383
  2278. names = result.columns
2279. result = obj.merge(result, how='outer', left_index=True, right_index=True).ix[:, -result.shape[1]:]
  2280. result.columns = names
  2281. return result
  2282. def _define_paths(self, func, *args, **kwargs):
  2283. if isinstance(func, compat.string_types):
  2284. fast_path = lambda group: getattr(group, func)(*args, **kwargs)
  2285. slow_path = lambda group: group.apply(
  2286. lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
  2287. else:
  2288. fast_path = lambda group: func(group, *args, **kwargs)
  2289. slow_path = lambda group: group.apply(
  2290. lambda x: func(x, *args, **kwargs), axis=self.axis)
  2291. return fast_path, slow_path
  2292. def _choose_path(self, fast_path, slow_path, group):
  2293. path = slow_path
  2294. res = slow_path(group)
  2295. # if we make it here, test if we can use the fast path
  2296. try:
  2297. res_fast = fast_path(group)
  2298. # compare that we get the same results
  2299. if res.shape == res_fast.shape:
  2300. res_r = res.values.ravel()
  2301. res_fast_r = res_fast.values.ravel()
  2302. mask = notnull(res_r)
  2303. if (res_r[mask] == res_fast_r[mask]).all():
  2304. path = fast_path
  2305. except:
  2306. pass
  2307. return path, res
  2308. def _transform_item_by_item(self, obj, wrapper):
  2309. # iterate through columns
  2310. output = {}
  2311. inds = []
  2312. for i, col in enumerate(obj):
  2313. try:
  2314. output[col] = self[col].transform(wrapper)
  2315. inds.append(i)
  2316. except Exception:
  2317. pass
  2318. if len(output) == 0: # pragma: no cover
  2319. raise TypeError('Transform function invalid for data types')
  2320. columns = obj.columns
  2321. if len(output) < len(obj.columns):
  2322. columns = columns.take(inds)
  2323. return DataFrame(output, index=obj.index, columns=columns)
  2324. def filter(self, func, dropna=True, *args, **kwargs):
  2325. """
  2326. Return a copy of a DataFrame excluding elements from groups that
  2327. do not satisfy the boolean criterion specified by func.
  2328. Parameters
  2329. ----------
2330. func : function
  2331. Function to apply to each subframe. Should return True or False.
  2332. dropna : Drop groups that do not pass the filter. True by default;
  2333. if False, groups that evaluate False are filled with NaNs.
  2334. Notes
  2335. -----
  2336. Each subframe is endowed the attribute 'name' in case you need to know
  2337. which group you are working on.
2338. Examples
2339. --------
  2340. >>> grouped = df.groupby(lambda x: mapping[x])
  2341. >>> grouped.filter(lambda x: x['A'].sum() + x['B'].sum() > 0)
  2342. """
  2343. from pandas.tools.merge import concat
  2344. indices = []
  2345. obj = self._selected_obj
  2346. gen = self.grouper.get_iterator(obj, axis=self.axis)
  2347. fast_path, slow_path = self._define_paths(func, *args, **kwargs)
  2348. path = None
  2349. for name, group in gen:
  2350. object.__setattr__(group, 'name', name)
  2351. if path is None:
  2352. # Try slow path and fast path.
  2353. try:
  2354. path, res = self._choose_path(fast_path, slow_path, group)
  2355. except Exception: # pragma: no cover
  2356. res = fast_path(group)
  2357. path = fast_path
  2358. else:
  2359. res = path(group)
  2360. def add_indices():
  2361. indices.append(self._get_index(name))
  2362. # interpret the result of the filter
  2363. if isinstance(res, (bool, np.bool_)):
  2364. if res:
  2365. add_indices()
  2366. else:
  2367. if getattr(res, 'ndim', None) == 1:
  2368. val = res.ravel()[0]
  2369. if val and notnull(val):
  2370. add_indices()
  2371. else:
  2372. # in theory you could do .all() on the boolean result ?
  2373. raise TypeError("the filter must return a boolean result")
  2374. filtered = self._apply_filter(indices, dropna)
  2375. return filtered
  2376. class DataFrameGroupBy(NDFrameGroupBy):
  2377. _apply_whitelist = _dataframe_apply_whitelist
  2378. _block_agg_axis = 1
  2379. def __getitem__(self, key):
  2380. if self._selection is not None:
  2381. raise Exception('Column(s) %s already selected' % self._selection)
  2382. if isinstance(key, (list, tuple, Series, np.ndarray)):
  2383. if len(self.obj.columns.intersection(key)) != len(key):
  2384. bad_keys = list(set(key).difference(self.obj.columns))
  2385. raise KeyError("Columns not found: %s"
  2386. % str(bad_keys)[1:-1])
  2387. return DataFrameGroupBy(self.obj, self.grouper, selection=key,
  2388. grouper=self.grouper,
  2389. exclusions=self.exclusions,
  2390. as_index=self.as_index)
  2391. elif not self.as_index:
  2392. if key not in self.obj.columns:
  2393. raise KeyError("Column not found: %s" % key)
  2394. return DataFrameGroupBy(self.obj, self.grouper, selection=key,
  2395. grouper=self.grouper,
  2396. exclusions=self.exclusions,
  2397. as_index=self.as_index)
  2398. else:
  2399. if key not in self.obj:
  2400. raise KeyError("Column not found: %s" % key)
  2401. # kind of a kludge
  2402. return SeriesGroupBy(self.obj[key], selection=key,
  2403. grouper=self.grouper,
  2404. exclusions=self.exclusions)
  2405. def _wrap_generic_output(self, result, obj):
  2406. result_index = self.grouper.levels[0]
  2407. if result:
  2408. if self.axis == 0:
  2409. result = DataFrame(result, index=obj.columns,
  2410. columns=result_index).T
  2411. else:
  2412. result = DataFrame(result, index=obj.index,
  2413. columns=result_index)
  2414. else:
  2415. result = DataFrame(result)
  2416. return result
  2417. def _get_data_to_aggregate(self):
  2418. obj = self._obj_with_exclusions
  2419. if self.axis == 1:
  2420. return obj.T._data, 1
  2421. else:
  2422. return obj._data, 1
  2423. def _wrap_aggregated_output(self, output, names=None):
  2424. agg_axis = 0 if self.axis == 1 else 1
  2425. agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
  2426. output_keys = self._decide_output_index(output, agg_labels)
  2427. if not self.as_index:
  2428. result = DataFrame(output, columns=output_keys)
  2429. group_levels = self.grouper.get_group_levels()
  2430. zipped = zip(self.grouper.names, group_levels)
  2431. for i, (name, labels) in enumerate(zipped):
  2432. result.insert(i, name, labels)
  2433. result = result.consolidate()
  2434. else:
  2435. index = self.grouper.result_index
  2436. result = DataFrame(output, index=index, columns=output_keys)
  2437. if self.axis == 1:
  2438. result = result.T
  2439. return result.convert_objects()
  2440. def _wrap_agged_blocks(self, items, blocks):
  2441. if not self.as_index:
  2442. index = np.arange(blocks[0].values.shape[1])
  2443. mgr = BlockManager(blocks, [items, index])
  2444. result = DataFrame(mgr)
  2445. group_levels = self.grouper.get_group_levels()
  2446. zipped = zip(self.grouper.names, group_levels)
  2447. for i, (name, labels) in enumerate(zipped):
  2448. result.insert(i, name, labels)
  2449. result = result.consolidate()
  2450. else:
  2451. index = self.grouper.result_index
  2452. mgr = BlockManager(blocks, [items, index])
  2453. result = DataFrame(mgr)
  2454. if self.axis == 1:
  2455. result = result.T
  2456. return result.convert_objects()
  2457. def _iterate_column_groupbys(self):
  2458. for i, colname in enumerate(self._selected_obj.columns):
  2459. yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
  2460. selection=colname,
  2461. grouper=self.grouper,
  2462. exclusions=self.exclusions)
  2463. def _apply_to_column_groupbys(self, func):
  2464. from pandas.tools.merge import concat
  2465. return concat(
  2466. (func(col_groupby) for _, col_groupby
  2467. in self._iterate_column_groupbys()),
  2468. keys=self._selected_obj.columns, axis=1)
  2469. from pandas.tools.plotting import boxplot_frame_groupby
  2470. DataFrameGroupBy.boxplot = boxplot_frame_groupby
  2471. class PanelGroupBy(NDFrameGroupBy):
  2472. def _iterate_slices(self):
  2473. if self.axis == 0:
  2474. # kludge
  2475. if self._selection is None:
  2476. slice_axis = self._selected_obj.items
  2477. else:
  2478. slice_axis = self._selection_list
  2479. slicer = lambda x: self._selected_obj[x]
  2480. else:
  2481. raise NotImplementedError
  2482. for val in slice_axis:
  2483. if val in self.exclusions:
  2484. continue
  2485. yield val, slicer(val)
  2486. def aggregate(self, arg, *args, **kwargs):
  2487. """
  2488. Aggregate using input function or dict of {column -> function}
  2489. Parameters
  2490. ----------
  2491. arg : function or dict
  2492. Function to use for aggregating groups. If a function, must either
  2493. work when passed a Panel or when passed to Panel.apply. If
2494. passed a dict, the keys must be DataFrame column names
  2495. Returns
  2496. -------
  2497. aggregated : Panel
  2498. """
  2499. if isinstance(arg, compat.string_types):
  2500. return getattr(self, arg)(*args, **kwargs)
  2501. return self._aggregate_generic(arg, *args, **kwargs)
  2502. def _wrap_generic_output(self, result, obj):
  2503. if self.axis == 0:
  2504. new_axes = list(obj.axes)
  2505. new_axes[0] = self.grouper.result_index
  2506. elif self.axis == 1:
  2507. x, y, z = obj.axes
  2508. new_axes = [self.grouper.result_index, z, x]
  2509. else:
  2510. x, y, z = obj.axes
  2511. new_axes = [self.grouper.result_index, y, x]
  2512. result = Panel._from_axes(result, new_axes)
  2513. if self.axis == 1:
  2514. result = result.swapaxes(0, 1).swapaxes(0, 2)
  2515. elif self.axis == 2:
  2516. result = result.swapaxes(0, 2)
  2517. return result
  2518. def _aggregate_item_by_item(self, func, *args, **kwargs):
  2519. obj = self._obj_with_exclusions
  2520. result = {}
  2521. if self.axis > 0:
  2522. for item in obj:
  2523. try:
  2524. itemg = DataFrameGroupBy(obj[item],
  2525. axis=self.axis - 1,
  2526. grouper=self.grouper)
  2527. result[item] = itemg.aggregate(func, *args, **kwargs)
  2528. except (ValueError, TypeError):
  2529. raise
  2530. new_axes = list(obj.axes)
  2531. new_axes[self.axis] = self.grouper.result_index
  2532. return Panel._from_axes(result, new_axes)
  2533. else:
  2534. raise NotImplementedError
  2535. def _wrap_aggregated_output(self, output, names=None):
  2536. raise NotImplementedError
  2537. class NDArrayGroupBy(GroupBy):
  2538. pass
  2539. #----------------------------------------------------------------------
  2540. # Splitting / application
  2541. class DataSplitter(object):
  2542. def __init__(self, data, labels, ngroups, axis=0):
  2543. self.data = data
  2544. self.labels = com._ensure_int64(labels)
  2545. self.ngroups = ngroups
  2546. self.axis = axis
  2547. @cache_readonly
  2548. def slabels(self):
  2549. # Sorted labels
  2550. return com.take_nd(self.labels, self.sort_idx, allow_fill=False)
  2551. @cache_readonly
  2552. def sort_idx(self):
  2553. # Counting sort indexer
  2554. return _algos.groupsort_indexer(self.labels, self.ngroups)[0]
  2555. def __iter__(self):
  2556. sdata = self._get_sorted_data()
  2557. if self.ngroups == 0:
  2558. raise StopIteration
  2559. starts, ends = lib.generate_slices(self.slabels, self.ngroups)
  2560. for i, (start, end) in enumerate(zip(starts, ends)):
  2561. # Since I'm now compressing the group ids, it's now not "possible"
  2562. # to produce empty slices because such groups would not be observed
  2563. # in the data
  2564. # if start >= end:
  2565. # raise AssertionError('Start %s must be less than end %s'
  2566. # % (str(start), str(end)))
  2567. yield i, self._chop(sdata, slice(start, end))
  2568. def _get_sorted_data(self):
  2569. return self.data.take(self.sort_idx, axis=self.axis, convert=False)
  2570. def _chop(self, sdata, slice_obj):
  2571. return sdata.iloc[slice_obj]
  2572. def apply(self, f):
  2573. raise NotImplementedError
  2574. class ArraySplitter(DataSplitter):
  2575. pass
  2576. class SeriesSplitter(DataSplitter):
  2577. def _chop(self, sdata, slice_obj):
  2578. return sdata._get_values(slice_obj).to_dense()
  2579. class FrameSplitter(DataSplitter):
  2580. def __init__(self, data, labels, ngroups, axis=0):
  2581. super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis)
  2582. def fast_apply(self, f, names):
2583. # must return values::list, mutated::bool
  2584. try:
  2585. starts, ends = lib.generate_slices(self.slabels, self.ngroups)
  2586. except:
  2587. # fails when all -1
  2588. return [], True
  2589. sdata = self._get_sorted_data()
  2590. results, mutated = lib.apply_frame_axis0(sdata, f, names, starts, ends)
  2591. return results, mutated
  2592. def _chop(self, sdata, slice_obj):
  2593. if self.axis == 0:
  2594. return sdata.iloc[slice_obj]
  2595. else:
  2596. return sdata._slice(slice_obj, axis=1) # ix[:, slice_obj]
  2597. class NDFrameSplitter(DataSplitter):
  2598. def __init__(self, data, labels, ngroups, axis=0):
  2599. super(NDFrameSplitter, self).__init__(data, labels, ngroups, axis=axis)
  2600. self.factory = data._constructor
  2601. def _get_sorted_data(self):
  2602. # this is the BlockManager
  2603. data = self.data._data
  2604. # this is sort of wasteful but...
  2605. sorted_axis = data.axes[self.axis].take(self.sort_idx)
  2606. sorted_data = data.reindex_axis(sorted_axis, axis=self.axis)
  2607. return sorted_data
  2608. def _chop(self, sdata, slice_obj):
  2609. return self.factory(sdata.get_slice(slice_obj, axis=self.axis))
  2610. def get_splitter(data, *args, **kwargs):
  2611. if isinstance(data, Series):
  2612. klass = SeriesSplitter
  2613. elif isinstance(data, DataFrame):
  2614. klass = FrameSplitter
  2615. else:
  2616. klass = NDFrameSplitter
  2617. return klass(data, *args, **kwargs)
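# Usage sketch (``labels``/``ngroups`` as produced by a BaseGrouper): a
# splitter yields (group_number, chunk) pairs in sorted-group order, e.g.
#
# >>> splitter = get_splitter(df, labels, ngroups, axis=0)
# >>> for i, chunk in splitter:
# ...     pass  # chunk is the i-th group's slice of df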
  2618. #----------------------------------------------------------------------
  2619. # Misc utilities
  2620. def get_group_index(label_list, shape):
  2621. """
  2622. For the particular label_list, gets the offsets into the hypothetical list
  2623. representing the totally ordered cartesian product of all possible label
  2624. combinations.
  2625. """
  2626. if len(label_list) == 1:
  2627. return label_list[0]
  2628. n = len(label_list[0])
  2629. group_index = np.zeros(n, dtype=np.int64)
  2630. mask = np.zeros(n, dtype=bool)
  2631. for i in range(len(shape)):
  2632. stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
  2633. group_index += com._ensure_int64(label_list[i]) * stride
  2634. mask |= label_list[i] < 0
  2635. np.putmask(group_index, mask, -1)
  2636. return group_index
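# Worked example: with shape (2, 3) the stride of the first level is 3, so
# the labels combine as label0 * 3 + label1:
#
# >>> get_group_index([np.array([0, 1, 1]), np.array([2, 0, 1])], (2, 3))
# array([2, 3, 4])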
  2637. _INT64_MAX = np.iinfo(np.int64).max
  2638. def _int64_overflow_possible(shape):
  2639. the_prod = long(1)
  2640. for x in shape:
  2641. the_prod *= long(x)
  2642. return the_prod >= _INT64_MAX
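# e.g. _int64_overflow_possible((1000, 1000)) is False, while shapes whose
# product reaches 2 ** 63 - 1 (such as (2 ** 32, 2 ** 31)) return True.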
  2643. def decons_group_index(comp_labels, shape):
  2644. # reconstruct labels
  2645. label_list = []
  2646. factor = 1
  2647. y = 0
  2648. x = comp_labels
  2649. for i in reversed(range(len(shape))):
  2650. labels = (x - y) % (factor * shape[i]) // factor
  2651. np.putmask(labels, comp_labels < 0, -1)
  2652. label_list.append(labels)
  2653. y = labels * factor
  2654. factor *= shape[i]
  2655. return label_list[::-1]
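# This inverts get_group_index (for non-negative labels); continuing the
# example above:
#
# >>> decons_group_index(np.array([2, 3, 4]), (2, 3))
# [array([0, 1, 1]), array([2, 0, 1])]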
  2656. def _indexer_from_factorized(labels, shape, compress=True):
  2657. if _int64_overflow_possible(shape):
  2658. indexer = np.lexsort(np.array(labels[::-1]))
  2659. return indexer
  2660. group_index = get_group_index(labels, shape)
  2661. if compress:
  2662. comp_ids, obs_ids = _compress_group_index(group_index)
  2663. max_group = len(obs_ids)
  2664. else:
  2665. comp_ids = group_index
  2666. max_group = com._long_prod(shape)
  2667. if max_group > 1e6:
  2668. # Use mergesort to avoid memory errors in counting sort
  2669. indexer = comp_ids.argsort(kind='mergesort')
  2670. else:
  2671. indexer, _ = _algos.groupsort_indexer(comp_ids.astype(np.int64),
  2672. max_group)
  2673. return indexer
  2674. def _lexsort_indexer(keys, orders=None, na_position='last'):
  2675. labels = []
  2676. shape = []
  2677. if isinstance(orders, bool):
  2678. orders = [orders] * len(keys)
  2679. elif orders is None:
  2680. orders = [True] * len(keys)
  2681. for key, order in zip(keys, orders):
  2682. key = np.asanyarray(key)
  2683. rizer = _hash.Factorizer(len(key))
  2684. if not key.dtype == np.object_:
  2685. key = key.astype('O')
  2686. # factorize maps nans to na_sentinel=-1
  2687. ids = rizer.factorize(key, sort=True)
  2688. n = len(rizer.uniques)
  2689. mask = (ids == -1)
  2690. if order: # ascending
  2691. if na_position == 'last':
  2692. ids = np.where(mask, n, ids)
  2693. elif na_position == 'first':
  2694. ids += 1
  2695. else:
  2696. raise ValueError('invalid na_position: {!r}'.format(na_position))
  2697. else: # not order means descending
  2698. if na_position == 'last':
  2699. ids = np.where(mask, n, n-ids-1)
  2700. elif na_position == 'first':
  2701. ids = np.where(mask, 0, n-ids)
  2702. else:
  2703. raise ValueError('invalid na_position: {!r}'.format(na_position))
  2704. if mask.any():
  2705. n += 1
  2706. shape.append(n)
  2707. labels.append(ids)
  2708. return _indexer_from_factorized(labels, shape)
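# A small sketch: rows are ordered by the first key, then the second:
#
# >>> _lexsort_indexer([np.array(['b', 'a', 'b']), np.array([2, 1, 0])])
# array([1, 2, 0])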
  2709. def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
  2710. """
2711. This is intended to be a drop-in replacement for np.argsort that handles NaNs.
2712. It adds ascending and na_position parameters.
  2713. GH #6399, #5231
  2714. """
  2715. items = np.asanyarray(items)
  2716. idx = np.arange(len(items))
  2717. mask = isnull(items)
  2718. non_nans = items[~mask]
  2719. non_nan_idx = idx[~mask]
  2720. nan_idx = np.nonzero(mask)[0]
  2721. if not ascending:
  2722. non_nans = non_nans[::-1]
  2723. non_nan_idx = non_nan_idx[::-1]
  2724. indexer = non_nan_idx[non_nans.argsort(kind=kind)]
  2725. if not ascending:
  2726. indexer = indexer[::-1]
  2727. # Finally, place the NaNs at the end or the beginning according to na_position
  2728. if na_position == 'last':
  2729. indexer = np.concatenate([indexer, nan_idx])
  2730. elif na_position == 'first':
  2731. indexer = np.concatenate([nan_idx, indexer])
  2732. else:
  2733. raise ValueError('invalid na_position: {!r}'.format(na_position))
  2734. return indexer
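# For example, NaNs go to the back by default:
#
# >>> _nargsort(np.array([3.0, np.nan, 1.0]))
# array([2, 0, 1])
# >>> _nargsort(np.array([3.0, np.nan, 1.0]), na_position='first')
# array([1, 2, 0])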
  2735. class _KeyMapper(object):
  2736. """
  2737. Ease my suffering. Map compressed group id -> key tuple
  2738. """
  2739. def __init__(self, comp_ids, ngroups, labels, levels):
  2740. self.levels = levels
  2741. self.labels = labels
  2742. self.comp_ids = comp_ids.astype(np.int64)
  2743. self.k = len(labels)
  2744. self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)]
  2745. self._populate_tables()
  2746. def _populate_tables(self):
  2747. for labs, table in zip(self.labels, self.tables):
  2748. table.map(self.comp_ids, labs.astype(np.int64))
  2749. def get_key(self, comp_id):
  2750. return tuple(level[table.get_item(comp_id)]
  2751. for table, level in zip(self.tables, self.levels))
  2752. def _get_indices_dict(label_list, keys):
  2753. shape = [len(x) for x in keys]
  2754. group_index = get_group_index(label_list, shape)
  2755. sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
  2756. np.prod(shape))
  2757. sorter_int = com._ensure_platform_int(sorter)
  2758. sorted_labels = [lab.take(sorter_int) for lab in label_list]
  2759. group_index = group_index.take(sorter_int)
  2760. return lib.indices_fast(sorter, group_index, keys, sorted_labels)
  2761. #----------------------------------------------------------------------
  2762. # sorting levels...cleverly?
  2763. def _compress_group_index(group_index, sort=True):
  2764. """
  2765. Group_index is offsets into cartesian product of all possible labels. This
  2766. space can be huge, so this function compresses it, by computing offsets
  2767. (comp_ids) into the list of unique labels (obs_group_ids).
  2768. """
  2769. table = _hash.Int64HashTable(min(1000000, len(group_index)))
  2770. group_index = com._ensure_int64(group_index)
2771. # note, group labels come out ascending (i.e., 1, 2, 3, etc.)
  2772. comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
  2773. if sort and len(obs_group_ids) > 0:
  2774. obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
  2775. return comp_ids, obs_group_ids
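# Worked example: rows falling in observed groups 5, 1, 3, 1 compress to
# ids over the sorted unique group list [1, 3, 5]:
#
# >>> _compress_group_index(np.array([5, 1, 3, 1]))
# (array([2, 0, 1, 0]), array([1, 3, 5]))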
  2776. def _reorder_by_uniques(uniques, labels):
  2777. # sorter is index where elements ought to go
  2778. sorter = uniques.argsort()
  2779. # reverse_indexer is where elements came from
  2780. reverse_indexer = np.empty(len(sorter), dtype=np.int64)
  2781. reverse_indexer.put(sorter, np.arange(len(sorter)))
  2782. mask = labels < 0
  2783. # move labels to right locations (ie, unsort ascending labels)
  2784. labels = com.take_nd(reverse_indexer, labels, allow_fill=False)
  2785. np.putmask(labels, mask, -1)
  2786. # sort observed ids
  2787. uniques = com.take_nd(uniques, sorter, allow_fill=False)
  2788. return uniques, labels
  2789. _func_table = {
  2790. builtins.sum: np.sum
  2791. }
  2792. _cython_table = {
  2793. builtins.sum: 'sum',
  2794. np.sum: 'sum',
  2795. np.mean: 'mean',
  2796. np.prod: 'prod',
  2797. np.std: 'std',
  2798. np.var: 'var',
  2799. np.median: 'median',
  2800. np.max: 'max',
  2801. np.min: 'min'
  2802. }
  2803. def _intercept_function(func):
  2804. return _func_table.get(func, func)
  2805. def _intercept_cython(func):
  2806. return _cython_table.get(func)
  2807. def _groupby_indices(values):
  2808. return _algos.groupby_indices(com._ensure_object(values))
  2809. def numpy_groupby(data, labels, axis=0):
  2810. s = np.argsort(labels)
  2811. keys, inv = np.unique(labels, return_inverse=True)
  2812. i = inv.take(s)
  2813. groups_at = np.where(i != np.concatenate(([-1], i[:-1])))[0]
  2814. ordered_data = data.take(s, axis=axis)
  2815. group_sums = np.add.reduceat(ordered_data, groups_at, axis=axis)
  2816. return group_sums
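# A quick sketch of this pure-numpy fallback:
#
# >>> numpy_groupby(np.array([1., 2., 3., 4.]), np.array([0, 1, 0, 1]))
# array([ 4.,  6.])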