
/pandas/core/groupby.py

http://github.com/pydata/pandas
Possible License(s): BSD-3-Clause, Apache-2.0


import types
from functools import wraps
import numpy as np
import datetime
import collections

from pandas.compat import (
    zip, builtins, range, long, lzip,
    OrderedDict, callable
)
from pandas import compat

from pandas.core.base import PandasObject
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
from pandas.core.panel import Panel
from pandas.util.decorators import cache_readonly, Appender
import pandas.core.algorithms as algos
import pandas.core.common as com
from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype)
from pandas import _np_version_under1p7
import pandas.lib as lib
from pandas.lib import Timestamp
import pandas.tslib as tslib
import pandas.algos as _algos
import pandas.hashtable as _hash

_agg_doc = """Aggregate using input function or dict of {column -> function}

Parameters
----------
arg : function or dict
    Function to use for aggregating groups. If a function, must either
    work when passed a DataFrame or when passed to DataFrame.apply. If
    passed a dict, the keys must be DataFrame column names.

Notes
-----
Numpy functions mean/median/prod/sum/std/var are special cased so the
default behavior is applying the function along axis=0
(e.g., np.mean(arr_2d, axis=0)) as opposed to
mimicking the default Numpy behavior (e.g., np.mean(arr_2d)).

Returns
-------
aggregated : DataFrame
"""
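
# Editor's sketch (not part of the original file): the function and dict
# forms of ``agg`` described in _agg_doc above, using only public pandas API.
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> df = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': [1.0, 2.0, 3.0]})
# >>> df.groupby('A').agg(np.sum)          # one function, applied per column
# >>> df.groupby('A').agg({'B': np.mean})  # dict of column -> function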

# special case to prevent duplicate plots when catching exceptions while
# forwarding methods from NDFrames
_plotting_methods = frozenset(['plot', 'boxplot', 'hist'])

_common_apply_whitelist = frozenset([
    'last', 'first',
    'head', 'tail', 'median',
    'mean', 'sum', 'min', 'max',
    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
    'resample',
    'describe',
    'rank', 'quantile', 'count',
    'fillna',
    'mad',
    'any', 'all',
    'irow', 'take',
    'idxmax', 'idxmin',
    'shift', 'tshift',
    'ffill', 'bfill',
    'pct_change', 'skew',
    'corr', 'cov', 'diff',
]) | _plotting_methods

_series_apply_whitelist = \
    (_common_apply_whitelist - set(['boxplot'])) | \
    frozenset(['dtype', 'value_counts', 'unique', 'nunique',
               'nlargest', 'nsmallest'])

_dataframe_apply_whitelist = \
    _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
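
# Editor's note (sketch, not in the original file): names in these
# whitelists are forwarded to each group via _make_wrapper below rather
# than computed on the GroupBy object itself, e.g.:
#
# >>> df.groupby('A').fillna(0)   # dispatches DataFrame.fillna per group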


class GroupByError(Exception):
    pass


class DataError(GroupByError):
    pass


class SpecificationError(GroupByError):
    pass


def _groupby_function(name, alias, npfunc, numeric_only=True,
                      _convert=False):
    def f(self):
        self._set_selection_from_grouper()
        try:
            return self._cython_agg_general(alias, numeric_only=numeric_only)
        except AssertionError as e:
            raise SpecificationError(str(e))
        except Exception:
            result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
            if _convert:
                result = result.convert_objects()
            return result

    f.__doc__ = "Compute %s of group values" % name
    f.__name__ = name

    return f


def _first_compat(x, axis=0):
    def _first(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[0]

    if isinstance(x, DataFrame):
        return x.apply(_first, axis=axis)
    else:
        return _first(x)


def _last_compat(x, axis=0):
    def _last(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[-1]

    if isinstance(x, DataFrame):
        return x.apply(_last, axis=axis)
    else:
        return _last(x)


def _count_compat(x, axis=0):
    return x.size
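
# Editor's sketch (not part of the original file): _first_compat/_last_compat
# drop missing values before picking an element, so `first` returns the first
# non-null value in each group, e.g.:
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> df = pd.DataFrame({'A': [1, 1], 'B': [np.nan, 4.0]})
# >>> df.groupby('A')['B'].first()   # -> 4.0; the NaN in row 0 is skipped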


class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction for a target
    object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    These are local specifications and will override 'global' settings, that
    is, the axis and level parameters which are passed to the groupby itself.

    Parameters
    ----------
    key : string, defaults to None
        groupby key, which selects the grouping column of the target
    level : name/number, defaults to None
        the level for the target index
    freq : string / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object
    axis : number/name of the axis, defaults to None
    sort : boolean, default to False
        whether to sort the resulting labels

    additional kwargs to control time-like groupers (when freq is passed)

    closed : closed end of interval; left or right
    label : interval boundary to use for labeling; left or right
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A')
    >>> df.groupby(Grouper(key='date', freq='60s')) : specify a resample on
        the column 'date'
    >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) :
        specify a resample on the level 'date' on the columns axis with a
        frequency of 60s
    """

    def __new__(cls, *args, **kwargs):
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper
        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=None, sort=False):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None
        self.grouper = None

    @property
    def ax(self):
        return self.grouper

    def _get_grouper(self, obj):
        """
        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj, sort=False):
        """
        given an object and the specifications, set up the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : the subject object
        """
        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    if isinstance(level, compat.string_types):
                        if obj.index.name != level:
                            raise ValueError('level name %s is not the name of the '
                                             'index' % level)
                    elif level > 0:
                        raise ValueError('level > 0 only valid with MultiIndex')
                    ax = Index(ax.get_level_values(level), name=level)

                else:
                    if not (level == 0 or level == ax.name):
                        raise ValueError("The grouper level {0} is not valid".format(level))

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            indexer = self.indexer = ax.argsort(kind='quicksort')
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    def _get_binner_for_grouping(self, obj):
        raise NotImplementedError

    @property
    def groups(self):
        return self.grouper.groups
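
# Editor's sketch (not part of the original file): with `freq` given,
# __new__ above swaps in a TimeGrouper, so time-based binning inside a
# groupby works like:
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({'date': pd.date_range('2014-01-01', periods=4,
# ...                                          freq='30s'),
# ...                    'x': [1, 2, 3, 4]})
# >>> df.groupby(pd.Grouper(key='date', freq='60s')).sum()  # two 60s bins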


class GroupBy(PandasObject):
    """
    Class for grouping and aggregating relational data. See aggregate,
    transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : string
        Most users should ignore this

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups
    """
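
    # Editor's sketch (not part of the original file): the iteration and
    # dispatch patterns described in the docstring above, concretely:
    #
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': [1.0, 2.0, 3.0]})
    # >>> for key, group in df.groupby('A'):
    # ...     print(key, len(group))   # 'x' group of 2, then 'y' group of 1
    # >>> df.groupby('A').std()        # dispatches DataFrame.std per group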
    _apply_whitelist = _common_apply_whitelist
    _internal_names = ['_cache']
    _internal_names_set = set(_internal_names)
    _group_selection = None

    def __init__(self, obj, keys=None, axis=0, level=None,
                 grouper=None, exclusions=None, selection=None, as_index=True,
                 sort=True, group_keys=True, squeeze=False):
        self._selection = selection

        if isinstance(obj, NDFrame):
            obj._consolidate_inplace()

        self.level = level

        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError('as_index=False only valid with DataFrame')
            if axis != 0:
                raise ValueError('as_index=False only valid for axis=0')

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze

        if grouper is None:
            grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
                                                    level=level, sort=sort)

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

    def __len__(self):
        return len(self.indices)

    def __unicode__(self):
        # TODO: Better unicode/repr for GroupBy object
        return object.__repr__(self)

    @property
    def groups(self):
        """ dict {group name -> group labels} """
        return self.grouper.groups

    @property
    def ngroups(self):
        return self.grouper.ngroups

    @property
    def indices(self):
        """ dict {group name -> group indices} """
        return self.grouper.indices

    def _get_index(self, name):
        """ safe get index, translate keys for datelike to underlying repr """

        def convert(key, s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, (Timestamp, datetime.datetime)):
                return Timestamp(key)
            elif isinstance(s, np.datetime64):
                return Timestamp(key).asm8
            return key

        sample = next(iter(self.indices))
        if isinstance(sample, tuple):
            if not isinstance(name, tuple):
                raise ValueError("must supply a tuple to get_group with multiple grouping keys")
            if not len(name) == len(sample):
                raise ValueError("must supply a same-length tuple to get_group with multiple grouping keys")

            name = tuple([convert(n, k) for n, k in zip(name, sample)])

        else:
            name = convert(name, sample)

        return self.indices[name]

    @property
    def name(self):
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        if not isinstance(self._selection, (list, tuple, Series, np.ndarray)):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):

        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    def _set_selection_from_grouper(self):
        """ we may need to create a selection if we have non-level groupers """
        grp = self.grouper
        if self.as_index and getattr(grp, 'groupings', None) is not None \
                and self.obj.ndim > 1:
            ax = self.obj._info_axis
            groupers = [g.name for g in grp.groupings
                        if g.level is None and g.name is not None
                        and g.name in ax]
            if len(groupers):
                self._group_selection = (ax - Index(groupers)).tolist()

    def _local_dir(self):
        return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))

    def __getattr__(self, attr):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        if hasattr(self.obj, attr):
            return self._make_wrapper(attr)

        raise AttributeError("%r object has no attribute %r" %
                             (type(self).__name__, attr))

    def __getitem__(self, key):
        raise NotImplementedError('Not implemented: %s' % key)

    def _make_wrapper(self, name):
        if name not in self._apply_whitelist:
            is_callable = callable(getattr(self._selected_obj, name, None))
            kind = ' callable ' if is_callable else ' '
            msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
                   "using the 'apply' method".format(kind, name,
                                                     type(self).__name__))
            raise AttributeError(msg)

        # need to set up the selection
        # as it is not passed directly but via the grouper
        self._set_selection_from_grouper()

        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            return self.apply(lambda self: getattr(self, name))

        f = getattr(type(self._selected_obj), name)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            kwargs_with_axis = kwargs.copy()
            if 'axis' not in kwargs_with_axis:
                kwargs_with_axis['axis'] = self.axis

            def curried_with_axis(x):
                return f(x, *args, **kwargs_with_axis)

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = curried_with_axis.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in _plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried_with_axis)
            except Exception:
                try:
                    return self.apply(curried)
                except Exception:

                    # related to : GH3688
                    # try item-by-item
                    # this can be called recursively, so need to raise
                    # ValueError if we don't have this method, to indicate
                    # to aggregate to mark this column as an error
                    try:
                        return self._aggregate_item_by_item(name, *args, **kwargs)
                    except (AttributeError):
                        raise ValueError

        return wrapper

    def get_group(self, name, obj=None):
        """
        Constructs NDFrame from group with provided name

        Parameters
        ----------
        name : object
            the name of the group to get as a DataFrame
        obj : NDFrame, default None
            the NDFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used

        Returns
        -------
        group : type of obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        return obj.take(inds, axis=self.axis, convert=False)
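
    # Editor's sketch (not part of the original file): fetching one group by
    # key; _get_index above also translates datelike keys, so a string like
    # '2014-01-01' can be used for a Timestamp-keyed group.
    #
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({'A': [1, 1, 5], 'B': [2, 4, 6]})
    # >>> df.groupby('A').get_group(1)
    #    A  B
    # 0  1  2
    # 1  1  4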

    def __iter__(self):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        return self.grouper.get_iterator(self.obj, axis=self.axis)

    def apply(self, func, *args, **kwargs):
        """
        Apply function and combine results together in an intelligent way. The
        split-apply-combine combination rules attempt to be as common sense
        based as possible. For example:

        case 1:
        group DataFrame
        apply aggregation function (f(chunk) -> Series)
        yield DataFrame, with group axis having group labels

        case 2:
        group DataFrame
        apply transform function (f(chunk) -> DataFrame with same indexes)
        yield DataFrame with resulting chunks glued together

        case 3:
        group Series
        apply function with f(chunk) -> DataFrame
        yield DataFrame with result of chunks glued together

        Parameters
        ----------
        func : function

        Notes
        -----
        See online documentation for full exposition on how to use apply.

        In the current implementation apply calls func twice on the
        first group to decide whether it can take a fast or slow code
        path. This can lead to unexpected behavior if func has
        side-effects, as they will take effect twice for the first
        group.

        See also
        --------
        aggregate, transform

        Returns
        -------
        applied : type depending on grouped object and function
        """
        func = _intercept_function(func)

        @wraps(func)
        def f(g):
            return func(g, *args, **kwargs)

        return self._python_apply_general(f)
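
    # Editor's sketch (not part of the original file): two of the `apply`
    # cases from the docstring above, on a toy frame:
    #
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': [1.0, 2.0, 3.0]})
    # >>> g = df.groupby('A')
    # >>> g.apply(lambda d: d['B'].sum())       # aggregation -> one row/group
    # >>> g['B'].apply(lambda s: s - s.mean())  # transform -> original index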

    def _python_apply_general(self, f):
        keys, values, mutated = self.grouper.apply(f, self._selected_obj,
                                                   self.axis)

        return self._wrap_applied_output(keys, values,
                                         not_indexed_same=mutated)

    def aggregate(self, func, *args, **kwargs):
        raise NotImplementedError

    @Appender(_agg_doc)
    def agg(self, func, *args, **kwargs):
        return self.aggregate(func, *args, **kwargs)

    def _iterate_slices(self):
        yield self.name, self._selected_obj

    def transform(self, func, *args, **kwargs):
        raise NotImplementedError

    def mean(self):
        """
        Compute mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('mean')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            self._set_selection_from_grouper()
            f = lambda x: x.mean(axis=self.axis)
            return self._python_agg_general(f)

    def median(self):
        """
        Compute median of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('median')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            self._set_selection_from_grouper()

            def f(x):
                if isinstance(x, np.ndarray):
                    x = Series(x)
                return x.median(axis=self.axis)
            return self._python_agg_general(f)

    def std(self, ddof=1):
        """
        Compute standard deviation of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        # todo, implement at cython level?
        return np.sqrt(self.var(ddof=ddof))

    def var(self, ddof=1):
        """
        Compute variance of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        if ddof == 1:
            return self._cython_agg_general('var')
        else:
            self._set_selection_from_grouper()
            f = lambda x: x.var(ddof=ddof)
            return self._python_agg_general(f)

    def sem(self, ddof=1):
        """
        Compute standard error of the mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self.std(ddof=ddof) / np.sqrt(self.count())
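
    # Editor's worked example (not part of the original file): for a group
    # with values [1.0, 3.0], std(ddof=1) = sqrt(2) and count = 2, so
    # sem = sqrt(2) / sqrt(2) = 1.0, matching std/sqrt(n) above:
    #
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({'A': ['x', 'x'], 'B': [1.0, 3.0]})
    # >>> df.groupby('A')['B'].sem()   # -> 1.0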

    def size(self):
        """
        Compute group sizes

        """
        return self.grouper.size()

    sum = _groupby_function('sum', 'add', np.sum)
    prod = _groupby_function('prod', 'prod', np.prod)
    min = _groupby_function('min', 'min', np.min, numeric_only=False)
    max = _groupby_function('max', 'max', np.max, numeric_only=False)
    first = _groupby_function('first', 'first', _first_compat,
                              numeric_only=False, _convert=True)
    last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                             _convert=True)
    _count = _groupby_function('_count', 'count', _count_compat,
                               numeric_only=False)

    def count(self, axis=0):
        return self._count().astype('int64')

    def ohlc(self):
        """
        Compute open, high, low and close values of a group, excluding
        missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))

    def nth(self, n, dropna=None):
        """
        Take the nth row from each group.

        If dropna, will take the nth non-null row, dropna is either
        Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is
        equivalent to calling dropna(how=dropna) before the groupby.

        Examples
        --------
        >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
           A   B
        0  1 NaN
        2  5   6
        >>> g.nth(1)
           A  B
        1  1  4
        >>> g.nth(-1)
           A  B
        1  1  4
        2  5  6
        >>> g.nth(0, dropna='any')
           B
        A
        1  4
        5  6
        >>> g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
            B
        A
        1 NaN
        5 NaN
        """
        self._set_selection_from_grouper()

        if not dropna:  # good choice
            m = self.grouper._max_groupsize
            if n >= m or n < -m:
                return self._selected_obj.loc[[]]

            rng = np.zeros(m, dtype=bool)
            if n >= 0:
                rng[n] = True
                is_nth = self._cumcount_array(rng)
            else:
                rng[- n - 1] = True
                is_nth = self._cumcount_array(rng, ascending=False)

            result = self._selected_obj[is_nth]

            # the result index
            if self.as_index:
                ax = self.obj._info_axis
                names = self.grouper.names
                if self.obj.ndim == 1:
                    # this is a pass-thru
                    pass
                elif all([n in ax for n in names]):
                    result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
                elif self._group_selection is not None:
                    result.index = self.obj._get_axis(self.axis)[is_nth]

                result = result.sort_index()

            return result

        if (isinstance(self._selected_obj, DataFrame)
                and dropna not in ['any', 'all']):
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError("For a DataFrame groupby, dropna must be "
                             "either None, 'any' or 'all', "
                             "(was passed %s)." % (dropna),)

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        max_len = n if n >= 0 else - 1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:
            # we don't have the grouper info available (e.g. we have selected
            # out a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]
            keys = self.grouper.names
        else:
            # create a grouper with the original parameters, but on the
            # dropped object
            grouper, _, _ = _get_grouper(dropped, key=self.keys, axis=self.axis,
                                         level=self.level, sort=self.sort)

        sizes = dropped.groupby(grouper).size()
        result = dropped.groupby(grouper).nth(n)

        mask = (sizes < max_len).values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or \
                len(result) == len(self.grouper.result_index):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result

    def cumcount(self, **kwargs):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Example
        -------

        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """
        self._set_selection_from_grouper()

        ascending = kwargs.pop('ascending', True)

        index = self._selected_obj.index
        cumcounts = self._cumcount_array(ascending=ascending)
        return Series(cumcounts, index)

    def head(self, n=5):
        """
        Returns first n rows of each group.

        Essentially equivalent to ``.apply(lambda x: x.head(n))``,
        except ignores as_index flag.

        Example
        -------

        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        """
        obj = self._selected_obj
        in_head = self._cumcount_array() < n
        head = obj[in_head]
        return head

    def tail(self, n=5):
        """
        Returns last n rows of each group

        Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
        except ignores as_index flag.

        Example
        -------

        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).tail(1)
           A  B
        1  1  4
        2  5  6
        >>> df.groupby('A').tail(1)
           A  B
        1  1  4
        2  5  6
        """
        obj = self._selected_obj
        rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
        in_tail = self._cumcount_array(rng, ascending=False) > -n
        tail = obj[in_tail]
        return tail

    def _cumcount_array(self, arr=None, **kwargs):
        """
        arr is where cumcount gets its values from

        note: this is currently implementing sort=False (though the default is
        sort=True) for groupby in general
        """
        ascending = kwargs.pop('ascending', True)

        if arr is None:
            arr = np.arange(self.grouper._max_groupsize, dtype='int64')

        len_index = len(self._selected_obj.index)
        cumcounts = np.zeros(len_index, dtype=arr.dtype)
        if not len_index:
            return cumcounts

        indices, values = [], []
        for v in self.indices.values():
            indices.append(v)

            if ascending:
                values.append(arr[:len(v)])
            else:
                values.append(arr[len(v) - 1::-1])

        indices = np.concatenate(indices)
        values = np.concatenate(values)
        cumcounts[indices] = values

        return cumcounts
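
    # Editor's sketch (not part of the original file): the scatter pattern
    # used by _cumcount_array above, in plain numpy. For groups at row
    # positions {'a': [0, 2], 'b': [1]}:
    #
    # >>> import numpy as np
    # >>> cumcounts = np.zeros(3, dtype='int64')
    # >>> cumcounts[np.concatenate([[0, 2], [1]])] = \
    # ...     np.concatenate([np.arange(2), np.arange(1)])
    # >>> cumcounts   # array([0, 0, 1]): a per-group counter, in row order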

    def _index_with_as_index(self, b):
        """
        Take boolean mask of index to be returned from apply, if as_index=True

        """
        # TODO perf, it feels like this should already be somewhere...
        from itertools import chain
        original = self._selected_obj.index
        gp = self.grouper
        levels = chain((gp.levels[i][gp.labels[i][b]]
                        for i in range(len(gp.groupings))),
                       (original.get_level_values(i)[b]
                        for i in range(original.nlevels)))
        new = MultiIndex.from_arrays(list(levels))
        new.names = gp.names + original.names
        return new

    def _try_cast(self, result, obj):
        """
        try to cast the result to our obj original type,
        we may have roundtripped through object in the meantime

        """
        if obj.ndim > 1:
            dtype = obj.values.dtype
        else:
            dtype = obj.dtype

        if not np.isscalar(result):
            result = _possibly_downcast_to_dtype(result, dtype)

        return result

    def _cython_agg_general(self, how, numeric_only=True):
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.aggregate(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_aggregated_output(output, names)

    def _python_agg_general(self, func, *args, **kwargs):
        func = _intercept_function(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output = {}
        for name, obj in self._iterate_slices():
            try:
                result, counts = self.grouper.agg_series(obj, f)
                output[name] = self._try_cast(result, obj)
            except TypeError:
                continue

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:

            mask = counts.ravel() > 0
            for name, result in compat.iteritems(output):

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = com.ensure_float(values)

                output[name] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)

    def _wrap_applied_output(self, *args, **kwargs):
        raise NotImplementedError

    def _concat_objects(self, keys, values, not_indexed_same=False):
        from pandas.tools.merge import concat

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:
                result = result.reindex_axis(ax, axis=self.axis)
        elif self.group_keys:
            if self.as_index:
                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names
                result = concat(values, axis=self.axis, keys=group_keys,
                                levels=group_levels, names=group_names)
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            result = concat(values, axis=self.axis)

        return result

    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = []
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered


@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
    if isinstance(obj, Series):
        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        klass = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError('invalid type: %s' % type(obj))

    return klass(obj, by, **kwds)
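
# Editor's note (sketch, not in the original file): this module-level
# factory is what ``obj.groupby(...)`` routes through, so the two spellings
# are equivalent:
#
# >>> from pandas.core.groupby import groupby
# >>> groupby(df, 'A').sum()   # same result as df.groupby('A').sum()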


def _get_axes(group):
    if isinstance(group, Series):
        return [group.index]
    else:
        return group.axes


def _is_indexed_like(obj, axes):
    if isinstance(obj, Series):
        if len(axes) > 1:
            return False
        return obj.index.equals(axes[0])
    elif isinstance(obj, DataFrame):
        return obj.index.equals(axes[0])

    return False


class BaseGrouper(object):
    """
    This is an internal Grouper class, which actually holds the generated
    groups
    """

    def __init__(self, axis, groupings, sort=True, group_keys=True):
        self.axis = axis
        self.groupings = groupings
        self.sort = sort
        self.group_keys = group_keys
        self.compressed = True

    @property
    def shape(self):
        return tuple(ping.ngroups for ping in self.groupings)

    def __iter__(self):
        return iter(self.indices)

    @property
    def nkeys(self):
        return len(self.groupings)

    def get_iterator(self, data, axis=0):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        splitter = self._get_splitter(data, axis=axis)
        keys = self._get_group_keys()
        for key, (i, group) in zip(keys, splitter):
            yield key, group

    def _get_splitter(self, data, axis=0):
        comp_ids, _, ngroups = self.group_info
        return get_splitter(data, comp_ids, ngroups, axis=axis)

    def _get_group_keys(self):
        if len(self.groupings) == 1:
            return self.levels[0]
        else:
            comp_ids, _, ngroups = self.group_info
            # provide "flattened" iterator for multi-group setting
            mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
            return [mapper.get_key(i) for i in range(ngroups)]

    def apply(self, f, data, axis=0):
        mutated = False
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()

        # oh boy
        if (f.__name__ not in _plotting_methods and
                hasattr(splitter, 'fast_apply') and axis == 0):
            try:
                values, mutated = splitter.fast_apply(f, group_keys)
                return group_keys, values, mutated
            except (lib.InvalidApply):
                # we detect a mutation of some kind
                # so take slow path
                pass
            except (Exception) as e:
                # raise this error to the caller
                pass

        result_values = []
        for key, (i, group) in zip(group_keys, splitter):
            object.__setattr__(group, 'name', key)

            # group might be modified
            group_axes = _get_axes(group)
            res = f(group)
            if not _is_indexed_like(res, group_axes):
                mutated = True

            result_values.append(res)

        return group_keys, result_values, mutated

    @cache_readonly
    def indices(self):
        """ dict {group name -> group indices} """
        if len(self.groupings) == 1:
            return self.groupings[0].indices
        else:
            label_list = [ping.labels for ping in self.groupings]
            keys = [ping.group_index for ping in self.groupings]
            return _get_indices_dict(label_list, keys)

    @property
    def labels(self):
        return [ping.labels for ping in self.groupings]

    @property
    def levels(self):
        return [ping.group_index for ping in self.groupings]

    @property
    def names(self):
        return [ping.name for ping in self.groupings]

    def size(self):
        """
        Compute group sizes

        """
        # TODO: better impl
        labels, _, ngroups = self.group_info
        bin_counts = algos.value_counts(labels, sort=False)
        bin_counts = bin_counts.reindex(np.arange(ngroups))
        bin_counts.index = self.result_index
        return bin_counts

    @cache_readonly
    def _max_groupsize(self):
        '''
        Compute size of largest group

        '''
        # For many items in each group this is much faster than
        # self.size().max(), in worst case marginally slower
        if self.indices:
            return max(len(v) for v in self.indices.values())
        else:
            return 0

    @cache_readonly
    def groups(self):
        """ dict {group name -> group labels} """
        if len(self.groupings) == 1:
            return self.groupings[0].groups
        else:
            to_groupby = lzip(*(ping.grouper for ping in self.groupings))
            to_groupby = Index(to_groupby)
            return self.axis.groupby(to_groupby.values)

    @cache_readonly
    def group_info(self):
        comp_ids, obs_group_ids = self._get_compressed_labels()

        ngroups = len(obs_group_ids)
        comp_ids = com._ensure_int64(comp_ids)
        return comp_ids, obs_group_ids, ngroups

    def _get_compressed_labels(self):
        all_labels = [ping.labels for ping in self.groupings]
        if self._overflow_possible:
            tups = lib.fast_zip(all_labels)
            labs, uniques = algos.factorize(tups)

            if self.sort:
                uniques, labs = _reorder_by_uniques(uniques, labs)

            return labs, uniques
        else:
            if len(all_labels) > 1:
                group_index = get_group_index(all_labels, self.shape)
                comp_ids, obs_group_ids = _compress_group_index(group_index)
            else:
                ping = self.groupings[0]
                comp_ids = ping.labels
                obs_group_ids = np.arange(len(ping.group_index))
                self.compressed = False
                self._filter_empty_groups = False

            return comp_ids, obs_group_ids

    @cache_readonly
    def _overflow_possible(self):
        return _int64_overflow_possible(self.shape)

    @cache_readonly
    def ngroups(self):
        return len(self.result_index)

    @cache_readonly
    def result_index(self):
        recons = self.get_group_levels()
        return MultiIndex.from_arrays(recons, names=self.names)

    def get_group_levels(self):
        obs_ids = self.group_info[1]

        if not self.compressed and len(self.groupings) == 1:
            return [self.groupings[0].group_index]

        if self._overflow_possible:
            recons_labels = [np.array(x) for x in zip(*obs_ids)]
        else:
            recons_labels = decons_group_index(obs_ids, self.shape)

        name_list = []
        for ping, labels in zip(self.groupings, recons_labels):
            labels = com._ensure_platform_int(labels)
            name_list.append(ping.group_index.take(labels))

        return name_list

    #------------------------------------------------------------
    # Aggregation functions

    _cython_functions = {
        'add': 'group_add',
        'prod': 'group_prod',
        'min': 'group_min',
        'max': 'group_max',
        'mean': 'group_mean',
        'median': {
            'name': 'group_median'
        },
        'var': 'group_var',
        'first': {
            'name': 'group_nth',
            'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
        },
        'last': 'group_last',
        'count': 'group_count',
    }

    _cython_arity = {
        'ohlc': 4,  # OHLC
    }

    _name_functions = {}

    _filter_empty_groups = True

    def _get_aggregate_function(self, how, values):

        dtype_str = values.dtype.name

        def get_func(fname):
            # find the function, or use the object function, or return a
            # generic (note: the lookup below used dtype_str for both
            # iterations in the original, defeating the 'object' fallback;
            # fixed to use the loop variable dt)
            for dt in [dtype_str, 'object']:
                f = getattr(_algos, "%s_%s" % (fname, dt), None)
                if f is not None:
                    return f
            return getattr(_algos, fname, None)

        ftype = self._cython_functions[how]

        if isinstance(ftype, dict):
            func = afunc = get_func(ftype['name'])

            # a sub-function
            f = ftype.get('f')
            if f is not None:

                def wrapper(*args, **kwargs):
                    return f(afunc, *args, **kwargs)

                # need to curry our sub-function
                func = wrapper

        else:
            func = get_func(ftype)

        if func is None:
            raise NotImplementedError("function is not implemented for this "
                                      "dtype: [how->%s,dtype->%s]" %
                                      (how, dtype_str))
        return func, dtype_str
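
    # Editor's sketch (not part of the original file): get_func above
    # resolves dtype-specialized Cython kernels by name, falling back to a
    # generic version. For how='add' on float64 values it tries, in order:
    #
    #   _algos.group_add_float64   # dtype-specific kernel
    #   _algos.group_add_object    # object fallback
    #   _algos.group_add           # generic, if neither exists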

    def aggregate(self, values, how, axis=0):
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError
            out_shape = (self.ngroups,) + values.shape[1:]

        if is_numeric_dtype(values.dtype):
            values = com.ensure_float(values)
            is_numeric = True
            out_dtype = 'f%d' % values.dtype.itemsize
        else:
            is_numeric = issubclass(values.dtype.type, (np.datetime64,
                                                        np.timedelta64))
            if is_numeric:
                out_dtype = 'float64'
                values = values.view('int64')
            else:
                out_dtype = 'object'
                values = values.astype(object)

        # will be filled in Cython function
        result = np.empty(out_shape, dtype=out_dtype)
        result.fill(np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)

        result = self._aggregate(result, counts, values, how, is_numeric)

        if self._filter_empty_groups:
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        result, (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names

    def _aggregate(self, result, counts, values, how, is_numeric):
        agg_func, dtype = self._get_aggregate_function(how, values)

        comp_ids, _, ngroups = self.group_info
        if values.ndim > 3:
            # punting for now
            raise NotImplementedError
        elif values.ndim > 2:
            for i, chunk in enumerate(values.transpose(2, 0, 1)):

                chunk = chunk.squeeze()
                agg_func(result[:, :, i], counts, chunk, comp_ids)
        else:
            agg_func(result, counts, values, comp_ids)

        return result

    def agg_series(self, obj, func):
        try:
            return self._aggregate_series_fast(obj, func)
        except Exception:
            return self._aggregate_series_pure_python(obj, func)

    def _aggregate_series_fast(self, obj, func):
        func = _intercept_function(func)

        if obj.index._has_complex_internals:
            raise TypeError('Incompatible index for Cython grouper')

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0)).to_dense()
        indexer = _algos.groupsort_indexer(group_index, ngroups)[0]
        obj = obj.take(indexer, convert=False)
        group_index = com.take_nd(group_index, indexer, allow_fill=False)
        grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
                                    dummy)
        result, counts = grouper.get_result()
        return result, counts

    def _aggregate_series_pure_python(self, obj, func):

        group_index, _, ngroups = self.group_info

        counts = np.zeros(ngroups, dtype=int)
        result = None

        splitter = get_splitter(obj, group_index, ngroups, axis=self.axis)

        for label, group in splitter:
            res = func(group)
            if result is None:
                if (isinstance(res, (Series, np.ndarray)) or
                        isinstance(res, list)):
                    raise ValueError('Function does not reduce')
                result = np.empty(ngroups, dtype='O')

            counts[label] = group.shape[0]
            result[label] = res

        result = lib.maybe_convert_objects(result, try_float=0)
        return result, counts


def generate_bins_generic(values, binner, closed):
    """
    Generate bin edge offsets and bin labels for one array using another array
    which has bin edge values. Both arrays must be sorted.

    Parameters
    ----------
    values : array of values
    binner : a comparable array of values representing bins into which to bin
        the first array. Note, 'values' end-points must fall within 'binner'
        end-points.
    closed : which end of bin is closed; left (default), right

    Returns
    -------
    bins : array of offsets (into 'values' argument) of bins.
        Zero and last edge are excluded in result, so for instance the first
        bin is values[0:bin[0]] and the last is values[bin[-1]:]
    """
    lenidx = len(values)
    lenbin = len(binner)

    if lenidx <= 0 or lenbin <= 0:
        raise ValueError("Invalid length for values or for binner")

    # check binner fits data
    if values[0] < binner[0]:
        raise ValueError("Values falls before first bin")

    if values[lenidx - 1] > binner[lenbin - 1]:
        raise ValueError("Values falls after last bin")

    bins = np.empty(lenbin - 1, dtype=np.int64)

    j = 0  # index into values
    bc = 0  # bin count

    # linear scan, presume nothing about values/binner except that it fits ok
    for i in range(0, lenbin - 1):
        r_bin = binner[i + 1]

        # count

Large files are truncated; this listing ends here, mid-function, at line 1267 of the 3566-line file. See the repository for the full source.