
/pandas/core/groupby.py

https://github.com/ajcr/pandas
Python | 3509 lines | 3505 code | 4 blank | 0 comment | 9 complexity | MD5: 3a4baa974c53d4b762d06020f453f60c
Possible License(s): BSD-3-Clause, Apache-2.0

Note: this large file is truncated.

import types
from functools import wraps
import numpy as np
import datetime
import collections

from pandas.compat import (
    zip, builtins, range, long, lzip,
    OrderedDict, callable
)
from pandas import compat

from pandas.core.base import PandasObject
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
from pandas.core.panel import Panel
from pandas.util.decorators import cache_readonly, Appender
import pandas.core.algorithms as algos
import pandas.core.common as com
from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype)

from pandas import _np_version_under1p7
import pandas.lib as lib
from pandas.lib import Timestamp
import pandas.tslib as tslib
import pandas.algos as _algos
import pandas.hashtable as _hash
_agg_doc = """Aggregate using input function or dict of {column -> function}

Parameters
----------
arg : function or dict
    Function to use for aggregating groups. If a function, must either
    work when passed a DataFrame or when passed to DataFrame.apply. If
    passed a dict, the keys must be DataFrame column names.

Notes
-----
Numpy functions mean/median/prod/sum/std/var are special cased so the
default behavior is applying the function along axis=0
(e.g., np.mean(arr_2d, axis=0)) as opposed to
mimicking the default Numpy behavior (e.g., np.mean(arr_2d)).

Returns
-------
aggregated : DataFrame
"""

# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
_plotting_methods = frozenset(['plot', 'boxplot', 'hist'])

_common_apply_whitelist = frozenset([
    'last', 'first',
    'head', 'tail', 'median',
    'mean', 'sum', 'min', 'max',
    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
    'resample',
    'describe',
    'rank', 'quantile', 'count',
    'fillna',
    'mad',
    'any', 'all',
    'irow', 'take',
    'idxmax', 'idxmin',
    'shift', 'tshift',
    'ffill', 'bfill',
    'pct_change', 'skew',
    'corr', 'cov', 'diff',
]) | _plotting_methods

_series_apply_whitelist = \
    (_common_apply_whitelist - set(['boxplot'])) | \
    frozenset(['dtype', 'value_counts', 'unique', 'nunique',
               'nlargest', 'nsmallest'])

_dataframe_apply_whitelist = \
    _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])


class GroupByError(Exception):
    pass


class DataError(GroupByError):
    pass


class SpecificationError(GroupByError):
    pass

def _groupby_function(name, alias, npfunc, numeric_only=True,
                      _convert=False):
    def f(self):
        self._set_selection_from_grouper()
        try:
            return self._cython_agg_general(alias, numeric_only=numeric_only)
        except AssertionError as e:
            raise SpecificationError(str(e))
        except Exception:
            result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
            if _convert:
                result = result.convert_objects()
            return result

    f.__doc__ = "Compute %s of group values" % name
    f.__name__ = name

    return f

def _first_compat(x, axis=0):
    def _first(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[0]

    if isinstance(x, DataFrame):
        return x.apply(_first, axis=axis)
    else:
        return _first(x)


def _last_compat(x, axis=0):
    def _last(x):
        x = np.asarray(x)
        x = x[notnull(x)]
        if len(x) == 0:
            return np.nan
        return x[-1]

    if isinstance(x, DataFrame):
        return x.apply(_last, axis=axis)
    else:
        return _last(x)


def _count_compat(x, axis=0):
    return x.size
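
# Illustrative sketch (not part of the original module): the *_compat helpers
# above return the first/last non-null value of a 1-d input, or NaN when the
# input is entirely null. Wrapped in a function so importing the module has
# no side effects; the demo name is hypothetical.
def _demo_first_last_compat():
    arr = np.array([np.nan, 2.0, 3.0])
    assert _first_compat(arr) == 2.0                     # NaN is skipped
    assert _last_compat(arr) == 3.0
    assert np.isnan(_first_compat(np.array([np.nan])))   # all-null -> NaN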

class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction for a target
    object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the
    target object. These are local specifications and will override 'global'
    settings, that is the parameters axis and level which are passed to the
    groupby itself.

    Parameters
    ----------
    key : string, defaults to None
        groupby key, which selects the grouping column of the target
    level : name/number, defaults to None
        the level for the target index
    freq : string / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object
    axis : number/name of the axis, defaults to None
    sort : boolean, default to False
        whether to sort the resulting labels

    additional kwargs to control time-like groupers (when freq is passed)

    closed : closed end of interval; left or right
    label : interval boundary to use for labeling; left or right
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A')
    >>> df.groupby(Grouper(key='date', freq='60s')) : specify a resample on
        the column 'date'
    >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) :
        specify a resample on the level 'date' on the columns axis with a
        frequency of 60s
    """

    def __new__(cls, *args, **kwargs):
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper

        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=None, sort=False):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        self.grouper = None
        self.obj = None
        self.indexer = None
        self.binner = None

    @property
    def ax(self):
        return self.grouper

    def _get_grouper(self, obj):
        """
        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        return self.binner, self.grouper, self.obj

    def _set_grouper(self, obj, sort=False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : the subject object
        """
        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):

                    if isinstance(level, compat.string_types):
                        if obj.index.name != level:
                            raise ValueError('level name %s is not the name of the '
                                             'index' % level)
                    elif level > 0:
                        raise ValueError('level > 0 only valid with MultiIndex')

                    ax = Index(ax.get_level_values(level), name=level)

                else:
                    if not (level == 0 or level == ax.name):
                        raise ValueError("The grouper level {0} is not valid".format(level))

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            indexer = self.indexer = ax.argsort(kind='quicksort')
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)

        self.obj = obj
        self.grouper = ax
        return self.grouper

    def _get_binner_for_grouping(self, obj):
        raise NotImplementedError

    @property
    def groups(self):
        return self.grouper.groups
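
# Illustrative usage sketch (not part of the original module): when ``freq``
# is passed, ``Grouper.__new__`` swaps in ``TimeGrouper``, so the groupby
# behaves like a resample keyed on a datetime column. The frame here is
# hypothetical; wrapped in a function so importing has no side effects.
def _demo_time_grouper():
    import pandas as pd
    df = pd.DataFrame({'date': pd.date_range('2014-01-01', periods=4, freq='30s'),
                       'value': [1, 2, 3, 4]})
    # 60s bins keyed on the 'date' column: sums values per one-minute interval
    return df.groupby(pd.Grouper(key='date', freq='60s')).sum()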

class GroupBy(PandasObject):
    """
    Class for grouping and aggregating relational data. See aggregate,
    transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : string
        Most users should ignore this

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group.
    So you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups
    """
    _apply_whitelist = _common_apply_whitelist
    _internal_names = ['_cache']
    _internal_names_set = set(_internal_names)
    _group_selection = None

    def __init__(self, obj, keys=None, axis=0, level=None,
                 grouper=None, exclusions=None, selection=None, as_index=True,
                 sort=True, group_keys=True, squeeze=False):
        self._selection = selection

        if isinstance(obj, NDFrame):
            obj._consolidate_inplace()

        self.level = level

        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError('as_index=False only valid with DataFrame')
            if axis != 0:
                raise ValueError('as_index=False only valid for axis=0')

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze

        if grouper is None:
            grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
                                                    level=level, sort=sort)

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

    def __len__(self):
        return len(self.indices)

    def __unicode__(self):
        # TODO: Better unicode/repr for GroupBy object
        return object.__repr__(self)
    @property
    def groups(self):
        """ dict {group name -> group labels} """
        return self.grouper.groups

    @property
    def ngroups(self):
        return self.grouper.ngroups

    @property
    def indices(self):
        """ dict {group name -> group indices} """
        return self.grouper.indices

    def _get_index(self, name):
        """ safe get index, translate keys for datelike to underlying repr """

        def convert(key, s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, (Timestamp, datetime.datetime)):
                return Timestamp(key)
            elif isinstance(s, np.datetime64):
                return Timestamp(key).asm8
            return key

        sample = next(iter(self.indices))
        if isinstance(sample, tuple):
            if not isinstance(name, tuple):
                raise ValueError("must supply a tuple to get_group with multiple grouping keys")
            if not len(name) == len(sample):
                raise ValueError("must supply a same-length tuple to get_group with multiple grouping keys")

            name = tuple([convert(n, k) for n, k in zip(name, sample)])

        else:
            name = convert(name, sample)

        return self.indices[name]
    @property
    def name(self):
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        if not isinstance(self._selection, (list, tuple, Series, np.ndarray)):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    def _set_selection_from_grouper(self):
        """ we may need to create a selection if we have non-level groupers """
        grp = self.grouper
        if self.as_index and getattr(grp, 'groupings', None) is not None:
            ax = self.obj._info_axis
            groupers = [g.name for g in grp.groupings
                        if g.level is None and g.name is not None
                        and g.name in ax]
            if len(groupers):
                self._group_selection = (ax - Index(groupers)).tolist()

    def _local_dir(self):
        return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))
    def __getattr__(self, attr):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        if hasattr(self.obj, attr):
            return self._make_wrapper(attr)

        raise AttributeError("%r object has no attribute %r" %
                             (type(self).__name__, attr))

    def __getitem__(self, key):
        raise NotImplementedError('Not implemented: %s' % key)

    def _make_wrapper(self, name):
        if name not in self._apply_whitelist:
            is_callable = callable(getattr(self._selected_obj, name, None))
            kind = ' callable ' if is_callable else ' '
            msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
                   "using the 'apply' method".format(kind, name,
                                                     type(self).__name__))
            raise AttributeError(msg)

        # need to setup the selection,
        # as the attributes are not passed directly but in the grouper
        self._set_selection_from_grouper()

        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            return self.apply(lambda self: getattr(self, name))

        f = getattr(type(self._selected_obj), name)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            kwargs_with_axis = kwargs.copy()
            if 'axis' not in kwargs_with_axis:
                kwargs_with_axis['axis'] = self.axis

            def curried_with_axis(x):
                return f(x, *args, **kwargs_with_axis)

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = curried_with_axis.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in _plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried_with_axis)
            except Exception:
                try:
                    return self.apply(curried)
                except Exception:
                    # related to : GH3688
                    # try item-by-item
                    # this can be called recursively, so need to raise
                    # ValueError if we don't have this method, to indicate
                    # to aggregate to mark this column as an error
                    try:
                        return self._aggregate_item_by_item(name, *args, **kwargs)
                    except AttributeError:
                        raise ValueError

        return wrapper
    def get_group(self, name, obj=None):
        """
        Constructs NDFrame from group with provided name

        Parameters
        ----------
        name : object
            the name of the group to get as a DataFrame
        obj : NDFrame, default None
            the NDFrame to take the DataFrame out of.  If
            it is None, the object groupby was called on will
            be used

        Returns
        -------
        group : type of obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        return obj.take(inds, axis=self.axis, convert=False)

    def __iter__(self):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        return self.grouper.get_iterator(self.obj, axis=self.axis)
    def apply(self, func, *args, **kwargs):
        """
        Apply function and combine results together in an intelligent way. The
        split-apply-combine combination rules attempt to be as common sense
        based as possible. For example:

        case 1:
        group DataFrame
        apply aggregation function (f(chunk) -> Series)
        yield DataFrame, with group axis having group labels

        case 2:
        group DataFrame
        apply transform function (f(chunk) -> DataFrame with same indexes)
        yield DataFrame with resulting chunks glued together

        case 3:
        group Series
        apply function with f(chunk) -> DataFrame
        yield DataFrame with result of chunks glued together

        Parameters
        ----------
        func : function

        Notes
        -----
        See online documentation for full exposition on how to use apply.

        In the current implementation apply calls func twice on the
        first group to decide whether it can take a fast or slow code
        path. This can lead to unexpected behavior if func has
        side-effects, as they will take effect twice for the first
        group.

        See also
        --------
        aggregate, transform

        Returns
        -------
        applied : type depending on grouped object and function
        """
        func = _intercept_function(func)

        @wraps(func)
        def f(g):
            return func(g, *args, **kwargs)

        return self._python_apply_general(f)

    def _python_apply_general(self, f):
        keys, values, mutated = self.grouper.apply(f, self._selected_obj,
                                                   self.axis)

        return self._wrap_applied_output(keys, values,
                                         not_indexed_same=mutated)

    def aggregate(self, func, *args, **kwargs):
        raise NotImplementedError

    @Appender(_agg_doc)
    def agg(self, func, *args, **kwargs):
        return self.aggregate(func, *args, **kwargs)
    def _iterate_slices(self):
        yield self.name, self._selected_obj

    def transform(self, func, *args, **kwargs):
        raise NotImplementedError

    def mean(self):
        """
        Compute mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('mean')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            self._set_selection_from_grouper()
            f = lambda x: x.mean(axis=self.axis)
            return self._python_agg_general(f)

    def median(self):
        """
        Compute median of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('median')
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            self._set_selection_from_grouper()

            def f(x):
                if isinstance(x, np.ndarray):
                    x = Series(x)
                return x.median(axis=self.axis)
            return self._python_agg_general(f)
    def std(self, ddof=1):
        """
        Compute standard deviation of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        # TODO: implement at Cython level?
        return np.sqrt(self.var(ddof=ddof))

    def var(self, ddof=1):
        """
        Compute variance of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        if ddof == 1:
            return self._cython_agg_general('var')
        else:
            self._set_selection_from_grouper()
            f = lambda x: x.var(ddof=ddof)
            return self._python_agg_general(f)

    def sem(self, ddof=1):
        """
        Compute standard error of the mean of groups, excluding missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self.std(ddof=ddof) / np.sqrt(self.count())

    def size(self):
        """
        Compute group sizes
        """
        return self.grouper.size()
    sum = _groupby_function('sum', 'add', np.sum)
    prod = _groupby_function('prod', 'prod', np.prod)
    min = _groupby_function('min', 'min', np.min, numeric_only=False)
    max = _groupby_function('max', 'max', np.max, numeric_only=False)
    first = _groupby_function('first', 'first', _first_compat,
                              numeric_only=False, _convert=True)
    last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                             _convert=True)
    _count = _groupby_function('_count', 'count', _count_compat,
                               numeric_only=False)

    def count(self, axis=0):
        return self._count().astype('int64')
    def ohlc(self):
        """
        Compute open, high, low and close values of a group, excluding
        missing values

        For multiple groupings, the result index will be a MultiIndex
        """
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))
    def nth(self, n, dropna=None):
        """
        Take the nth row from each group.

        If dropna, will not show nth non-null row, dropna is either
        Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is
        equivalent to calling dropna(how=dropna) before the groupby.

        Examples
        --------
        >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
           A   B
        0  1 NaN
        2  5   6
        >>> g.nth(1)
           A  B
        1  1  4
        >>> g.nth(-1)
           A  B
        1  1  4
        2  5  6
        >>> g.nth(0, dropna='any')
           B
        A
        1  4
        5  6
        >>> g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
            B
        A
        1 NaN
        5 NaN
        """
        self._set_selection_from_grouper()
        if not dropna:  # good choice
            m = self.grouper._max_groupsize
            if n >= m or n < -m:
                return self._selected_obj.loc[[]]

            rng = np.zeros(m, dtype=bool)
            if n >= 0:
                rng[n] = True
                is_nth = self._cumcount_array(rng)
            else:
                rng[- n - 1] = True
                is_nth = self._cumcount_array(rng, ascending=False)

            result = self._selected_obj[is_nth]

            # the result index
            if self.as_index:
                ax = self.obj._info_axis
                names = self.grouper.names
                if all([n in ax for n in names]):
                    result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
                elif self._group_selection is not None:
                    result.index = self.obj._get_axis(self.axis)[is_nth]

                result = result.sort_index()

            return result

        if (isinstance(self._selected_obj, DataFrame)
           and dropna not in ['any', 'all']):
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError("For a DataFrame groupby, dropna must be "
                             "either None, 'any' or 'all', "
                             "(was passed %s)." % (dropna),)

        # old behaviour, but with all and any support for DataFrames.
        max_len = n if n >= 0 else - 1 - n

        def picker(x):
            x = x.dropna(how=dropna)  # Note: how is ignored if Series
            if len(x) <= max_len:
                return np.nan
            else:
                return x.iloc[n]

        return self.agg(picker)
    def cumcount(self, **kwargs):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Example
        -------
        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """
        self._set_selection_from_grouper()
        ascending = kwargs.pop('ascending', True)

        index = self._selected_obj.index
        cumcounts = self._cumcount_array(ascending=ascending)
        return Series(cumcounts, index)
    def head(self, n=5):
        """
        Returns first n rows of each group.

        Essentially equivalent to ``.apply(lambda x: x.head(n))``,
        except ignores as_index flag.

        Example
        -------
        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        """
        obj = self._selected_obj
        in_head = self._cumcount_array() < n
        head = obj[in_head]
        return head

    def tail(self, n=5):
        """
        Returns last n rows of each group

        Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
        except ignores as_index flag.

        Example
        -------
        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
                           columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).tail(1)
           A  B
        1  1  4
        2  5  6
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        """
        obj = self._selected_obj
        rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
        in_tail = self._cumcount_array(rng, ascending=False) > -n
        tail = obj[in_tail]
        return tail
    def _cumcount_array(self, arr=None, **kwargs):
        """
        arr is where cumcount gets its values from
        """
        ascending = kwargs.pop('ascending', True)

        if arr is None:
            arr = np.arange(self.grouper._max_groupsize, dtype='int64')

        len_index = len(self._selected_obj.index)
        cumcounts = np.empty(len_index, dtype=arr.dtype)

        if ascending:
            for v in self.indices.values():
                cumcounts[v] = arr[:len(v)]
        else:
            for v in self.indices.values():
                cumcounts[v] = arr[len(v) - 1::-1]
        return cumcounts

    def _index_with_as_index(self, b):
        """
        Take boolean mask of index to be returned from apply, if as_index=True
        """
        # TODO perf, it feels like this should already be somewhere...
        from itertools import chain
        original = self._selected_obj.index
        gp = self.grouper
        levels = chain((gp.levels[i][gp.labels[i][b]]
                        for i in range(len(gp.groupings))),
                       (original.get_level_values(i)[b]
                        for i in range(original.nlevels)))
        new = MultiIndex.from_arrays(list(levels))
        new.names = gp.names + original.names
        return new
    def _try_cast(self, result, obj):
        """
        try to cast the result to our obj original type,
        we may have roundtripped through object in the meantime
        """
        if obj.ndim > 1:
            dtype = obj.values.dtype
        else:
            dtype = obj.dtype

        if not np.isscalar(result):
            result = _possibly_downcast_to_dtype(result, dtype)

        return result

    def _cython_agg_general(self, how, numeric_only=True):
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, names = self.grouper.aggregate(obj.values, how)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)

        if len(output) == 0:
            raise DataError('No numeric types to aggregate')

        return self._wrap_aggregated_output(output, names)
    def _python_agg_general(self, func, *args, **kwargs):
        func = _intercept_function(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" (excluding any exclusions) to populate
        # the output dict
        output = {}
        for name, obj in self._iterate_slices():
            try:
                result, counts = self.grouper.agg_series(obj, f)
                output[name] = self._try_cast(result, obj)
            except TypeError:
                continue

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:

            mask = counts.ravel() > 0
            for name, result in compat.iteritems(output):

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = com.ensure_float(values)

                output[name] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)
    def _wrap_applied_output(self, *args, **kwargs):
        raise NotImplementedError

    def _concat_objects(self, keys, values, not_indexed_same=False):
        from pandas.tools.merge import concat

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:
                result = result.reindex_axis(ax, axis=self.axis)
        elif self.group_keys:
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names
                result = concat(values, axis=self.axis, keys=group_keys,
                                levels=group_levels, names=group_names)
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            result = concat(values, axis=self.axis)

        return result

    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = []
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered
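
# Illustrative sketch (not part of the original module): as the ``apply``
# docstring notes, func is called twice on the first group while choosing a
# fast or slow path, so side effects fire twice for that group. The frame and
# function names here are hypothetical; wrapped so importing has no side
# effects.
def _demo_apply_called_twice():
    import pandas as pd
    df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})
    calls = []

    def f(group):
        calls.append(group.name)  # side effect: record each invocation
        return group['B'].sum()

    df.groupby('A').apply(f)
    return calls  # expect 'a' to appear twice, e.g. ['a', 'a', 'b']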

@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
    if isinstance(obj, Series):
        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        klass = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError('invalid type: %s' % type(obj))

    return klass(obj, by, **kwds)
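
# Illustrative sketch (not part of the original module): whitelisted method
# calls on a GroupBy "dispatch" to each group via ``_make_wrapper``, so the
# two expressions below compute per-group standard deviations the same way
# (common NumPy reducers are special-cased, per _agg_doc above). Wrapped in a
# function so importing has no side effects.
def _demo_dispatch():
    import pandas as pd
    df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1.0, 2.0, 3.0]})
    via_dispatch = df.groupby('A').std()         # dispatched to each group
    via_agg = df.groupby('A').aggregate(np.std)  # explicit aggregation
    return via_dispatch, via_agg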

def _get_axes(group):
    if isinstance(group, Series):
        return [group.index]
    else:
        return group.axes


def _is_indexed_like(obj, axes):
    if isinstance(obj, Series):
        if len(axes) > 1:
            return False
        return obj.index.equals(axes[0])
    elif isinstance(obj, DataFrame):
        return obj.index.equals(axes[0])

    return False

class BaseGrouper(object):
    """
    This is an internal Grouper class, which actually holds the generated
    groups
    """

    def __init__(self, axis, groupings, sort=True, group_keys=True):
        self.axis = axis
        self.groupings = groupings
        self.sort = sort
        self.group_keys = group_keys
        self.compressed = True

    @property
    def shape(self):
        return tuple(ping.ngroups for ping in self.groupings)

    def __iter__(self):
        return iter(self.indices)

    @property
    def nkeys(self):
        return len(self.groupings)

    def get_iterator(self, data, axis=0):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        splitter = self._get_splitter(data, axis=axis)
        keys = self._get_group_keys()
        for key, (i, group) in zip(keys, splitter):
            yield key, group

    def _get_splitter(self, data, axis=0):
        comp_ids, _, ngroups = self.group_info
        return get_splitter(data, comp_ids, ngroups, axis=axis)

    def _get_group_keys(self):
        if len(self.groupings) == 1:
            return self.levels[0]
        else:
            comp_ids, _, ngroups = self.group_info
            # provide "flattened" iterator for multi-group setting
            mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
            return [mapper.get_key(i) for i in range(ngroups)]

    def apply(self, f, data, axis=0):
        mutated = False
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()

        # oh boy
        if (f.__name__ not in _plotting_methods and
                hasattr(splitter, 'fast_apply') and axis == 0):
            try:
                values, mutated = splitter.fast_apply(f, group_keys)
                return group_keys, values, mutated
            except lib.InvalidApply:
                # we detect a mutation of some kind
                # so take slow path
                pass
            except Exception:
                # raise this error to the caller
                pass

        result_values = []
        for key, (i, group) in zip(group_keys, splitter):
            object.__setattr__(group, 'name', key)

            # group might be modified
            group_axes = _get_axes(group)
            res = f(group)
            if not _is_indexed_like(res, group_axes):
                mutated = True
            result_values.append(res)

        return group_keys, result_values, mutated
    @cache_readonly
    def indices(self):
        """ dict {group name -> group indices} """
        if len(self.groupings) == 1:
            return self.groupings[0].indices
        else:
            label_list = [ping.labels for ping in self.groupings]
            keys = [ping.group_index for ping in self.groupings]
            return _get_indices_dict(label_list, keys)

    @property
    def labels(self):
        return [ping.labels for ping in self.groupings]

    @property
    def levels(self):
        return [ping.group_index for ping in self.groupings]

    @property
    def names(self):
        return [ping.name for ping in self.groupings]

    def size(self):
        """
        Compute group sizes
        """
        # TODO: better impl
        labels, _, ngroups = self.group_info
        bin_counts = algos.value_counts(labels, sort=False)
        bin_counts = bin_counts.reindex(np.arange(ngroups))
        bin_counts.index = self.result_index
        return bin_counts

    @cache_readonly
    def _max_groupsize(self):
        """
        Compute size of largest group
        """
        # For many items in each group this is much faster than
        # self.size().max(), in worst case marginally slower
        if self.indices:
            return max(len(v) for v in self.indices.values())
        else:
            return 0

    @cache_readonly
    def groups(self):
        """ dict {group name -> group labels} """
        if len(self.groupings) == 1:
            return self.groupings[0].groups
        else:
            to_groupby = lzip(*(ping.grouper for ping in self.groupings))
            to_groupby = Index(to_groupby)

            return self.axis.groupby(to_groupby.values)
    @cache_readonly
    def group_info(self):
        comp_ids, obs_group_ids = self._get_compressed_labels()

        ngroups = len(obs_group_ids)
        comp_ids = com._ensure_int64(comp_ids)
        return comp_ids, obs_group_ids, ngroups

    def _get_compressed_labels(self):
        all_labels = [ping.labels for ping in self.groupings]
        if self._overflow_possible:
            tups = lib.fast_zip(all_labels)
            labs, uniques = algos.factorize(tups)

            if self.sort:
                uniques, labs = _reorder_by_uniques(uniques, labs)

            return labs, uniques
        else:
            if len(all_labels) > 1:
                group_index = get_group_index(all_labels, self.shape)
                comp_ids, obs_group_ids = _compress_group_index(group_index)
            else:
                ping = self.groupings[0]
                comp_ids = ping.labels
                obs_group_ids = np.arange(len(ping.group_index))
                self.compressed = False
                self._filter_empty_groups = False

            return comp_ids, obs_group_ids

    @cache_readonly
    def _overflow_possible(self):
        return _int64_overflow_possible(self.shape)

    @cache_readonly
    def ngroups(self):
        return len(self.result_index)

    @cache_readonly
    def result_index(self):
        recons = self.get_group_levels()
        return MultiIndex.from_arrays(recons, names=self.names)

    def get_group_levels(self):
        obs_ids = self.group_info[1]

        if not self.compressed and len(self.groupings) == 1:
            return [self.groupings[0].group_index]

        if self._overflow_possible:
            recons_labels = [np.array(x) for x in zip(*obs_ids)]
        else:
            recons_labels = decons_group_index(obs_ids, self.shape)

        name_list = []
        for ping, labels in zip(self.groupings, recons_labels):
            labels = com._ensure_platform_int(labels)
            name_list.append(ping.group_index.take(labels))

        return name_list
    #------------------------------------------------------------
    # Aggregation functions

    _cython_functions = {
        'add': 'group_add',
        'prod': 'group_prod',
        'min': 'group_min',
        'max': 'group_max',
        'mean': 'group_mean',
        'median': {
            'name': 'group_median'
        },
        'var': 'group_var',
        'first': {
            'name': 'group_nth',
            'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
        },
        'last': 'group_last',
        'count': 'group_count',
    }

    _cython_arity = {
        'ohlc': 4,  # OHLC
    }

    _name_functions = {}

    _filter_empty_groups = True
    def _get_aggregate_function(self, how, values):

        dtype_str = values.dtype.name

        def get_func(fname):
            # find the function, or use the object function, or return a
            # generic (note: look up each candidate dtype in turn; using
            # dtype_str here instead of dt would skip the 'object' fallback)
            for dt in [dtype_str, 'object']:
                f = getattr(_algos, "%s_%s" % (fname, dt), None)
                if f is not None:
                    return f
            return getattr(_algos, fname, None)

        ftype = self._cython_functions[how]

        if isinstance(ftype, dict):
            func = afunc = get_func(ftype['name'])

            # a sub-function
            f = ftype.get('f')
            if f is not None:

                def wrapper(*args, **kwargs):
                    return f(afunc, *args, **kwargs)

                # need to curry our sub-function
                func = wrapper

        else:
            func = get_func(ftype)

        if func is None:
            raise NotImplementedError("function is not implemented for this "
                                      "dtype: [how->%s,dtype->%s]" %
                                      (how, dtype_str))
        return func, dtype_str
    def aggregate(self, values, how, axis=0):
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError
            out_shape = (self.ngroups,) + values.shape[1:]

        if is_numeric_dtype(values.dtype):
            values = com.ensure_float(values)
            is_numeric = True
            out_dtype = 'f%d' % values.dtype.itemsize
        else:
            is_numeric = issubclass(values.dtype.type, (np.datetime64,
                                                        np.timedelta64))
            if is_numeric:
                out_dtype = 'float64'
                values = values.view('int64')
            else:
                out_dtype = 'object'
                values = values.astype(object)

        # will be filled in Cython function
        result = np.empty(out_shape, dtype=out_dtype)
        result.fill(np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)

        result = self._aggregate(result, counts, values, how, is_numeric)

        if self._filter_empty_groups:
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        result, (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
    def _aggregate(self, result, counts, values, how, is_numeric):
        agg_func, dtype = self._get_aggregate_function(how, values)

        comp_ids, _, ngroups = self.group_info
        if values.ndim > 3:
            # punting for now
            raise NotImplementedError
        elif values.ndim > 2:
            for i, chunk in enumerate(values.transpose(2, 0, 1)):
                chunk = chunk.squeeze()
                agg_func(result[:, :, i], counts, chunk, comp_ids)
        else:
            agg_func(result, counts, values, comp_ids)

        return result
    def agg_series(self, obj, func):
        try:
            return self._aggregate_series_fast(obj, func)
        except Exception:
            return self._aggregate_series_pure_python(obj, func)

    def _aggregate_series_fast(self, obj, func):
        func = _intercept_function(func)

        if obj.index._has_complex_internals:
            raise TypeError('Incompatible index for Cython grouper')

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0)).to_dense()
        indexer = _algos.groupsort_indexer(group_index, ngroups)[0]
        obj = obj.take(indexer, convert=False)
        group_index = com.take_nd(group_index, indexer, allow_fill=False)
        grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
                                    dummy)
        result, counts = grouper.get_result()
        return result, counts

    def _aggregate_series_pure_python(self, obj, func):
        group_index, _, ngroups = self.group_info

        counts = np.zeros(ngroups, dtype=int)
        result = None

        splitter = get_splitter(obj, group_index, ngroups, axis=self.axis)

        for label, group in splitter:
            res = func(group)
            if result is None:
                if (isinstance(res, (Series, np.ndarray)) or
                        isinstance(res, list)):
                    raise ValueError('Function does not reduce')
                result = np.empty(ngroups, dtype='O')

            counts[label] = group.shape[0]
            result[label] = res

        result = lib.maybe_convert_objects(result, try_float=0)
        return result, counts
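
# Illustrative sketch (not part of the original module): ``group_info``
# reduces one label array per grouping to a single array of compressed group
# ids plus the set of observed groups, much like factorizing the zipped key
# tuples. A rough equivalent using public pandas calls (names hypothetical):
def _demo_group_info():
    import pandas as pd
    key1 = ['a', 'a', 'b', 'b']
    key2 = ['x', 'y', 'x', 'x']
    tups = pd.Series(list(zip(key1, key2)))  # object Series of key tuples
    comp_ids, uniques = pd.factorize(tups)   # compressed ids per row
    return comp_ids, len(uniques)            # e.g. array([0, 1, 2, 2]), 3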

def generate_bins_generic(values, binner, closed):
    """
    Generate bin edge offsets and bin labels for one array using another array
    which has bin edge values. Both arrays must be sorted.

    Parameters
    ----------
    values : array of values
    binner : a comparable array of values representing bins into which to bin
        the first array. Note, 'values' end-points must fall within 'binner'
        end-points.
    closed : which end of bin is closed; left (default), right

    Returns
    -------
    bins : array of offsets (into 'values' argument) of bins.
        Zero and last edge are excluded in result, so for instance the first
        bin is values[0:bin[0]] and the last is values[bin[-1]:]
    """
    lenidx = len(values)
    lenbin = len(binner)

    if lenidx <= 0 or lenbin <= 0:
        raise ValueError("Invalid length for values or for binner")

    # check binner fits data
    if values[0] < binner[0]:
        raise ValueError("Values fall before first bin")

    if values[lenidx - 1] > binner[lenbin - 1]:
        raise ValueError("Values fall after last bin")

    bins = np.empty(lenbin - 1, dtype=np.int64)

    j = 0   # index into values
    bc = 0  # bin count

    # linear scan, presume nothing about values/binner except that it fits ok
    for i in range(0, lenbin - 1):
        r_bin = binner[i + 1]

        # count values in current bin, advance to next bin
        while j < lenidx and (values[j] < r_bin or
                              (closed == 'right' and values[j] == r_bin)):
            j += 1

        bins[bc] = j
        bc += 1

    return bins
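
# Illustrative sketch (not part of the original module): for sorted values
# and bin edges, generate_bins_generic returns, for each bin, the offset one
# past its last member, so consecutive offsets delimit the bins.
def _demo_generate_bins_generic():
    values = np.array([1, 2, 3, 4, 5, 6])
    binner = np.array([0, 3, 6])  # two bins: (0, 3] and (3, 6]
    bins = generate_bins_generic(values, binner, closed='right')
    # bins == array([3, 6]): values[0:3] -> first bin, values[3:6] -> second
    return bins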

class BinGrouper(BaseGrouper):

    def __init__(self, bins, binlabels, filter_empty=False):
        self.bins = com._ensure_int64(bins)
        self.binlabels = _ensure_index(binlabels)
        self._filter_empty_groups = filter_empty

    @cache_readonly
    def groups(self):
        """ dict {group name -> group labels} """

        # this is mainly for compat
        # GH 3881
        result = {}
        for key, value in zip(self.binlabels, self.bins):
            if key is not tslib.NaT:
                result[key] = value
        return result

    @property
    def nkeys(self):
        return 1

    def get_iterator(self, data, axis=0):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        if isinstance(data, NDFrame):
            slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis)
            length = len(data.axes[axis])
        else:
            slicer = lambda start, edge: data[slice(start, edge)]
            length = len(data)

        start = 0
        for edge, label in zip(self.bins, self.binlabels):
            if label is not tslib.NaT:
                yield label, slicer(start, edge)
            start = edge
