
/pandas/core/generic.py

http://github.com/pydata/pandas
Possible License(s): BSD-3-Clause, Apache-2.0
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations

import collections
from datetime import timedelta
import functools
import gc
import json
import operator
import pickle
import re
from typing import (
    TYPE_CHECKING,
    Any,
    AnyStr,
    Callable,
    Hashable,
    Literal,
    Mapping,
    Sequence,
    cast,
    final,
    overload,
)
import warnings
import weakref

import numpy as np

from pandas._config import config

from pandas._libs import lib
from pandas._libs.tslibs import (
    Period,
    Tick,
    Timestamp,
    to_offset,
)
from pandas._typing import (
    ArrayLike,
    Axis,
    CompressionOptions,
    Dtype,
    DtypeArg,
    DtypeObj,
    FilePathOrBuffer,
    IndexKeyFunc,
    IndexLabel,
    JSONSerializable,
    Level,
    Manager,
    NDFrameT,
    RandomState,
    Renamer,
    StorageOptions,
    T,
    TimedeltaConvertibleTypes,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    npt,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    InvalidIndexError,
)
from pandas.util._decorators import (
    doc,
    rewrite_axis_style_signature,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    validate_ascending,
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_inclusive,
)

from pandas.core.dtypes.common import (
    ensure_object,
    ensure_platform_int,
    ensure_str,
    is_bool,
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_datetime64tz_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_list_like,
    is_number,
    is_numeric_dtype,
    is_re_compilable,
    is_scalar,
    is_timedelta64_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.inference import (
    is_hashable,
    is_nested_list_like,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    arraylike,
    indexing,
    missing,
    nanops,
)
import pandas.core.algorithms as algos
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import (
    create_series_with_explicit_dtype,
    extract_array,
)
from pandas.core.describe import describe_ndframe
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    MultiIndex,
    PeriodIndex,
    RangeIndex,
    default_index,
    ensure_index,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
    SingleArrayManager,
)
from pandas.core.internals.construction import mgr_to_mgr
from pandas.core.missing import find_valid_index
from pandas.core.ops import align_method_FRAME
from pandas.core.reshape.concat import concat
import pandas.core.sample as sample
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
    Expanding,
    ExponentialMovingWindow,
    Rolling,
    Window,
)
from pandas.io.formats import format as fmt
from pandas.io.formats.format import (
    DataFrameFormatter,
    DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from pandas._libs.tslibs import BaseOffset

    from pandas.core.frame import DataFrame
    from pandas.core.indexers.objects import BaseIndexer
    from pandas.core.resample import Resampler
    from pandas.core.series import Series
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
    "axes": "keywords for axes",
    "klass": "Series/DataFrame",
    "axes_single_arg": "int or labels for object",
    "args_transpose": "axes to permute (int or label for object)",
    "inplace": """
    inplace : bool, default False
        If True, performs operation inplace and returns None.""",
    "optional_by": """
        by : str or list of str
            Name or list of names to sort by""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}


bool_t = bool  # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional data in a
    size-mutable, labeled data structure.

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """

    _internal_names: list[str] = [
        "_mgr",
        "_cacher",
        "_item_cache",
        "_cache",
        "_is_copy",
        "_subtyp",
        "_name",
        "_default_kind",
        "_default_fill_value",
        "_metadata",
        "__array_struct__",
        "__array_interface__",
        "_flags",
    ]
    _internal_names_set: set[str] = set(_internal_names)
    _accessors: set[str] = set()
    _hidden_attrs: frozenset[str] = frozenset(
        ["_AXIS_NAMES", "_AXIS_NUMBERS", "get_values", "tshift"]
    )
    _metadata: list[str] = []
    _is_copy: weakref.ReferenceType[NDFrame] | None = None
    _mgr: Manager
    _attrs: dict[Hashable, Any]
    _typ: str

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data: Manager,
        copy: bool_t = False,
        attrs: Mapping[Hashable, Any] | None = None,
    ):
        # copy kwarg is retained for mypy compat, is not used
        object.__setattr__(self, "_is_copy", None)
        object.__setattr__(self, "_mgr", data)
        object.__setattr__(self, "_item_cache", {})
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)
        object.__setattr__(self, "_attrs", attrs)
        object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
    @classmethod
    def _init_mgr(
        cls,
        mgr: Manager,
        axes,
        dtype: Dtype | None = None,
        copy: bool_t = False,
    ) -> Manager:
        """passed a manager and an axes dict"""
        for a, axe in axes.items():
            if axe is not None:
                axe = ensure_index(axe)
                bm_axis = cls._get_block_manager_axis(a)
                mgr = mgr.reindex_axis(axe, axis=bm_axis)

        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if (
                isinstance(mgr, BlockManager)
                and len(mgr.blocks) == 1
                and is_dtype_equal(mgr.blocks[0].values.dtype, dtype)
            ):
                pass
            else:
                mgr = mgr.astype(dtype=dtype)
        return mgr
    @classmethod
    def _from_mgr(cls, mgr: Manager):
        """
        Fastpath to create a new DataFrame/Series from just a BlockManager/ArrayManager.

        Notes
        -----
        Skips setting `_flags` attribute; caller is responsible for doing so.
        """
        obj = cls.__new__(cls)
        object.__setattr__(obj, "_is_copy", None)
        object.__setattr__(obj, "_mgr", mgr)
        object.__setattr__(obj, "_item_cache", {})
        object.__setattr__(obj, "_attrs", {})
        return obj

    def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT:
        """
        Private helper function to create a DataFrame with specific manager.

        Parameters
        ----------
        typ : {"block", "array"}
        copy : bool, default True
            Only controls whether the conversion from Block->ArrayManager
            copies the 1D arrays (to ensure proper/contiguous memory layout).

        Returns
        -------
        DataFrame
            New DataFrame using specified manager type. Is not guaranteed
            to be a copy or not.
        """
        new_mgr: Manager
        new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
        # fastpath of passing a manager doesn't check the option/manager class
        return self._constructor(new_mgr).__finalize__(self)
    # ----------------------------------------------------------------------
    # attrs and flags

    @property
    def attrs(self) -> dict[Hashable, Any]:
        """
        Dictionary of global attributes of this dataset.

        .. warning::

           attrs is experimental and may change without warning.

        See Also
        --------
        DataFrame.flags : Global flags applying to this object.
        """
        if self._attrs is None:
            self._attrs = {}
        return self._attrs

    @attrs.setter
    def attrs(self, value: Mapping[Hashable, Any]) -> None:
        self._attrs = dict(value)
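
    # Illustrative usage sketch (not part of the original source; the key
    # "source" and value "sensor-7" are made-up examples): ``attrs`` is a
    # plain dict of user metadata carried on the object.
    #
    #     >>> df = pd.DataFrame({"A": [1, 2]})
    #     >>> df.attrs["source"] = "sensor-7"
    #     >>> df.attrs
    #     {'source': 'sensor-7'}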
    @final
    @property
    def flags(self) -> Flags:
        """
        Get the properties associated with this pandas object.

        The available flags are

        * :attr:`Flags.allows_duplicate_labels`

        See Also
        --------
        Flags : Flags that apply to pandas objects.
        DataFrame.attrs : Global metadata applying to this dataset.

        Notes
        -----
        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags
        <Flags(allows_duplicate_labels=True)>

        Flags can be retrieved or set using the ``.`` accessor

        >>> df.flags.allows_duplicate_labels
        True
        >>> df.flags.allows_duplicate_labels = False

        Or by slicing with a key

        >>> df.flags["allows_duplicate_labels"]
        False
        >>> df.flags["allows_duplicate_labels"] = True
        """
        return self._flags
    @final
    def set_flags(
        self: NDFrameT,
        *,
        copy: bool_t = False,
        allows_duplicate_labels: bool_t | None = None,
    ) -> NDFrameT:
        """
        Return a new object with updated flags.

        Parameters
        ----------
        allows_duplicate_labels : bool, optional
            Whether the returned object allows duplicate labels.

        Returns
        -------
        Series or DataFrame
            The same type as the caller.

        See Also
        --------
        DataFrame.attrs : Global metadata applying to this dataset.
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        This method returns a new object that's a view on the same data
        as the input. Mutating the input or the output values will be reflected
        in the other.

        This method is intended to be used in method chains.

        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags.allows_duplicate_labels
        True
        >>> df2 = df.set_flags(allows_duplicate_labels=False)
        >>> df2.flags.allows_duplicate_labels
        False
        """
        df = self.copy(deep=copy)
        if allows_duplicate_labels is not None:
            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
        return df
    @final
    @classmethod
    def _validate_dtype(cls, dtype) -> DtypeObj | None:
        """validate the passed dtype"""
        if dtype is not None:
            dtype = pandas_dtype(dtype)

            # a compound dtype
            if dtype.kind == "V":
                raise NotImplementedError(
                    "compound dtypes are not implemented "
                    f"in the {cls.__name__} constructor"
                )

        return dtype
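
    # Illustrative sketch (not part of the original source): a "compound"
    # dtype is a NumPy structured dtype, whose ``kind`` is "V" (void), which
    # is what the NotImplementedError branch above rejects.
    #
    #     >>> import numpy as np
    #     >>> np.dtype([("a", "i4"), ("b", "f8")]).kind
    #     'V'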
    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self: NDFrameT) -> type[NDFrameT]:
        """
        Used when a manipulation result has the same dimensions as the
        original.
        """
        raise AbstractMethodError(self)

    # ----------------------------------------------------------------------
    # Internals

    @final
    @property
    def _data(self):
        # GH#33054 retained because some downstream packages use this,
        # e.g. fastparquet
        return self._mgr

    # ----------------------------------------------------------------------
    # Axis
    _stat_axis_number = 0
    _stat_axis_name = "index"
    _AXIS_ORDERS: list[str]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {0: 0, "index": 0, "rows": 0}
    _info_axis_number: int
    _info_axis_name: str
    _AXIS_LEN: int
    @property
    def _AXIS_NUMBERS(self) -> dict[str, int]:
        """.. deprecated:: 1.1.0"""
        level = self.ndim + 1
        warnings.warn(
            "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=level
        )
        return {"index": 0}

    @property
    def _AXIS_NAMES(self) -> dict[int, str]:
        """.. deprecated:: 1.1.0"""
        level = self.ndim + 1
        warnings.warn(
            "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=level
        )
        return {0: "index"}
    @final
    def _construct_axes_dict(self, axes=None, **kwargs):
        """Return an axes dictionary for myself."""
        d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
        d.update(kwargs)
        return d

    @final
    @classmethod
    def _construct_axes_from_arguments(
        cls, args, kwargs, require_all: bool_t = False, sentinel=None
    ):
        """
        Construct and return axes if supplied in args/kwargs.

        If require_all, raise if not all axis arguments are supplied.
        Return a tuple of (axes, kwargs).

        sentinel specifies the default parameter when an axis is not
        supplied; useful to distinguish when a user explicitly passes None
        in scenarios where None has special meaning.
        """
        # construct the args
        args = list(args)
        for a in cls._AXIS_ORDERS:
            # look for an argument by position
            if a not in kwargs:
                try:
                    kwargs[a] = args.pop(0)
                except IndexError as err:
                    if require_all:
                        raise TypeError(
                            "not enough/duplicate arguments specified!"
                        ) from err

        axes = {a: kwargs.pop(a, sentinel) for a in cls._AXIS_ORDERS}
        return axes, kwargs
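
    # Illustrative sketch (not part of the original source): for a DataFrame,
    # _AXIS_ORDERS is ["index", "columns"], so positional args fill the axes
    # in that order and any remaining kwargs are passed through untouched.
    #
    #     >>> pd.DataFrame._construct_axes_from_arguments(
    #     ...     (["r0", "r1"],), {"columns": ["c0"], "copy": True}
    #     ... )
    #     ({'index': ['r0', 'r1'], 'columns': ['c0']}, {'copy': True})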
    @final
    @classmethod
    def _get_axis_number(cls, axis: Axis) -> int:
        try:
            return cls._AXIS_TO_AXIS_NUMBER[axis]
        except KeyError:
            raise ValueError(f"No axis named {axis} for object type {cls.__name__}")

    @final
    @classmethod
    def _get_axis_name(cls, axis: Axis) -> str:
        axis_number = cls._get_axis_number(axis)
        return cls._AXIS_ORDERS[axis_number]

    @final
    def _get_axis(self, axis: Axis) -> Index:
        axis_number = self._get_axis_number(axis)
        assert axis_number in {0, 1}
        return self.index if axis_number == 0 else self.columns

    @final
    @classmethod
    def _get_block_manager_axis(cls, axis: Axis) -> int:
        """Map the axis to the block_manager axis."""
        axis = cls._get_axis_number(axis)
        ndim = cls._AXIS_LEN
        if ndim == 2:
            # i.e. DataFrame
            return 1 - axis
        return axis
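
    # Illustrative note (not part of the original source): the BlockManager
    # stores a DataFrame transposed relative to the user-facing axes, so the
    # mapping flips for 2-D objects and is the identity for a Series.
    #
    #     >>> pd.DataFrame._get_block_manager_axis("index")    # user axis 0
    #     1
    #     >>> pd.DataFrame._get_block_manager_axis("columns")  # user axis 1
    #     0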
    @final
    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # multiindex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d
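
    # Illustrative sketch (not part of the original source): these resolvers
    # back ``DataFrame.query``/``eval``, letting expressions refer to index
    # levels by name or by the positional aliases built above.
    #
    #     >>> df = pd.DataFrame({"x": [1, 2]},
    #     ...                   index=pd.Index([10, 20]))  # unnamed index
    #     >>> df.query("ilevel_0 > 10")  # 'ilevel_0' from f"{prefix}level_{i}"
    #         x
    #     20  2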
    @final
    def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
        from pandas.core.computation.parsing import clean_column_name

        d: dict[str, Series | MultiIndex] = {}
        for axis_name in self._AXIS_ORDERS:
            d.update(self._get_axis_resolvers(axis_name))

        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}

    @final
    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
        """
        Return the special character free column resolvers of a dataframe.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in :meth:`DataFrame.eval`.
        """
        from pandas.core.computation.parsing import clean_column_name

        if isinstance(self, ABCSeries):
            return {clean_column_name(self.name): self}

        return {
            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
        }

    @property
    def _info_axis(self) -> Index:
        return getattr(self, self._info_axis_name)

    @property
    def _stat_axis(self) -> Index:
        return getattr(self, self._stat_axis_name)

    @property
    def shape(self) -> tuple[int, ...]:
        """
        Return a tuple of axis dimensions
        """
        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
    @property
    def axes(self) -> list[Index]:
        """
        Return index label(s) of the internal NDFrame
        """
        # we do it this way because if we have reversed axes, then
        # the block manager shows them reversed
        return [self._get_axis(a) for a in self._AXIS_ORDERS]
    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        return self._mgr.ndim

    @property
    def size(self) -> int:
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of
        rows times number of columns if DataFrame.

        See Also
        --------
        ndarray.size : Number of elements in the array.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.size
        3

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.size
        4
        """
        return np.prod(self.shape)
    @overload
    def set_axis(
        self: NDFrameT, labels, axis: Axis = ..., inplace: Literal[False] = ...
    ) -> NDFrameT:
        ...

    @overload
    def set_axis(self, labels, axis: Axis, inplace: Literal[True]) -> None:
        ...

    @overload
    def set_axis(self, labels, *, inplace: Literal[True]) -> None:
        ...

    @overload
    def set_axis(
        self: NDFrameT, labels, axis: Axis = ..., inplace: bool_t = ...
    ) -> NDFrameT | None:
        ...

    def set_axis(self, labels, axis: Axis = 0, inplace: bool_t = False):
        """
        Assign desired index to given axis.

        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
        a list-like or Index.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.

        axis : %(axes_single_arg)s, default 0
            The axis to update. The value 0 identifies the rows%(axis_description_sub)s.

        inplace : bool, default False
            Whether to return a new %(klass)s instance.

        Returns
        -------
        renamed : %(klass)s or None
            An object of type %(klass)s or None if ``inplace=True``.

        See Also
        --------
        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
        """
        self._check_inplace_and_allows_duplicate_labels(inplace)
        return self._set_axis_nocheck(labels, axis, inplace)

    @final
    def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool_t):
        # NDFrame.rename with inplace=False calls set_axis(inplace=True) on a copy.
        if inplace:
            setattr(self, self._get_axis_name(axis), labels)
        else:
            obj = self.copy()
            obj.set_axis(labels, axis=axis, inplace=True)
            return obj

    def _set_axis(self, axis: int, labels: Index) -> None:
        labels = ensure_index(labels)
        self._mgr.set_axis(axis, labels)
        self._clear_item_cache()
    @final
    def swapaxes(self: NDFrameT, axis1, axis2, copy=True) -> NDFrameT:
        """
        Interchange axes and swap values axes appropriately.

        Returns
        -------
        y : same as input
        """
        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)

        if i == j:
            if copy:
                return self.copy()
            return self

        mapping = {i: j, j: i}

        new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN))
        new_values = self.values.swapaxes(i, j)
        if copy:
            new_values = new_values.copy()

        # ignore needed because the NDFrame constructor is different from
        # the DataFrame/Series constructors.
        return self._constructor(
            # error: Argument 1 to "NDFrame" has incompatible type "ndarray"; expected
            # "Union[ArrayManager, BlockManager]"
            # error: Argument 2 to "NDFrame" has incompatible type "*Generator[Index,
            # None, None]"; expected "bool" [arg-type]
            # error: Argument 2 to "NDFrame" has incompatible type "*Generator[Index,
            # None, None]"; expected "Optional[Mapping[Hashable, Any]]"
            new_values,  # type: ignore[arg-type]
            *new_axes,  # type: ignore[arg-type]
        ).__finalize__(self, method="swapaxes")
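
    # Illustrative usage sketch (not part of the original source): for a
    # DataFrame, swapping the two axes is equivalent to a transpose.
    #
    #     >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    #     >>> df.swapaxes("index", "columns")
    #        0  1
    #     a  1  2
    #     b  3  4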
    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def droplevel(self: NDFrameT, level, axis=0) -> NDFrameT:
        """
        Return {klass} with requested index / column level(s) removed.

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, must be the name of a level
            If list-like, elements must be names or positional indexes
            of levels.

        axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Axis along which the level(s) is removed:

            * 0 or 'index': remove level(s) in column.
            * 1 or 'columns': remove level(s) in row.

        Returns
        -------
        {klass}
            {klass} with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        return self.set_axis(new_labels, axis=axis, inplace=False)
    def pop(self, item: Hashable) -> Series | Any:
        result = self[item]
        del self[item]
        return result
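
    # Illustrative usage sketch (not part of the original source): ``pop``
    # removes a column (DataFrame) or element (Series) in place and returns
    # it, analogous to ``dict.pop``.
    #
    #     >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    #     >>> col = df.pop("a")   # df is mutated; col is the removed Series
    #     >>> list(df.columns)
    #     ['b']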
    @final
    def squeeze(self, axis=None):
        """
        Squeeze 1 dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        axis = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
        return self.iloc[
            tuple(
                0 if i in axis and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]
    # ----------------------------------------------------------------------
    # Rename

    def rename(
        self: NDFrameT,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool_t = True,
        inplace: bool_t = False,
        level: Level | None = None,
        errors: str = "ignore",
    ) -> NDFrameT | None:
        """
        Alter axes input function or functions. Function / dict values must be
        unique (1-to-1). Labels not contained in a dict / Series will be left
        as-is. Extra labels listed don't throw an error. Alternatively, change
        ``Series.name`` with a scalar value (Series only).

        Parameters
        ----------
        %(axes)s : scalar, list-like, dict-like or function, optional
            Scalar or list-like will alter the ``Series.name`` attribute,
            and raise on DataFrame.
            dict-like or functions are transformations to apply to
            that axis' values
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to return a new {klass}. If True then value of copy is
            ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        renamed : {klass} (new object)

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        NDFrame.rename_axis

        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s
        0    1
        1    2
        2    3
        dtype: int64
        >>> s.rename("my_name")  # scalar, changes Series.name
        0    1
        1    2
        2    3
        Name: my_name, dtype: int64
        >>> s.rename(lambda x: x ** 2)  # function, changes labels
        0    1
        1    2
        4    3
        dtype: int64
        >>> s.rename({1: 3, 2: 5})  # mapping, changes labels
        0    1
        3    2
        5    3
        dtype: int64

        Since ``DataFrame`` doesn't have a ``.name`` attribute,
        only mapping-type arguments are allowed.

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(2)
        Traceback (most recent call last):
        ...
        TypeError: 'int' object is not callable

        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        >>> df.rename(index=str, columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename(index=str, columns={"A": "a", "C": "c"})
           a  B
        0  1  4
        1  2  5
        2  3  6

        Using axis-style parameters

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6

        See the :ref:`user guide <basics.rename>` for more.
        """
        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")

        if index is not None or columns is not None:
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            elif mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper

        self._check_inplace_and_allows_duplicate_labels(inplace)
        result = self if inplace else self.copy(deep=copy)

        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue

            ax = self._get_axis(axis_no)
            f = com.get_rename_function(replacements)

            if level is not None:
                level = ax._get_level_number(level)

            # GH 13473
            if not callable(replacements):
                if ax._is_multi and level is not None:
                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
                else:
                    indexer = ax.get_indexer_for(replacements)

                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")

            new_index = ax._transform_index(f, level=level)
            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True)
            result._clear_item_cache()

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result.__finalize__(self, method="rename")
    @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)])
    def rename_axis(self, mapper=lib.no_default, **kwargs):
        """
        Set the name of the axis for the index or columns.

        Parameters
        ----------
        mapper : scalar, list-like, optional
            Value to set the axis name attribute.
        index, columns : scalar, list-like, dict-like or function, optional
            A scalar, list-like, dict-like or function transformations to
            apply to that axis' values.
            Note that the ``columns`` parameter is not allowed if the
            object is a Series. This parameter only applies to DataFrame
            type objects.

            Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index``
            and/or ``columns``.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to rename.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Series
            or DataFrame.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Series.rename : Alter Series index labels or name.
        DataFrame.rename : Alter DataFrame index labels or name.
        Index.rename : Set new names on index.

        Notes
        -----
        ``DataFrame.rename_axis`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        The first calling convention will only modify the names of
        the index and/or the names of the Index object that is the columns.
        In this case, the parameter ``copy`` is ignored.

        The second calling convention will modify the names of the
        corresponding index if mapper is a list or a scalar.
        However, if mapper is dict-like or a function, it will use the
        deprecated behavior of modifying the axis *labels*.

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Examples
        --------
        **Series**

        >>> s = pd.Series(["dog", "cat", "monkey"])
        >>> s
        0       dog
        1       cat
        2    monkey
        dtype: object
        >>> s.rename_axis("animal")
        animal
        0    dog
        1    cat
        2    monkey
        dtype: object

        **DataFrame**

        >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
        ...                    "num_arms": [0, 0, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs  num_arms
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("animal")
        >>> df
                num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("limbs", axis="columns")
        >>> df
        limbs   num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2

        **MultiIndex**

        >>> df.index = pd.MultiIndex.from_product([['mammal'],
        ...                                        ['dog', 'cat', 'monkey']],
        ...                                       names=['type', 'name'])
        >>> df
        limbs          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(index={'type': 'class'})
        limbs          num_legs  num_arms
        class  name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(columns=str.upper)
        LIMBS          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        """
        axes, kwargs = self._construct_axes_from_arguments(
            (), kwargs, sentinel=lib.no_default
        )
        copy = kwargs.pop("copy", True)
        inplace = kwargs.pop("inplace", False)
        axis = kwargs.pop("axis", 0)
        if axis is not None:
            axis = self._get_axis_number(axis)

        if kwargs:
            raise TypeError(
                "rename_axis() got an unexpected keyword "
                f'argument "{list(kwargs.keys())[0]}"'
            )

        inplace = validate_bool_kwarg(inplace, "inplace")

        if mapper is not lib.no_default:
            # Use v0.23 behavior if a scalar or list
            non_mapper = is_scalar(mapper) or (
                is_list_like(mapper) and not is_dict_like(mapper)
            )
            if non_mapper:
                return self._set_axis_name(mapper, axis=axis, inplace=inplace)
            else:
                raise ValueError("Use `.rename` to alter labels with a mapper.")
        else:
            # Use new behavior.  Means that index and/or columns is specified
            result = self if inplace else self.copy(deep=copy)

            for axis in range(self._AXIS_LEN):
                v = axes.get(self._get_axis_name(axis))
                if v is lib.no_default:
                    continue
                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
                if non_mapper:
                    newnames = v
                else:
                    f = com.get_rename_function(v)
                    curnames = self._get_axis(axis).names
                    newnames = [f(name) for name in curnames]
                result._set_axis_name(newnames, axis=axis, inplace=True)
            if not inplace:
                return result
    @final
    def _set_axis_name(self, name, axis=0, inplace=False):
        """
        Set the name(s) of the axis.

        Parameters
        ----------
        name : str or list of str
            Name(s) to set.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to set the label. The value 0 or 'index' specifies index,
            and the value 1 or 'columns' specifies columns.
        inplace : bool, default False
            If `True`, do operation inplace and return None.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or `None` if `inplace` is `True`.

        See Also
        --------
        DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
        Series.rename : Alter the index labels or set the index name
            of :class:`Series`.
        Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.

        Examples
        --------
        >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs
        dog            4
        cat            4
        monkey         2
        >>> df._set_axis_name("animal")
                num_legs
        animal
        dog            4
        cat            4
        monkey         2
        >>> df.index = pd.MultiIndex.from_product(
        ...                [["mammal"], ['dog', 'cat', 'monkey']])
        >>> df._set_axis_name(["type", "name"])
                     num_legs
        type   name
        mammal dog          4
               cat          4
               monkey       2
        """
        axis = self._get_axis_number(axis)
        idx = self._get_axis(axis).set_names(name)

        inplace = validate_bool_kwarg(inplace, "inplace")
        renamed = self if inplace else self.copy()
        renamed.set_axis(idx, axis=axis, inplace=True)
        if not inplace:
            return renamed
    # ----------------------------------------------------------------------
    # Comparison Methods

    @final
    def _indexed_same(self, other) -> bool_t:
        return all(
            self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
        )

    @final
    def equals(self, other: object) -> bool_t:
        """
        Test whether two objects contain the same elements.

        This function allows two Series or DataFrames to be compared against
        each other to see if they have the same shape and elements. NaNs in
        the same location are considered equal.

        The row/column index do not need to have the same type, as long
        as the values are considered equal. Corresponding columns must be of
        the same dtype.

        Parameters
        ----------
        other : Series or DataFrame
            The other Series or DataFrame to be compared with the first.

        Returns
        -------
        bool
            True if all elements are the same in both objects, False
            otherwise.

        See Also
        --------
        Series.eq : Compare two Series objects of the same length
            and return a Series where each element is True if the element
            in each Series is equal, False otherwise.
        DataFrame.eq : Compare two DataFrame objects of the same shape and
            return a DataFrame where each element is True if the respective
            element in each DataFrame is equal, False otherwise.
        testing.assert_series_equal : Raises an AssertionError if left and
            right are not equal. Provides an easy interface to ignore
            inequality in dtypes, indexes and precision among others.
        testing.assert_frame_equal : Like assert_series_equal, but targets
            DataFrames.
        numpy.array_equal : Return True if two arrays have the same shape
            and elements, False otherwise.

        Examples
        --------
        >>> df = pd.DataFrame({1: [10], 2: [20]})
        >>> df
            1   2
        0  10  20

        DataFrames df and exactly_equal have the same types and values for
        their elements and column labels, which will return True.

        >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
        >>> exactly_equal
            1   2
        0  10  20
        >>> df.equals(exactly_equal)
        True

        DataFrames df and different_column_type have the same element
        types and values, but have different types for the column labels,
        which will still return True.

        >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
        >>> different_column_type
           1.0  2.0
        0   10   20
        >>> df.equals(different_column_type)
        True

        DataFrames df and different_data_type have different types for the
        same values for their elements, and will return False even though
        their column labels are the same values and types.

        >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
        >>> different_data_type
              1     2
        0  10.0  20.0
        >>> df.equals(different_data_type)
        False
        """
        if not (isinstance(other, type(self)) or isinstance(self, type(other))):
            return False
        other = cast(NDFrame, other)
        return self._mgr.equals(other._mgr)
    # -------------------------------------------------------------------------
    # Unary Methods

    @final
    def __neg__(self):
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                return operator.inv(values)
            else:
                return operator.neg(values)

        new_data = self._mgr.apply(blk_func)
        res = self._constructor(new_data)
        return res.__finalize__(self, method="__neg__")

    @final
    def __pos__(self):
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                return values.copy()
            else:
                return operator.pos(values)

        new_data = self._mgr.apply(blk_func)
        res = self._constructor(new_data)
        return res.__finalize__(self, method="__pos__")

    @final
    def __invert__(self):
        if not self.size:
            # inv fails with 0 len
            return self

        new_data = self._mgr.apply(operator.invert)
        return self._constructor(new_data).__finalize__(self, method="__invert__")
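
    # Illustrative note (not part of the original source): the unary ops are
    # applied block-wise via ``self._mgr.apply``, so boolean data gets logical
    # semantics while numeric data gets arithmetic ones.
    #
    #     >>> -pd.Series([True, False])  # bool dtype -> operator.inv
    #     0    False
    #     1     True
    #     dtype: bool
    #     >>> -pd.Series([1, -2])        # numeric dtype -> operator.neg
    #     0   -1
    #     1    2
    #     dtype: int64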
    @final
    def __nonzero__(self):
        raise ValueError(
            f"The truth value of a {type(self).__name__} is ambiguous. "
            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
        )

    __bool__ = __nonzero__
    @final
    def bool(self):
        """
        Return the bool of a single element Series or DataFrame.

        This must be a boolean scalar value, either True or False. It will raise a
        ValueError if the Series or DataFrame does not have exactly 1 element, or that
        element is not boolean (integer values 0 and 1 will also raise an exception).

        Returns
        -------
        bool
            The value in the Series or DataFrame.

        See Also
        --------
        Series.astype : Change the data type of a Series, including to boolean.
        DataFrame.astype : Change the data type of a DataFrame, including to boolean.
        numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.

        Examples
        --------
        The method will only work for single element objects with a boolean value:

        >>> pd.Series([True]).bool()
        True
        >>> pd.Series([False]).bool()
        False

        >>> pd.DataFrame({'col': [True]}).bool()
        True
        >>> pd.DataFrame({'col': [False]}).bool()
        False
        """
        v = self.squeeze()
        if isinstance(v, (bool, np.bool_)):
            return bool(v)
        elif is_scalar(v):
            raise ValueError(
                "bool cannot act on a non-boolean single element "
                f"{type(self).__name__}"
            )

        self.__nonzero__()
    @final
    def abs(self: NDFrameT) -> NDFrameT:
        """
        Return a Series/DataFrame with absolute numeric value of each element.

        This function only applies to elements that are all numeric.

        Returns
        -------
        abs
            Series/DataFrame containing the absolute value of each element.

        See Also
        --------
        numpy.absolute : Calculate the absolute value element-wise.

        Notes
        -----
        For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
        :math:`\\sqrt{ a^2 + b^2 }`.

        Examples
        --------
        Absolute numeric values in a Series.

        >>> s = pd.Series([-1.10, 2, -3.33, 4])
        >>> s.abs()
        0    1.10
        1    2.00
        2    3.33
        3    4.00
        dtype: float64

        Absolute numeric values in a Series with complex numbers.

        >>> s = pd.Series([1.2 + 1j])
        >>> s.abs()
        0    1.56205
        dtype: float64

        Absolute numeric values in a Series with a Timedelta element.

        >>> s = pd.Series([pd.Timedelta('1 days')])
        >>> s.abs()
        0   1 days
        dtype: timedelta64[ns]

        Select rows with data closest to certain value using argsort (from
        `StackOverflow <https://stackoverflow.com/a/17758115>`__).

        >>> df = pd.DataFrame({
        ...     'a': [4, 5, 6, 7],
        ...     'b': [10, 20, 30, 40],
        ...     'c': [100, 50, -30, -50]
        ... })
        >>> df
           a   b    c
        0  4  10  100
        1  5  20   50
        2  6  30  -30
        3  7  40  -50
        >>> df.loc[(df.c - 43).abs().argsort()]
           a   b    c
        1  5  20   50
        0  4  10  100
        2  6  30  -30
        3  7  40  -50
        """
        res_mgr = self._mgr.apply(np.abs)
        return self._constructor(res_mgr).__finalize__(self, name="abs")

    @final
    def __abs__(self: NDFrameT) -> NDFrameT:
        return self.abs()

    @final
    def __round__(self: NDFrameT, decimals: int = 0) -> NDFrameT:
        return self.round(decimals)
    # -------------------------------------------------------------------------
    # Label or Level Combination Helpers
    #
    # A collection of helper methods for DataFrame/Series operations that
    # accept a combination of column/index labels and levels. All such
    # operations should utilize/extend these methods when possible so that we
    # have consistent precedence and validation logic throughout the library.

    @final
    def _is_level_reference(self, key, axis=0):
        """
        Test whether a key is a level reference for a given axis.

        To be considered a level reference, `key` must be a string that:
          - (axis=0): Matches the name of an index level and does NOT match
            a column label.
          - (axis=1): Matches the name of a column level and does NOT match
            an index label.

        Parameters
        ----------
        key : str
            Potential level name for the given axis
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        is_level : bool
        """
        axis = self._get_axis_number(axis)

        return (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis].names
            and not self._is_label_reference(key, axis=axis)
        )
    @final
    def _is_label_reference(self, key, axis=0) -> bool_t:
        """
        Test whether a key is a label reference for a given axis.

        To be considered a label reference, `key` must be a string that:
          - (axis=0): Matches a column label
          - (axis=1): Matches an index label

        Parameters
        ----------
        key : str
            Potential label name
        axis : int, default 0
            Axis perpendicular to the axis that labels are associated with
            (0 means search for column labels, 1 means search for index labels)

        Returns
        -------
        is_label: bool
        """
        axis = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)

        return (
            key is not None
            and is_hashable(key)
            and any(key in self.axes[ax] for ax in other_axes)
        )

    @final
    def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t:
        """
        Test whether a key is a label or level reference for a given axis.

        To be considered either a label or a level reference, `key` must be a
        string that:
          - (axis=0): Matches a column label or an index level
          - (axis=1): Matches an index label or a column level

        Parameters
        ----------
        key : str
            Potential label or level name
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        bool
        """
        return self._is_level_reference(key, axis=axis) or self._is_label_reference(
            key, axis=axis
        )
    @final
    def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None:
        """
        Check whether `key` is ambiguous.

        By ambiguous, we mean that it matches both a level of the input
        `axis` and a label of the other axis.

        Parameters
        ----------
        key : str or object
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns).

        Raises
        ------
        ValueError: `key` is ambiguous
        """

        axis = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)

        if (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis].names
            and any(key in self.axes[ax] for ax in other_axes)
        ):
            # Build an informative and grammatical warning
            level_article, level_type = (
                ("an", "index") if axis == 0 else ("a", "column")
            )

            label_article, label_type = (
                ("a", "column") if axis == 0 else ("an", "index")
            )

            msg = (
                f"'{key}' is both {level_article} {level_type} level and "
                f"{label_article} {label_type} label, which is ambiguous."
            )
            raise ValueError(msg)
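
    # Illustrative sketch (not part of the original source): ambiguity arises
    # when the same name appears both as an index level and as a column label.
    #
    #     >>> df = pd.DataFrame({"x": [1, 2]},
    #     ...                   index=pd.Index([10, 20], name="x"))
    #     >>> df._check_label_or_level_ambiguity("x")
    #     Traceback (most recent call last):
    #     ...
    #     ValueError: 'x' is both an index level and a column label, which is ambiguous.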
    @final
    def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray:
        """
        Return a 1-D array of values associated with `key`, a label or level
        from the given `axis`.

        Retrieval logic:
          - (axis=0): Return column values if `key` matches a column label.
            Otherwise return index level values if `key` matches an index
            level.
          - (axis=1): Return row values if `key` matches an index label.
            Otherwise return column level values if 'key' matches a column
            level

        Parameters
        ----------
        key : str
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        values : np.ndarray

        Raises
        ------
        KeyError
            if `key` matches neither a label nor a level
        ValueError
            if `key` matches multiple labels
        FutureWarning
            if `key` is ambiguous. This will become an ambiguity error in a
            future version
        """
        axis = self._get_axis_number(axis)
        other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]

        if self._is_label_reference(key, axis=axis):
            self._check_label_or_level_ambiguity(key, axis=axis)
            values = self.xs(key, axis=other_axes[0])._values
        elif self._is_level_reference(key, axis=axis):
            values = self.axes[axis].get_level_values(key)._values
        else:
            raise KeyError(key)

        # Check for duplicates
        if values.ndim > 1:

            if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
                multi_message = (
                    "\n"
                    "For a multi-index, the label must be a "
                    "tuple with elements corresponding to each level."
                )
            else:
                multi_message = ""

            label_axis_name = "column" if axis == 0 else "index"
            raise ValueError(
                f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
            )

        return values
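
    # Illustrative sketch (not part of the original source): with axis=0, a
    # column label resolves to column values and an index level name resolves
    # to the level's values; here the two names are distinct, so both succeed.
    #
    #     >>> df = pd.DataFrame({"col": [1, 2]},
    #     ...                   index=pd.Index(["a", "b"], name="lvl"))
    #     >>> df._get_label_or_level_values("col")  # label -> column values
    #     array([1, 2])
    #     >>> df._get_label_or_level_values("lvl")  # level -> index values
    #     array(['a', 'b'], dtype=object)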
    @final
    def _drop_labels_or_levels(self, keys, axis: int = 0):
        """
        Drop labels and/or levels for the given `axis`.

        For each key in `keys`:
          - (axis=0): If key matches a column label then drop the column.
            Otherwise if key matches an index level then drop the level.
          - (axis=1): If key matches an index label then drop the row.
            Otherwise if key matches a column level then drop the level.

        Parameters
        ----------
        keys : str or list of str
            labels or levels to drop
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        dropped: DataFrame

        Raises
        ------
        ValueError
            if any `keys` match neither a label nor a level
        """
        axis = self._get_axis_number(axis)

        # Validate keys
        keys = com.maybe_make_list(keys)
        invalid_keys = [
            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
        ]

        if invalid_keys:
            raise ValueError(
                "The following keys are not valid labels or "
                f"levels for axis {axis}: {invalid_keys}"
            )

        # Compute levels and labels to drop
        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]

        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]

        # Perform copy upfront and then use inplace operations below.
        # This ensures that we always perform exactly one copy.
        # ``copy`` and/or ``inplace`` options could be added in the future.
        dropped = self.copy()

        if axis == 0:
            # Handle dropping index levels
            if levels_to_drop:
                dropped.reset_index(levels_to_drop, drop=True, inplace=True)

            # Handle dropping columns labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=1, inplace=True)
        else:
            # Handle dropping column levels
            if levels_to_drop:
                if isinstance(dropped.columns, MultiIndex):
                    # Drop the specified levels from the MultiIndex
                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
                else:
                    # Drop the last level of Index by replacing with
                    # a RangeIndex
                    dropped.columns = RangeIndex(dropped.columns.size)

            # Handle dropping index labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=0, inplace=True)

        return dropped
  1639. # ----------------------------------------------------------------------
  1640. # Iteration
  1641. # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
  1642. # Incompatible types in assignment (expression has type "None", base class
  1643. # "object" defined the type as "Callable[[object], int]")
  1644. __hash__: None # type: ignore[assignment]
  1645. def __iter__(self):
  1646. """
  1647. Iterate over info axis.
  1648. Returns
  1649. -------
  1650. iterator
  1651. Info axis as iterator.
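Examples
--------
A sketch; iterating a DataFrame yields its column labels:
>>> df = pd.DataFrame({"a": [1], "b": [2]})
>>> list(df)
['a', 'b']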
  1652. """
  1653. return iter(self._info_axis)
  1654. # can we get a better explanation of this?
  1655. def keys(self):
  1656. """
  1657. Get the 'info axis' (see Indexing for more).
  1658. This is index for Series, columns for DataFrame.
  1659. Returns
  1660. -------
  1661. Index
  1662. Info axis.
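Examples
--------
A sketch; for a DataFrame, ``keys`` returns the columns:
>>> d = pd.DataFrame({"a": [1], "b": [2]})
>>> d.keys()
Index(['a', 'b'], dtype='object')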
  1663. """
  1664. return self._info_axis
  1665. def items(self):
  1666. """
  1667. Iterate over (label, values) on the info axis.
  1668. This is index for Series and columns for DataFrame.
  1669. Returns
  1670. -------
  1671. Generator
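Examples
--------
A sketch over a small frame; each yielded value is a Series:
>>> df = pd.DataFrame({"a": [1], "b": [2]})
>>> [(label, int(s.iloc[0])) for label, s in df.items()]
[('a', 1), ('b', 2)]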
  1672. """
  1673. for h in self._info_axis:
  1674. yield h, self[h]
  1675. @doc(items)
  1676. def iteritems(self):
  1677. return self.items()
  1678. def __len__(self) -> int:
  1679. """Returns length of info axis"""
  1680. return len(self._info_axis)
  1681. @final
  1682. def __contains__(self, key) -> bool_t:
  1683. """True if the key is in the info axis"""
  1684. return key in self._info_axis
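# Illustrative sketch: ``"a" in pd.DataFrame({"a": [1]})`` is True, since
# membership checks the info axis (columns here), never the values.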
  1685. @property
  1686. def empty(self) -> bool_t:
  1687. """
  1688. Indicator whether DataFrame is empty.
  1689. True if DataFrame is entirely empty (no items), meaning any of the
  1690. axes are of length 0.
  1691. Returns
  1692. -------
  1693. bool
  1694. If DataFrame is empty, return True; if not, return False.
  1695. See Also
  1696. --------
  1697. Series.dropna : Return series without null values.
  1698. DataFrame.dropna : Return DataFrame with labels on given axis omitted
  1699. where (all or any) data are missing.
  1700. Notes
  1701. -----
  1702. If DataFrame contains only NaNs, it is still not considered empty. See
  1703. the example below.
  1704. Examples
  1705. --------
  1706. An example of an actual empty DataFrame. Notice the index is empty:
  1707. >>> df_empty = pd.DataFrame({'A' : []})
  1708. >>> df_empty
  1709. Empty DataFrame
  1710. Columns: [A]
  1711. Index: []
  1712. >>> df_empty.empty
  1713. True
  1714. If we only have NaNs in our DataFrame, it is not considered empty! We
  1715. will need to drop the NaNs to make the DataFrame empty:
  1716. >>> df = pd.DataFrame({'A' : [np.nan]})
  1717. >>> df
  1718. A
  1719. 0 NaN
  1720. >>> df.empty
  1721. False
  1722. >>> df.dropna().empty
  1723. True
  1724. """
  1725. return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)
  1726. # ----------------------------------------------------------------------
  1727. # Array Interface
  1728. # This is also set in IndexOpsMixin
  1729. # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
  1730. __array_priority__ = 1000
  1731. def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
  1732. return np.asarray(self._values, dtype=dtype)
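# Illustrative sketch: ``np.asarray`` dispatches here, e.g.
# ``np.asarray(pd.Series([1, 2]))`` gives ``array([1, 2])``; passing
# ``dtype`` coerces, e.g. ``np.asarray(pd.Series([1, 2]), dtype="float64")``
# gives ``array([1., 2.])``.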
  1733. @final
  1734. def __array_ufunc__(
  1735. self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
  1736. ):
  1737. return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
  1738. # ----------------------------------------------------------------------
  1739. # Picklability
  1740. @final
  1741. def __getstate__(self) -> dict[str, Any]:
  1742. meta = {k: getattr(self, k, None) for k in self._metadata}
  1743. return {
  1744. "_mgr": self._mgr,
  1745. "_typ": self._typ,
  1746. "_metadata": self._metadata,
  1747. "attrs": self.attrs,
  1748. "_flags": {k: self.flags[k] for k in self.flags._keys},
  1749. **meta,
  1750. }
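# Sketch of the round trip this state dict supports (hypothetical frame):
#     df = pd.DataFrame({"a": [1]})
#     df.attrs["source"] = "demo"
#     restored = pickle.loads(pickle.dumps(df))
#     restored.attrs  # -> {'source': 'demo'}
# ``attrs`` and the ``flags`` values survive because they are serialized
# explicitly here and re-applied in ``__setstate__`` below.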
  1751. @final
  1752. def __setstate__(self, state):
  1753. if isinstance(state, BlockManager):
  1754. self._mgr = state
  1755. elif isinstance(state, dict):
  1756. if "_data" in state and "_mgr" not in state:
  1757. # compat for older pickles
  1758. state["_mgr"] = state.pop("_data")
  1759. typ = state.get("_typ")
  1760. if typ is not None:
  1761. attrs = state.get("_attrs", {})
  1762. object.__setattr__(self, "_attrs", attrs)
  1763. flags = state.get("_flags", {"allows_duplicate_labels": True})
  1764. object.__setattr__(self, "_flags", Flags(self, **flags))
  1765. # set in the order of internal names
  1766. # to avoid definitional recursion
  1767. # e.g. say fill_value needing _mgr to be
  1768. # defined
  1769. meta = set(self._internal_names + self._metadata)
  1770. for k in list(meta):
  1771. if k in state and k != "_flags":
  1772. v = state[k]
  1773. object.__setattr__(self, k, v)
  1774. for k, v in state.items():
  1775. if k not in meta:
  1776. object.__setattr__(self, k, v)
  1777. else:
  1778. raise NotImplementedError("Pre-0.12 pickles are no longer supported")
  1779. elif len(state) == 2:
  1780. raise NotImplementedError("Pre-0.12 pickles are no longer supported")
  1781. self._item_cache = {}
  1782. # ----------------------------------------------------------------------
  1783. # Rendering Methods
  1784. def __repr__(self) -> str:
  1785. # string representation based upon iterating over self
  1786. # (since, by definition, `PandasContainers` are iterable)
  1787. prepr = f"[{','.join(map(pprint_thing, self))}]"
  1788. return f"{type(self).__name__}({prepr})"
  1789. @final
  1790. def _repr_latex_(self):
  1791. """
  1792. Returns a LaTeX representation for a particular object.
  1793. Mainly for use with nbconvert (jupyter notebook conversion to pdf).
  1794. """
  1795. if config.get_option("display.latex.repr"):
  1796. return self.to_latex()
  1797. else:
  1798. return None
  1799. @final
  1800. def _repr_data_resource_(self):
  1801. """
  1802. Not a real Jupyter special repr method, but we use the same
  1803. naming convention.
  1804. """
  1805. if config.get_option("display.html.table_schema"):
  1806. data = self.head(config.get_option("display.max_rows"))
  1807. as_json = data.to_json(orient="table")
  1808. as_json = cast(str, as_json)
  1809. return json.loads(as_json, object_pairs_hook=collections.OrderedDict)
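# Illustrative usage (a sketch; requires the option to be enabled):
#     pd.set_option("display.html.table_schema", True)
#     payload = pd.DataFrame({"a": [1]})._repr_data_resource_()
#     sorted(payload)  # -> ['data', 'schema']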
  1810. # ----------------------------------------------------------------------
  1811. # I/O Methods
  1812. @final
  1813. @doc(klass="object", storage_options=_shared_docs["storage_options"])
  1814. def to_excel(
  1815. self,
  1816. excel_writer,
  1817. sheet_name: str = "Sheet1",
  1818. na_rep: str = "",
  1819. float_format: str | None = None,
  1820. columns=None,
  1821. header=True,
  1822. index=True,
  1823. index_label=None,
  1824. startrow=0,
  1825. startcol=0,
  1826. engine=None,
  1827. merge_cells=True,
  1828. encoding=None,
  1829. inf_rep="inf",
  1830. verbose=True,
  1831. freeze_panes=None,
  1832. storage_options: StorageOptions = None,
  1833. ) -> None:
  1834. """
  1835. Write {klass} to an Excel sheet.
  1836. To write a single {klass} to an Excel .xlsx file it is only necessary to
  1837. specify a target file name. To write to multiple sheets it is necessary to
  1838. create an `ExcelWriter` object with a target file name, and specify a sheet
  1839. in the file to write to.
  1840. Multiple sheets may be written to by specifying unique `sheet_name`.
  1841. With all data written to the file it is necessary to save the changes.
  1842. Note that creating an `ExcelWriter` object with a file name that already
  1843. exists will result in the contents of the existing file being erased.
  1844. Parameters
  1845. ----------
  1846. excel_writer : path-like, file-like, or ExcelWriter object
  1847. File path or existing ExcelWriter.
  1848. sheet_name : str, default 'Sheet1'
  1849. Name of sheet which will contain DataFrame.
  1850. na_rep : str, default ''
  1851. Missing data representation.
  1852. float_format : str, optional
  1853. Format string for floating point numbers. For example
  1854. ``float_format="%.2f"`` will format 0.1234 to 0.12.
  1855. columns : sequence or list of str, optional
  1856. Columns to write.
  1857. header : bool or list of str, default True
  1858. Write out the column names. If a list of string is given it is
  1859. assumed to be aliases for the column names.
  1860. index : bool, default True
  1861. Write row names (index).
  1862. index_label : str or sequence, optional
  1863. Column label for index column(s) if desired. If not specified, and
  1864. `header` and `index` are True, then the index names are used. A
  1865. sequence should be given if the DataFrame uses MultiIndex.
  1866. startrow : int, default 0
  1867. Upper left cell row to dump data frame.
  1868. startcol : int, default 0
  1869. Upper left cell column to dump data frame.
  1870. engine : str, optional
  1871. Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
  1872. via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
  1873. ``io.excel.xlsm.writer``.
  1874. .. deprecated:: 1.2.0
  1875. As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer
  1876. maintained, the ``xlwt`` engine will be removed in a future version
  1877. of pandas.
  1878. merge_cells : bool, default True
  1879. Write MultiIndex and Hierarchical Rows as merged cells.
  1880. encoding : str, optional
  1881. Encoding of the resulting excel file. Only necessary for xlwt,
  1882. other writers support unicode natively.
  1883. inf_rep : str, default 'inf'
  1884. Representation for infinity (there is no native representation for
  1885. infinity in Excel).
  1886. verbose : bool, default True
  1887. Display more information in the error logs.
  1888. freeze_panes : tuple of int (length 2), optional
  1889. Specifies the one-based bottommost row and rightmost column that
  1890. is to be frozen.
  1891. {storage_options}
  1892. .. versionadded:: 1.2.0
  1893. See Also
  1894. --------
  1895. to_csv : Write DataFrame to a comma-separated values (csv) file.
  1896. ExcelWriter : Class for writing DataFrame objects into excel sheets.
  1897. read_excel : Read an Excel file into a pandas DataFrame.
  1898. read_csv : Read a comma-separated values (csv) file into DataFrame.
  1899. Notes
  1900. -----
  1901. For compatibility with :meth:`~DataFrame.to_csv`,
  1902. to_excel serializes lists and dicts to strings before writing.
  1903. Once a workbook has been saved it is not possible to write further
  1904. data without rewriting the whole workbook.
  1905. Examples
  1906. --------
  1907. Create, write to and save a workbook:
  1908. >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
  1909. ... index=['row 1', 'row 2'],
  1910. ... columns=['col 1', 'col 2'])
  1911. >>> df1.to_excel("output.xlsx") # doctest: +SKIP
  1912. To specify the sheet name:
  1913. >>> df1.to_excel("output.xlsx",
  1914. ... sheet_name='Sheet_name_1') # doctest: +SKIP
  1915. If you wish to write to more than one sheet in the workbook, it is
  1916. necessary to specify an ExcelWriter object:
  1917. >>> df2 = df1.copy()
  1918. >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
  1919. ... df1.to_excel(writer, sheet_name='Sheet_name_1')
  1920. ... df2.to_excel(writer, sheet_name='Sheet_name_2')
  1921. ExcelWriter can also be used to append to an existing Excel file:
  1922. >>> with pd.ExcelWriter('output.xlsx',
  1923. ... mode='a') as writer: # doctest: +SKIP
  1924. ... df.to_excel(writer, sheet_name='Sheet_name_3')
  1925. To set the library that is used to write the Excel file,
  1926. you can pass the `engine` keyword (the default engine is
  1927. automatically chosen depending on the file extension):
  1928. >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
  1929. """
  1930. df = self if isinstance(self, ABCDataFrame) else self.to_frame()
  1931. from pandas.io.formats.excel import ExcelFormatter
  1932. formatter = ExcelFormatter(
  1933. df,
  1934. na_rep=na_rep,
  1935. cols=columns,
  1936. header=header,
  1937. float_format=float_format,
  1938. index=index,
  1939. index_label=index_label,
  1940. merge_cells=merge_cells,
  1941. inf_rep=inf_rep,
  1942. )
  1943. formatter.write(
  1944. excel_writer,
  1945. sheet_name=sheet_name,
  1946. startrow=startrow,
  1947. startcol=startcol,
  1948. freeze_panes=freeze_panes,
  1949. engine=engine,
  1950. storage_options=storage_options,
  1951. )
  1952. @final
  1953. @doc(storage_options=_shared_docs["storage_options"])
  1954. def to_json(
  1955. self,
  1956. path_or_buf: FilePathOrBuffer | None = None,
  1957. orient: str | None = None,
  1958. date_format: str | None = None,
  1959. double_precision: int = 10,
  1960. force_ascii: bool_t = True,
  1961. date_unit: str = "ms",
  1962. default_handler: Callable[[Any], JSONSerializable] | None = None,
  1963. lines: bool_t = False,
  1964. compression: CompressionOptions = "infer",
  1965. index: bool_t = True,
  1966. indent: int | None = None,
  1967. storage_options: StorageOptions = None,
  1968. ) -> str | None:
  1969. """
  1970. Convert the object to a JSON string.
  1971. Note that NaN's and None will be converted to null, and datetime
  1972. objects will be converted to UNIX timestamps.
  1973. Parameters
  1974. ----------
  1975. path_or_buf : str or file handle, optional
  1976. File path or object. If not specified, the result is returned as
  1977. a string.
  1978. orient : str
  1979. Indication of expected JSON string format.
  1980. * Series:
  1981. - default is 'index'
  1982. - allowed values are: {{'split', 'records', 'index', 'table'}}.
  1983. * DataFrame:
  1984. - default is 'columns'
  1985. - allowed values are: {{'split', 'records', 'index', 'columns',
  1986. 'values', 'table'}}.
  1987. * The format of the JSON string:
  1988. - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
  1989. 'data' -> [values]}}
  1990. - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
  1991. - 'index' : dict like {{index -> {{column -> value}}}}
  1992. - 'columns' : dict like {{column -> {{index -> value}}}}
  1993. - 'values' : just the values array
  1994. - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
  1995. Describing the data, where data component is like ``orient='records'``.
  1996. date_format : {{None, 'epoch', 'iso'}}
  1997. Type of date conversion. 'epoch' = epoch milliseconds,
  1998. 'iso' = ISO8601. The default depends on the `orient`. For
  1999. ``orient='table'``, the default is 'iso'. For all other orients,
  2000. the default is 'epoch'.
  2001. double_precision : int, default 10
  2002. The number of decimal places to use when encoding
  2003. floating point values.
  2004. force_ascii : bool, default True
  2005. Force encoded string to be ASCII.
  2006. date_unit : str, default 'ms' (milliseconds)
  2007. The time unit to encode to, governs timestamp and ISO8601
  2008. precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
  2009. microsecond, and nanosecond respectively.
  2010. default_handler : callable, default None
  2011. Handler to call if object cannot otherwise be converted to a
  2012. suitable format for JSON. Should receive a single argument which is
  2013. the object to convert and return a serialisable object.
  2014. lines : bool, default False
  2015. If 'orient' is 'records', write out line-delimited json format.
  2016. Raises ValueError if 'orient' is anything else, since other formats
  2017. are not list-like.
  2018. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}
  2019. A string representing the compression to use in the output file,
  2020. only used when the first argument is a filename. By default, the
  2021. compression is inferred from the filename.
  2022. index : bool, default True
  2023. Whether to include the index values in the JSON string. Not
  2024. including the index (``index=False``) is only supported when
  2025. orient is 'split' or 'table'.
  2026. indent : int, optional
  2027. Length of whitespace used to indent each record.
  2028. .. versionadded:: 1.0.0
  2029. {storage_options}
  2030. .. versionadded:: 1.2.0
  2031. Returns
  2032. -------
  2033. None or str
  2034. If path_or_buf is None, returns the resulting json format as a
  2035. string. Otherwise returns None.
  2036. See Also
  2037. --------
  2038. read_json : Convert a JSON string to pandas object.
  2039. Notes
  2040. -----
  2041. The behavior of ``indent=0`` varies from the stdlib, which does not
  2042. indent the output but does insert newlines. Currently, ``indent=0``
  2043. and the default ``indent=None`` are equivalent in pandas, though this
  2044. may change in a future release.
  2045. ``orient='table'`` contains a 'pandas_version' field under 'schema'.
  2046. This stores the version of `pandas` used in the latest revision of the
  2047. schema.
  2048. Examples
  2049. --------
  2050. >>> import json
  2051. >>> df = pd.DataFrame(
  2052. ... [["a", "b"], ["c", "d"]],
  2053. ... index=["row 1", "row 2"],
  2054. ... columns=["col 1", "col 2"],
  2055. ... )
  2056. >>> result = df.to_json(orient="split")
  2057. >>> parsed = json.loads(result)
  2058. >>> json.dumps(parsed, indent=4) # doctest: +SKIP
  2059. {{
  2060. "columns": [
  2061. "col 1",
  2062. "col 2"
  2063. ],
  2064. "index": [
  2065. "row 1",
  2066. "row 2"
  2067. ],
  2068. "data": [
  2069. [
  2070. "a",
  2071. "b"
  2072. ],
  2073. [
  2074. "c",
  2075. "d"
  2076. ]
  2077. ]
  2078. }}
  2079. Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
  2080. Note that index labels are not preserved with this encoding.
  2081. >>> result = df.to_json(orient="records")
  2082. >>> parsed = json.loads(result)
  2083. >>> json.dumps(parsed, indent=4) # doctest: +SKIP
  2084. [
  2085. {{
  2086. "col 1": "a",
  2087. "col 2": "b"
  2088. }},
  2089. {{
  2090. "col 1": "c",
  2091. "col 2": "d"
  2092. }}
  2093. ]
  2094. Encoding/decoding a DataFrame using ``'index'`` formatted JSON:
  2095. >>> result = df.to_json(orient="index")
  2096. >>> parsed = json.loads(result)
  2097. >>> json.dumps(parsed, indent=4) # doctest: +SKIP
  2098. {{
  2099. "row 1": {{
  2100. "col 1": "a",
  2101. "col 2": "b"
  2102. }},
  2103. "row 2": {{
  2104. "col 1": "c",
  2105. "col 2": "d"
  2106. }}
  2107. }}
  2108. Encoding/decoding a DataFrame using ``'columns'`` formatted JSON:
  2109. >>> result = df.to_json(orient="columns")
  2110. >>> parsed = json.loads(result)
  2111. >>> json.dumps(parsed, indent=4) # doctest: +SKIP
  2112. {{
  2113. "col 1": {{
  2114. "row 1": "a",
  2115. "row 2": "c"
  2116. }},
  2117. "col 2": {{
  2118. "row 1": "b",
  2119. "row 2": "d"
  2120. }}
  2121. }}
  2122. Encoding/decoding a DataFrame using ``'values'`` formatted JSON:
  2123. >>> result = df.to_json(orient="values")
  2124. >>> parsed = json.loads(result)
  2125. >>> json.dumps(parsed, indent=4) # doctest: +SKIP
  2126. [
  2127. [
  2128. "a",
  2129. "b"
  2130. ],
  2131. [
  2132. "c",
  2133. "d"
  2134. ]
  2135. ]
  2136. Encoding with Table Schema:
  2137. >>> result = df.to_json(orient="table")
  2138. >>> parsed = json.loads(result)
  2139. >>> json.dumps(parsed, indent=4) # doctest: +SKIP
  2140. {{
  2141. "schema": {{
  2142. "fields": [
  2143. {{
  2144. "name": "index",
  2145. "type": "string"
  2146. }},
  2147. {{
  2148. "name": "col 1",
  2149. "type": "string"
  2150. }},
  2151. {{
  2152. "name": "col 2",
  2153. "type": "string"
  2154. }}
  2155. ],
  2156. "primaryKey": [
  2157. "index"
  2158. ],
  2159. "pandas_version": "0.20.0"
  2160. }},
  2161. "data": [
  2162. {{
  2163. "index": "row 1",
  2164. "col 1": "a",
  2165. "col 2": "b"
  2166. }},
  2167. {{
  2168. "index": "row 2",
  2169. "col 1": "c",
  2170. "col 2": "d"
  2171. }}
  2172. ]
  2173. }}
  2174. """
  2175. from pandas.io import json
  2176. if date_format is None and orient == "table":
  2177. date_format = "iso"
  2178. elif date_format is None:
  2179. date_format = "epoch"
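  # Validate that ``indent`` is a non-negative integer or None (raises
  # otherwise), then normalize None to 0, i.e. no indentation.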
  2180. config.is_nonnegative_int(indent)
  2181. indent = indent or 0
  2182. return json.to_json(
  2183. path_or_buf=path_or_buf,
  2184. obj=self,
  2185. orient=orient,
  2186. date_format=date_format,
  2187. double_precision=double_precision,
  2188. force_ascii=force_ascii,
  2189. date_unit=date_unit,
  2190. default_handler=default_handler,
  2191. lines=lines,
  2192. compression=compression,
  2193. index=index,
  2194. indent=indent,
  2195. storage_options=storage_options,
  2196. )
  2197. @final
  2198. def to_hdf(
  2199. self,
  2200. path_or_buf,
  2201. key: str,
  2202. mode: str = "a",
  2203. complevel: int | None = None,
  2204. complib: str | None = None,
  2205. append: bool_t = False,
  2206. format: str | None = None,
  2207. index: bool_t = True,
  2208. min_itemsize: int | dict[str, int] | None = None,
  2209. nan_rep=None,
  2210. dropna: bool_t | None = None,
  2211. data_columns: bool_t | list[str] | None = None,
  2212. errors: str = "strict",
  2213. encoding: str = "UTF-8",
  2214. ) -> None:
  2215. """
  2216. Write the contained data to an HDF5 file using HDFStore.
  2217. Hierarchical Data Format (HDF) is self-describing, allowing an
  2218. application to interpret the structure and contents of a file with
  2219. no outside information. One HDF file can hold a mix of related objects
  2220. which can be accessed as a group or as individual objects.
  2221. In order to add another DataFrame or Series to an existing HDF file
  2222. please use append mode and a different key.
  2223. .. warning::
  2224. One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
  2225. but the type of the subclass is lost upon storing.
  2226. For more information see the :ref:`user guide <io.hdf5>`.
  2227. Parameters
  2228. ----------
  2229. path_or_buf : str or pandas.HDFStore
  2230. File path or HDFStore object.
  2231. key : str
  2232. Identifier for the group in the store.
  2233. mode : {'a', 'w', 'r+'}, default 'a'
  2234. Mode to open file:
  2235. - 'w': write, a new file is created (an existing file with
  2236. the same name would be deleted).
  2237. - 'a': append, an existing file is opened for reading and
  2238. writing, and if the file does not exist it is created.
  2239. - 'r+': similar to 'a', but the file must already exist.
  2240. complevel : {0-9}, default None
  2241. Specifies a compression level for data.
  2242. A value of 0 or None disables compression.
  2243. complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
  2244. Specifies the compression library to be used.
  2245. As of v0.20.2 these additional compressors for Blosc are supported
  2246. (default if no compressor specified: 'blosc:blosclz'):
  2247. {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
  2248. 'blosc:zlib', 'blosc:zstd'}.
  2249. Specifying a compression library which is not available issues
  2250. a ValueError.
  2251. append : bool, default False
  2252. For Table formats, append the input data to the existing.
  2253. format : {'fixed', 'table', None}, default 'fixed'
  2254. Possible values:
  2255. - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
  2256. nor searchable.
  2257. - 'table': Table format. Write as a PyTables Table structure
  2258. which may perform worse but allow more flexible operations
  2259. like searching / selecting subsets of the data.
  2260. - If None, pd.get_option('io.hdf.default_format') is checked,
  2261. followed by fallback to "fixed".
  2262. errors : str, default 'strict'
  2263. Specifies how encoding and decoding errors are to be handled.
  2264. See the errors argument for :func:`open` for a full list
  2265. of options.
  2266. encoding : str, default "UTF-8"
  2267. min_itemsize : dict or int, optional
  2268. Map column names to minimum string sizes for columns.
  2269. nan_rep : Any, optional
  2270. How to represent null values as str.
  2271. Not allowed with append=True.
  2272. data_columns : list of columns or True, optional
  2273. List of columns to create as indexed data columns for on-disk
  2274. queries, or True to use all columns. By default only the axes
  2275. of the object are indexed. See :ref:`io.hdf5-query-data-columns`.
  2276. Applicable only to format='table'.
  2277. See Also
  2278. --------
  2279. read_hdf : Read from HDF file.
  2280. DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
  2281. DataFrame.to_sql : Write to a SQL table.
  2282. DataFrame.to_feather : Write out feather-format for DataFrames.
  2283. DataFrame.to_csv : Write out to a csv file.
  2284. Examples
  2285. --------
  2286. >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
  2287. ... index=['a', 'b', 'c'])
  2288. >>> df.to_hdf('data.h5', key='df', mode='w')
  2289. We can add another object to the same file:
  2290. >>> s = pd.Series([1, 2, 3, 4])
  2291. >>> s.to_hdf('data.h5', key='s')
  2292. Reading from HDF file:
  2293. >>> pd.read_hdf('data.h5', 'df')
  2294. A B
  2295. a 1 4
  2296. b 2 5
  2297. c 3 6
  2298. >>> pd.read_hdf('data.h5', 's')
  2299. 0 1
  2300. 1 2
  2301. 2 3
  2302. 3 4
  2303. dtype: int64
  2304. Deleting file with data:
  2305. >>> import os
  2306. >>> os.remove('data.h5')
  2307. """
  2308. from pandas.io import pytables
  2309. # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
  2310. # "Union[DataFrame, Series]" [arg-type]
  2311. pytables.to_hdf(
  2312. path_or_buf,
  2313. key,
  2314. self, # type: ignore[arg-type]
  2315. mode=mode,
  2316. complevel=complevel,
  2317. complib=complib,
  2318. append=append,
  2319. format=format,
  2320. index=index,
  2321. min_itemsize=min_itemsize,
  2322. nan_rep=nan_rep,
  2323. dropna=dropna,
  2324. data_columns=data_columns,
  2325. errors=errors,
  2326. encoding=encoding,
  2327. )
  2328. @final
  2329. def to_sql(
  2330. self,
  2331. name: str,
  2332. con,
  2333. schema=None,
  2334. if_exists: str = "fail",
  2335. index: bool_t = True,
  2336. index_label=None,
  2337. chunksize=None,
  2338. dtype: DtypeArg | None = None,
  2339. method=None,
  2340. ) -> None:
  2341. """
  2342. Write records stored in a DataFrame to a SQL database.
  2343. Databases supported by SQLAlchemy [1]_ are supported. Tables can be
  2344. newly created, appended to, or overwritten.
  2345. Parameters
  2346. ----------
  2347. name : str
  2348. Name of SQL table.
  2349. con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
  2350. Using SQLAlchemy makes it possible to use any DB supported by that
  2351. library. Legacy support is provided for sqlite3.Connection objects. The user
  2352. is responsible for engine disposal and connection closure for the SQLAlchemy
  2353. connectable. See `here \
  2354. <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
  2355. schema : str, optional
  2356. Specify the schema (if database flavor supports this). If None, use
  2357. default schema.
  2358. if_exists : {'fail', 'replace', 'append'}, default 'fail'
  2359. How to behave if the table already exists.
  2360. * fail: Raise a ValueError.
  2361. * replace: Drop the table before inserting new values.
  2362. * append: Insert new values to the existing table.
  2363. index : bool, default True
  2364. Write DataFrame index as a column. Uses `index_label` as the column
  2365. name in the table.
  2366. index_label : str or sequence, default None
  2367. Column label for index column(s). If None is given (default) and
  2368. `index` is True, then the index names are used.
  2369. A sequence should be given if the DataFrame uses MultiIndex.
  2370. chunksize : int, optional
  2371. Specify the number of rows in each batch to be written at a time.
  2372. By default, all rows will be written at once.
  2373. dtype : dict or scalar, optional
  2374. Specifying the datatype for columns. If a dictionary is used, the
  2375. keys should be the column names and the values should be the
  2376. SQLAlchemy types or strings for the sqlite3 legacy mode. If a
  2377. scalar is provided, it will be applied to all columns.
  2378. method : {None, 'multi', callable}, optional
  2379. Controls the SQL insertion clause used:
  2380. * None : Uses standard SQL ``INSERT`` clause (one per row).
  2381. * 'multi': Pass multiple values in a single ``INSERT`` clause.
  2382. * callable with signature ``(pd_table, conn, keys, data_iter)``.
  2383. Details and a sample callable implementation can be found in the
  2384. section :ref:`insert method <io.sql.method>`.
  2385. Raises
  2386. ------
  2387. ValueError
  2388. When the table already exists and `if_exists` is 'fail' (the
  2389. default).
  2390. See Also
  2391. --------
  2392. read_sql : Read a DataFrame from a table.
  2393. Notes
  2394. -----
  2395. Timezone aware datetime columns will be written as
  2396. ``Timestamp with timezone`` type with SQLAlchemy if supported by the
  2397. database. Otherwise, the datetimes will be stored as timezone unaware
  2398. timestamps local to the original timezone.
  2399. References
  2400. ----------
  2401. .. [1] https://docs.sqlalchemy.org
  2402. .. [2] https://www.python.org/dev/peps/pep-0249/
  2403. Examples
  2404. --------
  2405. Create an in-memory SQLite database.
  2406. >>> from sqlalchemy import create_engine
  2407. >>> engine = create_engine('sqlite://', echo=False)
  2408. Create a table from scratch with 3 rows.
  2409. >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
  2410. >>> df
  2411. name
  2412. 0 User 1
  2413. 1 User 2
  2414. 2 User 3
  2415. >>> df.to_sql('users', con=engine)
  2416. >>> engine.execute("SELECT * FROM users").fetchall()
  2417. [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
  2418. An `sqlalchemy.engine.Connection` can also be passed to `con`:
  2419. >>> with engine.begin() as connection:
  2420. ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
  2421. ... df1.to_sql('users', con=connection, if_exists='append')
  2422. This is allowed to support operations that require that the same
  2423. DBAPI connection is used for the entire operation.
  2424. >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
  2425. >>> df2.to_sql('users', con=engine, if_exists='append')
  2426. >>> engine.execute("SELECT * FROM users").fetchall()
  2427. [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
  2428. (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
  2429. (1, 'User 7')]
  2430. Overwrite the table with just ``df2``.
  2431. >>> df2.to_sql('users', con=engine, if_exists='replace',
  2432. ... index_label='id')
  2433. >>> engine.execute("SELECT * FROM users").fetchall()
  2434. [(0, 'User 6'), (1, 'User 7')]
  2435. Specify the dtype (especially useful for integers with missing values).
  2436. Notice that while pandas is forced to store the data as floating point,
  2437. the database supports nullable integers. When fetching the data with
  2438. Python, we get back integer scalars.
  2439. >>> df = pd.DataFrame({"A": [1, None, 2]})
  2440. >>> df
  2441. A
  2442. 0 1.0
  2443. 1 NaN
  2444. 2 2.0
  2445. >>> from sqlalchemy.types import Integer
  2446. >>> df.to_sql('integers', con=engine, index=False,
  2447. ... dtype={"A": Integer()})
  2448. >>> engine.execute("SELECT * FROM integers").fetchall()
  2449. [(1,), (None,), (2,)]
  2450. """
  2451. from pandas.io import sql
  2452. sql.to_sql(
  2453. self,
  2454. name,
  2455. con,
  2456. schema=schema,
  2457. if_exists=if_exists,
  2458. index=index,
  2459. index_label=index_label,
  2460. chunksize=chunksize,
  2461. dtype=dtype,
  2462. method=method,
  2463. )
  2464. @final
  2465. @doc(storage_options=_shared_docs["storage_options"])
  2466. def to_pickle(
  2467. self,
  2468. path,
  2469. compression: CompressionOptions = "infer",
  2470. protocol: int = pickle.HIGHEST_PROTOCOL,
  2471. storage_options: StorageOptions = None,
  2472. ) -> None:
  2473. """
  2474. Pickle (serialize) object to file.
  2475. Parameters
  2476. ----------
  2477. path : str
  2478. File path where the pickled object will be stored.
  2479. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, \
  2480. default 'infer'
  2481. A string representing the compression to use in the output file. By
  2482. default, infers from the file extension in specified path.
  2483. Compression mode may be any of the following possible
  2484. values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If compression
  2485. mode is 'infer' and path_or_buf is path-like, then detect
  2486. compression mode from the following extensions:
  2487. '.gz', '.bz2', '.zip' or '.xz' (otherwise no compression).
  2488. If dict given and mode is 'zip' or inferred as 'zip', other entries
  2489. passed as additional compression options.
  2490. protocol : int
  2491. Int which indicates which protocol should be used by the pickler,
  2492. default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
  2493. values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
  2494. parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
  2495. .. [1] https://docs.python.org/3/library/pickle.html.
  2496. {storage_options}
  2497. .. versionadded:: 1.2.0
  2498. See Also
  2499. --------
  2500. read_pickle : Load pickled pandas object (or any object) from file.
  2501. DataFrame.to_hdf : Write DataFrame to an HDF5 file.
  2502. DataFrame.to_sql : Write DataFrame to a SQL database.
  2503. DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
  2504. Examples
  2505. --------
  2506. >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}})
  2507. >>> original_df
  2508. foo bar
  2509. 0 0 5
  2510. 1 1 6
  2511. 2 2 7
  2512. 3 3 8
  2513. 4 4 9
  2514. >>> original_df.to_pickle("./dummy.pkl")
  2515. >>> unpickled_df = pd.read_pickle("./dummy.pkl")
  2516. >>> unpickled_df
  2517. foo bar
  2518. 0 0 5
  2519. 1 1 6
  2520. 2 2 7
  2521. 3 3 8
  2522. 4 4 9
  2523. >>> import os
  2524. >>> os.remove("./dummy.pkl")
  2525. """
  2526. from pandas.io.pickle import to_pickle
  2527. to_pickle(
  2528. self,
  2529. path,
  2530. compression=compression,
  2531. protocol=protocol,
  2532. storage_options=storage_options,
  2533. )
  2534. @final
  2535. def to_clipboard(
  2536. self, excel: bool_t = True, sep: str | None = None, **kwargs
  2537. ) -> None:
  2538. r"""
  2539. Copy object to the system clipboard.
  2540. Write a text representation of object to the system clipboard.
  2541. This can be pasted into Excel, for example.
  2542. Parameters
  2543. ----------
  2544. excel : bool, default True
  2545. Produce output in a csv format for easy pasting into excel.
  2546. - True, use the provided separator for csv pasting.
  2547. - False, write a string representation of the object to the clipboard.
  2548. sep : str, default ``'\t'``
  2549. Field delimiter.
  2550. **kwargs
  2551. These parameters will be passed to DataFrame.to_csv.
  2552. See Also
  2553. --------
  2554. DataFrame.to_csv : Write a DataFrame to a comma-separated values
  2555. (csv) file.
  2556. read_clipboard : Read text from clipboard and pass to read_csv.
  2557. Notes
  2558. -----
  2559. Requirements for your platform:
  2560. - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
  2561. - Windows : none
  2562. - macOS : none
  2563. Examples
  2564. --------
  2565. Copy the contents of a DataFrame to the clipboard.
  2566. >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
  2567. >>> df.to_clipboard(sep=',') # doctest: +SKIP
  2568. ... # Wrote the following to the system clipboard:
  2569. ... # ,A,B,C
  2570. ... # 0,1,2,3
  2571. ... # 1,4,5,6
  2572. We can omit the index by passing the keyword `index` and setting
  2573. it to false.
  2574. >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
  2575. ... # Wrote the following to the system clipboard:
  2576. ... # A,B,C
  2577. ... # 1,2,3
  2578. ... # 4,5,6
  2579. """
  2580. from pandas.io import clipboards
  2581. clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
  2582. @final
  2583. def to_xarray(self):
  2584. """
  2585. Return an xarray object from the pandas object.
  2586. Returns
  2587. -------
  2588. xarray.DataArray or xarray.Dataset
  2589. Data in the pandas structure converted to Dataset if the object is
  2590. a DataFrame, or a DataArray if the object is a Series.
  2591. See Also
  2592. --------
  2593. DataFrame.to_hdf : Write DataFrame to an HDF5 file.
  2594. DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
  2595. Notes
  2596. -----
  2597. See the `xarray docs <https://xarray.pydata.org/en/stable/>`__
  2598. Examples
  2599. --------
  2600. >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
  2601. ... ('parrot', 'bird', 24.0, 2),
  2602. ... ('lion', 'mammal', 80.5, 4),
  2603. ... ('monkey', 'mammal', np.nan, 4)],
  2604. ... columns=['name', 'class', 'max_speed',
  2605. ... 'num_legs'])
  2606. >>> df
  2607. name class max_speed num_legs
  2608. 0 falcon bird 389.0 2
  2609. 1 parrot bird 24.0 2
  2610. 2 lion mammal 80.5 4
  2611. 3 monkey mammal NaN 4
  2612. >>> df.to_xarray()
  2613. <xarray.Dataset>
  2614. Dimensions: (index: 4)
  2615. Coordinates:
  2616. * index (index) int64 0 1 2 3
  2617. Data variables:
  2618. name (index) object 'falcon' 'parrot' 'lion' 'monkey'
  2619. class (index) object 'bird' 'bird' 'mammal' 'mammal'
  2620. max_speed (index) float64 389.0 24.0 80.5 nan
  2621. num_legs (index) int64 2 2 4 4
  2622. >>> df['max_speed'].to_xarray()
  2623. <xarray.DataArray 'max_speed' (index: 4)>
  2624. array([389. , 24. , 80.5, nan])
  2625. Coordinates:
  2626. * index (index) int64 0 1 2 3
  2627. >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
  2628. ... '2018-01-02', '2018-01-02'])
  2629. >>> df_multiindex = pd.DataFrame({'date': dates,
  2630. ... 'animal': ['falcon', 'parrot',
  2631. ... 'falcon', 'parrot'],
  2632. ... 'speed': [350, 18, 361, 15]})
  2633. >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
  2634. >>> df_multiindex
  2635. speed
  2636. date animal
  2637. 2018-01-01 falcon 350
  2638. parrot 18
  2639. 2018-01-02 falcon 361
  2640. parrot 15
  2641. >>> df_multiindex.to_xarray()
  2642. <xarray.Dataset>
  2643. Dimensions: (animal: 2, date: 2)
  2644. Coordinates:
  2645. * date (date) datetime64[ns] 2018-01-01 2018-01-02
  2646. * animal (animal) object 'falcon' 'parrot'
  2647. Data variables:
  2648. speed (date, animal) int64 350 18 361 15
  2649. """
  2650. xarray = import_optional_dependency("xarray")
  2651. if self.ndim == 1:
  2652. return xarray.DataArray.from_series(self)
  2653. else:
  2654. return xarray.Dataset.from_dataframe(self)
  2655. @final
  2656. @doc(returns=fmt.return_docstring)
  2657. def to_latex(
  2658. self,
  2659. buf=None,
  2660. columns=None,
  2661. col_space=None,
  2662. header=True,
  2663. index=True,
  2664. na_rep="NaN",
  2665. formatters=None,
  2666. float_format=None,
  2667. sparsify=None,
  2668. index_names=True,
  2669. bold_rows=False,
  2670. column_format=None,
  2671. longtable=None,
  2672. escape=None,
  2673. encoding=None,
  2674. decimal=".",
  2675. multicolumn=None,
  2676. multicolumn_format=None,
  2677. multirow=None,
  2678. caption=None,
  2679. label=None,
  2680. position=None,
  2681. ):
  2682. r"""
  2683. Render object to a LaTeX tabular, longtable, or nested table/tabular.
  2684. Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted
  2685. into a main LaTeX document or read from an external file
  2686. with ``\input{{table.tex}}``.
  2687. .. versionchanged:: 1.0.0
  2688. Added caption and label arguments.
  2689. .. versionchanged:: 1.2.0
  2690. Added position argument, changed meaning of caption argument.
  2691. Parameters
  2692. ----------
  2693. buf : str, Path or StringIO-like, optional, default None
  2694. Buffer to write to. If None, the output is returned as a string.
  2695. columns : list of label, optional
  2696. The subset of columns to write. Writes all columns by default.
  2697. col_space : int, optional
  2698. The minimum width of each column.
  2699. header : bool or list of str, default True
  2700. Write out the column names. If a list of strings is given,
  2701. it is assumed to be aliases for the column names.
  2702. index : bool, default True
  2703. Write row names (index).
  2704. na_rep : str, default 'NaN'
  2705. Missing data representation.
  2706. formatters : list of functions or dict of {{str: function}}, optional
  2707. Formatter functions to apply to columns' elements by position or
  2708. name. The result of each function must be a unicode string.
  2709. List must be of length equal to the number of columns.
  2710. float_format : one-parameter function or str, optional, default None
  2711. Formatter for floating point numbers. For example
  2712. ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will
  2713. both result in 0.1234 being formatted as 0.12.
  2714. sparsify : bool, optional
  2715. Set to False for a DataFrame with a hierarchical index to print
  2716. every multiindex key at each row. By default, the value will be
  2717. read from the config module.
  2718. index_names : bool, default True
  2719. Prints the names of the indexes.
  2720. bold_rows : bool, default False
  2721. Make the row labels bold in the output.
  2722. column_format : str, optional
  2723. The columns format as specified in `LaTeX table format
  2724. <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
  2725. columns. By default, 'l' will be used for all columns except
  2726. columns of numbers, which default to 'r'.
  2727. longtable : bool, optional
  2728. Use a longtable environment instead of tabular. Requires
  2729. adding a \usepackage{{longtable}} to your LaTeX preamble.
  2730. By default, the value will be read from the pandas config module.
  2731. escape : bool, optional
  2732. By default, the value will be read from the pandas config
  2733. module. When set to False, prevents escaping of LaTeX special
  2734. characters in column names.
  2735. encoding : str, optional
  2736. A string representing the encoding to use in the output file,
  2737. defaults to 'utf-8'.
  2738. decimal : str, default '.'
  2739. Character recognized as decimal separator, e.g. ',' in Europe.
  2740. multicolumn : bool, default True
  2741. Use \multicolumn to enhance MultiIndex columns.
  2742. The default will be read from the config module.
  2743. multicolumn_format : str, default 'l'
  2744. The alignment for multicolumns, similar to `column_format`
  2745. The default will be read from the config module.
  2746. multirow : bool, default False
  2747. Use \multirow to enhance MultiIndex rows. Requires adding a
  2748. \usepackage{{multirow}} to your LaTeX preamble. Will print
  2749. centered labels (instead of top-aligned) across the contained
  2750. rows, separating groups via clines. The default will be read
  2751. from the pandas config module.
  2752. caption : str or tuple, optional
  2753. Tuple (full_caption, short_caption),
  2754. which results in ``\caption[short_caption]{{full_caption}}``;
  2755. if a single string is passed, no short caption will be set.
  2756. .. versionadded:: 1.0.0
  2757. .. versionchanged:: 1.2.0
  2758. Optionally allow caption to be a tuple ``(full_caption, short_caption)``.
  2759. label : str, optional
  2760. The LaTeX label to be placed inside ``\label{{}}`` in the output.
  2761. This is used with ``\ref{{}}`` in the main ``.tex`` file.
  2762. .. versionadded:: 1.0.0
  2763. position : str, optional
  2764. The LaTeX positional argument for tables, to be placed after
  2765. ``\begin{{}}`` in the output.
  2766. .. versionadded:: 1.2.0
  2767. {returns}
  2768. See Also
  2769. --------
  2770. DataFrame.to_string : Render a DataFrame to a console-friendly
  2771. tabular output.
  2772. DataFrame.to_html : Render a DataFrame as an HTML table.
  2773. Examples
  2774. --------
  2775. >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
  2776. ... mask=['red', 'purple'],
  2777. ... weapon=['sai', 'bo staff']))
  2778. >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE
  2779. \begin{{tabular}}{{lll}}
  2780. \toprule
  2781. name & mask & weapon \\
  2782. \midrule
  2783. Raphael & red & sai \\
  2784. Donatello & purple & bo staff \\
  2785. \bottomrule
  2786. \end{{tabular}}
  2787. """
  2788. # Get defaults from the pandas config
  2789. if self.ndim == 1:
  2790. self = self.to_frame()
  2791. if longtable is None:
  2792. longtable = config.get_option("display.latex.longtable")
  2793. if escape is None:
  2794. escape = config.get_option("display.latex.escape")
  2795. if multicolumn is None:
  2796. multicolumn = config.get_option("display.latex.multicolumn")
  2797. if multicolumn_format is None:
  2798. multicolumn_format = config.get_option("display.latex.multicolumn_format")
  2799. if multirow is None:
  2800. multirow = config.get_option("display.latex.multirow")
  2801. self = cast("DataFrame", self)
  2802. formatter = DataFrameFormatter(
  2803. self,
  2804. columns=columns,
  2805. col_space=col_space,
  2806. na_rep=na_rep,
  2807. header=header,
  2808. index=index,
  2809. formatters=formatters,
  2810. float_format=float_format,
  2811. bold_rows=bold_rows,
  2812. sparsify=sparsify,
  2813. index_names=index_names,
  2814. escape=escape,
  2815. decimal=decimal,
  2816. )
  2817. return DataFrameRenderer(formatter).to_latex(
  2818. buf=buf,
  2819. column_format=column_format,
  2820. longtable=longtable,
  2821. encoding=encoding,
  2822. multicolumn=multicolumn,
  2823. multicolumn_format=multicolumn_format,
  2824. multirow=multirow,
  2825. caption=caption,
  2826. label=label,
  2827. position=position,
  2828. )
  2829. @final
  2830. @doc(storage_options=_shared_docs["storage_options"])
  2831. def to_csv(
  2832. self,
  2833. path_or_buf: FilePathOrBuffer[AnyStr] | None = None,
  2834. sep: str = ",",
  2835. na_rep: str = "",
  2836. float_format: str | None = None,
  2837. columns: Sequence[Hashable] | None = None,
  2838. header: bool_t | list[str] = True,
  2839. index: bool_t = True,
  2840. index_label: IndexLabel | None = None,
  2841. mode: str = "w",
  2842. encoding: str | None = None,
  2843. compression: CompressionOptions = "infer",
  2844. quoting: int | None = None,
  2845. quotechar: str = '"',
  2846. line_terminator: str | None = None,
  2847. chunksize: int | None = None,
  2848. date_format: str | None = None,
  2849. doublequote: bool_t = True,
  2850. escapechar: str | None = None,
  2851. decimal: str = ".",
  2852. errors: str = "strict",
  2853. storage_options: StorageOptions = None,
  2854. ) -> str | None:
  2855. r"""
  2856. Write object to a comma-separated values (csv) file.
  2857. Parameters
  2858. ----------
  2859. path_or_buf : str or file handle, default None
  2860. File path or object, if None is provided the result is returned as
  2861. a string. If a non-binary file object is passed, it should be opened
  2862. with `newline=''`, disabling universal newlines. If a binary
  2863. file object is passed, `mode` might need to contain a `'b'`.
  2864. .. versionchanged:: 1.2.0
  2865. Support for binary file objects was introduced.
  2866. sep : str, default ','
  2867. String of length 1. Field delimiter for the output file.
  2868. na_rep : str, default ''
  2869. Missing data representation.
  2870. float_format : str, default None
  2871. Format string for floating point numbers.
  2872. columns : sequence, optional
  2873. Columns to write.
  2874. header : bool or list of str, default True
  2875. Write out the column names. If a list of strings is given it is
  2876. assumed to be aliases for the column names.
  2877. index : bool, default True
  2878. Write row names (index).
  2879. index_label : str or sequence, or False, default None
  2880. Column label for index column(s) if desired. If None is given, and
  2881. `header` and `index` are True, then the index names are used. A
  2882. sequence should be given if the object uses MultiIndex. If
  2883. False, do not print fields for index names. Use index_label=False
  2884. for easier importing in R.
  2885. mode : str
  2886. Python write mode, default 'w'.
  2887. encoding : str, optional
  2888. A string representing the encoding to use in the output file,
  2889. defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
  2890. is a non-binary file object.
  2891. compression : str or dict, default 'infer'
  2892. If str, represents compression mode. If dict, value at 'method' is
  2893. the compression mode. Compression mode may be any of the following
  2894. possible values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If
  2895. compression mode is 'infer' and `path_or_buf` is path-like, then
  2896. detect compression mode from the following extensions: '.gz',
  2897. '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
  2898. and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as
  2899. one of the above, other entries passed as
  2900. additional compression options.
  2901. .. versionchanged:: 1.0.0
  2902. May now be a dict with key 'method' as compression mode
  2903. and other entries as additional compression options if
  2904. compression mode is 'zip'.
  2905. .. versionchanged:: 1.1.0
  2906. Passing compression options as keys in dict is
  2907. supported for compression modes 'gzip' and 'bz2'
  2908. as well as 'zip'.
  2909. .. versionchanged:: 1.2.0
  2910. Compression is supported for binary file objects.
  2911. .. versionchanged:: 1.2.0
  2912. Previous versions forwarded dict entries for 'gzip' to
  2913. `gzip.open` instead of `gzip.GzipFile` which prevented
  2914. setting `mtime`.
  2915. quoting : optional constant from csv module
  2916. Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
  2917. then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
  2918. will treat them as non-numeric.
  2919. quotechar : str, default '\"'
  2920. String of length 1. Character used to quote fields.
  2921. line_terminator : str, optional
  2922. The newline character or character sequence to use in the output
  2923. file. Defaults to `os.linesep`, which depends on the OS in which
  2924. this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
  2925. chunksize : int or None
  2926. Rows to write at a time.
  2927. date_format : str, default None
  2928. Format string for datetime objects.
  2929. doublequote : bool, default True
  2930. Control quoting of `quotechar` inside a field.
  2931. escapechar : str, default None
  2932. String of length 1. Character used to escape `sep` and `quotechar`
  2933. when appropriate.
  2934. decimal : str, default '.'
  2935. Character recognized as decimal separator. E.g. use ',' for
  2936. European data.
  2937. errors : str, default 'strict'
  2938. Specifies how encoding and decoding errors are to be handled.
  2939. See the errors argument for :func:`open` for a full list
  2940. of options.
  2941. .. versionadded:: 1.1.0
  2942. {storage_options}
  2943. .. versionadded:: 1.2.0
  2944. Returns
  2945. -------
  2946. None or str
  2947. If path_or_buf is None, returns the resulting csv format as a
  2948. string. Otherwise returns None.
  2949. See Also
  2950. --------
  2951. read_csv : Load a CSV file into a DataFrame.
  2952. to_excel : Write DataFrame to an Excel file.
  2953. Examples
  2954. --------
  2955. >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
  2956. ... 'mask': ['red', 'purple'],
  2957. ... 'weapon': ['sai', 'bo staff']}})
  2958. >>> df.to_csv(index=False)
  2959. 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
  2960. Create 'out.zip' containing 'out.csv'
  2961. >>> compression_opts = dict(method='zip',
  2962. ... archive_name='out.csv') # doctest: +SKIP
  2963. >>> df.to_csv('out.zip', index=False,
  2964. ... compression=compression_opts) # doctest: +SKIP
  2965. To write a csv file to a new folder or nested folder you will first
  2966. need to create it using either Pathlib or os:
  2967. >>> from pathlib import Path
  2968. >>> filepath = Path('folder/subfolder/out.csv')
  2969. >>> filepath.parent.mkdir(parents=True, exist_ok=True)
  2970. >>> df.to_csv(filepath)
  2971. >>> import os
  2972. >>> os.makedirs('folder/subfolder', exist_ok=True)
  2973. >>> df.to_csv('folder/subfolder/out.csv')
  2974. """
  2975. df = self if isinstance(self, ABCDataFrame) else self.to_frame()
  2976. formatter = DataFrameFormatter(
  2977. frame=df,
  2978. header=header,
  2979. index=index,
  2980. na_rep=na_rep,
  2981. float_format=float_format,
  2982. decimal=decimal,
  2983. )
  2984. return DataFrameRenderer(formatter).to_csv(
  2985. path_or_buf,
  2986. line_terminator=line_terminator,
  2987. sep=sep,
  2988. encoding=encoding,
  2989. errors=errors,
  2990. compression=compression,
  2991. quoting=quoting,
  2992. columns=columns,
  2993. index_label=index_label,
  2994. mode=mode,
  2995. chunksize=chunksize,
  2996. quotechar=quotechar,
  2997. date_format=date_format,
  2998. doublequote=doublequote,
  2999. escapechar=escapechar,
  3000. storage_options=storage_options,
  3001. )
  3002. # ----------------------------------------------------------------------
  3003. # Lookup Caching
  3004. def _reset_cacher(self) -> None:
  3005. """
  3006. Reset the cacher.
  3007. """
  3008. raise AbstractMethodError(self)
  3009. def _maybe_update_cacher(
  3010. self,
  3011. clear: bool_t = False,
  3012. verify_is_copy: bool_t = True,
  3013. inplace: bool_t = False,
  3014. ) -> None:
  3015. """
  3016. See if we need to update our parent cacher if clear, then clear our
  3017. cache.
  3018. Parameters
  3019. ----------
  3020. clear : bool, default False
  3021. Clear the item cache.
  3022. verify_is_copy : bool, default True
  3023. Provide is_copy checks.
  3024. """
  3025. if verify_is_copy:
  3026. self._check_setitem_copy(t="referent")
  3027. if clear:
  3028. self._clear_item_cache()
  3029. def _clear_item_cache(self) -> None:
  3030. raise AbstractMethodError(self)
  3031. # ----------------------------------------------------------------------
  3032. # Indexing Methods
  3033. def take(
  3034. self: NDFrameT, indices, axis=0, is_copy: bool_t | None = None, **kwargs
  3035. ) -> NDFrameT:
  3036. """
  3037. Return the elements in the given *positional* indices along an axis.
  3038. This means that we are not indexing according to actual values in
  3039. the index attribute of the object. We are indexing according to the
  3040. actual position of the element in the object.
  3041. Parameters
  3042. ----------
  3043. indices : array-like
  3044. An array of ints indicating which positions to take.
  3045. axis : {0 or 'index', 1 or 'columns', None}, default 0
  3046. The axis on which to select elements. ``0`` means that we are
  3047. selecting rows, ``1`` means that we are selecting columns.
  3048. is_copy : bool
  3049. Before pandas 1.0, ``is_copy=False`` can be specified to ensure
  3050. that the return value is an actual copy. Starting with pandas 1.0,
  3051. ``take`` always returns a copy, and the keyword is therefore
  3052. deprecated.
  3053. .. deprecated:: 1.0.0
  3054. **kwargs
  3055. For compatibility with :meth:`numpy.take`. Has no effect on the
  3056. output.
  3057. Returns
  3058. -------
  3059. taken : same type as caller
  3060. An array-like containing the elements taken from the object.
  3061. See Also
  3062. --------
  3063. DataFrame.loc : Select a subset of a DataFrame by labels.
  3064. DataFrame.iloc : Select a subset of a DataFrame by positions.
  3065. numpy.take : Take elements from an array along an axis.
  3066. Examples
  3067. --------
  3068. >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
  3069. ... ('parrot', 'bird', 24.0),
  3070. ... ('lion', 'mammal', 80.5),
  3071. ... ('monkey', 'mammal', np.nan)],
  3072. ... columns=['name', 'class', 'max_speed'],
  3073. ... index=[0, 2, 3, 1])
  3074. >>> df
  3075. name class max_speed
  3076. 0 falcon bird 389.0
  3077. 2 parrot bird 24.0
  3078. 3 lion mammal 80.5
  3079. 1 monkey mammal NaN
  3080. Take elements at positions 0 and 3 along the axis 0 (default).
  3081. Note how the actual indices selected (0 and 1) do not correspond to
  3082. our selected indices 0 and 3. That's because we are selecting the 0th
  3083. and 3rd rows, not rows whose indices equal 0 and 3.
  3084. >>> df.take([0, 3])
  3085. name class max_speed
  3086. 0 falcon bird 389.0
  3087. 1 monkey mammal NaN
  3088. Take elements at indices 1 and 2 along the axis 1 (column selection).
  3089. >>> df.take([1, 2], axis=1)
  3090. class max_speed
  3091. 0 bird 389.0
  3092. 2 bird 24.0
  3093. 3 mammal 80.5
  3094. 1 mammal NaN
3095. We may also take elements using negative integers, which index from
3096. the end of the object, just like with Python lists.
  3097. >>> df.take([-1, -2])
  3098. name class max_speed
  3099. 1 monkey mammal NaN
  3100. 3 lion mammal 80.5
  3101. """
  3102. if is_copy is not None:
  3103. warnings.warn(
  3104. "is_copy is deprecated and will be removed in a future version. "
  3105. "'take' always returns a copy, so there is no need to specify this.",
  3106. FutureWarning,
  3107. stacklevel=2,
  3108. )
  3109. nv.validate_take((), kwargs)
  3110. self._consolidate_inplace()
  3111. new_data = self._mgr.take(
  3112. indices, axis=self._get_block_manager_axis(axis), verify=True
  3113. )
  3114. return self._constructor(new_data).__finalize__(self, method="take")
  3115. def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:
  3116. """
  3117. Internal version of the `take` method that sets the `_is_copy`
3118. attribute to keep track of the parent dataframe (used in indexing
3119. for the SettingWithCopyWarning).
  3120. See the docstring of `take` for full explanation of the parameters.
  3121. """
  3122. result = self.take(indices=indices, axis=axis)
3123. # Only flag the result as a copy if `take` actually changed the axis.
  3124. if not result._get_axis(axis).equals(self._get_axis(axis)):
  3125. result._set_is_copy(self)
  3126. return result
  3127. @final
  3128. def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
  3129. """
  3130. Return cross-section from the Series/DataFrame.
  3131. This method takes a `key` argument to select data at a particular
  3132. level of a MultiIndex.
  3133. Parameters
  3134. ----------
  3135. key : label or tuple of label
  3136. Label contained in the index, or partially in a MultiIndex.
  3137. axis : {0 or 'index', 1 or 'columns'}, default 0
  3138. Axis to retrieve cross-section on.
  3139. level : object, defaults to first n levels (n=1 or len(key))
  3140. In case of a key partially contained in a MultiIndex, indicate
3141. which levels are used. Levels can be referred to by label or position.
  3142. drop_level : bool, default True
  3143. If False, returns object with same levels as self.
  3144. Returns
  3145. -------
  3146. Series or DataFrame
  3147. Cross-section from the original Series or DataFrame
  3148. corresponding to the selected index levels.
  3149. See Also
  3150. --------
  3151. DataFrame.loc : Access a group of rows and columns
  3152. by label(s) or a boolean array.
  3153. DataFrame.iloc : Purely integer-location based indexing
  3154. for selection by position.
  3155. Notes
  3156. -----
3157. `xs` cannot be used to set values.
3158. MultiIndex slicers are a generic way to get/set values on
3159. any level or levels.
3160. They are a superset of `xs` functionality, see
  3161. :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
  3162. Examples
  3163. --------
  3164. >>> d = {'num_legs': [4, 4, 2, 2],
  3165. ... 'num_wings': [0, 0, 2, 2],
  3166. ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
  3167. ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
  3168. ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
  3169. >>> df = pd.DataFrame(data=d)
  3170. >>> df = df.set_index(['class', 'animal', 'locomotion'])
  3171. >>> df
  3172. num_legs num_wings
  3173. class animal locomotion
  3174. mammal cat walks 4 0
  3175. dog walks 4 0
  3176. bat flies 2 2
  3177. bird penguin walks 2 2
  3178. Get values at specified index
  3179. >>> df.xs('mammal')
  3180. num_legs num_wings
  3181. animal locomotion
  3182. cat walks 4 0
  3183. dog walks 4 0
  3184. bat flies 2 2
  3185. Get values at several indexes
  3186. >>> df.xs(('mammal', 'dog'))
  3187. num_legs num_wings
  3188. locomotion
  3189. walks 4 0
  3190. Get values at specified index and level
  3191. >>> df.xs('cat', level=1)
  3192. num_legs num_wings
  3193. class locomotion
  3194. mammal walks 4 0
  3195. Get values at several indexes and levels
  3196. >>> df.xs(('bird', 'walks'),
  3197. ... level=[0, 'locomotion'])
  3198. num_legs num_wings
  3199. animal
  3200. penguin 2 2
  3201. Get values at specified column and axis
  3202. >>> df.xs('num_wings', axis=1)
  3203. class animal locomotion
  3204. mammal cat walks 0
  3205. dog walks 0
  3206. bat flies 2
  3207. bird penguin walks 2
  3208. Name: num_wings, dtype: int64
  3209. """
  3210. axis = self._get_axis_number(axis)
  3211. labels = self._get_axis(axis)
  3212. if isinstance(key, list):
  3213. warnings.warn(
  3214. "Passing lists as key for xs is deprecated and will be removed in a "
  3215. "future version. Pass key as a tuple instead.",
  3216. FutureWarning,
  3217. stacklevel=2,
  3218. )
  3219. if level is not None:
  3220. if not isinstance(labels, MultiIndex):
  3221. raise TypeError("Index must be a MultiIndex")
  3222. loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
  3223. # create the tuple of the indexer
  3224. _indexer = [slice(None)] * self.ndim
  3225. _indexer[axis] = loc
  3226. indexer = tuple(_indexer)
  3227. result = self.iloc[indexer]
  3228. setattr(result, result._get_axis_name(axis), new_ax)
  3229. return result
  3230. if axis == 1:
  3231. if drop_level:
  3232. return self[key]
  3233. index = self.columns
  3234. else:
  3235. index = self.index
  3236. self._consolidate_inplace()
  3237. if isinstance(index, MultiIndex):
  3238. loc, new_index = index._get_loc_level(key, level=0)
  3239. if not drop_level:
  3240. if lib.is_integer(loc):
  3241. new_index = index[loc : loc + 1]
  3242. else:
  3243. new_index = index[loc]
  3244. else:
  3245. loc = index.get_loc(key)
  3246. if isinstance(loc, np.ndarray):
  3247. if loc.dtype == np.bool_:
  3248. (inds,) = loc.nonzero()
  3249. return self._take_with_is_copy(inds, axis=axis)
  3250. else:
  3251. return self._take_with_is_copy(loc, axis=axis)
  3252. if not is_scalar(loc):
  3253. new_index = index[loc]
  3254. if is_scalar(loc) and axis == 0:
  3255. # In this case loc should be an integer
  3256. if self.ndim == 1:
  3257. # if we encounter an array-like and we only have 1 dim
3258. # that means that there are lists/ndarrays inside the Series!
  3259. # so just return them (GH 6394)
  3260. return self._values[loc]
  3261. new_values = self._mgr.fast_xs(loc)
  3262. result = self._constructor_sliced(
  3263. new_values,
  3264. index=self.columns,
  3265. name=self.index[loc],
  3266. dtype=new_values.dtype,
  3267. )
  3268. elif is_scalar(loc):
  3269. result = self.iloc[:, slice(loc, loc + 1)]
  3270. elif axis == 1:
  3271. result = self.iloc[:, loc]
  3272. else:
  3273. result = self.iloc[loc]
  3274. result.index = new_index
  3275. # this could be a view
  3276. # but only in a single-dtyped view sliceable case
  3277. result._set_is_copy(self, copy=not result._is_view)
  3278. return result
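# A minimal sketch of the ``drop_level=False`` path above (assuming
# ``import pandas as pd``): the selected level is retained in the result.
# >>> df = pd.DataFrame(
# ...     {"v": [1, 2]},
# ...     index=pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]),
# ... )
# >>> df.xs("a").index.nlevels
# 1
# >>> df.xs("a", drop_level=False).index.nlevels
# 2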
  3279. def __getitem__(self, item):
  3280. raise AbstractMethodError(self)
  3281. def _slice(self: NDFrameT, slobj: slice, axis=0) -> NDFrameT:
  3282. """
  3283. Construct a slice of this container.
  3284. Slicing with this method is *always* positional.
  3285. """
  3286. assert isinstance(slobj, slice), type(slobj)
  3287. axis = self._get_block_manager_axis(axis)
  3288. result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
  3289. result = result.__finalize__(self)
  3290. # this could be a view
  3291. # but only in a single-dtyped view sliceable case
  3292. is_copy = axis != 0 or result._is_view
  3293. result._set_is_copy(self, copy=is_copy)
  3294. return result
  3295. @final
  3296. def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
  3297. if not copy:
  3298. self._is_copy = None
  3299. else:
  3300. assert ref is not None
  3301. self._is_copy = weakref.ref(ref)
  3302. def _check_is_chained_assignment_possible(self) -> bool_t:
  3303. """
  3304. Check if we are a view, have a cacher, and are of mixed type.
  3305. If so, then force a setitem_copy check.
3306. Should be called just prior to setting a value.
3307. Will return a boolean if we are a view and are cached, but of a
3308. single dtype, meaning that the cacher should be updated following
3309. the setting.
  3310. """
  3311. if self._is_copy:
  3312. self._check_setitem_copy(t="referent")
  3313. return False
  3314. @final
  3315. def _check_setitem_copy(self, t="setting", force=False):
  3316. """
  3317. Parameters
  3318. ----------
  3319. t : str, the type of setting error
  3320. force : bool, default False
  3321. If True, then force showing an error.
3322. Validate if we are doing a setitem on a chained copy.
  3323. It is technically possible to figure out that we are setting on
  3324. a copy even WITH a multi-dtyped pandas object. In other words, some
3325. blocks may be views while others are not. Currently _is_view will ALWAYS
  3326. return False for multi-blocks to avoid having to handle this case.
  3327. df = DataFrame(np.arange(0,9), columns=['count'])
  3328. df['group'] = 'b'
  3329. # This technically need not raise SettingWithCopy if both are view
3330. # (which is not generally guaranteed but is usually True). However,
3331. # this is in general not a good practice and we recommend using .loc.
  3332. df.iloc[0:5]['group'] = 'a'
  3333. """
  3334. # return early if the check is not needed
  3335. if not (force or self._is_copy):
  3336. return
  3337. value = config.get_option("mode.chained_assignment")
  3338. if value is None:
  3339. return
  3340. # see if the copy is not actually referred; if so, then dissolve
  3341. # the copy weakref
  3342. if self._is_copy is not None and not isinstance(self._is_copy, str):
  3343. r = self._is_copy()
  3344. if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
  3345. self._is_copy = None
  3346. return
  3347. # a custom message
  3348. if isinstance(self._is_copy, str):
  3349. t = self._is_copy
  3350. elif t == "referent":
  3351. t = (
  3352. "\n"
  3353. "A value is trying to be set on a copy of a slice from a "
  3354. "DataFrame\n\n"
  3355. "See the caveats in the documentation: "
  3356. "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
  3357. "indexing.html#returning-a-view-versus-a-copy"
  3358. )
  3359. else:
  3360. t = (
  3361. "\n"
  3362. "A value is trying to be set on a copy of a slice from a "
  3363. "DataFrame.\n"
  3364. "Try using .loc[row_indexer,col_indexer] = value "
  3365. "instead\n\nSee the caveats in the documentation: "
  3366. "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
  3367. "indexing.html#returning-a-view-versus-a-copy"
  3368. )
  3369. if value == "raise":
  3370. raise com.SettingWithCopyError(t)
  3371. elif value == "warn":
  3372. warnings.warn(t, com.SettingWithCopyWarning, stacklevel=find_stack_level())
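# A minimal sketch of how the "mode.chained_assignment" option drives the
# branch above (assuming ``import pandas as pd``): "warn" is the default,
# "raise" escalates to SettingWithCopyError, and None disables the check.
# >>> df = pd.DataFrame({"a": [1, 2, 3]})
# >>> with pd.option_context("mode.chained_assignment", "raise"):
# ...     df[df["a"] > 1]["a"] = 0  # raises SettingWithCopyError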
  3373. def __delitem__(self, key) -> None:
  3374. """
  3375. Delete item
  3376. """
  3377. deleted = False
  3378. maybe_shortcut = False
  3379. if self.ndim == 2 and isinstance(self.columns, MultiIndex):
  3380. try:
  3381. # By using engine's __contains__ we effectively
  3382. # restrict to same-length tuples
  3383. maybe_shortcut = key not in self.columns._engine
  3384. except TypeError:
  3385. pass
  3386. if maybe_shortcut:
  3387. # Allow shorthand to delete all columns whose first len(key)
  3388. # elements match key:
  3389. if not isinstance(key, tuple):
  3390. key = (key,)
  3391. for col in self.columns:
  3392. if isinstance(col, tuple) and col[: len(key)] == key:
  3393. del self[col]
  3394. deleted = True
  3395. if not deleted:
  3396. # If the above loop ran and didn't delete anything because
  3397. # there was no match, this call should raise the appropriate
  3398. # exception:
  3399. loc = self.axes[-1].get_loc(key)
  3400. self._mgr = self._mgr.idelete(loc)
  3401. # delete from the caches
  3402. try:
  3403. del self._item_cache[key]
  3404. except KeyError:
  3405. pass
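# A minimal sketch of the shorthand branch above (assuming
# ``import pandas as pd``): a partial key deletes every column whose
# leading levels match it.
# >>> df = pd.DataFrame(
# ...     [[1, 2, 3]],
# ...     columns=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "x")]),
# ... )
# >>> del df["a"]
# >>> list(df.columns)
# [('b', 'x')]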
  3406. # ----------------------------------------------------------------------
  3407. # Unsorted
  3408. @final
  3409. def _check_inplace_and_allows_duplicate_labels(self, inplace):
  3410. if inplace and not self.flags.allows_duplicate_labels:
  3411. raise ValueError(
  3412. "Cannot specify 'inplace=True' when "
  3413. "'self.flags.allows_duplicate_labels' is False."
  3414. )
  3415. @final
  3416. def get(self, key, default=None):
  3417. """
  3418. Get item from object for given key (ex: DataFrame column).
  3419. Returns default value if not found.
  3420. Parameters
  3421. ----------
  3422. key : object
  3423. Returns
  3424. -------
  3425. value : same type as items contained in object
  3426. Examples
  3427. --------
  3428. >>> df = pd.DataFrame(
  3429. ... [
  3430. ... [24.3, 75.7, "high"],
  3431. ... [31, 87.8, "high"],
  3432. ... [22, 71.6, "medium"],
  3433. ... [35, 95, "medium"],
  3434. ... ],
  3435. ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
  3436. ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
  3437. ... )
  3438. >>> df
  3439. temp_celsius temp_fahrenheit windspeed
  3440. 2014-02-12 24.3 75.7 high
  3441. 2014-02-13 31.0 87.8 high
  3442. 2014-02-14 22.0 71.6 medium
  3443. 2014-02-15 35.0 95.0 medium
  3444. >>> df.get(["temp_celsius", "windspeed"])
  3445. temp_celsius windspeed
  3446. 2014-02-12 24.3 high
  3447. 2014-02-13 31.0 high
  3448. 2014-02-14 22.0 medium
  3449. 2014-02-15 35.0 medium
  3450. If the key isn't found, the default value will be used.
  3451. >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
  3452. 'default_value'
  3453. """
  3454. try:
  3455. return self[key]
  3456. except (KeyError, ValueError, IndexError):
  3457. return default
  3458. @final
  3459. @property
  3460. def _is_view(self) -> bool_t:
  3461. """Return boolean indicating if self is view of another array"""
  3462. return self._mgr.is_view
  3463. @final
  3464. def reindex_like(
  3465. self: NDFrameT,
  3466. other,
  3467. method: str | None = None,
  3468. copy: bool_t = True,
  3469. limit=None,
  3470. tolerance=None,
  3471. ) -> NDFrameT:
  3472. """
3473. Return an object with indices matching those of another object.
  3474. Conform the object to the same index on all axes. Optional
  3475. filling logic, placing NaN in locations having no value
  3476. in the previous index. A new object is produced unless the
  3477. new index is equivalent to the current one and copy=False.
  3478. Parameters
  3479. ----------
  3480. other : Object of the same data type
  3481. Its row and column indices are used to define the new indices
  3482. of this object.
  3483. method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
  3484. Method to use for filling holes in reindexed DataFrame.
  3485. Please note: this is only applicable to DataFrames/Series with a
  3486. monotonically increasing/decreasing index.
  3487. * None (default): don't fill gaps
  3488. * pad / ffill: propagate last valid observation forward to next
  3489. valid
  3490. * backfill / bfill: use next valid observation to fill gap
  3491. * nearest: use nearest valid observations to fill gap.
  3492. copy : bool, default True
  3493. Return a new object, even if the passed indexes are the same.
  3494. limit : int, default None
  3495. Maximum number of consecutive labels to fill for inexact matches.
  3496. tolerance : optional
  3497. Maximum distance between original and new labels for inexact
  3498. matches. The values of the index at the matching locations must
  3499. satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
  3500. Tolerance may be a scalar value, which applies the same tolerance
  3501. to all values, or list-like, which applies variable tolerance per
  3502. element. List-like includes list, tuple, array, Series, and must be
  3503. the same size as the index and its dtype must exactly match the
  3504. index's type.
  3505. Returns
  3506. -------
  3507. Series or DataFrame
  3508. Same type as caller, but with changed indices on each axis.
  3509. See Also
  3510. --------
  3511. DataFrame.set_index : Set row labels.
  3512. DataFrame.reset_index : Remove row labels or move them to new columns.
  3513. DataFrame.reindex : Change to new indices or expand indices.
  3514. Notes
  3515. -----
  3516. Same as calling
  3517. ``.reindex(index=other.index, columns=other.columns,...)``.
  3518. Examples
  3519. --------
  3520. >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
  3521. ... [31, 87.8, 'high'],
  3522. ... [22, 71.6, 'medium'],
  3523. ... [35, 95, 'medium']],
  3524. ... columns=['temp_celsius', 'temp_fahrenheit',
  3525. ... 'windspeed'],
  3526. ... index=pd.date_range(start='2014-02-12',
  3527. ... end='2014-02-15', freq='D'))
  3528. >>> df1
  3529. temp_celsius temp_fahrenheit windspeed
  3530. 2014-02-12 24.3 75.7 high
  3531. 2014-02-13 31.0 87.8 high
  3532. 2014-02-14 22.0 71.6 medium
  3533. 2014-02-15 35.0 95.0 medium
  3534. >>> df2 = pd.DataFrame([[28, 'low'],
  3535. ... [30, 'low'],
  3536. ... [35.1, 'medium']],
  3537. ... columns=['temp_celsius', 'windspeed'],
  3538. ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
  3539. ... '2014-02-15']))
  3540. >>> df2
  3541. temp_celsius windspeed
  3542. 2014-02-12 28.0 low
  3543. 2014-02-13 30.0 low
  3544. 2014-02-15 35.1 medium
  3545. >>> df2.reindex_like(df1)
  3546. temp_celsius temp_fahrenheit windspeed
  3547. 2014-02-12 28.0 NaN low
  3548. 2014-02-13 30.0 NaN low
  3549. 2014-02-14 NaN NaN NaN
  3550. 2014-02-15 35.1 NaN medium
  3551. """
  3552. d = other._construct_axes_dict(
  3553. axes=self._AXIS_ORDERS,
  3554. method=method,
  3555. copy=copy,
  3556. limit=limit,
  3557. tolerance=tolerance,
  3558. )
  3559. return self.reindex(**d)
  3560. def drop(
  3561. self,
  3562. labels=None,
  3563. axis=0,
  3564. index=None,
  3565. columns=None,
  3566. level=None,
  3567. inplace: bool_t = False,
  3568. errors: str = "raise",
  3569. ):
  3570. inplace = validate_bool_kwarg(inplace, "inplace")
  3571. if labels is not None:
  3572. if index is not None or columns is not None:
  3573. raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
  3574. axis_name = self._get_axis_name(axis)
  3575. axes = {axis_name: labels}
  3576. elif index is not None or columns is not None:
  3577. axes, _ = self._construct_axes_from_arguments((index, columns), {})
  3578. else:
  3579. raise ValueError(
  3580. "Need to specify at least one of 'labels', 'index' or 'columns'"
  3581. )
  3582. obj = self
  3583. for axis, labels in axes.items():
  3584. if labels is not None:
  3585. obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  3586. if inplace:
  3587. self._update_inplace(obj)
  3588. else:
  3589. return obj
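# A minimal sketch of the two calling conventions accepted above (assuming
# ``import pandas as pd``): ``labels`` plus ``axis`` and the
# ``index``/``columns`` keywords resolve to the same ``_drop_axis`` call.
# >>> df = pd.DataFrame({"a": [1], "b": [2]})
# >>> df.drop("b", axis=1).equals(df.drop(columns="b"))
# True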
  3590. @final
  3591. def _drop_axis(
  3592. self: NDFrameT,
  3593. labels,
  3594. axis,
  3595. level=None,
  3596. errors: str = "raise",
  3597. consolidate: bool_t = True,
  3598. only_slice: bool_t = False,
  3599. ) -> NDFrameT:
  3600. """
  3601. Drop labels from specified axis. Used in the ``drop`` method
  3602. internally.
  3603. Parameters
  3604. ----------
  3605. labels : single label or list-like
  3606. axis : int or axis name
  3607. level : int or level name, default None
  3608. For MultiIndex
  3609. errors : {'ignore', 'raise'}, default 'raise'
  3610. If 'ignore', suppress error and existing labels are dropped.
  3611. consolidate : bool, default True
  3612. Whether to call consolidate_inplace in the reindex_indexer call.
  3613. only_slice : bool, default False
  3614. Whether indexing along columns should be view-only.
  3615. """
  3616. axis_num = self._get_axis_number(axis)
  3617. axis = self._get_axis(axis)
  3618. if axis.is_unique:
  3619. if level is not None:
  3620. if not isinstance(axis, MultiIndex):
  3621. raise AssertionError("axis must be a MultiIndex")
  3622. new_axis = axis.drop(labels, level=level, errors=errors)
  3623. else:
  3624. new_axis = axis.drop(labels, errors=errors)
  3625. indexer = axis.get_indexer(new_axis)
  3626. # Case for non-unique axis
  3627. else:
  3628. is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
  3629. labels = ensure_object(com.index_labels_to_array(labels))
  3630. if level is not None:
  3631. if not isinstance(axis, MultiIndex):
  3632. raise AssertionError("axis must be a MultiIndex")
  3633. mask = ~axis.get_level_values(level).isin(labels)
  3634. # GH 18561 MultiIndex.drop should raise if label is absent
  3635. if errors == "raise" and mask.all():
  3636. raise KeyError(f"{labels} not found in axis")
  3637. elif (
  3638. isinstance(axis, MultiIndex)
  3639. and labels.dtype == "object"
  3640. and not is_tuple_labels
  3641. ):
  3642. # Set level to zero in case of MultiIndex and label is string,
  3643. # because isin can't handle strings for MultiIndexes GH#36293
  3644. # In case of tuples we get dtype object but have to use isin GH#42771
  3645. mask = ~axis.get_level_values(0).isin(labels)
  3646. else:
  3647. mask = ~axis.isin(labels)
  3648. # Check if label doesn't exist along axis
  3649. labels_missing = (axis.get_indexer_for(labels) == -1).any()
  3650. if errors == "raise" and labels_missing:
  3651. raise KeyError(f"{labels} not found in axis")
  3652. indexer = mask.nonzero()[0]
  3653. new_axis = axis.take(indexer)
  3654. bm_axis = self.ndim - axis_num - 1
  3655. new_mgr = self._mgr.reindex_indexer(
  3656. new_axis,
  3657. indexer,
  3658. axis=bm_axis,
  3659. allow_dups=True,
  3660. consolidate=consolidate,
  3661. only_slice=only_slice,
  3662. )
  3663. result = self._constructor(new_mgr)
  3664. if self.ndim == 1:
  3665. result.name = self.name
  3666. return result.__finalize__(self)
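# A minimal sketch of the non-unique-axis branch above (assuming
# ``import pandas as pd``): dropping a duplicated label removes every
# occurrence via the boolean mask, not just the first match.
# >>> s = pd.Series([1, 2, 3], index=["x", "x", "y"])
# >>> s.drop("x")
# y    3
# dtype: int64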
  3667. @final
  3668. def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
  3669. """
  3670. Replace self internals with result.
  3671. Parameters
  3672. ----------
  3673. result : same type as self
  3674. verify_is_copy : bool, default True
  3675. Provide is_copy checks.
  3676. """
  3677. # NOTE: This does *not* call __finalize__ and that's an explicit
  3678. # decision that we may revisit in the future.
  3679. self._reset_cache()
  3680. self._clear_item_cache()
  3681. self._mgr = result._mgr
  3682. self._maybe_update_cacher(verify_is_copy=verify_is_copy)
  3683. @final
  3684. def add_prefix(self: NDFrameT, prefix: str) -> NDFrameT:
  3685. """
  3686. Prefix labels with string `prefix`.
  3687. For Series, the row labels are prefixed.
  3688. For DataFrame, the column labels are prefixed.
  3689. Parameters
  3690. ----------
  3691. prefix : str
  3692. The string to add before each label.
  3693. Returns
  3694. -------
  3695. Series or DataFrame
  3696. New Series or DataFrame with updated labels.
  3697. See Also
  3698. --------
  3699. Series.add_suffix: Suffix row labels with string `suffix`.
  3700. DataFrame.add_suffix: Suffix column labels with string `suffix`.
  3701. Examples
  3702. --------
  3703. >>> s = pd.Series([1, 2, 3, 4])
  3704. >>> s
  3705. 0 1
  3706. 1 2
  3707. 2 3
  3708. 3 4
  3709. dtype: int64
  3710. >>> s.add_prefix('item_')
  3711. item_0 1
  3712. item_1 2
  3713. item_2 3
  3714. item_3 4
  3715. dtype: int64
  3716. >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
  3717. >>> df
  3718. A B
  3719. 0 1 3
  3720. 1 2 4
  3721. 2 3 5
  3722. 3 4 6
  3723. >>> df.add_prefix('col_')
  3724. col_A col_B
  3725. 0 1 3
  3726. 1 2 4
  3727. 2 3 5
  3728. 3 4 6
  3729. """
  3730. f = functools.partial("{prefix}{}".format, prefix=prefix)
  3731. mapper = {self._info_axis_name: f}
  3732. # error: Incompatible return value type (got "Optional[NDFrameT]",
  3733. # expected "NDFrameT")
  3734. # error: Argument 1 to "rename" of "NDFrame" has incompatible type
  3735. # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
  3736. return self.rename(**mapper) # type: ignore[return-value, arg-type]
  3737. @final
  3738. def add_suffix(self: NDFrameT, suffix: str) -> NDFrameT:
  3739. """
  3740. Suffix labels with string `suffix`.
  3741. For Series, the row labels are suffixed.
  3742. For DataFrame, the column labels are suffixed.
  3743. Parameters
  3744. ----------
  3745. suffix : str
  3746. The string to add after each label.
  3747. Returns
  3748. -------
  3749. Series or DataFrame
  3750. New Series or DataFrame with updated labels.
  3751. See Also
  3752. --------
  3753. Series.add_prefix: Prefix row labels with string `prefix`.
  3754. DataFrame.add_prefix: Prefix column labels with string `prefix`.
  3755. Examples
  3756. --------
  3757. >>> s = pd.Series([1, 2, 3, 4])
  3758. >>> s
  3759. 0 1
  3760. 1 2
  3761. 2 3
  3762. 3 4
  3763. dtype: int64
  3764. >>> s.add_suffix('_item')
  3765. 0_item 1
  3766. 1_item 2
  3767. 2_item 3
  3768. 3_item 4
  3769. dtype: int64
  3770. >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
  3771. >>> df
  3772. A B
  3773. 0 1 3
  3774. 1 2 4
  3775. 2 3 5
  3776. 3 4 6
  3777. >>> df.add_suffix('_col')
  3778. A_col B_col
  3779. 0 1 3
  3780. 1 2 4
  3781. 2 3 5
  3782. 3 4 6
  3783. """
  3784. f = functools.partial("{}{suffix}".format, suffix=suffix)
  3785. mapper = {self._info_axis_name: f}
  3786. # error: Incompatible return value type (got "Optional[NDFrameT]",
  3787. # expected "NDFrameT")
  3788. # error: Argument 1 to "rename" of "NDFrame" has incompatible type
  3789. # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
  3790. return self.rename(**mapper) # type: ignore[return-value, arg-type]
  3791. def sort_values(
  3792. self,
  3793. axis=0,
  3794. ascending=True,
  3795. inplace: bool_t = False,
  3796. kind: str = "quicksort",
  3797. na_position: str = "last",
  3798. ignore_index: bool_t = False,
  3799. key: ValueKeyFunc = None,
  3800. ):
  3801. """
  3802. Sort by the values along either axis.
  3803. Parameters
  3804. ----------%(optional_by)s
  3805. axis : %(axes_single_arg)s, default 0
  3806. Axis to be sorted.
  3807. ascending : bool or list of bool, default True
  3808. Sort ascending vs. descending. Specify list for multiple sort
3809. orders. If this is a list of bools, it must match the length of
3810. `by`.
  3811. inplace : bool, default False
  3812. If True, perform operation in-place.
  3813. kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
  3814. Choice of sorting algorithm. See also :func:`numpy.sort` for more
  3815. information. `mergesort` and `stable` are the only stable algorithms. For
  3816. DataFrames, this option is only applied when sorting on a single
  3817. column or label.
  3818. na_position : {'first', 'last'}, default 'last'
  3819. Puts NaNs at the beginning if `first`; `last` puts NaNs at the
  3820. end.
  3821. ignore_index : bool, default False
  3822. If True, the resulting axis will be labeled 0, 1, …, n - 1.
  3823. .. versionadded:: 1.0.0
  3824. key : callable, optional
  3825. Apply the key function to the values
  3826. before sorting. This is similar to the `key` argument in the
  3827. builtin :meth:`sorted` function, with the notable difference that
  3828. this `key` function should be *vectorized*. It should expect a
  3829. ``Series`` and return a Series with the same shape as the input.
  3830. It will be applied to each column in `by` independently.
  3831. .. versionadded:: 1.1.0
  3832. Returns
  3833. -------
  3834. DataFrame or None
  3835. DataFrame with sorted values or None if ``inplace=True``.
  3836. See Also
  3837. --------
  3838. DataFrame.sort_index : Sort a DataFrame by the index.
  3839. Series.sort_values : Similar method for a Series.
  3840. Examples
  3841. --------
  3842. >>> df = pd.DataFrame({
  3843. ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
  3844. ... 'col2': [2, 1, 9, 8, 7, 4],
  3845. ... 'col3': [0, 1, 9, 4, 2, 3],
  3846. ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
  3847. ... })
  3848. >>> df
  3849. col1 col2 col3 col4
  3850. 0 A 2 0 a
  3851. 1 A 1 1 B
  3852. 2 B 9 9 c
  3853. 3 NaN 8 4 D
  3854. 4 D 7 2 e
  3855. 5 C 4 3 F
  3856. Sort by col1
  3857. >>> df.sort_values(by=['col1'])
  3858. col1 col2 col3 col4
  3859. 0 A 2 0 a
  3860. 1 A 1 1 B
  3861. 2 B 9 9 c
  3862. 5 C 4 3 F
  3863. 4 D 7 2 e
  3864. 3 NaN 8 4 D
  3865. Sort by multiple columns
  3866. >>> df.sort_values(by=['col1', 'col2'])
  3867. col1 col2 col3 col4
  3868. 1 A 1 1 B
  3869. 0 A 2 0 a
  3870. 2 B 9 9 c
  3871. 5 C 4 3 F
  3872. 4 D 7 2 e
  3873. 3 NaN 8 4 D
  3874. Sort Descending
  3875. >>> df.sort_values(by='col1', ascending=False)
  3876. col1 col2 col3 col4
  3877. 4 D 7 2 e
  3878. 5 C 4 3 F
  3879. 2 B 9 9 c
  3880. 0 A 2 0 a
  3881. 1 A 1 1 B
  3882. 3 NaN 8 4 D
  3883. Putting NAs first
  3884. >>> df.sort_values(by='col1', ascending=False, na_position='first')
  3885. col1 col2 col3 col4
  3886. 3 NaN 8 4 D
  3887. 4 D 7 2 e
  3888. 5 C 4 3 F
  3889. 2 B 9 9 c
  3890. 0 A 2 0 a
  3891. 1 A 1 1 B
  3892. Sorting with a key function
  3893. >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
  3894. col1 col2 col3 col4
  3895. 0 A 2 0 a
  3896. 1 A 1 1 B
  3897. 2 B 9 9 c
  3898. 3 NaN 8 4 D
  3899. 4 D 7 2 e
  3900. 5 C 4 3 F
  3901. Natural sort with the key argument,
3902. using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
  3903. >>> df = pd.DataFrame({
  3904. ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
  3905. ... "value": [10, 20, 30, 40, 50]
  3906. ... })
  3907. >>> df
  3908. time value
  3909. 0 0hr 10
  3910. 1 128hr 20
  3911. 2 72hr 30
  3912. 3 48hr 40
  3913. 4 96hr 50
  3914. >>> from natsort import index_natsorted
  3915. >>> df.sort_values(
  3916. ... by="time",
  3917. ... key=lambda x: np.argsort(index_natsorted(df["time"]))
  3918. ... )
  3919. time value
  3920. 0 0hr 10
  3921. 3 48hr 40
  3922. 2 72hr 30
  3923. 4 96hr 50
  3924. 1 128hr 20
  3925. """
  3926. raise AbstractMethodError(self)
  3927. def sort_index(
  3928. self,
  3929. axis=0,
  3930. level=None,
  3931. ascending: bool_t | int | Sequence[bool_t | int] = True,
  3932. inplace: bool_t = False,
  3933. kind: str = "quicksort",
  3934. na_position: str = "last",
  3935. sort_remaining: bool_t = True,
  3936. ignore_index: bool_t = False,
  3937. key: IndexKeyFunc = None,
  3938. ):
  3939. inplace = validate_bool_kwarg(inplace, "inplace")
  3940. axis = self._get_axis_number(axis)
  3941. ascending = validate_ascending(ascending)
  3942. target = self._get_axis(axis)
  3943. indexer = get_indexer_indexer(
  3944. target, level, ascending, kind, na_position, sort_remaining, key
  3945. )
  3946. if indexer is None:
  3947. if inplace:
  3948. result = self
  3949. else:
  3950. result = self.copy()
  3951. if ignore_index:
  3952. result.index = default_index(len(self))
  3953. if inplace:
  3954. return
  3955. else:
  3956. return result
  3957. baxis = self._get_block_manager_axis(axis)
  3958. new_data = self._mgr.take(indexer, axis=baxis, verify=False)
  3959. # reconstruct axis if needed
  3960. new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic())
  3961. if ignore_index:
  3962. axis = 1 if isinstance(self, ABCDataFrame) else 0
  3963. new_data.set_axis(axis, default_index(len(indexer)))
  3964. result = self._constructor(new_data)
  3965. if inplace:
  3966. return self._update_inplace(result)
  3967. else:
  3968. return result.__finalize__(self, method="sort_index")
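# A minimal sketch of the ``ignore_index`` handling above (assuming
# ``import pandas as pd``): after sorting, the axis is relabelled with a
# fresh default RangeIndex.
# >>> s = pd.Series([1, 2], index=["b", "a"])
# >>> s.sort_index(ignore_index=True)
# 0    2
# 1    1
# dtype: int64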
  3969. @doc(
  3970. klass=_shared_doc_kwargs["klass"],
  3971. axes=_shared_doc_kwargs["axes"],
  3972. optional_labels="",
  3973. optional_axis="",
  3974. )
  3975. def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT:
  3976. """
  3977. Conform {klass} to new index with optional filling logic.
  3978. Places NA/NaN in locations having no value in the previous index. A new object
  3979. is produced unless the new index is equivalent to the current one and
  3980. ``copy=False``.
  3981. Parameters
  3982. ----------
  3983. {optional_labels}
  3984. {axes} : array-like, optional
  3985. New labels / index to conform to, should be specified using
  3986. keywords. Preferably an Index object to avoid duplicating data.
  3987. {optional_axis}
  3988. method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
  3989. Method to use for filling holes in reindexed DataFrame.
  3990. Please note: this is only applicable to DataFrames/Series with a
  3991. monotonically increasing/decreasing index.
  3992. * None (default): don't fill gaps
  3993. * pad / ffill: Propagate last valid observation forward to next
  3994. valid.
  3995. * backfill / bfill: Use next valid observation to fill gap.
  3996. * nearest: Use nearest valid observations to fill gap.
  3997. copy : bool, default True
  3998. Return a new object, even if the passed indexes are the same.
  3999. level : int or name
  4000. Broadcast across a level, matching Index values on the
  4001. passed MultiIndex level.
  4002. fill_value : scalar, default np.NaN
  4003. Value to use for missing values. Defaults to NaN, but can be any
  4004. "compatible" value.
  4005. limit : int, default None
  4006. Maximum number of consecutive elements to forward or backward fill.
  4007. tolerance : optional
  4008. Maximum distance between original and new labels for inexact
4009. matches. The values of the index at the matching locations must
  4010. satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
  4011. Tolerance may be a scalar value, which applies the same tolerance
  4012. to all values, or list-like, which applies variable tolerance per
  4013. element. List-like includes list, tuple, array, Series, and must be
  4014. the same size as the index and its dtype must exactly match the
  4015. index's type.
  4016. Returns
  4017. -------
  4018. {klass} with changed index.
  4019. See Also
  4020. --------
  4021. DataFrame.set_index : Set row labels.
  4022. DataFrame.reset_index : Remove row labels or move them to new columns.
  4023. DataFrame.reindex_like : Change to same indices as other DataFrame.
  4024. Examples
  4025. --------
  4026. ``DataFrame.reindex`` supports two calling conventions
  4027. * ``(index=index_labels, columns=column_labels, ...)``
  4028. * ``(labels, axis={{'index', 'columns'}}, ...)``
  4029. We *highly* recommend using keyword arguments to clarify your
  4030. intent.
  4031. Create a dataframe with some fictional data.
  4032. >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
  4033. >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301],
  4034. ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}},
  4035. ... index=index)
  4036. >>> df
  4037. http_status response_time
  4038. Firefox 200 0.04
  4039. Chrome 200 0.02
  4040. Safari 404 0.07
  4041. IE10 404 0.08
  4042. Konqueror 301 1.00
  4043. Create a new index and reindex the dataframe. By default
  4044. values in the new index that do not have corresponding
  4045. records in the dataframe are assigned ``NaN``.
  4046. >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
  4047. ... 'Chrome']
  4048. >>> df.reindex(new_index)
  4049. http_status response_time
  4050. Safari 404.0 0.07
  4051. Iceweasel NaN NaN
  4052. Comodo Dragon NaN NaN
  4053. IE10 404.0 0.08
  4054. Chrome 200.0 0.02
  4055. We can fill in the missing values by passing a value to
  4056. the keyword ``fill_value``. Because the index is not monotonically
  4057. increasing or decreasing, we cannot use arguments to the keyword
  4058. ``method`` to fill the ``NaN`` values.
  4059. >>> df.reindex(new_index, fill_value=0)
  4060. http_status response_time
  4061. Safari 404 0.07
  4062. Iceweasel 0 0.00
  4063. Comodo Dragon 0 0.00
  4064. IE10 404 0.08
  4065. Chrome 200 0.02
  4066. >>> df.reindex(new_index, fill_value='missing')
  4067. http_status response_time
  4068. Safari 404 0.07
  4069. Iceweasel missing missing
  4070. Comodo Dragon missing missing
  4071. IE10 404 0.08
  4072. Chrome 200 0.02
  4073. We can also reindex the columns.
  4074. >>> df.reindex(columns=['http_status', 'user_agent'])
  4075. http_status user_agent
  4076. Firefox 200 NaN
  4077. Chrome 200 NaN
  4078. Safari 404 NaN
  4079. IE10 404 NaN
  4080. Konqueror 301 NaN
  4081. Or we can use "axis-style" keyword arguments
  4082. >>> df.reindex(['http_status', 'user_agent'], axis="columns")
  4083. http_status user_agent
  4084. Firefox 200 NaN
  4085. Chrome 200 NaN
  4086. Safari 404 NaN
  4087. IE10 404 NaN
  4088. Konqueror 301 NaN
  4089. To further illustrate the filling functionality in
  4090. ``reindex``, we will create a dataframe with a
  4091. monotonically increasing index (for example, a sequence
  4092. of dates).
  4093. >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
  4094. >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}},
  4095. ... index=date_index)
  4096. >>> df2
  4097. prices
  4098. 2010-01-01 100.0
  4099. 2010-01-02 101.0
  4100. 2010-01-03 NaN
  4101. 2010-01-04 100.0
  4102. 2010-01-05 89.0
  4103. 2010-01-06 88.0
  4104. Suppose we decide to expand the dataframe to cover a wider
  4105. date range.
  4106. >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
  4107. >>> df2.reindex(date_index2)
  4108. prices
  4109. 2009-12-29 NaN
  4110. 2009-12-30 NaN
  4111. 2009-12-31 NaN
  4112. 2010-01-01 100.0
  4113. 2010-01-02 101.0
  4114. 2010-01-03 NaN
  4115. 2010-01-04 100.0
  4116. 2010-01-05 89.0
  4117. 2010-01-06 88.0
  4118. 2010-01-07 NaN
  4119. The index entries that did not have a value in the original data frame
  4120. (for example, '2009-12-29') are by default filled with ``NaN``.
  4121. If desired, we can fill in the missing values using one of several
  4122. options.
4123. For example, to fill the ``NaN`` values by propagating the next valid
4124. value backwards, pass ``bfill`` as an argument to the ``method`` keyword.
  4125. >>> df2.reindex(date_index2, method='bfill')
  4126. prices
  4127. 2009-12-29 100.0
  4128. 2009-12-30 100.0
  4129. 2009-12-31 100.0
  4130. 2010-01-01 100.0
  4131. 2010-01-02 101.0
  4132. 2010-01-03 NaN
  4133. 2010-01-04 100.0
  4134. 2010-01-05 89.0
  4135. 2010-01-06 88.0
  4136. 2010-01-07 NaN
  4137. Please note that the ``NaN`` value present in the original dataframe
  4138. (at index value 2010-01-03) will not be filled by any of the
  4139. value propagation schemes. This is because filling while reindexing
  4140. does not look at dataframe values, but only compares the original and
  4141. desired indexes. If you do want to fill in the ``NaN`` values present
  4142. in the original dataframe, use the ``fillna()`` method.
  4143. See the :ref:`user guide <basics.reindexing>` for more.
  4144. """
  4145. # TODO: Decide if we care about having different examples for different
  4146. # kinds
  4147. # construct the args
  4148. axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
  4149. method = missing.clean_reindex_fill_method(kwargs.pop("method", None))
  4150. level = kwargs.pop("level", None)
  4151. copy = kwargs.pop("copy", True)
  4152. limit = kwargs.pop("limit", None)
  4153. tolerance = kwargs.pop("tolerance", None)
  4154. fill_value = kwargs.pop("fill_value", None)
  4155. # Series.reindex doesn't use / need the axis kwarg
  4156. # We pop and ignore it here, to make writing Series/Frame generic code
  4157. # easier
  4158. kwargs.pop("axis", None)
  4159. if kwargs:
  4160. raise TypeError(
  4161. "reindex() got an unexpected keyword "
  4162. f'argument "{list(kwargs.keys())[0]}"'
  4163. )
  4164. self._consolidate_inplace()
  4165. # if all axes that are requested to reindex are equal, then only copy
  4166. # if indicated must have index names equal here as well as values
  4167. if all(
  4168. self._get_axis(axis).identical(ax)
  4169. for axis, ax in axes.items()
  4170. if ax is not None
  4171. ):
  4172. if copy:
  4173. return self.copy()
  4174. return self
  4175. # check if we are a multi reindex
  4176. if self._needs_reindex_multi(axes, method, level):
  4177. return self._reindex_multi(axes, copy, fill_value)
  4178. # perform the reindex on the axes
  4179. return self._reindex_axes(
  4180. axes, level, limit, tolerance, method, fill_value, copy
  4181. ).__finalize__(self, method="reindex")
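# A minimal sketch of the identical-axes short-circuit above (assuming
# ``import pandas as pd``): when every requested axis already matches and
# ``copy=False``, the original object is returned unchanged.
# >>> s = pd.Series([1, 2], index=["a", "b"])
# >>> s.reindex(["a", "b"], copy=False) is s
# True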
  4182. def _reindex_axes(
  4183. self: NDFrameT, axes, level, limit, tolerance, method, fill_value, copy
  4184. ) -> NDFrameT:
  4185. """Perform the reindex for all the axes."""
  4186. obj = self
  4187. for a in self._AXIS_ORDERS:
  4188. labels = axes[a]
  4189. if labels is None:
  4190. continue
  4191. ax = self._get_axis(a)
  4192. new_index, indexer = ax.reindex(
  4193. labels, level=level, limit=limit, tolerance=tolerance, method=method
  4194. )
  4195. axis = self._get_axis_number(a)
  4196. obj = obj._reindex_with_indexers(
  4197. {axis: [new_index, indexer]},
  4198. fill_value=fill_value,
  4199. copy=copy,
  4200. allow_dups=False,
  4201. )
  4202. # If we've made a copy once, no need to make another one
  4203. copy = False
  4204. return obj
  4205. def _needs_reindex_multi(self, axes, method, level) -> bool_t:
  4206. """Check if we do need a multi reindex."""
  4207. return (
  4208. (com.count_not_none(*axes.values()) == self._AXIS_LEN)
  4209. and method is None
  4210. and level is None
  4211. and not self._is_mixed_type
  4212. )
  4213. def _reindex_multi(self, axes, copy, fill_value):
  4214. raise AbstractMethodError(self)
  4215. @final
  4216. def _reindex_with_indexers(
  4217. self: NDFrameT,
  4218. reindexers,
  4219. fill_value=None,
  4220. copy: bool_t = False,
  4221. allow_dups: bool_t = False,
  4222. ) -> NDFrameT:
  4223. """allow_dups indicates an internal call here"""
  4224. # reindex doing multiple operations on different axes if indicated
  4225. new_data = self._mgr
  4226. for axis in sorted(reindexers.keys()):
  4227. index, indexer = reindexers[axis]
  4228. baxis = self._get_block_manager_axis(axis)
  4229. if index is None:
  4230. continue
  4231. index = ensure_index(index)
  4232. if indexer is not None:
  4233. indexer = ensure_platform_int(indexer)
  4234. # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
  4235. new_data = new_data.reindex_indexer(
  4236. index,
  4237. indexer,
  4238. axis=baxis,
  4239. fill_value=fill_value,
  4240. allow_dups=allow_dups,
  4241. copy=copy,
  4242. )
  4243. # If we've made a copy once, no need to make another one
  4244. copy = False
  4245. if copy and new_data is self._mgr:
  4246. new_data = new_data.copy()
  4247. return self._constructor(new_data).__finalize__(self)
  4248. def filter(
  4249. self: NDFrameT,
  4250. items=None,
  4251. like: str | None = None,
  4252. regex: str | None = None,
  4253. axis=None,
  4254. ) -> NDFrameT:
  4255. """
  4256. Subset the dataframe rows or columns according to the specified index labels.
  4257. Note that this routine does not filter a dataframe on its
  4258. contents. The filter is applied to the labels of the index.
  4259. Parameters
  4260. ----------
  4261. items : list-like
  4262. Keep labels from axis which are in items.
  4263. like : str
  4264. Keep labels from axis for which "like in label == True".
  4265. regex : str (regular expression)
  4266. Keep labels from axis for which re.search(regex, label) == True.
4267. axis : {0 or 'index', 1 or 'columns', None}, default None
  4268. The axis to filter on, expressed either as an index (int)
  4269. or axis name (str). By default this is the info axis,
  4270. 'index' for Series, 'columns' for DataFrame.
  4271. Returns
  4272. -------
  4273. same type as input object
  4274. See Also
  4275. --------
  4276. DataFrame.loc : Access a group of rows and columns
  4277. by label(s) or a boolean array.
  4278. Notes
  4279. -----
  4280. The ``items``, ``like``, and ``regex`` parameters are
  4281. enforced to be mutually exclusive.
  4282. ``axis`` defaults to the info axis that is used when indexing
  4283. with ``[]``.
  4284. Examples
  4285. --------
  4286. >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
  4287. ... index=['mouse', 'rabbit'],
  4288. ... columns=['one', 'two', 'three'])
  4289. >>> df
  4290. one two three
  4291. mouse 1 2 3
  4292. rabbit 4 5 6
  4293. >>> # select columns by name
  4294. >>> df.filter(items=['one', 'three'])
  4295. one three
  4296. mouse 1 3
  4297. rabbit 4 6
  4298. >>> # select columns by regular expression
  4299. >>> df.filter(regex='e$', axis=1)
  4300. one three
  4301. mouse 1 3
  4302. rabbit 4 6
  4303. >>> # select rows containing 'bbi'
  4304. >>> df.filter(like='bbi', axis=0)
  4305. one two three
  4306. rabbit 4 5 6
  4307. """
  4308. nkw = com.count_not_none(items, like, regex)
  4309. if nkw > 1:
  4310. raise TypeError(
  4311. "Keyword arguments `items`, `like`, or `regex` "
  4312. "are mutually exclusive"
  4313. )
  4314. if axis is None:
  4315. axis = self._info_axis_name
  4316. labels = self._get_axis(axis)
  4317. if items is not None:
  4318. name = self._get_axis_name(axis)
  4319. return self.reindex(**{name: [r for r in items if r in labels]})
  4320. elif like:
  4321. def f(x) -> bool_t:
  4322. assert like is not None # needed for mypy
  4323. return like in ensure_str(x)
  4324. values = labels.map(f)
  4325. return self.loc(axis=axis)[values]
  4326. elif regex:
  4327. def f(x) -> bool_t:
  4328. return matcher.search(ensure_str(x)) is not None
  4329. matcher = re.compile(regex)
  4330. values = labels.map(f)
  4331. return self.loc(axis=axis)[values]
  4332. else:
  4333. raise TypeError("Must pass either `items`, `like`, or `regex`")
  4334. @final
  4335. def head(self: NDFrameT, n: int = 5) -> NDFrameT:
  4336. """
  4337. Return the first `n` rows.
  4338. This function returns the first `n` rows for the object based
  4339. on position. It is useful for quickly testing if your object
  4340. has the right type of data in it.
  4341. For negative values of `n`, this function returns all rows except
4342. the last ``|n|`` rows, equivalent to ``df[:n]``.
  4343. Parameters
  4344. ----------
  4345. n : int, default 5
  4346. Number of rows to select.
  4347. Returns
  4348. -------
  4349. same type as caller
  4350. The first `n` rows of the caller object.
  4351. See Also
  4352. --------
  4353. DataFrame.tail: Returns the last `n` rows.
  4354. Examples
  4355. --------
  4356. >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
  4357. ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
  4358. >>> df
  4359. animal
  4360. 0 alligator
  4361. 1 bee
  4362. 2 falcon
  4363. 3 lion
  4364. 4 monkey
  4365. 5 parrot
  4366. 6 shark
  4367. 7 whale
  4368. 8 zebra
  4369. Viewing the first 5 lines
  4370. >>> df.head()
  4371. animal
  4372. 0 alligator
  4373. 1 bee
  4374. 2 falcon
  4375. 3 lion
  4376. 4 monkey
  4377. Viewing the first `n` lines (three in this case)
  4378. >>> df.head(3)
  4379. animal
  4380. 0 alligator
  4381. 1 bee
  4382. 2 falcon
  4383. For negative values of `n`
  4384. >>> df.head(-3)
  4385. animal
  4386. 0 alligator
  4387. 1 bee
  4388. 2 falcon
  4389. 3 lion
  4390. 4 monkey
  4391. 5 parrot
  4392. """
  4393. return self.iloc[:n]
  4394. @final
  4395. def tail(self: NDFrameT, n: int = 5) -> NDFrameT:
  4396. """
  4397. Return the last `n` rows.
  4398. This function returns last `n` rows from the object based on
  4399. position. It is useful for quickly verifying data, for example,
  4400. after sorting or appending rows.
  4401. For negative values of `n`, this function returns all rows except
4402. the first ``|n|`` rows, equivalent to ``df[|n|:]``.
  4403. Parameters
  4404. ----------
  4405. n : int, default 5
  4406. Number of rows to select.
  4407. Returns
  4408. -------
  4409. type of caller
  4410. The last `n` rows of the caller object.
  4411. See Also
  4412. --------
  4413. DataFrame.head : The first `n` rows of the caller object.
  4414. Examples
  4415. --------
  4416. >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
  4417. ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
  4418. >>> df
  4419. animal
  4420. 0 alligator
  4421. 1 bee
  4422. 2 falcon
  4423. 3 lion
  4424. 4 monkey
  4425. 5 parrot
  4426. 6 shark
  4427. 7 whale
  4428. 8 zebra
  4429. Viewing the last 5 lines
  4430. >>> df.tail()
  4431. animal
  4432. 4 monkey
  4433. 5 parrot
  4434. 6 shark
  4435. 7 whale
  4436. 8 zebra
  4437. Viewing the last `n` lines (three in this case)
  4438. >>> df.tail(3)
  4439. animal
  4440. 6 shark
  4441. 7 whale
  4442. 8 zebra
  4443. For negative values of `n`
  4444. >>> df.tail(-3)
  4445. animal
  4446. 3 lion
  4447. 4 monkey
  4448. 5 parrot
  4449. 6 shark
  4450. 7 whale
  4451. 8 zebra
  4452. """
  4453. if n == 0:
  4454. return self.iloc[0:0]
  4455. return self.iloc[-n:]
  4456. @final
  4457. def sample(
  4458. self: NDFrameT,
  4459. n: int | None = None,
  4460. frac: float | None = None,
  4461. replace: bool_t = False,
  4462. weights=None,
  4463. random_state: RandomState | None = None,
  4464. axis: Axis | None = None,
  4465. ignore_index: bool_t = False,
  4466. ) -> NDFrameT:
  4467. """
  4468. Return a random sample of items from an axis of object.
  4469. You can use `random_state` for reproducibility.
  4470. Parameters
  4471. ----------
  4472. n : int, optional
  4473. Number of items from axis to return. Cannot be used with `frac`.
  4474. Default = 1 if `frac` = None.
  4475. frac : float, optional
  4476. Fraction of axis items to return. Cannot be used with `n`.
  4477. replace : bool, default False
  4478. Allow or disallow sampling of the same row more than once.
  4479. weights : str or ndarray-like, optional
  4480. Default 'None' results in equal probability weighting.
  4481. If passed a Series, will align with target object on index. Index
  4482. values in weights not found in sampled object will be ignored and
  4483. index values in sampled object not in weights will be assigned
  4484. weights of zero.
  4485. If called on a DataFrame, will accept the name of a column
  4486. when axis = 0.
  4487. Unless weights are a Series, weights must be same length as axis
  4488. being sampled.
  4489. If weights do not sum to 1, they will be normalized to sum to 1.
  4490. Missing values in the weights column will be treated as zero.
  4491. Infinite values not allowed.
  4492. random_state : int, array-like, BitGenerator, np.random.RandomState,
  4493. np.random.Generator, optional. If int, array-like, or BitGenerator, seed for
  4494. random number generator. If np.random.RandomState or np.random.Generator,
  4495. use as given.
  4496. .. versionchanged:: 1.1.0
  4497. array-like and BitGenerator object now passed to np.random.RandomState()
  4498. as seed
  4499. .. versionchanged:: 1.4.0
  4500. np.random.Generator objects now accepted
4501. axis : {0 or 'index', 1 or 'columns', None}, default None
  4502. Axis to sample. Accepts axis number or name. Default is stat axis
  4503. for given data type (0 for Series and DataFrames).
  4504. ignore_index : bool, default False
  4505. If True, the resulting index will be labeled 0, 1, …, n - 1.
  4506. .. versionadded:: 1.3.0
  4507. Returns
  4508. -------
  4509. Series or DataFrame
  4510. A new object of same type as caller containing `n` items randomly
  4511. sampled from the caller object.
  4512. See Also
  4513. --------
  4514. DataFrameGroupBy.sample: Generates random samples from each group of a
  4515. DataFrame object.
  4516. SeriesGroupBy.sample: Generates random samples from each group of a
  4517. Series object.
  4518. numpy.random.choice: Generates a random sample from a given 1-D numpy
  4519. array.
  4520. Notes
  4521. -----
4522. If `frac` > 1, `replace` should be set to `True`.
  4523. Examples
  4524. --------
  4525. >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
  4526. ... 'num_wings': [2, 0, 0, 0],
  4527. ... 'num_specimen_seen': [10, 2, 1, 8]},
  4528. ... index=['falcon', 'dog', 'spider', 'fish'])
  4529. >>> df
  4530. num_legs num_wings num_specimen_seen
  4531. falcon 2 2 10
  4532. dog 4 0 2
  4533. spider 8 0 1
  4534. fish 0 0 8
  4535. Extract 3 random elements from the ``Series`` ``df['num_legs']``:
  4536. Note that we use `random_state` to ensure the reproducibility of
  4537. the examples.
  4538. >>> df['num_legs'].sample(n=3, random_state=1)
  4539. fish 0
  4540. spider 8
  4541. falcon 2
  4542. Name: num_legs, dtype: int64
  4543. A random 50% sample of the ``DataFrame`` with replacement:
  4544. >>> df.sample(frac=0.5, replace=True, random_state=1)
  4545. num_legs num_wings num_specimen_seen
  4546. dog 4 0 2
  4547. fish 0 0 8
4548. An upsampled sample of the ``DataFrame`` with replacement:
4549. Note that the `replace` parameter has to be `True` for `frac` > 1.
  4550. >>> df.sample(frac=2, replace=True, random_state=1)
  4551. num_legs num_wings num_specimen_seen
  4552. dog 4 0 2
  4553. fish 0 0 8
  4554. falcon 2 2 10
  4555. falcon 2 2 10
  4556. fish 0 0 8
  4557. dog 4 0 2
  4558. fish 0 0 8
  4559. dog 4 0 2
  4560. Using a DataFrame column as weights. Rows with larger value in the
  4561. `num_specimen_seen` column are more likely to be sampled.
  4562. >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
  4563. num_legs num_wings num_specimen_seen
  4564. falcon 2 2 10
  4565. fish 0 0 8
  4566. """
  4567. if axis is None:
  4568. axis = self._stat_axis_number
  4569. axis = self._get_axis_number(axis)
  4570. obj_len = self.shape[axis]
  4571. # Process random_state argument
  4572. rs = com.random_state(random_state)
  4573. size = sample.process_sampling_size(n, frac, replace)
  4574. if size is None:
  4575. assert frac is not None
  4576. size = round(frac * obj_len)
  4577. if weights is not None:
  4578. weights = sample.preprocess_weights(self, weights, axis)
  4579. sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
  4580. result = self.take(sampled_indices, axis=axis)
  4581. if ignore_index:
  4582. result.index = default_index(len(result))
  4583. return result
  4584. @final
  4585. @doc(klass=_shared_doc_kwargs["klass"])
  4586. def pipe(
  4587. self,
  4588. func: Callable[..., T] | tuple[Callable[..., T], str],
  4589. *args,
  4590. **kwargs,
  4591. ) -> T:
  4592. r"""
  4593. Apply func(self, \*args, \*\*kwargs).
  4594. Parameters
  4595. ----------
  4596. func : function
  4597. Function to apply to the {klass}.
  4598. ``args``, and ``kwargs`` are passed into ``func``.
  4599. Alternatively a ``(callable, data_keyword)`` tuple where
  4600. ``data_keyword`` is a string indicating the keyword of
  4601. ``callable`` that expects the {klass}.
  4602. args : iterable, optional
  4603. Positional arguments passed into ``func``.
  4604. kwargs : mapping, optional
  4605. A dictionary of keyword arguments passed into ``func``.
  4606. Returns
  4607. -------
  4608. object : the return type of ``func``.
  4609. See Also
  4610. --------
  4611. DataFrame.apply : Apply a function along input axis of DataFrame.
  4612. DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
  4613. Series.map : Apply a mapping correspondence on a
  4614. :class:`~pandas.Series`.
  4615. Notes
  4616. -----
  4617. Use ``.pipe`` when chaining together functions that expect
  4618. Series, DataFrames or GroupBy objects. Instead of writing
  4619. >>> func(g(h(df), arg1=a), arg2=b, arg3=c) # doctest: +SKIP
  4620. You can write
  4621. >>> (df.pipe(h)
  4622. ... .pipe(g, arg1=a)
  4623. ... .pipe(func, arg2=b, arg3=c)
  4624. ... ) # doctest: +SKIP
  4625. If you have a function that takes the data as (say) the second
  4626. argument, pass a tuple indicating which keyword expects the
  4627. data. For example, suppose ``f`` takes its data as ``arg2``:
  4628. >>> (df.pipe(h)
  4629. ... .pipe(g, arg1=a)
  4630. ... .pipe((func, 'arg2'), arg1=a, arg3=c)
  4631. ... ) # doctest: +SKIP
  4632. """
  4633. return com.pipe(self, func, *args, **kwargs)
  4634. # ----------------------------------------------------------------------
  4635. # Attribute access
  4636. @final
  4637. def __finalize__(
  4638. self: NDFrameT, other, method: str | None = None, **kwargs
  4639. ) -> NDFrameT:
  4640. """
  4641. Propagate metadata from other to self.
  4642. Parameters
  4643. ----------
  4644. other : the object from which to get the attributes that we are going
  4645. to propagate
  4646. method : str, optional
  4647. A passed method name providing context on where ``__finalize__``
  4648. was called.
  4649. .. warning::
4650. The value passed as `method` is not currently considered
4651. stable across pandas releases.
  4652. """
  4653. if isinstance(other, NDFrame):
  4654. for name in other.attrs:
  4655. self.attrs[name] = other.attrs[name]
  4656. self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
  4657. # For subclasses using _metadata.
  4658. for name in set(self._metadata) & set(other._metadata):
  4659. assert isinstance(name, str)
  4660. object.__setattr__(self, name, getattr(other, name, None))
  4661. if method == "concat":
  4662. allows_duplicate_labels = all(
  4663. x.flags.allows_duplicate_labels for x in other.objs
  4664. )
  4665. self.flags.allows_duplicate_labels = allows_duplicate_labels
  4666. return self
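# Illustrative sketch of metadata propagation through ``__finalize__``
# (the ``attrs`` key below is arbitrary):
#   >>> df = pd.DataFrame({"a": [1]})
#   >>> df.attrs["source"] = "sensor"
#   >>> df.copy().attrs  # copy() calls __finalize__(self, method="copy")
#   {'source': 'sensor'}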
  4667. def __getattr__(self, name: str):
  4668. """
  4669. After regular attribute access, try looking up the name
  4670. This allows simpler access to columns for interactive use.
  4671. """
  4672. # Note: obj.x will always call obj.__getattribute__('x') prior to
  4673. # calling obj.__getattr__('x').
  4674. if (
  4675. name not in self._internal_names_set
  4676. and name not in self._metadata
  4677. and name not in self._accessors
  4678. and self._info_axis._can_hold_identifiers_and_holds_name(name)
  4679. ):
  4680. return self[name]
  4681. return object.__getattribute__(self, name)
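# Illustrative: a column whose name is a valid identifier is reachable
# as an attribute via the fallback above:
#   >>> df = pd.DataFrame({"num_legs": [4, 2]})
#   >>> df.num_legs.tolist()
#   [4, 2]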
  4682. def __setattr__(self, name: str, value) -> None:
  4683. """
  4684. After regular attribute access, try setting the name
  4685. This allows simpler access to columns for interactive use.
  4686. """
  4687. # first try regular attribute access via __getattribute__, so that
  4688. # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
  4689. # the same attribute.
  4690. try:
  4691. object.__getattribute__(self, name)
  4692. return object.__setattr__(self, name, value)
  4693. except AttributeError:
  4694. pass
  4695. # if this fails, go on to more involved attribute setting
  4696. # (note that this matches __getattr__, above).
  4697. if name in self._internal_names_set:
  4698. object.__setattr__(self, name, value)
  4699. elif name in self._metadata:
  4700. object.__setattr__(self, name, value)
  4701. else:
  4702. try:
  4703. existing = getattr(self, name)
  4704. if isinstance(existing, Index):
  4705. object.__setattr__(self, name, value)
  4706. elif name in self._info_axis:
  4707. self[name] = value
  4708. else:
  4709. object.__setattr__(self, name, value)
  4710. except (AttributeError, TypeError):
  4711. if isinstance(self, ABCDataFrame) and (is_list_like(value)):
  4712. warnings.warn(
  4713. "Pandas doesn't allow columns to be "
  4714. "created via a new attribute name - see "
  4715. "https://pandas.pydata.org/pandas-docs/"
  4716. "stable/indexing.html#attribute-access",
  4717. stacklevel=2,
  4718. )
  4719. object.__setattr__(self, name, value)
  4720. @final
  4721. def _dir_additions(self) -> set[str]:
  4722. """
  4723. add the string-like attributes from the info_axis.
  4724. If info_axis is a MultiIndex, its first level values are used.
  4725. """
  4726. additions = super()._dir_additions()
  4727. if self._info_axis._can_hold_strings:
  4728. additions.update(self._info_axis._dir_additions_for_owner)
  4729. return additions
  4730. # ----------------------------------------------------------------------
  4731. # Consolidation of internals
  4732. @final
  4733. def _protect_consolidate(self, f):
  4734. """
  4735. Consolidate _mgr -- if the blocks have changed, then clear the
  4736. cache
  4737. """
  4738. if isinstance(self._mgr, (ArrayManager, SingleArrayManager)):
  4739. return f()
  4740. blocks_before = len(self._mgr.blocks)
  4741. result = f()
  4742. if len(self._mgr.blocks) != blocks_before:
  4743. self._clear_item_cache()
  4744. return result
  4745. @final
  4746. def _consolidate_inplace(self) -> None:
  4747. """Consolidate data in place and return None"""
  4748. def f():
  4749. self._mgr = self._mgr.consolidate()
  4750. self._protect_consolidate(f)
  4751. @final
  4752. def _consolidate(self):
  4753. """
  4754. Compute NDFrame with "consolidated" internals (data of each dtype
  4755. grouped together in a single ndarray).
  4756. Returns
  4757. -------
  4758. consolidated : same type as caller
  4759. """
  4760. f = lambda: self._mgr.consolidate()
  4761. cons_data = self._protect_consolidate(f)
  4762. return self._constructor(cons_data).__finalize__(self)
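# Illustrative sketch, assuming the default BlockManager backend:
# consolidation groups same-dtype columns into single blocks, so a
# frame with one int and one float column ends up with two blocks.
#   >>> df = pd.DataFrame({"a": [1], "b": [2.5]})
#   >>> df._consolidate()._mgr.nblocks
#   2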
  4763. @final
  4764. @property
  4765. def _is_mixed_type(self) -> bool_t:
  4766. if self._mgr.is_single_block:
  4767. return False
  4768. if self._mgr.any_extension_types:
  4769. # Even if they have the same dtype, we can't consolidate them,
4770. # so we pretend this is "mixed"
  4771. return True
  4772. return self.dtypes.nunique() > 1
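# Illustrative: mixed dtypes across columns make the frame "mixed":
#   >>> pd.DataFrame({"a": [1], "b": [1.0]})._is_mixed_type
#   True
#   >>> pd.DataFrame({"a": [1], "b": [2]})._is_mixed_type
#   False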
  4773. @final
  4774. def _check_inplace_setting(self, value) -> bool_t:
  4775. """check whether we allow in-place setting with this type of value"""
  4776. if self._is_mixed_type and not self._mgr.is_numeric_mixed_type:
  4777. # allow an actual np.nan thru
  4778. if is_float(value) and np.isnan(value):
  4779. return True
  4780. raise TypeError(
  4781. "Cannot do inplace boolean setting on "
  4782. "mixed-types with a non np.nan value"
  4783. )
  4784. return True
  4785. @final
  4786. def _get_numeric_data(self):
  4787. return self._constructor(self._mgr.get_numeric_data()).__finalize__(self)
  4788. @final
  4789. def _get_bool_data(self):
  4790. return self._constructor(self._mgr.get_bool_data()).__finalize__(self)
  4791. # ----------------------------------------------------------------------
  4792. # Internal Interface Methods
  4793. @property
  4794. def values(self) -> np.ndarray:
  4795. raise AbstractMethodError(self)
  4796. @property
  4797. def _values(self) -> np.ndarray:
  4798. """internal implementation"""
  4799. raise AbstractMethodError(self)
  4800. @property
  4801. def dtypes(self):
  4802. """
  4803. Return the dtypes in the DataFrame.
  4804. This returns a Series with the data type of each column.
  4805. The result's index is the original DataFrame's columns. Columns
  4806. with mixed types are stored with the ``object`` dtype. See
  4807. :ref:`the User Guide <basics.dtypes>` for more.
  4808. Returns
  4809. -------
  4810. pandas.Series
  4811. The data type of each column.
  4812. Examples
  4813. --------
  4814. >>> df = pd.DataFrame({'float': [1.0],
  4815. ... 'int': [1],
  4816. ... 'datetime': [pd.Timestamp('20180310')],
  4817. ... 'string': ['foo']})
  4818. >>> df.dtypes
  4819. float float64
  4820. int int64
  4821. datetime datetime64[ns]
  4822. string object
  4823. dtype: object
  4824. """
  4825. data = self._mgr.get_dtypes()
  4826. return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
  4827. def astype(
  4828. self: NDFrameT, dtype, copy: bool_t = True, errors: str = "raise"
  4829. ) -> NDFrameT:
  4830. """
  4831. Cast a pandas object to a specified dtype ``dtype``.
  4832. Parameters
  4833. ----------
  4834. dtype : data type, or dict of column name -> data type
  4835. Use a numpy.dtype or Python type to cast entire pandas object to
  4836. the same type. Alternatively, use {col: dtype, ...}, where col is a
  4837. column label and dtype is a numpy.dtype or Python type to cast one
  4838. or more of the DataFrame's columns to column-specific types.
  4839. copy : bool, default True
  4840. Return a copy when ``copy=True`` (be very careful setting
  4841. ``copy=False`` as changes to values then may propagate to other
  4842. pandas objects).
  4843. errors : {'raise', 'ignore'}, default 'raise'
  4844. Control raising of exceptions on invalid data for provided dtype.
  4845. - ``raise`` : allow exceptions to be raised
  4846. - ``ignore`` : suppress exceptions. On error return original object.
  4847. Returns
  4848. -------
  4849. casted : same type as caller
  4850. See Also
  4851. --------
  4852. to_datetime : Convert argument to datetime.
  4853. to_timedelta : Convert argument to timedelta.
  4854. to_numeric : Convert argument to a numeric type.
  4855. numpy.ndarray.astype : Cast a numpy array to a specified type.
  4856. Notes
  4857. -----
  4858. .. deprecated:: 1.3.0
  4859. Using ``astype`` to convert from timezone-naive dtype to
  4860. timezone-aware dtype is deprecated and will raise in a
  4861. future version. Use :meth:`Series.dt.tz_localize` instead.
  4862. Examples
  4863. --------
  4864. Create a DataFrame:
  4865. >>> d = {'col1': [1, 2], 'col2': [3, 4]}
  4866. >>> df = pd.DataFrame(data=d)
  4867. >>> df.dtypes
  4868. col1 int64
  4869. col2 int64
  4870. dtype: object
  4871. Cast all columns to int32:
  4872. >>> df.astype('int32').dtypes
  4873. col1 int32
  4874. col2 int32
  4875. dtype: object
  4876. Cast col1 to int32 using a dictionary:
  4877. >>> df.astype({'col1': 'int32'}).dtypes
  4878. col1 int32
  4879. col2 int64
  4880. dtype: object
  4881. Create a series:
  4882. >>> ser = pd.Series([1, 2], dtype='int32')
  4883. >>> ser
  4884. 0 1
  4885. 1 2
  4886. dtype: int32
  4887. >>> ser.astype('int64')
  4888. 0 1
  4889. 1 2
  4890. dtype: int64
  4891. Convert to categorical type:
  4892. >>> ser.astype('category')
  4893. 0 1
  4894. 1 2
  4895. dtype: category
  4896. Categories (2, int64): [1, 2]
  4897. Convert to ordered categorical type with custom ordering:
  4898. >>> from pandas.api.types import CategoricalDtype
  4899. >>> cat_dtype = CategoricalDtype(
  4900. ... categories=[2, 1], ordered=True)
  4901. >>> ser.astype(cat_dtype)
  4902. 0 1
  4903. 1 2
  4904. dtype: category
  4905. Categories (2, int64): [2 < 1]
  4906. Note that using ``copy=False`` and changing data on a new
  4907. pandas object may propagate changes:
  4908. >>> s1 = pd.Series([1, 2])
  4909. >>> s2 = s1.astype('int64', copy=False)
  4910. >>> s2[0] = 10
  4911. >>> s1 # note that s1[0] has changed too
  4912. 0 10
  4913. 1 2
  4914. dtype: int64
  4915. Create a series of dates:
  4916. >>> ser_date = pd.Series(pd.date_range('20200101', periods=3))
  4917. >>> ser_date
  4918. 0 2020-01-01
  4919. 1 2020-01-02
  4920. 2 2020-01-03
  4921. dtype: datetime64[ns]
  4922. """
  4923. if is_dict_like(dtype):
  4924. if self.ndim == 1: # i.e. Series
  4925. if len(dtype) > 1 or self.name not in dtype:
  4926. raise KeyError(
  4927. "Only the Series name can be used for "
  4928. "the key in Series dtype mappings."
  4929. )
  4930. new_type = dtype[self.name]
  4931. return self.astype(new_type, copy, errors)
  4932. for col_name in dtype.keys():
  4933. if col_name not in self:
  4934. raise KeyError(
  4935. "Only a column name can be used for the "
  4936. "key in a dtype mappings argument."
  4937. )
  4938. results = []
  4939. for col_name, col in self.items():
  4940. if col_name in dtype:
  4941. results.append(
  4942. col.astype(dtype=dtype[col_name], copy=copy, errors=errors)
  4943. )
  4944. else:
  4945. results.append(col.copy() if copy else col)
  4946. elif is_extension_array_dtype(dtype) and self.ndim > 1:
  4947. # GH 18099/22869: columnwise conversion to extension dtype
  4948. # GH 24704: use iloc to handle duplicate column names
  4949. # TODO(EA2D): special case not needed with 2D EAs
  4950. results = [
  4951. self.iloc[:, i].astype(dtype, copy=copy)
  4952. for i in range(len(self.columns))
  4953. ]
  4954. else:
  4955. # else, only a single dtype is given
  4956. new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
  4957. return self._constructor(new_data).__finalize__(self, method="astype")
  4958. # GH 33113: handle empty frame or series
  4959. if not results:
  4960. return self.copy()
  4961. # GH 19920: retain column metadata after concat
  4962. result = concat(results, axis=1, copy=False)
  4963. result.columns = self.columns
  4964. # https://github.com/python/mypy/issues/8354
  4965. return cast(NDFrameT, result)
  4966. @final
  4967. def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
  4968. """
  4969. Make a copy of this object's indices and data.
  4970. When ``deep=True`` (default), a new object will be created with a
  4971. copy of the calling object's data and indices. Modifications to
  4972. the data or indices of the copy will not be reflected in the
  4973. original object (see notes below).
  4974. When ``deep=False``, a new object will be created without copying
  4975. the calling object's data or index (only references to the data
  4976. and index are copied). Any changes to the data of the original
  4977. will be reflected in the shallow copy (and vice versa).
  4978. Parameters
  4979. ----------
  4980. deep : bool, default True
  4981. Make a deep copy, including a copy of the data and the indices.
  4982. With ``deep=False`` neither the indices nor the data are copied.
  4983. Returns
  4984. -------
  4985. copy : Series or DataFrame
  4986. Object type matches caller.
  4987. Notes
  4988. -----
  4989. When ``deep=True``, data is copied but actual Python objects
  4990. will not be copied recursively, only the reference to the object.
  4991. This is in contrast to `copy.deepcopy` in the Standard Library,
  4992. which recursively copies object data (see examples below).
  4993. While ``Index`` objects are copied when ``deep=True``, the underlying
  4994. numpy array is not copied for performance reasons. Since ``Index`` is
  4995. immutable, the underlying data can be safely shared and a copy
  4996. is not needed.
  4997. Examples
  4998. --------
  4999. >>> s = pd.Series([1, 2], index=["a", "b"])
  5000. >>> s
  5001. a 1
  5002. b 2
  5003. dtype: int64
  5004. >>> s_copy = s.copy()
  5005. >>> s_copy
  5006. a 1
  5007. b 2
  5008. dtype: int64
  5009. **Shallow copy versus default (deep) copy:**
  5010. >>> s = pd.Series([1, 2], index=["a", "b"])
  5011. >>> deep = s.copy()
  5012. >>> shallow = s.copy(deep=False)
  5013. Shallow copy shares data and index with original.
  5014. >>> s is shallow
  5015. False
  5016. >>> s.values is shallow.values and s.index is shallow.index
  5017. True
  5018. Deep copy has own copy of data and index.
  5019. >>> s is deep
  5020. False
  5021. >>> s.values is deep.values or s.index is deep.index
  5022. False
5023. Updates to the data shared by the shallow copy and original are reflected
5024. in both; the deep copy remains unchanged.
  5025. >>> s[0] = 3
  5026. >>> shallow[1] = 4
  5027. >>> s
  5028. a 3
  5029. b 4
  5030. dtype: int64
  5031. >>> shallow
  5032. a 3
  5033. b 4
  5034. dtype: int64
  5035. >>> deep
  5036. a 1
  5037. b 2
  5038. dtype: int64
  5039. Note that when copying an object containing Python objects, a deep copy
  5040. will copy the data, but will not do so recursively. Updating a nested
  5041. data object will be reflected in the deep copy.
  5042. >>> s = pd.Series([[1, 2], [3, 4]])
  5043. >>> deep = s.copy()
  5044. >>> s[0][0] = 10
  5045. >>> s
  5046. 0 [10, 2]
  5047. 1 [3, 4]
  5048. dtype: object
  5049. >>> deep
  5050. 0 [10, 2]
  5051. 1 [3, 4]
  5052. dtype: object
  5053. """
  5054. data = self._mgr.copy(deep=deep)
  5055. self._clear_item_cache()
  5056. return self._constructor(data).__finalize__(self, method="copy")
  5057. @final
  5058. def __copy__(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
  5059. return self.copy(deep=deep)
  5060. @final
  5061. def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
  5062. """
  5063. Parameters
  5064. ----------
  5065. memo, default None
  5066. Standard signature. Unused
  5067. """
  5068. return self.copy(deep=True)
  5069. @final
  5070. def _convert(
  5071. self: NDFrameT,
  5072. datetime: bool_t = False,
  5073. numeric: bool_t = False,
  5074. timedelta: bool_t = False,
  5075. ) -> NDFrameT:
  5076. """
  5077. Attempt to infer better dtype for object columns
  5078. Parameters
  5079. ----------
  5080. datetime : bool, default False
  5081. If True, convert to date where possible.
  5082. numeric : bool, default False
  5083. If True, attempt to convert to numbers (including strings), with
  5084. unconvertible values becoming NaN.
  5085. timedelta : bool, default False
  5086. If True, convert to timedelta where possible.
  5087. Returns
  5088. -------
  5089. converted : same as input object
  5090. """
  5091. validate_bool_kwarg(datetime, "datetime")
  5092. validate_bool_kwarg(numeric, "numeric")
  5093. validate_bool_kwarg(timedelta, "timedelta")
  5094. return self._constructor(
  5095. self._mgr.convert(
  5096. datetime=datetime,
  5097. numeric=numeric,
  5098. timedelta=timedelta,
  5099. copy=True,
  5100. )
  5101. ).__finalize__(self)
  5102. @final
  5103. def infer_objects(self: NDFrameT) -> NDFrameT:
  5104. """
  5105. Attempt to infer better dtypes for object columns.
  5106. Attempts soft conversion of object-dtyped
  5107. columns, leaving non-object and unconvertible
  5108. columns unchanged. The inference rules are the
  5109. same as during normal Series/DataFrame construction.
  5110. Returns
  5111. -------
  5112. converted : same type as input object
  5113. See Also
  5114. --------
  5115. to_datetime : Convert argument to datetime.
  5116. to_timedelta : Convert argument to timedelta.
  5117. to_numeric : Convert argument to numeric type.
  5118. convert_dtypes : Convert argument to best possible dtype.
  5119. Examples
  5120. --------
  5121. >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
  5122. >>> df = df.iloc[1:]
  5123. >>> df
  5124. A
  5125. 1 1
  5126. 2 2
  5127. 3 3
  5128. >>> df.dtypes
  5129. A object
  5130. dtype: object
  5131. >>> df.infer_objects().dtypes
  5132. A int64
  5133. dtype: object
  5134. """
  5135. # numeric=False necessary to only soft convert;
  5136. # python objects will still be converted to
  5137. # native numpy numeric types
  5138. return self._constructor(
  5139. self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True)
  5140. ).__finalize__(self, method="infer_objects")
  5141. @final
  5142. def convert_dtypes(
  5143. self: NDFrameT,
  5144. infer_objects: bool_t = True,
  5145. convert_string: bool_t = True,
  5146. convert_integer: bool_t = True,
  5147. convert_boolean: bool_t = True,
  5148. convert_floating: bool_t = True,
  5149. ) -> NDFrameT:
  5150. """
  5151. Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
  5152. .. versionadded:: 1.0.0
  5153. Parameters
  5154. ----------
  5155. infer_objects : bool, default True
  5156. Whether object dtypes should be converted to the best possible types.
  5157. convert_string : bool, default True
  5158. Whether object dtypes should be converted to ``StringDtype()``.
  5159. convert_integer : bool, default True
  5160. Whether, if possible, conversion can be done to integer extension types.
5161. convert_boolean : bool, default True
5162. Whether object dtypes should be converted to ``BooleanDtype()``.
5163. convert_floating : bool, default True
5164. Whether, if possible, conversion can be done to floating extension types.
5165. If `convert_integer` is also True, preference will be given to integer
5166. dtypes if the floats can be faithfully cast to integers.
  5167. .. versionadded:: 1.2.0
  5168. Returns
  5169. -------
  5170. Series or DataFrame
  5171. Copy of input object with new dtype.
  5172. See Also
  5173. --------
  5174. infer_objects : Infer dtypes of objects.
  5175. to_datetime : Convert argument to datetime.
  5176. to_timedelta : Convert argument to timedelta.
  5177. to_numeric : Convert argument to a numeric type.
  5178. Notes
  5179. -----
  5180. By default, ``convert_dtypes`` will attempt to convert a Series (or each
  5181. Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
  5182. ``convert_string``, ``convert_integer``, ``convert_boolean`` and
5183. ``convert_floating``, it is possible to turn off individual conversions
  5184. to ``StringDtype``, the integer extension types, ``BooleanDtype``
  5185. or floating extension types, respectively.
  5186. For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
  5187. rules as during normal Series/DataFrame construction. Then, if possible,
  5188. convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
  5189. or floating extension type, otherwise leave as ``object``.
  5190. If the dtype is integer, convert to an appropriate integer extension type.
  5191. If the dtype is numeric, and consists of all integers, convert to an
  5192. appropriate integer extension type. Otherwise, convert to an
  5193. appropriate floating extension type.
  5194. .. versionchanged:: 1.2
  5195. Starting with pandas 1.2, this method also converts float columns
  5196. to the nullable floating extension type.
  5197. In the future, as new dtypes are added that support ``pd.NA``, the results
  5198. of this method will change to support those new dtypes.
  5199. Examples
  5200. --------
  5201. >>> df = pd.DataFrame(
  5202. ... {
  5203. ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
  5204. ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
  5205. ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
  5206. ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
  5207. ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
  5208. ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
  5209. ... }
  5210. ... )
  5211. Start with a DataFrame with default dtypes.
  5212. >>> df
  5213. a b c d e f
  5214. 0 1 x True h 10.0 NaN
  5215. 1 2 y False i NaN 100.5
  5216. 2 3 z NaN NaN 20.0 200.0
  5217. >>> df.dtypes
  5218. a int32
  5219. b object
  5220. c object
  5221. d object
  5222. e float64
  5223. f float64
  5224. dtype: object
  5225. Convert the DataFrame to use best possible dtypes.
  5226. >>> dfn = df.convert_dtypes()
  5227. >>> dfn
  5228. a b c d e f
  5229. 0 1 x True h 10 <NA>
  5230. 1 2 y False i <NA> 100.5
  5231. 2 3 z <NA> <NA> 20 200.0
  5232. >>> dfn.dtypes
  5233. a Int32
  5234. b string
  5235. c boolean
  5236. d string
  5237. e Int64
  5238. f Float64
  5239. dtype: object
  5240. Start with a Series of strings and missing data represented by ``np.nan``.
  5241. >>> s = pd.Series(["a", "b", np.nan])
  5242. >>> s
  5243. 0 a
  5244. 1 b
  5245. 2 NaN
  5246. dtype: object
  5247. Obtain a Series with dtype ``StringDtype``.
  5248. >>> s.convert_dtypes()
  5249. 0 a
  5250. 1 b
  5251. 2 <NA>
  5252. dtype: string
  5253. """
  5254. if self.ndim == 1:
  5255. return self._convert_dtypes(
  5256. infer_objects,
  5257. convert_string,
  5258. convert_integer,
  5259. convert_boolean,
  5260. convert_floating,
  5261. )
  5262. else:
  5263. results = [
  5264. col._convert_dtypes(
  5265. infer_objects,
  5266. convert_string,
  5267. convert_integer,
  5268. convert_boolean,
  5269. convert_floating,
  5270. )
  5271. for col_name, col in self.items()
  5272. ]
  5273. if len(results) > 0:
  5274. # https://github.com/python/mypy/issues/8354
  5275. return cast(NDFrameT, concat(results, axis=1, copy=False))
  5276. else:
  5277. return self.copy()
  5278. # ----------------------------------------------------------------------
  5279. # Filling NA's
  5280. @doc(**_shared_doc_kwargs)
  5281. def fillna(
  5282. self: NDFrameT,
  5283. value=None,
  5284. method=None,
  5285. axis=None,
  5286. inplace: bool_t = False,
  5287. limit=None,
  5288. downcast=None,
  5289. ) -> NDFrameT | None:
  5290. """
  5291. Fill NA/NaN values using the specified method.
  5292. Parameters
  5293. ----------
  5294. value : scalar, dict, Series, or DataFrame
  5295. Value to use to fill holes (e.g. 0), alternately a
  5296. dict/Series/DataFrame of values specifying which value to use for
  5297. each index (for a Series) or column (for a DataFrame). Values not
  5298. in the dict/Series/DataFrame will not be filled. This value cannot
  5299. be a list.
  5300. method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
  5301. Method to use for filling holes in reindexed Series
  5302. pad / ffill: propagate last valid observation forward to next valid
  5303. backfill / bfill: use next valid observation to fill gap.
  5304. axis : {axes_single_arg}
  5305. Axis along which to fill missing values.
  5306. inplace : bool, default False
  5307. If True, fill in-place. Note: this will modify any
  5308. other views on this object (e.g., a no-copy slice for a column in a
  5309. DataFrame).
  5310. limit : int, default None
  5311. If method is specified, this is the maximum number of consecutive
  5312. NaN values to forward/backward fill. In other words, if there is
  5313. a gap with more than this number of consecutive NaNs, it will only
  5314. be partially filled. If method is not specified, this is the
  5315. maximum number of entries along the entire axis where NaNs will be
  5316. filled. Must be greater than 0 if not None.
  5317. downcast : dict, default is None
  5318. A dict of item->dtype of what to downcast if possible,
  5319. or the string 'infer' which will try to downcast to an appropriate
  5320. equal type (e.g. float64 to int64 if possible).
  5321. Returns
  5322. -------
  5323. {klass} or None
  5324. Object with missing values filled or None if ``inplace=True``.
  5325. See Also
  5326. --------
  5327. interpolate : Fill NaN values using interpolation.
  5328. reindex : Conform object to new index.
  5329. asfreq : Convert TimeSeries to specified frequency.
  5330. Examples
  5331. --------
  5332. >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
  5333. ... [3, 4, np.nan, 1],
  5334. ... [np.nan, np.nan, np.nan, np.nan],
  5335. ... [np.nan, 3, np.nan, 4]],
  5336. ... columns=list("ABCD"))
  5337. >>> df
  5338. A B C D
  5339. 0 NaN 2.0 NaN 0.0
  5340. 1 3.0 4.0 NaN 1.0
  5341. 2 NaN NaN NaN NaN
  5342. 3 NaN 3.0 NaN 4.0
  5343. Replace all NaN elements with 0s.
  5344. >>> df.fillna(0)
  5345. A B C D
  5346. 0 0.0 2.0 0.0 0.0
  5347. 1 3.0 4.0 0.0 1.0
  5348. 2 0.0 0.0 0.0 0.0
  5349. 3 0.0 3.0 0.0 4.0
  5350. We can also propagate non-null values forward or backward.
  5351. >>> df.fillna(method="ffill")
  5352. A B C D
  5353. 0 NaN 2.0 NaN 0.0
  5354. 1 3.0 4.0 NaN 1.0
  5355. 2 3.0 4.0 NaN 1.0
  5356. 3 3.0 3.0 NaN 4.0
  5357. Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
  5358. 2, and 3 respectively.
  5359. >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
  5360. >>> df.fillna(value=values)
  5361. A B C D
  5362. 0 0.0 2.0 2.0 0.0
  5363. 1 3.0 4.0 2.0 1.0
  5364. 2 0.0 1.0 2.0 3.0
  5365. 3 0.0 3.0 2.0 4.0
  5366. Only replace the first NaN element.
  5367. >>> df.fillna(value=values, limit=1)
  5368. A B C D
  5369. 0 0.0 2.0 2.0 0.0
  5370. 1 3.0 4.0 NaN 1.0
  5371. 2 NaN 1.0 NaN 3.0
  5372. 3 NaN 3.0 NaN 4.0
  5373. When filling using a DataFrame, replacement happens along
5374. the same column names and same indices.
  5375. >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
  5376. >>> df.fillna(df2)
  5377. A B C D
  5378. 0 0.0 2.0 0.0 0.0
  5379. 1 3.0 4.0 0.0 1.0
  5380. 2 0.0 0.0 0.0 NaN
  5381. 3 0.0 3.0 0.0 4.0
  5382. Note that column D is not affected since it is not present in df2.
  5383. """
  5384. inplace = validate_bool_kwarg(inplace, "inplace")
  5385. value, method = validate_fillna_kwargs(value, method)
  5386. self._consolidate_inplace()
5387. # set the default here, so functions examining the signature
  5388. # can detect if something was set (e.g. in groupby) (GH9221)
  5389. if axis is None:
  5390. axis = 0
  5391. axis = self._get_axis_number(axis)
  5392. if value is None:
  5393. if not self._mgr.is_single_block and axis == 1:
  5394. if inplace:
  5395. raise NotImplementedError()
  5396. result = self.T.fillna(method=method, limit=limit).T
  5397. return result
  5398. new_data = self._mgr.interpolate(
  5399. method=method,
  5400. axis=axis,
  5401. limit=limit,
  5402. inplace=inplace,
  5403. coerce=True,
  5404. downcast=downcast,
  5405. )
  5406. else:
  5407. if self.ndim == 1:
  5408. if isinstance(value, (dict, ABCSeries)):
  5409. value = create_series_with_explicit_dtype(
  5410. value, dtype_if_empty=object
  5411. )
  5412. value = value.reindex(self.index, copy=False)
  5413. value = value._values
  5414. elif not is_list_like(value):
  5415. pass
  5416. else:
  5417. raise TypeError(
  5418. '"value" parameter must be a scalar, dict '
  5419. "or Series, but you passed a "
  5420. f'"{type(value).__name__}"'
  5421. )
  5422. new_data = self._mgr.fillna(
  5423. value=value, limit=limit, inplace=inplace, downcast=downcast
  5424. )
  5425. elif isinstance(value, (dict, ABCSeries)):
  5426. if axis == 1:
  5427. raise NotImplementedError(
  5428. "Currently only can fill "
  5429. "with dict/Series column "
  5430. "by column"
  5431. )
  5432. result = self if inplace else self.copy()
  5433. is_dict = isinstance(downcast, dict)
  5434. for k, v in value.items():
  5435. if k not in result:
  5436. continue
  5437. downcast_k = downcast if not is_dict else downcast.get(k)
  5438. result[k] = result[k].fillna(v, limit=limit, downcast=downcast_k)
  5439. return result if not inplace else None
  5440. elif not is_list_like(value):
  5441. if not self._mgr.is_single_block and axis == 1:
  5442. result = self.T.fillna(value=value, limit=limit).T
  5443. new_data = result
  5444. else:
  5445. new_data = self._mgr.fillna(
  5446. value=value, limit=limit, inplace=inplace, downcast=downcast
  5447. )
  5448. elif isinstance(value, ABCDataFrame) and self.ndim == 2:
  5449. new_data = self.where(self.notna(), value)._mgr
  5450. else:
  5451. raise ValueError(f"invalid fill value with a {type(value)}")
  5452. result = self._constructor(new_data)
  5453. if inplace:
  5454. return self._update_inplace(result)
  5455. else:
  5456. return result.__finalize__(self, method="fillna")
  5457. @doc(klass=_shared_doc_kwargs["klass"])
  5458. def ffill(
  5459. self: NDFrameT,
  5460. axis: None | Axis = None,
  5461. inplace: bool_t = False,
  5462. limit: None | int = None,
  5463. downcast=None,
  5464. ) -> NDFrameT | None:
  5465. """
  5466. Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
  5467. Returns
  5468. -------
  5469. {klass} or None
  5470. Object with missing values filled or None if ``inplace=True``.
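Examples
--------
A minimal sketch of forward filling (illustrative data):
>>> s = pd.Series([1.0, np.nan, 3.0, np.nan])
>>> s.ffill()
0    1.0
1    1.0
2    3.0
3    3.0
dtype: float64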
  5471. """
  5472. return self.fillna(
  5473. method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
  5474. )
  5475. pad = ffill
  5476. @doc(klass=_shared_doc_kwargs["klass"])
  5477. def bfill(
  5478. self: NDFrameT,
  5479. axis: None | Axis = None,
  5480. inplace: bool_t = False,
  5481. limit: None | int = None,
  5482. downcast=None,
  5483. ) -> NDFrameT | None:
  5484. """
  5485. Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
  5486. Returns
  5487. -------
  5488. {klass} or None
  5489. Object with missing values filled or None if ``inplace=True``.
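Examples
--------
A minimal sketch of backward filling (illustrative data):
>>> s = pd.Series([np.nan, 2.0, np.nan])
>>> s.bfill()
0    2.0
1    2.0
2    NaN
dtype: float64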
  5490. """
  5491. return self.fillna(
  5492. method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
  5493. )
  5494. backfill = bfill
  5495. @doc(
  5496. _shared_docs["replace"],
  5497. klass=_shared_doc_kwargs["klass"],
  5498. inplace=_shared_doc_kwargs["inplace"],
  5499. replace_iloc=_shared_doc_kwargs["replace_iloc"],
  5500. )
  5501. def replace(
  5502. self,
  5503. to_replace=None,
  5504. value=None,
  5505. inplace: bool_t = False,
  5506. limit: int | None = None,
  5507. regex=False,
  5508. method="pad",
  5509. ):
  5510. if not (
  5511. is_scalar(to_replace)
  5512. or is_re_compilable(to_replace)
  5513. or is_list_like(to_replace)
  5514. ):
  5515. raise TypeError(
  5516. "Expecting 'to_replace' to be either a scalar, array-like, "
  5517. "dict or None, got invalid type "
  5518. f"{repr(type(to_replace).__name__)}"
  5519. )
  5520. inplace = validate_bool_kwarg(inplace, "inplace")
  5521. if not is_bool(regex) and to_replace is not None:
  5522. raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")
  5523. self._consolidate_inplace()
  5524. if value is None:
  5525. # passing a single value that is scalar like
  5526. # when value is None (GH5319), for compat
  5527. if not is_dict_like(to_replace) and not is_dict_like(regex):
  5528. to_replace = [to_replace]
  5529. if isinstance(to_replace, (tuple, list)):
  5530. if isinstance(self, ABCDataFrame):
  5531. result = self.apply(
  5532. self._constructor_sliced._replace_single,
  5533. args=(to_replace, method, inplace, limit),
  5534. )
  5535. if inplace:
  5536. return
  5537. return result
  5538. self = cast("Series", self)
  5539. return self._replace_single(to_replace, method, inplace, limit)
  5540. if not is_dict_like(to_replace):
  5541. if not is_dict_like(regex):
  5542. raise TypeError(
  5543. 'If "to_replace" and "value" are both None '
  5544. 'and "to_replace" is not a list, then '
  5545. "regex must be a mapping"
  5546. )
  5547. to_replace = regex
  5548. regex = True
  5549. items = list(to_replace.items())
  5550. if items:
  5551. keys, values = zip(*items)
  5552. else:
  5553. keys, values = ([], [])
  5554. are_mappings = [is_dict_like(v) for v in values]
  5555. if any(are_mappings):
  5556. if not all(are_mappings):
  5557. raise TypeError(
  5558. "If a nested mapping is passed, all values "
  5559. "of the top level mapping must be mappings"
  5560. )
  5561. # passed a nested dict/Series
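# e.g. to_replace={"A": {0: 10}} is split into
# to_replace={"A": [0]} and value={"A": [10]}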
  5562. to_rep_dict = {}
  5563. value_dict = {}
  5564. for k, v in items:
  5565. keys, values = list(zip(*v.items())) or ([], [])
  5566. to_rep_dict[k] = list(keys)
  5567. value_dict[k] = list(values)
  5568. to_replace, value = to_rep_dict, value_dict
  5569. else:
  5570. to_replace, value = keys, values
  5571. return self.replace(
  5572. to_replace, value, inplace=inplace, limit=limit, regex=regex
  5573. )
  5574. else:
  5575. # need a non-zero len on all axes
  5576. if not self.size:
  5577. if inplace:
  5578. return
  5579. return self.copy()
  5580. if is_dict_like(to_replace):
  5581. if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
  5582. # Note: Checking below for `in foo.keys()` instead of
  5583. # `in foo` is needed for when we have a Series and not dict
  5584. mapping = {
  5585. col: (to_replace[col], value[col])
  5586. for col in to_replace.keys()
  5587. if col in value.keys() and col in self
  5588. }
  5589. return self._replace_columnwise(mapping, inplace, regex)
  5590. # {'A': NA} -> 0
  5591. elif not is_list_like(value):
  5592. # Operate column-wise
  5593. if self.ndim == 1:
  5594. raise ValueError(
  5595. "Series.replace cannot use dict-like to_replace "
  5596. "and non-None value"
  5597. )
  5598. mapping = {
  5599. col: (to_rep, value) for col, to_rep in to_replace.items()
  5600. }
  5601. return self._replace_columnwise(mapping, inplace, regex)
  5602. else:
  5603. raise TypeError("value argument must be scalar, dict, or Series")
  5604. elif is_list_like(to_replace):
  5605. if not is_list_like(value):
  5606. # e.g. to_replace = [NA, ''] and value is 0,
  5607. # so we replace NA with 0 and then replace '' with 0
  5608. value = [value] * len(to_replace)
  5609. # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
  5610. if len(to_replace) != len(value):
  5611. raise ValueError(
  5612. f"Replacement lists must match in length. "
  5613. f"Expecting {len(to_replace)} got {len(value)} "
  5614. )
  5615. new_data = self._mgr.replace_list(
  5616. src_list=to_replace,
  5617. dest_list=value,
  5618. inplace=inplace,
  5619. regex=regex,
  5620. )
  5621. elif to_replace is None:
  5622. if not (
  5623. is_re_compilable(regex)
  5624. or is_list_like(regex)
  5625. or is_dict_like(regex)
  5626. ):
  5627. raise TypeError(
  5628. f"'regex' must be a string or a compiled regular expression "
  5629. f"or a list or dict of strings or regular expressions, "
  5630. f"you passed a {repr(type(regex).__name__)}"
  5631. )
  5632. return self.replace(
  5633. regex, value, inplace=inplace, limit=limit, regex=True
  5634. )
  5635. else:
  5636. # dest iterable dict-like
  5637. if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
  5638. # Operate column-wise
  5639. if self.ndim == 1:
  5640. raise ValueError(
  5641. "Series.replace cannot use dict-value and "
  5642. "non-None to_replace"
  5643. )
  5644. mapping = {col: (to_replace, val) for col, val in value.items()}
  5645. return self._replace_columnwise(mapping, inplace, regex)
  5646. elif not is_list_like(value): # NA -> 0
  5647. new_data = self._mgr.replace(
  5648. to_replace=to_replace, value=value, inplace=inplace, regex=regex
  5649. )
  5650. else:
  5651. raise TypeError(
  5652. f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
  5653. )
  5654. result = self._constructor(new_data)
  5655. if inplace:
  5656. return self._update_inplace(result)
  5657. else:
  5658. return result.__finalize__(self, method="replace")
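# Illustrative sketches of common ``replace`` calls (data is arbitrary):
#   >>> pd.Series([0, 1, 2]).replace(0, 5).tolist()
#   [5, 1, 2]
#   >>> pd.Series(['ab', 'cd']).replace(r'^a.', 'X', regex=True).tolist()
#   ['X', 'cd']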
  5659. def interpolate(
  5660. self: NDFrameT,
  5661. method: str = "linear",
  5662. axis: Axis = 0,
  5663. limit: int | None = None,
  5664. inplace: bool_t = False,
  5665. limit_direction: str | None = None,
  5666. limit_area: str | None = None,
  5667. downcast: str | None = None,
  5668. **kwargs,
  5669. ) -> NDFrameT | None:
  5670. """
  5671. Fill NaN values using an interpolation method.
  5672. Please note that only ``method='linear'`` is supported for
  5673. DataFrame/Series with a MultiIndex.
  5674. Parameters
  5675. ----------
  5676. method : str, default 'linear'
  5677. Interpolation technique to use. One of:
  5678. * 'linear': Ignore the index and treat the values as equally
  5679. spaced. This is the only method supported on MultiIndexes.
  5680. * 'time': Works on daily and higher resolution data to interpolate
  5681. given length of interval.
  5682. * 'index', 'values': use the actual numerical values of the index.
  5683. * 'pad': Fill in NaNs using existing values.
  5684. * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline',
  5685. 'barycentric', 'polynomial': Passed to
  5686. `scipy.interpolate.interp1d`. These methods use the numerical
  5687. values of the index. Both 'polynomial' and 'spline' require that
  5688. you also specify an `order` (int), e.g.
  5689. ``df.interpolate(method='polynomial', order=5)``.
  5690. * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
  5691. 'cubicspline': Wrappers around the SciPy interpolation methods of
  5692. similar names. See `Notes`.
  5693. * 'from_derivatives': Refers to
  5694. `scipy.interpolate.BPoly.from_derivatives` which
  5695. replaces 'piecewise_polynomial' interpolation method in
  5696. scipy 0.18.
  5697. axis : {{0 or 'index', 1 or 'columns', None}}, default None
  5698. Axis to interpolate along.
  5699. limit : int, optional
  5700. Maximum number of consecutive NaNs to fill. Must be greater than
  5701. 0.
  5702. inplace : bool, default False
  5703. Update the data in place if possible.
  5704. limit_direction : {{'forward', 'backward', 'both'}}, Optional
  5705. Consecutive NaNs will be filled in this direction.
  5706. If limit is specified:
  5707. * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
  5708. * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
5709. 'backward'.
  5710. If 'limit' is not specified:
  5711. * If 'method' is 'backfill' or 'bfill', the default is 'backward'
  5712. * else the default is 'forward'
  5713. .. versionchanged:: 1.1.0
  5714. raises ValueError if `limit_direction` is 'forward' or 'both' and
  5715. method is 'backfill' or 'bfill'.
  5716. raises ValueError if `limit_direction` is 'backward' or 'both' and
  5717. method is 'pad' or 'ffill'.
  5718. limit_area : {{`None`, 'inside', 'outside'}}, default None
  5719. If limit is specified, consecutive NaNs will be filled with this
  5720. restriction.
  5721. * ``None``: No fill restriction.
  5722. * 'inside': Only fill NaNs surrounded by valid values
  5723. (interpolate).
  5724. * 'outside': Only fill NaNs outside valid values (extrapolate).
  5725. downcast : optional, 'infer' or None, defaults to None
  5726. Downcast dtypes if possible.
  5727. ``**kwargs`` : optional
  5728. Keyword arguments to pass on to the interpolating function.
  5729. Returns
  5730. -------
  5731. Series or DataFrame or None
  5732. Returns the same object type as the caller, interpolated at
  5733. some or all ``NaN`` values or None if ``inplace=True``.
  5734. See Also
  5735. --------
  5736. fillna : Fill missing values using different methods.
  5737. scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
  5738. (Akima interpolator).
  5739. scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
  5740. Bernstein basis.
  5741. scipy.interpolate.interp1d : Interpolate a 1-D function.
  5742. scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
  5743. interpolator).
  5744. scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
  5745. interpolation.
  5746. scipy.interpolate.CubicSpline : Cubic spline data interpolator.
  5747. Notes
  5748. -----
  5749. The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
  5750. methods are wrappers around the respective SciPy implementations of
  5751. similar names. These use the actual numerical values of the index.
  5752. For more information on their behavior, see the
  5753. `SciPy documentation
  5754. <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__
  5755. and `SciPy tutorial
  5756. <https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html>`__.
  5757. Examples
  5758. --------
  5759. Filling in ``NaN`` in a :class:`~pandas.Series` via linear
  5760. interpolation.
  5761. >>> s = pd.Series([0, 1, np.nan, 3])
  5762. >>> s
  5763. 0 0.0
  5764. 1 1.0
  5765. 2 NaN
  5766. 3 3.0
  5767. dtype: float64
  5768. >>> s.interpolate()
  5769. 0 0.0
  5770. 1 1.0
  5771. 2 2.0
  5772. 3 3.0
  5773. dtype: float64
  5774. Filling in ``NaN`` in a Series by padding, but filling at most two
  5775. consecutive ``NaN`` at a time.
  5776. >>> s = pd.Series([np.nan, "single_one", np.nan,
  5777. ... "fill_two_more", np.nan, np.nan, np.nan,
  5778. ... 4.71, np.nan])
  5779. >>> s
  5780. 0 NaN
  5781. 1 single_one
  5782. 2 NaN
  5783. 3 fill_two_more
  5784. 4 NaN
  5785. 5 NaN
  5786. 6 NaN
  5787. 7 4.71
  5788. 8 NaN
  5789. dtype: object
  5790. >>> s.interpolate(method='pad', limit=2)
  5791. 0 NaN
  5792. 1 single_one
  5793. 2 single_one
  5794. 3 fill_two_more
  5795. 4 fill_two_more
  5796. 5 fill_two_more
  5797. 6 NaN
  5798. 7 4.71
  5799. 8 4.71
  5800. dtype: object
  5801. Filling in ``NaN`` in a Series via polynomial interpolation or splines:
  5802. Both 'polynomial' and 'spline' methods require that you also specify
  5803. an ``order`` (int).
  5804. >>> s = pd.Series([0, 2, np.nan, 8])
  5805. >>> s.interpolate(method='polynomial', order=2)
  5806. 0 0.000000
  5807. 1 2.000000
  5808. 2 4.666667
  5809. 3 8.000000
  5810. dtype: float64
  5811. Fill the DataFrame forward (that is, going down) along each column
  5812. using linear interpolation.
  5813. Note how the last entry in column 'a' is interpolated differently,
  5814. because there is no entry after it to use for interpolation.
  5815. Note how the first entry in column 'b' remains ``NaN``, because there
  5816. is no entry before it to use for interpolation.
  5817. >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
  5818. ... (np.nan, 2.0, np.nan, np.nan),
  5819. ... (2.0, 3.0, np.nan, 9.0),
  5820. ... (np.nan, 4.0, -4.0, 16.0)],
  5821. ... columns=list('abcd'))
  5822. >>> df
  5823. a b c d
  5824. 0 0.0 NaN -1.0 1.0
  5825. 1 NaN 2.0 NaN NaN
  5826. 2 2.0 3.0 NaN 9.0
  5827. 3 NaN 4.0 -4.0 16.0
  5828. >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
  5829. a b c d
  5830. 0 0.0 NaN -1.0 1.0
  5831. 1 1.0 2.0 -2.0 5.0
  5832. 2 2.0 3.0 -3.0 9.0
  5833. 3 2.0 4.0 -4.0 16.0
  5834. Using polynomial interpolation.
  5835. >>> df['d'].interpolate(method='polynomial', order=2)
  5836. 0 1.0
  5837. 1 4.0
  5838. 2 9.0
  5839. 3 16.0
  5840. Name: d, dtype: float64
  5841. """
  5842. inplace = validate_bool_kwarg(inplace, "inplace")
  5843. axis = self._get_axis_number(axis)
  5844. fillna_methods = ["ffill", "bfill", "pad", "backfill"]
  5845. should_transpose = axis == 1 and method not in fillna_methods
  5846. obj = self.T if should_transpose else self
  5847. if obj.empty:
  5848. return self.copy()
  5849. if method not in fillna_methods:
  5850. axis = self._info_axis_number
  5851. if isinstance(obj.index, MultiIndex) and method != "linear":
  5852. raise ValueError(
  5853. "Only `method=linear` interpolation is supported on MultiIndexes."
  5854. )
  5855. # Set `limit_direction` depending on `method`
  5856. if limit_direction is None:
  5857. limit_direction = (
  5858. "backward" if method in ("backfill", "bfill") else "forward"
  5859. )
  5860. else:
  5861. if method in ("pad", "ffill") and limit_direction != "forward":
  5862. raise ValueError(
  5863. f"`limit_direction` must be 'forward' for method `{method}`"
  5864. )
  5865. if method in ("backfill", "bfill") and limit_direction != "backward":
  5866. raise ValueError(
  5867. f"`limit_direction` must be 'backward' for method `{method}`"
  5868. )
  5869. if obj.ndim == 2 and np.all(obj.dtypes == np.dtype("object")):
  5870. raise TypeError(
  5871. "Cannot interpolate with all object-dtype columns "
  5872. "in the DataFrame. Try setting at least one "
  5873. "column to a numeric dtype."
  5874. )
  5875. # create/use the index
  5876. if method == "linear":
  5877. # prior default
  5878. index = np.arange(len(obj.index))
  5879. index = Index(index)
  5880. else:
  5881. index = obj.index
  5882. methods = {"index", "values", "nearest", "time"}
  5883. is_numeric_or_datetime = (
  5884. is_numeric_dtype(index.dtype)
  5885. or is_datetime64_any_dtype(index.dtype)
  5886. or is_timedelta64_dtype(index.dtype)
  5887. )
  5888. if method not in methods and not is_numeric_or_datetime:
  5889. raise ValueError(
  5890. "Index column must be numeric or datetime type when "
  5891. f"using {method} method other than linear. "
  5892. "Try setting a numeric or datetime index column before "
  5893. "interpolating."
  5894. )
  5895. if isna(index).any():
  5896. raise NotImplementedError(
  5897. "Interpolation with NaNs in the index "
  5898. "has not been implemented. Try filling "
  5899. "those NaNs before interpolating."
  5900. )
  5901. new_data = obj._mgr.interpolate(
  5902. method=method,
  5903. axis=axis,
  5904. index=index,
  5905. limit=limit,
  5906. limit_direction=limit_direction,
  5907. limit_area=limit_area,
  5908. inplace=inplace,
  5909. downcast=downcast,
  5910. **kwargs,
  5911. )
  5912. result = self._constructor(new_data)
  5913. if should_transpose:
  5914. result = result.T
  5915. if inplace:
  5916. return self._update_inplace(result)
  5917. else:
  5918. return result.__finalize__(self, method="interpolate")
  5919. # ----------------------------------------------------------------------
5920. # Timeseries Methods
  5921. @final
  5922. def asof(self, where, subset=None):
  5923. """
  5924. Return the last row(s) without any NaNs before `where`.
  5925. The last row (for each element in `where`, if list) without any
  5926. NaN is taken.
5927. In the case of a :class:`~pandas.DataFrame`, the last row without NaN is
5928. taken, considering only the subset of columns (if not `None`).
5929. If there is no good value, NaN is returned for a Series, or
5930. a Series of NaN values for a DataFrame.
  5931. Parameters
  5932. ----------
  5933. where : date or array-like of dates
  5934. Date(s) before which the last row(s) are returned.
  5935. subset : str or array-like of str, default `None`
  5936. For DataFrame, if not `None`, only use these columns to
  5937. check for NaNs.
  5938. Returns
  5939. -------
  5940. scalar, Series, or DataFrame
  5941. The return can be:
  5942. * scalar : when `self` is a Series and `where` is a scalar
  5943. * Series: when `self` is a Series and `where` is an array-like,
  5944. or when `self` is a DataFrame and `where` is a scalar
  5945. * DataFrame : when `self` is a DataFrame and `where` is an
  5946. array-like
  5948. See Also
  5949. --------
  5950. merge_asof : Perform an asof merge. Similar to left join.
  5951. Notes
  5952. -----
  5953. Dates are assumed to be sorted. Raises if this is not the case.
  5954. Examples
  5955. --------
  5956. A Series and a scalar `where`.
  5957. >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
  5958. >>> s
  5959. 10 1.0
  5960. 20 2.0
  5961. 30 NaN
  5962. 40 4.0
  5963. dtype: float64
  5964. >>> s.asof(20)
  5965. 2.0
  5966. For a sequence `where`, a Series is returned. The first value is
  5967. NaN, because the first element of `where` is before the first
  5968. index value.
  5969. >>> s.asof([5, 20])
  5970. 5 NaN
  5971. 20 2.0
  5972. dtype: float64
  5973. Missing values are not considered. The following is ``2.0``, not
  5974. NaN, even though NaN is at the index location for ``30``.
  5975. >>> s.asof(30)
  5976. 2.0
  5977. Take all columns into consideration
  5978. >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
  5979. ... 'b': [None, None, None, None, 500]},
  5980. ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
  5981. ... '2018-02-27 09:02:00',
  5982. ... '2018-02-27 09:03:00',
  5983. ... '2018-02-27 09:04:00',
  5984. ... '2018-02-27 09:05:00']))
  5985. >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
  5986. ... '2018-02-27 09:04:30']))
  5987. a b
  5988. 2018-02-27 09:03:30 NaN NaN
  5989. 2018-02-27 09:04:30 NaN NaN
  5990. Take a single column into consideration
  5991. >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
  5992. ... '2018-02-27 09:04:30']),
  5993. ... subset=['a'])
  5994. a b
  5995. 2018-02-27 09:03:30 30.0 NaN
  5996. 2018-02-27 09:04:30 40.0 NaN
  5997. """
  5998. if isinstance(where, str):
  5999. where = Timestamp(where)
  6000. if not self.index.is_monotonic:
  6001. raise ValueError("asof requires a sorted index")
  6002. is_series = isinstance(self, ABCSeries)
  6003. if is_series:
  6004. if subset is not None:
  6005. raise ValueError("subset is not valid for Series")
  6006. else:
  6007. if subset is None:
  6008. subset = self.columns
  6009. if not is_list_like(subset):
  6010. subset = [subset]
  6011. is_list = is_list_like(where)
  6012. if not is_list:
  6013. start = self.index[0]
  6014. if isinstance(self.index, PeriodIndex):
  6015. where = Period(where, freq=self.index.freq)
  6016. if where < start:
  6017. if not is_series:
  6018. return self._constructor_sliced(
  6019. index=self.columns, name=where, dtype=np.float64
  6020. )
  6021. return np.nan
  6022. # It's always much faster to use a *while* loop here for
  6023. # Series than pre-computing all the NAs. However a
  6024. # *while* loop is extremely expensive for DataFrame
  6025. # so we later pre-compute all the NAs and use the same
  6026. # code path whether *where* is a scalar or list.
  6027. # See PR: https://github.com/pandas-dev/pandas/pull/14476
  6028. if is_series:
  6029. loc = self.index.searchsorted(where, side="right")
  6030. if loc > 0:
  6031. loc -= 1
  6032. values = self._values
  6033. while loc > 0 and isna(values[loc]):
  6034. loc -= 1
  6035. return values[loc]
  6036. if not isinstance(where, Index):
  6037. where = Index(where) if is_list else Index([where])
  6038. nulls = self.isna() if is_series else self[subset].isna().any(1)
  6039. if nulls.all():
  6040. if is_series:
  6041. self = cast("Series", self)
  6042. return self._constructor(np.nan, index=where, name=self.name)
  6043. elif is_list:
  6044. self = cast("DataFrame", self)
  6045. return self._constructor(np.nan, index=where, columns=self.columns)
  6046. else:
  6047. self = cast("DataFrame", self)
  6048. return self._constructor_sliced(
  6049. np.nan, index=self.columns, name=where[0]
  6050. )
  6051. locs = self.index.asof_locs(where, ~(nulls._values))
  6052. # mask the missing
  6053. missing = locs == -1
  6054. data = self.take(locs)
  6055. data.index = where
  6056. data.loc[missing] = np.nan
  6057. return data if is_list else data.iloc[-1]
  6058. # ----------------------------------------------------------------------
  6059. # Action Methods
  6060. @doc(klass=_shared_doc_kwargs["klass"])
  6061. def isna(self: NDFrameT) -> NDFrameT:
  6062. """
  6063. Detect missing values.
  6064. Return a boolean same-sized object indicating if the values are NA.
6065. NA values, such as None or :attr:`numpy.NaN`, get mapped to True
6066. values.
  6067. Everything else gets mapped to False values. Characters such as empty
  6068. strings ``''`` or :attr:`numpy.inf` are not considered NA values
  6069. (unless you set ``pandas.options.mode.use_inf_as_na = True``).
  6070. Returns
  6071. -------
  6072. {klass}
  6073. Mask of bool values for each element in {klass} that
  6074. indicates whether an element is an NA value.
  6075. See Also
  6076. --------
  6077. {klass}.isnull : Alias of isna.
  6078. {klass}.notna : Boolean inverse of isna.
  6079. {klass}.dropna : Omit axes labels with missing values.
  6080. isna : Top-level isna.
  6081. Examples
  6082. --------
  6083. Show which entries in a DataFrame are NA.
  6084. >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
  6085. ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
  6086. ... pd.Timestamp('1940-04-25')],
  6087. ... name=['Alfred', 'Batman', ''],
  6088. ... toy=[None, 'Batmobile', 'Joker']))
  6089. >>> df
  6090. age born name toy
  6091. 0 5.0 NaT Alfred None
  6092. 1 6.0 1939-05-27 Batman Batmobile
  6093. 2 NaN 1940-04-25 Joker
  6094. >>> df.isna()
  6095. age born name toy
  6096. 0 False True False True
  6097. 1 False False False False
  6098. 2 True False False False
  6099. Show which entries in a Series are NA.
  6100. >>> ser = pd.Series([5, 6, np.NaN])
  6101. >>> ser
  6102. 0 5.0
  6103. 1 6.0
  6104. 2 NaN
  6105. dtype: float64
  6106. >>> ser.isna()
  6107. 0 False
  6108. 1 False
  6109. 2 True
  6110. dtype: bool
  6111. """
  6112. return isna(self).__finalize__(self, method="isna")
  6113. @doc(isna, klass=_shared_doc_kwargs["klass"])
  6114. def isnull(self: NDFrameT) -> NDFrameT:
  6115. return isna(self).__finalize__(self, method="isnull")
  6116. @doc(klass=_shared_doc_kwargs["klass"])
  6117. def notna(self: NDFrameT) -> NDFrameT:
  6118. """
  6119. Detect existing (non-missing) values.
  6120. Return a boolean same-sized object indicating if the values are not NA.
  6121. Non-missing values get mapped to True. Characters such as empty
  6122. strings ``''`` or :attr:`numpy.inf` are not considered NA values
  6123. (unless you set ``pandas.options.mode.use_inf_as_na = True``).
  6124. NA values, such as None or :attr:`numpy.NaN`, get mapped to False
  6125. values.
  6126. Returns
  6127. -------
  6128. {klass}
  6129. Mask of bool values for each element in {klass} that
  6130. indicates whether an element is not an NA value.
  6131. See Also
  6132. --------
  6133. {klass}.notnull : Alias of notna.
  6134. {klass}.isna : Boolean inverse of notna.
  6135. {klass}.dropna : Omit axes labels with missing values.
  6136. notna : Top-level notna.
  6137. Examples
  6138. --------
  6139. Show which entries in a DataFrame are not NA.
  6140. >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
  6141. ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
  6142. ... pd.Timestamp('1940-04-25')],
  6143. ... name=['Alfred', 'Batman', ''],
  6144. ... toy=[None, 'Batmobile', 'Joker']))
  6145. >>> df
  6146. age born name toy
  6147. 0 5.0 NaT Alfred None
  6148. 1 6.0 1939-05-27 Batman Batmobile
  6149. 2 NaN 1940-04-25 Joker
  6150. >>> df.notna()
  6151. age born name toy
  6152. 0 True False True False
  6153. 1 True True True True
  6154. 2 False True True True
  6155. Show which entries in a Series are not NA.
  6156. >>> ser = pd.Series([5, 6, np.NaN])
  6157. >>> ser
  6158. 0 5.0
  6159. 1 6.0
  6160. 2 NaN
  6161. dtype: float64
  6162. >>> ser.notna()
  6163. 0 True
  6164. 1 True
  6165. 2 False
  6166. dtype: bool
  6167. """
  6168. return notna(self).__finalize__(self, method="notna")
  6169. @doc(notna, klass=_shared_doc_kwargs["klass"])
  6170. def notnull(self: NDFrameT) -> NDFrameT:
  6171. return notna(self).__finalize__(self, method="notnull")
  6172. @final
  6173. def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
  6174. if (lower is not None and np.any(isna(lower))) or (
  6175. upper is not None and np.any(isna(upper))
  6176. ):
  6177. raise ValueError("Cannot use an NA value as a clip threshold")
  6178. result = self
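# Remember where the original values are NaN: the comparisons below are
# False for NaN, so where() would overwrite those positions with the
# bound; the mask is used to restore them afterwards.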
  6179. mask = isna(self._values)
  6180. with np.errstate(all="ignore"):
  6181. if upper is not None:
  6182. subset = self <= upper
  6183. result = result.where(subset, upper, axis=None, inplace=False)
  6184. if lower is not None:
  6185. subset = self >= lower
  6186. result = result.where(subset, lower, axis=None, inplace=False)
  6187. if np.any(mask):
  6188. result[mask] = np.nan
  6189. if inplace:
  6190. return self._update_inplace(result)
  6191. else:
  6192. return result
  6193. @final
  6194. def _clip_with_one_bound(self, threshold, method, axis, inplace):
  6195. if axis is not None:
  6196. axis = self._get_axis_number(axis)
  6197. # method is self.le for upper bound and self.ge for lower bound
  6198. if is_scalar(threshold) and is_number(threshold):
  6199. if method.__name__ == "le":
  6200. return self._clip_with_scalar(None, threshold, inplace=inplace)
  6201. return self._clip_with_scalar(threshold, None, inplace=inplace)
  6202. # GH #15390
  6203. # In order for where method to work, the threshold must
  6204. # be transformed to NDFrame from other array like structure.
  6205. if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
  6206. if isinstance(self, ABCSeries):
  6207. threshold = self._constructor(threshold, index=self.index)
  6208. else:
  6209. threshold = align_method_FRAME(self, threshold, axis, flex=None)[1]
  6210. # GH 40420
  6211. # Treat missing thresholds as no bounds, not clipping the values
  6212. if is_list_like(threshold):
  6213. fill_value = np.inf if method.__name__ == "le" else -np.inf
  6214. threshold_inf = threshold.fillna(fill_value)
  6215. else:
  6216. threshold_inf = threshold
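# Keep entries that already satisfy the bound, plus NaNs in self (left
# unclipped, GH 40420); where() replaces the rest with the original,
# un-filled threshold.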
  6217. subset = method(threshold_inf, axis=axis) | isna(self)
  6218. # GH 40420
  6219. return self.where(subset, threshold, axis=axis, inplace=inplace)
  6220. def clip(
  6221. self: NDFrameT,
  6222. lower=None,
  6223. upper=None,
  6224. axis: Axis | None = None,
  6225. inplace: bool_t = False,
  6226. *args,
  6227. **kwargs,
  6228. ) -> NDFrameT | None:
  6229. """
  6230. Trim values at input threshold(s).
6231. Assigns values outside the boundary to boundary values. Thresholds
6232. can be singular values or array-like, and in the latter case
6233. the clipping is performed element-wise along the specified axis.
  6234. Parameters
  6235. ----------
  6236. lower : float or array-like, default None
  6237. Minimum threshold value. All values below this
  6238. threshold will be set to it. A missing
6239. threshold (e.g. ``NA``) will not clip the value.
  6240. upper : float or array-like, default None
  6241. Maximum threshold value. All values above this
  6242. threshold will be set to it. A missing
6243. threshold (e.g. ``NA``) will not clip the value.
  6244. axis : int or str axis name, optional
  6245. Align object with lower and upper along the given axis.
  6246. inplace : bool, default False
  6247. Whether to perform the operation in place on the data.
  6248. *args, **kwargs
  6249. Additional keywords have no effect but might be accepted
  6250. for compatibility with numpy.
  6251. Returns
  6252. -------
  6253. Series or DataFrame or None
  6254. Same type as calling object with the values outside the
  6255. clip boundaries replaced or None if ``inplace=True``.
  6256. See Also
  6257. --------
  6258. Series.clip : Trim values at input threshold in series.
  6259. DataFrame.clip : Trim values at input threshold in dataframe.
  6260. numpy.clip : Clip (limit) the values in an array.
  6261. Examples
  6262. --------
  6263. >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
  6264. >>> df = pd.DataFrame(data)
  6265. >>> df
  6266. col_0 col_1
  6267. 0 9 -2
  6268. 1 -3 -7
  6269. 2 0 6
  6270. 3 -1 8
  6271. 4 5 -5
  6272. Clips per column using lower and upper thresholds:
  6273. >>> df.clip(-4, 6)
  6274. col_0 col_1
  6275. 0 6 -2
  6276. 1 -3 -4
  6277. 2 0 6
  6278. 3 -1 6
  6279. 4 5 -4
  6280. Clips using specific lower and upper thresholds per column element:
  6281. >>> t = pd.Series([2, -4, -1, 6, 3])
  6282. >>> t
  6283. 0 2
  6284. 1 -4
  6285. 2 -1
  6286. 3 6
  6287. 4 3
  6288. dtype: int64
  6289. >>> df.clip(t, t + 4, axis=0)
  6290. col_0 col_1
  6291. 0 6 2
  6292. 1 -3 -4
  6293. 2 0 3
  6294. 3 6 8
  6295. 4 5 3
  6296. Clips using specific lower threshold per column element, with missing values:
  6297. >>> t = pd.Series([2, -4, np.NaN, 6, 3])
  6298. >>> t
  6299. 0 2.0
  6300. 1 -4.0
  6301. 2 NaN
  6302. 3 6.0
  6303. 4 3.0
  6304. dtype: float64
  6305. >>> df.clip(t, axis=0)
  6306. col_0 col_1
  6307. 0 9 2
  6308. 1 -3 -4
  6309. 2 0 6
  6310. 3 6 8
  6311. 4 5 3
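Clips using only a single upper threshold; with ``lower`` left unset,
no lower bound is applied:
>>> df.clip(upper=4)
col_0 col_1
0 4 -2
1 -3 -7
2 0 4
3 -1 4
4 4 -5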
  6312. """
  6313. inplace = validate_bool_kwarg(inplace, "inplace")
  6314. axis = nv.validate_clip_with_axis(axis, args, kwargs)
  6315. if axis is not None:
  6316. axis = self._get_axis_number(axis)
  6317. # GH 17276
  6318. # numpy doesn't like NaN as a clip value
  6319. # so ignore
  6320. # GH 19992
  6321. # numpy doesn't drop a list-like bound containing NaN
  6322. isna_lower = isna(lower)
  6323. if not is_list_like(lower):
  6324. if np.any(isna_lower):
  6325. lower = None
  6326. elif np.all(isna_lower):
  6327. lower = None
  6328. isna_upper = isna(upper)
  6329. if not is_list_like(upper):
  6330. if np.any(isna_upper):
  6331. upper = None
  6332. elif np.all(isna_upper):
  6333. upper = None
  6334. # GH 2747 (arguments were reversed)
  6335. if (
  6336. lower is not None
  6337. and upper is not None
  6338. and is_scalar(lower)
  6339. and is_scalar(upper)
  6340. ):
  6341. lower, upper = min(lower, upper), max(lower, upper)
  6342. # fast-path for scalars
  6343. if (lower is None or (is_scalar(lower) and is_number(lower))) and (
  6344. upper is None or (is_scalar(upper) and is_number(upper))
  6345. ):
  6346. return self._clip_with_scalar(lower, upper, inplace=inplace)
  6347. result = self
  6348. if lower is not None:
  6349. result = result._clip_with_one_bound(
  6350. lower, method=self.ge, axis=axis, inplace=inplace
  6351. )
  6352. if upper is not None:
  6353. if inplace:
  6354. result = self
  6355. result = result._clip_with_one_bound(
  6356. upper, method=self.le, axis=axis, inplace=inplace
  6357. )
  6358. return result
  6359. @doc(**_shared_doc_kwargs)
  6360. def asfreq(
  6361. self: NDFrameT,
  6362. freq,
  6363. method=None,
  6364. how: str | None = None,
  6365. normalize: bool_t = False,
  6366. fill_value=None,
  6367. ) -> NDFrameT:
  6368. """
  6369. Convert time series to specified frequency.
  6370. Returns the original data conformed to a new index with the specified
  6371. frequency.
  6372. If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index
  6373. is the result of transforming the original index with
  6374. :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
  6375. will map one-to-one to the new index).
  6376. Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
  6377. freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
  6378. last entries in the original index (see :func:`pandas.date_range`). The
  6379. values corresponding to any timesteps in the new index which were not present
  6380. in the original index will be null (``NaN``), unless a method for filling
  6381. such unknowns is provided (see the ``method`` parameter below).
  6382. The :meth:`resample` method is more appropriate if an operation on each group of
  6383. timesteps (such as an aggregate) is necessary to represent the data at the new
  6384. frequency.
  6385. Parameters
  6386. ----------
  6387. freq : DateOffset or str
  6388. Frequency DateOffset or string.
  6389. method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
  6390. Method to use for filling holes in reindexed Series (note this
  6391. does not fill NaNs that already were present):
  6392. * 'pad' / 'ffill': propagate last valid observation forward to next
  6393. valid
  6394. * 'backfill' / 'bfill': use NEXT valid observation to fill.
6395. how : {{'start', 'end'}}, default 'end'
  6396. For PeriodIndex only (see PeriodIndex.asfreq).
  6397. normalize : bool, default False
  6398. Whether to reset output index to midnight.
  6399. fill_value : scalar, optional
  6400. Value to use for missing values, applied during upsampling (note
  6401. this does not fill NaNs that already were present).
  6402. Returns
  6403. -------
  6404. {klass}
  6405. {klass} object reindexed to the specified frequency.
  6406. See Also
  6407. --------
  6408. reindex : Conform DataFrame to new index with optional filling logic.
  6409. Notes
  6410. -----
  6411. To learn more about the frequency strings, please see `this link
  6412. <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
  6413. Examples
  6414. --------
6415. Start by creating a series with 4 one-minute timestamps.
  6416. >>> index = pd.date_range('1/1/2000', periods=4, freq='T')
  6417. >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
  6418. >>> df = pd.DataFrame({{'s': series}})
  6419. >>> df
  6420. s
  6421. 2000-01-01 00:00:00 0.0
  6422. 2000-01-01 00:01:00 NaN
  6423. 2000-01-01 00:02:00 2.0
  6424. 2000-01-01 00:03:00 3.0
  6425. Upsample the series into 30 second bins.
  6426. >>> df.asfreq(freq='30S')
  6427. s
  6428. 2000-01-01 00:00:00 0.0
  6429. 2000-01-01 00:00:30 NaN
  6430. 2000-01-01 00:01:00 NaN
  6431. 2000-01-01 00:01:30 NaN
  6432. 2000-01-01 00:02:00 2.0
  6433. 2000-01-01 00:02:30 NaN
  6434. 2000-01-01 00:03:00 3.0
6435. Upsample again, providing a ``fill_value``.
  6436. >>> df.asfreq(freq='30S', fill_value=9.0)
  6437. s
  6438. 2000-01-01 00:00:00 0.0
  6439. 2000-01-01 00:00:30 9.0
  6440. 2000-01-01 00:01:00 NaN
  6441. 2000-01-01 00:01:30 9.0
  6442. 2000-01-01 00:02:00 2.0
  6443. 2000-01-01 00:02:30 9.0
  6444. 2000-01-01 00:03:00 3.0
  6445. Upsample again, providing a ``method``.
  6446. >>> df.asfreq(freq='30S', method='bfill')
  6447. s
  6448. 2000-01-01 00:00:00 0.0
  6449. 2000-01-01 00:00:30 NaN
  6450. 2000-01-01 00:01:00 NaN
  6451. 2000-01-01 00:01:30 2.0
  6452. 2000-01-01 00:02:00 2.0
  6453. 2000-01-01 00:02:30 3.0
  6454. 2000-01-01 00:03:00 3.0
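Upsample once more, using ``method='ffill'`` to propagate values forward;
note that the ``NaN`` originally present at ``00:01:00`` is not filled:
>>> df.asfreq(freq='30S', method='ffill')
s
2000-01-01 00:00:00 0.0
2000-01-01 00:00:30 0.0
2000-01-01 00:01:00 NaN
2000-01-01 00:01:30 NaN
2000-01-01 00:02:00 2.0
2000-01-01 00:02:30 2.0
2000-01-01 00:03:00 3.0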
  6455. """
  6456. from pandas.core.resample import asfreq
  6457. return asfreq(
  6458. self,
  6459. freq,
  6460. method=method,
  6461. how=how,
  6462. normalize=normalize,
  6463. fill_value=fill_value,
  6464. )
  6465. @final
  6466. def at_time(self: NDFrameT, time, asof: bool_t = False, axis=None) -> NDFrameT:
  6467. """
  6468. Select values at particular time of day (e.g., 9:30AM).
  6469. Parameters
  6470. ----------
  6471. time : datetime.time or str
  6472. axis : {0 or 'index', 1 or 'columns'}, default 0
  6473. Returns
  6474. -------
  6475. Series or DataFrame
  6476. Raises
  6477. ------
  6478. TypeError
  6479. If the index is not a :class:`DatetimeIndex`
  6480. See Also
  6481. --------
  6482. between_time : Select values between particular times of the day.
  6483. first : Select initial periods of time series based on a date offset.
  6484. last : Select final periods of time series based on a date offset.
  6485. DatetimeIndex.indexer_at_time : Get just the index locations for
  6486. values at particular time of the day.
  6487. Examples
  6488. --------
  6489. >>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
  6490. >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
  6491. >>> ts
  6492. A
  6493. 2018-04-09 00:00:00 1
  6494. 2018-04-09 12:00:00 2
  6495. 2018-04-10 00:00:00 3
  6496. 2018-04-10 12:00:00 4
  6497. >>> ts.at_time('12:00')
  6498. A
  6499. 2018-04-09 12:00:00 2
  6500. 2018-04-10 12:00:00 4
  6501. """
  6502. if axis is None:
  6503. axis = self._stat_axis_number
  6504. axis = self._get_axis_number(axis)
  6505. index = self._get_axis(axis)
  6506. if not isinstance(index, DatetimeIndex):
  6507. raise TypeError("Index must be DatetimeIndex")
  6508. indexer = index.indexer_at_time(time, asof=asof)
  6509. return self._take_with_is_copy(indexer, axis=axis)
  6510. @final
  6511. def between_time(
  6512. self: NDFrameT,
  6513. start_time,
  6514. end_time,
  6515. include_start: bool_t | lib.NoDefault = lib.no_default,
  6516. include_end: bool_t | lib.NoDefault = lib.no_default,
  6517. inclusive: str | None = None,
  6518. axis=None,
  6519. ) -> NDFrameT:
  6520. """
  6521. Select values between particular times of the day (e.g., 9:00-9:30 AM).
  6522. By setting ``start_time`` to be later than ``end_time``,
  6523. you can get the times that are *not* between the two times.
  6524. Parameters
  6525. ----------
  6526. start_time : datetime.time or str
  6527. Initial time as a time filter limit.
  6528. end_time : datetime.time or str
  6529. End time as a time filter limit.
  6530. include_start : bool, default True
  6531. Whether the start time needs to be included in the result.
  6532. .. deprecated:: 1.4.0
  6533. Arguments `include_start` and `include_end` have been deprecated
  6534. to standardize boundary inputs. Use `inclusive` instead, to set
  6535. each bound as closed or open.
  6536. include_end : bool, default True
  6537. Whether the end time needs to be included in the result.
  6538. .. deprecated:: 1.4.0
  6539. Arguments `include_start` and `include_end` have been deprecated
  6540. to standardize boundary inputs. Use `inclusive` instead, to set
  6541. each bound as closed or open.
  6542. inclusive : {"both", "neither", "left", "right"}, default "both"
  6543. Include boundaries; whether to set each bound as closed or open.
  6544. axis : {0 or 'index', 1 or 'columns'}, default 0
6545. Determine the time range on the index or on column values.
  6546. Returns
  6547. -------
  6548. Series or DataFrame
6549. Data from the original object filtered to the specified time range.
  6550. Raises
  6551. ------
  6552. TypeError
  6553. If the index is not a :class:`DatetimeIndex`
  6554. See Also
  6555. --------
  6556. at_time : Select values at a particular time of the day.
  6557. first : Select initial periods of time series based on a date offset.
  6558. last : Select final periods of time series based on a date offset.
  6559. DatetimeIndex.indexer_between_time : Get just the index locations for
  6560. values between particular times of the day.
  6561. Examples
  6562. --------
  6563. >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
  6564. >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
  6565. >>> ts
  6566. A
  6567. 2018-04-09 00:00:00 1
  6568. 2018-04-10 00:20:00 2
  6569. 2018-04-11 00:40:00 3
  6570. 2018-04-12 01:00:00 4
  6571. >>> ts.between_time('0:15', '0:45')
  6572. A
  6573. 2018-04-10 00:20:00 2
  6574. 2018-04-11 00:40:00 3
  6575. You get the times that are *not* between two times by setting
  6576. ``start_time`` later than ``end_time``:
  6577. >>> ts.between_time('0:45', '0:15')
  6578. A
  6579. 2018-04-09 00:00:00 1
  6580. 2018-04-12 01:00:00 4
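Use ``inclusive`` to control whether each boundary is closed or open;
for example, ``inclusive='left'`` keeps the start time but drops the end time:
>>> ts.between_time('0:20', '0:40', inclusive='left')
A
2018-04-10 00:20:00 2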
  6581. """
  6582. if axis is None:
  6583. axis = self._stat_axis_number
  6584. axis = self._get_axis_number(axis)
  6585. index = self._get_axis(axis)
  6586. if not isinstance(index, DatetimeIndex):
  6587. raise TypeError("Index must be DatetimeIndex")
  6588. old_include_arg_used = (include_start != lib.no_default) or (
  6589. include_end != lib.no_default
  6590. )
  6591. if old_include_arg_used and inclusive is not None:
  6592. raise ValueError(
  6593. "Deprecated arguments `include_start` and `include_end` "
  6594. "cannot be passed if `inclusive` has been given."
  6595. )
  6596. # If any of the deprecated arguments ('include_start', 'include_end')
  6597. # have been passed
  6598. elif old_include_arg_used:
  6599. warnings.warn(
  6600. "`include_start` and `include_end` are deprecated in "
  6601. "favour of `inclusive`.",
  6602. FutureWarning,
  6603. stacklevel=2,
  6604. )
  6605. left = True if isinstance(include_start, lib.NoDefault) else include_start
  6606. right = True if isinstance(include_end, lib.NoDefault) else include_end
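# Translate the deprecated boolean pair into the equivalent
# `inclusive` string, which validate_inclusive parses below.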
  6607. inc_dict = {
  6608. (True, True): "both",
  6609. (True, False): "left",
  6610. (False, True): "right",
  6611. (False, False): "neither",
  6612. }
  6613. inclusive = inc_dict[(left, right)]
  6614. elif inclusive is None:
  6615. # On arg removal inclusive can default to "both"
  6616. inclusive = "both"
  6617. left_inclusive, right_inclusive = validate_inclusive(inclusive)
  6618. indexer = index.indexer_between_time(
  6619. start_time,
  6620. end_time,
  6621. include_start=left_inclusive,
  6622. include_end=right_inclusive,
  6623. )
  6624. return self._take_with_is_copy(indexer, axis=axis)
  6625. @doc(**_shared_doc_kwargs)
  6626. def resample(
  6627. self,
  6628. rule,
  6629. axis=0,
  6630. closed: str | None = None,
  6631. label: str | None = None,
  6632. convention: str = "start",
  6633. kind: str | None = None,
  6634. loffset=None,
  6635. base: int | None = None,
  6636. on=None,
  6637. level=None,
  6638. origin: str | TimestampConvertibleTypes = "start_day",
  6639. offset: TimedeltaConvertibleTypes | None = None,
  6640. ) -> Resampler:
  6641. """
  6642. Resample time-series data.
  6643. Convenience method for frequency conversion and resampling of time series.
  6644. The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
  6645. or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
  6646. series/index to the ``on``/``level`` keyword parameter.
  6647. Parameters
  6648. ----------
  6649. rule : DateOffset, Timedelta or str
  6650. The offset string or object representing target conversion.
  6651. axis : {{0 or 'index', 1 or 'columns'}}, default 0
  6652. Which axis to use for up- or down-sampling. For `Series` this
  6653. will default to 0, i.e. along the rows. Must be
  6654. `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
  6655. closed : {{'right', 'left'}}, default None
  6656. Which side of bin interval is closed. The default is 'left'
  6657. for all frequency offsets except for 'M', 'A', 'Q', 'BM',
  6658. 'BA', 'BQ', and 'W' which all have a default of 'right'.
  6659. label : {{'right', 'left'}}, default None
  6660. Which bin edge label to label bucket with. The default is 'left'
  6661. for all frequency offsets except for 'M', 'A', 'Q', 'BM',
  6662. 'BA', 'BQ', and 'W' which all have a default of 'right'.
  6663. convention : {{'start', 'end', 's', 'e'}}, default 'start'
  6664. For `PeriodIndex` only, controls whether to use the start or
  6665. end of `rule`.
  6666. kind : {{'timestamp', 'period'}}, optional, default None
  6667. Pass 'timestamp' to convert the resulting index to a
  6668. `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`.
  6669. By default the input representation is retained.
  6670. loffset : timedelta, default None
  6671. Adjust the resampled time labels.
  6672. .. deprecated:: 1.1.0
  6673. You should add the loffset to the `df.index` after the resample.
  6674. See below.
  6675. base : int, default 0
  6676. For frequencies that evenly subdivide 1 day, the "origin" of the
  6677. aggregated intervals. For example, for '5min' frequency, base could
  6678. range from 0 through 4. Defaults to 0.
  6679. .. deprecated:: 1.1.0
  6680. The new arguments that you should use are 'offset' or 'origin'.
  6681. on : str, optional
  6682. For a DataFrame, column to use instead of index for resampling.
  6683. Column must be datetime-like.
  6684. level : str or int, optional
  6685. For a MultiIndex, level (name or number) to use for
  6686. resampling. `level` must be datetime-like.
  6687. origin : {{'epoch', 'start', 'start_day', 'end', 'end_day'}}, Timestamp
  6688. or str, default 'start_day'
  6689. The timestamp on which to adjust the grouping. The timezone of origin
  6690. must match the timezone of the index.
  6691. If a timestamp is not used, these values are also supported:
  6692. - 'epoch': `origin` is 1970-01-01
  6693. - 'start': `origin` is the first value of the timeseries
  6694. - 'start_day': `origin` is the first day at midnight of the timeseries
  6695. .. versionadded:: 1.1.0
  6696. - 'end': `origin` is the last value of the timeseries
  6697. - 'end_day': `origin` is the ceiling midnight of the last day
  6698. .. versionadded:: 1.3.0
  6699. offset : Timedelta or str, default is None
  6700. An offset timedelta added to the origin.
  6701. .. versionadded:: 1.1.0
  6702. Returns
  6703. -------
  6704. pandas.core.Resampler
  6705. :class:`~pandas.core.Resampler` object.
  6706. See Also
  6707. --------
  6708. Series.resample : Resample a Series.
  6709. DataFrame.resample : Resample a DataFrame.
  6710. groupby : Group {klass} by mapping, function, label, or list of labels.
  6711. asfreq : Reindex a {klass} with the given frequency without grouping.
  6712. Notes
  6713. -----
  6714. See the `user guide
  6715. <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
  6716. for more.
  6717. To learn more about the offset strings, please see `this link
  6718. <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
  6719. Examples
  6720. --------
6721. Start by creating a series with 9 one-minute timestamps.
  6722. >>> index = pd.date_range('1/1/2000', periods=9, freq='T')
  6723. >>> series = pd.Series(range(9), index=index)
  6724. >>> series
  6725. 2000-01-01 00:00:00 0
  6726. 2000-01-01 00:01:00 1
  6727. 2000-01-01 00:02:00 2
  6728. 2000-01-01 00:03:00 3
  6729. 2000-01-01 00:04:00 4
  6730. 2000-01-01 00:05:00 5
  6731. 2000-01-01 00:06:00 6
  6732. 2000-01-01 00:07:00 7
  6733. 2000-01-01 00:08:00 8
  6734. Freq: T, dtype: int64
  6735. Downsample the series into 3 minute bins and sum the values
  6736. of the timestamps falling into a bin.
  6737. >>> series.resample('3T').sum()
  6738. 2000-01-01 00:00:00 3
  6739. 2000-01-01 00:03:00 12
  6740. 2000-01-01 00:06:00 21
  6741. Freq: 3T, dtype: int64
  6742. Downsample the series into 3 minute bins as above, but label each
6743. bin using the right edge instead of the left. Please note that the
6744. value in the bucket used as the label is not included in the bucket
6745. it labels. For example, in the original series the
  6746. bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
  6747. value in the resampled bucket with the label ``2000-01-01 00:03:00``
  6748. does not include 3 (if it did, the summed value would be 6, not 3).
6749. To include this value, close the right side of the bin interval, as
6750. illustrated in the example below this one.
  6751. >>> series.resample('3T', label='right').sum()
  6752. 2000-01-01 00:03:00 3
  6753. 2000-01-01 00:06:00 12
  6754. 2000-01-01 00:09:00 21
  6755. Freq: 3T, dtype: int64
  6756. Downsample the series into 3 minute bins as above, but close the right
  6757. side of the bin interval.
  6758. >>> series.resample('3T', label='right', closed='right').sum()
  6759. 2000-01-01 00:00:00 0
  6760. 2000-01-01 00:03:00 6
  6761. 2000-01-01 00:06:00 15
  6762. 2000-01-01 00:09:00 15
  6763. Freq: 3T, dtype: int64
  6764. Upsample the series into 30 second bins.
  6765. >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows
  6766. 2000-01-01 00:00:00 0.0
  6767. 2000-01-01 00:00:30 NaN
  6768. 2000-01-01 00:01:00 1.0
  6769. 2000-01-01 00:01:30 NaN
  6770. 2000-01-01 00:02:00 2.0
  6771. Freq: 30S, dtype: float64
  6772. Upsample the series into 30 second bins and fill the ``NaN``
  6773. values using the ``pad`` method.
  6774. >>> series.resample('30S').pad()[0:5]
  6775. 2000-01-01 00:00:00 0
  6776. 2000-01-01 00:00:30 0
  6777. 2000-01-01 00:01:00 1
  6778. 2000-01-01 00:01:30 1
  6779. 2000-01-01 00:02:00 2
  6780. Freq: 30S, dtype: int64
  6781. Upsample the series into 30 second bins and fill the
  6782. ``NaN`` values using the ``bfill`` method.
  6783. >>> series.resample('30S').bfill()[0:5]
  6784. 2000-01-01 00:00:00 0
  6785. 2000-01-01 00:00:30 1
  6786. 2000-01-01 00:01:00 1
  6787. 2000-01-01 00:01:30 2
  6788. 2000-01-01 00:02:00 2
  6789. Freq: 30S, dtype: int64
  6790. Pass a custom function via ``apply``
  6791. >>> def custom_resampler(arraylike):
  6792. ... return np.sum(arraylike) + 5
  6793. ...
  6794. >>> series.resample('3T').apply(custom_resampler)
  6795. 2000-01-01 00:00:00 8
  6796. 2000-01-01 00:03:00 17
  6797. 2000-01-01 00:06:00 26
  6798. Freq: 3T, dtype: int64
  6799. For a Series with a PeriodIndex, the keyword `convention` can be
  6800. used to control whether to use the start or end of `rule`.
  6801. Resample a year by quarter using 'start' `convention`. Values are
  6802. assigned to the first quarter of the period.
  6803. >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',
  6804. ... freq='A',
  6805. ... periods=2))
  6806. >>> s
  6807. 2012 1
  6808. 2013 2
  6809. Freq: A-DEC, dtype: int64
  6810. >>> s.resample('Q', convention='start').asfreq()
  6811. 2012Q1 1.0
  6812. 2012Q2 NaN
  6813. 2012Q3 NaN
  6814. 2012Q4 NaN
  6815. 2013Q1 2.0
  6816. 2013Q2 NaN
  6817. 2013Q3 NaN
  6818. 2013Q4 NaN
  6819. Freq: Q-DEC, dtype: float64
  6820. Resample quarters by month using 'end' `convention`. Values are
  6821. assigned to the last month of the period.
  6822. >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',
  6823. ... freq='Q',
  6824. ... periods=4))
  6825. >>> q
  6826. 2018Q1 1
  6827. 2018Q2 2
  6828. 2018Q3 3
  6829. 2018Q4 4
  6830. Freq: Q-DEC, dtype: int64
  6831. >>> q.resample('M', convention='end').asfreq()
  6832. 2018-03 1.0
  6833. 2018-04 NaN
  6834. 2018-05 NaN
  6835. 2018-06 2.0
  6836. 2018-07 NaN
  6837. 2018-08 NaN
  6838. 2018-09 3.0
  6839. 2018-10 NaN
  6840. 2018-11 NaN
  6841. 2018-12 4.0
  6842. Freq: M, dtype: float64
  6843. For DataFrame objects, the keyword `on` can be used to specify the
  6844. column instead of the index for resampling.
  6845. >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
  6846. ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
  6847. >>> df = pd.DataFrame(d)
  6848. >>> df['week_starting'] = pd.date_range('01/01/2018',
  6849. ... periods=8,
  6850. ... freq='W')
  6851. >>> df
  6852. price volume week_starting
  6853. 0 10 50 2018-01-07
  6854. 1 11 60 2018-01-14
  6855. 2 9 40 2018-01-21
  6856. 3 13 100 2018-01-28
  6857. 4 14 50 2018-02-04
  6858. 5 18 100 2018-02-11
  6859. 6 17 40 2018-02-18
  6860. 7 19 50 2018-02-25
  6861. >>> df.resample('M', on='week_starting').mean()
  6862. price volume
  6863. week_starting
  6864. 2018-01-31 10.75 62.5
  6865. 2018-02-28 17.00 60.0
  6866. For a DataFrame with MultiIndex, the keyword `level` can be used to
  6867. specify on which level the resampling needs to take place.
  6868. >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
  6869. >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
  6870. ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
  6871. >>> df2 = pd.DataFrame(
  6872. ... d2,
  6873. ... index=pd.MultiIndex.from_product(
  6874. ... [days, ['morning', 'afternoon']]
  6875. ... )
  6876. ... )
  6877. >>> df2
  6878. price volume
  6879. 2000-01-01 morning 10 50
  6880. afternoon 11 60
  6881. 2000-01-02 morning 9 40
  6882. afternoon 13 100
  6883. 2000-01-03 morning 14 50
  6884. afternoon 18 100
  6885. 2000-01-04 morning 17 40
  6886. afternoon 19 50
  6887. >>> df2.resample('D', level=0).sum()
  6888. price volume
  6889. 2000-01-01 21 110
  6890. 2000-01-02 22 140
  6891. 2000-01-03 32 150
  6892. 2000-01-04 36 90
  6893. If you want to adjust the start of the bins based on a fixed timestamp:
  6894. >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
  6895. >>> rng = pd.date_range(start, end, freq='7min')
  6896. >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
  6897. >>> ts
  6898. 2000-10-01 23:30:00 0
  6899. 2000-10-01 23:37:00 3
  6900. 2000-10-01 23:44:00 6
  6901. 2000-10-01 23:51:00 9
  6902. 2000-10-01 23:58:00 12
  6903. 2000-10-02 00:05:00 15
  6904. 2000-10-02 00:12:00 18
  6905. 2000-10-02 00:19:00 21
  6906. 2000-10-02 00:26:00 24
  6907. Freq: 7T, dtype: int64
  6908. >>> ts.resample('17min').sum()
  6909. 2000-10-01 23:14:00 0
  6910. 2000-10-01 23:31:00 9
  6911. 2000-10-01 23:48:00 21
  6912. 2000-10-02 00:05:00 54
  6913. 2000-10-02 00:22:00 24
  6914. Freq: 17T, dtype: int64
  6915. >>> ts.resample('17min', origin='epoch').sum()
  6916. 2000-10-01 23:18:00 0
  6917. 2000-10-01 23:35:00 18
  6918. 2000-10-01 23:52:00 27
  6919. 2000-10-02 00:09:00 39
  6920. 2000-10-02 00:26:00 24
  6921. Freq: 17T, dtype: int64
  6922. >>> ts.resample('17min', origin='2000-01-01').sum()
  6923. 2000-10-01 23:24:00 3
  6924. 2000-10-01 23:41:00 15
  6925. 2000-10-01 23:58:00 45
  6926. 2000-10-02 00:15:00 45
  6927. Freq: 17T, dtype: int64
  6928. If you want to adjust the start of the bins with an `offset` Timedelta, the two
  6929. following lines are equivalent:
  6930. >>> ts.resample('17min', origin='start').sum()
  6931. 2000-10-01 23:30:00 9
  6932. 2000-10-01 23:47:00 21
  6933. 2000-10-02 00:04:00 54
  6934. 2000-10-02 00:21:00 24
  6935. Freq: 17T, dtype: int64
  6936. >>> ts.resample('17min', offset='23h30min').sum()
  6937. 2000-10-01 23:30:00 9
  6938. 2000-10-01 23:47:00 21
  6939. 2000-10-02 00:04:00 54
  6940. 2000-10-02 00:21:00 24
  6941. Freq: 17T, dtype: int64
  6942. If you want to take the largest Timestamp as the end of the bins:
  6943. >>> ts.resample('17min', origin='end').sum()
  6944. 2000-10-01 23:35:00 0
  6945. 2000-10-01 23:52:00 18
  6946. 2000-10-02 00:09:00 27
  6947. 2000-10-02 00:26:00 63
  6948. Freq: 17T, dtype: int64
  6949. In contrast with the `start_day`, you can use `end_day` to take the ceiling
  6950. midnight of the largest Timestamp as the end of the bins and drop the bins
  6951. not containing data:
  6952. >>> ts.resample('17min', origin='end_day').sum()
  6953. 2000-10-01 23:38:00 3
  6954. 2000-10-01 23:55:00 15
  6955. 2000-10-02 00:12:00 45
  6956. 2000-10-02 00:29:00 45
  6957. Freq: 17T, dtype: int64
6958. To replace the use of the deprecated `base` argument, you can now use `offset`;
6959. in this example it is equivalent to having `base=2`:
  6960. >>> ts.resample('17min', offset='2min').sum()
  6961. 2000-10-01 23:16:00 0
  6962. 2000-10-01 23:33:00 9
  6963. 2000-10-01 23:50:00 36
  6964. 2000-10-02 00:07:00 39
  6965. 2000-10-02 00:24:00 24
  6966. Freq: 17T, dtype: int64
  6967. To replace the use of the deprecated `loffset` argument:
  6968. >>> from pandas.tseries.frequencies import to_offset
  6969. >>> loffset = '19min'
  6970. >>> ts_out = ts.resample('17min').sum()
  6971. >>> ts_out.index = ts_out.index + to_offset(loffset)
  6972. >>> ts_out
  6973. 2000-10-01 23:33:00 0
  6974. 2000-10-01 23:50:00 9
  6975. 2000-10-02 00:07:00 21
  6976. 2000-10-02 00:24:00 54
  6977. 2000-10-02 00:41:00 24
  6978. Freq: 17T, dtype: int64
  6979. """
  6980. from pandas.core.resample import get_resampler
  6981. axis = self._get_axis_number(axis)
  6982. return get_resampler(
  6983. self,
  6984. freq=rule,
  6985. label=label,
  6986. closed=closed,
  6987. axis=axis,
  6988. kind=kind,
  6989. loffset=loffset,
  6990. convention=convention,
  6991. base=base,
  6992. key=on,
  6993. level=level,
  6994. origin=origin,
  6995. offset=offset,
  6996. )
  6997. @final
  6998. def first(self: NDFrameT, offset) -> NDFrameT:
  6999. """
  7000. Select initial periods of time series data based on a date offset.
7001. For a DataFrame with a sorted DatetimeIndex, this function can
7002. select the first few rows based on a date offset.
  7003. Parameters
  7004. ----------
  7005. offset : str, DateOffset or dateutil.relativedelta
  7006. The offset length of the data that will be selected. For instance,
7007. '1M' will select all the rows whose index falls within the first month.
  7008. Returns
  7009. -------
  7010. Series or DataFrame
  7011. A subset of the caller.
  7012. Raises
  7013. ------
  7014. TypeError
  7015. If the index is not a :class:`DatetimeIndex`
  7016. See Also
  7017. --------
  7018. last : Select final periods of time series based on a date offset.
  7019. at_time : Select values at a particular time of the day.
  7020. between_time : Select values between particular times of the day.
  7021. Examples
  7022. --------
  7023. >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
  7024. >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
  7025. >>> ts
  7026. A
  7027. 2018-04-09 1
  7028. 2018-04-11 2
  7029. 2018-04-13 3
  7030. 2018-04-15 4
  7031. Get the rows for the first 3 days:
  7032. >>> ts.first('3D')
  7033. A
  7034. 2018-04-09 1
  7035. 2018-04-11 2
7036. Notice that the data for the first 3 calendar days were returned, not the
7037. first 3 days observed in the dataset, and therefore data for 2018-04-13 was
7038. not returned.
  7039. """
  7040. if not isinstance(self.index, DatetimeIndex):
  7041. raise TypeError("'first' only supports a DatetimeIndex index")
  7042. if len(self.index) == 0:
  7043. return self
  7044. offset = to_offset(offset)
  7045. if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
  7046. # GH#29623 if first value is end of period, remove offset with n = 1
  7047. # before adding the real offset
  7048. end_date = end = self.index[0] - offset.base + offset
  7049. else:
  7050. end_date = end = self.index[0] + offset
7051. # Fixed-frequency (Tick) offset, e.g. '3D': make the end boundary exclusive
  7052. if isinstance(offset, Tick) and end_date in self.index:
  7053. end = self.index.searchsorted(end_date, side="left")
  7054. return self.iloc[:end]
  7055. return self.loc[:end]
  7056. @final
  7057. def last(self: NDFrameT, offset) -> NDFrameT:
  7058. """
  7059. Select final periods of time series data based on a date offset.
  7060. For a DataFrame with a sorted DatetimeIndex, this function
  7061. selects the last few rows based on a date offset.
  7062. Parameters
  7063. ----------
  7064. offset : str, DateOffset, dateutil.relativedelta
  7065. The offset length of the data that will be selected. For instance,
7066. '3D' will select all the rows whose index falls within the last 3 days.
  7067. Returns
  7068. -------
  7069. Series or DataFrame
  7070. A subset of the caller.
  7071. Raises
  7072. ------
  7073. TypeError
  7074. If the index is not a :class:`DatetimeIndex`
  7075. See Also
  7076. --------
  7077. first : Select initial periods of time series based on a date offset.
  7078. at_time : Select values at a particular time of the day.
  7079. between_time : Select values between particular times of the day.
  7080. Examples
  7081. --------
  7082. >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
  7083. >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
  7084. >>> ts
  7085. A
  7086. 2018-04-09 1
  7087. 2018-04-11 2
  7088. 2018-04-13 3
  7089. 2018-04-15 4
  7090. Get the rows for the last 3 days:
  7091. >>> ts.last('3D')
  7092. A
  7093. 2018-04-13 3
  7094. 2018-04-15 4
7095. Notice that the data for the last 3 calendar days were returned, not the
7096. last 3 days observed in the dataset, and therefore data for 2018-04-11 was
7097. not returned.
  7098. """
  7099. if not isinstance(self.index, DatetimeIndex):
  7100. raise TypeError("'last' only supports a DatetimeIndex index")
  7101. if len(self.index) == 0:
  7102. return self
  7103. offset = to_offset(offset)
  7104. start_date = self.index[-1] - offset
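# side="right" places the cut after any entry equal to start_date, so only
# rows strictly later than start_date are kept.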
  7105. start = self.index.searchsorted(start_date, side="right")
  7106. return self.iloc[start:]
  7107. @final
  7108. def rank(
  7109. self: NDFrameT,
  7110. axis=0,
  7111. method: str = "average",
  7112. numeric_only: bool_t | None = None,
  7113. na_option: str = "keep",
  7114. ascending: bool_t = True,
  7115. pct: bool_t = False,
  7116. ) -> NDFrameT:
  7117. """
  7118. Compute numerical data ranks (1 through n) along axis.
  7119. By default, equal values are assigned a rank that is the average of the
  7120. ranks of those values.
  7121. Parameters
  7122. ----------
  7123. axis : {0 or 'index', 1 or 'columns'}, default 0
  7124. Index to direct ranking.
  7125. method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
  7126. How to rank the group of records that have the same value (i.e. ties):
  7127. * average: average rank of the group
  7128. * min: lowest rank in the group
  7129. * max: highest rank in the group
  7130. * first: ranks assigned in order they appear in the array
  7131. * dense: like 'min', but rank always increases by 1 between groups.
  7132. numeric_only : bool, optional
  7133. For DataFrame objects, rank only numeric columns if set to True.
  7134. na_option : {'keep', 'top', 'bottom'}, default 'keep'
  7135. How to rank NaN values:
  7136. * keep: assign NaN rank to NaN values
  7137. * top: assign lowest rank to NaN values
  7138. * bottom: assign highest rank to NaN values
  7139. ascending : bool, default True
  7140. Whether or not the elements should be ranked in ascending order.
  7141. pct : bool, default False
  7142. Whether or not to display the returned rankings in percentile
  7143. form.
  7144. Returns
  7145. -------
  7146. same type as caller
  7147. Return a Series or DataFrame with data ranks as values.
  7148. See Also
  7149. --------
  7150. core.groupby.GroupBy.rank : Rank of values within each group.
  7151. Examples
  7152. --------
  7153. >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
  7154. ... 'spider', 'snake'],
  7155. ... 'Number_legs': [4, 2, 4, 8, np.nan]})
  7156. >>> df
  7157. Animal Number_legs
  7158. 0 cat 4.0
  7159. 1 penguin 2.0
  7160. 2 dog 4.0
  7161. 3 spider 8.0
  7162. 4 snake NaN
  7163. The following example shows how the method behaves with the above
  7164. parameters:
  7165. * default_rank: this is the default behaviour obtained without using
  7166. any parameter.
7167. * max_rank: setting ``method = 'max'``, the records that have the
7168. same values are ranked using the highest rank (e.g. since 'cat'
7169. and 'dog' share the 2nd and 3rd positions, rank 3 is assigned).
  7170. * NA_bottom: choosing ``na_option = 'bottom'``, if there are records
  7171. with NaN values they are placed at the bottom of the ranking.
  7172. * pct_rank: when setting ``pct = True``, the ranking is expressed as
  7173. percentile rank.
  7174. >>> df['default_rank'] = df['Number_legs'].rank()
  7175. >>> df['max_rank'] = df['Number_legs'].rank(method='max')
  7176. >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
  7177. >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
  7178. >>> df
  7179. Animal Number_legs default_rank max_rank NA_bottom pct_rank
  7180. 0 cat 4.0 2.5 3.0 2.5 0.625
  7181. 1 penguin 2.0 1.0 1.0 1.0 0.250
  7182. 2 dog 4.0 2.5 3.0 2.5 0.625
  7183. 3 spider 8.0 4.0 4.0 4.0 1.000
  7184. 4 snake NaN NaN NaN 5.0 NaN
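Setting ``method='dense'`` ranks ties like 'min', but the rank always
increases by exactly 1 between groups of distinct values:
>>> df['Number_legs'].rank(method='dense')
0 2.0
1 1.0
2 2.0
3 3.0
4 NaN
Name: Number_legs, dtype: float64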
  7185. """
  7186. axis = self._get_axis_number(axis)
  7187. if na_option not in {"keep", "top", "bottom"}:
  7188. msg = "na_option must be one of 'keep', 'top', or 'bottom'"
  7189. raise ValueError(msg)
  7190. def ranker(data):
  7191. ranks = algos.rank(
  7192. data.values,
  7193. axis=axis,
  7194. method=method,
  7195. ascending=ascending,
  7196. na_option=na_option,
  7197. pct=pct,
  7198. )
  7199. # error: Argument 1 to "NDFrame" has incompatible type "ndarray"; expected
  7200. # "Union[ArrayManager, BlockManager]"
  7201. ranks_obj = self._constructor(
  7202. ranks, **data._construct_axes_dict() # type: ignore[arg-type]
  7203. )
  7204. return ranks_obj.__finalize__(self, method="rank")
  7205. # if numeric_only is None, and we can't get anything, we try with
  7206. # numeric_only=True
  7207. if numeric_only is None:
  7208. try:
  7209. return ranker(self)
  7210. except TypeError:
  7211. numeric_only = True
  7212. if numeric_only:
  7213. data = self._get_numeric_data()
  7214. else:
  7215. data = self
  7216. return ranker(data)
  7217. @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"])
  7218. def compare(
  7219. self,
  7220. other,
  7221. align_axis: Axis = 1,
  7222. keep_shape: bool_t = False,
  7223. keep_equal: bool_t = False,
  7224. ):
  7225. from pandas.core.reshape.concat import concat
  7226. if type(self) is not type(other):
  7227. cls_self, cls_other = type(self).__name__, type(other).__name__
  7228. raise TypeError(
  7229. f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
  7230. )
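# mask is True where the two objects differ; NaNs at the same position
# compare as equal here, so they are not flagged as differences.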
  7231. mask = ~((self == other) | (self.isna() & other.isna()))
  7232. keys = ["self", "other"]
  7233. if not keep_equal:
  7234. self = self.where(mask)
  7235. other = other.where(mask)
  7236. if not keep_shape:
  7237. if isinstance(self, ABCDataFrame):
  7238. cmask = mask.any()
  7239. rmask = mask.any(axis=1)
  7240. self = self.loc[rmask, cmask]
  7241. other = other.loc[rmask, cmask]
  7242. else:
  7243. self = self[mask]
  7244. other = other[mask]
  7245. if align_axis in (1, "columns"): # This is needed for Series
  7246. axis = 1
  7247. else:
  7248. axis = self._get_axis_number(align_axis)
  7249. diff = concat([self, other], axis=axis, keys=keys)
  7250. if axis >= self.ndim:
  7251. # No need to reorganize data if stacking on new axis
  7252. # This currently applies for stacking two Series on columns
  7253. return diff
  7254. ax = diff._get_axis(axis)
  7255. ax_names = np.array(ax.names)
  7256. # set index names to positions to avoid confusion
  7257. ax.names = np.arange(len(ax_names))
  7258. # bring self-other to inner level
  7259. order = list(range(1, ax.nlevels)) + [0]
  7260. if isinstance(diff, ABCDataFrame):
  7261. diff = diff.reorder_levels(order, axis=axis)
  7262. else:
  7263. diff = diff.reorder_levels(order)
  7264. # restore the index names in order
  7265. diff._get_axis(axis=axis).names = ax_names[order]
  7266. # reorder axis to keep things organized
  7267. indices = (
  7268. np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
  7269. )
  7270. diff = diff.take(indices, axis=axis)
  7271. return diff
  7272. @doc(**_shared_doc_kwargs)
  7273. def align(
  7274. self,
  7275. other,
  7276. join="outer",
  7277. axis=None,
  7278. level=None,
  7279. copy=True,
  7280. fill_value=None,
  7281. method=None,
  7282. limit=None,
  7283. fill_axis=0,
  7284. broadcast_axis=None,
  7285. ):
  7286. """
  7287. Align two objects on their axes with the specified join method.
  7288. Join method is specified for each axis Index.
  7289. Parameters
  7290. ----------
  7291. other : DataFrame or Series
  7292. join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
  7293. axis : allowed axis of the other object, default None
  7294. Align on index (0), columns (1), or both (None).
  7295. level : int or level name, default None
  7296. Broadcast across a level, matching Index values on the
  7297. passed MultiIndex level.
  7298. copy : bool, default True
  7299. Always returns new objects. If copy=False and no reindexing is
  7300. required then original objects are returned.
  7301. fill_value : scalar, default np.NaN
  7302. Value to use for missing values. Defaults to NaN, but can be any
  7303. "compatible" value.
  7304. method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
  7305. Method to use for filling holes in reindexed Series:
  7306. - pad / ffill: propagate last valid observation forward to next valid.
  7307. - backfill / bfill: use NEXT valid observation to fill gap.
  7308. limit : int, default None
  7309. If method is specified, this is the maximum number of consecutive
  7310. NaN values to forward/backward fill. In other words, if there is
  7311. a gap with more than this number of consecutive NaNs, it will only
  7312. be partially filled. If method is not specified, this is the
  7313. maximum number of entries along the entire axis where NaNs will be
  7314. filled. Must be greater than 0 if not None.
  7315. fill_axis : {axes_single_arg}, default 0
  7316. Filling axis, method and limit.
  7317. broadcast_axis : {axes_single_arg}, default None
  7318. Broadcast values along this axis, if aligning two objects of
  7319. different dimensions.
  7320. Returns
  7321. -------
  7322. (left, right) : ({klass}, type of other)
  7323. Aligned objects.
  7324. Examples
  7325. --------
  7326. >>> df = pd.DataFrame(
  7327. ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
  7328. ... )
  7329. >>> other = pd.DataFrame(
  7330. ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
  7331. ... columns=["A", "B", "C", "D"],
  7332. ... index=[2, 3, 4],
  7333. ... )
  7334. >>> df
  7335. D B E A
  7336. 1 1 2 3 4
  7337. 2 6 7 8 9
  7338. >>> other
  7339. A B C D
  7340. 2 10 20 30 40
  7341. 3 60 70 80 90
  7342. 4 600 700 800 900
  7343. Align on columns:
  7344. >>> left, right = df.align(other, join="outer", axis=1)
  7345. >>> left
  7346. A B C D E
  7347. 1 4 2 NaN 1 3
  7348. 2 9 7 NaN 6 8
  7349. >>> right
  7350. A B C D E
  7351. 2 10 20 30 40 NaN
  7352. 3 60 70 80 90 NaN
  7353. 4 600 700 800 900 NaN
  7354. We can also align on the index:
  7355. >>> left, right = df.align(other, join="outer", axis=0)
  7356. >>> left
  7357. D B E A
  7358. 1 1.0 2.0 3.0 4.0
  7359. 2 6.0 7.0 8.0 9.0
  7360. 3 NaN NaN NaN NaN
  7361. 4 NaN NaN NaN NaN
  7362. >>> right
  7363. A B C D
  7364. 1 NaN NaN NaN NaN
  7365. 2 10.0 20.0 30.0 40.0
  7366. 3 60.0 70.0 80.0 90.0
  7367. 4 600.0 700.0 800.0 900.0
  7368. Finally, the default `axis=None` will align on both index and columns:
  7369. >>> left, right = df.align(other, join="outer", axis=None)
  7370. >>> left
  7371. A B C D E
  7372. 1 4.0 2.0 NaN 1.0 3.0
  7373. 2 9.0 7.0 NaN 6.0 8.0
  7374. 3 NaN NaN NaN NaN NaN
  7375. 4 NaN NaN NaN NaN NaN
  7376. >>> right
  7377. A B C D E
  7378. 1 NaN NaN NaN NaN NaN
  7379. 2 10.0 20.0 30.0 40.0 NaN
  7380. 3 60.0 70.0 80.0 90.0 NaN
  7381. 4 600.0 700.0 800.0 900.0 NaN
  7382. """
  7383. method = missing.clean_fill_method(method)
  7384. if broadcast_axis == 1 and self.ndim != other.ndim:
  7385. if isinstance(self, ABCSeries):
  7386. # this means other is a DataFrame, and we need to broadcast
  7387. # self
  7388. cons = self._constructor_expanddim
  7389. df = cons(
  7390. {c: self for c in other.columns}, **other._construct_axes_dict()
  7391. )
  7392. return df._align_frame(
  7393. other,
  7394. join=join,
  7395. axis=axis,
  7396. level=level,
  7397. copy=copy,
  7398. fill_value=fill_value,
  7399. method=method,
  7400. limit=limit,
  7401. fill_axis=fill_axis,
  7402. )
  7403. elif isinstance(other, ABCSeries):
  7404. # this means self is a DataFrame, and we need to broadcast
  7405. # other
  7406. cons = other._constructor_expanddim
  7407. df = cons(
  7408. {c: other for c in self.columns}, **self._construct_axes_dict()
  7409. )
  7410. return self._align_frame(
  7411. df,
  7412. join=join,
  7413. axis=axis,
  7414. level=level,
  7415. copy=copy,
  7416. fill_value=fill_value,
  7417. method=method,
  7418. limit=limit,
  7419. fill_axis=fill_axis,
  7420. )
  7421. if axis is not None:
  7422. axis = self._get_axis_number(axis)
  7423. if isinstance(other, ABCDataFrame):
  7424. return self._align_frame(
  7425. other,
  7426. join=join,
  7427. axis=axis,
  7428. level=level,
  7429. copy=copy,
  7430. fill_value=fill_value,
  7431. method=method,
  7432. limit=limit,
  7433. fill_axis=fill_axis,
  7434. )
  7435. elif isinstance(other, ABCSeries):
  7436. return self._align_series(
  7437. other,
  7438. join=join,
  7439. axis=axis,
  7440. level=level,
  7441. copy=copy,
  7442. fill_value=fill_value,
  7443. method=method,
  7444. limit=limit,
  7445. fill_axis=fill_axis,
  7446. )
  7447. else: # pragma: no cover
  7448. raise TypeError(f"unsupported type: {type(other)}")
  7449. @final
  7450. def _align_frame(
  7451. self,
  7452. other,
  7453. join="outer",
  7454. axis=None,
  7455. level=None,
  7456. copy: bool_t = True,
  7457. fill_value=None,
  7458. method=None,
  7459. limit=None,
  7460. fill_axis=0,
  7461. ):
  7462. # defaults
  7463. join_index, join_columns = None, None
  7464. ilidx, iridx = None, None
  7465. clidx, cridx = None, None
  7466. is_series = isinstance(self, ABCSeries)
  7467. if (axis is None or axis == 0) and not self.index.equals(other.index):
  7468. join_index, ilidx, iridx = self.index.join(
  7469. other.index, how=join, level=level, return_indexers=True
  7470. )
  7471. if (
  7472. (axis is None or axis == 1)
  7473. and not is_series
  7474. and not self.columns.equals(other.columns)
  7475. ):
  7476. join_columns, clidx, cridx = self.columns.join(
  7477. other.columns, how=join, level=level, return_indexers=True
  7478. )
  7479. if is_series:
  7480. reindexers = {0: [join_index, ilidx]}
  7481. else:
  7482. reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
  7483. left = self._reindex_with_indexers(
  7484. reindexers, copy=copy, fill_value=fill_value, allow_dups=True
  7485. )
  7486. # other must be always DataFrame
  7487. right = other._reindex_with_indexers(
  7488. {0: [join_index, iridx], 1: [join_columns, cridx]},
  7489. copy=copy,
  7490. fill_value=fill_value,
  7491. allow_dups=True,
  7492. )
  7493. if method is not None:
  7494. _left = left.fillna(method=method, axis=fill_axis, limit=limit)
  7495. assert _left is not None # needed for mypy
  7496. left = _left
  7497. right = right.fillna(method=method, axis=fill_axis, limit=limit)
  7498. # if DatetimeIndex have different tz, convert to UTC
  7499. left, right = _align_as_utc(left, right, join_index)
  7500. return (
  7501. left.__finalize__(self),
  7502. right.__finalize__(other),
  7503. )
  7504. @final
  7505. def _align_series(
  7506. self,
  7507. other,
  7508. join="outer",
  7509. axis=None,
  7510. level=None,
  7511. copy: bool_t = True,
  7512. fill_value=None,
  7513. method=None,
  7514. limit=None,
  7515. fill_axis=0,
  7516. ):
  7517. is_series = isinstance(self, ABCSeries)
  7518. # series/series compat, other must always be a Series
  7519. if is_series:
  7520. if axis:
  7521. raise ValueError("cannot align series to a series other than axis 0")
  7522. # equal
  7523. if self.index.equals(other.index):
  7524. join_index, lidx, ridx = None, None, None
  7525. else:
  7526. join_index, lidx, ridx = self.index.join(
  7527. other.index, how=join, level=level, return_indexers=True
  7528. )
  7529. left = self._reindex_indexer(join_index, lidx, copy)
  7530. right = other._reindex_indexer(join_index, ridx, copy)
  7531. else:
  7532. # one has > 1 ndim
  7533. fdata = self._mgr
  7534. if axis in [0, 1]:
  7535. join_index = self.axes[axis]
  7536. lidx, ridx = None, None
  7537. if not join_index.equals(other.index):
  7538. join_index, lidx, ridx = join_index.join(
  7539. other.index, how=join, level=level, return_indexers=True
  7540. )
  7541. if lidx is not None:
  7542. bm_axis = self._get_block_manager_axis(axis)
  7543. fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
  7544. else:
  7545. raise ValueError("Must specify axis=0 or 1")
  7546. if copy and fdata is self._mgr:
  7547. fdata = fdata.copy()
  7548. left = self._constructor(fdata)
  7549. if ridx is None:
  7550. right = other
  7551. else:
  7552. right = other.reindex(join_index, level=level)
  7553. # fill
  7554. fill_na = notna(fill_value) or (method is not None)
  7555. if fill_na:
  7556. left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis)
  7557. right = right.fillna(fill_value, method=method, limit=limit)
  7558. # if DatetimeIndex have different tz, convert to UTC
  7559. if is_series or (not is_series and axis == 0):
  7560. left, right = _align_as_utc(left, right, join_index)
  7561. return (
  7562. left.__finalize__(self),
  7563. right.__finalize__(other),
  7564. )
  7565. @final
  7566. def _where(
  7567. self,
  7568. cond,
  7569. other=np.nan,
  7570. inplace=False,
  7571. axis=None,
  7572. level=None,
  7573. errors=lib.no_default,
  7574. ):
  7575. """
  7576. Equivalent to public method `where`, except that `other` is not
  7577. applied as a function even if callable. Used in __setitem__.
  7578. """
  7579. inplace = validate_bool_kwarg(inplace, "inplace")
  7580. if errors is not lib.no_default:
  7581. warnings.warn(
  7582. f"The 'errors' keyword in {type(self).__name__}.where and mask is "
  7583. "deprecated and will be removed in a future version.",
  7584. FutureWarning,
  7585. stacklevel=find_stack_level(),
  7586. )
  7587. if axis is not None:
  7588. axis = self._get_axis_number(axis)
  7589. # align the cond to same shape as myself
  7590. cond = com.apply_if_callable(cond, self)
  7591. if isinstance(cond, NDFrame):
  7592. cond, _ = cond.align(self, join="right", broadcast_axis=1, copy=False)
  7593. else:
  7594. if not hasattr(cond, "shape"):
  7595. cond = np.asanyarray(cond)
  7596. if cond.shape != self.shape:
  7597. raise ValueError("Array conditional must be same shape as self")
  7598. cond = self._constructor(cond, **self._construct_axes_dict())
  7599. # make sure we are boolean
  7600. fill_value = bool(inplace)
  7601. cond = cond.fillna(fill_value)
  7602. msg = "Boolean array expected for the condition, not {dtype}"
  7603. if not cond.empty:
  7604. if not isinstance(cond, ABCDataFrame):
  7605. # This is a single-dimensional object.
  7606. if not is_bool_dtype(cond):
  7607. raise ValueError(msg.format(dtype=cond.dtype))
  7608. else:
  7609. for dt in cond.dtypes:
  7610. if not is_bool_dtype(dt):
  7611. raise ValueError(msg.format(dtype=dt))
  7612. else:
  7613. # GH#21947 we have an empty DataFrame/Series, could be object-dtype
  7614. cond = cond.astype(bool)
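# The inplace branch below uses putmask, which writes `other` where the
# mask is True, so the condition must be inverted in that case.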
  7615. cond = -cond if inplace else cond
  7616. cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False)
  7617. # try to align with other
  7618. if isinstance(other, NDFrame):
  7619. # align with me
  7620. if other.ndim <= self.ndim:
  7621. _, other = self.align(
  7622. other,
  7623. join="left",
  7624. axis=axis,
  7625. level=level,
  7626. fill_value=None,
  7627. copy=False,
  7628. )
  7629. # if we are NOT aligned, raise as we cannot where index
  7630. if axis is None and not other._indexed_same(self):
  7631. raise InvalidIndexError
  7632. elif other.ndim < self.ndim:
  7633. # TODO(EA2D): avoid object-dtype cast in EA case GH#38729
  7634. other = other._values
  7635. if axis == 0:
  7636. other = np.reshape(other, (-1, 1))
  7637. elif axis == 1:
  7638. other = np.reshape(other, (1, -1))
  7639. other = np.broadcast_to(other, self.shape)
  7640. # slice me out of the other
  7641. else:
  7642. raise NotImplementedError(
  7643. "cannot align with a higher dimensional NDFrame"
  7644. )
  7645. elif not isinstance(other, (MultiIndex, NDFrame)):
  7646. # mainly just catching Index here
  7647. other = extract_array(other, extract_numpy=True)
  7648. if isinstance(other, (np.ndarray, ExtensionArray)):
  7649. if other.shape != self.shape:
  7650. if self.ndim != 1:
  7651. # In the ndim == 1 case we may have
  7652. # other length 1, which we treat as scalar (GH#2745, GH#4192)
  7653. # or len(other) == icond.sum(), which we treat like
  7654. # __setitem__ (GH#3235)
  7655. raise ValueError(
  7656. "other must be the same shape as self when an ndarray"
  7657. )
  7658. # we are the same shape, so create an actual object for alignment
  7659. else:
  7660. # error: Argument 1 to "NDFrame" has incompatible type "ndarray";
  7661. # expected "BlockManager"
  7662. other = self._constructor(
  7663. other, **self._construct_axes_dict() # type: ignore[arg-type]
  7664. )
  7665. if axis is None:
  7666. axis = 0
  7667. if self.ndim == getattr(other, "ndim", 0):
  7668. align = True
  7669. else:
  7670. align = self._get_axis_number(axis) == 1
  7671. if inplace:
  7672. # we may have different type blocks come out of putmask, so
  7673. # reconstruct the block manager
  7674. self._check_inplace_setting(other)
  7675. new_data = self._mgr.putmask(mask=cond, new=other, align=align)
  7676. result = self._constructor(new_data)
  7677. return self._update_inplace(result)
  7678. else:
  7679. new_data = self._mgr.where(
  7680. other=other,
  7681. cond=cond,
  7682. align=align,
  7683. )
  7684. result = self._constructor(new_data)
  7685. return result.__finalize__(self)
  7686. @doc(
  7687. klass=_shared_doc_kwargs["klass"],
  7688. cond="True",
  7689. cond_rev="False",
  7690. name="where",
  7691. name_other="mask",
  7692. )
  7693. def where(
  7694. self,
  7695. cond,
  7696. other=np.nan,
  7697. inplace=False,
  7698. axis=None,
  7699. level=None,
  7700. errors=lib.no_default,
  7701. try_cast=lib.no_default,
  7702. ):
  7703. """
  7704. Replace values where the condition is {cond_rev}.
  7705. Parameters
  7706. ----------
  7707. cond : bool {klass}, array-like, or callable
  7708. Where `cond` is {cond}, keep the original value. Where
  7709. {cond_rev}, replace with corresponding value from `other`.
  7710. If `cond` is callable, it is computed on the {klass} and
  7711. should return boolean {klass} or array. The callable must
  7712. not change input {klass} (though pandas doesn't check it).
  7713. other : scalar, {klass}, or callable
  7714. Entries where `cond` is {cond_rev} are replaced with
  7715. corresponding value from `other`.
  7716. If other is callable, it is computed on the {klass} and
  7717. should return scalar or {klass}. The callable must not
  7718. change input {klass} (though pandas doesn't check it).
  7719. inplace : bool, default False
  7720. Whether to perform the operation in place on the data.
  7721. axis : int, default None
  7722. Alignment axis if needed.
  7723. level : int, default None
  7724. Alignment level if needed.
  7725. errors : str, {{'raise', 'ignore'}}, default 'raise'
  7726. Note that currently this parameter won't affect
  7727. the results and will always coerce to a suitable dtype.
  7728. - 'raise' : allow exceptions to be raised.
  7729. - 'ignore' : suppress exceptions. On error return original object.
  7730. .. deprecated:: 1.4.0
  7731. Previously was silently ignored.
  7732. try_cast : bool, default None
  7733. Try to cast the result back to the input type (if possible).
  7734. .. deprecated:: 1.3.0
  7735. Manually cast back if necessary.
  7736. Returns
  7737. -------
  7738. Same type as caller or None if ``inplace=True``.
  7739. See Also
  7740. --------
  7741. :func:`DataFrame.{name_other}` : Return an object of same shape as
  7742. self.
  7743. Notes
  7744. -----
  7745. The {name} method is an application of the if-then idiom. For each
  7746. element in the calling DataFrame, if ``cond`` is ``{cond}`` the
  7747. element is used; otherwise the corresponding element from the DataFrame
  7748. ``other`` is used.
  7749. The signature for :func:`DataFrame.where` differs from
  7750. :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
  7751. ``np.where(m, df1, df2)``.
  7752. For further details and examples see the ``{name}`` documentation in
  7753. :ref:`indexing <indexing.where_mask>`.
  7754. Examples
  7755. --------
  7756. >>> s = pd.Series(range(5))
  7757. >>> s.where(s > 0)
  7758. 0 NaN
  7759. 1 1.0
  7760. 2 2.0
  7761. 3 3.0
  7762. 4 4.0
  7763. dtype: float64
  7764. >>> s.mask(s > 0)
  7765. 0 0.0
  7766. 1 NaN
  7767. 2 NaN
  7768. 3 NaN
  7769. 4 NaN
  7770. dtype: float64
  7771. >>> s.where(s > 1, 10)
  7772. 0 10
  7773. 1 10
  7774. 2 2
  7775. 3 3
  7776. 4 4
  7777. dtype: int64
  7778. >>> s.mask(s > 1, 10)
  7779. 0 0
  7780. 1 1
  7781. 2 10
  7782. 3 10
  7783. 4 10
  7784. dtype: int64
  7785. >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
  7786. >>> df
  7787. A B
  7788. 0 0 1
  7789. 1 2 3
  7790. 2 4 5
  7791. 3 6 7
  7792. 4 8 9
  7793. >>> m = df % 3 == 0
  7794. >>> df.where(m, -df)
  7795. A B
  7796. 0 0 -1
  7797. 1 -2 3
  7798. 2 -4 -5
  7799. 3 6 -7
  7800. 4 -8 9
  7801. >>> df.where(m, -df) == np.where(m, df, -df)
  7802. A B
  7803. 0 True True
  7804. 1 True True
  7805. 2 True True
  7806. 3 True True
  7807. 4 True True
  7808. >>> df.where(m, -df) == df.mask(~m, -df)
  7809. A B
  7810. 0 True True
  7811. 1 True True
  7812. 2 True True
  7813. 3 True True
  7814. 4 True True
  7815. """
  7816. other = com.apply_if_callable(other, self)
  7817. if try_cast is not lib.no_default:
  7818. warnings.warn(
  7819. "try_cast keyword is deprecated and will be removed in a "
  7820. "future version.",
  7821. FutureWarning,
  7822. stacklevel=4,
  7823. )
  7824. return self._where(cond, other, inplace, axis, level, errors=errors)
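# Illustrative sketch (example assumed, not from the pandas test suite): a
# callable ``cond`` is evaluated on the caller via com.apply_if_callable
# before alignment, so passing a lambda is equivalent to passing the
# boolean result directly.
# >>> s = pd.Series([1, 2, 3])
# >>> s.where(lambda x: x > 1, 0)
# 0    0
# 1    2
# 2    3
# dtype: int64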
  7825. @doc(
  7826. where,
  7827. klass=_shared_doc_kwargs["klass"],
  7828. cond="False",
  7829. cond_rev="True",
  7830. name="mask",
  7831. name_other="where",
  7832. )
  7833. def mask(
  7834. self,
  7835. cond,
  7836. other=np.nan,
  7837. inplace=False,
  7838. axis=None,
  7839. level=None,
  7840. errors=lib.no_default,
  7841. try_cast=lib.no_default,
  7842. ):
  7843. inplace = validate_bool_kwarg(inplace, "inplace")
  7844. cond = com.apply_if_callable(cond, self)
  7845. if try_cast is not lib.no_default:
  7846. warnings.warn(
  7847. "try_cast keyword is deprecated and will be removed in a "
  7848. "future version.",
  7849. FutureWarning,
  7850. stacklevel=4,
  7851. )
  7852. # see gh-21891
  7853. if not hasattr(cond, "__invert__"):
  7854. cond = np.array(cond)
  7855. return self.where(
  7856. ~cond,
  7857. other=other,
  7858. inplace=inplace,
  7859. axis=axis,
  7860. level=level,
  7861. errors=errors,
  7862. )
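# Sketch of the gh-21891 path above (example assumed): a plain list has no
# ``__invert__``, so ``mask`` first wraps it in np.array before negating.
# >>> s = pd.Series([1, 2, 3])
# >>> s.mask([True, False, True], 9)
# 0    9
# 1    2
# 2    9
# dtype: int64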
  7863. @doc(klass=_shared_doc_kwargs["klass"])
  7864. def shift(
  7865. self: NDFrameT, periods=1, freq=None, axis=0, fill_value=None
  7866. ) -> NDFrameT:
  7867. """
  7868. Shift index by desired number of periods with an optional time `freq`.
  7869. When `freq` is not passed, shift the index without realigning the data.
7870. If `freq` is passed (in which case the index must be a date or datetime
7871. index, or a `NotImplementedError` is raised), the index will be
7872. advanced by `periods` increments of `freq`. `freq` can be inferred
  7873. when specified as "infer" as long as either freq or inferred_freq
  7874. attribute is set in the index.
  7875. Parameters
  7876. ----------
  7877. periods : int
  7878. Number of periods to shift. Can be positive or negative.
  7879. freq : DateOffset, tseries.offsets, timedelta, or str, optional
  7880. Offset to use from the tseries module or time rule (e.g. 'EOM').
  7881. If `freq` is specified then the index values are shifted but the
  7882. data is not realigned. That is, use `freq` if you would like to
  7883. extend the index when shifting and preserve the original data.
  7884. If `freq` is specified as "infer" then it will be inferred from
  7885. the freq or inferred_freq attributes of the index. If neither of
  7886. those attributes exist, a ValueError is thrown.
7887. axis : {{0 or 'index', 1 or 'columns', None}}, default 0
  7888. Shift direction.
  7889. fill_value : object, optional
  7890. The scalar value to use for newly introduced missing values.
7891. The default depends on the dtype of `self`.
  7892. For numeric data, ``np.nan`` is used.
  7893. For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
  7894. For extension dtypes, ``self.dtype.na_value`` is used.
  7895. .. versionchanged:: 1.1.0
  7896. Returns
  7897. -------
  7898. {klass}
  7899. Copy of input object, shifted.
  7900. See Also
  7901. --------
  7902. Index.shift : Shift values of Index.
  7903. DatetimeIndex.shift : Shift values of DatetimeIndex.
  7904. PeriodIndex.shift : Shift values of PeriodIndex.
  7905. tshift : Shift the time index, using the index's frequency if
  7906. available.
  7907. Examples
  7908. --------
  7909. >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45],
  7910. ... "Col2": [13, 23, 18, 33, 48],
  7911. ... "Col3": [17, 27, 22, 37, 52]}},
  7912. ... index=pd.date_range("2020-01-01", "2020-01-05"))
  7913. >>> df
  7914. Col1 Col2 Col3
  7915. 2020-01-01 10 13 17
  7916. 2020-01-02 20 23 27
  7917. 2020-01-03 15 18 22
  7918. 2020-01-04 30 33 37
  7919. 2020-01-05 45 48 52
  7920. >>> df.shift(periods=3)
  7921. Col1 Col2 Col3
  7922. 2020-01-01 NaN NaN NaN
  7923. 2020-01-02 NaN NaN NaN
  7924. 2020-01-03 NaN NaN NaN
  7925. 2020-01-04 10.0 13.0 17.0
  7926. 2020-01-05 20.0 23.0 27.0
  7927. >>> df.shift(periods=1, axis="columns")
  7928. Col1 Col2 Col3
  7929. 2020-01-01 NaN 10 13
  7930. 2020-01-02 NaN 20 23
  7931. 2020-01-03 NaN 15 18
  7932. 2020-01-04 NaN 30 33
  7933. 2020-01-05 NaN 45 48
  7934. >>> df.shift(periods=3, fill_value=0)
  7935. Col1 Col2 Col3
  7936. 2020-01-01 0 0 0
  7937. 2020-01-02 0 0 0
  7938. 2020-01-03 0 0 0
  7939. 2020-01-04 10 13 17
  7940. 2020-01-05 20 23 27
  7941. >>> df.shift(periods=3, freq="D")
  7942. Col1 Col2 Col3
  7943. 2020-01-04 10 13 17
  7944. 2020-01-05 20 23 27
  7945. 2020-01-06 15 18 22
  7946. 2020-01-07 30 33 37
  7947. 2020-01-08 45 48 52
  7948. >>> df.shift(periods=3, freq="infer")
  7949. Col1 Col2 Col3
  7950. 2020-01-04 10 13 17
  7951. 2020-01-05 20 23 27
  7952. 2020-01-06 15 18 22
  7953. 2020-01-07 30 33 37
  7954. 2020-01-08 45 48 52
  7955. """
  7956. if periods == 0:
  7957. return self.copy()
  7958. if freq is None:
  7959. # when freq is None, data is shifted, index is not
  7960. axis = self._get_axis_number(axis)
  7961. new_data = self._mgr.shift(
  7962. periods=periods, axis=axis, fill_value=fill_value
  7963. )
  7964. return self._constructor(new_data).__finalize__(self, method="shift")
  7965. # when freq is given, index is shifted, data is not
  7966. index = self._get_axis(axis)
  7967. if freq == "infer":
  7968. freq = getattr(index, "freq", None)
  7969. if freq is None:
  7970. freq = getattr(index, "inferred_freq", None)
  7971. if freq is None:
  7972. msg = "Freq was not set in the index hence cannot be inferred"
  7973. raise ValueError(msg)
  7974. elif isinstance(freq, str):
  7975. freq = to_offset(freq)
  7976. if isinstance(index, PeriodIndex):
  7977. orig_freq = to_offset(index.freq)
  7978. if freq != orig_freq:
  7979. assert orig_freq is not None # for mypy
  7980. raise ValueError(
  7981. f"Given freq {freq.rule_code} does not match "
  7982. f"PeriodIndex freq {orig_freq.rule_code}"
  7983. )
  7984. new_ax = index.shift(periods)
  7985. else:
  7986. new_ax = index.shift(periods, freq)
  7987. result = self.set_axis(new_ax, axis=axis)
  7988. return result.__finalize__(self, method="shift")
  7989. @final
  7990. def slice_shift(self: NDFrameT, periods: int = 1, axis=0) -> NDFrameT:
  7991. """
  7992. Equivalent to `shift` without copying data.
  7993. The shifted data will not include the dropped periods and the
  7994. shifted axis will be smaller than the original.
  7995. .. deprecated:: 1.2.0
  7996. slice_shift is deprecated,
  7997. use DataFrame/Series.shift instead.
  7998. Parameters
  7999. ----------
  8000. periods : int
  8001. Number of periods to move, can be positive or negative.
  8002. Returns
  8003. -------
  8004. shifted : same type as caller
  8005. Notes
  8006. -----
8007. While `slice_shift` is faster than `shift`, you may pay for it
  8008. later during alignment.
  8009. """
  8010. msg = (
  8011. "The 'slice_shift' method is deprecated "
  8012. "and will be removed in a future version. "
  8013. "You can use DataFrame/Series.shift instead."
  8014. )
  8015. warnings.warn(msg, FutureWarning, stacklevel=2)
  8016. if periods == 0:
  8017. return self
  8018. if periods > 0:
  8019. vslicer = slice(None, -periods)
  8020. islicer = slice(periods, None)
  8021. else:
  8022. vslicer = slice(-periods, None)
  8023. islicer = slice(None, periods)
  8024. new_obj = self._slice(vslicer, axis=axis)
  8025. shifted_axis = self._get_axis(axis)[islicer]
  8026. new_obj.set_axis(shifted_axis, axis=axis, inplace=True)
  8027. return new_obj.__finalize__(self, method="slice_shift")
  8028. @final
  8029. def tshift(self: NDFrameT, periods: int = 1, freq=None, axis: Axis = 0) -> NDFrameT:
  8030. """
  8031. Shift the time index, using the index's frequency if available.
  8032. .. deprecated:: 1.1.0
  8033. Use `shift` instead.
  8034. Parameters
  8035. ----------
  8036. periods : int
  8037. Number of periods to move, can be positive or negative.
  8038. freq : DateOffset, timedelta, or str, default None
  8039. Increment to use from the tseries module
  8040. or time rule expressed as a string (e.g. 'EOM').
8041. axis : {0 or 'index', 1 or 'columns', None}, default 0
  8042. Corresponds to the axis that contains the Index.
  8043. Returns
  8044. -------
  8045. shifted : Series/DataFrame
  8046. Notes
  8047. -----
8048. If freq is not specified, the freq or inferred_freq attributes of
8049. the index are used. If neither of those attributes exists, a
8050. ValueError is thrown.
  8051. """
  8052. warnings.warn(
  8053. (
  8054. "tshift is deprecated and will be removed in a future version. "
  8055. "Please use shift instead."
  8056. ),
  8057. FutureWarning,
  8058. stacklevel=2,
  8059. )
  8060. if freq is None:
  8061. freq = "infer"
  8062. return self.shift(periods, freq, axis)
  8063. def truncate(
  8064. self: NDFrameT, before=None, after=None, axis=None, copy: bool_t = True
  8065. ) -> NDFrameT:
  8066. """
  8067. Truncate a Series or DataFrame before and after some index value.
  8068. This is a useful shorthand for boolean indexing based on index
  8069. values above or below certain thresholds.
  8070. Parameters
  8071. ----------
  8072. before : date, str, int
  8073. Truncate all rows before this index value.
  8074. after : date, str, int
  8075. Truncate all rows after this index value.
  8076. axis : {0 or 'index', 1 or 'columns'}, optional
  8077. Axis to truncate. Truncates the index (rows) by default.
8078. copy : bool, default True
  8079. Return a copy of the truncated section.
  8080. Returns
  8081. -------
  8082. type of caller
  8083. The truncated Series or DataFrame.
  8084. See Also
  8085. --------
  8086. DataFrame.loc : Select a subset of a DataFrame by label.
  8087. DataFrame.iloc : Select a subset of a DataFrame by position.
  8088. Notes
  8089. -----
  8090. If the index being truncated contains only datetime values,
  8091. `before` and `after` may be specified as strings instead of
  8092. Timestamps.
  8093. Examples
  8094. --------
  8095. >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
  8096. ... 'B': ['f', 'g', 'h', 'i', 'j'],
  8097. ... 'C': ['k', 'l', 'm', 'n', 'o']},
  8098. ... index=[1, 2, 3, 4, 5])
  8099. >>> df
  8100. A B C
  8101. 1 a f k
  8102. 2 b g l
  8103. 3 c h m
  8104. 4 d i n
  8105. 5 e j o
  8106. >>> df.truncate(before=2, after=4)
  8107. A B C
  8108. 2 b g l
  8109. 3 c h m
  8110. 4 d i n
  8111. The columns of a DataFrame can be truncated.
  8112. >>> df.truncate(before="A", after="B", axis="columns")
  8113. A B
  8114. 1 a f
  8115. 2 b g
  8116. 3 c h
  8117. 4 d i
  8118. 5 e j
  8119. For Series, only rows can be truncated.
  8120. >>> df['A'].truncate(before=2, after=4)
  8121. 2 b
  8122. 3 c
  8123. 4 d
  8124. Name: A, dtype: object
  8125. The index values in ``truncate`` can be datetimes or string
  8126. dates.
  8127. >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
  8128. >>> df = pd.DataFrame(index=dates, data={'A': 1})
  8129. >>> df.tail()
  8130. A
  8131. 2016-01-31 23:59:56 1
  8132. 2016-01-31 23:59:57 1
  8133. 2016-01-31 23:59:58 1
  8134. 2016-01-31 23:59:59 1
  8135. 2016-02-01 00:00:00 1
  8136. >>> df.truncate(before=pd.Timestamp('2016-01-05'),
  8137. ... after=pd.Timestamp('2016-01-10')).tail()
  8138. A
  8139. 2016-01-09 23:59:56 1
  8140. 2016-01-09 23:59:57 1
  8141. 2016-01-09 23:59:58 1
  8142. 2016-01-09 23:59:59 1
  8143. 2016-01-10 00:00:00 1
  8144. Because the index is a DatetimeIndex containing only dates, we can
  8145. specify `before` and `after` as strings. They will be coerced to
  8146. Timestamps before truncation.
  8147. >>> df.truncate('2016-01-05', '2016-01-10').tail()
  8148. A
  8149. 2016-01-09 23:59:56 1
  8150. 2016-01-09 23:59:57 1
  8151. 2016-01-09 23:59:58 1
  8152. 2016-01-09 23:59:59 1
  8153. 2016-01-10 00:00:00 1
  8154. Note that ``truncate`` assumes a 0 value for any unspecified time
  8155. component (midnight). This differs from partial string slicing, which
  8156. returns any partially matching dates.
  8157. >>> df.loc['2016-01-05':'2016-01-10', :].tail()
  8158. A
  8159. 2016-01-10 23:59:55 1
  8160. 2016-01-10 23:59:56 1
  8161. 2016-01-10 23:59:57 1
  8162. 2016-01-10 23:59:58 1
  8163. 2016-01-10 23:59:59 1
  8164. """
  8165. if axis is None:
  8166. axis = self._stat_axis_number
  8167. axis = self._get_axis_number(axis)
  8168. ax = self._get_axis(axis)
  8169. # GH 17935
  8170. # Check that index is sorted
  8171. if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
  8172. raise ValueError("truncate requires a sorted index")
  8173. # if we have a date index, convert to dates, otherwise
  8174. # treat like a slice
  8175. if ax._is_all_dates:
  8176. from pandas.core.tools.datetimes import to_datetime
  8177. before = to_datetime(before)
  8178. after = to_datetime(after)
  8179. if before is not None and after is not None and before > after:
  8180. raise ValueError(f"Truncate: {after} must be after {before}")
  8181. if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
  8182. before, after = after, before
  8183. slicer = [slice(None, None)] * self._AXIS_LEN
  8184. slicer[axis] = slice(before, after)
  8185. result = self.loc[tuple(slicer)]
  8186. if isinstance(ax, MultiIndex):
  8187. setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
  8188. if copy:
  8189. result = result.copy()
  8190. return result
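# Sketch of the monotonic-decreasing branch above (example assumed): for a
# sorted-descending index, ``before`` and ``after`` are swapped internally.
# >>> s = pd.Series([1, 2, 3], index=[3, 2, 1])
# >>> s.truncate(before=1, after=2)
# 2    2
# 1    3
# dtype: int64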
  8191. @final
  8192. def tz_convert(
  8193. self: NDFrameT, tz, axis=0, level=None, copy: bool_t = True
  8194. ) -> NDFrameT:
  8195. """
  8196. Convert tz-aware axis to target time zone.
  8197. Parameters
  8198. ----------
  8199. tz : str or tzinfo object
  8200. axis : the axis to convert
  8201. level : int, str, default None
  8202. If axis is a MultiIndex, convert a specific level. Otherwise
  8203. must be None.
  8204. copy : bool, default True
  8205. Also make a copy of the underlying data.
  8206. Returns
  8207. -------
8208. Series or DataFrame
  8209. Object with time zone converted axis.
  8210. Raises
  8211. ------
  8212. TypeError
  8213. If the axis is tz-naive.
  8214. """
  8215. axis = self._get_axis_number(axis)
  8216. ax = self._get_axis(axis)
  8217. def _tz_convert(ax, tz):
  8218. if not hasattr(ax, "tz_convert"):
  8219. if len(ax) > 0:
  8220. ax_name = self._get_axis_name(axis)
  8221. raise TypeError(
  8222. f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
  8223. )
  8224. else:
  8225. ax = DatetimeIndex([], tz=tz)
  8226. else:
  8227. ax = ax.tz_convert(tz)
  8228. return ax
  8229. # if a level is given it must be a MultiIndex level or
  8230. # equivalent to the axis name
  8231. if isinstance(ax, MultiIndex):
  8232. level = ax._get_level_number(level)
  8233. new_level = _tz_convert(ax.levels[level], tz)
  8234. ax = ax.set_levels(new_level, level=level)
  8235. else:
  8236. if level not in (None, 0, ax.name):
  8237. raise ValueError(f"The level {level} is not valid")
  8238. ax = _tz_convert(ax, tz)
  8239. result = self.copy(deep=copy)
  8240. result = result.set_axis(ax, axis=axis, inplace=False)
  8241. return result.__finalize__(self, method="tz_convert")
  8242. @final
  8243. def tz_localize(
  8244. self: NDFrameT,
  8245. tz,
  8246. axis=0,
  8247. level=None,
  8248. copy: bool_t = True,
  8249. ambiguous="raise",
  8250. nonexistent: str = "raise",
  8251. ) -> NDFrameT:
  8252. """
  8253. Localize tz-naive index of a Series or DataFrame to target time zone.
  8254. This operation localizes the Index. To localize the values in a
  8255. timezone-naive Series, use :meth:`Series.dt.tz_localize`.
  8256. Parameters
  8257. ----------
  8258. tz : str or tzinfo
  8259. axis : the axis to localize
  8260. level : int, str, default None
8261. If axis is a MultiIndex, localize a specific level. Otherwise
  8262. must be None.
  8263. copy : bool, default True
  8264. Also make a copy of the underlying data.
  8265. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
  8266. When clocks moved backward due to DST, ambiguous times may arise.
  8267. For example in Central European Time (UTC+01), when going from
  8268. 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
  8269. 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
  8270. `ambiguous` parameter dictates how ambiguous times should be
  8271. handled.
  8272. - 'infer' will attempt to infer fall dst-transition hours based on
  8273. order
  8274. - bool-ndarray where True signifies a DST time, False designates
  8275. a non-DST time (note that this flag is only applicable for
  8276. ambiguous times)
  8277. - 'NaT' will return NaT where there are ambiguous times
  8278. - 'raise' will raise an AmbiguousTimeError if there are ambiguous
  8279. times.
  8280. nonexistent : str, default 'raise'
  8281. A nonexistent time does not exist in a particular timezone
  8282. where clocks moved forward due to DST. Valid values are:
  8283. - 'shift_forward' will shift the nonexistent time forward to the
  8284. closest existing time
  8285. - 'shift_backward' will shift the nonexistent time backward to the
  8286. closest existing time
  8287. - 'NaT' will return NaT where there are nonexistent times
  8288. - timedelta objects will shift nonexistent times by the timedelta
8289. - 'raise' will raise a NonExistentTimeError if there are
  8290. nonexistent times.
  8291. Returns
  8292. -------
  8293. Series or DataFrame
  8294. Same type as the input.
  8295. Raises
  8296. ------
  8297. TypeError
  8298. If the TimeSeries is tz-aware and tz is not None.
  8299. Examples
  8300. --------
  8301. Localize local times:
  8302. >>> s = pd.Series([1],
  8303. ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']))
  8304. >>> s.tz_localize('CET')
  8305. 2018-09-15 01:30:00+02:00 1
  8306. dtype: int64
  8307. Be careful with DST changes. When there is sequential data, pandas
  8308. can infer the DST time:
  8309. >>> s = pd.Series(range(7),
  8310. ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
  8311. ... '2018-10-28 02:00:00',
  8312. ... '2018-10-28 02:30:00',
  8313. ... '2018-10-28 02:00:00',
  8314. ... '2018-10-28 02:30:00',
  8315. ... '2018-10-28 03:00:00',
  8316. ... '2018-10-28 03:30:00']))
  8317. >>> s.tz_localize('CET', ambiguous='infer')
  8318. 2018-10-28 01:30:00+02:00 0
  8319. 2018-10-28 02:00:00+02:00 1
  8320. 2018-10-28 02:30:00+02:00 2
  8321. 2018-10-28 02:00:00+01:00 3
  8322. 2018-10-28 02:30:00+01:00 4
  8323. 2018-10-28 03:00:00+01:00 5
  8324. 2018-10-28 03:30:00+01:00 6
  8325. dtype: int64
  8326. In some cases, inferring the DST is impossible. In such cases, you can
8327. pass an ndarray to the ambiguous parameter to set the DST explicitly.
  8328. >>> s = pd.Series(range(3),
  8329. ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
  8330. ... '2018-10-28 02:36:00',
  8331. ... '2018-10-28 03:46:00']))
  8332. >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
  8333. 2018-10-28 01:20:00+02:00 0
  8334. 2018-10-28 02:36:00+02:00 1
  8335. 2018-10-28 03:46:00+01:00 2
  8336. dtype: int64
  8337. If the DST transition causes nonexistent times, you can shift these
  8338. dates forward or backward with a timedelta object or `'shift_forward'`
  8339. or `'shift_backward'`.
  8340. >>> s = pd.Series(range(2),
  8341. ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
  8342. ... '2015-03-29 03:30:00']))
  8343. >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
  8344. 2015-03-29 03:00:00+02:00 0
  8345. 2015-03-29 03:30:00+02:00 1
  8346. dtype: int64
  8347. >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
  8348. 2015-03-29 01:59:59.999999999+01:00 0
  8349. 2015-03-29 03:30:00+02:00 1
  8350. dtype: int64
  8351. >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
  8352. 2015-03-29 03:30:00+02:00 0
  8353. 2015-03-29 03:30:00+02:00 1
  8354. dtype: int64
  8355. """
  8356. nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
  8357. if nonexistent not in nonexistent_options and not isinstance(
  8358. nonexistent, timedelta
  8359. ):
  8360. raise ValueError(
  8361. "The nonexistent argument must be one of 'raise', "
  8362. "'NaT', 'shift_forward', 'shift_backward' or "
  8363. "a timedelta object"
  8364. )
  8365. axis = self._get_axis_number(axis)
  8366. ax = self._get_axis(axis)
  8367. def _tz_localize(ax, tz, ambiguous, nonexistent):
  8368. if not hasattr(ax, "tz_localize"):
  8369. if len(ax) > 0:
  8370. ax_name = self._get_axis_name(axis)
  8371. raise TypeError(
  8372. f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
  8373. )
  8374. else:
  8375. ax = DatetimeIndex([], tz=tz)
  8376. else:
  8377. ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
  8378. return ax
  8379. # if a level is given it must be a MultiIndex level or
  8380. # equivalent to the axis name
  8381. if isinstance(ax, MultiIndex):
  8382. level = ax._get_level_number(level)
  8383. new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
  8384. ax = ax.set_levels(new_level, level=level)
  8385. else:
  8386. if level not in (None, 0, ax.name):
  8387. raise ValueError(f"The level {level} is not valid")
  8388. ax = _tz_localize(ax, tz, ambiguous, nonexistent)
  8389. result = self.copy(deep=copy)
  8390. result = result.set_axis(ax, axis=axis, inplace=False)
  8391. return result.__finalize__(self, method="tz_localize")
  8392. # ----------------------------------------------------------------------
  8393. # Numeric Methods
  8394. @final
  8395. def describe(
  8396. self: NDFrameT,
  8397. percentiles=None,
  8398. include=None,
  8399. exclude=None,
  8400. datetime_is_numeric=False,
  8401. ) -> NDFrameT:
  8402. """
  8403. Generate descriptive statistics.
  8404. Descriptive statistics include those that summarize the central
  8405. tendency, dispersion and shape of a
  8406. dataset's distribution, excluding ``NaN`` values.
  8407. Analyzes both numeric and object series, as well
  8408. as ``DataFrame`` column sets of mixed data types. The output
  8409. will vary depending on what is provided. Refer to the notes
  8410. below for more detail.
  8411. Parameters
  8412. ----------
  8413. percentiles : list-like of numbers, optional
  8414. The percentiles to include in the output. All should
  8415. fall between 0 and 1. The default is
  8416. ``[.25, .5, .75]``, which returns the 25th, 50th, and
  8417. 75th percentiles.
  8418. include : 'all', list-like of dtypes or None (default), optional
8419. A whitelist of data types to include in the result. Ignored
  8420. for ``Series``. Here are the options:
  8421. - 'all' : All columns of the input will be included in the output.
  8422. - A list-like of dtypes : Limits the results to the
  8423. provided data types.
  8424. To limit the result to numeric types submit
  8425. ``numpy.number``. To limit it instead to object columns submit
  8426. the ``numpy.object`` data type. Strings
  8427. can also be used in the style of
  8428. ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
  8429. select pandas categorical columns, use ``'category'``
  8430. - None (default) : The result will include all numeric columns.
8431. exclude : list-like of dtypes or None (default), optional
8432. A blacklist of data types to omit from the result. Ignored
  8433. for ``Series``. Here are the options:
  8434. - A list-like of dtypes : Excludes the provided data types
  8435. from the result. To exclude numeric types submit
  8436. ``numpy.number``. To exclude object columns submit the data
  8437. type ``numpy.object``. Strings can also be used in the style of
  8438. ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
  8439. exclude pandas categorical columns, use ``'category'``
  8440. - None (default) : The result will exclude nothing.
  8441. datetime_is_numeric : bool, default False
  8442. Whether to treat datetime dtypes as numeric. This affects statistics
  8443. calculated for the column. For DataFrame input, this also
  8444. controls whether datetime columns are included by default.
  8445. .. versionadded:: 1.1.0
  8446. Returns
  8447. -------
  8448. Series or DataFrame
  8449. Summary statistics of the Series or Dataframe provided.
  8450. See Also
  8451. --------
  8452. DataFrame.count: Count number of non-NA/null observations.
  8453. DataFrame.max: Maximum of the values in the object.
  8454. DataFrame.min: Minimum of the values in the object.
  8455. DataFrame.mean: Mean of the values.
  8456. DataFrame.std: Standard deviation of the observations.
  8457. DataFrame.select_dtypes: Subset of a DataFrame including/excluding
  8458. columns based on their dtype.
  8459. Notes
  8460. -----
  8461. For numeric data, the result's index will include ``count``,
  8462. ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
  8463. upper percentiles. By default the lower percentile is ``25`` and the
  8464. upper percentile is ``75``. The ``50`` percentile is the
  8465. same as the median.
  8466. For object data (e.g. strings or timestamps), the result's index
  8467. will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
  8468. is the most common value. The ``freq`` is the most common value's
  8469. frequency. Timestamps also include the ``first`` and ``last`` items.
  8470. If multiple object values have the highest count, then the
  8471. ``count`` and ``top`` results will be arbitrarily chosen from
  8472. among those with the highest count.
  8473. For mixed data types provided via a ``DataFrame``, the default is to
  8474. return only an analysis of numeric columns. If the dataframe consists
  8475. only of object and categorical data without any numeric columns, the
  8476. default is to return an analysis of both the object and categorical
  8477. columns. If ``include='all'`` is provided as an option, the result
  8478. will include a union of attributes of each type.
  8479. The `include` and `exclude` parameters can be used to limit
  8480. which columns in a ``DataFrame`` are analyzed for the output.
  8481. The parameters are ignored when analyzing a ``Series``.
  8482. Examples
  8483. --------
  8484. Describing a numeric ``Series``.
  8485. >>> s = pd.Series([1, 2, 3])
  8486. >>> s.describe()
  8487. count 3.0
  8488. mean 2.0
  8489. std 1.0
  8490. min 1.0
  8491. 25% 1.5
  8492. 50% 2.0
  8493. 75% 2.5
  8494. max 3.0
  8495. dtype: float64
  8496. Describing a categorical ``Series``.
  8497. >>> s = pd.Series(['a', 'a', 'b', 'c'])
  8498. >>> s.describe()
  8499. count 4
  8500. unique 3
  8501. top a
  8502. freq 2
  8503. dtype: object
  8504. Describing a timestamp ``Series``.
  8505. >>> s = pd.Series([
  8506. ... np.datetime64("2000-01-01"),
  8507. ... np.datetime64("2010-01-01"),
  8508. ... np.datetime64("2010-01-01")
  8509. ... ])
  8510. >>> s.describe(datetime_is_numeric=True)
  8511. count 3
  8512. mean 2006-09-01 08:00:00
  8513. min 2000-01-01 00:00:00
  8514. 25% 2004-12-31 12:00:00
  8515. 50% 2010-01-01 00:00:00
  8516. 75% 2010-01-01 00:00:00
  8517. max 2010-01-01 00:00:00
  8518. dtype: object
  8519. Describing a ``DataFrame``. By default only numeric fields
  8520. are returned.
  8521. >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
  8522. ... 'numeric': [1, 2, 3],
  8523. ... 'object': ['a', 'b', 'c']
  8524. ... })
  8525. >>> df.describe()
  8526. numeric
  8527. count 3.0
  8528. mean 2.0
  8529. std 1.0
  8530. min 1.0
  8531. 25% 1.5
  8532. 50% 2.0
  8533. 75% 2.5
  8534. max 3.0
  8535. Describing all columns of a ``DataFrame`` regardless of data type.
  8536. >>> df.describe(include='all') # doctest: +SKIP
  8537. categorical numeric object
  8538. count 3 3.0 3
  8539. unique 3 NaN 3
  8540. top f NaN a
  8541. freq 1 NaN 1
  8542. mean NaN 2.0 NaN
  8543. std NaN 1.0 NaN
  8544. min NaN 1.0 NaN
  8545. 25% NaN 1.5 NaN
  8546. 50% NaN 2.0 NaN
  8547. 75% NaN 2.5 NaN
  8548. max NaN 3.0 NaN
  8549. Describing a column from a ``DataFrame`` by accessing it as
  8550. an attribute.
  8551. >>> df.numeric.describe()
  8552. count 3.0
  8553. mean 2.0
  8554. std 1.0
  8555. min 1.0
  8556. 25% 1.5
  8557. 50% 2.0
  8558. 75% 2.5
  8559. max 3.0
  8560. Name: numeric, dtype: float64
  8561. Including only numeric columns in a ``DataFrame`` description.
  8562. >>> df.describe(include=[np.number])
  8563. numeric
  8564. count 3.0
  8565. mean 2.0
  8566. std 1.0
  8567. min 1.0
  8568. 25% 1.5
  8569. 50% 2.0
  8570. 75% 2.5
  8571. max 3.0
  8572. Including only string columns in a ``DataFrame`` description.
  8573. >>> df.describe(include=[object]) # doctest: +SKIP
  8574. object
  8575. count 3
  8576. unique 3
  8577. top a
  8578. freq 1
  8579. Including only categorical columns from a ``DataFrame`` description.
  8580. >>> df.describe(include=['category'])
  8581. categorical
  8582. count 3
  8583. unique 3
  8584. top d
  8585. freq 1
  8586. Excluding numeric columns from a ``DataFrame`` description.
  8587. >>> df.describe(exclude=[np.number]) # doctest: +SKIP
  8588. categorical object
  8589. count 3 3
  8590. unique 3 3
  8591. top f a
  8592. freq 1 1
  8593. Excluding object columns from a ``DataFrame`` description.
  8594. >>> df.describe(exclude=[object]) # doctest: +SKIP
  8595. categorical numeric
  8596. count 3 3.0
  8597. unique 3 NaN
  8598. top f NaN
  8599. freq 1 NaN
  8600. mean NaN 2.0
  8601. std NaN 1.0
  8602. min NaN 1.0
  8603. 25% NaN 1.5
  8604. 50% NaN 2.0
  8605. 75% NaN 2.5
  8606. max NaN 3.0
  8607. """
  8608. return describe_ndframe(
  8609. obj=self,
  8610. include=include,
  8611. exclude=exclude,
  8612. datetime_is_numeric=datetime_is_numeric,
  8613. percentiles=percentiles,
  8614. )
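# Sketch of the ``percentiles`` argument (values assumed): the 50th
# percentile is always included even when not requested.
# >>> pd.Series([1, 2, 3, 4]).describe(percentiles=[.1, .9])
# count    4.000000
# mean     2.500000
# std      1.290994
# min      1.000000
# 10%      1.300000
# 50%      2.500000
# 90%      3.700000
# max      4.000000
# dtype: float64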
  8615. @final
  8616. def pct_change(
  8617. self: NDFrameT,
  8618. periods=1,
  8619. fill_method="pad",
  8620. limit=None,
  8621. freq=None,
  8622. **kwargs,
  8623. ) -> NDFrameT:
  8624. """
  8625. Percentage change between the current and a prior element.
  8626. Computes the percentage change from the immediately previous row by
  8627. default. This is useful in comparing the percentage of change in a time
  8628. series of elements.
  8629. Parameters
  8630. ----------
  8631. periods : int, default 1
  8632. Periods to shift for forming percent change.
  8633. fill_method : str, default 'pad'
  8634. How to handle NAs before computing percent changes.
  8635. limit : int, default None
  8636. The number of consecutive NAs to fill before stopping.
  8637. freq : DateOffset, timedelta, or str, optional
  8638. Increment to use from time series API (e.g. 'M' or BDay()).
  8639. **kwargs
  8640. Additional keyword arguments are passed into
  8641. `DataFrame.shift` or `Series.shift`.
  8642. Returns
  8643. -------
  8644. chg : Series or DataFrame
  8645. The same type as the calling object.
  8646. See Also
  8647. --------
  8648. Series.diff : Compute the difference of two elements in a Series.
  8649. DataFrame.diff : Compute the difference of two elements in a DataFrame.
  8650. Series.shift : Shift the index by some number of periods.
  8651. DataFrame.shift : Shift the index by some number of periods.
  8652. Examples
  8653. --------
  8654. **Series**
  8655. >>> s = pd.Series([90, 91, 85])
  8656. >>> s
  8657. 0 90
  8658. 1 91
  8659. 2 85
  8660. dtype: int64
  8661. >>> s.pct_change()
  8662. 0 NaN
  8663. 1 0.011111
  8664. 2 -0.065934
  8665. dtype: float64
  8666. >>> s.pct_change(periods=2)
  8667. 0 NaN
  8668. 1 NaN
  8669. 2 -0.055556
  8670. dtype: float64
8671. Percentage change in a Series where NAs are filled with the last
8672. valid observation carried forward to the next valid one.
  8673. >>> s = pd.Series([90, 91, None, 85])
  8674. >>> s
  8675. 0 90.0
  8676. 1 91.0
  8677. 2 NaN
  8678. 3 85.0
  8679. dtype: float64
  8680. >>> s.pct_change(fill_method='ffill')
  8681. 0 NaN
  8682. 1 0.011111
  8683. 2 0.000000
  8684. 3 -0.065934
  8685. dtype: float64
  8686. **DataFrame**
  8687. Percentage change in French franc, Deutsche Mark, and Italian lira from
  8688. 1980-01-01 to 1980-03-01.
  8689. >>> df = pd.DataFrame({
  8690. ... 'FR': [4.0405, 4.0963, 4.3149],
  8691. ... 'GR': [1.7246, 1.7482, 1.8519],
  8692. ... 'IT': [804.74, 810.01, 860.13]},
  8693. ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
  8694. >>> df
  8695. FR GR IT
  8696. 1980-01-01 4.0405 1.7246 804.74
  8697. 1980-02-01 4.0963 1.7482 810.01
  8698. 1980-03-01 4.3149 1.8519 860.13
  8699. >>> df.pct_change()
  8700. FR GR IT
  8701. 1980-01-01 NaN NaN NaN
  8702. 1980-02-01 0.013810 0.013684 0.006549
  8703. 1980-03-01 0.053365 0.059318 0.061876
8704. Percentage change in GOOG and APPL stock volume. Shows how to
8705. compute the percentage change between columns.
  8706. >>> df = pd.DataFrame({
  8707. ... '2016': [1769950, 30586265],
  8708. ... '2015': [1500923, 40912316],
  8709. ... '2014': [1371819, 41403351]},
  8710. ... index=['GOOG', 'APPL'])
  8711. >>> df
  8712. 2016 2015 2014
  8713. GOOG 1769950 1500923 1371819
  8714. APPL 30586265 40912316 41403351
  8715. >>> df.pct_change(axis='columns', periods=-1)
  8716. 2016 2015 2014
  8717. GOOG 0.179241 0.094112 NaN
  8718. APPL -0.252395 -0.011860 NaN
  8719. """
  8720. axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
  8721. if fill_method is None:
  8722. data = self
  8723. else:
  8724. _data = self.fillna(method=fill_method, axis=axis, limit=limit)
  8725. assert _data is not None # needed for mypy
  8726. data = _data
  8727. shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs)
  8728. # Unsupported left operand type for / ("NDFrameT")
  8729. rs = data / shifted - 1 # type: ignore[operator]
  8730. if freq is not None:
  8731. # Shift method is implemented differently when freq is not None
  8732. # We want to restore the original index
  8733. rs = rs.loc[~rs.index.duplicated()]
  8734. rs = rs.reindex_like(data)
  8735. return rs
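# Sketch of ``fill_method=None`` (example assumed): NAs are left in place,
# so both the NA row and the row after it produce NaN changes.
# >>> s = pd.Series([90, 91, None, 85])
# >>> s.pct_change(fill_method=None)
# 0         NaN
# 1    0.011111
# 2         NaN
# 3         NaN
# dtype: float64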
  8736. @final
  8737. def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs):
  8738. if axis is None:
  8739. raise ValueError("Must specify 'axis' when aggregating by level.")
  8740. grouped = self.groupby(level=level, axis=axis, sort=False)
  8741. if hasattr(grouped, name) and skipna:
  8742. return getattr(grouped, name)(**kwargs)
  8743. axis = self._get_axis_number(axis)
  8744. method = getattr(type(self), name)
  8745. applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs)
  8746. return grouped.aggregate(applyf)
  8747. @final
  8748. def _logical_func(
  8749. self, name: str, func, axis=0, bool_only=None, skipna=True, level=None, **kwargs
  8750. ):
  8751. nv.validate_logical_func((), kwargs, fname=name)
  8752. if level is not None:
  8753. warnings.warn(
  8754. "Using the level keyword in DataFrame and Series aggregations is "
  8755. "deprecated and will be removed in a future version. Use groupby "
  8756. "instead. df.any(level=1) should use df.groupby(level=1).any()",
  8757. FutureWarning,
  8758. stacklevel=4,
  8759. )
  8760. if bool_only is not None:
  8761. raise NotImplementedError(
  8762. "Option bool_only is not implemented with option level."
  8763. )
  8764. return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
  8765. if self.ndim > 1 and axis is None:
  8766. # Reduce along one dimension then the other, to simplify DataFrame._reduce
  8767. res = self._logical_func(
  8768. name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
  8769. )
  8770. return res._logical_func(name, func, skipna=skipna, **kwargs)
  8771. return self._reduce(
  8772. func,
  8773. name=name,
  8774. axis=axis,
  8775. skipna=skipna,
  8776. numeric_only=bool_only,
  8777. filter_type="bool",
  8778. )
  8779. def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
  8780. return self._logical_func(
  8781. "any", nanops.nanany, axis, bool_only, skipna, level, **kwargs
  8782. )
  8783. def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
  8784. return self._logical_func(
  8785. "all", nanops.nanall, axis, bool_only, skipna, level, **kwargs
  8786. )
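# Sketch of the axis=None branch in _logical_func above (example assumed):
# the frame is reduced along one axis and then the other, yielding a scalar.
# >>> df = pd.DataFrame([[True, False], [True, True]])
# >>> df.any(axis=None)
# True
# >>> df.all(axis=None)
# False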
  8787. @final
  8788. def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs):
  8789. skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
  8790. if axis is None:
  8791. axis = self._stat_axis_number
  8792. else:
  8793. axis = self._get_axis_number(axis)
  8794. if axis == 1:
  8795. return self.T._accum_func(
  8796. name, func, axis=0, skipna=skipna, *args, **kwargs
  8797. ).T
  8798. def block_accum_func(blk_values):
  8799. values = blk_values.T if hasattr(blk_values, "T") else blk_values
  8800. result = nanops.na_accum_func(values, func, skipna=skipna)
  8801. result = result.T if hasattr(result, "T") else result
  8802. return result
  8803. result = self._mgr.apply(block_accum_func)
  8804. return self._constructor(result).__finalize__(self, method=name)
  8805. def cummax(self, axis=None, skipna=True, *args, **kwargs):
  8806. return self._accum_func(
  8807. "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
  8808. )
  8809. def cummin(self, axis=None, skipna=True, *args, **kwargs):
  8810. return self._accum_func(
  8811. "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
  8812. )
  8813. def cumsum(self, axis=None, skipna=True, *args, **kwargs):
  8814. return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
  8815. def cumprod(self, axis=None, skipna=True, *args, **kwargs):
  8816. return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
  8817. @final
  8818. def _stat_function_ddof(
  8819. self,
  8820. name: str,
  8821. func,
  8822. axis=None,
  8823. skipna=True,
  8824. level=None,
  8825. ddof=1,
  8826. numeric_only=None,
  8827. **kwargs,
  8828. ):
  8829. nv.validate_stat_ddof_func((), kwargs, fname=name)
  8830. if axis is None:
  8831. axis = self._stat_axis_number
  8832. if level is not None:
  8833. warnings.warn(
  8834. "Using the level keyword in DataFrame and Series aggregations is "
  8835. "deprecated and will be removed in a future version. Use groupby "
  8836. "instead. df.var(level=1) should use df.groupby(level=1).var().",
  8837. FutureWarning,
  8838. stacklevel=4,
  8839. )
  8840. return self._agg_by_level(
  8841. name, axis=axis, level=level, skipna=skipna, ddof=ddof
  8842. )
  8843. return self._reduce(
  8844. func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
  8845. )
  8846. def sem(
  8847. self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs
  8848. ):
  8849. return self._stat_function_ddof(
  8850. "sem", nanops.nansem, axis, skipna, level, ddof, numeric_only, **kwargs
  8851. )
  8852. def var(
  8853. self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs
  8854. ):
  8855. return self._stat_function_ddof(
  8856. "var", nanops.nanvar, axis, skipna, level, ddof, numeric_only, **kwargs
  8857. )
  8858. def std(
  8859. self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs
  8860. ):
  8861. return self._stat_function_ddof(
  8862. "std", nanops.nanstd, axis, skipna, level, ddof, numeric_only, **kwargs
  8863. )
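# Sketch of the ``ddof`` argument shared by sem/var/std (values assumed):
# ddof=1 (the default) gives the sample statistic, ddof=0 the population one.
# >>> s = pd.Series([1, 2, 3])
# >>> s.std()
# 1.0
# >>> s.std(ddof=0)
# 0.816496580927726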
  8864. @final
  8865. def _stat_function(
  8866. self,
  8867. name: str,
  8868. func,
  8869. axis=None,
  8870. skipna=True,
  8871. level=None,
  8872. numeric_only=None,
  8873. **kwargs,
  8874. ):
  8875. if name == "median":
  8876. nv.validate_median((), kwargs)
  8877. else:
  8878. nv.validate_stat_func((), kwargs, fname=name)
  8879. if axis is None:
  8880. axis = self._stat_axis_number
  8881. if level is not None:
  8882. warnings.warn(
  8883. "Using the level keyword in DataFrame and Series aggregations is "
  8884. "deprecated and will be removed in a future version. Use groupby "
  8885. "instead. df.median(level=1) should use df.groupby(level=1).median().",
  8886. FutureWarning,
  8887. stacklevel=4,
  8888. )
  8889. return self._agg_by_level(
  8890. name, axis=axis, level=level, skipna=skipna, numeric_only=numeric_only
  8891. )
  8892. return self._reduce(
  8893. func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  8894. )
  8895. def min(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  8896. return self._stat_function(
  8897. "min", nanops.nanmin, axis, skipna, level, numeric_only, **kwargs
  8898. )
  8899. def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  8900. return self._stat_function(
  8901. "max", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs
  8902. )
  8903. def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  8904. return self._stat_function(
  8905. "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs
  8906. )
  8907. def median(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  8908. return self._stat_function(
  8909. "median", nanops.nanmedian, axis, skipna, level, numeric_only, **kwargs
  8910. )
  8911. def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  8912. return self._stat_function(
  8913. "skew", nanops.nanskew, axis, skipna, level, numeric_only, **kwargs
  8914. )
  8915. def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  8916. return self._stat_function(
  8917. "kurt", nanops.nankurt, axis, skipna, level, numeric_only, **kwargs
  8918. )
  8919. kurtosis = kurt
  8920. @final
  8921. def _min_count_stat_function(
  8922. self,
  8923. name: str,
  8924. func,
  8925. axis=None,
  8926. skipna=True,
  8927. level=None,
  8928. numeric_only=None,
  8929. min_count=0,
  8930. **kwargs,
  8931. ):
  8932. if name == "sum":
  8933. nv.validate_sum((), kwargs)
  8934. elif name == "prod":
  8935. nv.validate_prod((), kwargs)
  8936. else:
  8937. nv.validate_stat_func((), kwargs, fname=name)
  8938. if axis is None:
  8939. axis = self._stat_axis_number
  8940. if level is not None:
  8941. warnings.warn(
  8942. "Using the level keyword in DataFrame and Series aggregations is "
  8943. "deprecated and will be removed in a future version. Use groupby "
  8944. "instead. df.sum(level=1) should use df.groupby(level=1).sum().",
  8945. FutureWarning,
  8946. stacklevel=4,
  8947. )
  8948. return self._agg_by_level(
  8949. name,
  8950. axis=axis,
  8951. level=level,
  8952. skipna=skipna,
  8953. min_count=min_count,
  8954. numeric_only=numeric_only,
  8955. )
  8956. return self._reduce(
  8957. func,
  8958. name=name,
  8959. axis=axis,
  8960. skipna=skipna,
  8961. numeric_only=numeric_only,
  8962. min_count=min_count,
  8963. )
  8964. def sum(
  8965. self,
  8966. axis=None,
  8967. skipna=True,
  8968. level=None,
  8969. numeric_only=None,
  8970. min_count=0,
  8971. **kwargs,
  8972. ):
  8973. return self._min_count_stat_function(
  8974. "sum", nanops.nansum, axis, skipna, level, numeric_only, min_count, **kwargs
  8975. )
  8976. def prod(
  8977. self,
  8978. axis=None,
  8979. skipna=True,
  8980. level=None,
  8981. numeric_only=None,
  8982. min_count=0,
  8983. **kwargs,
  8984. ):
  8985. return self._min_count_stat_function(
  8986. "prod",
  8987. nanops.nanprod,
  8988. axis,
  8989. skipna,
  8990. level,
  8991. numeric_only,
  8992. min_count,
  8993. **kwargs,
  8994. )
  8995. product = prod
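# Sketch of ``min_count`` (example assumed): with fewer than ``min_count``
# valid values the result is NA instead of the empty-sum identity.
# >>> pd.Series([], dtype=float).sum()
# 0.0
# >>> pd.Series([], dtype=float).sum(min_count=1)
# nan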
  8996. def mad(self, axis=None, skipna=None, level=None):
  8997. """
  8998. {desc}
  8999. Parameters
  9000. ----------
  9001. axis : {axis_descr}
  9002. Axis for the function to be applied on.
  9003. skipna : bool, default None
  9004. Exclude NA/null values when computing the result.
  9005. level : int or level name, default None
  9006. If the axis is a MultiIndex (hierarchical), count along a
  9007. particular level, collapsing into a {name1}.
  9008. Returns
  9009. -------
  9010. {name1} or {name2} (if level specified)\
  9011. {see_also}\
  9012. {examples}
  9013. """
  9014. if skipna is None:
  9015. skipna = True
  9016. if axis is None:
  9017. axis = self._stat_axis_number
  9018. if level is not None:
  9019. warnings.warn(
  9020. "Using the level keyword in DataFrame and Series aggregations is "
  9021. "deprecated and will be removed in a future version. Use groupby "
  9022. "instead. df.mad(level=1) should use df.groupby(level=1).mad()",
  9023. FutureWarning,
  9024. stacklevel=3,
  9025. )
  9026. return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna)
  9027. data = self._get_numeric_data()
  9028. if axis == 0:
  9029. demeaned = data - data.mean(axis=0)
  9030. else:
  9031. demeaned = data.sub(data.mean(axis=1), axis=0)
  9032. return np.abs(demeaned).mean(axis=axis, skipna=skipna)
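# Sketch of the demean-then-average computation above (values assumed):
# the mean of [1, 2, 3, 4] is 2.5, so the absolute deviations
# [1.5, 0.5, 0.5, 1.5] average to 1.0.
# >>> pd.Series([1, 2, 3, 4]).mad()
# 1.0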
  9033. @classmethod
  9034. def _add_numeric_operations(cls):
  9035. """
  9036. Add the operations to the cls; evaluate the doc strings again
  9037. """
  9038. axis_descr, name1, name2 = _doc_params(cls)
  9039. @doc(
  9040. _bool_doc,
  9041. desc=_any_desc,
  9042. name1=name1,
  9043. name2=name2,
  9044. axis_descr=axis_descr,
  9045. see_also=_any_see_also,
  9046. examples=_any_examples,
  9047. empty_value=False,
  9048. )
  9049. def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
  9050. return NDFrame.any(self, axis, bool_only, skipna, level, **kwargs)
  9051. setattr(cls, "any", any)
  9052. @doc(
  9053. _bool_doc,
  9054. desc=_all_desc,
  9055. name1=name1,
  9056. name2=name2,
  9057. axis_descr=axis_descr,
  9058. see_also=_all_see_also,
  9059. examples=_all_examples,
  9060. empty_value=True,
  9061. )
  9062. def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
  9063. return NDFrame.all(self, axis, bool_only, skipna, level, **kwargs)
  9064. setattr(cls, "all", all)
  9065. # error: Argument 1 to "doc" has incompatible type "Optional[str]"; expected
  9066. # "Union[str, Callable[..., Any]]"
  9067. @doc(
  9068. NDFrame.mad.__doc__, # type: ignore[arg-type]
  9069. desc="Return the mean absolute deviation of the values "
  9070. "over the requested axis.",
  9071. name1=name1,
  9072. name2=name2,
  9073. axis_descr=axis_descr,
  9074. see_also="",
  9075. examples="",
  9076. )
  9077. def mad(self, axis=None, skipna=None, level=None):
  9078. return NDFrame.mad(self, axis, skipna, level)
  9079. setattr(cls, "mad", mad)
  9080. @doc(
  9081. _num_ddof_doc,
  9082. desc="Return unbiased standard error of the mean over requested "
  9083. "axis.\n\nNormalized by N-1 by default. This can be changed "
  9084. "using the ddof argument",
  9085. name1=name1,
  9086. name2=name2,
  9087. axis_descr=axis_descr,
  9088. notes="",
  9089. examples="",
  9090. )
  9091. def sem(
  9092. self,
  9093. axis=None,
  9094. skipna=True,
  9095. level=None,
  9096. ddof=1,
  9097. numeric_only=None,
  9098. **kwargs,
  9099. ):
  9100. return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs)
  9101. setattr(cls, "sem", sem)
  9102. @doc(
  9103. _num_ddof_doc,
  9104. desc="Return unbiased variance over requested axis.\n\nNormalized by "
  9105. "N-1 by default. This can be changed using the ddof argument.",
  9106. name1=name1,
  9107. name2=name2,
  9108. axis_descr=axis_descr,
  9109. notes="",
  9110. examples=_var_examples,
  9111. )
  9112. def var(
  9113. self,
  9114. axis=None,
  9115. skipna=True,
  9116. level=None,
  9117. ddof=1,
  9118. numeric_only=None,
  9119. **kwargs,
  9120. ):
  9121. return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs)
  9122. setattr(cls, "var", var)
  9123. @doc(
  9124. _num_ddof_doc,
  9125. desc="Return sample standard deviation over requested axis."
  9126. "\n\nNormalized by N-1 by default. This can be changed using the "
  9127. "ddof argument.",
  9128. name1=name1,
  9129. name2=name2,
  9130. axis_descr=axis_descr,
  9131. notes=_std_notes,
  9132. examples=_std_examples,
  9133. )
  9134. def std(
  9135. self,
  9136. axis=None,
  9137. skipna=True,
  9138. level=None,
  9139. ddof=1,
  9140. numeric_only=None,
  9141. **kwargs,
  9142. ):
  9143. return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs)
  9144. setattr(cls, "std", std)
  9145. @doc(
  9146. _cnum_doc,
  9147. desc="minimum",
  9148. name1=name1,
  9149. name2=name2,
  9150. axis_descr=axis_descr,
  9151. accum_func_name="min",
  9152. examples=_cummin_examples,
  9153. )
  9154. def cummin(self, axis=None, skipna=True, *args, **kwargs):
  9155. return NDFrame.cummin(self, axis, skipna, *args, **kwargs)
  9156. setattr(cls, "cummin", cummin)
  9157. @doc(
  9158. _cnum_doc,
  9159. desc="maximum",
  9160. name1=name1,
  9161. name2=name2,
  9162. axis_descr=axis_descr,
  9163. accum_func_name="max",
  9164. examples=_cummax_examples,
  9165. )
  9166. def cummax(self, axis=None, skipna=True, *args, **kwargs):
  9167. return NDFrame.cummax(self, axis, skipna, *args, **kwargs)
  9168. setattr(cls, "cummax", cummax)
  9169. @doc(
  9170. _cnum_doc,
  9171. desc="sum",
  9172. name1=name1,
  9173. name2=name2,
  9174. axis_descr=axis_descr,
  9175. accum_func_name="sum",
  9176. examples=_cumsum_examples,
  9177. )
  9178. def cumsum(self, axis=None, skipna=True, *args, **kwargs):
  9179. return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)
  9180. setattr(cls, "cumsum", cumsum)
  9181. @doc(
  9182. _cnum_doc,
  9183. desc="product",
  9184. name1=name1,
  9185. name2=name2,
  9186. axis_descr=axis_descr,
  9187. accum_func_name="prod",
  9188. examples=_cumprod_examples,
  9189. )
  9190. def cumprod(self, axis=None, skipna=True, *args, **kwargs):
  9191. return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)
  9192. setattr(cls, "cumprod", cumprod)
  9193. @doc(
  9194. _num_doc,
  9195. desc="Return the sum of the values over the requested axis.\n\n"
  9196. "This is equivalent to the method ``numpy.sum``.",
  9197. name1=name1,
  9198. name2=name2,
  9199. axis_descr=axis_descr,
  9200. min_count=_min_count_stub,
  9201. see_also=_stat_func_see_also,
  9202. examples=_sum_examples,
  9203. )
  9204. def sum(
  9205. self,
  9206. axis=None,
  9207. skipna=True,
  9208. level=None,
  9209. numeric_only=None,
  9210. min_count=0,
  9211. **kwargs,
  9212. ):
  9213. return NDFrame.sum(
  9214. self, axis, skipna, level, numeric_only, min_count, **kwargs
  9215. )
  9216. setattr(cls, "sum", sum)
  9217. @doc(
  9218. _num_doc,
  9219. desc="Return the product of the values over the requested axis.",
  9220. name1=name1,
  9221. name2=name2,
  9222. axis_descr=axis_descr,
  9223. min_count=_min_count_stub,
  9224. see_also=_stat_func_see_also,
  9225. examples=_prod_examples,
  9226. )
  9227. def prod(
  9228. self,
  9229. axis=None,
  9230. skipna=True,
  9231. level=None,
  9232. numeric_only=None,
  9233. min_count=0,
  9234. **kwargs,
  9235. ):
  9236. return NDFrame.prod(
  9237. self, axis, skipna, level, numeric_only, min_count, **kwargs
  9238. )
  9239. setattr(cls, "prod", prod)
  9240. cls.product = prod
  9241. @doc(
  9242. _num_doc,
  9243. desc="Return the mean of the values over the requested axis.",
  9244. name1=name1,
  9245. name2=name2,
  9246. axis_descr=axis_descr,
  9247. min_count="",
  9248. see_also="",
  9249. examples="",
  9250. )
  9251. def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  9252. return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs)
  9253. setattr(cls, "mean", mean)
  9254. @doc(
  9255. _num_doc,
  9256. desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.",
  9257. name1=name1,
  9258. name2=name2,
  9259. axis_descr=axis_descr,
  9260. min_count="",
  9261. see_also="",
  9262. examples="",
  9263. )
  9264. def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
  9265. return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs)
  9266. setattr(cls, "skew", skew)

        @doc(
            _num_doc,
            desc="Return unbiased kurtosis over requested axis.\n\n"
            "Kurtosis obtained using Fisher's definition of\n"
            "kurtosis (kurtosis of normal == 0.0). Normalized "
            "by N-1.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
            return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "kurt", kurt)
        cls.kurtosis = kurt
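
        # Illustrative call (hypothetical values; Fisher's definition, so the
        # kurtosis of a normal distribution is 0.0):
        #
        #   >>> pd.Series([1, 2, 3, 4]).kurt()
        #   -1.2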

        @doc(
            _num_doc,
            desc="Return the median of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def median(
            self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
        ):
            return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "median", median)
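
        # Illustrative call (hypothetical values; even-length input averages
        # the two middle values):
        #
        #   >>> pd.Series([1, 2, 3, 4]).median()
        #   2.5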

        @doc(
            _num_doc,
            desc="Return the maximum of the values over the requested axis.\n\n"
            "If you want the *index* of the maximum, use ``idxmax``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmax``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_max_examples,
        )
        def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
            return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "max", max)

        @doc(
            _num_doc,
            desc="Return the minimum of the values over the requested axis.\n\n"
            "If you want the *index* of the minimum, use ``idxmin``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmin``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_min_examples,
        )
        def min(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
            return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "min", min)

    @final
    @doc(Rolling)
    def rolling(
        self,
        window: int | timedelta | BaseOffset | BaseIndexer,
        min_periods: int | None = None,
        center: bool_t = False,
        win_type: str | None = None,
        on: str | None = None,
        axis: Axis = 0,
        closed: str | None = None,
        method: str = "single",
    ):
        axis = self._get_axis_number(axis)

        if win_type is not None:
            return Window(
                self,
                window=window,
                min_periods=min_periods,
                center=center,
                win_type=win_type,
                on=on,
                axis=axis,
                closed=closed,
                method=method,
            )

        return Rolling(
            self,
            window=window,
            min_periods=min_periods,
            center=center,
            win_type=win_type,
            on=on,
            axis=axis,
            closed=closed,
            method=method,
        )
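
    # Illustrative use of the entry point above (hypothetical session): a
    # plain integer window and no ``win_type`` returns a ``Rolling`` object.
    #
    #   >>> pd.Series([1, 2, 3]).rolling(window=2).sum()
    #   0    NaN
    #   1    3.0
    #   2    5.0
    #   dtype: float64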

    @final
    @doc(Expanding)
    def expanding(
        self,
        min_periods: int = 1,
        center: bool_t | None = None,
        axis: Axis = 0,
        method: str = "single",
    ) -> Expanding:
        axis = self._get_axis_number(axis)
        if center is not None:
            warnings.warn(
                "The `center` argument on `expanding` will be removed in the future.",
                FutureWarning,
                stacklevel=2,
            )
        else:
            center = False

        return Expanding(
            self, min_periods=min_periods, center=center, axis=axis, method=method
        )
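
    # Illustrative use (hypothetical session): with the default
    # ``min_periods=1`` every prefix of the data produces a value.
    #
    #   >>> pd.Series([1, 2, 3]).expanding().sum()
    #   0    1.0
    #   1    3.0
    #   2    6.0
    #   dtype: float64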

    @final
    @doc(ExponentialMovingWindow)
    def ewm(
        self,
        com: float | None = None,
        span: float | None = None,
        halflife: float | TimedeltaConvertibleTypes | None = None,
        alpha: float | None = None,
        min_periods: int | None = 0,
        adjust: bool_t = True,
        ignore_na: bool_t = False,
        axis: Axis = 0,
        times: str | np.ndarray | DataFrame | Series | None = None,
        method: str = "single",
    ) -> ExponentialMovingWindow:
        axis = self._get_axis_number(axis)
        return ExponentialMovingWindow(
            self,
            com=com,
            span=span,
            halflife=halflife,
            alpha=alpha,
            min_periods=min_periods,
            adjust=adjust,
            ignore_na=ignore_na,
            axis=axis,
            times=times,
            method=method,
        )
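
    # Illustrative use (hypothetical session): with ``adjust=True`` (the
    # default) and ``alpha=0.5``, the second value is (2 + 0.5*1)/(1 + 0.5).
    #
    #   >>> pd.Series([1.0, 2.0]).ewm(alpha=0.5).mean()
    #   0    1.000000
    #   1    1.666667
    #   dtype: float64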

    # ----------------------------------------------------------------------
    # Arithmetic Methods

    @final
    def _inplace_method(self, other, op):
        """
        Wrap arithmetic method to operate inplace.
        """
        result = op(self, other)

        if (
            self.ndim == 1
            and result._indexed_same(self)
            and is_dtype_equal(result.dtype, self.dtype)
        ):
            # GH#36498 this inplace op can _actually_ be inplace.
            self._values[:] = result._values
            return self

        # Delete cacher
        self._reset_cacher()

        # this makes sure that we are aligned like the input
        # we are updating inplace so we want to ignore is_copy
        self._update_inplace(
            result.reindex_like(self, copy=False), verify_is_copy=False
        )
        return self
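
    # Illustrative effect of the fast path above (hypothetical session): when
    # the result keeps the same index and dtype, the buffer is mutated and the
    # very same object comes back.
    #
    #   >>> s = pd.Series([1.0, 2.0])
    #   >>> original = s
    #   >>> s += 1.0
    #   >>> s is original, s.tolist()
    #   (True, [2.0, 3.0])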

    def __iadd__(self, other):
        # error: Unsupported left operand type for + ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__add__)  # type: ignore[operator]

    def __isub__(self, other):
        # error: Unsupported left operand type for - ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__sub__)  # type: ignore[operator]

    def __imul__(self, other):
        # error: Unsupported left operand type for * ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__mul__)  # type: ignore[operator]

    def __itruediv__(self, other):
        # error: Unsupported left operand type for / ("Type[NDFrame]")
        return self._inplace_method(
            other, type(self).__truediv__  # type: ignore[operator]
        )

    def __ifloordiv__(self, other):
        # error: Unsupported left operand type for // ("Type[NDFrame]")
        return self._inplace_method(
            other, type(self).__floordiv__  # type: ignore[operator]
        )

    def __imod__(self, other):
        # error: Unsupported left operand type for % ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__mod__)  # type: ignore[operator]

    def __ipow__(self, other):
        # error: Unsupported left operand type for ** ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__pow__)  # type: ignore[operator]

    def __iand__(self, other):
        # error: Unsupported left operand type for & ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__and__)  # type: ignore[operator]

    def __ior__(self, other):
        # error: Unsupported left operand type for | ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__or__)  # type: ignore[operator]

    def __ixor__(self, other):
        # error: Unsupported left operand type for ^ ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__xor__)  # type: ignore[operator]

    # ----------------------------------------------------------------------
    # Misc methods

    @final
    def _find_valid_index(self, *, how: str) -> Hashable | None:
        """
        Retrieve the index of the first or last valid value, depending on ``how``.

        Parameters
        ----------
        how : {'first', 'last'}
            Use this parameter to change between the first or last valid index.

        Returns
        -------
        idx_first_valid : type of index
        """
        idxpos = find_valid_index(self._values, how=how)
        if idxpos is None:
            return None
        return self.index[idxpos]

    @final
    @doc(position="first", klass=_shared_doc_kwargs["klass"])
    def first_valid_index(self) -> Hashable | None:
        """
        Return index for {position} non-NA value or None, if no non-NA value is found.

        Returns
        -------
        scalar : type of index

        Notes
        -----
        If all elements are NA/null, returns None.
        Also returns None for empty {klass}.
        """
        return self._find_valid_index(how="first")

    @final
    @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"])
    def last_valid_index(self) -> Hashable | None:
        return self._find_valid_index(how="last")
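
    # Illustrative calls (hypothetical values): only position 1 holds a
    # non-NA value, so it is both the first and the last valid index.
    #
    #   >>> s = pd.Series([np.nan, 2.0, np.nan])
    #   >>> s.first_valid_index(), s.last_valid_index()
    #   (1, 1)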


def _doc_params(cls):
    """Return a tuple of the doc params."""
    axis_descr = (
        f"{{{', '.join([f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS)])}}}"
    )
    name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
    name2 = cls.__name__
    return axis_descr, name, name2
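
# Illustrative output of _doc_params for a DataFrame (assuming the standard
# two axes "index" and "columns"):
#
#   axis_descr -> "{index (0), columns (1)}"
#   name       -> "Series"      # _constructor_sliced, since _AXIS_LEN > 1
#   name2      -> "DataFrame"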

_num_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
    Axis for the function to be applied on.
skipna : bool, default True
    Exclude NA/null values when computing the result.
level : int or level name, default None
    If the axis is a MultiIndex (hierarchical), count along a
    particular level, collapsing into a {name1}.
numeric_only : bool, default None
    Include only float, int, boolean columns. If None, will attempt to use
    everything, then use only numeric data. Not implemented for Series.
{min_count}\
**kwargs
    Additional keyword arguments to be passed to the function.

Returns
-------
{name1} or {name2} (if level specified)\
{see_also}\
{examples}
"""
_num_ddof_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
level : int or level name, default None
    If the axis is a MultiIndex (hierarchical), count along a
    particular level, collapsing into a {name1}.
ddof : int, default 1
    Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
    where N represents the number of elements.
numeric_only : bool, default None
    Include only float, int, boolean columns. If None, will attempt to use
    everything, then use only numeric data. Not implemented for Series.

Returns
-------
{name1} or {name2} (if level specified) \
{notes}\
{examples}
"""
_std_notes = """

Notes
-----
To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
default `ddof=1`)."""

_std_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

The standard deviation of the columns can be found as follows:

>>> df.std()
age       18.786076
height     0.237417
dtype: float64

Alternatively, `ddof=0` can be set to normalize by N instead of N-1:

>>> df.std(ddof=0)
age       16.269219
height     0.205609
dtype: float64"""

_var_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

>>> df.var()
age       352.916667
height      0.056367
dtype: float64

Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:

>>> df.var(ddof=0)
age       264.687500
height      0.042275
dtype: float64"""

_bool_doc = """
{desc}

Parameters
----------
axis : {{0 or 'index', 1 or 'columns', None}}, default 0
    Indicate which axis or axes should be reduced.

    * 0 / 'index' : reduce the index, return a Series whose index is the
      original column labels.
    * 1 / 'columns' : reduce the columns, return a Series whose index is the
      original index.
    * None : reduce all axes, return a scalar.

bool_only : bool, default None
    Include only boolean columns. If None, will attempt to use everything,
    then use only boolean data. Not implemented for Series.
skipna : bool, default True
    Exclude NA/null values. If the entire row/column is NA and skipna is
    True, then the result will be {empty_value}, as for an empty row/column.
    If skipna is False, then NA are treated as True, because these are not
    equal to zero.
level : int or level name, default None
    If the axis is a MultiIndex (hierarchical), count along a
    particular level, collapsing into a {name1}.
**kwargs : any, default None
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    If level is specified, then {name2} is returned; otherwise, {name1}
    is returned.

{see_also}
{examples}"""

_all_desc = """\
Return whether all elements are True, potentially over an axis.

Returns True unless there is at least one element within a series or
along a DataFrame axis that is False or equivalent (e.g. zero or
empty)."""

_all_examples = """\
Examples
--------
**Series**

>>> pd.Series([True, True]).all()
True
>>> pd.Series([True, False]).all()
False
>>> pd.Series([], dtype="float64").all()
True
>>> pd.Series([np.nan]).all()
True
>>> pd.Series([np.nan]).all(skipna=False)
True

**DataFrames**

Create a dataframe from a dictionary.

>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
>>> df
   col1   col2
0  True   True
1  True  False

Default behaviour checks if column-wise values all return True.

>>> df.all()
col1     True
col2    False
dtype: bool

Specify ``axis='columns'`` to check if row-wise values all return True.

>>> df.all(axis='columns')
0     True
1    False
dtype: bool

Or ``axis=None`` for whether every value is True.

>>> df.all(axis=None)
False
"""

_all_see_also = """\
See Also
--------
Series.all : Return True if all elements are True.
DataFrame.any : Return True if one (or more) elements are True.
"""

_cnum_doc = """
Return cumulative {desc} over a DataFrame or Series axis.

Returns a DataFrame or Series of the same size containing the cumulative
{desc}.

Parameters
----------
axis : {{0 or 'index', 1 or 'columns'}}, default 0
    The index or the name of the axis. 0 is equivalent to None or 'index'.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
*args, **kwargs
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    Return cumulative {desc} of {name1} or {name2}.

See Also
--------
core.window.Expanding.{accum_func_name} : Similar functionality
    but ignores ``NaN`` values.
{name2}.{accum_func_name} : Return the {desc} over
    {name2} axis.
{name2}.cummax : Return cumulative maximum over {name2} axis.
{name2}.cummin : Return cumulative minimum over {name2} axis.
{name2}.cumsum : Return cumulative sum over {name2} axis.
{name2}.cumprod : Return cumulative product over {name2} axis.

{examples}"""

_cummin_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummin()
0    2.0
1    NaN
2    2.0
3   -1.0
4   -1.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cummin(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the minimum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummin()
     A    B
0  2.0  1.0
1  2.0  NaN
2  1.0  0.0

To iterate over columns and find the minimum in each row,
use ``axis=1``

>>> df.cummin(axis=1)
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
"""

_cumsum_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumsum()
0    2.0
1    NaN
2    7.0
3    6.0
4    6.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cumsum(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the sum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumsum()
     A    B
0  2.0  1.0
1  5.0  NaN
2  6.0  1.0

To iterate over columns and find the sum in each row,
use ``axis=1``

>>> df.cumsum(axis=1)
     A    B
0  2.0  3.0
1  3.0  NaN
2  1.0  1.0
"""

_cumprod_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumprod()
0     2.0
1     NaN
2    10.0
3   -10.0
4    -0.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cumprod(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the product
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumprod()
     A    B
0  2.0  1.0
1  6.0  NaN
2  6.0  0.0

To iterate over columns and find the product in each row,
use ``axis=1``

>>> df.cumprod(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  0.0
"""

_cummax_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummax()
0    2.0
1    NaN
2    5.0
3    5.0
4    5.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cummax(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the maximum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummax()
     A    B
0  2.0  1.0
1  3.0  NaN
2  3.0  1.0

To iterate over columns and find the maximum in each row,
use ``axis=1``

>>> df.cummax(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  1.0
"""

_any_see_also = """\
See Also
--------
numpy.any : NumPy version of this method.
Series.any : Return whether any element is True.
Series.all : Return whether all elements are True.
DataFrame.any : Return whether any element is True over requested axis.
DataFrame.all : Return whether all elements are True over requested axis.
"""

_any_desc = """\
Return whether any element is True, potentially over an axis.

Returns False unless there is at least one element within a series or
along a DataFrame axis that is True or equivalent (e.g. non-zero or
non-empty)."""

_any_examples = """\
Examples
--------
**Series**

For Series input, the output is a scalar indicating whether any element
is True.

>>> pd.Series([False, False]).any()
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype="float64").any()
False
>>> pd.Series([np.nan]).any()
False
>>> pd.Series([np.nan]).any(skipna=False)
True

**DataFrame**

Whether each column contains at least one True element (the default).

>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
>>> df
   A  B  C
0  1  0  0
1  2  2  0

>>> df.any()
A     True
B     True
C    False
dtype: bool

Aggregating over the columns.

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
>>> df
       A  B
0   True  1
1  False  2

>>> df.any(axis='columns')
0    True
1    True
dtype: bool

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
>>> df
       A  B
0   True  1
1  False  0

>>> df.any(axis='columns')
0    True
1    False
dtype: bool

Aggregating over the entire DataFrame with ``axis=None``.

>>> df.any(axis=None)
True

`any` for an empty DataFrame is an empty Series.

>>> pd.DataFrame([]).any()
Series([], dtype: bool)
"""

_shared_docs[
    "stat_func_example"
] = """

Examples
--------
>>> idx = pd.MultiIndex.from_arrays([
...     ['warm', 'warm', 'cold', 'cold'],
...     ['dog', 'falcon', 'fish', 'spider']],
...     names=['blooded', 'animal'])
>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
>>> s
blooded  animal
warm     dog       4
         falcon    2
cold     fish      0
         spider    8
Name: legs, dtype: int64

>>> s.{stat_func}()
{default_output}"""

_sum_examples = _shared_docs["stat_func_example"].format(
    stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
)

_sum_examples += """

By default, the sum of an empty or all-NA Series is ``0``.

>>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
0.0

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.

>>> pd.Series([], dtype="float64").sum(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
0.0

>>> pd.Series([np.nan]).sum(min_count=1)
nan"""

_max_examples = _shared_docs["stat_func_example"].format(
    stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
)

_min_examples = _shared_docs["stat_func_example"].format(
    stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
)

_stat_func_see_also = """

See Also
--------
Series.sum : Return the sum.
Series.min : Return the minimum.
Series.max : Return the maximum.
Series.idxmin : Return the index of the minimum.
Series.idxmax : Return the index of the maximum.
DataFrame.sum : Return the sum over the requested axis.
DataFrame.min : Return the minimum over the requested axis.
DataFrame.max : Return the maximum over the requested axis.
DataFrame.idxmin : Return the index of the minimum over the requested axis.
DataFrame.idxmax : Return the index of the maximum over the requested axis."""

_prod_examples = """

Examples
--------
By default, the product of an empty or all-NA Series is ``1``.

>>> pd.Series([], dtype="float64").prod()
1.0

This can be controlled with the ``min_count`` parameter.

>>> pd.Series([], dtype="float64").prod(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
1.0

>>> pd.Series([np.nan]).prod(min_count=1)
nan"""

_min_count_stub = """\
min_count : int, default 0
    The required number of valid values to perform the operation. If fewer than
    ``min_count`` non-NA values are present the result will be NA.
"""


def _align_as_utc(
    left: NDFrameT, right: NDFrameT, join_index: Index | None
) -> tuple[NDFrameT, NDFrameT]:
    """
    If we are aligning timezone-aware DatetimeIndexes and the timezones
    do not match, convert both to UTC.
    """
    if is_datetime64tz_dtype(left.index.dtype):
        if left.index.tz != right.index.tz:
            if join_index is not None:
                # GH#33671 ensure we don't change the index on
                # our original Series (NB: by default deep=False)
                left = left.copy()
                right = right.copy()
                left.index = join_index
                right.index = join_index

    return left, right
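
# Illustrative sketch of the behaviour handled above (hypothetical session):
# per the pandas timezone documentation, operations between Series in
# different time zones align on the underlying UTC timestamps, so the joined
# index both operands receive is in UTC.
#
#   >>> i = pd.date_range("2021-01-01", periods=2, freq="H", tz="US/Eastern")
#   >>> left = pd.Series([1.0, 2.0], index=i)
#   >>> right = pd.Series([1.0, 2.0], index=i.tz_convert("Europe/Paris"))
#   >>> (left + right).index.dtype
#   datetime64[ns, UTC]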