/pandas/core/generic.py
Python | 11861 lines | 11691 code | 84 blank | 86 comment | 108 complexity | ecd9ba184d70bdea404d36531c1c0098 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- # pyright: reportPropertyTypeMismatch=false
- from __future__ import annotations
- import collections
- from datetime import timedelta
- import functools
- import gc
- import json
- import operator
- import pickle
- import re
- from typing import (
- TYPE_CHECKING,
- Any,
- AnyStr,
- Callable,
- Hashable,
- Literal,
- Mapping,
- Sequence,
- cast,
- final,
- overload,
- )
- import warnings
- import weakref
- import numpy as np
- from pandas._config import config
- from pandas._libs import lib
- from pandas._libs.tslibs import (
- Period,
- Tick,
- Timestamp,
- to_offset,
- )
- from pandas._typing import (
- ArrayLike,
- Axis,
- CompressionOptions,
- Dtype,
- DtypeArg,
- DtypeObj,
- FilePathOrBuffer,
- IndexKeyFunc,
- IndexLabel,
- JSONSerializable,
- Level,
- Manager,
- NDFrameT,
- RandomState,
- Renamer,
- StorageOptions,
- T,
- TimedeltaConvertibleTypes,
- TimestampConvertibleTypes,
- ValueKeyFunc,
- npt,
- )
- from pandas.compat._optional import import_optional_dependency
- from pandas.compat.numpy import function as nv
- from pandas.errors import (
- AbstractMethodError,
- InvalidIndexError,
- )
- from pandas.util._decorators import (
- doc,
- rewrite_axis_style_signature,
- )
- from pandas.util._exceptions import find_stack_level
- from pandas.util._validators import (
- validate_ascending,
- validate_bool_kwarg,
- validate_fillna_kwargs,
- validate_inclusive,
- )
- from pandas.core.dtypes.common import (
- ensure_object,
- ensure_platform_int,
- ensure_str,
- is_bool,
- is_bool_dtype,
- is_datetime64_any_dtype,
- is_datetime64tz_dtype,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float,
- is_list_like,
- is_number,
- is_numeric_dtype,
- is_re_compilable,
- is_scalar,
- is_timedelta64_dtype,
- pandas_dtype,
- )
- from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
- )
- from pandas.core.dtypes.inference import (
- is_hashable,
- is_nested_list_like,
- )
- from pandas.core.dtypes.missing import (
- isna,
- notna,
- )
- from pandas.core import (
- arraylike,
- indexing,
- missing,
- nanops,
- )
- import pandas.core.algorithms as algos
- from pandas.core.arrays import ExtensionArray
- from pandas.core.base import PandasObject
- import pandas.core.common as com
- from pandas.core.construction import (
- create_series_with_explicit_dtype,
- extract_array,
- )
- from pandas.core.describe import describe_ndframe
- from pandas.core.flags import Flags
- from pandas.core.indexes.api import (
- DatetimeIndex,
- Index,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- default_index,
- ensure_index,
- )
- from pandas.core.internals import (
- ArrayManager,
- BlockManager,
- SingleArrayManager,
- )
- from pandas.core.internals.construction import mgr_to_mgr
- from pandas.core.missing import find_valid_index
- from pandas.core.ops import align_method_FRAME
- from pandas.core.reshape.concat import concat
- import pandas.core.sample as sample
- from pandas.core.shared_docs import _shared_docs
- from pandas.core.sorting import get_indexer_indexer
- from pandas.core.window import (
- Expanding,
- ExponentialMovingWindow,
- Rolling,
- Window,
- )
- from pandas.io.formats import format as fmt
- from pandas.io.formats.format import (
- DataFrameFormatter,
- DataFrameRenderer,
- )
- from pandas.io.formats.printing import pprint_thing
- if TYPE_CHECKING:
- from pandas._libs.tslibs import BaseOffset
- from pandas.core.frame import DataFrame
- from pandas.core.indexers.objects import BaseIndexer
- from pandas.core.resample import Resampler
- from pandas.core.series import Series
# goal is to be able to define the docs close to function, while still being
# able to share
# Shallow-copy so additions made in this module do not mutate the shared
# ``pandas.core.shared_docs._shared_docs`` mapping imported above.
_shared_docs = {**_shared_docs}
# Substitution values interpolated into docstrings by the ``@doc`` decorator.
_shared_doc_kwargs = {
    "axes": "keywords for axes",
    "klass": "Series/DataFrame",
    "axes_single_arg": "int or labels for object",
    "args_transpose": "axes to permute (int or label for object)",
    "inplace": """
inplace : bool, default False
    If True, performs operation inplace and returns None.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}
bool_t = bool  # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional in a
    size-mutable, labeled data structure

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """

    # Names stored directly on the instance via ``object.__setattr__``;
    # ``__setattr__``/``__getattr__`` treat these as real attributes rather
    # than index/column labels.
    _internal_names: list[str] = [
        "_mgr",
        "_cacher",
        "_item_cache",
        "_cache",
        "_is_copy",
        "_subtyp",
        "_name",
        "_default_kind",
        "_default_fill_value",
        "_metadata",
        "__array_struct__",
        "__array_interface__",
        "_flags",
    ]
    # Set form of the list above, for O(1) membership tests.
    _internal_names_set: set[str] = set(_internal_names)
    # Names of registered custom accessors (e.g. ``.str``, ``.dt``, ``.cat``).
    _accessors: set[str] = set()
    # Deprecated/internal attributes excluded from ``dir()`` tab-completion.
    _hidden_attrs: frozenset[str] = frozenset(
        ["_AXIS_NAMES", "_AXIS_NUMBERS", "get_values", "tshift"]
    )
    # Names of attributes propagated to results by ``__finalize__``.
    _metadata: list[str] = []
    # Weakref to the object this one is a known copy of (SettingWithCopy
    # machinery); None when not a known copy.
    _is_copy: weakref.ReferenceType[NDFrame] | None = None
    _mgr: Manager
    _attrs: dict[Hashable, Any]
    _typ: str
    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data: Manager,
        copy: bool_t = False,
        attrs: Mapping[Hashable, Any] | None = None,
    ):
        """
        Parameters
        ----------
        data : Manager
            Block/array manager holding the actual data.
        copy : bool, default False
            Unused; retained only for mypy/signature compatibility.
        attrs : Mapping, optional
            Global metadata to attach to the new object.
        """
        # copy kwarg is retained for mypy compat, is not used
        # ``object.__setattr__`` bypasses NDFrame.__setattr__, which would
        # otherwise try to interpret these names as labels.
        object.__setattr__(self, "_is_copy", None)
        object.__setattr__(self, "_mgr", data)
        object.__setattr__(self, "_item_cache", {})
        if attrs is None:
            attrs = {}
        else:
            # defensive copy so later caller mutations don't leak in
            attrs = dict(attrs)
        object.__setattr__(self, "_attrs", attrs)
        object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
    @classmethod
    def _init_mgr(
        cls,
        mgr: Manager,
        axes,
        dtype: Dtype | None = None,
        copy: bool_t = False,
    ) -> Manager:
        """
        passed a manager and a axes dict

        Reindexes ``mgr`` along any explicitly supplied axes, optionally
        copies it, and optionally casts it to ``dtype``.
        """
        # Reindex along each axis that was explicitly supplied.
        for a, axe in axes.items():
            if axe is not None:
                axe = ensure_index(axe)
                bm_axis = cls._get_block_manager_axis(a)
                mgr = mgr.reindex_axis(axe, axis=bm_axis)
        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if (
                isinstance(mgr, BlockManager)
                and len(mgr.blocks) == 1
                and is_dtype_equal(mgr.blocks[0].values.dtype, dtype)
            ):
                # single block already of the requested dtype: nothing to do
                pass
            else:
                mgr = mgr.astype(dtype=dtype)
        return mgr
- @classmethod
- def _from_mgr(cls, mgr: Manager):
- """
- Fastpath to create a new DataFrame/Series from just a BlockManager/ArrayManager.
- Notes
- -----
- Skips setting `_flags` attribute; caller is responsible for doing so.
- """
- obj = cls.__new__(cls)
- object.__setattr__(obj, "_is_copy", None)
- object.__setattr__(obj, "_mgr", mgr)
- object.__setattr__(obj, "_item_cache", {})
- object.__setattr__(obj, "_attrs", {})
- return obj
- def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT:
- """
- Private helper function to create a DataFrame with specific manager.
- Parameters
- ----------
- typ : {"block", "array"}
- copy : bool, default True
- Only controls whether the conversion from Block->ArrayManager
- copies the 1D arrays (to ensure proper/contiguous memory layout).
- Returns
- -------
- DataFrame
- New DataFrame using specified manager type. Is not guaranteed
- to be a copy or not.
- """
- new_mgr: Manager
- new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
- # fastpath of passing a manager doesn't check the option/manager class
- return self._constructor(new_mgr).__finalize__(self)
- # ----------------------------------------------------------------------
- # attrs and flags
- @property
- def attrs(self) -> dict[Hashable, Any]:
- """
- Dictionary of global attributes of this dataset.
- .. warning::
- attrs is experimental and may change without warning.
- See Also
- --------
- DataFrame.flags : Global flags applying to this object.
- """
- if self._attrs is None:
- self._attrs = {}
- return self._attrs
- @attrs.setter
- def attrs(self, value: Mapping[Hashable, Any]) -> None:
- self._attrs = dict(value)
    @final
    @property
    def flags(self) -> Flags:
        """
        Get the properties associated with this pandas object.

        The available flags are

        * :attr:`Flags.allows_duplicate_labels`

        See Also
        --------
        Flags : Flags that apply to pandas objects.
        DataFrame.attrs : Global metadata applying to this dataset.

        Notes
        -----
        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags
        <Flags(allows_duplicate_labels=True)>

        Flags can be get or set using ``.``

        >>> df.flags.allows_duplicate_labels
        True
        >>> df.flags.allows_duplicate_labels = False

        Or by slicing with a key

        >>> df.flags["allows_duplicate_labels"]
        False
        >>> df.flags["allows_duplicate_labels"] = True
        """
        # Set once in __init__/__finalize__; simply expose it here.
        return self._flags
    @final
    def set_flags(
        self: NDFrameT,
        *,
        copy: bool_t = False,
        allows_duplicate_labels: bool_t | None = None,
    ) -> NDFrameT:
        """
        Return a new object with updated flags.

        Parameters
        ----------
        copy : bool, default False
            Whether to copy the underlying data for the returned object.
        allows_duplicate_labels : bool, optional
            Whether the returned object allows duplicate labels.

        Returns
        -------
        Series or DataFrame
            The same type as the caller.

        See Also
        --------
        DataFrame.attrs : Global metadata applying to this dataset.
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        This method returns a new object that's a view on the same data
        as the input. Mutating the input or the output values will be reflected
        in the other.

        This method is intended to be used in method chains.

        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags.allows_duplicate_labels
        True
        >>> df2 = df.set_flags(allows_duplicate_labels=False)
        >>> df2.flags.allows_duplicate_labels
        False
        """
        # Shallow copy (deep=False) shares data with self; flags are per-object.
        df = self.copy(deep=copy)
        if allows_duplicate_labels is not None:
            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
        return df
- @final
- @classmethod
- def _validate_dtype(cls, dtype) -> DtypeObj | None:
- """validate the passed dtype"""
- if dtype is not None:
- dtype = pandas_dtype(dtype)
- # a compound dtype
- if dtype.kind == "V":
- raise NotImplementedError(
- "compound dtypes are not implemented "
- f"in the {cls.__name__} constructor"
- )
- return dtype
    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self: NDFrameT) -> type[NDFrameT]:
        """
        Used when a manipulation result has the same dimensions as the
        original.

        Subclasses (Series/DataFrame) must override; the base raises.
        """
        raise AbstractMethodError(self)
    # ----------------------------------------------------------------------
    # Internals

    @final
    @property
    def _data(self):
        # GH#33054 retained because some downstream packages uses this,
        # e.g. fastparquet
        # Deprecated-ish alias of the internal manager; prefer ``_mgr``.
        return self._mgr
    # ----------------------------------------------------------------------
    # Axis
    # Axis along which statistical reductions operate by default.
    _stat_axis_number = 0
    _stat_axis_name = "index"
    # Ordered axis names; length equals the object's dimensionality.
    _AXIS_ORDERS: list[str]
    # Maps every accepted axis alias (int or string) to its integer number;
    # subclasses extend this (DataFrame adds 1 / "columns").
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {0: 0, "index": 0, "rows": 0}
    _info_axis_number: int
    _info_axis_name: str
    _AXIS_LEN: int
- @property
- def _AXIS_NUMBERS(self) -> dict[str, int]:
- """.. deprecated:: 1.1.0"""
- level = self.ndim + 1
- warnings.warn(
- "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=level
- )
- return {"index": 0}
- @property
- def _AXIS_NAMES(self) -> dict[int, str]:
- """.. deprecated:: 1.1.0"""
- level = self.ndim + 1
- warnings.warn(
- "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=level
- )
- return {0: "index"}
- @final
- def _construct_axes_dict(self, axes=None, **kwargs):
- """Return an axes dictionary for myself."""
- d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
- d.update(kwargs)
- return d
    @final
    @classmethod
    def _construct_axes_from_arguments(
        cls, args, kwargs, require_all: bool_t = False, sentinel=None
    ):
        """
        Construct and returns axes if supplied in args/kwargs.

        If require_all, raise if all axis arguments are not supplied
        return a tuple of (axes, kwargs).

        sentinel specifies the default parameter when an axis is not
        supplied; useful to distinguish when a user explicitly passes None
        in scenarios where None has special meaning.
        """
        # construct the args
        args = list(args)
        for a in cls._AXIS_ORDERS:
            # look for a argument by position
            if a not in kwargs:
                try:
                    # consume positional args in axis order
                    kwargs[a] = args.pop(0)
                except IndexError as err:
                    if require_all:
                        raise TypeError(
                            "not enough/duplicate arguments specified!"
                        ) from err
        # pop every axis out of kwargs, defaulting to the sentinel,
        # leaving only non-axis keywords behind
        axes = {a: kwargs.pop(a, sentinel) for a in cls._AXIS_ORDERS}
        return axes, kwargs
- @final
- @classmethod
- def _get_axis_number(cls, axis: Axis) -> int:
- try:
- return cls._AXIS_TO_AXIS_NUMBER[axis]
- except KeyError:
- raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
- @final
- @classmethod
- def _get_axis_name(cls, axis: Axis) -> str:
- axis_number = cls._get_axis_number(axis)
- return cls._AXIS_ORDERS[axis_number]
- @final
- def _get_axis(self, axis: Axis) -> Index:
- axis_number = self._get_axis_number(axis)
- assert axis_number in {0, 1}
- return self.index if axis_number == 0 else self.columns
- @final
- @classmethod
- def _get_block_manager_axis(cls, axis: Axis) -> int:
- """Map the axis to the block_manager axis."""
- axis = cls._get_axis_number(axis)
- ndim = cls._AXIS_LEN
- if ndim == 2:
- # i.e. DataFrame
- return 1 - axis
- return axis
    @final
    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
        """
        Build name -> values resolvers for one axis, for use by query/eval.

        Each (possibly unnamed) level of the axis becomes an entry; unnamed
        levels get synthetic keys such as ``ilevel_0``/``clevel_0``.
        """
        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]
        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # multiindex
                key = f"{prefix}level_{i}"
                level = i
            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            # re-key the level values by the full axis so alignment works
            s.index = axis_index
            d[key] = s
        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()
        d[axis] = dindex
        return d
- @final
- def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
- from pandas.core.computation.parsing import clean_column_name
- d: dict[str, Series | MultiIndex] = {}
- for axis_name in self._AXIS_ORDERS:
- d.update(self._get_axis_resolvers(axis_name))
- return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
- @final
- def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
- """
- Return the special character free column resolvers of a dataframe.
- Column names with special characters are 'cleaned up' so that they can
- be referred to by backtick quoting.
- Used in :meth:`DataFrame.eval`.
- """
- from pandas.core.computation.parsing import clean_column_name
- if isinstance(self, ABCSeries):
- return {clean_column_name(self.name): self}
- return {
- clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
- }
    @property
    def _info_axis(self) -> Index:
        # The "information" axis: index for Series, columns for DataFrame.
        return getattr(self, self._info_axis_name)
    @property
    def _stat_axis(self) -> Index:
        # Default axis for statistical reductions (the index).
        return getattr(self, self._stat_axis_name)
- @property
- def shape(self) -> tuple[int, ...]:
- """
- Return a tuple of axis dimensions
- """
- return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
- @property
- def axes(self) -> list[Index]:
- """
- Return index label(s) of the internal NDFrame
- """
- # we do it this way because if we have reversed axes, then
- # the block manager shows then reversed
- return [self._get_axis(a) for a in self._AXIS_ORDERS]
    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        # Delegated to the underlying block/array manager.
        return self._mgr.ndim
- @property
- def size(self) -> int:
- """
- Return an int representing the number of elements in this object.
- Return the number of rows if Series. Otherwise return the number of
- rows times number of columns if DataFrame.
- See Also
- --------
- ndarray.size : Number of elements in the array.
- Examples
- --------
- >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
- >>> s.size
- 3
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.size
- 4
- """
- return np.prod(self.shape)
    @overload
    def set_axis(
        self: NDFrameT, labels, axis: Axis = ..., inplace: Literal[False] = ...
    ) -> NDFrameT:
        ...

    @overload
    def set_axis(self, labels, axis: Axis, inplace: Literal[True]) -> None:
        ...

    @overload
    def set_axis(self, labels, *, inplace: Literal[True]) -> None:
        ...

    @overload
    def set_axis(
        self: NDFrameT, labels, axis: Axis = ..., inplace: bool_t = ...
    ) -> NDFrameT | None:
        ...

    def set_axis(self, labels, axis: Axis = 0, inplace: bool_t = False):
        """
        Assign desired index to given axis.

        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
        a list-like or Index.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.
        axis : %(axes_single_arg)s, default 0
            The axis to update. The value 0 identifies the rows%(axis_description_sub)s.
        inplace : bool, default False
            Whether to return a new %(klass)s instance.

        Returns
        -------
        renamed : %(klass)s or None
            An object of type %(klass)s or None if ``inplace=True``.

        See Also
        --------
        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
        """
        # Reject inplace mutation when duplicate labels are disallowed,
        # then delegate to the unvalidated setter.
        self._check_inplace_and_allows_duplicate_labels(inplace)
        return self._set_axis_nocheck(labels, axis, inplace)
- @final
- def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool_t):
- # NDFrame.rename with inplace=False calls set_axis(inplace=True) on a copy.
- if inplace:
- setattr(self, self._get_axis_name(axis), labels)
- else:
- obj = self.copy()
- obj.set_axis(labels, axis=axis, inplace=True)
- return obj
    def _set_axis(self, axis: int, labels: Index) -> None:
        # Replace the axis on the underlying manager, then drop cached items,
        # which may still reference the old labels.
        labels = ensure_index(labels)
        self._mgr.set_axis(axis, labels)
        self._clear_item_cache()
    @final
    def swapaxes(self: NDFrameT, axis1, axis2, copy=True) -> NDFrameT:
        """
        Interchange axes and swap values axes appropriately.

        Parameters
        ----------
        axis1, axis2 : int or str
            The axes to swap.
        copy : bool, default True
            Whether to copy the underlying values.

        Returns
        -------
        y : same as input
        """
        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)
        if i == j:
            # Nothing to swap; still honor the copy flag.
            if copy:
                return self.copy()
            return self
        mapping = {i: j, j: i}
        # Axes in swapped order; values transposed to match.
        new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN))
        new_values = self.values.swapaxes(i, j)
        if copy:
            new_values = new_values.copy()
        # ignore needed because of NDFrame constructor is different than
        # DataFrame/Series constructors.
        return self._constructor(
            # error: Argument 1 to "NDFrame" has incompatible type "ndarray"; expected
            # "Union[ArrayManager, BlockManager]"
            # error: Argument 2 to "NDFrame" has incompatible type "*Generator[Index,
            # None, None]"; expected "bool" [arg-type]
            # error: Argument 2 to "NDFrame" has incompatible type "*Generator[Index,
            # None, None]"; expected "Optional[Mapping[Hashable, Any]]"
            new_values,  # type: ignore[arg-type]
            *new_axes,  # type: ignore[arg-type]
        ).__finalize__(self, method="swapaxes")
    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def droplevel(self: NDFrameT, level, axis=0) -> NDFrameT:
        """
        Return {klass} with requested index / column level(s) removed.

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, must be the name of a level
            If list-like, elements must be names or positional indexes
            of levels.
        axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Axis along which the level(s) is removed:

            * 0 or 'index': remove level(s) in the index.
            * 1 or 'columns': remove level(s) in the columns.

        Returns
        -------
        {klass}
            {klass} with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        # Index.droplevel does the actual work; attach the new labels.
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        return self.set_axis(new_labels, axis=axis, inplace=False)
- def pop(self, item: Hashable) -> Series | Any:
- result = self[item]
- del self[item]
- return result
    @final
    def squeeze(self, axis=None):
        """
        Squeeze 1 dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        # Normalize to a collection of axis numbers eligible for squeezing.
        axis = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
        # Index with 0 (drops the dimension) on each eligible length-1 axis,
        # slice(None) (keeps the dimension) everywhere else.
        return self.iloc[
            tuple(
                0 if i in axis and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]
    # ----------------------------------------------------------------------
    # Rename

    def rename(
        self: NDFrameT,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool_t = True,
        inplace: bool_t = False,
        level: Level | None = None,
        errors: str = "ignore",
    ) -> NDFrameT | None:
        """
        Alter axes input function or functions. Function / dict values must be
        unique (1-to-1). Labels not contained in a dict / Series will be left
        as-is. Extra labels listed don't throw an error. Alternatively, change
        ``Series.name`` with a scalar value (Series only).

        Parameters
        ----------
        %(axes)s : scalar, list-like, dict-like or function, optional
            Scalar or list-like will alter the ``Series.name`` attribute,
            and raise on DataFrame.
            dict-like or functions are transformations to apply to
            that axis' values
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to return a new {klass}. If True then value of copy is
            ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        renamed : {klass} (new object)

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        NDFrame.rename_axis

        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s
        0    1
        1    2
        2    3
        dtype: int64
        >>> s.rename("my_name")  # scalar, changes Series.name
        0    1
        1    2
        2    3
        Name: my_name, dtype: int64
        >>> s.rename(lambda x: x ** 2)  # function, changes labels
        0    1
        1    2
        4    3
        dtype: int64
        >>> s.rename({1: 3, 2: 5})  # mapping, changes labels
        0    1
        3    2
        5    3
        dtype: int64

        Since ``DataFrame`` doesn't have a ``.name`` attribute,
        only mapping-type arguments are allowed.

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(2)
        Traceback (most recent call last):
        ...
        TypeError: 'int' object is not callable

        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        >>> df.rename(index=str, columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename(index=str, columns={"A": "a", "C": "c"})
           a  B
        0  1  4
        1  2  5
        2  3  6

        Using axis-style parameters

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6

        See the :ref:`user guide <basics.rename>` for more.
        """
        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")
        if index is not None or columns is not None:
            # keyword convention: index=/columns= may not mix with mapper/axis
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            elif mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper
        self._check_inplace_and_allows_duplicate_labels(inplace)
        result = self if inplace else self.copy(deep=copy)
        # axis_no 0 -> index, 1 -> columns
        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue
            ax = self._get_axis(axis_no)
            f = com.get_rename_function(replacements)
            if level is not None:
                level = ax._get_level_number(level)
            # GH 13473
            if not callable(replacements):
                # dict-like: optionally verify every key exists in the axis
                if ax._is_multi and level is not None:
                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
                else:
                    indexer = ax.get_indexer_for(replacements)
                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")
            new_index = ax._transform_index(f, level=level)
            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True)
            result._clear_item_cache()
        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result.__finalize__(self, method="rename")
    @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)])
    def rename_axis(self, mapper=lib.no_default, **kwargs):
        """
        Set the name of the axis for the index or columns.

        Parameters
        ----------
        mapper : scalar, list-like, optional
            Value to set the axis name attribute.
        index, columns : scalar, list-like, dict-like or function, optional
            A scalar, list-like, dict-like or functions transformations to
            apply to that axis' values.
            Note that the ``columns`` parameter is not allowed if the
            object is a Series. This parameter only apply for DataFrame
            type objects.

            Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index``
            and/or ``columns``.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to rename.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Series
            or DataFrame.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Series.rename : Alter Series index labels or name.
        DataFrame.rename : Alter DataFrame index labels or name.
        Index.rename : Set new names on index.

        Notes
        -----
        ``DataFrame.rename_axis`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        The first calling convention will only modify the names of
        the index and/or the names of the Index object that is the columns.
        In this case, the parameter ``copy`` is ignored.

        The second calling convention will modify the names of the
        corresponding index if mapper is a list or a scalar.
        However, if mapper is dict-like or a function, it will use the
        deprecated behavior of modifying the axis *labels*.

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Examples
        --------
        **Series**

        >>> s = pd.Series(["dog", "cat", "monkey"])
        >>> s
        0       dog
        1       cat
        2    monkey
        dtype: object
        >>> s.rename_axis("animal")
        animal
        0    dog
        1    cat
        2    monkey
        dtype: object

        **DataFrame**

        >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
        ...                    "num_arms": [0, 0, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs  num_arms
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("animal")
        >>> df
                num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("limbs", axis="columns")
        >>> df
        limbs   num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2

        **MultiIndex**

        >>> df.index = pd.MultiIndex.from_product([['mammal'],
        ...                                        ['dog', 'cat', 'monkey']],
        ...                                       names=['type', 'name'])
        >>> df
        limbs          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        >>> df.rename_axis(index={'type': 'class'})
        limbs          num_legs  num_arms
        class  name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        >>> df.rename_axis(columns=str.upper)
        LIMBS          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        """
        # Split axis keywords (index/columns) out of kwargs; lib.no_default
        # distinguishes "not passed" from an explicit None.
        axes, kwargs = self._construct_axes_from_arguments(
            (), kwargs, sentinel=lib.no_default
        )
        copy = kwargs.pop("copy", True)
        inplace = kwargs.pop("inplace", False)
        axis = kwargs.pop("axis", 0)
        if axis is not None:
            axis = self._get_axis_number(axis)
        if kwargs:
            raise TypeError(
                "rename_axis() got an unexpected keyword "
                f'argument "{list(kwargs.keys())[0]}"'
            )
        inplace = validate_bool_kwarg(inplace, "inplace")
        if mapper is not lib.no_default:
            # Use v0.23 behavior if a scalar or list
            non_mapper = is_scalar(mapper) or (
                is_list_like(mapper) and not is_dict_like(mapper)
            )
            if non_mapper:
                return self._set_axis_name(mapper, axis=axis, inplace=inplace)
            else:
                raise ValueError("Use `.rename` to alter labels with a mapper.")
        else:
            # Use new behavior. Means that index and/or columns
            # is specified
            result = self if inplace else self.copy(deep=copy)
            for axis in range(self._AXIS_LEN):
                v = axes.get(self._get_axis_name(axis))
                if v is lib.no_default:
                    continue
                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
                if non_mapper:
                    # scalar/list: set the name(s) directly
                    newnames = v
                else:
                    # dict-like/function: transform each level name
                    f = com.get_rename_function(v)
                    curnames = self._get_axis(axis).names
                    newnames = [f(name) for name in curnames]
                result._set_axis_name(newnames, axis=axis, inplace=True)
            if not inplace:
                return result
- @final
- def _set_axis_name(self, name, axis=0, inplace=False):
- """
- Set the name(s) of the axis.
- Parameters
- ----------
- name : str or list of str
- Name(s) to set.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to set the label. The value 0 or 'index' specifies index,
- and the value 1 or 'columns' specifies columns.
- inplace : bool, default False
- If `True`, do operation inplace and return None.
- Returns
- -------
- Series, DataFrame, or None
- The same type as the caller or `None` if `inplace` is `True`.
- See Also
- --------
- DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
- Series.rename : Alter the index labels or set the index name
- of :class:`Series`.
- Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.
- Examples
- --------
- >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
- ... ["dog", "cat", "monkey"])
- >>> df
- num_legs
- dog 4
- cat 4
- monkey 2
- >>> df._set_axis_name("animal")
- num_legs
- animal
- dog 4
- cat 4
- monkey 2
- >>> df.index = pd.MultiIndex.from_product(
- ... [["mammal"], ['dog', 'cat', 'monkey']])
- >>> df._set_axis_name(["type", "name"])
- num_legs
- type name
- mammal dog 4
- cat 4
- monkey 2
- """
- axis = self._get_axis_number(axis)
- idx = self._get_axis(axis).set_names(name)
- inplace = validate_bool_kwarg(inplace, "inplace")
- renamed = self if inplace else self.copy()
- renamed.set_axis(idx, axis=axis, inplace=True)
- if not inplace:
- return renamed
- # ----------------------------------------------------------------------
- # Comparison Methods
- @final
- def _indexed_same(self, other) -> bool_t:
- return all(
- self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
- )
- @final
- def equals(self, other: object) -> bool_t:
- """
- Test whether two objects contain the same elements.
- This function allows two Series or DataFrames to be compared against
- each other to see if they have the same shape and elements. NaNs in
- the same location are considered equal.
- The row/column index do not need to have the same type, as long
- as the values are considered equal. Corresponding columns must be of
- the same dtype.
- Parameters
- ----------
- other : Series or DataFrame
- The other Series or DataFrame to be compared with the first.
- Returns
- -------
- bool
- True if all elements are the same in both objects, False
- otherwise.
- See Also
- --------
- Series.eq : Compare two Series objects of the same length
- and return a Series where each element is True if the element
- in each Series is equal, False otherwise.
- DataFrame.eq : Compare two DataFrame objects of the same shape and
- return a DataFrame where each element is True if the respective
- element in each DataFrame is equal, False otherwise.
- testing.assert_series_equal : Raises an AssertionError if left and
- right are not equal. Provides an easy interface to ignore
- inequality in dtypes, indexes and precision among others.
- testing.assert_frame_equal : Like assert_series_equal, but targets
- DataFrames.
- numpy.array_equal : Return True if two arrays have the same shape
- and elements, False otherwise.
- Examples
- --------
- >>> df = pd.DataFrame({1: [10], 2: [20]})
- >>> df
- 1 2
- 0 10 20
- DataFrames df and exactly_equal have the same types and values for
- their elements and column labels, which will return True.
- >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
- >>> exactly_equal
- 1 2
- 0 10 20
- >>> df.equals(exactly_equal)
- True
- DataFrames df and different_column_type have the same element
- types and values, but have different types for the column labels,
- which will still return True.
- >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
- >>> different_column_type
- 1.0 2.0
- 0 10 20
- >>> df.equals(different_column_type)
- True
- DataFrames df and different_data_type have different types for the
- same values for their elements, and will return False even though
- their column labels are the same values and types.
- >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
- >>> different_data_type
- 1 2
- 0 10.0 20.0
- >>> df.equals(different_data_type)
- False
- """
- if not (isinstance(other, type(self)) or isinstance(self, type(other))):
- return False
- other = cast(NDFrame, other)
- return self._mgr.equals(other._mgr)
- # -------------------------------------------------------------------------
- # Unary Methods
- @final
- def __neg__(self):
- def blk_func(values: ArrayLike):
- if is_bool_dtype(values.dtype):
- return operator.inv(values)
- else:
- return operator.neg(values)
- new_data = self._mgr.apply(blk_func)
- res = self._constructor(new_data)
- return res.__finalize__(self, method="__neg__")
- @final
- def __pos__(self):
- def blk_func(values: ArrayLike):
- if is_bool_dtype(values.dtype):
- return values.copy()
- else:
- return operator.pos(values)
- new_data = self._mgr.apply(blk_func)
- res = self._constructor(new_data)
- return res.__finalize__(self, method="__pos__")
- @final
- def __invert__(self):
- if not self.size:
- # inv fails with 0 len
- return self
- new_data = self._mgr.apply(operator.invert)
- return self._constructor(new_data).__finalize__(self, method="__invert__")
- @final
- def __nonzero__(self):
- raise ValueError(
- f"The truth value of a {type(self).__name__} is ambiguous. "
- "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
- )
- __bool__ = __nonzero__
- @final
- def bool(self):
- """
- Return the bool of a single element Series or DataFrame.
- This must be a boolean scalar value, either True or False. It will raise a
- ValueError if the Series or DataFrame does not have exactly 1 element, or that
- element is not boolean (integer values 0 and 1 will also raise an exception).
- Returns
- -------
- bool
- The value in the Series or DataFrame.
- See Also
- --------
- Series.astype : Change the data type of a Series, including to boolean.
- DataFrame.astype : Change the data type of a DataFrame, including to boolean.
- numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.
- Examples
- --------
- The method will only work for single element objects with a boolean value:
- >>> pd.Series([True]).bool()
- True
- >>> pd.Series([False]).bool()
- False
- >>> pd.DataFrame({'col': [True]}).bool()
- True
- >>> pd.DataFrame({'col': [False]}).bool()
- False
- """
- v = self.squeeze()
- if isinstance(v, (bool, np.bool_)):
- return bool(v)
- elif is_scalar(v):
- raise ValueError(
- "bool cannot act on a non-boolean single element "
- f"{type(self).__name__}"
- )
- self.__nonzero__()
- @final
- def abs(self: NDFrameT) -> NDFrameT:
- """
- Return a Series/DataFrame with absolute numeric value of each element.
- This function only applies to elements that are all numeric.
- Returns
- -------
- abs
- Series/DataFrame containing the absolute value of each element.
- See Also
- --------
- numpy.absolute : Calculate the absolute value element-wise.
- Notes
- -----
- For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
- :math:`\\sqrt{ a^2 + b^2 }`.
- Examples
- --------
- Absolute numeric values in a Series.
- >>> s = pd.Series([-1.10, 2, -3.33, 4])
- >>> s.abs()
- 0 1.10
- 1 2.00
- 2 3.33
- 3 4.00
- dtype: float64
- Absolute numeric values in a Series with complex numbers.
- >>> s = pd.Series([1.2 + 1j])
- >>> s.abs()
- 0 1.56205
- dtype: float64
- Absolute numeric values in a Series with a Timedelta element.
- >>> s = pd.Series([pd.Timedelta('1 days')])
- >>> s.abs()
- 0 1 days
- dtype: timedelta64[ns]
- Select rows with data closest to certain value using argsort (from
- `StackOverflow <https://stackoverflow.com/a/17758115>`__).
- >>> df = pd.DataFrame({
- ... 'a': [4, 5, 6, 7],
- ... 'b': [10, 20, 30, 40],
- ... 'c': [100, 50, -30, -50]
- ... })
- >>> df
- a b c
- 0 4 10 100
- 1 5 20 50
- 2 6 30 -30
- 3 7 40 -50
- >>> df.loc[(df.c - 43).abs().argsort()]
- a b c
- 1 5 20 50
- 0 4 10 100
- 2 6 30 -30
- 3 7 40 -50
- """
- res_mgr = self._mgr.apply(np.abs)
- return self._constructor(re…
Large files files are truncated, but you can click here to view the full file