/pandas/core/frame.py
Python | 10964 lines | 10896 code | 26 blank | 42 comment | 80 complexity | c9914df2f7e7376efe60854a7496065d MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated; click here to view the full file
- """
- DataFrame
- ---------
- An efficient 2D container for potentially mixed-type time series or other
- labeled data series.
- Similar to its R counterpart, data.frame, except providing automatic data
- alignment and a host of useful data manipulation methods having to do with the
- labeling information
- """
- from __future__ import annotations
- import collections
- from collections import abc
- import datetime
- import functools
- from io import StringIO
- import itertools
- import mmap
- from textwrap import dedent
- from typing import (
- IO,
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Iterable,
- Iterator,
- Literal,
- Sequence,
- cast,
- overload,
- )
- import warnings
- import numpy as np
- import numpy.ma as ma
- from pandas._config import get_option
- from pandas._libs import (
- algos as libalgos,
- lib,
- properties,
- )
- from pandas._libs.hashtable import duplicated
- from pandas._libs.lib import no_default
- from pandas._typing import (
- AggFuncType,
- AnyArrayLike,
- ArrayLike,
- Axes,
- Axis,
- ColspaceArgType,
- CompressionOptions,
- Dtype,
- DtypeObj,
- FilePathOrBuffer,
- FillnaOptions,
- FloatFormatType,
- FormattersType,
- Frequency,
- IndexKeyFunc,
- IndexLabel,
- Level,
- PythonFuncType,
- Renamer,
- Scalar,
- StorageOptions,
- Suffixes,
- TimedeltaConvertibleTypes,
- TimestampConvertibleTypes,
- ValueKeyFunc,
- npt,
- )
- from pandas.compat._optional import import_optional_dependency
- from pandas.compat.numpy import function as nv
- from pandas.util._decorators import (
- Appender,
- Substitution,
- deprecate_kwarg,
- deprecate_nonkeyword_arguments,
- doc,
- rewrite_axis_style_signature,
- )
- from pandas.util._validators import (
- validate_ascending,
- validate_axis_style_args,
- validate_bool_kwarg,
- validate_percentile,
- )
- from pandas.core.dtypes.cast import (
- construct_1d_arraylike_from_scalar,
- construct_2d_arraylike_from_scalar,
- find_common_type,
- infer_dtype_from_scalar,
- invalidate_string_dtypes,
- maybe_box_native,
- maybe_downcast_to_dtype,
- validate_numeric_casting,
- )
- from pandas.core.dtypes.common import (
- ensure_platform_int,
- infer_dtype_from_object,
- is_1d_only_ea_dtype,
- is_1d_only_ea_obj,
- is_bool_dtype,
- is_dataclass,
- is_datetime64_any_dtype,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float,
- is_float_dtype,
- is_hashable,
- is_integer,
- is_integer_dtype,
- is_iterator,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_sequence,
- pandas_dtype,
- )
- from pandas.core.dtypes.dtypes import ExtensionDtype
- from pandas.core.dtypes.missing import (
- isna,
- notna,
- )
- from pandas.core import (
- algorithms,
- common as com,
- generic,
- nanops,
- ops,
- )
- from pandas.core.accessor import CachedAccessor
- from pandas.core.apply import (
- reconstruct_func,
- relabel_result,
- )
- from pandas.core.array_algos.take import take_2d_multi
- from pandas.core.arraylike import OpsMixin
- from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
- TimedeltaArray,
- )
- from pandas.core.arrays.sparse import SparseFrameAccessor
- from pandas.core.construction import (
- extract_array,
- sanitize_array,
- sanitize_masked_array,
- )
- from pandas.core.generic import (
- NDFrame,
- _shared_docs,
- )
- from pandas.core.indexers import check_key_length
- from pandas.core.indexes.api import (
- DatetimeIndex,
- Index,
- PeriodIndex,
- default_index,
- ensure_index,
- ensure_index_from_sequences,
- )
- from pandas.core.indexes.multi import (
- MultiIndex,
- maybe_droplevels,
- )
- from pandas.core.indexing import (
- check_bool_indexer,
- convert_to_index_sliceable,
- )
- from pandas.core.internals import (
- ArrayManager,
- BlockManager,
- )
- from pandas.core.internals.construction import (
- arrays_to_mgr,
- dataclasses_to_dicts,
- dict_to_mgr,
- mgr_to_mgr,
- ndarray_to_mgr,
- nested_data_to_arrays,
- rec_array_to_mgr,
- reorder_arrays,
- to_arrays,
- treat_as_nested,
- )
- from pandas.core.reshape.melt import melt
- from pandas.core.series import Series
- from pandas.core.sorting import (
- get_group_index,
- lexsort_indexer,
- nargsort,
- )
- from pandas.io.common import get_handle
- from pandas.io.formats import (
- console,
- format as fmt,
- )
- from pandas.io.formats.info import (
- BaseInfo,
- DataFrameInfo,
- )
- import pandas.plotting
- if TYPE_CHECKING:
- from pandas.core.groupby.generic import DataFrameGroupBy
- from pandas.core.resample import Resampler
- from pandas.io.formats.style import Styler
- # ---------------------------------------------------------------------
- # Docstring templates
- _shared_doc_kwargs = {
- "axes": "index, columns",
- "klass": "DataFrame",
- "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
- "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
- If 0 or 'index': apply function to each column.
- If 1 or 'columns': apply function to each row.""",
- "inplace": """
- inplace : bool, default False
- If True, performs operation inplace and returns None.""",
- "optional_by": """
- by : str or list of str
- Name or list of names to sort by.
- - if `axis` is 0 or `'index'` then `by` may contain index
- levels and/or column labels.
- - if `axis` is 1 or `'columns'` then `by` may contain column
- levels and/or index labels.""",
- "optional_labels": """labels : array-like, optional
- New labels / index to conform the axis specified by 'axis' to.""",
- "optional_axis": """axis : int or str, optional
- Axis to target. Can be either the axis name ('index', 'columns')
- or number (0, 1).""",
- "replace_iloc": """
- This differs from updating with ``.loc`` or ``.iloc``, which require
- you to specify a location to update with some value.""",
- }
- _numeric_only_doc = """numeric_only : bool or None, default None
- Include only float, int, boolean data. If None, will attempt to use
- everything, then use only numeric data
- """
- _merge_doc = """
- Merge DataFrame or named Series objects with a database-style join.
- A named Series object is treated as a DataFrame with a single named column.
- The join is done on columns or indexes. If joining columns on
- columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
- on indexes or indexes on a column or columns, the index will be passed on.
- When performing a cross merge, no column specifications to merge on are
- allowed.
- .. warning::
- If both key columns contain rows where the key is a null value, those
- rows will be matched against each other. This is different from usual SQL
- join behaviour and can lead to unexpected results.
- Parameters
- ----------%s
- right : DataFrame or named Series
- Object to merge with.
- how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
- Type of merge to be performed.
- * left: use only keys from left frame, similar to a SQL left outer join;
- preserve key order.
- * right: use only keys from right frame, similar to a SQL right outer join;
- preserve key order.
- * outer: use union of keys from both frames, similar to a SQL full outer
- join; sort keys lexicographically.
- * inner: use intersection of keys from both frames, similar to a SQL inner
- join; preserve the order of the left keys.
- * cross: creates the cartesian product from both frames, preserves the order
- of the left keys.
- .. versionadded:: 1.2.0
- on : label or list
- Column or index level names to join on. These must be found in both
- DataFrames. If `on` is None and not merging on indexes then this defaults
- to the intersection of the columns in both DataFrames.
- left_on : label or list, or array-like
- Column or index level names to join on in the left DataFrame. Can also
- be an array or list of arrays of the length of the left DataFrame.
- These arrays are treated as if they are columns.
- right_on : label or list, or array-like
- Column or index level names to join on in the right DataFrame. Can also
- be an array or list of arrays of the length of the right DataFrame.
- These arrays are treated as if they are columns.
- left_index : bool, default False
- Use the index from the left DataFrame as the join key(s). If it is a
- MultiIndex, the number of keys in the other DataFrame (either the index
- or a number of columns) must match the number of levels.
- right_index : bool, default False
- Use the index from the right DataFrame as the join key. Same caveats as
- left_index.
- sort : bool, default False
- Sort the join keys lexicographically in the result DataFrame. If False,
- the order of the join keys depends on the join type (how keyword).
- suffixes : list-like, default is ("_x", "_y")
- A length-2 sequence where each element is optionally a string
- indicating the suffix to add to overlapping column names in
- `left` and `right` respectively. Pass a value of `None` instead
- of a string to indicate that the column name from `left` or
- `right` should be left as-is, with no suffix. At least one of the
- values must not be None.
- copy : bool, default True
- If False, avoid copy if possible.
- indicator : bool or str, default False
- If True, adds a column to the output DataFrame called "_merge" with
- information on the source of each row. The column can be given a different
- name by providing a string argument. The column will have a Categorical
- type with the value of "left_only" for observations whose merge key only
- appears in the left DataFrame, "right_only" for observations
- whose merge key only appears in the right DataFrame, and "both"
- if the observation's merge key is found in both DataFrames.
- validate : str, optional
- If specified, checks if merge is of specified type.
- * "one_to_one" or "1:1": check if merge keys are unique in both
- left and right datasets.
- * "one_to_many" or "1:m": check if merge keys are unique in left
- dataset.
- * "many_to_one" or "m:1": check if merge keys are unique in right
- dataset.
- * "many_to_many" or "m:m": allowed, but does not result in checks.
- Returns
- -------
- DataFrame
- A DataFrame of the two merged objects.
- See Also
- --------
- merge_ordered : Merge with optional filling/interpolation.
- merge_asof : Merge on nearest keys.
- DataFrame.join : Similar method using indices.
- Notes
- -----
- Support for specifying index levels as the `on`, `left_on`, and
- `right_on` parameters was added in version 0.23.0
- Support for merging named Series objects was added in version 0.24.0
- Examples
- --------
- >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
- ... 'value': [1, 2, 3, 5]})
- >>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
- ... 'value': [5, 6, 7, 8]})
- >>> df1
- lkey value
- 0 foo 1
- 1 bar 2
- 2 baz 3
- 3 foo 5
- >>> df2
- rkey value
- 0 foo 5
- 1 bar 6
- 2 baz 7
- 3 foo 8
- Merge df1 and df2 on the lkey and rkey columns. The value columns have
- the default suffixes, _x and _y, appended.
- >>> df1.merge(df2, left_on='lkey', right_on='rkey')
- lkey value_x rkey value_y
- 0 foo 1 foo 5
- 1 foo 1 foo 8
- 2 foo 5 foo 5
- 3 foo 5 foo 8
- 4 bar 2 bar 6
- 5 baz 3 baz 7
- Merge DataFrames df1 and df2 with specified left and right suffixes
- appended to any overlapping columns.
- >>> df1.merge(df2, left_on='lkey', right_on='rkey',
- ... suffixes=('_left', '_right'))
- lkey value_left rkey value_right
- 0 foo 1 foo 5
- 1 foo 1 foo 8
- 2 foo 5 foo 5
- 3 foo 5 foo 8
- 4 bar 2 bar 6
- 5 baz 3 baz 7
- Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
- any overlapping columns.
- >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
- Traceback (most recent call last):
- ...
- ValueError: columns overlap but no suffix specified:
- Index(['value'], dtype='object')
- >>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
- >>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
- >>> df1
- a b
- 0 foo 1
- 1 bar 2
- >>> df2
- a c
- 0 foo 3
- 1 baz 4
- >>> df1.merge(df2, how='inner', on='a')
- a b c
- 0 foo 1 3
- >>> df1.merge(df2, how='left', on='a')
- a b c
- 0 foo 1 3.0
- 1 bar 2 NaN
- >>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
- >>> df2 = pd.DataFrame({'right': [7, 8]})
- >>> df1
- left
- 0 foo
- 1 bar
- >>> df2
- right
- 0 7
- 1 8
- >>> df1.merge(df2, how='cross')
- left right
- 0 foo 7
- 1 foo 8
- 2 bar 7
- 3 bar 8
- """
- # -----------------------------------------------------------------------
- # DataFrame class
- class DataFrame(NDFrame, OpsMixin):
- """
- Two-dimensional, size-mutable, potentially heterogeneous tabular data.
- Data structure also contains labeled axes (rows and columns).
- Arithmetic operations align on both row and column labels. Can be
- thought of as a dict-like container for Series objects. The primary
- pandas data structure.
- Parameters
- ----------
- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
- Dict can contain Series, arrays, constants, dataclass or list-like objects. If
- data is a dict, column order follows insertion-order. If a dict contains Series
- which have an index defined, it is aligned by its index.
- .. versionchanged:: 0.25.0
- If data is a list of dicts, column order follows insertion-order.
- index : Index or array-like
- Index to use for resulting frame. Will default to RangeIndex if
- no indexing information part of input data and no index provided.
- columns : Index or array-like
- Column labels to use for resulting frame when data does not have them,
- defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
- will perform column selection instead.
- dtype : dtype, default None
- Data type to force. Only a single dtype is allowed. If None, infer.
- copy : bool or None, default None
- Copy data from inputs.
- For dict data, the default of None behaves like ``copy=True``. For DataFrame
- or 2d ndarray input, the default of None behaves like ``copy=False``.
- .. versionchanged:: 1.3.0
- See Also
- --------
- DataFrame.from_records : Constructor from tuples, also record arrays.
- DataFrame.from_dict : From dicts of Series, arrays, or dicts.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- read_table : Read general delimited file into DataFrame.
- read_clipboard : Read text from clipboard into DataFrame.
- Examples
- --------
- Constructing DataFrame from a dictionary.
- >>> d = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df = pd.DataFrame(data=d)
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
- Notice that the inferred dtype is int64.
- >>> df.dtypes
- col1 int64
- col2 int64
- dtype: object
- To enforce a single dtype:
- >>> df = pd.DataFrame(data=d, dtype=np.int8)
- >>> df.dtypes
- col1 int8
- col2 int8
- dtype: object
- Constructing DataFrame from a dictionary including Series:
- >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
- >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
- col1 col2
- 0 0 NaN
- 1 1 NaN
- 2 2 2.0
- 3 3 3.0
- Constructing DataFrame from numpy ndarray:
- >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
- ... columns=['a', 'b', 'c'])
- >>> df2
- a b c
- 0 1 2 3
- 1 4 5 6
- 2 7 8 9
- Constructing DataFrame from a numpy ndarray that has labeled columns:
- >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
- ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
- >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
- ...
- >>> df3
- c a
- 0 3 1
- 1 6 4
- 2 9 7
- Constructing DataFrame from dataclass:
- >>> from dataclasses import make_dataclass
- >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
- >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
- x y
- 0 0 0
- 1 0 3
- 2 2 3
- """
- _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
- _typ = "dataframe"
- _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
- _accessors: set[str] = {"sparse"}
- _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
- _mgr: BlockManager | ArrayManager
    @property
    def _constructor(self) -> type[DataFrame]:
        # Class used by internal methods when constructing a new frame-shaped
        # result; returning ``DataFrame`` keeps results of this exact type.
        return DataFrame

    # Class used when reducing to 1-D (e.g. selecting a single column).
    _constructor_sliced: type[Series] = Series
- # ----------------------------------------------------------------------
- # Constructors
    def __init__(
        self,
        data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
        copy: bool | None = None,
    ) -> None:
        # Dispatch on the type of ``data`` to build the appropriate internal
        # manager (BlockManager or ArrayManager), then hand it to NDFrame.
        if copy is None:
            if isinstance(data, dict) or data is None:
                # retain pre-GH#38939 default behavior
                copy = True
            else:
                copy = False

        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        # Unwrap an existing DataFrame down to its internal manager.
        if isinstance(data, DataFrame):
            data = data._mgr

        if isinstance(data, (BlockManager, ArrayManager)):
            # first check if a Manager is passed without any other arguments
            # -> use fastpath (without checking Manager type)
            if index is None and columns is None and dtype is None and not copy:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

        manager = get_option("mode.data_manager")

        if isinstance(data, (BlockManager, ArrayManager)):
            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )

        elif isinstance(data, dict):
            # GH#38939 de facto copy defaults to False only in non-dict cases
            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
        elif isinstance(data, ma.MaskedArray):
            # Imported lazily: mrecords is only needed for this input type.
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
                warnings.warn(
                    "Support for MaskedRecords is deprecated and will be "
                    "removed in a future version. Pass "
                    "{name: data[name] for name in data.dtype.names} instead.",
                    FutureWarning,
                    stacklevel=2,
                )

            # a masked array
            else:
                data = sanitize_masked_array(data)
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                # i.e. numpy structured array
                data = cast(np.ndarray, data)
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
            elif getattr(data, "name", None) is not None:
                # i.e. Series/Index with non-None name
                mgr = dict_to_mgr(
                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                    # attribute "name"
                    {data.name: data},  # type: ignore[union-attr]
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                )
            else:
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        # For data is list-like, or Iterable (will consume into list)
        elif is_list_like(data):
            if not isinstance(data, (abc.Sequence, ExtensionArray)):
                # consume generic iterables/iterators into a concrete list
                data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if treat_as_nested(data):
                    # e.g. list of lists/dicts/Series -> per-column arrays
                    if columns is not None:
                        # error: Argument 1 to "ensure_index" has incompatible type
                        # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray,
                        # ndarray], Index, Series], Sequence[Any]]"
                        columns = ensure_index(columns)  # type: ignore[arg-type]
                    arrays, columns, index = nested_data_to_arrays(
                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                        data,
                        columns,
                        index,  # type: ignore[arg-type]
                        dtype,
                    )
                    mgr = arrays_to_mgr(
                        arrays,
                        columns,
                        index,
                        dtype=dtype,
                        typ=manager,
                    )
                else:
                    mgr = ndarray_to_mgr(
                        data,
                        index,
                        columns,
                        dtype=dtype,
                        copy=copy,
                        typ=manager,
                    )
            else:
                # empty list-like -> empty frame (index/columns may still apply)
                mgr = dict_to_mgr(
                    {},
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                )
        # For data is scalar
        else:
            # a scalar can only be broadcast when both axes are given
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            # Argument 1 to "ensure_index" has incompatible type "Collection[Any]";
            # expected "Union[Union[Union[ExtensionArray, ndarray],
            # Index, Series], Sequence[Any]]"
            index = ensure_index(index)  # type: ignore[arg-type]

            # Argument 1 to "ensure_index" has incompatible type "Collection[Any]";
            # expected "Union[Union[Union[ExtensionArray, ndarray],
            # Index, Series], Sequence[Any]]"
            columns = ensure_index(columns)  # type: ignore[arg-type]

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): special case not needed with 2D EAs
                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
            else:
                arr2d = construct_2d_arraylike_from_scalar(
                    data,
                    len(index),
                    len(columns),
                    dtype,
                    copy,
                )

                mgr = ndarray_to_mgr(
                    arr2d,
                    index,
                    columns,
                    dtype=arr2d.dtype,
                    copy=False,
                    typ=manager,
                )

        # ensure correct Manager type according to settings
        mgr = mgr_to_mgr(mgr, typ=manager)

        NDFrame.__init__(self, mgr)
- # ----------------------------------------------------------------------
- @property
- def axes(self) -> list[Index]:
- """
- Return a list representing the axes of the DataFrame.
- It has the row axis labels and column axis labels as the only members.
- They are returned in that order.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.axes
- [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
- dtype='object')]
- """
- return [self.index, self.columns]
- @property
- def shape(self) -> tuple[int, int]:
- """
- Return a tuple representing the dimensionality of the DataFrame.
- See Also
- --------
- ndarray.shape : Tuple of array dimensions.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.shape
- (2, 2)
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
- ... 'col3': [5, 6]})
- >>> df.shape
- (2, 3)
- """
- return len(self.index), len(self.columns)
- @property
- def _is_homogeneous_type(self) -> bool:
- """
- Whether all the columns in a DataFrame have the same type.
- Returns
- -------
- bool
- See Also
- --------
- Index._is_homogeneous_type : Whether the object has a single
- dtype.
- MultiIndex._is_homogeneous_type : Whether all the levels of a
- MultiIndex have the same dtype.
- Examples
- --------
- >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
- True
- >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
- False
- Items with the same type but different sizes are considered
- different types.
- >>> DataFrame({
- ... "A": np.array([1, 2], dtype=np.int32),
- ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
- False
- """
- if isinstance(self._mgr, ArrayManager):
- return len({arr.dtype for arr in self._mgr.arrays}) == 1
- if self._mgr.any_extension_types:
- return len({block.dtype for block in self._mgr.blocks}) == 1
- else:
- return not self._is_mixed_type
- @property
- def _can_fast_transpose(self) -> bool:
- """
- Can we transpose this DataFrame without creating any new array objects.
- """
- if isinstance(self._mgr, ArrayManager):
- return False
- blocks = self._mgr.blocks
- if len(blocks) != 1:
- return False
- dtype = blocks[0].dtype
- # TODO(EA2D) special case would be unnecessary with 2D EAs
- return not is_1d_only_ea_dtype(dtype)
    # error: Return type "Union[ndarray, DatetimeArray, TimedeltaArray]" of
    # "_values" incompatible with return type "ndarray" in supertype "NDFrame"
    @property
    def _values(  # type: ignore[override]
        self,
    ) -> np.ndarray | DatetimeArray | TimedeltaArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.

        Falls back to ``self.values`` (which may copy/upcast) unless the data
        is backed by a single 2-D array, in which case that array is returned
        transposed without conversion.
        """
        # consolidate first so single-block detection below is meaningful
        self._consolidate_inplace()

        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_obj(mgr.arrays[0]):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return self.values

        blocks = mgr.blocks
        if len(blocks) != 1:
            # mixed blocks: no single backing array to hand back
            return self.values

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray", arr)
        # block values are stored (columns, rows); transpose to frame layout
        return arr.T
- # ----------------------------------------------------------------------
- # Rendering Methods
- def _repr_fits_vertical_(self) -> bool:
- """
- Check length against max_rows.
- """
- max_rows = get_option("display.max_rows")
- return len(self) <= max_rows
    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:  # bounded rows: render a truncated slice
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # max_rows is None -> unlimited rows; assume it fits
            return True

        # measure the widest rendered line against the console width
        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width
- def _info_repr(self) -> bool:
- """
- True if the repr should show the info view.
- """
- info_repr_option = get_option("display.large_repr") == "info"
- return info_repr_option and not (
- self._repr_fits_horizontal_() and self._repr_fits_vertical_()
- )
- def __repr__(self) -> str:
- """
- Return a string representation for a particular DataFrame.
- """
- buf = StringIO("")
- if self._info_repr():
- self.info(buf=buf)
- return buf.getvalue()
- repr_params = fmt.get_dataframe_repr_params()
- self.to_string(buf=buf, **repr_params)
- return buf.getvalue()
- def _repr_html_(self) -> str | None:
- """
- Return a html representation for a particular DataFrame.
- Mainly for IPython notebook.
- """
- if self._info_repr():
- buf = StringIO("")
- self.info(buf=buf)
- # need to escape the <class>, should be the first line.
- val = buf.getvalue().replace("<", r"<", 1)
- val = val.replace(">", r">", 1)
- return "<pre>" + val + "</pre>"
- if get_option("display.notebook_repr_html"):
- max_rows = get_option("display.max_rows")
- min_rows = get_option("display.min_rows")
- max_cols = get_option("display.max_columns")
- show_dimensions = get_option("display.show_dimensions")
- formatter = fmt.DataFrameFormatter(
- self,
- columns=None,
- col_space=None,
- na_rep="NaN",
- formatters=None,
- float_format=None,
- sparsify=None,
- justify=None,
- index_names=True,
- header=True,
- index=True,
- bold_rows=True,
- escape=True,
- max_rows=max_rows,
- min_rows=min_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=".",
- )
- return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
- else:
- return None
    @Substitution(
        header_type="bool or sequence",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int, list or dict of int",
        col_space="The minimum width of each column",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: FilePathOrBuffer[str] | None = None,
        columns: Sequence[str] | None = None,
        col_space: int | None = None,
        header: bool | Sequence[str] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: fmt.FormattersType | None = None,
        float_format: fmt.FloatFormatType | None = None,
        sparsify: bool | None = None,
        index_names: bool = True,
        justify: str | None = None,
        max_rows: int | None = None,
        min_rows: int | None = None,
        max_cols: int | None = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: int | None = None,
        max_colwidth: int | None = None,
        encoding: str | None = None,
    ) -> str | None:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.

            .. versionadded:: 1.0.0
        encoding : str, default "utf-8"
            Set character encoding.

            .. versionadded:: 1.0
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        from pandas import option_context

        # Apply max_colwidth only for the duration of this render so the
        # per-call setting does not leak into the global display options.
        with option_context("display.max_colwidth", max_colwidth):
            formatter = fmt.DataFrameFormatter(
                self,
                columns=columns,
                col_space=col_space,
                na_rep=na_rep,
                formatters=formatters,
                float_format=float_format,
                sparsify=sparsify,
                justify=justify,
                index_names=index_names,
                header=header,
                index=index,
                min_rows=min_rows,
                max_rows=max_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=decimal,
            )
            # returns str when buf is None, else writes to buf and returns None
            return fmt.DataFrameRenderer(formatter).to_string(
                buf=buf,
                encoding=encoding,
                line_width=line_width,
            )
- # ----------------------------------------------------------------------
- @property
- def style(self) -> Styler:
- """
- Returns a Styler object.
- Contains methods for building a styled HTML representation of the DataFrame.
- See Also
- --------
- io.formats.style.Styler : Helps style a DataFrame or Series according to the
- data with HTML and CSS.
- """
- from pandas.io.formats.style import Styler
- return Styler(self)
- _shared_docs[
- "items"
- ] = r"""
- Iterate over (column name, Series) pairs.
- Iterates over the DataFrame columns, returning a tuple with
- the column name and the content as a Series.
- Yields
- ------
- label : object
- The column names for the DataFrame being iterated over.
- content : Series
- The column entries belonging to each label, as a Series.
- See Also
- --------
- DataFrame.iterrows : Iterate over DataFrame rows as
- (index, Series) pairs.
- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
- of the values.
- Examples
- --------
- >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
- ... 'population': [1864, 22000, 80000]},
- ... index=['panda', 'polar', 'koala'])
- >>> df
- species population
- panda bear 1864
- polar bear 22000
- koala marsupial 80000
- >>> for label, content in df.items():
- ... print(f'label: {label}')
- ... print(f'content: {content}', sep='\n')
- ...
- label: species
- content:
- panda bear
- polar bear
- koala marsupial
- Name: species, dtype: object
- label: population
- content:
- panda 1864
- polar 22000
- koala 80000
- Name: population, dtype: int64
- """
- @Appender(_shared_docs["items"])
- def items(self) -> Iterable[tuple[Hashable, Series]]:
- if self.columns.is_unique and hasattr(self, "_item_cache"):
- for k in self.columns:
- yield k, self._get_item_cache(k)
- else:
- for i, k in enumerate(self.columns):
- yield k, self._ixs(i, axis=1)
@Appender(_shared_docs["items"])
def iteritems(self) -> Iterable[tuple[Hashable, Series]]:
    # Backwards-compatible alias: simply re-yield everything ``items`` produces.
    for pair in self.items():
        yield pair
def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
    """
    Iterate over DataFrame rows as (index, Series) pairs.

    Yields
    ------
    index : label or tuple of label
        The row's index value (a tuple for a `MultiIndex`).
    data : Series
        The row's values, labelled by the column names.

    See Also
    --------
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
    DataFrame.items : Iterate over (column name, Series) pairs.

    Notes
    -----
    1. Each row is materialized from ``self.values``, so dtypes are **not**
       preserved across a row of mixed-type columns (everything is upcast
       to a common dtype). For example,

       >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
       >>> row = next(df.iterrows())[1]
       >>> row
       int      1.0
       float    1.5
       Name: 0, dtype: float64
       >>> print(row['int'].dtype)
       float64
       >>> print(df['int'].dtype)
       int64

       To preserve dtypes, prefer :meth:`itertuples`, which is also
       generally faster than ``iterrows``.
    2. You should **never modify** something you are iterating over:
       depending on the dtypes the iterator returns a copy, not a view,
       and writing to it will have no effect.
    """
    column_labels = self.columns
    series_cls = self._constructor_sliced
    for row_label, row_values in zip(self.index, self.values):
        yield row_label, series_cls(row_values, index=column_labels, name=row_label)
def itertuples(
    self, index: bool = True, name: str | None = "Pandas"
) -> Iterable[tuple[Any, ...]]:
    """
    Iterate over DataFrame rows as namedtuples.

    Parameters
    ----------
    index : bool, default True
        If True, the row's index value is prepended as the first field,
        named ``Index``.
    name : str or None, default "Pandas"
        Name of the yielded namedtuple class, or None to yield regular
        tuples instead.

    Returns
    -------
    iterator
        One (named)tuple per row, whose fields are the column values,
        optionally preceded by the index value.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
        pairs.
    DataFrame.items : Iterate over (column name, Series) pairs.

    Notes
    -----
    Column names that are repeated, start with an underscore, or are not
    valid Python identifiers are replaced with positional names
    (``rename=True`` in the ``namedtuple`` call below).

    Examples
    --------
    >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
    ...                   index=['dog', 'hawk'])
    >>> for row in df.itertuples():
    ...     print(row)
    ...
    Pandas(Index='dog', num_legs=4, num_wings=0)
    Pandas(Index='hawk', num_legs=2, num_wings=2)

    With ``index=False`` the index is dropped from the tuples, and with
    ``name='Animal'`` the namedtuple class is renamed:

    >>> for row in df.itertuples(index=False, name='Animal'):
    ...     print(row)
    ...
    Animal(num_legs=4, num_wings=0)
    Animal(num_legs=2, num_wings=2)
    """
    fields = list(self.columns)
    arrays: list[Iterable[Any]] = []
    if index:
        fields.insert(0, "Index")
        arrays.append(self.index)
    # Access columns positionally so duplicate labels are handled correctly.
    arrays.extend(self.iloc[:, pos] for pos in range(len(self.columns)))
    if name is None:
        # Plain tuples were requested.
        return zip(*arrays)
    # https://github.com/python/mypy/issues/9046
    # error: namedtuple() expects a string literal as the first argument
    row_cls = collections.namedtuple(  # type: ignore[misc]
        name, fields, rename=True
    )
    return map(row_cls._make, zip(*arrays))
def __len__(self) -> int:
    """
    Return the number of rows, i.e. the length of the index
    (rather than the info axis used by the generic base class).
    """
    return len(self.index)
@overload
def dot(self, other: Series) -> Series:
    ...

@overload
def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
    ...

def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
    """
    Compute the matrix multiplication between the DataFrame and other.

    The product is taken between this DataFrame's values and the values of
    ``other`` (a Series, DataFrame, Index or numpy array). It backs the
    ``self @ other`` operator.

    Parameters
    ----------
    other : Series, DataFrame or array-like
        The other object to compute the matrix product with.

    Returns
    -------
    Series or DataFrame
        A Series when ``other`` is a Series; a DataFrame when ``other`` is a
        DataFrame or a 2-D numpy array.

    See Also
    --------
    Series.dot: Similar method for Series.

    Notes
    -----
    When ``other`` is a Series or DataFrame, this DataFrame's columns and
    ``other``'s index must contain the same values — the two are aligned on
    those labels before multiplying. For a plain array only the shapes must
    be compatible. Unlike this method, ``Series.dot`` computes an inner
    product.

    Examples
    --------
    >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
    >>> s = pd.Series([1, 1, 2, 1])
    >>> df.dot(s)
    0    -4
    1     5
    dtype: int64

    >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
    >>> df.dot(other)
       0  1
    0  1  4
    1  2  2
    """
    if isinstance(other, (Series, DataFrame)):
        # Align self's columns with other's index; extra labels on either
        # side mean the operands cannot be multiplied.
        joined = self.columns.union(other.index)
        if len(joined) > len(self.columns) or len(joined) > len(other.index):
            raise ValueError("matrices are not aligned")
        lhs = self.reindex(columns=joined, copy=False)
        lhs_vals = lhs.values
        rhs_vals = other.reindex(index=joined, copy=False)._values
    else:
        lhs = self
        lhs_vals = self.values
        rhs_vals = np.asarray(other)
        if lhs_vals.shape[1] != rhs_vals.shape[0]:
            raise ValueError(
                f"Dot product shape mismatch, {lhs_vals.shape} vs {rhs_vals.shape}"
            )
    # Dispatch on the type of ``other`` to pick the result container.
    if isinstance(other, DataFrame):
        return self._constructor(
            np.dot(lhs_vals, rhs_vals), index=lhs.index, columns=other.columns
        )
    if isinstance(other, Series):
        return self._constructor_sliced(np.dot(lhs_vals, rhs_vals), index=lhs.index)
    if isinstance(rhs_vals, (np.ndarray, Index)):
        product = np.dot(lhs_vals, rhs_vals)
        if product.ndim == 2:
            return self._constructor(product, index=lhs.index)
        return self._constructor_sliced(product, index=lhs.index)
    raise TypeError(f"unsupported type: {type(other)}")  # pragma: no cover
@overload
def __matmul__(self, other: Series) -> Series:
    ...

@overload
def __matmul__(
    self, other: AnyArrayLike | DataFrame | Series
) -> DataFrame | Series:
    ...

def __matmul__(
    self, other: AnyArrayLike | DataFrame | Series
) -> DataFrame | Series:
    """
    Implement the binary ``@`` (matrix multiplication) operator by
    delegating to :meth:`dot`.
    """
    return self.dot(other)
def __rmatmul__(self, other):
    """
    Implement the reflected ``@`` operator, i.e. ``other @ self``,
    via the identity ``other @ self == (self.T @ other.T).T``.
    """
    try:
        flipped = np.transpose(other)
        return self.T.dot(flipped).T
    except ValueError as err:
        if "shape mismatch" not in str(err):
            raise
        # GH#21581 report the original (untransposed) shapes to the caller
        raise ValueError(
            f"shapes {np.shape(other)} and {self.shape} not aligned"
        ) from err
- # ----------------------------------------------------------------------
- # IO methods (to / from other formats)
- @classmethod
- def from_dict(
- cls,
- data,
- orient: str = "columns",
- dtype: Dtype | None = None,
- columns=None,
- ) -> DataFrame:
- """
- Construct DataFrame from dict of array-like or dicts.
- Creates DataFrame object from dictionary by columns or by index
- allowing dtype specification.
- Parameters
- ----------
- data : dict
- Of the form {field : array-like} or {field : dict}.
- orient : {'columns', 'index', 'tight'}, default 'columns'
- The "orientation" of the data. If the keys of the passed dict
- should be the columns of the resulting DataFrame, pass 'columns'
- (default). Otherwise if the keys should be rows, pass 'index'.
- If 'tight', assume a dict with keys ['index', 'columns', 'data',
- 'index_names', 'column_names'].
- .. versionadded:: 1.4.0
- 'tight' as an allowed value for the ``orient`` argument
- dtype : dtype, default None
- Data type to force, otherwise infer.
- columns : list, default None
- Column labels to use when ``orient='index'``. Raises a ValueError
- if used with ``orient='columns'`` or ``orient='tight'``.
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.from_records : DataFrame from structured ndarray, sequence
- of tuples or dicts, or DataFrame.
- DataFrame : DataFrame object creation using constructor.
- DataFrame.to_dict : Convert the DataFrame to a dictionary.
- Examples
- --------
- By default the keys of the dict become the DataFrame columns:
- >>> data = {'col_1': [3, 2, …
Large files files are truncated, but you can click here to view the full file